From f1f9e9cd8022d4329c55479c35dbfade20b465df Mon Sep 17 00:00:00 2001 From: sageweil Date: Tue, 6 Mar 2007 18:44:56 +0000 Subject: [PATCH] sharing of mdsmap with clients as appropriate; mds recovery bugfix; some mds cache documentation git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1174 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/cephmds2/TODO | 41 +- branches/sage/cephmds2/client/Client.cc | 4 +- branches/sage/cephmds2/doc/Replication.txt | 19 - branches/sage/cephmds2/doc/caching.txt | 418 ++++++++++++--------- branches/sage/cephmds2/doc/performance.txt | 36 -- branches/sage/cephmds2/include/frag.h | 159 ++++++++ branches/sage/cephmds2/mds/MDCache.cc | 40 +- branches/sage/cephmds2/mds/MDS.cc | 39 +- branches/sage/cephmds2/mds/MDS.h | 1 + branches/sage/cephmds2/mds/MDSMap.h | 17 +- branches/sage/cephmds2/mds/mdstypes.h | 73 +--- branches/sage/cephmds2/mon/MDSMonitor.cc | 10 +- 12 files changed, 494 insertions(+), 363 deletions(-) delete mode 100644 branches/sage/cephmds2/doc/Replication.txt delete mode 100644 branches/sage/cephmds2/doc/performance.txt create mode 100644 branches/sage/cephmds2/include/frag.h diff --git a/branches/sage/cephmds2/TODO b/branches/sage/cephmds2/TODO index 6e22da94feedf..befb6e56aabd6 100644 --- a/branches/sage/cephmds2/TODO +++ b/branches/sage/cephmds2/TODO @@ -1,9 +1,7 @@ monday - retest with 3+ - no failures - full failure document cache - pg rewrite + tag osd ops, objects with filelayout + mds diropen doc - mdsmonitor beacon semantics @@ -13,7 +11,7 @@ doc - journal content - importmaps and up:resolve - metablob version semantics - + mds - bystanders should avoid contacting auth when it is ambiguous. @@ -30,7 +28,6 @@ mds - failures during recovery stages (resolve, rejoin)... make sure rejoin still works! 
- fix mds initial osdmap weirdness (which will currently screw up on standby -> almost anything) - incremental mdsmaps -- client mount logging - client failure - EMetablob should return 'expired' if they have higher versions (and are thus described by a newer journal entry) @@ -44,6 +41,19 @@ mds - link - rename - dirslices. +- dirslice vs readdir +- redo hard links +- anchortable +- sync clients on stat + - will need to ditch 10s client metadata caching before this is useful + - implement truncate +- statfs? +- btree directories (for efficient large directories) +- consistency points/snapshots + +- fix MExportAck and others to use dir+dentry, not inode + (otherwise this all breaks with hard links.. altho it probably needs reworking already?) + monitor @@ -198,31 +208,12 @@ mds client - fstat -- make_request: cope with mds failure - mixed lazy and non-lazy io will clobber each others' caps in the buffer cache.. how to isolate.. - test client caps migration w/ mds exports - some heuristic behavior to consolidate caps to inode auth? -MDS TODO -- fix hashed readdir: should (optionally) do a lock on dir namespace? -- fix hard links - - they mostly work, but they're fragile -- sync clients on stat - - will need to ditch 10s client metadata caching before this is useful - - implement truncate -- implement hashed directories -- statfs? -- rewrite journal + recovery -- figure out online failure recovery -- more distributed fh management? -- btree directories (for efficient large directories) -- consistency points/snapshots - -- fix MExportAck and others to use dir+dentry, not inode - (otherwise this all breaks with hard links.. altho it probably needs reworking already?) 
- diff --git a/branches/sage/cephmds2/client/Client.cc b/branches/sage/cephmds2/client/Client.cc index cef5f6129ef87..ecdf82193be27 100644 --- a/branches/sage/cephmds2/client/Client.cc +++ b/branches/sage/cephmds2/client/Client.cc @@ -643,13 +643,13 @@ void Client::handle_client_request_forward(MClientRequestForward *fwd) request->mds.insert(fwd->get_source().num()); request->mds.insert(fwd->get_dest_mds()); request->num_fwd = fwd->get_num_fwd(); - dout(-10) << "handle_client_request tid " << tid + dout(10) << "handle_client_request tid " << tid << " fwd " << fwd->get_num_fwd() << " to mds" << fwd->get_dest_mds() << ", mds set now " << request->mds << endl; } else { - dout(-10) << "handle_client_request tid " << tid + dout(10) << "handle_client_request tid " << tid << " previously forwarded to mds" << fwd->get_dest_mds() << ", mds still " << request->mds << endl; diff --git a/branches/sage/cephmds2/doc/Replication.txt b/branches/sage/cephmds2/doc/Replication.txt deleted file mode 100644 index 0f8d4c9079e4d..0000000000000 --- a/branches/sage/cephmds2/doc/Replication.txt +++ /dev/null @@ -1,19 +0,0 @@ - -Primary copy replication. - -Inodes: - -- The primary's list of replicas (cached_by) is inclusive at all times. -- The primary's list never includes the local node. -- The primary's list of replicas will only include non-replicas when the relevant CacheExpire notifications are in-flight. - -- Replicas can be created in two ways: - - via a Discover + DiscoverReply - - via an export and import. (The old auth keeps a copy, and adds itself to the replica list as it exports.) - - -Directories (and their dentries): - -- The primary has an open_by list that is inclusive at all times. -- ..Never includes local node -- No per-dentry replica lists. All dentry lock operations (for unlink, etc.) are sent to all nodes in open_by list. 
\ No newline at end of file diff --git a/branches/sage/cephmds2/doc/caching.txt b/branches/sage/cephmds2/doc/caching.txt index a2791bdb5fbfa..fe0c78331bd86 100644 --- a/branches/sage/cephmds2/doc/caching.txt +++ b/branches/sage/cephmds2/doc/caching.txt @@ -1,234 +1,302 @@ +SPANNING TREE PROPERTY + +All metadata that exists in the cache is attached directly or +indirectly to the root inode. That is, if the /usr/bin/vi inode is in +the cache, then /usr/bin, /usr, and / are too, including the inodes, +directory objects, and dentries. + AUTHORITY The authority maintains a list of what nodes cache each inode. -Additionally, each replica is assigned a serial (normally 0) to +Additionally, each replica is assigned a nonce (initial 0) to disambiguate multiple replicas of the same item (see below). - set cached_by; - map cached_by_serial; + map replicas; // maps replicating mds# to nonce The cached_by set _always_ includes all nodes that cache the -partcuarly inode, but may additionally include nodes that used to +particular object, but may additionally include nodes that used to cache it but no longer do. In those cases, an expire message should -be in transit. - - -REPLICA - -The replica maintains a notion of who it believes is the authority for -each replicated inode. There are two possibilities: - - - Ordinarily, this notion is correct. - - If the part of the file system in question was recently exported to - a new MDS, the inodes old authority is acting as a CACHEPROXY, - and will forward relevant messages on to the authority. - -When a repica is expired from cache, and expire is sent to the -authority. The expire includes the serial number issued when the -replica was originally created to disambiguate potentially concurrent -replication activity. - - -EXPORTS - -- The old authority suddenly becomes a replica. It's serial is well - defined. It also becomes a CACHEPROXY, which means its cached_by - remains defined (with an alternate meaning!).
While a proxy, the - node will forward relevant messages from the replica to the - authority (but not the other way around--the authority knows all - replicas). - -- Once the export is acked, the old authority sends a - message to the replica notifying it of the new authority. As soon - as all replicas acknowedge receipt of this notice, the old authority - can cease CACHEPROXY responsibilities and become a regular replica. - At this point it's cached_by is no longer defined. - -- Replicas always know who the authority for the inode is, OR they - know prior owner acting as a CACHEPROXY. (They don't know which it - is.) - - -CACHED_BY - -The authority always has an inclusive list of nodes who cache an item. -As such it can confidently send updates to replicas for locking, -invalidating, etc. When a replica is expired from cache, an expire is -sent to the authority. If the serial matches, the node is removed -from the cached_by list. +be in transit. That is, we have two invariants: + 1) the authority's replica set will always include all actual + replicas, and + 2) cache expiration notices will be reliably delivered to the + authority. +The second invariant is particularly important because the presence of +replicas will pin the metadata object in memory on the authority, +preventing it from being trimmed from the cache. Notification of +expiration of the replicas is required to allow previously replicated +objects to eventually be trimmed from the cache as well. +Each metadata object has an authority bit that indicates whether it is +authoritative or a replica. -SUBTREE AUTHORITY DELEGATION: imports versus hashing -Authority is generally defined recursively: an inode's authority -matches the containing directory, and a directory's authority matches -the directory inode's.
Thus the authority delegation chain can be -broken/redefined in two ways: +REPLICA NONCE + +Each replicated object maintains a "nonce" value, issued by the +authority at the time the replica was created. If the authority has +already created a replica for the given MDS, the new replica will be +issued a new (incremented) nonce. This nonce is attached +to cache expirations, and allows the authority to disambiguate +expirations when multiple replicas of the same object are created and +cache expiration is coincident with replication. That is, when an +old replica is expired from the replicating MDS at the same time that +a new replica is issued by the authority and the resulting messages +cross paths, the authority can tell that it was the old replica that +was expired and effectively ignore the expiration message. The +replica is removed from the replicas map only if the nonce matches. - - Imports and exports redefine the directory inode -> directory - linkage, such that the directory authority is explicitly specified - via dir.dir_auth: - dir.dir_auth == -1 -> directory matches its inode - dir.dir_auth >= 0 -> directory authority is dir.dir_auth +SUBTREE PARTITION - - Hashed directories redefine the directory -> inode linkage. In - non-hashed directories, inodes match their containing directory. - In hashed directories, each dentry's authority is defined by a hash - function. +Authority of the file system namespace is partitioned using a +subtree-based partitioning strategy. This strategy effectively +separates directory inodes from directory contents, such that the +directory contents are the unit of redelegation. That is, if / is +assigned to mds0 and /usr to mds1, the inode for /usr will be managed +by mds0 (it is part of the / directory), while the contents of /usr +(and everything nested beneath it) will be managed by mds1.
- inode.hash_seed == 0 -> inode matches containing directory - inode.hash_seed > 0 -> defined by hash(hash_seed, dentry) +The description for this partition exists solely in the collective +memory of the MDS cluster and in the individual MDS journals. It is +not described in the regular on-disk metadata structures. This is +related to the fact that authority delegation is a property of the +{\it directory} and not the directory's {\it inode}. -A directory's "containing_import" (bad name, FIXME) is either the -import or hashed directory that is responsible for delegating a -subtree. Note that the containing_import of a directory may be itself -because it is an import, but it cannot be itself because it is hashed. +Subsequently, if an MDS is authoritative for a directory inode and does +not yet have any state associated with the directory in its cache, +then it can assume that it is also authoritative for the directory. -Thus: +Directory state consists of a data object that describes any cached +dentries contained in the directory, information about the +relationship between the cached contents and what appears on disk, and +any delegation of authority. That is, each CDir object has a dir_auth +element. Normally dir_auth has a value of AUTH_PARENT, meaning that +the authority for the directory is the same as the directory's inode. +When dir_auth specifies another metadata server, that directory is +point of authority delegation and becomes a {\it subtree root}. A +CDir is a subtree root iff its dir_auth specifies an MDS id (and is not +AUTH_PARENT). - - Import and export operations' manipulation of dir_auth is - completely orthogonal to hashing operations. Hashing methods can - ignore dir_auth, except when they create imports/exports (and break - the inode<->dir auth linkage). + - A dir is a subtree root iff dir_auth != AUTH_PARENT. - - Hashdirs act sort of like imports in that they bound an - authoritative region. 
That is, either hashdirs or imports can be - the key for nested_exports. In some cases, a dir may be both an - import and a hash. + - If dir_auth = AUTH_PARENT then the inode auth == dir auth, but the + converse may not be true. - - Export_dir won't export a hashdir. This is because it's tricky - (tho not necessarily impossible) due to the way nested_exports is - used with imports versus hashdirs. +The authority for any metadata object in the cache can be determined +by following the parent pointers toward the root until a subtree root +CDir object is reached, at which point the authority is specified by +its dir_auth. +Each MDS cache maintains a subtree data structure that describes the +subtree partition for all objects currently in the cache: + map< CDir*, set > subtrees; + - A dir will appear in the subtree map (as a key) IFF it is a subtree + root. -FREEZING +Each subtree root will have an entry in the map. The map value is a +set of all other subtree roots nested beneath that point. Nested +subtree roots effectively bound or prune a subtree. For example, if +we had the following partition: -There are two types of freezing: + mds0 / + mds1 /usr + mds0 /usr/local + mds0 /home - - TREE: recursively freezes everything nested beneath a directory, - until an export of edge of cache is reached. - - DIR: freezes the contents of a single directory. +The subtree map on mds0 would be -Some notes: + / -> (/usr, /home) + /home -> () - - Occurs on the authoritative node only. +and on mds1: - - Used for suspending critical operations while migrating authority - between nodes or hashing/unhashing directories. + /usr -> (/usr/local) - - Freezes the contents of the cache such that items may not be added, - items cannot be auth pinned, and/or subsequently reexported. The - namespace of the affected portions of the hierarchy may not change. - The content of inodes and other orthogonal operations - (e.g. replication, inode locking and modification) are unaffected. 
-Two states are defined: freezing and frozen. The freezing state is -used while waiting for auth_pins to be removed. Once all auth_pins -are gone, the state is changed to frozen. New auth_pins cannot be -added while freezing or frozen. +AMBIGUOUS DIR_AUTH + +While metadata for a subtree is being migrated between two MDS nodes, +the dir_auth for the subtree root is allowed to be ambiguous. That +is, it will specify both the old and new MDS ids, indicating that a +migration is in progress. + +If a replicated metadata object is expired from the cache from a +subtree whose authority is ambiguous, the cache expiration is sent to +both potential authorities. This ensures that the message will be +reliably delivered, even if either of those nodes fails. A number of +alternative strategies were considered. Sending the expiration to the +old or new authority and having it forwarded if authority has been +delegated can result in message loss if the forwarding node fails. +Pinning ambiguous metadata in cache is computationally expensive for +implementation reasons, and while delaying the transmission of expiration +messages is difficult to implement because the replicating node must send +the final expiration messages when the subtree authority is +disambiguated, forcing it to keep certain elements of its cache in +memory. Although duplicated expirations incur a small communications +overhead, the implementation is much simpler. AUTH PINS -An auth pin keeps a given item on the authoritative node until it is -removed. The pins are tracked recursively, so that a subtree cannot -be frozen if it contains any auth pins. - -If a pin is placed on a non-authoritative item, the item is allowed to -become authoritative; the specific restriction is it cannot be frozen, -which only happens during export-type operations. - - -TYPES OF EXPORTS - -- Actual export of a subtree from one node to another -- A rename between directories on different nodes exports the renamed -_inode_.
(If it is a directory, it becomes an export such that the -directory itself does not move.) -- A hash or unhash operation will migrate inodes within the directory -either to or from the directory's main authority. - -EXPORT PROCESS - - +Most operations that modify metadata must allow some amount of time to +pass in order for the operation to be journaled or for communication +to take place between the object's authority and any replicas. For +this reason it must not only be pinned in the authority's metadata +cache, but also be locked such that the object's authority is not +allowed to change until the operation completes. This is accomplished +using {\it auth pins}, which increment a reference counter on the +object in question, as well as all parent metadata objects up to the +root of the subtree. As long as the pin is in place, it is impossible +for that subtree (or any fragment of it that contains one or more +pins) to be migrated to a different MDS node. Pins can be placed on +both inodes and directories. +Auth pins can only exist for authoritative metadata, because they are +only created if the object is authoritative, and their presence +prevents the migration of authority. -HASHING -- All nodes discover and open directory - -- Prep message distributes subdir inode replicas for exports so that - peers can open those dirs. This is necessary because subdirs are - converted into exports or imports as needed to avoid migrating - anything except the hashed dir itself. The prep is needed for the - same reasons its important with exports: the inode authority must - always have the exported dir open so that it gets accurate dir - authority updates, and can keep the inode->dir_auth up to date. - -- MHashDir messsage distributes the directory contents. - -- While auth is frozen_dir, we can't get_or_open_dir. Otherwise the - Prep messages won't be inclusive of all dirs, and the - imports/exports won't get set up properly.
- -TODO -readdir - - -- subtrees stop at hashed dir. hashed dir's dir_auth follows parent - subtree, unless the dir is also an explicit import. thus a hashed - dir can also be an import dir. +FREEZING +More specifically, auth pins prevent a subtree from being frozen. +When a subtree is frozen, all updates to metadata are forbidden. This +includes updates to the replicas map that describes which replicas +(and nonces) exist for each object. + +In order for metadata to be migrated between MDS nodes, it must first +be frozen. The root of the subtree is initially marked as {\it +freezing}. This prevents the creation of any new auth pins within the +subtree. After all existing auth pins are removed, the subtree is +then marked as {\it frozen}, at which point all updates are +forbidden. This allows metadata state to be packaged up in a message +and transmitted to the new authority, without worrying about +intervening updates. + +If the directory at the base of a freezing or frozen subtree is not +also a subtree root (that is, it has dir_auth == AUTH_PARENT), the +directory's parent inode is auth pinned. + + - a frozen tree root dir will auth_pin its inode IFF it is auth AND + not a subtree root. + +This prevents a parent directory from being concurrently frozen, and a +range of resulting implementation complications relating metadata +migration. + + +CACHE EXPIRATION FOR FROZEN SUBTREES + +Cache expiration messages that are received for a subtree that is +frozen are temporarily set aside instead of being processed. Only +when the subtree is unfrozen are the expirations either processed (if +the MDS is authoritative) or discarded (if it is not). Because either +the exporting or importing metadata can fail during the migration +process, the MDS cannot tell whether it will be authoritative or not +until the process completes. 
+ +During a migration, the subtree will first be frozen on both the +exporter and importer, and then all other replicas will be informed of +a subtree's ambiguous authority. This ensures that all expirations +during migration will go to both parties, and nothing will be lost in +the event of a failure. + + + + +NORMAL MIGRATION + +The exporter begins by doing some checks in export_dir() to verify +that it is permissible to export the subtree at this time. In +particular, the cluster must not be degraded, the subtree root may not +be freezing or frozen, and the path must be pinned (\ie not conflicted +with a rename). If these conditions are met, the subtree root +directory is temporarily auth pinned, the subtree freeze is initiated, +and the exporter is committed to the subtree migration, barring an +intervening failure of the importer or itself. + +The MExportDiscover serves simply to ensure that the inode for the +base directory being exported is open on the destination node. It is +pinned by the importer to prevent it from being trimmed. This occurs +before the exporter completes the freeze of the subtree to ensure that +the importer is able to replicate the necessary metadata. When the +exporter receives the MDiscoverAck, it allows the freeze to proceed by +removing its temporary auth pin. + +The MExportPrep message then follows to populate the importer with a +spanning tree that includes all dirs, inodes, and dentries necessary +to reach any nested subtrees within the exported region. This +replicates metadata as well, but it is pushed out by the exporter, +avoiding deadlock with the regular discover and replication process. +The importer is responsible for opening the bounding directories from +any third parties authoritative for those subtrees before +acknowledging. This ensures that the importer has correct dir_auth +information about where authority is redelegated for all points nested +beneath the subtree being migrated.
While processing the MExportPrep, +the importer freezes the entire subtree region to prevent any new +replication or cache expiration. + +A warning stage occurs only if the base subtree directory is open by +nodes other than the importer and exporter. If it is not, then this +implies that no metadata within or nested beneath the subtree is +replicated by any node other than the importer and exporter. If it is, +then an MExportWarning message informs any bystanders that the +authority for the region is temporarily ambiguous, and lists both the +exporter and importer as authoritative MDS nodes. In particular, +bystanders who are trimming items from their cache must send +MCacheExpire messages to both the old and new authorities. This is +necessary to ensure that the surviving authority reliably receives all +expirations even if the importer or exporter fails. While the subtree +is frozen (on both the importer and exporter), expirations will not be +immediately processed; instead, they will be queued until the region +is unfrozen and it can be determined that the node is or is not +authoritative. + +The exporter walks the subtree hierarchy and packages up an MExport +message containing all metadata and important state (\eg, information +about metadata replicas). At the same time, the exporter's metadata +objects are flagged as non-authoritative. The MExport message sends +the actual subtree metadata to the importer. Upon receipt, the +importer inserts the data into its cache, marks all objects as +authoritative, and logs a copy of all metadata in an EImportStart +journal message. Once that has safely flushed, it replies with an +MExportAck. The exporter can now log an EExport journal entry, which +ultimately specifies that the export was a success. In the presence +of failures, it is the existence of the EExport entry only that +disambiguates authority during recovery.
+ +Once logged, the exporter will send an MExportNotify to any +bystanders, informing them that the authority is no longer ambiguous +and cache expirations should be sent only to the new authority (the +importer). Once these are acknowledged back to the exporter, +implicitly flushing the bystander to exporter message streams of any +stray expiration notices, the exporter unfreezes the subtree, cleans +up its migration-related state, and sends a final MExportFinish to the +importer. Upon receipt, the importer logs an EImportFinish(true) +(noting locally that the export was indeed a success), unfreezes its +subtree, processes any queued cache expirations, and cleans up its +state. -bananas -apples -blueberries -green pepper -carrots -celery +PARTIAL FAILURE RECOVERY -dir is a subtree root iff dir_auth.first != parent. -if dir_auth.first = parent then inode auth == dir auth, but the converse may not be true. -that is, you may have inode auth = dir auth, but dir_auth.first = whoami. e.g., - /usr is import. - /usr/bin is export. - /usr frozen for export, with bound /usr/bin. - /usr/bin imports completely. /usr/bin.dir_auth = whoami, not parent, because inode is not authpinnable. -this is safe because the /usr bound is known, and an abort can adjust the bound's dir_auth. +RECOVERY FROM JOURNAL -- if i am auth, any subtree bound will be a subtree root, and an export, frozen, or both. -- if i am auth and unfrozen/freezing, any subtree bound will be an export. and subtree root. -- if i am auth and frozen, any subtree bound will be an export, or subtree root noted in export_bounds/import_bounds. -- if a dir is a subtree root, it is - - auth, import - - auth, export - - nonauth, frozen, importing - - auth, frozen, imported - - auth, parent is auth+frozen for import|export, i am known bound. - - auth, parent is auth+frozen for import - - frozen and exporting - -- a frozen tree root dir will auth_pin it's inode IFF it is auth AND not a subtree root.
--------------------------- -dir is a subtree root iff dir_auth.first != parent. -if subtree root and not root, will appear in subtree_bounds[parent subtree root]. diff --git a/branches/sage/cephmds2/doc/performance.txt b/branches/sage/cephmds2/doc/performance.txt deleted file mode 100644 index 7ca278bd284b1..0000000000000 --- a/branches/sage/cephmds2/doc/performance.txt +++ /dev/null @@ -1,36 +0,0 @@ - - -quick performance test 2005-05-11. fakemds, 100x100, asdf/asdf, debug 13 - -g marshalling -real 3m8.697s -user 2m53.282s -sys 0m6.291s - -real 3m3.337s -user 2m49.467s -sys 0m6.243s - - -g no marshalling -real 2m1.464s -user 1m42.680s -sys 0m8.128s - -real 1m49.469s -user 1m34.523s -sys 0m6.410s - - -O3 marshalling -real 1m29.833s -user 1m11.474s -sys 0m7.588s - -real 1m9.439s -user 0m56.071s -sys 0m5.643s - - - -O3 no marshalling -real 1m2.739s -user 0m46.578s -sys 0m7.882s - diff --git a/branches/sage/cephmds2/include/frag.h b/branches/sage/cephmds2/include/frag.h new file mode 100644 index 0000000000000..b92de3ca8c1ea --- /dev/null +++ b/branches/sage/cephmds2/include/frag.h @@ -0,0 +1,159 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __FRAG_H +#define __FRAG_H + +#include +#include +#include "buffer.h" + +/* + * + * the goal here is to use a binary split strategy to partition a namespace. + * frag_t represents a particular fragment. bits() tells you the size of the + * fragment, and value() it's name. this is roughly analogous to an ip address + * and netmask. + * + * fragtree_t represents an entire namespace and it's partition. 
it essentially + * tells you where fragments are split into other fragments, and by how much + * (i.e. by how many bits, resulting in a power of 2 number of child fragments). + * + * this vaguely resembles a btree, in that when a fragment becomes large or small + * we can split or merge, except that there is no guarantee of being balanced. + * presumably we are partitioning the output of a (perhaps specialized) hash + * function. + * + */ + +/** + * frag_t + * + * description of an individual fragment. that is, a particular piece + * of the overall namespace. + * + * this is conceptually analogous to an ip address and netmask. + * + * we write it as v/b, where v is a value and b is the number of bits. + * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that, + * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on. + */ +class frag_t { + /* encoded value. + * 8 upper bits = "bits" + * 24 lower bits = "value" + */ + __uint32_t _enc; + + public: + frag_t() : _enc(0) { } + frag_t(unsigned v, unsigned b) : _enc((b << 24) + v) { } + + // accessors + unsigned value() const { return _enc & 0xffffff; } + unsigned bits() const { return _enc >> 24; } + unsigned mask() const { return 0xffffffff >> (32-bits()); } + operator unsigned() const { return _enc; } + + // tests + bool contains(frag_t sub) const { + return (sub.bits() >= bits() && // they are more specific than us, + (sub.value() & mask()) == value()); // and they are contained by us. + } + bool root() const { + return bits() == 0; + } + frag_t parent() const { + assert(bits() > 0); + return frag_t(value() & (mask() >> 1), bits()-1); + } + + // splitting + frag_t left_half() const { + return frag_t(value(), bits()+1); + } + frag_t right_half() const { + return frag_t(value() | (1<& frag_tments) const { + assert(nb > 0); + unsigned nway = 1 << (nb-1); + for (unsigned i=0; i: + // frag_t f is split by b bits. + // if child frag_t does not appear, it is not split. 
+ std::map _splits; + + public: + // accessors + int get_split(frag_t hb) { + if (_splits.count(hb)) + return _splits[hb]; + else + return 0; + } + + // modifiers + void split(frag_t hb, int b) { + assert(_splits.count(hb) == 0); + _splits[hb] = b; + } + void merge(frag_t hb, int b) { + assert(_splits[hb] == b); + _splits.erase(hb); + } + + // verify that we describe a legal partition of the namespace. + void verify() { + std::map copy; + std::list q; + q.push_back(frag_t()); + + while (1) { + frag_t cur = q.front(); + q.pop_front(); + int b = get_split(cur); + if (!b) continue; + copy[cur] = b; + cur.split(b, q); + } + + assert(copy == _splits); + } + + // encoding + void _encode(bufferlist& bl) { + ::_encode(_splits, bl); + } + void _decode(bufferlist& bl, int& off) { + ::_decode(_splits, bl, off); + } +}; + +#endif diff --git a/branches/sage/cephmds2/mds/MDCache.cc b/branches/sage/cephmds2/mds/MDCache.cc index 9ee894224ebe6..771a73532f96b 100644 --- a/branches/sage/cephmds2/mds/MDCache.cc +++ b/branches/sage/cephmds2/mds/MDCache.cc @@ -993,33 +993,35 @@ void MDCache::handle_import_map(MMDSImportMap *m) } } - // note ambiguous imports too.. unless i'm already active - if (!mds->is_active() && !mds->is_stopping()) { + show_subtrees(); + + + // recovering? + if (!mds->is_rejoin() && !mds->is_active() && !mds->is_stopping()) { + // note ambiguous imports too.. unless i'm already active for (map >::iterator pi = m->ambiguous_imap.begin(); pi != m->ambiguous_imap.end(); ++pi) { dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << endl; other_ambiguous_imports[from][pi->first].swap( pi->second ); } - } - show_subtrees(); - - // did i get them all? - got_import_map.insert(from); - - if (got_import_map == recovery_set) { - dout(10) << "got all import maps, ready to rejoin" << endl; - disambiguate_imports(); - recalc_auth_bits(); - trim_non_auth(); + // did i get them all? 
+ got_import_map.insert(from); - // move to rejoin state - mds->set_want_state(MDSMap::STATE_REJOIN); - - } else { - dout(10) << "still waiting for more importmaps, got " << got_import_map - << ", need " << recovery_set << endl; + if (got_import_map == recovery_set) { + dout(10) << "got all import maps, ready to rejoin" << endl; + disambiguate_imports(); + recalc_auth_bits(); + trim_non_auth(); + + // move to rejoin state + mds->set_want_state(MDSMap::STATE_REJOIN); + + } else { + dout(10) << "still waiting for more importmaps, got " << got_import_map + << ", need " << recovery_set << endl; + } } delete m; diff --git a/branches/sage/cephmds2/mds/MDS.cc b/branches/sage/cephmds2/mds/MDS.cc index 227e52f644338..71f2148e32afe 100644 --- a/branches/sage/cephmds2/mds/MDS.cc +++ b/branches/sage/cephmds2/mds/MDS.cc @@ -93,6 +93,9 @@ MDS::MDS(int whoami, Messenger *m, MonMap *mm) : timer(mds_lock) { server = new Server(this); locker = new Locker(this, mdcache); + + // clients + last_client_mdsmap_bcast = 0; // beacon beacon_last_seq = 0; @@ -106,7 +109,6 @@ MDS::MDS(int whoami, Messenger *m, MonMap *mm) : timer(mds_lock) { want_state = state = MDSMap::STATE_DNE; - logger = logger2 = 0; // i'm ready! @@ -440,6 +442,10 @@ void MDS::handle_mds_map(MMDSMap *m) mdsmap->get_mds_set(oldfailed, MDSMap::STATE_FAILED); set oldactive; mdsmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE); + set oldcreating; + mdsmap->get_mds_set(oldcreating, MDSMap::STATE_CREATING); + set oldout; + mdsmap->get_mds_set(oldout, MDSMap::STATE_OUT); // decode and process mdsmap->decode(m->get_encoded()); @@ -571,21 +577,17 @@ void MDS::handle_mds_map(MMDSMap *m) } } + // we need to make sure clients find out about (new) mds addresses. + // clients don't care about mds state. + bool share_with_clients = false; + // REJOIN // is everybody finally rejoining? if (is_rejoin() || is_active() || is_stopping()) { // did we start? 
if (!wasrejoining && mdsmap->is_rejoining()) { mdcache->send_cache_rejoins(); - - // share the map with mounted clients - dout(10) << "sharing mdsmap with mounted clients" << endl; - for (set::const_iterator p = clientmap.get_mount_set().begin(); - p != clientmap.get_mount_set().end(); - ++p) { - messenger->send_message(new MMDSMap(mdsmap), - clientmap.get_inst(*p)); - } + share_with_clients = true; } // did we finish? if (wasrejoining && !mdsmap->is_rejoining()) { @@ -616,6 +618,23 @@ void MDS::handle_mds_map(MMDSMap *m) } } + // inst set changed? + if (mdsmap->get_same_inst_since() > last_client_mdsmap_bcast) + share_with_clients = true; + + // share map with clients? + if (share_with_clients) { + // share the map with mounted clients + dout(10) << "sharing mdsmap with mounted clients" << endl; + for (set::const_iterator p = clientmap.get_mount_set().begin(); + p != clientmap.get_mount_set().end(); + ++p) { + messenger->send_message(new MMDSMap(mdsmap), + clientmap.get_inst(*p)); + } + last_client_mdsmap_bcast = mdsmap->get_epoch(); + } + delete m; } diff --git a/branches/sage/cephmds2/mds/MDS.h b/branches/sage/cephmds2/mds/MDS.h index 98d1785fd3cbe..fa2aa536cc7f3 100644 --- a/branches/sage/cephmds2/mds/MDS.h +++ b/branches/sage/cephmds2/mds/MDS.h @@ -184,6 +184,7 @@ class MDS : public Dispatcher { // -- client map -- ClientMap clientmap; + epoch_t last_client_mdsmap_bcast; void log_clientmap(Context *c); diff --git a/branches/sage/cephmds2/mds/MDSMap.h b/branches/sage/cephmds2/mds/MDSMap.h index e146b3029fdf3..f19ee448acfc2 100644 --- a/branches/sage/cephmds2/mds/MDSMap.h +++ b/branches/sage/cephmds2/mds/MDSMap.h @@ -65,7 +65,8 @@ class MDSMap { protected: epoch_t epoch; - utime_t ctime; + utime_t created; + epoch_t same_inst_since; int anchortable; // which MDS has anchortable (fixme someday) int root; // which MDS has root directory @@ -79,12 +80,13 @@ class MDSMap { friend class MDSMonitor; public: - MDSMap() : epoch(0), anchortable(0), root(0) {} + MDSMap() 
: epoch(0), same_inst_since(0), anchortable(0), root(0) {} epoch_t get_epoch() const { return epoch; } void inc_epoch() { epoch++; } - const utime_t& get_ctime() const { return ctime; } + const utime_t& get_create() const { return created; } + epoch_t get_same_inst_since() const { return same_inst_since; } int get_anchortable() const { return anchortable; } int get_root() const { return root; } @@ -249,7 +251,8 @@ class MDSMap { // serialize, unserialize void encode(bufferlist& blist) { blist.append((char*)&epoch, sizeof(epoch)); - blist.append((char*)&ctime, sizeof(ctime)); + blist.append((char*)&created, sizeof(created)); + blist.append((char*)&same_inst_since, sizeof(same_inst_since)); blist.append((char*)&anchortable, sizeof(anchortable)); blist.append((char*)&root, sizeof(root)); @@ -263,8 +266,10 @@ class MDSMap { int off = 0; blist.copy(off, sizeof(epoch), (char*)&epoch); off += sizeof(epoch); - blist.copy(off, sizeof(ctime), (char*)&ctime); - off += sizeof(ctime); + blist.copy(off, sizeof(created), (char*)&created); + off += sizeof(created); + blist.copy(off, sizeof(same_inst_since), (char*)&same_inst_since); + off += sizeof(same_inst_since); blist.copy(off, sizeof(anchortable), (char*)&anchortable); off += sizeof(anchortable); blist.copy(off, sizeof(root), (char*)&root); diff --git a/branches/sage/cephmds2/mds/mdstypes.h b/branches/sage/cephmds2/mds/mdstypes.h index 750d928a1d68d..c2457d6077154 100644 --- a/branches/sage/cephmds2/mds/mdstypes.h +++ b/branches/sage/cephmds2/mds/mdstypes.h @@ -13,6 +13,7 @@ using namespace std; #include +#include "include/frag.h" // md ops @@ -169,76 +170,8 @@ inline mds_load_t operator/( mds_load_t& a, double d ) // dir slices struct dirslice_t { - short hash_mask; - short hash_val; -}; - -/* - * hashbit - */ -class hashbit { - unsigned _value:24; - unsigned _bits:8; - - public: - hashbit() : - _value(0), _bits(0) { } - hashbit(unsigned v, unsigned b) : - _value(v), _bits(b) { } - - // accessors - unsigned value() const { 
return _value; } - unsigned bits() const { return _bits; } - unsigned mask() const { return 0xffffffff >> (32-_bits); } - operator unsigned() const { - return (_bits << 24) + _value; - } - - // tests - bool contains(hashbit sub) { - return (sub.bits() >= bits() && // they are more specific than us, - (sub.value() & mask()) == value()); // and they are contained by us. - } - - // splitting - hashbit left_half() { - return hashbit(_value, _bits+1); - } - hashbit right_half() { - return hashbit(_value | (1<<_bits), _bits+1); - } - void split(int nb, vector& fragments) { - assert(nb > 0); - unsigned nway = 1 << (nb-1); - fragments.clear(); - fragments.reserve(nway); - for (unsigned i=0; i: - // hashbit hb is split n ways. - // if child hashbit does not appear, it is not split. - map _splits; - - public: - void split(hashbit hb, int n) { - assert(_splits.count(hb) == 0); - _splits[hb] = n; - } - void merge(hashbit hb, int n) { - assert(_splits[hb] == n); - _splits.erase(hb); - } - + inodeno_t ino; + frag_t fg; }; diff --git a/branches/sage/cephmds2/mon/MDSMonitor.cc b/branches/sage/cephmds2/mon/MDSMonitor.cc index 73387f07a8df6..b12dc493d84f2 100644 --- a/branches/sage/cephmds2/mon/MDSMonitor.cc +++ b/branches/sage/cephmds2/mon/MDSMonitor.cc @@ -73,7 +73,7 @@ void MDSMonitor::election_finished() void MDSMonitor::create_initial() { mdsmap.epoch = 0; // until everyone boots - mdsmap.ctime = g_clock.now(); + mdsmap.created = g_clock.now(); mdsmap.encode(encoded_map); @@ -229,6 +229,9 @@ void MDSMonitor::handle_mds_beacon(MMDSBeacon *m) mdsmap.mds_inst[from].name = MSG_ADDR_MDS(from); mdsmap.mds_inc[from]++; + // someone (new) joined the cluster + mdsmap.same_inst_since = mdsmap.epoch+1; + // starting -> creating|starting|replay if (mdsmap.is_degraded() && !mdsmap.is_failed(from)) { @@ -276,6 +279,11 @@ void MDSMonitor::handle_mds_beacon(MMDSBeacon *m) dout(10) << "mds_beacon mds" << from << " " << MDSMap::get_state_name(mdsmap.mds_state[from]) << " -> " << 
MDSMap::get_state_name(state) << endl; + // did someone leave the cluster? + if (state == MDSMap::STATE_OUT && mdsmap.mds_state[from] != MDSMap::STATE_OUT) + mdsmap.same_inst_since = mdsmap.epoch+1; + + // change the state mdsmap.mds_state[from] = state; if (mdsmap.is_up(from)) mdsmap.mds_state_seq[from] = seq; -- 2.39.5