From e14f24e63c6607ebf44383e8ac2eb5cc5e931306 Mon Sep 17 00:00:00 2001 From: sageweil Date: Fri, 9 Mar 2007 00:44:31 +0000 Subject: [PATCH] gobs of dirfrag_t refactoring. mds readdir implementation. minor client tweaks. ripped out more old hashing stuff. git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1181 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/cephmds2/client/Client.cc | 30 +- branches/sage/cephmds2/client/Client.h | 3 +- branches/sage/cephmds2/include/frag.h | 88 +++- branches/sage/cephmds2/mds/CDir.cc | 49 +- branches/sage/cephmds2/mds/CDir.h | 31 +- branches/sage/cephmds2/mds/CInode.cc | 87 +++- branches/sage/cephmds2/mds/CInode.h | 66 ++- branches/sage/cephmds2/mds/MDCache.cc | 277 ++++++----- branches/sage/cephmds2/mds/MDCache.h | 20 +- branches/sage/cephmds2/mds/MDS.cc | 2 + branches/sage/cephmds2/mds/Migrator.cc | 239 +++++----- branches/sage/cephmds2/mds/Migrator.h | 39 +- branches/sage/cephmds2/mds/Server.cc | 440 ++++++------------ branches/sage/cephmds2/mds/Server.h | 7 +- .../sage/cephmds2/mds/events/EImportFinish.h | 12 +- .../sage/cephmds2/mds/events/EImportMap.h | 9 +- .../sage/cephmds2/mds/events/EImportStart.h | 18 +- branches/sage/cephmds2/mds/events/EMetaBlob.h | 26 +- branches/sage/cephmds2/mds/journal.cc | 60 ++- branches/sage/cephmds2/mds/mdstypes.h | 3 + .../sage/cephmds2/messages/MCacheExpire.h | 42 +- .../sage/cephmds2/messages/MClientReply.h | 64 ++- branches/sage/cephmds2/messages/MExportDir.h | 24 +- .../sage/cephmds2/messages/MExportDirAck.h | 16 +- .../cephmds2/messages/MExportDirDiscover.h | 20 +- .../cephmds2/messages/MExportDirDiscoverAck.h | 29 +- .../sage/cephmds2/messages/MExportDirNotify.h | 50 +- .../cephmds2/messages/MExportDirNotifyAck.h | 16 +- .../sage/cephmds2/messages/MExportDirPrep.h | 86 ++-- .../cephmds2/messages/MExportDirPrepAck.h | 20 +- .../sage/cephmds2/messages/MMDSImportMap.h | 10 +- 31 files changed, 952 insertions(+), 931 deletions(-) diff --git a/branches/sage/cephmds2/client/Client.cc b/branches/sage/cephmds2/client/Client.cc index ecdf82193be27..6d09efa823ef4 100644 --- a/branches/sage/cephmds2/client/Client.cc +++ b/branches/sage/cephmds2/client/Client.cc @@ -265,7 +265,6 @@ Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname) dout(12) << "insert_inode " << dname << " ino " << st->inode.ino << " size " << st->inode.size << " mtime " << st->inode.mtime - << " hashed " << st->hashed << endl; if (dn) { @@ -349,20 +348,27 @@ Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname) */ void Client::update_inode_dist(Inode *in, InodeStat *st) { - // dir info - in->dir_auth = st->dir_auth; - in->dir_hashed = st->hashed; - in->dir_replicated = st->replicated; + // auth + in->dir_auth = -1; + if (!st->dirfrag_auth.empty()) { // HACK FIXME ******* FIXME FIXME FIXME FIXME dirfrag_t + in->dir_auth = st->dirfrag_auth.begin()->second; + } + + // replicated + in->dir_replicated = false; + if (!st->dirfrag_rep.empty()) + in->dir_replicated = true; // FIXME - // dir replication - if (st->spec_defined) { - if (st->dist.empty() && !in->dir_contacts.empty()) + // dist + if (!st->dirfrag_dist.empty()) { // FIXME + set dist = st->dirfrag_dist.begin()->second; + if (dist.empty() && !in->dir_contacts.empty()) dout(9) << "lost dist spec for " << in->inode.ino - << " " << st->dist << endl; - if (!st->dist.empty() && in->dir_contacts.empty()) + << " " << dist << endl; + if (!dist.empty() && in->dir_contacts.empty()) dout(9) << "got dist spec for " << in->inode.ino - << " " << st->dist << endl; - in->dir_contacts = st->dist; + << " " << dist << endl; + in->dir_contacts = dist; } } diff --git a/branches/sage/cephmds2/client/Client.h b/branches/sage/cephmds2/client/Client.h index 0da6404b98f3a..38f44d89c13a8 100644 --- a/branches/sage/cephmds2/client/Client.h +++ b/branches/sage/cephmds2/client/Client.h @@ -452,8 +452,7 @@ protected: dn->inode = 0; in->dn = 0; put_inode(in); - assert(!in->is_dir() || in->dir == 0); - + // unlink from dir dn->dir->dentries.erase(dn->name); if (dn->dir->is_empty()) diff --git a/branches/sage/cephmds2/include/frag.h b/branches/sage/cephmds2/include/frag.h index edde910ce604e..00f424fdbdaf3 100644 --- a/branches/sage/cephmds2/include/frag.h +++ b/branches/sage/cephmds2/include/frag.h @@ -44,6 +44,8 @@ * * this is conceptually analogous to an ip address and netmask. * + * a value v falls "within" fragment f iff (v & f.mask()) == f.value(). + * * we write it as v/b, where v is a value and b is the number of bits. * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that, * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on. @@ -58,6 +60,10 @@ class frag_t { public: frag_t() : _enc(0) { } frag_t(unsigned v, unsigned b) : _enc((b << 24) + v) { } + frag_t(unsigned e) : _enc(e) { } + + // constructors + void from_unsigned(unsigned e) { _enc = e; } // accessors unsigned value() const { return _enc & 0xffffff; } @@ -66,6 +72,9 @@ class frag_t { operator unsigned() const { return _enc; } // tests + bool contains(unsigned v) const { + return (v & mask()) == value(); + } bool contains(frag_t sub) const { return (sub.bits() >= bits() && // they are more specific than us, (sub.value() & mask()) == value()); // and they are contained by us. @@ -85,17 +94,17 @@ class frag_t { frag_t right_half() const { return frag_t(value() | (1<& frag_tments) const { + void split(int nb, list& fragments) const { assert(nb > 0); unsigned nway = 1 << (nb-1); for (unsigned i=0; i::const_iterator p = _splits.find(hb); + if (p == _splits.end()) return 0; + else + return p->second; + } + void get_leaves(list& ls) const { + list q; + q.push_back(frag_t()); + while (!q.empty()) { + frag_t t = q.front(); + q.pop_front(); + int nb = get_split(t); + if (nb) + t.split(nb, q); // queue up children + else + ls.push_back(t); // not spit, it's a leaf. + } + } + + frag_t operator[](unsigned v) const { + frag_t t; + while (1) { + assert(t.contains(v)); + int nb = get_split(t); + + // is this a leaf? + if (nb == 0) return t; // done. + + // pick appropriate child fragment. + unsigned nway = 1 << (nb-1); + unsigned i; + for (i=0; i copy; std::list q; q.push_back(frag_t()); @@ -156,4 +205,27 @@ class fragtree_t { } }; +inline ostream& operator<<(ostream& out, fragtree_t& ft) +{ + out << "fragtree_t("; + + bool first = true; + list q; + q.push_back(frag_t()); + while (!q.empty()) { + frag_t t = q.front(); + q.pop_front(); + int nb = ft.get_split(t); + if (nb) { + if (first) + first = false; + else + out << ' '; + out << t << '%' << nb; + t.split(nb, q); // queue up children + } + } + return out << ")"; +} + #endif diff --git a/branches/sage/cephmds2/mds/CDir.cc b/branches/sage/cephmds2/mds/CDir.cc index 77a431649f548..da95e4f6c6799 100644 --- a/branches/sage/cephmds2/mds/CDir.cc +++ b/branches/sage/cephmds2/mds/CDir.cc @@ -97,9 +97,10 @@ ostream& operator<<(ostream& out, CDir& dir) // ------------------------------------------------------------------- // CDir -CDir::CDir(CInode *in, MDCache *mdcache, bool auth) +CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) { inode = in; + frag = fg; this->cache = mdcache; nitems = 0; @@ -277,8 +278,8 @@ void CDir::link_inode_work( CDentry *dn, CInode *in ) nitems++; // adjust dir size - // set dir version - in->inode.version = dn->get_version(); + // set inode version + //in->inode.version = dn->get_version(); // clear dangling in->state_clear(CInode::STATE_DANGLING); @@ -564,13 +565,12 @@ void CDir::last_put() class C_Dir_Fetch : public Context { protected: CDir *dir; - off_t offset; public: bufferlist bl; - C_Dir_Fetch(CDir *d, off_t o=0) : dir(d), offset(o) { } + C_Dir_Fetch(CDir *d) : dir(d) { } void finish(int result) { - dir->_fetch_dir_read(offset, bl); + dir->_fetched(bl); } }; @@ -591,22 +591,22 @@ void CDir::fetch(Context *c) if (cache->mds->logger) cache->mds->logger->inc("fdir"); // start by reading the first hunk of it - C_Dir_Fetch *fin = new C_Dir_Fetch(this, 0); + C_Dir_Fetch *fin = new C_Dir_Fetch(this); cache->mds->objecter->read( get_ondisk_object(), 0, 0, // whole object &fin->bl, fin ); } -void CDir::_fetch_dir_read(off_t read_off, bufferlist &bl) +void CDir::_fetched(bufferlist &bl) { - dout(10) << "_fetch_dir_read " << read_off << "~" << bl.length() + dout(10) << "_fetched " << 0 << "~" << bl.length() << " on " << *this << endl; // give up? if (!is_auth() || is_frozen()) { - dout(10) << "_fetch_dir_read canceling (!auth or frozen)" << endl; + dout(10) << "_fetched canceling (!auth or frozen)" << endl; //ondisk_bl.clear(); //ondisk_size = 0; @@ -617,7 +617,6 @@ void CDir::_fetch_dir_read(off_t read_off, bufferlist &bl) // add to our buffer size_t ondisk_size; - assert(read_off == 0); // for now. assert(bl.length() > sizeof(ondisk_size)); bl.copy(0, sizeof(ondisk_size), (char*)&ondisk_size); off_t have = bl.length() - sizeof(ondisk_size); @@ -635,7 +634,7 @@ void CDir::_fetch_dir_read(off_t read_off, bufferlist &bl) bl.copy(off, sizeof(got_version), (char*)&got_version); off += sizeof(got_version); - dout(10) << "_fetch_dir_read " << num_dn << " dn, got_version " << got_version + dout(10) << "_fetched " << num_dn << " dn, got_version " << got_version << ", " << ondisk_size << " bytes" << endl; @@ -725,19 +724,41 @@ void CDir::_fetch_dir_read(off_t read_off, bufferlist &bl) assert(0); } - // clean underwater item? - if (dn && + /** clean underwater item? + * Underwater item is something that is dirty in our cache from + * journal replay, but was previously flushed to disk before the + * mds failed. + * + * We only do this is committed_version == 0. that implies either + * - this is a fetch after from a clean/empty CDir is created + * (and has no effect, since the dn won't exist); or + * - this is a fetch after _recovery_, which is what we're worried + * about. Items that are marked dirty from the journal should be + * marked clean if they appear on disk. + */ + if (committed_version == 0 && + dn && dn->get_version() <= got_version && dn->is_dirty()) { dout(10) << "readdir had underwater dentry " << *dn << ", marking clean" << endl; dn->mark_clean(); + if (dn->get_inode()) { assert(dn->get_inode()->get_version() <= got_version); + dout(10) << "readdir had underwater inode " << *dn->get_inode() << ", marking clean" << endl; dn->get_inode()->mark_clean(); } } } + // take the loaded version? + // only if we are a fresh CDir* with no prior state. + if (version == 0) { + assert(projected_version == 0); + assert(!state_test(STATE_COMMITTING)); + projected_version = version = committing_version = committed_version = got_version; + } + // mark complete, !fetching state_set(STATE_COMPLETE); state_clear(STATE_FETCHING); diff --git a/branches/sage/cephmds2/mds/CDir.h b/branches/sage/cephmds2/mds/CDir.h index 2f3b49d2c6bc5..467eea5b005de 100644 --- a/branches/sage/cephmds2/mds/CDir.h +++ b/branches/sage/cephmds2/mds/CDir.h @@ -48,8 +48,9 @@ class Context; // >= 0 is the auth mds #define CDIR_AUTH_PARENT -1 // default #define CDIR_AUTH_UNKNOWN -2 -#define CDIR_AUTH_DEFAULT pair(-1, -2) -#define CDIR_AUTH_UNDEF pair(-2, -2) +#define CDIR_AUTH_DEFAULT pair(-1, -2) +#define CDIR_AUTH_UNDEF pair(-2, -2) +#define CDIR_AUTH_ROOTINODE pair( 0, -2) ostream& operator<<(ostream& out, class CDir& dir); @@ -246,7 +247,7 @@ class CDir : public MDSCacheObject { friend class CDirExport; public: - CDir(CInode *in, MDCache *mdcache, bool auth); + CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth); @@ -368,7 +369,7 @@ class CDir : public MDSCacheObject { } void fetch(Context *c); - void _fetch_dir_read(off_t off, bufferlist &bl); + void _fetched(bufferlist &bl); // -- commit -- map > waiting_for_commit; @@ -498,7 +499,7 @@ class CDir : public MDSCacheObject { // discover class CDirDiscover { - inodeno_t ino; + dirfrag_t dirfrag; int nonce; int dir_auth; int dir_rep; @@ -507,7 +508,7 @@ class CDirDiscover { public: CDirDiscover() {} CDirDiscover(CDir *dir, int nonce) { - ino = dir->ino(); + dirfrag = dir->dirfrag(); this->nonce = nonce; //dir_auth = dir->dir_auth.first; dir_rep = dir->dir_rep; @@ -515,7 +516,7 @@ class CDirDiscover { } void update_dir(CDir *dir) { - assert(dir->ino() == ino); + assert(dir->dirfrag() == dirfrag); assert(!dir->is_auth()); dir->replica_nonce = nonce; @@ -524,11 +525,11 @@ class CDirDiscover { dir->dir_rep_by = rep_by; } - inodeno_t get_ino() { return ino; } + dirfrag_t get_dirfrag() { return dirfrag; } void _encode(bufferlist& bl) { - bl.append((char*)&ino, sizeof(ino)); + bl.append((char*)&dirfrag, sizeof(dirfrag)); bl.append((char*)&nonce, sizeof(nonce)); bl.append((char*)&dir_auth, sizeof(dir_auth)); bl.append((char*)&dir_rep, sizeof(dir_rep)); @@ -536,8 +537,8 @@ class CDirDiscover { } void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); + bl.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); bl.copy(off, sizeof(nonce), (char*)&nonce); off += sizeof(nonce); bl.copy(off, sizeof(dir_auth), (char*)&dir_auth); @@ -554,7 +555,7 @@ class CDirDiscover { class CDirExport { struct { - inodeno_t ino; + dirfrag_t dirfrag; long nitems; // actual real entries long nden; // num dentries (including null ones) version_t version; @@ -573,7 +574,7 @@ class CDirExport { assert(dir->get_version() == dir->get_projected_version()); - st.ino = dir->ino(); + st.dirfrag = dir->dirfrag(); st.nitems = dir->nitems; st.nden = dir->items.size(); st.version = dir->version; @@ -589,11 +590,11 @@ class CDirExport { replicas = dir->replicas; } - inodeno_t get_ino() { return st.ino; } + dirfrag_t get_dirfrag() { return st.dirfrag; } __uint64_t get_nden() { return st.nden; } void update_dir(CDir *dir) { - assert(dir->ino() == st.ino); + assert(dir->dirfrag() == st.dirfrag); //dir->nitems = st.nitems; diff --git a/branches/sage/cephmds2/mds/CInode.cc b/branches/sage/cephmds2/mds/CInode.cc index f0cc483d730e1..c1951cd6d972f 100644 --- a/branches/sage/cephmds2/mds/CInode.cc +++ b/branches/sage/cephmds2/mds/CInode.cc @@ -89,7 +89,7 @@ CInode::CInode(MDCache *c, bool auth) { //num_parents = 0; parent = NULL; - dir = NULL; // CDir opened separately + dir = NULL; // deprecated auth_pins = 0; nested_auth_pins = 0; @@ -101,12 +101,27 @@ CInode::CInode(MDCache *c, bool auth) { } CInode::~CInode() { - if (dir) { delete dir; dir = 0; } + if (dir) + delete dir; + + for (map::iterator p = dirfrags.begin(); + p != dirfrags.end(); + ++p) + delete p->second; } // dirfrags +frag_t CInode::pick_dirfrag(const string& dn) +{ + if (dirfragtree.empty()) + return frag_t(); // avoid the string hash if we can. + + static hash H; + return dirfragtree[H(dn)]; +} + // new interface for old way void CInode::get_dirfrags(list& ls) { @@ -124,7 +139,7 @@ void CInode::get_subtree_dirfrags(list& ls) ls.push_back(dir); } -/* new +/* new way void CInode::get_dirfrags(list& ls) { for (map::iterator p = dirfrags.begin(); @@ -220,19 +235,56 @@ bool CInode::dir_is_auth() { CDir *CInode::get_or_open_dir(MDCache *mdcache) { - assert(is_dir()); - - if (dir) return dir; + return get_or_open_dirfrag(mdcache, frag_t()); +} - // can't open a dir if we're frozen_dir, bc of hashing stuff. - assert(!is_frozen_dir()); +CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg) +{ + assert(is_dir()); + if (1) { // old + if (!dir) + dir = new CDir(this, fg, mdcache, true); + return dir; + } else { // new + // have it? + CDir *dir = get_dirfrag(fg); + if (dir) return dir; + + // create it. + assert(is_auth()); + dirfrags[fg] = new CDir(this, fg, mdcache, true); + return dir; + } +} - // only auth can open dir alone. - assert(is_auth()); - set_dir( new CDir(this, mdcache, true) ); +CDir *CInode::add_dirfrag(CDir *dir) +{ + if (1) { // old + assert(!this->dir); + this->dir = dir; + } else { + assert(dirfrags.count(dir->dirfrag().frag) == 0); + dirfrags[dir->dirfrag().frag] = dir; + } return dir; } +void CInode::close_dirfrag(frag_t fg) +{ + if (1) { // old + assert(dir); + assert(dir->get_num_ref() == 0); + delete dir; + dir = 0; + } else { // new + assert(dirfrags.count(fg)); + assert(dirfrags[fg]->get_num_ref() == 0); + delete dirfrags[fg]; + dirfrags.erase(fg); + } +} + +/* CDir *CInode::set_dir(CDir *newdir) { assert(dir == 0); @@ -247,6 +299,7 @@ void CInode::close_dir() delete dir; dir = 0; } +*/ void CInode::set_auth(bool a) @@ -521,17 +574,11 @@ pair CInode::authority() if (is_dangling()) return dangling_auth; // explicit - if (is_root()) { // i am root - if (dir) - return dir->get_dir_auth(); // bit of a chicken/egg issue here! - else { - return CDIR_AUTH_UNDEF; - } - } + if (is_root()) + return CDIR_AUTH_ROOTINODE; // root _inode_ is locked to mds0. - // this is useless if we hose the hashing crap. if (parent) - return parent->dir->dentry_authority( parent->name ); + return parent->dir->authority(); return CDIR_AUTH_UNDEF; } diff --git a/branches/sage/cephmds2/mds/CInode.h b/branches/sage/cephmds2/mds/CInode.h index 223524004b77f..269e25b403384 100644 --- a/branches/sage/cephmds2/mds/CInode.h +++ b/branches/sage/cephmds2/mds/CInode.h @@ -139,10 +139,11 @@ class CInode : public MDSCacheObject { MDCache *mdcache; // inode contents proper - inode_t inode; // the inode itself - string symlink; // symlink dest, if symlink + inode_t inode; // the inode itself + string symlink; // symlink dest, if symlink fragtree_t dirfragtree; // dir frag tree, if any + frag_t pick_dirfrag(const string &dn); // -- cache infrastructure -- // old way, deprecate me! @@ -151,13 +152,21 @@ class CInode : public MDSCacheObject { map dirfrags; // cached dir fragments CDir* get_dirfrag(frag_t fg) { - // old way - assert(fg == 0); - return dir; + if (1) // old + return dir; + else { // new + if (dirfrags.count(fg)) + return dirfrags[fg]; + else + return 0; + } } void get_dirfrags(list& ls); void get_nested_dirfrags(list& ls); void get_subtree_dirfrags(list& ls); + CDir *get_or_open_dirfrag(MDCache *mdcache, frag_t fg); + CDir *add_dirfrag(CDir *dir); + void close_dirfrag(frag_t fg); protected: // parent dentries in cache @@ -228,14 +237,14 @@ protected: CDentry* get_parent_dn() { return parent; } CDir *get_parent_dir(); CInode *get_parent_inode(); - CInode *get_realm_root(); // import, hash, or root - - CDir *get_or_open_dir(MDCache *mdcache); - CDir *set_dir(CDir *newdir); - void close_dir(); - bool dir_is_auth(); + CDir *get_or_open_dir(MDCache *mdcache); // deprecated + //CDir *set_dir(CDir *newdir); // deprecated + //void close_dir(); // deprecated + + bool dir_is_auth(); // FIXME deprecate me + // -- misc -- @@ -490,6 +499,9 @@ public: class CInodeDiscover { inode_t inode; + string symlink; + fragtree_t dirfragtree; + int replica_nonce; int hardlock_state; @@ -499,6 +511,9 @@ class CInodeDiscover { CInodeDiscover() {} CInodeDiscover(CInode *in, int nonce) { inode = in->inode; + symlink = in->symlink; + dirfragtree = in->dirfragtree; + replica_nonce = nonce; hardlock_state = in->hardlock.get_replica_state(); @@ -510,6 +525,8 @@ class CInodeDiscover { void update_inode(CInode *in) { in->inode = inode; + in->symlink = symlink; + in->dirfragtree = dirfragtree; in->replica_nonce = replica_nonce; in->hardlock.set_state(hardlock_state); @@ -518,16 +535,20 @@ class CInodeDiscover { void _encode(bufferlist& bl) { bl.append((char*)&inode, sizeof(inode)); + ::_encode(symlink, bl); + dirfragtree._encode(bl); bl.append((char*)&replica_nonce, sizeof(replica_nonce)); bl.append((char*)&hardlock_state, sizeof(hardlock_state)); bl.append((char*)&filelock_state, sizeof(filelock_state)); } void _decode(bufferlist& bl, int& off) { - bl.copy(off,sizeof(inode_t), (char*)&inode); - off += sizeof(inode_t); - bl.copy(off, sizeof(int), (char*)&replica_nonce); - off += sizeof(int); + bl.copy(off,sizeof(inode), (char*)&inode); + off += sizeof(inode); + ::_decode(symlink, bl, off); + dirfragtree._decode(bl, off); + bl.copy(off, sizeof(replica_nonce), (char*)&replica_nonce); + off += sizeof(replica_nonce); bl.copy(off, sizeof(hardlock_state), (char*)&hardlock_state); off += sizeof(hardlock_state); bl.copy(off, sizeof(filelock_state), (char*)&filelock_state); @@ -543,6 +564,7 @@ class CInodeExport { struct { inode_t inode; + meta_load_t popularity_justme; meta_load_t popularity_curdom; bool is_dirty; // dirty inode? @@ -550,6 +572,9 @@ class CInodeExport { int num_caps; } st; + string symlink; + fragtree_t dirfragtree; + map replicas; map cap_map; @@ -560,6 +585,9 @@ public: CInodeExport() {} CInodeExport(CInode *in) { st.inode = in->inode; + symlink = in->symlink; + dirfragtree = in->dirfragtree; + st.is_dirty = in->is_dirty(); replicas = in->replicas; @@ -582,6 +610,8 @@ public: void update_inode(CInode *in, set& new_client_caps) { in->inode = st.inode; + in->symlink = symlink; + in->dirfragtree = dirfragtree; in->popularity[MDS_POP_JUSTME] += st.popularity_justme; in->popularity[MDS_POP_CURDOM] += st.popularity_curdom; @@ -605,7 +635,9 @@ public: void _encode(bufferlist& bl) { st.num_caps = cap_map.size(); bl.append((char*)&st, sizeof(st)); - + ::_encode(symlink, bl); + dirfragtree._encode(bl); + // cached_by + nonce ::_encode(replicas, bl); @@ -624,6 +656,8 @@ public: int _decode(bufferlist& bl, int off = 0) { bl.copy(off, sizeof(st), (char*)&st); off += sizeof(st); + ::_decode(symlink, bl, off); + dirfragtree._decode(bl, off); ::_decode(replicas, bl, off); diff --git a/branches/sage/cephmds2/mds/MDCache.cc b/branches/sage/cephmds2/mds/MDCache.cc index 0a6ededd23fdc..3046524765cd6 100644 --- a/branches/sage/cephmds2/mds/MDCache.cc +++ b/branches/sage/cephmds2/mds/MDCache.cc @@ -222,10 +222,9 @@ int MDCache::open_root(Context *c) CInode *root = create_root_inode(); // root directory too - assert(root->dir == NULL); - root->set_dir(new CDir(root, this, true)); - adjust_subtree_auth(root->dir, 0); - root->dir->dir_rep = CDir::REP_ALL; //NONE; + CDir *dir = root->get_or_open_dirfrag(this, frag_t()); + adjust_subtree_auth(dir, 0); + dir->dir_rep = CDir::REP_ALL; //NONE; show_subtrees(); @@ -522,19 +521,19 @@ void MDCache::adjust_bounded_subtree_auth(CDir *dir, set& bounds, pair& bound_inos, pair auth) +void MDCache::adjust_bounded_subtree_auth(CDir *dir, list& bound_dfs, pair auth) { dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth << " on " << *dir - << " bound_inos " << bound_inos + << " bound_dfs " << bound_dfs << endl; // make bounds list set bounds; - for (list::iterator p = bound_inos.begin(); - p != bound_inos.end(); + for (list::iterator p = bound_dfs.begin(); + p != bound_dfs.end(); ++p) { - CDir *bd = get_dir(*p); + CDir *bd = get_dirfrag(*p); if (bd) bounds.insert(bd); } @@ -623,19 +622,17 @@ void MDCache::verify_subtree_bounds(CDir *dir, const set& bounds) assert(bounds == subtrees[dir]); } -void MDCache::verify_subtree_bounds(CDir *dir, const list& bounds) +void MDCache::verify_subtree_bounds(CDir *dir, const list& bounds) { // for debugging only. assert(subtrees.count(dir)); // make sure that any bounds i do have are properly noted as such. int failed = 0; - for (list::const_iterator p = bounds.begin(); + for (list::const_iterator p = bounds.begin(); p != bounds.end(); ++p) { - CInode *bdi = get_inode(*p); - if (!bdi) continue; - CDir *bd = bdi->dir; + CDir *bd = get_dirfrag(*p); if (!bd) continue; if (subtrees[dir].count(bd) == 0) { dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << endl; @@ -744,7 +741,7 @@ void MDCache::log_import_map(Context *onsync) CDir *dir = p->first; if (!dir->is_auth()) continue; - le->imports.insert(dir->ino()); + le->imports.insert(dir->dirfrag()); le->metablob.add_dir_context(dir, true); le->metablob.add_dir(dir, false); @@ -753,7 +750,7 @@ void MDCache::log_import_map(Context *onsync) q != p->second.end(); ++q) { CDir *bound = *q; - le->bounds[dir->ino()].insert(bound->ino()); + le->bounds[dir->dirfrag()].insert(bound->dirfrag()); le->metablob.add_dir_context(bound); le->metablob.add_dir(bound, false); } @@ -818,26 +815,26 @@ void MDCache::send_import_map_now(int who) if (dir->authority().first != mds->get_nodeid()) continue; - if (migrator->is_importing(dir->ino())) { + if (migrator->is_importing(dir->dirfrag())) { // ambiguous (mid-import) - m->add_ambiguous_import(dir->ino(), - migrator->get_import_bound_inos(dir->ino())); + m->add_ambiguous_import(dir->dirfrag(), + migrator->get_import_bound_inos(dir->dirfrag())); } else { // not ambiguous. - m->add_import(dir->ino()); + m->add_import(dir->dirfrag()); // bounds too for (set::iterator q = subtrees[dir].begin(); q != subtrees[dir].end(); ++q) { CDir *bound = *q; - m->add_import_export(dir->ino(), bound->ino()); + m->add_import_export(dir->dirfrag(), bound->dirfrag()); } } } // ambiguous - for (map >::iterator p = my_ambiguous_imports.begin(); + for (map >::iterator p = my_ambiguous_imports.begin(); p != my_ambiguous_imports.end(); ++p) m->add_ambiguous_import(p->first, p->second); @@ -950,9 +947,15 @@ void MDCache::handle_mds_recovery(int who) dn->get_inode()->take_waiting(CInode::WAIT_ANY, waiters); // recurse? - if (dn->get_inode()->dir && - !dn->get_inode()->dir->is_subtree_root()) - q.push_back(dn->get_inode()->dir); + list ls; + dn->get_inode()->get_dirfrags(ls); + for (list::iterator p = ls.begin(); + p != ls.end(); + ++p) { + CDir *subdir = *p; + if (!subdir->is_subtree_root()) + q.push_back(subdir); + } } } } @@ -981,10 +984,10 @@ void MDCache::handle_import_map(MMDSImportMap *m) int from = m->get_source().num(); // update my dir_auth values - for (map >::iterator pi = m->imap.begin(); + for (map >::iterator pi = m->imap.begin(); pi != m->imap.end(); ++pi) { - CDir *im = get_dir(pi->first); + CDir *im = get_dirfrag(pi->first); if (im) { adjust_bounded_subtree_auth(im, pi->second, from); try_subtree_merge(im); @@ -994,16 +997,16 @@ void MDCache::handle_import_map(MMDSImportMap *m) // am i a surviving ambiguous importer? if (mds->is_active() || mds->is_stopping()) { // check for any import success/failure (from this node) - map >::iterator p = my_ambiguous_imports.begin(); + map >::iterator p = my_ambiguous_imports.begin(); while (p != my_ambiguous_imports.end()) { - map >::iterator n = p; + map >::iterator n = p; n++; - CDir *dir = get_dir(p->first); + CDir *dir = get_dirfrag(p->first); assert(dir); dout(10) << "checking ambiguous import " << *dir << endl; - assert(migrator->is_importing(dir->ino())); - assert(migrator->get_import_state(dir->ino()) == Migrator::IMPORT_ACKING); - if (migrator->get_import_peer(dir->ino()) == from) { + assert(migrator->is_importing(dir->dirfrag())); + assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING); + if (migrator->get_import_peer(dir->dirfrag()) == from) { if (dir->auth_is_ambiguous()) { dout(7) << "ambiguous import succeeded on " << *dir << endl; migrator->import_finish(dir, true); // don't wait for log flush @@ -1023,7 +1026,7 @@ void MDCache::handle_import_map(MMDSImportMap *m) // recovering? if (!mds->is_rejoin() && !mds->is_active() && !mds->is_stopping()) { // note ambiguous imports too.. unless i'm already active - for (map >::iterator pi = m->ambiguous_imap.begin(); + for (map >::iterator pi = m->ambiguous_imap.begin(); pi != m->ambiguous_imap.end(); ++pi) { dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << endl; @@ -1059,17 +1062,17 @@ void MDCache::disambiguate_imports() // FIXME what about surviving bystanders // other nodes' ambiguous imports - for (map > >::iterator p = other_ambiguous_imports.begin(); + for (map > >::iterator p = other_ambiguous_imports.begin(); p != other_ambiguous_imports.end(); ++p) { int who = p->first; dout(10) << "ambiguous imports for mds" << who << endl; - for (map >::iterator q = p->second.begin(); + for (map >::iterator q = p->second.begin(); q != p->second.end(); ++q) { dout(10) << " ambiguous import " << q->first << " bounds " << q->second << endl; - CDir *dir = get_dir(q->first); + CDir *dir = get_dirfrag(q->first); if (!dir) continue; if (dir->authority().first == CDIR_AUTH_UNKNOWN) { @@ -1085,9 +1088,9 @@ void MDCache::disambiguate_imports() // my ambiguous imports while (!my_ambiguous_imports.empty()) { - map >::iterator q = my_ambiguous_imports.begin(); + map >::iterator q = my_ambiguous_imports.begin(); - CDir *dir = get_dir(q->first); + CDir *dir = get_dirfrag(q->first); if (!dir) continue; if (dir->authority().first != CDIR_AUTH_UNKNOWN) { @@ -1104,7 +1107,7 @@ void MDCache::disambiguate_imports() } -void MDCache::add_ambiguous_import(inodeno_t base, list& bounds) +void MDCache::add_ambiguous_import(dirfrag_t base, list& bounds) { assert(my_ambiguous_imports.count(base) == 0); my_ambiguous_imports[base].swap( bounds ); @@ -1114,39 +1117,39 @@ void MDCache::add_ambiguous_import(inodeno_t base, list& bounds) void MDCache::add_ambiguous_import(CDir *base, const set& bounds) { // make a list - list binos; + list binos; for (set::iterator p = bounds.begin(); p != bounds.end(); ++p) - binos.push_back((*p)->ino()); + binos.push_back((*p)->dirfrag()); // note: this can get called twice if the exporter fails during recovery - if (my_ambiguous_imports.count(base->ino())) - my_ambiguous_imports.erase(base->ino()); + if (my_ambiguous_imports.count(base->dirfrag())) + my_ambiguous_imports.erase(base->dirfrag()); - add_ambiguous_import(base->ino(), binos); + add_ambiguous_import(base->dirfrag(), binos); } -void MDCache::cancel_ambiguous_import(inodeno_t dirino) +void MDCache::cancel_ambiguous_import(dirfrag_t df) { - assert(my_ambiguous_imports.count(dirino)); - dout(10) << "cancel_ambiguous_import " << dirino - << " bounds " << my_ambiguous_imports[dirino] + assert(my_ambiguous_imports.count(df)); + dout(10) << "cancel_ambiguous_import " << df + << " bounds " << my_ambiguous_imports[df] << endl; - my_ambiguous_imports.erase(dirino); + my_ambiguous_imports.erase(df); } -void MDCache::finish_ambiguous_import(inodeno_t dirino) +void MDCache::finish_ambiguous_import(dirfrag_t df) { - assert(my_ambiguous_imports.count(dirino)); - list bound_inos; - bound_inos.swap(my_ambiguous_imports[dirino]); - my_ambiguous_imports.erase(dirino); + assert(my_ambiguous_imports.count(df)); + list bound_inos; + bound_inos.swap(my_ambiguous_imports[df]); + my_ambiguous_imports.erase(df); - dout(10) << "finish_ambiguous_import " << dirino + dout(10) << "finish_ambiguous_import " << df << " bounds " << bound_inos << endl; - CDir *dir = get_dir(dirino); + CDir *dir = get_dirfrag(df); assert(dir); // adjust dir_auth, import maps @@ -1185,13 +1188,17 @@ void MDCache::recalc_auth_bits() } } - if (in->dir) { - if (in->dir->authority().first == mds->get_nodeid()) - in->dir->state_set(CDir::STATE_AUTH); + list ls; + for (list::iterator p = ls.begin(); + p != ls.end(); + ++p) { + CDir *dir = *p; + if (dir->authority().first == mds->get_nodeid()) + dir->state_set(CDir::STATE_AUTH); else { - in->dir->state_clear(CDir::STATE_AUTH); - if (in->dir->is_dirty()) - in->dir->mark_clean(); + dir->state_clear(CDir::STATE_AUTH); + if (dir->is_dirty()) + dir->mark_clean(); } } } @@ -1602,7 +1609,7 @@ bool MDCache::trim(int max) assert(a != mds->get_nodeid()); if (expiremap.count(a) == 0) expiremap[a] = new MCacheExpire(mds->get_nodeid()); - expiremap[a]->add_dentry(con->ino(), dir->ino(), dn->get_name(), dn->get_replica_nonce()); + expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->get_replica_nonce()); } } @@ -1615,7 +1622,7 @@ bool MDCache::trim(int max) // expire the inode, too. CInode *in = dn->get_inode(); assert(in); - trim_inode(dn, in, con->ino(), expiremap); + trim_inode(dn, in, con->dirfrag(), expiremap); } else { assert(dn->is_null()); @@ -1635,11 +1642,20 @@ bool MDCache::trim(int max) // troot inode+dir? if (max == 0 && // only if we're trimming everything! lru.lru_get_size() == 0 && - root && - root->get_num_ref() == 0 && - root->dir && - root->dir->get_num_ref() == 0) - trim_inode(0, root, 1, expiremap); + root) { + // root dirfrags? + list ls; + root->get_dirfrags(ls); + for (list::iterator p = ls.begin(); + p != ls.end(); + ++p) + if ((*p)->get_num_ref() == 0) + trim_dirfrag(*p, (*p)->dirfrag(), expiremap); + + // root inode? + if (root->get_num_ref() == 0) + trim_inode(0, root, dirfrag_t(1,frag_t()), expiremap); // hrm, FIXME + } // send expires for (map::iterator it = expiremap.begin(); @@ -1652,47 +1668,54 @@ bool MDCache::trim(int max) return true; } -void MDCache::trim_inode(CDentry *dn, CInode *in, inodeno_t conino, map& expiremap) +void MDCache::trim_dirfrag(CDir *dir, dirfrag_t condf, map& expiremap) { - assert(in->get_num_ref() == 0); + assert(dir->get_num_ref() == 0); + + CInode *in = dir->get_inode(); + + if (!dir->is_auth()) { + pair dirauth = dir->authority(); + assert(dirauth.second < 100); // hack die bug die - // DIR - pair dirauth = CDIR_AUTH_UNDEF; - if (in->dir) { - if (!in->dir->is_auth()) { - dirauth = in->dir->authority(); - assert(dirauth.second < 100); // hack die bug die - - // was this an auth delegation? (if so, slightly modified container) - inodeno_t dconino = conino; - if (in->dir->is_subtree_root()) { - dout(12) << " this is a subtree, removing from map, container is " << *in->dir << endl; - dconino = in->ino(); - } + // was this an auth delegation? (if so, slightly modified container) + dirfrag_t dcondf = condf; + if (dir->is_subtree_root()) { + dout(12) << " this is a subtree, removing from map, container is " << *dir << endl; + dcondf = dir->dirfrag(); + } - for (int a=dirauth.first; - a != dirauth.second && dirauth.second >= 0 && dirauth.second != mds->get_nodeid(); - a=dirauth.second) { - dout(12) << " sending expire to mds" << a << " on " << *in->dir << endl; - assert(a != mds->get_nodeid()); - if (expiremap.count(a) == 0) - expiremap[a] = new MCacheExpire(mds->get_nodeid()); - expiremap[a]->add_dir(dconino, in->ino(), in->dir->replica_nonce); - } + for (int a=dirauth.first; + a != dirauth.second && dirauth.second >= 0 && dirauth.second != mds->get_nodeid(); + a=dirauth.second) { + dout(12) << " sending expire to mds" << a << " on " << *in->dir << endl; + assert(a != mds->get_nodeid()); + if (expiremap.count(a) == 0) + expiremap[a] = new MCacheExpire(mds->get_nodeid()); + expiremap[a]->add_dir(dcondf, dir->dirfrag(), dir->replica_nonce); } - - if (in->dir->is_subtree_root()) - remove_subtree(in->dir); // remove from subtree map - in->close_dir(); } + if (dir->is_subtree_root()) + remove_subtree(dir); // remove from subtree map + in->close_dirfrag(dir->dirfrag().frag); +} + +void MDCache::trim_inode(CDentry *dn, CInode *in, dirfrag_t condf, map& expiremap) +{ + assert(in->get_num_ref() == 0); + + // DIR + list dfls; + in->get_dirfrags(dfls); + for (list::iterator p = dfls.begin(); + p != dfls.end(); + ++p) + trim_dirfrag(*p, condf, expiremap); + // INODE if (!in->is_auth()) { pair auth = in->authority(); - if (auth.first < 0) { // e.g., root - assert(in->ino() == 1); - auth = dirauth; - } for (int a=auth.first; a != auth.second && auth.second >= 0 && auth.second != mds->get_nodeid(); @@ -1701,7 +1724,7 @@ void MDCache::trim_inode(CDentry *dn, CInode *in, inodeno_t conino, mapget_nodeid()); if (expiremap.count(a) == 0) expiremap[a] = new MCacheExpire(mds->get_nodeid()); - expiremap[a]->add_inode(conino, in->ino(), in->get_replica_nonce()); + expiremap[a]->add_inode(condf, in->ino(), in->get_replica_nonce()); } } @@ -1750,10 +1773,13 @@ void MDCache::trim_non_auth() } else if (dn->is_primary()) { CInode *in = dn->get_inode(); - if (in->dir) { - if (in->dir->is_subtree_root()) - remove_subtree(in->dir); - in->close_dir(); + list ls; + in->get_dirfrags(ls); + for (list::iterator p = ls.begin(); p != ls.end(); ++p) { + CDir *subdir = *p; + if (subdir->is_subtree_root()) + remove_subtree(subdir); + in->close_dirfrag(subdir->dirfrag().frag); } dir->unlink_inode(dn); remove_inode(in); @@ -1770,10 +1796,14 @@ void MDCache::trim_non_auth() } if (lru.lru_get_size() == 0) { - if (root->dir) { - assert(root->dir->get_num_ref() == 0); - remove_subtree(root->dir); - root->close_dir(); + list ls; + root->get_dirfrags(ls); + for (list::iterator p = ls.begin(); + p != ls.end(); + ++p) { + assert((*p)->get_num_ref() == 0); + remove_subtree((*p)); + root->close_dirfrag((*p)->dirfrag().frag); } assert(root->get_num_ref() == 0); remove_inode(root); @@ -1790,13 +1820,11 @@ void MDCache::handle_cache_expire(MCacheExpire *m) dout(7) << "cache_expire from mds" << from << endl; // loop over realms - for (map::iterator p = m->realms.begin(); + for (map::iterator p = m->realms.begin(); p != m->realms.end(); ++p) { // get container - CInode *coni = get_inode(p->first); - CDir *con = coni ? coni->dir : 0; - + CDir *con = get_dirfrag(p->first); assert(con); // we had better have this. if (!con->is_auth()) { @@ -1843,10 +1871,10 @@ void MDCache::handle_cache_expire(MCacheExpire *m) } // DIRS - for (map::iterator it = p->second.dirs.begin(); + for (map::iterator it = p->second.dirs.begin(); it != p->second.dirs.end(); it++) { - CDir *dir = get_dir(it->first); + CDir *dir = get_dirfrag(it->first); int nonce = it->second; if (!dir) { @@ -1872,11 +1900,11 @@ void MDCache::handle_cache_expire(MCacheExpire *m) } // DENTRIES - for (map >::iterator pd = p->second.dentries.begin(); + for (map >::iterator pd = p->second.dentries.begin(); pd != p->second.dentries.end(); ++pd) { dout(0) << " dn expires in dir " << pd->first << endl; - CDir *dir = get_dir(pd->first); + CDir *dir = get_dirfrag(pd->first); if (!dir) { dout(0) << " dn expires on " << pd->first << " from " << from << ", don't have it" << endl; @@ -2334,7 +2362,8 @@ int MDCache::path_traverse(filepath& origpath, return 1; } - cur->get_or_open_dir(this); + frag_t fg = cur->pick_dirfrag(path[depth]); + cur->get_or_open_dirfrag(this, fg); assert(cur->dir); } else { // discover dir from/via inode auth @@ -3376,14 +3405,14 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) dout2(7) << ", now " << *cur->dir << endl; } else { // add it (_replica_) - cur->set_dir( new CDir(cur, this, false) ); - m->get_dir(i).update_dir(cur->dir); + CDir *ndir = cur->add_dirfrag( new CDir(cur, frag_t(), this, false) ); // FIXME dirfrag_t + m->get_dir(i).update_dir(ndir); // is this a dir_auth delegation boundary? if (m->get_source().num() != cur->authority().first) - adjust_subtree_auth(cur->dir, m->get_source().num()); + adjust_subtree_auth(ndir, m->get_source().num()); - dout(7) << "added " << *cur->dir << " nonce " << cur->dir->replica_nonce << endl; + dout(7) << "added " << *ndir << " nonce " << ndir->replica_nonce << endl; // get waiters cur->take_waiting(CInode::WAIT_DIR, finished); diff --git a/branches/sage/cephmds2/mds/MDCache.h b/branches/sage/cephmds2/mds/MDCache.h index 99dbbdec71799..9b18aa09f4374 100644 --- a/branches/sage/cephmds2/mds/MDCache.h +++ b/branches/sage/cephmds2/mds/MDCache.h @@ -121,8 +121,8 @@ public: void adjust_bounded_subtree_auth(CDir *dir, set& bounds, int a) { adjust_bounded_subtree_auth(dir, bounds, pair(a, CDIR_AUTH_UNKNOWN)); } - void adjust_bounded_subtree_auth(CDir *dir, list& bounds, pair auth); - void adjust_bounded_subtree_auth(CDir *dir, list& bounds, int a) { + void adjust_bounded_subtree_auth(CDir *dir, list& bounds, pair auth); + void adjust_bounded_subtree_auth(CDir *dir, list& bounds, int a) { adjust_bounded_subtree_auth(dir, bounds, pair(a, CDIR_AUTH_UNKNOWN)); } void adjust_export_state(CDir *dir); @@ -133,7 +133,7 @@ public: void get_subtree_bounds(CDir *root, set& bounds); void get_wouldbe_subtree_bounds(CDir *root, set& bounds); void verify_subtree_bounds(CDir *root, const set& bounds); - void verify_subtree_bounds(CDir *root, const list& boundinos); + void verify_subtree_bounds(CDir *root, const list& bounds); void get_auth_subtrees(set& s); void get_fullauth_subtrees(set& s); @@ -168,9 +168,9 @@ protected: // recovery protected: // from EImportStart w/o EImportFinish during journal replay - map > my_ambiguous_imports; + map > my_ambiguous_imports; // from MMDSImportMaps - map > > other_ambiguous_imports; + map > > other_ambiguous_imports; set recovery_set; set wants_import_map; // nodes i need to send my import map to @@ -195,10 +195,10 @@ public: // ambiguous imports - void add_ambiguous_import(inodeno_t base, list& bounds); + void add_ambiguous_import(dirfrag_t base, list& bounds); void add_ambiguous_import(CDir *base, const set& bounds); - void cancel_ambiguous_import(inodeno_t dirino); - void finish_ambiguous_import(inodeno_t dirino); + void cancel_ambiguous_import(dirfrag_t dirino); + void finish_ambiguous_import(dirfrag_t dirino); @@ -242,7 +242,9 @@ public: void set_cache_size(size_t max) { lru.lru_set_max(max); } size_t get_cache_size() { return lru.lru_get_size(); } bool trim(int max = -1); // trim cache - void trim_inode(CDentry *dn, CInode *in, inodeno_t conino, + void trim_dirfrag(CDir *dir, dirfrag_t condf, + map& expiremap); + void trim_inode(CDentry *dn, CInode *in, dirfrag_t condf, map& expiremap); void trim_non_auth(); // trim out trimmable non-auth items diff --git a/branches/sage/cephmds2/mds/MDS.cc b/branches/sage/cephmds2/mds/MDS.cc index b0ae8ab2b2d91..c790369e40cfc 100644 --- a/branches/sage/cephmds2/mds/MDS.cc +++ b/branches/sage/cephmds2/mds/MDS.cc @@ -1039,6 +1039,7 @@ void MDS::my_dispatch(Message *m) // HACK to force export to test foreign renames if (false && whoami == 0) { + /* static bool didit = false; // 7 to 1 @@ -1051,6 +1052,7 @@ void MDS::my_dispatch(Message *m) didit = true; } } + */ } diff --git a/branches/sage/cephmds2/mds/Migrator.cc b/branches/sage/cephmds2/mds/Migrator.cc index 361ff2fcfb8c5..18368d7fb6135 100644 --- a/branches/sage/cephmds2/mds/Migrator.cc +++ b/branches/sage/cephmds2/mds/Migrator.cc @@ -344,27 +344,27 @@ void Migrator::handle_mds_failure(int who) // check my imports - map::iterator q = import_state.begin(); + map::iterator q = import_state.begin(); while (q != import_state.end()) { - map::iterator next = q; + map::iterator next = q; next++; - inodeno_t dirino = q->first; - CInode *diri = mds->mdcache->get_inode(dirino); - CDir *dir = mds->mdcache->get_dir(dirino); + dirfrag_t df = q->first; + CInode *diri = mds->mdcache->get_inode(df.ino); + CDir *dir = mds->mdcache->get_dirfrag(df); - if (import_peer[dirino] == who) { - switch (import_state[dirino]) { + if (import_peer[df] == who) { + switch (import_state[df]) { case IMPORT_DISCOVERED: dout(10) << "import state=discovered : unpinning inode " << *diri << endl; assert(diri); // unpin base diri->put(CInode::PIN_IMPORTING); - import_state.erase(dirino); - import_peer.erase(dirino); + import_state.erase(df); + import_peer.erase(df); break; case IMPORT_PREPPING: - if (import_state[dirino] == IMPORT_PREPPING) { + if (import_state[df] == IMPORT_PREPPING) { dout(10) << "import state=prepping : unpinning base+bounds " << *dir << endl; } assert(dir); @@ -376,7 +376,7 @@ void Migrator::handle_mds_failure(int who) assert(dir); // adjust auth back to me - cache->adjust_subtree_auth(dir, import_peer[dirino]); + cache->adjust_subtree_auth(dir, import_peer[df]); cache->try_subtree_merge(dir); // bystanders? @@ -385,7 +385,7 @@ void Migrator::handle_mds_failure(int who) } else { // notify them; wait in aborting state import_notify_abort(dir); - import_state[dirino] = IMPORT_ABORTING; + import_state[df] = IMPORT_ABORTING; } break; @@ -484,7 +484,7 @@ void Migrator::export_dir(CDir *dir, export_peer[dir] = dest; // send ExportDirDiscover (ask target) - mds->send_message_mds(new MExportDirDiscover(dir->inode), dest, MDS_PORT_MIGRATOR); + mds->send_message_mds(new MExportDirDiscover(dir), dest, MDS_PORT_MIGRATOR); dir->auth_pin(); // pin dir, to hang up our freeze (unpin on prep ack) // take away the popularity we're sending. FIXME: do this later? @@ -501,16 +501,14 @@ void Migrator::export_dir(CDir *dir, */ void Migrator::handle_export_discover_ack(MExportDirDiscoverAck *m) { - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; + CDir *dir = cache->get_dirfrag(m->get_dirfrag()); assert(dir); dout(7) << "export_discover_ack from " << m->get_source() << " on " << *dir << ", releasing auth_pin" << endl; export_state[dir] = EXPORT_FREEZING; - + dir->auth_unpin(); // unpin to allow freeze to complete delete m; // done @@ -543,7 +541,7 @@ void Migrator::export_frozen(CDir *dir, set &bounds = export_bounds[dir]; // generate prep message, log entry. - MExportDirPrep *prep = new MExportDirPrep(dir->inode); + MExportDirPrep *prep = new MExportDirPrep(dir->dirfrag()); // include list of bystanders for (map::iterator p = dir->replicas_begin(); @@ -575,7 +573,7 @@ void Migrator::export_frozen(CDir *dir, dout(7) << " export bound " << *bound << endl; - prep->add_export( bound->ino() ); + prep->add_export( bound->dirfrag() ); /* first assemble each trace, in trace order, and put in message */ list inode_trace; @@ -609,7 +607,7 @@ void Migrator::export_frozen(CDir *dir, it++) { CInode *in = *it; dout(7) << " added " << *in << endl; - prep->add_inode( in->parent->get_dir()->ino(), + prep->add_inode( in->parent->get_dir()->dirfrag(), in->parent->get_name(), in->replicate_to(dest) ); } @@ -623,9 +621,7 @@ void Migrator::export_frozen(CDir *dir, void Migrator::handle_export_prep_ack(MExportDirPrepAck *m) { - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; + CDir *dir = cache->get_dirfrag(m->get_dirfrag()); assert(dir); dout(7) << "export_prep_ack " << *dir << endl; @@ -654,10 +650,10 @@ void Migrator::handle_export_prep_ack(MExportDirPrepAck *m) //mds->send_message_mds(new MExportDirWarning(dir->ino(), export_peer[dir]), //p->first, MDS_PORT_MIGRATOR); - MExportDirNotify *notify = new MExportDirNotify(dir->ino(), true, + MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), true, pair(mds->get_nodeid(),CDIR_AUTH_UNKNOWN), pair(mds->get_nodeid(),export_peer[dir])); - notify->copy_exports(export_bounds[dir]); + notify->copy_bounds(export_bounds[dir]); mds->send_message_mds(notify, p->first, MDS_PORT_MIGRATOR); } @@ -731,7 +727,7 @@ void Migrator::export_go(CDir *dir) dest ); // send the export data! - MExportDir *req = new MExportDir(dir->ino()); + MExportDir *req = new MExportDir(dir->dirfrag()); // export state req->set_dirstate( export_data[dir] ); @@ -740,7 +736,7 @@ void Migrator::export_go(CDir *dir) for (set::iterator p = export_bounds[dir].begin(); p != export_bounds[dir].end(); ++p) - req->add_export((*p)->ino()); + req->add_export((*p)->dirfrag()); //s end mds->send_message_mds(req, dest, MDS_PORT_MIGRATOR); @@ -970,7 +966,7 @@ public: */ void Migrator::handle_export_ack(MExportDirAck *m) { - CDir *dir = cache->get_dir(m->get_ino()); + CDir *dir = cache->get_dirfrag(m->get_dirfrag()); assert(dir); assert(dir->is_frozen_tree_root()); // i'm exporting! @@ -989,7 +985,7 @@ void Migrator::handle_export_ack(MExportDirAck *m) p != export_bounds[dir].end(); ++p) { CDir *bound = *p; - le->get_bounds().insert(bound->ino()); + le->get_bounds().insert(bound->dirfrag()); le->metablob.add_dir_context(bound); le->metablob.add_dir(bound, false); } @@ -1032,7 +1028,7 @@ void Migrator::export_reverse(CDir *dir) } // re-import the metadata - list imported_subdirs; + list imported_subdirs; int num_imported_inodes = 0; for (list::iterator p = export_data[dir].begin(); @@ -1076,7 +1072,7 @@ void Migrator::export_notify_abort(CDir* dir) MExportDirNotify *notify = new MExportDirNotify(dir->ino(), false, pair(mds->get_nodeid(), export_peer[dir]), pair(mds->get_nodeid(), CDIR_AUTH_UNKNOWN)); - notify->copy_exports(export_bounds[dir]); + notify->copy_bounds(export_bounds[dir]); mds->send_message_mds(notify, *p, MDS_PORT_MIGRATOR); } } @@ -1103,16 +1099,16 @@ void Migrator::export_logged_finish(CDir *dir) MExportDirNotify *notify; if (mds->mdsmap->is_active_or_stopping(export_peer[dir])) // dest is still alive. - notify = new MExportDirNotify(dir->ino(), true, + notify = new MExportDirNotify(dir->dirfrag(), true, pair(mds->get_nodeid(), dest), pair(dest, CDIR_AUTH_UNKNOWN)); else // dest is dead. bystanders will think i am only auth, as per mdcache->handle_mds_failure() - notify = new MExportDirNotify(dir->ino(), true, + notify = new MExportDirNotify(dir->dirfrag(), true, pair(mds->get_nodeid(), CDIR_AUTH_UNKNOWN), pair(dest, CDIR_AUTH_UNKNOWN)); - notify->copy_exports(export_bounds[dir]); + notify->copy_bounds(export_bounds[dir]); mds->send_message_mds(notify, *p, MDS_PORT_MIGRATOR); } @@ -1135,9 +1131,7 @@ void Migrator::export_logged_finish(CDir *dir) */ void Migrator::handle_export_notify_ack(MExportDirNotifyAck *m) { - CInode *in = cache->get_inode(m->get_ino()); - CDir *dir = in ? in->dir : 0; - + CDir *dir = cache->get_dirfrag(m->get_dirfrag()); assert(dir); int from = m->get_source().num(); @@ -1163,7 +1157,7 @@ void Migrator::handle_export_notify_ack(MExportDirNotifyAck *m) if (export_notify_ack_waiting[dir].empty()) export_finish(dir); } - else if (import_state.count(dir->ino()) && import_state[dir->ino()] == IMPORT_ABORTING) { + else if (import_state.count(dir->dirfrag()) && import_state[dir->dirfrag()] == IMPORT_ABORTING) { // reversing import dout(7) << "handle_export_notify_ack from " << m->get_source() << ": aborting import on " @@ -1294,7 +1288,7 @@ void Migrator::handle_export_discover_2(MExportDirDiscover *m, CInode *in, int r assert(0); // this shouldn't happen if the auth pins his path properly!!!! - mds->send_message_mds(new MExportDirDiscoverAck(m->get_ino(), false), + mds->send_message_mds(new MExportDirDiscoverAck(m->get_dirfrag(), false), m->get_source().num(), MDS_PORT_MIGRATOR); delete m; return; @@ -1302,28 +1296,16 @@ void Migrator::handle_export_discover_2(MExportDirDiscover *m, CInode *in, int r assert(in->is_dir()); - - /* - if (in->is_frozen()) { - dout(7) << "frozen, waiting." << endl; - in->add_waiter(CInode::WAIT_AUTHPINNABLE, - new C_MDS_RetryMessage(mds,m)); - return; - } - - // pin auth too, until the import completes. - in->auth_pin(); - */ - // pin inode in the cache (for now) in->get(CInode::PIN_IMPORTING); - import_state[in->ino()] = IMPORT_DISCOVERED; - import_peer[in->ino()] = m->get_source().num(); + // note import state + import_state[m->get_dirfrag()] = IMPORT_DISCOVERED; + import_peer[m->get_dirfrag()] = m->get_source().num(); // reply dout(7) << " sending export_discover_ack on " << *in << endl; - mds->send_message_mds(new MExportDirDiscoverAck(in->ino()), + mds->send_message_mds(new MExportDirDiscoverAck(m->get_dirfrag()), m->get_source().num(), MDS_PORT_MIGRATOR); delete m; } @@ -1332,7 +1314,7 @@ void Migrator::handle_export_discover_2(MExportDirDiscover *m, CInode *in, int r void Migrator::handle_export_prep(MExportDirPrep *m) { - CInode *diri = cache->get_inode(m->get_ino()); + CInode *diri = cache->get_inode(m->get_dirfrag().ino); assert(diri); int oldauth = m->get_source().num(); @@ -1341,19 +1323,18 @@ void Migrator::handle_export_prep(MExportDirPrep *m) list finished; // assimilate root dir. - CDir *dir = diri->dir; + CDir *dir = cache->get_dirfrag(m->get_dirfrag()); if (dir) { dout(7) << "handle_export_prep on " << *dir << " (had dir)" << endl; if (!m->did_assim()) - m->get_dir(diri->ino())->update_dir(dir); + m->get_dirfrag(m->get_dirfrag())->update_dir(dir); } else { assert(!m->did_assim()); // open dir i'm importing. - diri->set_dir( new CDir(diri, mds->mdcache, false) ); - dir = diri->dir; - m->get_dir(diri->ino())->update_dir(dir); + dir = diri->add_dirfrag( new CDir(diri, m->get_dirfrag().frag, mds->mdcache, false) ); // FIXME + m->get_dirfrag(m->get_dirfrag())->update_dir(dir); dout(7) << "handle_export_prep on " << *dir << " (opening dir)" << endl; @@ -1373,7 +1354,7 @@ void Migrator::handle_export_prep(MExportDirPrep *m) dir->get(CDir::PIN_IMPORTING); // change import state - import_state[diri->ino()] = IMPORT_PREPPING; + import_state[dir->dirfrag()] = IMPORT_PREPPING; // bystander list import_bystanders[dir] = m->get_bystanders(); @@ -1393,44 +1374,48 @@ void Migrator::handle_export_prep(MExportDirPrep *m) (*it)->update_inode(in); // link to the containing dir - CInode *condiri = cache->get_inode( m->get_containing_dirino(in->ino()) ); - assert(condiri && condiri->dir); - cache->add_inode( in ); - condiri->dir->add_dentry( m->get_dentry(in->ino()), in ); + CDir *condir = cache->get_dirfrag( m->get_containing_dirfrag(in->ino()) ); + assert(condir); + cache->add_inode( in ); + condir->add_dentry( m->get_dentry(in->ino()), in ); dout(7) << " added " << *in << endl; } - assert( in->get_parent_dir()->ino() == m->get_containing_dirino(in->ino()) ); + assert( in->get_parent_dir()->dirfrag() == m->get_containing_dirfrag(in->ino()) ); - // dir - if (m->have_dir(in->ino())) { - if (in->dir) { - m->get_dir(in->ino())->update_dir(in->dir); - dout(7) << " updated " << *in->dir << endl; - } else { - in->set_dir( new CDir(in, mds->mdcache, false) ); - m->get_dir(in->ino())->update_dir(in->dir); - dout(7) << " added " << *in->dir << endl; + // dirs + for (list::iterator pf = m->get_inode_dirfrags(in->ino()).begin(); + pf != m->get_inode_dirfrags(in->ino()).end(); + ++pf) { + CDir *dir = in->get_dirfrag(*pf); + if (dir) { + m->get_dirfrag(dirfrag_t(in->ino(),*pf))->update_dir(dir); + dout(7) << " updated " << *dir << endl; + } else { + dir = in->add_dirfrag( new CDir(in, *pf, mds->mdcache, false) ); + m->get_dirfrag(dirfrag_t(in->ino(), *pf))->update_dir(dir); + dout(7) << " added " << *dir << endl; in->take_waiting(CInode::WAIT_DIR, finished); - } + } } } // open export dirs/bounds? - assert(import_bound_inos.count(diri->ino()) == 0); - import_bound_inos[diri->ino()].clear(); - for (list::iterator it = m->get_exports().begin(); + assert(import_bound_inos.count(dir->dirfrag()) == 0); + import_bound_inos[dir->dirfrag()].clear(); + for (list::iterator it = m->get_exports().begin(); it != m->get_exports().end(); it++) { dout(7) << " checking dir " << hex << *it << dec << endl; - CInode *in = cache->get_inode(*it); + CInode *in = cache->get_inode(it->ino); assert(in); // note bound. - import_bound_inos[dir->ino()].push_back(*it); + import_bound_inos[dir->dirfrag()].push_back(*it); - if (!in->dir) { + CDir *dir = cache->get_dirfrag(*it); + if (!dir) { dout(7) << " opening nested export on " << *in << endl; cache->open_remote_dir(in, new C_MDS_RetryMessage(mds, m)); @@ -1443,23 +1428,22 @@ void Migrator::handle_export_prep(MExportDirPrep *m) // verify we have all exports int waiting_for = 0; - for (list::iterator it = m->get_exports().begin(); + for (list::iterator it = m->get_exports().begin(); it != m->get_exports().end(); it++) { - inodeno_t ino = *it; - CInode *in = cache->get_inode(ino); - assert(in); - if (in->dir) { - if (!in->dir->state_test(CDir::STATE_IMPORTBOUND)) { - dout(7) << " pinning import bound " << *in->dir << endl; - in->dir->get(CDir::PIN_IMPORTBOUND); - in->dir->state_set(CDir::STATE_IMPORTBOUND); - import_bounds[dir].insert(in->dir); + dirfrag_t df = *it; + CDir *dir = cache->get_dirfrag(df); + if (dir) { + if (!dir->state_test(CDir::STATE_IMPORTBOUND)) { + dout(7) << " pinning import bound " << *dir << endl; + dir->get(CDir::PIN_IMPORTBOUND); + dir->state_set(CDir::STATE_IMPORTBOUND); + import_bounds[dir].insert(dir); } else { - dout(7) << " already pinned import bound " << *in << endl; + dout(7) << " already pinned import bound " << *dir << endl; } } else { - dout(7) << " waiting for nested export dir on " << *in << endl; + dout(7) << " waiting for nested export dir on " << *cache->get_inode(df.ino) << endl; waiting_for++; } } @@ -1480,11 +1464,11 @@ void Migrator::handle_export_prep(MExportDirPrep *m) // ok! dout(7) << " sending export_prep_ack on " << *dir << endl; - mds->send_message_mds(new MExportDirPrepAck(dir->ino()), + mds->send_message_mds(new MExportDirPrepAck(dir->dirfrag()), m->get_source().num(), MDS_PORT_MIGRATOR); // note new state - import_state[diri->ino()] = IMPORT_PREPPED; + import_state[dir->dirfrag()] = IMPORT_PREPPED; // done delete m; @@ -1501,11 +1485,11 @@ class C_MDS_ImportDirLoggedStart : public Context { Migrator *migrator; CDir *dir; int from; - list imported_subdirs; - list exports; + list imported_subdirs; + list exports; public: C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, int f, - list& is, list& e) : + list& is, list& e) : migrator(m), dir(d), from(f) { imported_subdirs.swap(is); exports.swap(e); @@ -1517,7 +1501,8 @@ public: void Migrator::handle_export_dir(MExportDir *m) { - CDir *dir = cache->get_dir(m->get_ino()); + CDir *dir = cache->get_dirfrag(m->get_dirfrag()); + assert(dir); int oldauth = m->get_source().num(); dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << endl; @@ -1526,7 +1511,7 @@ void Migrator::handle_export_dir(MExportDir *m) cache->show_subtrees(); // start the journal entry - EImportStart *le = new EImportStart(dir->ino(), m->get_exports()); + EImportStart *le = new EImportStart(dir->dirfrag(), m->get_exports()); le->metablob.add_dir_context(dir); // adjust auth (list us _first_) @@ -1534,7 +1519,7 @@ void Migrator::handle_export_dir(MExportDir *m) cache->verify_subtree_bounds(dir, import_bounds[dir]); // add this crap to my cache - list imported_subdirs; + list imported_subdirs; int num_imported_inodes = 0; for (list::iterator p = m->get_dirstate().begin(); @@ -1572,7 +1557,7 @@ void Migrator::handle_export_dir(MExportDir *m) imported_subdirs, m->get_exports())); // note state - import_state[dir->ino()] = IMPORT_LOGGINGSTART; + import_state[dir->dirfrag()] = IMPORT_LOGGINGSTART; // some stats if (mds->logger) { @@ -1598,7 +1583,7 @@ void Migrator::import_reverse(CDir *dir, bool fix_dir_auth) // update auth, with possible subtree merge. if (fix_dir_auth) { assert(dir->is_subtree_root()); - cache->adjust_subtree_auth(dir, import_peer[dir->ino()]); + cache->adjust_subtree_auth(dir, import_peer[dir->dirfrag()]); cache->try_subtree_merge(dir); } @@ -1655,7 +1640,7 @@ void Migrator::import_reverse(CDir *dir, bool fix_dir_auth) // notify them; wait in aborting state dout(7) << "notifying bystanders of abort" << endl; import_notify_abort(dir); - import_state[dir->ino()] = IMPORT_ABORTING; + import_state[dir->dirfrag()] = IMPORT_ABORTING; } } @@ -1669,10 +1654,10 @@ void Migrator::import_notify_abort(CDir *dir) // NOTE: the bystander will think i am _only_ auth, because they will have seen // the exporter's failure and updated the subtree auth. see mdcache->handle_mds_failure(). MExportDirNotify *notify = - new MExportDirNotify(dir->ino(), true, + new MExportDirNotify(dir->dirfrag(), true, pair(mds->get_nodeid(), CDIR_AUTH_UNKNOWN), - pair(import_peer[dir->ino()], CDIR_AUTH_UNKNOWN)); - notify->copy_exports(import_bounds[dir]); + pair(import_peer[dir->dirfrag()], CDIR_AUTH_UNKNOWN)); + notify->copy_bounds(import_bounds[dir]); mds->send_message_mds(notify, *p, MDS_PORT_MIGRATOR); } } @@ -1707,9 +1692,9 @@ void Migrator::import_reverse_unpin(CDir *dir) } // clean up - import_state.erase(dir->ino()); - import_peer.erase(dir->ino()); - import_bound_inos.erase(dir->ino()); + import_state.erase(dir->dirfrag()); + import_peer.erase(dir->dirfrag()); + import_bound_inos.erase(dir->dirfrag()); import_bounds.erase(dir); import_bystanders.erase(dir); @@ -1719,17 +1704,17 @@ void Migrator::import_reverse_unpin(CDir *dir) void Migrator::import_logged_start(CDir *dir, int from, - list &imported_subdirs, - list &exports) + list &imported_subdirs, + list &exports) { dout(7) << "import_logged " << *dir << endl; // note state - import_state[dir->ino()] = IMPORT_ACKING; + import_state[dir->dirfrag()] = IMPORT_ACKING; // send notify's etc. dout(7) << "sending ack for " << *dir << " to old auth mds" << from << endl; - mds->send_message_mds(new MExportDirAck(dir->inode->ino()), + mds->send_message_mds(new MExportDirAck(dir->dirfrag()), from, MDS_PORT_MIGRATOR); cache->show_subtrees(); @@ -1774,9 +1759,9 @@ void Migrator::import_finish(CDir *dir, bool now) cache->try_subtree_merge(dir); // clear import state (we're done!) - import_state.erase(dir->ino()); - import_peer.erase(dir->ino()); - import_bound_inos.erase(dir->ino()); + import_state.erase(dir->dirfrag()); + import_peer.erase(dir->dirfrag()); + import_bound_inos.erase(dir->dirfrag()); import_bounds.erase(dir); import_bystanders.erase(dir); @@ -1881,7 +1866,7 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int ol int Migrator::decode_import_dir(bufferlist& bl, int oldauth, CDir *import_root, - list& imported_subdirs, + list& imported_subdirs, EImportStart *le) { int off = 0; @@ -1890,16 +1875,16 @@ int Migrator::decode_import_dir(bufferlist& bl, CDirExport dstate; off = dstate._decode(bl, off); - CInode *diri = cache->get_inode(dstate.get_ino()); + CInode *diri = cache->get_inode(dstate.get_dirfrag().ino); assert(diri); - CDir *dir = diri->get_or_open_dir(mds->mdcache); + CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, dstate.get_dirfrag().frag); assert(dir); dout(7) << "decode_import_dir " << *dir << endl; // add to list if (dir != import_root) - imported_subdirs.push_back(dir->ino()); + imported_subdirs.push_back(dir->dirfrag()); // assimilate state dstate.update_dir( dir ); @@ -2023,7 +2008,7 @@ void Migrator::handle_export_warning(MExportDirWarning *m) void Migrator::handle_export_notify(MExportDirNotify *m) { - CDir *dir = cache->get_dir(m->get_ino()); + CDir *dir = cache->get_dirfrag(m->get_dirfrag()); int from = m->get_source().num(); pair old_auth = m->get_old_auth(); @@ -2031,7 +2016,7 @@ void Migrator::handle_export_notify(MExportDirNotify *m) if (!dir) { dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth - << " on missing dir " << m->get_ino() << endl; + << " on missing dir " << m->get_dirfrag() << endl; } else if (dir->authority() != old_auth) { dout(7) << "handle_export_notify old_auth was " << dir->authority() << " != " << old_auth << " -> " << new_auth @@ -2040,7 +2025,7 @@ void Migrator::handle_export_notify(MExportDirNotify *m) dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth << " on " << *dir << endl; // adjust auth - cache->adjust_bounded_subtree_auth(dir, m->get_exports(), new_auth); + cache->adjust_bounded_subtree_auth(dir, m->get_bounds(), new_auth); // induce a merge? cache->try_subtree_merge(dir); @@ -2048,7 +2033,7 @@ void Migrator::handle_export_notify(MExportDirNotify *m) // send ack if (m->wants_ack()) { - mds->send_message_mds(new MExportDirNotifyAck(m->get_ino()), + mds->send_message_mds(new MExportDirNotifyAck(m->get_dirfrag()), from, MDS_PORT_MIGRATOR); } else { // aborted. no ack. diff --git a/branches/sage/cephmds2/mds/Migrator.h b/branches/sage/cephmds2/mds/Migrator.h index 2fca1503efeac..5a688701262c0 100644 --- a/branches/sage/cephmds2/mds/Migrator.h +++ b/branches/sage/cephmds2/mds/Migrator.h @@ -99,19 +99,21 @@ public: const static int IMPORT_ABORTING = 7; // notifying bystanders of an abort before unfreezing protected: - map import_state; - map import_peer; - map > import_bound_inos; + map import_state; // FIXME make these dirfrags + map import_peer; + map > import_bound_inos; map > import_bounds; map > import_bystanders; + + /* // -- hashing madness -- multimap unhash_waiting; // nodes i am waiting for UnhashDirAck's from multimap import_hashed_replicate_waiting; // nodes i am waiting to discover to complete my import of a hashed dir // maps frozen_dir_ino's to waiting-for-discover ino's. multimap import_hashed_frozen_waiting; // dirs i froze (for the above) - + */ public: @@ -127,12 +129,12 @@ public: return 0; } bool is_exporting() { return !export_state.empty(); } - int is_importing(inodeno_t dirino) { - if (import_state.count(dirino)) return import_state[dirino]; + int is_importing(dirfrag_t df) { + if (import_state.count(df)) return import_state[df]; return 0; } bool is_importing() { return !import_state.empty(); } - const list& get_import_bound_inos(inodeno_t base) { + const list& get_import_bound_inos(dirfrag_t base) { assert(import_bound_inos.count(base)); return import_bound_inos[base]; } @@ -141,13 +143,13 @@ public: return import_bounds[base]; } - int get_import_state(inodeno_t dirino) { - assert(import_state.count(dirino)); - return import_state[dirino]; + int get_import_state(dirfrag_t df) { + assert(import_state.count(df)); + return import_state[df]; } - int get_import_peer(inodeno_t dirino) { - assert(import_peer.count(dirino)); - return import_peer[dirino]; + int get_import_peer(dirfrag_t df) { + assert(import_peer.count(df)); + return import_peer[df]; } @@ -200,11 +202,13 @@ public: int decode_import_dir(bufferlist& bl, int oldauth, CDir *import_root, - list& imported_subdirs, + list& imported_subdirs, EImportStart *le); + /* void got_hashed_replica(CDir *import, inodeno_t dir_ino, inodeno_t replica_ino); + */ public: void import_reverse(CDir *dir, bool fix_dir_auth=true); protected: @@ -212,8 +216,8 @@ protected: void import_reverse_unpin(CDir *dir); void import_notify_abort(CDir *dir); void import_logged_start(CDir *dir, int from, - list &imported_subdirs, - list &exports); + list &imported_subdirs, + list &exports); void handle_export_finish(MExportDirFinish *m); public: void import_finish(CDir *dir, bool now=false); @@ -230,6 +234,7 @@ protected: // -- hashed directories -- + /* // HASH public: void hash_dir(CDir *dir); // on auth @@ -287,7 +292,7 @@ protected: void handle_unhash_dir(MUnhashDir *m); void handle_unhash_dir_notify(MUnhashDirNotify *m); friend class C_MDC_UnhashPrepFreeze; - + */ }; diff --git a/branches/sage/cephmds2/mds/Server.cc b/branches/sage/cephmds2/mds/Server.cc index fa73eef2b83d9..6135aaadc83f3 100644 --- a/branches/sage/cephmds2/mds/Server.cc +++ b/branches/sage/cephmds2/mds/Server.cc @@ -26,8 +26,6 @@ #include "messages/MClientMountAck.h" #include "messages/MClientRequest.h" #include "messages/MClientReply.h" -#include "messages/MHashReaddir.h" -#include "messages/MHashReaddirReply.h" #include "messages/MLock.h" @@ -75,13 +73,6 @@ void Server::dispatch(Message *m) handle_client_request((MClientRequest*)m); return; - case MSG_MDS_HASHREADDIR: - handle_hash_readdir((MHashReaddir*)m); - return; - case MSG_MDS_HASHREADDIRREPLY: - handle_hash_readdir_reply((MHashReaddirReply*)m); - return; - } dout(1) << " main unknown message " << m->get_type() << endl; @@ -384,8 +375,10 @@ void Server::handle_client_request(MClientRequest *req) // is this a special debug command? if (refpath.depth() - 1 == trace.size() && refpath.last_bit().find(".ceph.") == 0) { + /* +FIXME dirfrag CDir *dir = 0; - if (trace.empty()) + if (!trace.empty()) dir = mdcache->get_root()->dir; else dir = trace[trace.size()-1]->get_inode()->dir; @@ -395,18 +388,13 @@ void Server::handle_client_request(MClientRequest *req) if (refpath.last_bit() == ".ceph.hash" && refpath.depth() > 1) { dout(1) << "got explicit hash command " << refpath << endl; - /* - CDir *dir = trace[trace.size()-1]->get_inode()->dir; - if (!dir->is_hashed() && - !dir->is_hashing() && - dir->is_auth()) - mdcache->migrator->hash_dir(dir); - */ + /// .... } else if (refpath.last_bit() == ".ceph.commit") { dout(1) << "got explicit commit command on " << *dir << endl; dir->commit(0, 0); } +*/ } // @@ -537,19 +525,18 @@ void Server::dispatch_request(Message *m, CInode *ref) // FIXME: this probably should go somewhere else. -bool Server::try_open_dir(CInode *in, MClientRequest *req) +CDir* Server::try_open_dir(CInode *in, frag_t fg, MClientRequest *req) { - if (!in->dir && in->is_frozen_dir()) { + if (!in->get_dirfrag(fg) && in->is_frozen_dir()) { // doh! dout(10) << " dir inode is frozen, can't open dir, waiting " << *in << endl; assert(in->get_parent_dir()); in->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mds, req, in)); - return false; + return 0; } - - in->get_or_open_dir(mds->mdcache); - return true; + + return in->get_or_open_dirfrag(mds->mdcache, fg); } @@ -822,252 +809,80 @@ int Server::encode_dir_contents(CDir *dir, } -/* - * note: this is pretty sloppy, but should work just fine i think... - */ -void Server::handle_hash_readdir(MHashReaddir *m) -{ - CInode *cur = mdcache->get_inode(m->get_ino()); - assert(cur); - - if (!cur->dir || - !cur->dir->is_hashed()) { - assert(0); - dout(7) << "handle_hash_readdir don't have dir open, or not hashed. giving up!" << endl; - delete m; - return; - } - CDir *dir = cur->dir; - assert(dir); - assert(dir->is_hashed()); - - // complete? - if (!dir->is_complete()) { - dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << endl; - dir->fetch(new C_MDS_RetryMessage(mds, m)); - return; - } - - // get content - list inls; - list dnls; - int num = encode_dir_contents(dir, inls, dnls); - - // sent it back! - messenger->send_message(new MHashReaddirReply(dir->ino(), inls, dnls, num), - m->get_source_inst(), MDS_PORT_CACHE); -} - - -void Server::handle_hash_readdir_reply(MHashReaddirReply *m) -{ - /* - CInode *cur = mdcache->get_inode(m->get_ino()); - assert(cur); - - if (!cur->dir || - !cur->dir->is_hashed()) { - assert(0); - dout(7) << "handle_hash_readdir don't have dir open, or not hashed. giving up!" << endl; - delete m; - return; - } - CDir *dir = cur->dir; - assert(dir); - assert(dir->is_hashed()); - - // move items to hashed_readdir gather - int from = m->get_source().num(); - assert(dir->hashed_readdir.count(from) == 0); - dir->hashed_readdir[from].first.splice(dir->hashed_readdir[from].first.begin(), - m->get_in()); - dir->hashed_readdir[from].second.splice(dir->hashed_readdir[from].second.begin(), - m->get_dn()); - delete m; - - // gather finished? - if (dir->hashed_readdir.size() < (unsigned)mds->mdsmap->get_num_mds()) { - dout(7) << "still waiting for more hashed readdir bits" << endl; - return; - } - - dout(7) << "got last bit! finishing waiters" << endl; - - // do these finishers. they'll copy the results. - list finished; - dir->take_waiting(CDir::WAIT_THISHASHEDREADDIR, finished); - finish_contexts(finished); - - // now discard these results - for (map, list > >::iterator it = dir->hashed_readdir.begin(); - it != dir->hashed_readdir.end(); - it++) { - for (list::iterator ci = it->second.first.begin(); - ci != it->second.first.end(); - ci++) - delete *ci; - } - dir->hashed_readdir.clear(); - - // unpin dir (we're done!) - dir->auth_unpin(); - - // trigger any waiters for next hashed readdir cycle - dir->take_waiting(CDir::WAIT_NEXTHASHEDREADDIR, mds->finished_queue); - */ -} - - -class C_MDS_HashReaddir : public Context { - Server *server; - MClientRequest *req; - CDir *dir; -public: - C_MDS_HashReaddir(Server *server, MClientRequest *req, CDir *dir) { - this->server = server; - this->req = req; - this->dir = dir; - } - void finish(int r) { - server->finish_hash_readdir(req, dir); - } -}; - -void Server::finish_hash_readdir(MClientRequest *req, CDir *dir) -{ - dout(7) << "finish_hash_readdir on " << *dir << endl; - - assert(dir->is_hashed()); - assert(dir->hashed_readdir.size() == (unsigned)mds->mdsmap->get_num_mds()); - - // reply! - MClientReply *reply = new MClientReply(req); - reply->set_result(0); - - for (int i=0; imdsmap->get_num_mds(); i++) { - reply->copy_dir_items(dir->hashed_readdir[i].first, - dir->hashed_readdir[i].second); - } - - // ok! - reply_request(req, reply, dir->inode); -} - - void Server::handle_client_readdir(MClientRequest *req, - CInode *cur) + CInode *diri) { // it's a directory, right? - if (!cur->is_dir()) { + if (!diri->is_dir()) { // not a dir dout(10) << "reply to " << *req << " readdir -ENOTDIR" << endl; reply_request(req, -ENOTDIR); return; } - // auth? - if (!cur->dir_is_auth()) { - int dirauth = cur->authority().first; - if (cur->dir) - dirauth = cur->dir->authority().first; - assert(dirauth >= 0); - assert(dirauth != mds->get_nodeid()); - - // forward to authority - dout(10) << " forwarding readdir to authority " << dirauth << endl; - mdcache->request_forward(req, dirauth); + // which frag? + frag_t fg = req->get_iarg(); + + // does it exist? + if (diri->dirfragtree[fg] != fg) { + dout(10) << "frag " << fg << " doesn't appear in fragtree " << diri->dirfragtree << endl; + reply_request(req, -EAGAIN); return; } - if (!try_open_dir(cur, req)) + // get the dir? + if (!diri->get_dirfrag(fg) && !diri->is_auth()) { + dout(10) << "not auth for " << fg << " or the inode " << *diri << ", fwd" << endl; + mdcache->request_forward(req, diri->authority().first); return; - assert(cur->dir->is_auth()); - - // unhashing? wait! - if (cur->dir->is_hashed() && - cur->dir->is_unhashing()) { - dout(10) << "unhashing, waiting" << endl; - cur->dir->add_waiter(CDir::WAIT_UNFREEZE, - new C_MDS_RetryRequest(mds, req, cur)); + } + + CDir *dir = try_open_dir(diri, fg, req); + if (!dir) return; + assert(dir); + + if (!dir->is_auth()) { + dout(10) << "not auth for " << *dir << ", fwd" << endl; + mdcache->request_forward(req, dir->authority().first); return; } + // ok! + assert(dir->is_auth()); + // check perm - if (!mds->locker->inode_hard_read_start(cur,req)) + if (!mds->locker->inode_hard_read_start(diri,req)) return; - mds->locker->inode_hard_read_finish(cur); - - CDir *dir = cur->dir; - assert(dir); + mds->locker->inode_hard_read_finish(diri); if (!dir->is_complete()) { // fetch - dout(10) << " incomplete dir contents for readdir on " << *cur->dir << ", fetching" << endl; - dir->fetch(new C_MDS_RetryRequest(mds, req, cur)); + dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << endl; + dir->fetch(new C_MDS_RetryRequest(mds, req, diri)); return; } - if (dir->is_hashed()) { - // HASHED - /* - dout(7) << "hashed dir" << endl; - if (!dir->can_auth_pin()) { - dout(7) << "can't auth_pin dir " << *dir << " waiting" << endl; - dir->add_waiter(CDir::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, req, cur)); - return; - } - - if (!dir->hashed_readdir.empty()) { - dout(7) << "another readdir gather in progres, waiting" << endl; - dir->add_waiter(CDir::WAIT_NEXTHASHEDREADDIR, new C_MDS_RetryRequest(mds, req, cur)); - return; - } - - // start new readdir gather - dout(7) << "staring new hashed readdir gather" << endl; - - // pin auth for process! - dir->auth_pin(); - - // get local bits - encode_dir_contents(cur->dir, - dir->hashed_readdir[mds->get_nodeid()].first, - dir->hashed_readdir[mds->get_nodeid()].second); - - // request other bits - for (int i=0; imdsmap->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - mds->send_message_mds(new MHashReaddir(dir->ino()), i, MDS_PORT_SERVER); - } - - // wait - dir->add_waiter(CDir::WAIT_THISHASHEDREADDIR, - new C_MDS_HashReaddir(this, req, dir)); - */ - } else { - // NON-HASHED - // build dir contents - list inls; - list dnls; - int numfiles = encode_dir_contents(cur->dir, inls, dnls); - - // . too - dnls.push_back("."); - inls.push_back(new InodeStat(cur, mds->get_nodeid())); - ++numfiles; - - // yay, reply - MClientReply *reply = new MClientReply(req); - reply->take_dir_items(inls, dnls, numfiles); - - dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << endl; - reply->set_result(0); - - //balancer->hit_dir(cur->dir); - - // reply - reply_request(req, reply, cur); - } + // build dir contents + list inls; + list dnls; + int numfiles = encode_dir_contents(dir, inls, dnls); + + // . too + dnls.push_back("."); + inls.push_back(new InodeStat(diri, mds->get_nodeid())); + ++numfiles; + + // yay, reply + MClientReply *reply = new MClientReply(req); + reply->take_dir_items(inls, dnls, numfiles); + + dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << endl; + reply->set_result(fg); + + //balancer->hit_dir(diri->dir); + + // reply + reply_request(req, reply, diri); } @@ -1110,12 +925,14 @@ public: void Server::handle_client_mknod(MClientRequest *req, CInode *diri) { + CDir *dir = 0; CInode *newi = 0; CDentry *dn = 0; - + // make dentry and inode, xlock dentry. - if (!prepare_mknod(req, diri, &newi, &dn)) + if (!prepare_mknod(req, diri, &dir, &newi, &dn)) return; + assert(dir); assert(newi); assert(dn); @@ -1127,7 +944,7 @@ void Server::handle_client_mknod(MClientRequest *req, CInode *diri) // prepare finisher C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi); EUpdate *le = new EUpdate("mknod"); - le->metablob.add_dir_context(diri->dir); + le->metablob.add_dir_context(dir); inode_t *pi = le->metablob.add_dentry(dn, true, newi); pi->version = dn->get_projected_version(); @@ -1151,29 +968,34 @@ CDir *Server::validate_new_dentry_dir(MClientRequest *req, CInode *diri, string& return false; } - // am i not open, not auth? - if (!diri->dir && !diri->is_auth()) { + // which dirfrag? + frag_t fg = diri->pick_dirfrag(name); + CDir *dir = diri->get_dirfrag(fg); + + // not open? + if (!dir && !diri->is_auth()) { int dirauth = diri->authority().first; dout(7) << "validate_new_dentry_dir: don't know dir auth, not open, auth is i think mds" << dirauth << endl; mdcache->request_forward(req, dirauth); return false; } - if (!try_open_dir(diri, req)) - return false; - CDir *dir = diri->dir; - - // make sure it's my dentry - int dnauth = dir->dentry_authority(name).first; - if (dnauth != mds->get_nodeid()) { - // fw - dout(7) << "mknod on " << req->get_path() << ", dentry " << *dir + // not me? + if (dir && !dir->is_auth()) { + int auth = dir->authority().first; + dout(7) << "validate_new_dentry_dir on " << req->get_path() << ", dentry " << *dir << " dn " << name - << " not mine, fw to " << dnauth << endl; - mdcache->request_forward(req, dnauth); + << " not mine, fw to mds" << auth << endl; + mdcache->request_forward(req, auth); return false; } + // ok, let's open it then. + assert(diri->is_auth()); + dir = try_open_dir(diri, fg, req); + if (!dir) + return false; + // dir auth pinnable? if (!dir->can_auth_pin()) { dout(7) << "validate_new_dentry_dir: dir " << *dir << " not pinnable, waiting" << endl; @@ -1205,7 +1027,7 @@ CDir *Server::validate_new_dentry_dir(MClientRequest *req, CInode *diri, string& * 2 - already exists (only if okexist=true) */ int Server::prepare_mknod(MClientRequest *req, CInode *diri, - CInode **pin, CDentry **pdn, + CDir **pdir, CInode **pin, CDentry **pdn, bool okexist) { dout(10) << "prepare_mknod " << req->get_filepath() << " in " << *diri << endl; @@ -1214,7 +1036,7 @@ int Server::prepare_mknod(MClientRequest *req, CInode *diri, filepath dirpath = req->get_filepath().prefixpath(req->get_filepath().depth() - 1); string name = req->get_filepath().last_bit(); - CDir *dir = validate_new_dentry_dir(req, diri, name); + CDir *dir = *pdir = validate_new_dentry_dir(req, diri, name); if (!dir) return 0; // make sure name doesn't already exist @@ -1281,12 +1103,14 @@ int Server::prepare_mknod(MClientRequest *req, CInode *diri, void Server::handle_client_mkdir(MClientRequest *req, CInode *diri) { + CDir *dir = 0; CInode *newi = 0; CDentry *dn = 0; // make dentry and inode, xlock dentry. - if (!prepare_mknod(req, diri, &newi, &dn)) + if (!prepare_mknod(req, diri, &dir, &newi, &dn)) return; + assert(dir); assert(newi); assert(dn); @@ -1297,17 +1121,17 @@ void Server::handle_client_mkdir(MClientRequest *req, CInode *diri) newi->inode.layout = g_OSD_MDDirLayout; // ...and that new dir is empty. - CDir *newdir = newi->get_or_open_dir(mds->mdcache); + CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t()); newdir->mark_complete(); newdir->mark_dirty(newdir->pre_dirty()); // prepare finisher C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi); EUpdate *le = new EUpdate("mkdir"); - le->metablob.add_dir_context(diri->dir); + le->metablob.add_dir_context(dir); inode_t *pi = le->metablob.add_dentry(dn, true, newi); pi->version = dn->get_projected_version(); - le->metablob.add_dir(newi->dir, true); + le->metablob.add_dir(newdir, true); // log + wait mdlog->submit_entry(le); @@ -1335,12 +1159,14 @@ void Server::handle_client_mkdir(MClientRequest *req, CInode *diri) void Server::handle_client_symlink(MClientRequest *req, CInode *diri) { + CDir *dir = 0; CInode *newi = 0; CDentry *dn = 0; // make dentry and inode, xlock dentry. - if (!prepare_mknod(req, diri, &newi, &dn)) + if (!prepare_mknod(req, diri, &dir, &newi, &dn)) return; + assert(dir); assert(newi); assert(dn); @@ -1352,7 +1178,7 @@ void Server::handle_client_symlink(MClientRequest *req, CInode *diri) // prepare finisher C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi); EUpdate *le = new EUpdate("symlink"); - le->metablob.add_dir_context(diri->dir); + le->metablob.add_dir_context(dir); inode_t *pi = le->metablob.add_dentry(dn, true, newi); pi->version = dn->get_projected_version(); @@ -1473,9 +1299,10 @@ void Server::handle_client_link_2(int r, MClientRequest *req, CInode *diri, vect } // what was the new dentry again? - CDir *dir = diri->dir; - assert(dir); string dname = req->get_filepath().last_bit(); + frag_t fg = diri->pick_dirfrag(dname); + CDir *dir = diri->get_dirfrag(fg); + assert(dir); CDentry *dn = dir->lookup(dname); assert(dn); assert(dn->is_xlockedbyme(req)); @@ -1538,9 +1365,9 @@ void Server::handle_client_link_finish(MClientRequest *req, CInode *ref, // UNLINK void Server::handle_client_unlink(MClientRequest *req, - CInode *diri) + CInode *diri) { - // rmdir or unlink + // rmdir or unlink? bool rmdir = false; if (req->get_op() == MDS_OP_RMDIR) rmdir = true; @@ -1560,27 +1387,27 @@ void Server::handle_client_unlink(MClientRequest *req, } // am i not open, not auth? - if (!diri->dir && !diri->is_auth()) { + frag_t fg = diri->pick_dirfrag(name); + if (!diri->get_dirfrag(fg) && !diri->is_auth()) { int dirauth = diri->authority().first; dout(7) << "don't know dir auth, not open, auth is i think " << dirauth << endl; mdcache->request_forward(req, dirauth); return; } - if (!try_open_dir(diri, req)) return; - CDir *dir = diri->dir; - int dnauth = dir->dentry_authority(name).first; + CDir *dir = try_open_dir(diri, fg, req); + if (!dir) return; // does it exist? CDentry *dn = dir->lookup(name); if (!dn) { - if (dnauth == mds->get_nodeid()) { + if (!dir->is_complete()) { + dout(7) << "handle_client_rmdir/unlink missing dn " << name + << " but dir not complete, fetching " << *dir << endl; + dir->fetch(new C_MDS_RetryRequest(mds, req, diri)); + } else { dout(7) << "handle_client_rmdir/unlink dne " << name << " in " << *dir << endl; reply_request(req, -ENOENT); - } else { - // send to authority! - dout(7) << "handle_client_rmdir/unlink fw, don't have " << name << " in " << *dir << endl; - mdcache->request_forward(req, dnauth); } return; } @@ -1610,6 +1437,8 @@ void Server::handle_client_unlink(MClientRequest *req, dout(7) << "handle_client_unlink on non-dir " << *in << endl; } + int inauth = in->authority().first; + // dir stuff if (in->is_dir()) { if (rmdir) { @@ -1617,18 +1446,18 @@ void Server::handle_client_unlink(MClientRequest *req, // open dir? if (in->is_auth() && !in->dir) { - if (!try_open_dir(in, req)) return; + if (!try_open_dir(in, frag_t(), req)) return; // FIXME } // not dir auth? (or not open, which implies the same!) if (!in->dir) { - dout(7) << "handle_client_rmdir dir not open for " << *in << ", sending to dn auth " << dnauth << endl; - mdcache->request_forward(req, dnauth); + dout(7) << "handle_client_rmdir dir not open for " << *in << ", sending to auth " << inauth << endl; + mdcache->request_forward(req, inauth); return; } if (!in->dir->is_auth()) { int dirauth = in->dir->authority().first; - dout(7) << "handle_client_rmdir not auth for dir " << *in->dir << ", sending to dir auth " << dnauth << endl; + dout(7) << "handle_client_rmdir not auth for dir " << *in->dir << ", sending to dir auth " << dirauth << endl; mdcache->request_forward(req, dirauth); return; } @@ -1678,10 +1507,10 @@ void Server::handle_client_unlink(MClientRequest *req, } // am i dentry auth? - if (dnauth != mds->get_nodeid()) { + if (inauth != mds->get_nodeid()) { // not auth; forward! - dout(7) << "handle_client_unlink not auth for " << *dir << " dn " << dn->name << ", fwd to " << dnauth << endl; - mdcache->request_forward(req, dnauth); + dout(7) << "handle_client_unlink not auth for " << *dir << " dn " << dn->name << ", fwd to " << inauth << endl; + mdcache->request_forward(req, inauth); return; } @@ -1837,26 +1666,20 @@ void Server::handle_client_rename(MClientRequest *req, return; } + frag_t srcfg = srcdiri->pick_dirfrag(srcname); + // am i not open, not auth? - if (!srcdiri->dir && !srcdiri->is_auth()) { + if (!srcdiri->get_dirfrag(srcfg) && !srcdiri->is_auth()) { int dirauth = srcdiri->authority().first; dout(7) << "don't know dir auth, not open, srcdir auth is probably " << dirauth << endl; mdcache->request_forward(req, dirauth); return; } - - if (!try_open_dir(srcdiri, req)) return; - CDir *srcdir = srcdiri->dir; + + CDir *srcdir = try_open_dir(srcdiri, srcfg, req); + if (!srcdir) return; dout(7) << "handle_client_rename srcdir is " << *srcdir << endl; - // make sure it's my dentry - int srcauth = srcdir->dentry_authority(srcname).first; - if (srcauth != mds->get_nodeid()) { - // fw - dout(7) << "rename on " << req->get_path() << ", dentry " << *srcdir << " dn " << srcname << " not mine, fw to " << srcauth << endl; - mdcache->request_forward(req, srcauth); - return; - } // ok, done passing buck. // src dentry @@ -1951,11 +1774,12 @@ void Server::handle_client_rename_2(MClientRequest *req, return; } + frag_t dfg = d->pick_dirfrag(destname); if (trace.size() == destpath.depth()) { if (d->is_dir()) { // mv /some/thing /to/some/dir - if (!try_open_dir(d, req)) return; - destdir = d->dir; // /to/some/dir + destdir = try_open_dir(d, dfg, req); // /to/some/dir + if (!destdir) return; destname = req->get_filepath().last_bit(); // thing destpath.add_dentry(destname); } else { @@ -1967,8 +1791,8 @@ void Server::handle_client_rename_2(MClientRequest *req, else if (trace.size() == destpath.depth()-1) { if (d->is_dir()) { // mv /some/thing /to/some/place_that_maybe_dne (we might be replica) - if (!try_open_dir(d, req)) return; - destdir = d->dir; // /to/some + destdir = try_open_dir(d, dfg, req); // /to/some + if (!destdir) return; destname = destpath.last_bit(); // place_that_MAYBE_dne } else { dout(7) << "dest dne" << endl; @@ -2349,13 +2173,15 @@ void Server::handle_client_openc(MClientRequest *req, CInode *diri) { dout(7) << "open w/ O_CREAT on " << req->get_filepath() << endl; + CDir *dir = 0; CInode *in = 0; CDentry *dn = 0; // make dentry and inode, xlock dentry. - int r = prepare_mknod(req, diri, &in, &dn); + int r = prepare_mknod(req, diri, &dir, &in, &dn); if (!r) return; // wait on something + assert(dir); assert(in); assert(dn); @@ -2368,7 +2194,7 @@ void Server::handle_client_openc(MClientRequest *req, CInode *diri) // prepare finisher C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, req, dn, in); EUpdate *le = new EUpdate("openc"); - le->metablob.add_dir_context(diri->dir); + le->metablob.add_dir_context(dir); inode_t *pi = le->metablob.add_dentry(dn, true, in); pi->version = dn->get_projected_version(); diff --git a/branches/sage/cephmds2/mds/Server.h b/branches/sage/cephmds2/mds/Server.h index d4509f1418e07..de52d4d513d99 100644 --- a/branches/sage/cephmds2/mds/Server.h +++ b/branches/sage/cephmds2/mds/Server.h @@ -51,7 +51,7 @@ public: LogEvent *event, LogEvent *event2 = 0); - bool try_open_dir(CInode *in, MClientRequest *req); + CDir *try_open_dir(CInode *in, frag_t fg, MClientRequest *req); // clients @@ -86,9 +86,6 @@ public: int encode_dir_contents(CDir *dir, list& inls, list& dnls); - void handle_hash_readdir(MHashReaddir *m); - void handle_hash_readdir_reply(MHashReaddirReply *m); - void finish_hash_readdir(MClientRequest *req, CDir *dir); // namespace changes void handle_client_mknod(MClientRequest *req, CInode *ref); @@ -128,7 +125,7 @@ public: CDir *validate_new_dentry_dir(MClientRequest *req, CInode *diri, string& dname); int prepare_mknod(MClientRequest *req, CInode *diri, - CInode **pin, CDentry **pdn, + CDir **pdir, CInode **pin, CDentry **pdn, bool okexist=false); diff --git a/branches/sage/cephmds2/mds/events/EImportFinish.h b/branches/sage/cephmds2/mds/events/EImportFinish.h index 14a9ab6403af6..7d51c038f3fab 100644 --- a/branches/sage/cephmds2/mds/events/EImportFinish.h +++ b/branches/sage/cephmds2/mds/events/EImportFinish.h @@ -22,17 +22,17 @@ class EImportFinish : public LogEvent { protected: - inodeno_t dirino; // imported dir + dirfrag_t base; // imported dir bool success; public: EImportFinish(CDir *dir, bool s) : LogEvent(EVENT_IMPORTFINISH), - dirino(dir->ino()), + base(dir->dirfrag()), success(s) { } EImportFinish() : LogEvent(EVENT_IMPORTFINISH) { } void print(ostream& out) { - out << "import_finish " << dirino; + out << "import_finish " << base; if (success) out << " success"; else @@ -40,12 +40,12 @@ class EImportFinish : public LogEvent { } virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&dirino, sizeof(dirino)); + bl.append((char*)&base, sizeof(base)); bl.append((char*)&success, sizeof(success)); } void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); + bl.copy(off, sizeof(base), (char*)&base); + off += sizeof(base); bl.copy(off, sizeof(success), (char*)&success); off += sizeof(success); } diff --git a/branches/sage/cephmds2/mds/events/EImportMap.h b/branches/sage/cephmds2/mds/events/EImportMap.h index 640365222d3e0..2bfaa0d2a21aa 100644 --- a/branches/sage/cephmds2/mds/events/EImportMap.h +++ b/branches/sage/cephmds2/mds/events/EImportMap.h @@ -20,9 +20,8 @@ class EImportMap : public LogEvent { public: EMetaBlob metablob; - set imports; - //set hashdirs; - map > bounds; + set imports; + map > bounds; EImportMap() : LogEvent(EVENT_IMPORTMAP) { } @@ -34,7 +33,7 @@ public: void encode_payload(bufferlist& bl) { metablob._encode(bl); ::_encode(imports, bl); - for (set::iterator p = imports.begin(); + for (set::iterator p = imports.begin(); p != imports.end(); ++p) { ::_encode(bounds[*p], bl); @@ -45,7 +44,7 @@ public: void decode_payload(bufferlist& bl, int& off) { metablob._decode(bl, off); ::_decode(imports, bl, off); - for (set::iterator p = imports.begin(); + for (set::iterator p = imports.begin(); p != imports.end(); ++p) { ::_decode(bounds[*p], bl, off); diff --git a/branches/sage/cephmds2/mds/events/EImportStart.h b/branches/sage/cephmds2/mds/events/EImportStart.h index 59c074dec6f4f..742de69860735 100644 --- a/branches/sage/cephmds2/mds/events/EImportStart.h +++ b/branches/sage/cephmds2/mds/events/EImportStart.h @@ -24,29 +24,29 @@ class EImportStart : public LogEvent { protected: - inodeno_t dirino; - list bounds; + dirfrag_t base; + list bounds; public: EMetaBlob metablob; - EImportStart(inodeno_t di, - list& b) : LogEvent(EVENT_IMPORTSTART), - dirino(di), bounds(b) { } + EImportStart(dirfrag_t di, + list& b) : LogEvent(EVENT_IMPORTSTART), + base(di), bounds(b) { } EImportStart() : LogEvent(EVENT_IMPORTSTART) { } void print(ostream& out) { - out << "EImportStart " << metablob; + out << "EImportStart " << base << " " << metablob; } virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&dirino, sizeof(dirino)); + bl.append((char*)&base, sizeof(base)); metablob._encode(bl); ::_encode(bounds, bl); } void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); + bl.copy(off, sizeof(base), (char*)&base); + off += sizeof(base); metablob._decode(bl, off); ::_decode(bounds, bl, off); } diff --git a/branches/sage/cephmds2/mds/events/EMetaBlob.h b/branches/sage/cephmds2/mds/events/EMetaBlob.h index 50368b6ae1a92..d1a63a111855b 100644 --- a/branches/sage/cephmds2/mds/events/EMetaBlob.h +++ b/branches/sage/cephmds2/mds/events/EMetaBlob.h @@ -209,8 +209,8 @@ class EMetaBlob { }; // my lumps. preserve the order we added them in a list. - list lump_order; - map lump_map; + list lump_order; + map lump_map; public: @@ -267,11 +267,11 @@ class EMetaBlob { } dirlump& add_dir(CDir *dir, bool dirty) { - if (lump_map.count(dir->ino()) == 0) { - lump_order.push_back(dir->ino()); - lump_map[dir->ino()].dirv = dir->get_projected_version(); + if (lump_map.count(dir->dirfrag()) == 0) { + lump_order.push_back(dir->dirfrag()); + lump_map[dir->dirfrag()].dirv = dir->get_projected_version(); } - dirlump& l = lump_map[dir->ino()]; + dirlump& l = lump_map[dir->dirfrag()]; if (dir->is_complete()) l.mark_complete(); //if (dir->is_import()) l.mark_import(); if (dirty) l.mark_dirty(); @@ -280,7 +280,7 @@ class EMetaBlob { void add_dir_context(CDir *dir, bool toroot=false) { // already have this dir? (we must always add in order) - if (lump_map.count(dir->ino())) + if (lump_map.count(dir->dirfrag())) return; CInode *diri = dir->get_inode(); @@ -300,7 +300,7 @@ class EMetaBlob { void _encode(bufferlist& bl) { int n = lump_map.size(); bl.append((char*)&n, sizeof(n)); - for (list::iterator i = lump_order.begin(); + for (list::iterator i = lump_order.begin(); i != lump_order.end(); ++i) { bl.append((char*)&(*i), sizeof(*i)); @@ -312,11 +312,11 @@ class EMetaBlob { bl.copy(off, sizeof(n), (char*)&n); off += sizeof(n); for (int i=0; i::iterator lp = lump_map.begin(); + for (map::iterator lp = lump_map.begin(); lp != lump_map.end(); ++lp) { - CInode *diri = mds->mdcache->get_inode(lp->first); - if (!diri) - continue; // we expired it - CDir *dir = diri->dir; + CDir *dir = mds->mdcache->get_dirfrag(lp->first); if (!dir) continue; // we expired it @@ -116,13 +113,10 @@ void EMetaBlob::expire(MDS *mds, Context *c) // examine dirv's for my lumps // make list of dir slices i need to commit - for (map::iterator lp = lump_map.begin(); + for (map::iterator lp = lump_map.begin(); lp != lump_map.end(); ++lp) { - CInode *diri = mds->mdcache->get_inode(lp->first); - if (!diri) - continue; // we expired it - CDir *dir = diri->dir; + CDir *dir = mds->mdcache->get_dirfrag(lp->first); if (!dir) continue; // we expired it @@ -171,27 +165,26 @@ void EMetaBlob::replay(MDS *mds) dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps" << endl; // walk through my dirs (in order!) - for (list::iterator lp = lump_order.begin(); + for (list::iterator lp = lump_order.begin(); lp != lump_order.end(); ++lp) { dout(10) << "EMetaBlob.replay dir " << *lp << endl; dirlump &lump = lump_map[*lp]; // the dir - CInode *diri = mds->mdcache->get_inode(*lp); - CDir *dir; - if (!diri) { - assert(*lp == 1); - diri = mds->mdcache->create_root_inode(); - dout(10) << "EMetaBlob.replay created root " << *diri << endl; - } - if (diri->dir) { - dir = diri->dir; - dout(20) << "EMetaBlob.replay had dir " << *dir << endl; - } else { - dir = diri->get_or_open_dir(mds->mdcache); - if (*lp == 1) - dir->set_dir_auth(CDIR_AUTH_UNKNOWN); + CDir *dir = mds->mdcache->get_dirfrag(*lp); + if (!dir) { + // hmm. do i have the inode? + CInode *diri = mds->mdcache->get_inode((*lp).ino); + if (!diri) { + assert((*lp).ino == 1); + diri = mds->mdcache->create_root_inode(); + dout(10) << "EMetaBlob.replay created root " << *diri << endl; + } + // create the dirfrag + dir = diri->get_or_open_dirfrag(mds->mdcache, (*lp).frag); + if ((*lp).ino == 1) + dir->set_dir_auth(CDIR_AUTH_UNKNOWN); // FIXME: can root dir be fragmented? hrm. dout(10) << "EMetaBlob.replay added dir " << *dir << endl; } dir->set_version( lump.dirv ); @@ -471,11 +464,10 @@ void EImportMap::replay(MDS *mds) metablob.replay(mds); // restore import/export maps - for (set::iterator p = imports.begin(); + for (set::iterator p = imports.begin(); p != imports.end(); ++p) { - CInode *diri = mds->mdcache->get_inode(*p); - CDir *dir = diri->dir; + CDir *dir = mds->mdcache->get_dirfrag(*p); mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid()); } } @@ -606,17 +598,17 @@ bool EImportStart::has_expired(MDS *mds) void EImportStart::expire(MDS *mds, Context *c) { - dout(10) << "EImportStart.expire " << dirino << endl; + dout(10) << "EImportStart.expire " << base << endl; metablob.expire(mds, c); } void EImportStart::replay(MDS *mds) { - dout(10) << "EImportStart.replay " << dirino << endl; + dout(10) << "EImportStart.replay " << base << endl; metablob.replay(mds); // put in ambiguous import list - mds->mdcache->add_ambiguous_import(dirino, bounds); + mds->mdcache->add_ambiguous_import(base, bounds); } // ----------------------- @@ -633,11 +625,11 @@ void EImportFinish::expire(MDS *mds, Context *c) void EImportFinish::replay(MDS *mds) { - dout(10) << "EImportFinish.replay " << dirino << " success=" << success << endl; + dout(10) << "EImportFinish.replay " << base << " success=" << success << endl; if (success) - mds->mdcache->finish_ambiguous_import(dirino); + mds->mdcache->finish_ambiguous_import(base); else - mds->mdcache->cancel_ambiguous_import(dirino); + mds->mdcache->cancel_ambiguous_import(base); } diff --git a/branches/sage/cephmds2/mds/mdstypes.h b/branches/sage/cephmds2/mds/mdstypes.h index ac0e0db41eb7e..df34dc9b5d03d 100644 --- a/branches/sage/cephmds2/mds/mdstypes.h +++ b/branches/sage/cephmds2/mds/mdstypes.h @@ -64,6 +64,9 @@ inline bool operator<(dirfrag_t l, dirfrag_t r) { if (l.ino == r.ino && l.frag < r.frag) return true; return false; } +inline bool operator==(dirfrag_t l, dirfrag_t r) { + return l.ino == r.ino && l.frag == r.frag; +} // ================================================================ diff --git a/branches/sage/cephmds2/messages/MCacheExpire.h b/branches/sage/cephmds2/messages/MCacheExpire.h index dedf2fd0a4c52..9a6b3f1497920 100644 --- a/branches/sage/cephmds2/messages/MCacheExpire.h +++ b/branches/sage/cephmds2/messages/MCacheExpire.h @@ -24,10 +24,10 @@ public: */ struct realm { map inodes; - map dirs; - map > dentries; + map dirs; + map > dentries; }; - map realms; + map realms; int get_from() { return from; } @@ -38,31 +38,27 @@ public: virtual char *get_type_name() { return "CEx";} - void add_inode(inodeno_t r, inodeno_t ino, int nonce) { + void add_inode(dirfrag_t r, inodeno_t ino, int nonce) { realms[r].inodes[ino] = nonce; } - void add_dir(inodeno_t r, inodeno_t ino, int nonce) { - realms[r].dirs[ino] = nonce; + void add_dir(dirfrag_t r, dirfrag_t df, int nonce) { + realms[r].dirs[df] = nonce; } - void add_dentry(inodeno_t r, inodeno_t dirino, const string& dn, int nonce) { - realms[r].dentries[dirino][dn] = nonce; + void add_dentry(dirfrag_t r, dirfrag_t df, const string& dn, int nonce) { + realms[r].dentries[df][dn] = nonce; } - //badbadbad - //void add_dentries(inodeno_t r, inodeno_t dirino, map& dmap) { - //realms[r].dentries[dirino] = dmap; - //} - void add_realm(inodeno_t ino, realm& r) { - realm& myr = realms[ino]; + void add_realm(dirfrag_t df, realm& r) { + realm& myr = realms[df]; for (map::iterator p = r.inodes.begin(); p != r.inodes.end(); ++p) myr.inodes[p->first] = p->second; - for (map::iterator p = r.dirs.begin(); + for (map::iterator p = r.dirs.begin(); p != r.dirs.end(); ++p) myr.dirs[p->first] = p->second; - for (map >::iterator p = r.dentries.begin(); + for (map >::iterator p = r.dentries.begin(); p != r.dentries.end(); ++p) for (map::iterator q = p->second.begin(); @@ -82,7 +78,7 @@ public: off += sizeof(nr); while (nr--) { - inodeno_t r; + dirfrag_t r; payload.copy(off, sizeof(r), (char*)&r); off += sizeof(r); @@ -93,10 +89,10 @@ public: payload.copy(off, sizeof(int), (char*)&n); off += sizeof(int); for (int i=0; i::iterator q = realms.begin(); + for (map::iterator q = realms.begin(); q != realms.end(); ++q) { payload.append((char*)&q->first, sizeof(q->first)); @@ -117,7 +113,7 @@ public: int n = q->second.dentries.size(); payload.append((char*)&n, sizeof(n)); - for (map >::iterator p = q->second.dentries.begin(); + for (map >::iterator p = q->second.dentries.begin(); p != q->second.dentries.end(); ++p) { payload.append((char*)&p->first, sizeof(p->first)); diff --git a/branches/sage/cephmds2/messages/MClientReply.h b/branches/sage/cephmds2/messages/MClientReply.h index 0f7b44bfeed84..038190a774bf3 100644 --- a/branches/sage/cephmds2/messages/MClientReply.h +++ b/branches/sage/cephmds2/messages/MClientReply.h @@ -54,13 +54,12 @@ class InodeStat { public: inode_t inode; string symlink; // symlink content (if symlink) - + fragtree_t dirfragtree; // mds distribution hints - int dir_auth; - bool hashed, replicated; - bool spec_defined; - set dist; // where am i replicated? + map dirfrag_auth; + map > dirfrag_dist; + set dirfrag_rep; public: InodeStat() {} @@ -77,49 +76,42 @@ class InodeStat { // symlink content? if (in->is_symlink()) symlink = in->symlink; + + // dirfragtree + dirfragtree = in->dirfragtree; - // replicated where? - if (in->dir && in->dir->is_auth()) { - spec_defined = true; - in->dir->get_dist_spec(this->dist, whoami); - } else - spec_defined = false; - - if (in->dir) - dir_auth = in->dir->get_dir_auth().first; - else - dir_auth = -1; - - // dir info - hashed = (in->dir && in->dir->is_hashed()); // FIXME not quite right. - replicated = (in->dir && in->dir->is_rep()); + // dirfrag info + list ls; + in->get_dirfrags(ls); + for (list::iterator p = ls.begin(); + p != ls.end(); + ++p) { + CDir *dir = *p; + dirfrag_auth[dir->dirfrag().frag] = dir->get_dir_auth().first; + if (dir->is_auth()) + dir->get_dist_spec(dirfrag_dist[dir->dirfrag().frag], whoami); + if (dir->is_rep()) + dirfrag_rep.insert(dir->dirfrag().frag); + } } void _encode(bufferlist &bl) { bl.append((char*)&inode, sizeof(inode)); - bl.append((char*)&spec_defined, sizeof(spec_defined)); - bl.append((char*)&dir_auth, sizeof(dir_auth)); - bl.append((char*)&hashed, sizeof(hashed)); - bl.append((char*)&replicated, sizeof(replicated)); - + ::_encode(dirfrag_auth, bl); + ::_encode(dirfrag_dist, bl); + ::_encode(dirfrag_rep, bl); ::_encode(symlink, bl); - ::_encode(dist, bl); // distn + dirfragtree._encode(bl); } void _decode(bufferlist &bl, int& off) { bl.copy(off, sizeof(inode), (char*)&inode); off += sizeof(inode); - bl.copy(off, sizeof(spec_defined), (char*)&spec_defined); - off += sizeof(spec_defined); - bl.copy(off, sizeof(dir_auth), (char*)&dir_auth); - off += sizeof(dir_auth); - bl.copy(off, sizeof(hashed), (char*)&hashed); - off += sizeof(hashed); - bl.copy(off, sizeof(replicated), (char*)&replicated); - off += sizeof(replicated); - + ::_decode(dirfrag_auth, bl, off); + ::_decode(dirfrag_dist, bl, off); + ::_decode(dirfrag_rep, bl, off); ::_decode(symlink, bl, off); - ::_decode(dist, bl, off); + dirfragtree._decode(bl, off); } }; diff --git a/branches/sage/cephmds2/messages/MExportDir.h b/branches/sage/cephmds2/messages/MExportDir.h index 39ecae2422a72..1d694f1a4d7d5 100644 --- a/branches/sage/cephmds2/messages/MExportDir.h +++ b/branches/sage/cephmds2/messages/MExportDir.h @@ -19,25 +19,25 @@ class MExportDir : public Message { - inodeno_t ino; + dirfrag_t dirfrag; list dirstate; // a bl for reach dir - list exports; + list exports; public: MExportDir() {} - MExportDir(inodeno_t dirino) : + MExportDir(dirfrag_t df) : Message(MSG_MDS_EXPORTDIR), - ino(dirino) { + dirfrag(df) { } virtual char *get_type_name() { return "Ex"; } void print(ostream& o) { - o << "export(" << ino << ")"; + o << "export(" << dirfrag << ")"; } - inodeno_t get_ino() { return ino; } + dirfrag_t get_dirfrag() { return dirfrag; } list& get_dirstate() { return dirstate; } - list& get_exports() { return exports; } + list& get_exports() { return exports; } void add_dir(bufferlist& dir) { dirstate.push_back(dir); @@ -45,19 +45,19 @@ class MExportDir : public Message { void set_dirstate(const list& ls) { dirstate = ls; } - void add_export(inodeno_t dirino) { - exports.push_back(dirino); + void add_export(dirfrag_t df) { + exports.push_back(df); } virtual void decode_payload() { int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); + payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); ::_decode(exports, payload, off); ::_decode(dirstate, payload, off); } virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); + payload.append((char*)&dirfrag, sizeof(dirfrag)); ::_encode(exports, payload); ::_encode(dirstate, payload); } diff --git a/branches/sage/cephmds2/messages/MExportDirAck.h b/branches/sage/cephmds2/messages/MExportDirAck.h index dc0c5abdeb478..5ae7b6e9642f7 100644 --- a/branches/sage/cephmds2/messages/MExportDirAck.h +++ b/branches/sage/cephmds2/messages/MExportDirAck.h @@ -17,27 +17,27 @@ #include "MExportDir.h" class MExportDirAck : public Message { - inodeno_t ino; + dirfrag_t dirfrag; public: - inodeno_t get_ino() { return ino; } + dirfrag_t get_dirfrag() { return dirfrag; } MExportDirAck() {} - MExportDirAck(inodeno_t i) : - Message(MSG_MDS_EXPORTDIRACK), ino(i) { } + MExportDirAck(dirfrag_t i) : + Message(MSG_MDS_EXPORTDIRACK), dirfrag(i) { } virtual char *get_type_name() { return "ExAck"; } void print(ostream& o) { - o << "export_ack(" << ino << ")"; + o << "export_ack(" << dirfrag << ")"; } virtual void decode_payload() { int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); + payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); } virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); + payload.append((char*)&dirfrag, sizeof(dirfrag)); } }; diff --git a/branches/sage/cephmds2/messages/MExportDirDiscover.h b/branches/sage/cephmds2/messages/MExportDirDiscover.h index d263cc913eba3..5a2a503524986 100644 --- a/branches/sage/cephmds2/messages/MExportDirDiscover.h +++ b/branches/sage/cephmds2/messages/MExportDirDiscover.h @@ -19,30 +19,34 @@ #include "include/types.h" class MExportDirDiscover : public Message { - inodeno_t ino; + dirfrag_t dirfrag; string path; public: - inodeno_t get_ino() { return ino; } + inodeno_t get_ino() { return dirfrag.ino; } + dirfrag_t get_dirfrag() { return dirfrag; } string& get_path() { return path; } MExportDirDiscover() {} - MExportDirDiscover(CInode *in) : + MExportDirDiscover(CDir *dir) : Message(MSG_MDS_EXPORTDIRDISCOVER) { - in->make_path(path); - ino = in->ino(); + dir->get_inode()->make_path(path); + dirfrag = dir->dirfrag(); } virtual char *get_type_name() { return "ExDis"; } + void print(ostream& o) { + o << "export_discover " << dirfrag << " " << path; + } virtual void decode_payload() { int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); + payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); ::_decode(path, payload, off); } virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); + payload.append((char*)&dirfrag, sizeof(dirfrag)); ::_encode(path, payload); } }; diff --git a/branches/sage/cephmds2/messages/MExportDirDiscoverAck.h b/branches/sage/cephmds2/messages/MExportDirDiscoverAck.h index 5d226f0f4f35b..2d962a4269195 100644 --- a/branches/sage/cephmds2/messages/MExportDirDiscoverAck.h +++ b/branches/sage/cephmds2/messages/MExportDirDiscoverAck.h @@ -19,32 +19,39 @@ #include "include/types.h" class MExportDirDiscoverAck : public Message { - inodeno_t ino; + dirfrag_t dirfrag; bool success; public: - inodeno_t get_ino() { return ino; } + inodeno_t get_ino() { return dirfrag.ino; } + dirfrag_t get_dirfrag() { return dirfrag; } bool is_success() { return success; } MExportDirDiscoverAck() {} - MExportDirDiscoverAck(inodeno_t ino, bool success=true) : - Message(MSG_MDS_EXPORTDIRDISCOVERACK) { - this->ino = ino; - this->success = false; - } - virtual char *get_type_name() { return "ExDisA"; } + MExportDirDiscoverAck(dirfrag_t df, bool s=true) : + Message(MSG_MDS_EXPORTDIRDISCOVERACK), + dirfrag(df), + success(s) { } + virtual char *get_type_name() { return "ExDisA"; } + void print(ostream& o) { + o << "export_discover_ack " << dirfrag; + if (success) + o << " success"; + else + o << " failure"; + } virtual void decode_payload() { int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); + payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); payload.copy(off, sizeof(success), (char*)&success); off += sizeof(success); } virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); + payload.append((char*)&dirfrag, sizeof(dirfrag)); payload.append((char*)&success, sizeof(success)); } }; diff --git a/branches/sage/cephmds2/messages/MExportDirNotify.h b/branches/sage/cephmds2/messages/MExportDirNotify.h index 36d8d7c01aef8..686d8052d396a 100644 --- a/branches/sage/cephmds2/messages/MExportDirNotify.h +++ b/branches/sage/cephmds2/messages/MExportDirNotify.h @@ -19,31 +19,26 @@ using namespace std; class MExportDirNotify : public Message { - inodeno_t ino; + dirfrag_t base; bool ack; pair old_auth, new_auth; - list exports; // bounds; these dirs are _not_ included (tho the inodes are) - - //list subdirs; + list bounds; // bounds; these dirs are _not_ included (tho the dirfragdes are) public: - inodeno_t get_ino() { return ino; } + dirfrag_t get_dirfrag() { return base; } pair get_old_auth() { return old_auth; } pair get_new_auth() { return new_auth; } bool wants_ack() { return ack; } - list& get_exports() { return exports; } - //list::iterator subdirs_begin() { return subdirs.begin(); } - //list::iterator subdirs_end() { return subdirs.end(); } - //int num_subdirs() { return subdirs.size(); } + list& get_bounds() { return bounds; } MExportDirNotify() {} - MExportDirNotify(inodeno_t i, bool a, pair oa, pair na) : + MExportDirNotify(dirfrag_t i, bool a, pair oa, pair na) : Message(MSG_MDS_EXPORTDIRNOTIFY), - ino(i), ack(a), old_auth(oa), new_auth(na) { } + base(i), ack(a), old_auth(oa), new_auth(na) { } virtual char *get_type_name() { return "ExNot"; } void print(ostream& o) { - o << "export_notify(" << ino; + o << "export_notify(" << base; o << " " << old_auth << " -> " << new_auth; if (ack) o << " ack)"; @@ -51,45 +46,38 @@ class MExportDirNotify : public Message { o << " no ack)"; } - /* - void copy_subdirs(list& s) { - this->subdirs = s; - } - */ - void copy_exports(list& ex) { - this->exports = ex; + void copy_bounds(list& ex) { + this->bounds = ex; } - void copy_exports(set& ex) { - for (set::iterator i = ex.begin(); + void copy_bounds(set& ex) { + for (set::iterator i = ex.begin(); i != ex.end(); ++i) - exports.push_back(*i); + bounds.push_back(*i); } - void copy_exports(set& ex) { + void copy_bounds(set& ex) { for (set::iterator i = ex.begin(); i != ex.end(); ++i) - exports.push_back((*i)->ino()); + bounds.push_back((*i)->dirfrag()); } virtual void decode_payload() { int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); + payload.copy(off, sizeof(base), (char*)&base); + off += sizeof(base); payload.copy(off, sizeof(ack), (char*)&ack); off += sizeof(ack); payload.copy(off, sizeof(old_auth), (char*)&old_auth); off += sizeof(old_auth); payload.copy(off, sizeof(new_auth), (char*)&new_auth); off += sizeof(new_auth); - ::_decode(exports, payload, off); - //::_decode(subdirs, payload, off); + ::_decode(bounds, payload, off); } virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); + payload.append((char*)&base, sizeof(base)); payload.append((char*)&ack, sizeof(ack)); payload.append((char*)&old_auth, sizeof(old_auth)); payload.append((char*)&new_auth, sizeof(new_auth)); - ::_encode(exports, payload); - //::_encode(subdirs, payload); + ::_encode(bounds, payload); } }; diff --git a/branches/sage/cephmds2/messages/MExportDirNotifyAck.h b/branches/sage/cephmds2/messages/MExportDirNotifyAck.h index e1b996717d37c..f53100a2e053c 100644 --- a/branches/sage/cephmds2/messages/MExportDirNotifyAck.h +++ b/branches/sage/cephmds2/messages/MExportDirNotifyAck.h @@ -19,29 +19,29 @@ using namespace std; class MExportDirNotifyAck : public Message { - inodeno_t ino; + dirfrag_t dirfrag; public: - inodeno_t get_ino() { return ino; } + dirfrag_t get_dirfrag() { return dirfrag; } MExportDirNotifyAck() {} - MExportDirNotifyAck(inodeno_t ino) : + MExportDirNotifyAck(dirfrag_t dirfrag) : Message(MSG_MDS_EXPORTDIRNOTIFYACK) { - this->ino = ino; + this->dirfrag = dirfrag; } virtual char *get_type_name() { return "ExNotA"; } void print(ostream& o) { - o << "export_notify_ack(" << ino << ")"; + o << "export_notify_ack(" << dirfrag << ")"; } virtual void decode_payload() { int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); + payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); } virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); + payload.append((char*)&dirfrag, sizeof(dirfrag)); } }; diff --git a/branches/sage/cephmds2/messages/MExportDirPrep.h b/branches/sage/cephmds2/messages/MExportDirPrep.h index 798f3af8d84fa..2fc85db4b67c9 100644 --- a/branches/sage/cephmds2/messages/MExportDirPrep.h +++ b/branches/sage/cephmds2/messages/MExportDirPrep.h @@ -20,40 +20,44 @@ #include "include/types.h" class MExportDirPrep : public Message { - inodeno_t ino; + dirfrag_t dirfrag; /* nested export discover payload. not all inodes will have dirs; they may require a separate discover. dentries are the links to each inode. dirs map includes base dir (ino) */ - list exports; + list exports; list inodes; - map inode_dirino; + map inode_dirfrag; map inode_dentry; - map dirs; + map > frags_by_ino; + map dirs; set bystanders; bool b_did_assim; public: - inodeno_t get_ino() { return ino; } - list& get_exports() { return exports; } + dirfrag_t get_dirfrag() { return dirfrag; } + list& get_exports() { return exports; } list& get_inodes() { return inodes; } - inodeno_t get_containing_dirino(inodeno_t ino) { - return inode_dirino[ino]; + list& get_inode_dirfrags(inodeno_t ino) { + return frags_by_ino[ino]; + } + dirfrag_t get_containing_dirfrag(inodeno_t ino) { + return inode_dirfrag[ino]; } string& get_dentry(inodeno_t ino) { return inode_dentry[ino]; } - bool have_dir(inodeno_t ino) { - return dirs.count(ino); + bool have_dir(dirfrag_t df) { + return dirs.count(df); } - CDirDiscover* get_dir(inodeno_t ino) { - return dirs[ino]; + CDirDiscover* get_dirfrag(dirfrag_t df) { + return dirs[df]; } set &get_bystanders() { return bystanders; } @@ -63,17 +67,16 @@ class MExportDirPrep : public Message { MExportDirPrep() { b_did_assim = false; } - MExportDirPrep(CInode *in) : - Message(MSG_MDS_EXPORTDIRPREP) { - ino = in->ino(); - b_did_assim = false; - } + MExportDirPrep(dirfrag_t df) : + Message(MSG_MDS_EXPORTDIRPREP), + dirfrag(df), + b_did_assim(false) { } ~MExportDirPrep() { for (list::iterator iit = inodes.begin(); iit != inodes.end(); iit++) delete *iit; - for (map::iterator dit = dirs.begin(); + for (map::iterator dit = dirs.begin(); dit != dirs.end(); dit++) delete dit->second; @@ -81,18 +84,21 @@ class MExportDirPrep : public Message { virtual char *get_type_name() { return "ExP"; } + void print(ostream& o) { + o << "export_prep(" << dirfrag << ")"; + } - - void add_export(inodeno_t dirino) { - exports.push_back( dirino ); + void add_export(dirfrag_t df) { + exports.push_back( df ); } - void add_inode(inodeno_t dirino, const string& dentry, CInodeDiscover *in) { + void add_inode(dirfrag_t df, const string& dentry, CInodeDiscover *in) { inodes.push_back(in); - inode_dirino.insert(pair(in->get_ino(), dirino)); - inode_dentry.insert(pair(in->get_ino(), dentry)); + inode_dirfrag[in->get_ino()] = df; + inode_dentry[in->get_ino()] = dentry; } void add_dir(CDirDiscover *dir) { - dirs.insert(pair(dir->get_ino(), dir)); + dirs[dir->get_dirfrag()] = dir; + frags_by_ino[dir->get_dirfrag().ino].push_back(dir->get_dirfrag().frag); } void add_bystander(int who) { bystanders.insert(who); @@ -100,11 +106,11 @@ class MExportDirPrep : public Message { virtual void decode_payload() { int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); + payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); ::_decode(exports, payload, off); - + // inodes int ni; payload.copy(off, sizeof(int), (char*)&ni); @@ -121,10 +127,13 @@ class MExportDirPrep : public Message { inode_dentry[in->get_ino()] = d; // dir ino - inodeno_t dino; - payload.copy(off, sizeof(dino), (char*)&dino); - off += sizeof(dino); - inode_dirino[in->get_ino()] = dino; + dirfrag_t df; + payload.copy(off, sizeof(df), (char*)&df); + off += sizeof(df); + inode_dirfrag[in->get_ino()] = df; + + // child frags + ::_decode(frags_by_ino[in->get_ino()], payload, off); } // dirs @@ -134,14 +143,14 @@ class MExportDirPrep : public Message { for (int i=0; i_decode(payload, off); - dirs[dir->get_ino()] = dir; + dirs[dir->get_dirfrag()] = dir; } ::_decode(bystanders, payload, off); } virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); + payload.append((char*)&dirfrag, sizeof(dirfrag)); ::_encode(exports, payload); @@ -157,14 +166,17 @@ class MExportDirPrep : public Message { _encode(inode_dentry[(*iit)->get_ino()], payload); // dir ino - inodeno_t ino = inode_dirino[(*iit)->get_ino()]; - payload.append((char*)&ino, sizeof(ino)); + dirfrag_t df = inode_dirfrag[(*iit)->get_ino()]; + payload.append((char*)&df, sizeof(df)); + + // child frags + ::_encode(frags_by_ino[(*iit)->get_ino()], payload); } // dirs int nd = dirs.size(); payload.append((char*)&nd, sizeof(int)); - for (map::iterator dit = dirs.begin(); + for (map::iterator dit = dirs.begin(); dit != dirs.end(); dit++) dit->second->_encode(payload); diff --git a/branches/sage/cephmds2/messages/MExportDirPrepAck.h b/branches/sage/cephmds2/messages/MExportDirPrepAck.h index 6baa20fd15ee2..38735d263f3e8 100644 --- a/branches/sage/cephmds2/messages/MExportDirPrepAck.h +++ b/branches/sage/cephmds2/messages/MExportDirPrepAck.h @@ -18,26 +18,28 @@ #include "include/types.h" class MExportDirPrepAck : public Message { - inodeno_t ino; + dirfrag_t dirfrag; public: - inodeno_t get_ino() { return ino; } + dirfrag_t get_dirfrag() { return dirfrag; } MExportDirPrepAck() {} - MExportDirPrepAck(inodeno_t ino) : - Message(MSG_MDS_EXPORTDIRPREPACK) { - this->ino = ino; - } + MExportDirPrepAck(dirfrag_t df) : + Message(MSG_MDS_EXPORTDIRPREPACK), + dirfrag(df) { } virtual char *get_type_name() { return "ExPAck"; } + void print(ostream& o) { + o << "export_prep_ack(" << dirfrag << ")"; + } virtual void decode_payload() { int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); + payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); } virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); + payload.append((char*)&dirfrag, sizeof(dirfrag)); } }; diff --git a/branches/sage/cephmds2/messages/MMDSImportMap.h b/branches/sage/cephmds2/messages/MMDSImportMap.h index c7575809ee2e9..abf728878e6c6 100644 --- a/branches/sage/cephmds2/messages/MMDSImportMap.h +++ b/branches/sage/cephmds2/messages/MMDSImportMap.h @@ -21,8 +21,8 @@ class MMDSImportMap : public Message { public: - map > imap; - map > ambiguous_imap; + map > imap; + map > ambiguous_imap; MMDSImportMap() : Message(MSG_MDS_IMPORTMAP) {} @@ -34,14 +34,14 @@ class MMDSImportMap : public Message { << " imports)"; } - void add_import(inodeno_t im) { + void add_import(dirfrag_t im) { imap[im].clear(); } - void add_import_export(inodeno_t im, inodeno_t ex) { + void add_import_export(dirfrag_t im, dirfrag_t ex) { imap[im].push_back(ex); } - void add_ambiguous_import(inodeno_t im, const list& m) { + void add_ambiguous_import(dirfrag_t im, const list& m) { ambiguous_imap[im] = m; } -- 2.39.5