From b57474a371e0c6a85fdb6ffca338ed5b323353a3 Mon Sep 17 00:00:00 2001 From: sageweil Date: Fri, 20 Jul 2007 22:06:33 +0000 Subject: [PATCH] fragtree, fragset work; migrator/cache dirfrag bounds cleanup git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1539 29311d96-e01e-0410-9327-a35deaab8ce9 --- trunk/ceph/TODO | 16 +- trunk/ceph/include/frag.h | 279 +++++++++++++++++++++++++++++---- trunk/ceph/mds/CDir.cc | 2 + trunk/ceph/mds/CInode.cc | 23 ++- trunk/ceph/mds/CInode.h | 6 +- trunk/ceph/mds/Locker.cc | 15 +- trunk/ceph/mds/MDCache.cc | 42 +++-- trunk/ceph/mds/MDCache.h | 15 +- trunk/ceph/mds/MDS.cc | 20 --- trunk/ceph/mds/Migrator.cc | 188 +++++++++++----------- trunk/ceph/mds/Migrator.h | 2 +- trunk/ceph/mds/Server.cc | 75 +++------ trunk/ceph/mds/Server.h | 3 +- trunk/ceph/mds/journal.cc | 10 +- trunk/ceph/messages/MOSDPing.h | 22 ++- 15 files changed, 468 insertions(+), 250 deletions(-) diff --git a/trunk/ceph/TODO b/trunk/ceph/TODO index f56d68b1d8f1e..a4a61ea8da65f 100644 --- a/trunk/ceph/TODO +++ b/trunk/ceph/TODO @@ -58,13 +58,19 @@ sage mds *** - should get_dirfrags(frag_t) return partial matches? bc we might have the two frags listed separately even tho they've merged.. - - dirfragtree is lazily consistent. no lock. bcast by primary when it updates. - - CDir is never request pinned - - add a CInode sticky_dir flag to somehow pin all cdirs on the fly. - - STICKY dir state and pin? make sure it's kept across import/export/fragment - - pull _bound maps out of Migrator; they are redundant (trust the subtree map!) + - fragset_t to describe bounds; we need to tolerate concurrent merge/splits +/ - fragtree_t +/ - get_leaves(fg, ls) needs to be smarter +/ - force_to_leaf() + +/ - CDir is never request pinned +/ - add a CInode sticky_dir flag to somehow pin all cdirs on the fly. +/ - STICKY dir state and pin? make sure it's kept across import/export/fragment +/ - pull _bound maps out of Migrator; they are redundant (trust the subtree map!) 
- auth journals and applies update in the request update pipeline + - dirfragtree is lazily consistent. no lock. bcast by primary when it updates. + - bcast to dir replicas - inode auth will journal inode update separately/lazily - also on handle_resolve(), if there is a mismatch. diff --git a/trunk/ceph/include/frag.h b/trunk/ceph/include/frag.h index 641aa0707ee4c..d58f91927ce80 100644 --- a/trunk/ceph/include/frag.h +++ b/trunk/ceph/include/frag.h @@ -33,9 +33,9 @@ * * this vaguely resembles a btree, in that when a fragment becomes large or small * we can split or merge, except that there is no guarantee of being balanced. + * * presumably we are partitioning the output of a (perhaps specialized) hash * function. - * */ /** @@ -51,6 +51,9 @@ * we write it as v/b, where v is a value and b is the number of bits. * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that, * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on. + * + * this makes the right most bit of v the "most significant", which is the + * opposite of what we usually see. */ typedef uint32_t _frag_t; @@ -64,7 +67,8 @@ class frag_t { public: frag_t() : _enc(0) { } - frag_t(unsigned v, unsigned b) : _enc((b << 24) + v) { } + frag_t(unsigned v, unsigned b) : _enc((b << 24) + + (v & ((1u << b) - 1))) { } /* keep only the low b bits of v, so a v/b frag never carries stray value bits */ frag_t(_frag_t e) : _enc(e) { } // constructors @@ -81,7 +85,7 @@ class frag_t { return (v & mask()) == value(); } bool contains(frag_t sub) const { - return (sub.bits() >= bits() && // they are more specific than us, + return (sub.bits() >= bits() && // they are at least as specific as us, (sub.value() & mask()) == value()); // and they are contained by us. 
} bool is_root() const { @@ -93,18 +97,34 @@ class frag_t { } // splitting - frag_t left_half() const { - return frag_t(value(), bits()+1); - } - frag_t right_half() const { - return frag_t(value() | (1<& fragments) const { assert(nb > 0); unsigned nway = 1 << (nb-1); for (unsigned i=0; i 0); + return frag_t(_enc ^ (1 << (bits()-1))); + } + bool is_left() const { + return + bits() > 0 && + ((value() & (1 << (bits()-1))) == 0); /* parenthesized: == binds tighter than & */ + } + bool is_right() const { + return + bits() > 0 && + ((value() & (1 << (bits()-1))) != 0); /* top bit set; the masked result is 1<<(bits()-1), not 1 */ + } + frag_t left_child() const { + return frag_t(value(), bits()+1); + } + frag_t right_child() const { + return frag_t(value() | (1<: @@ -125,6 +143,13 @@ class fragtree_t { std::map _splits; public: + // ------------- + // basics + void swap(fragtree_t& other) { + _splits.swap(other._splits); + } + + // ------------- // accessors bool empty() { return _splits.empty(); @@ -136,10 +161,28 @@ class fragtree_t { else return p->second; } + + + bool is_leaf(frag_t x) const { + list ls; + get_leaves_under_split(x, ls); + if (!ls.empty() && + ls.front() == x) + return true; + return false; + } + + /** + * get_leaves -- list all leaves + */ void get_leaves(list& ls) const { - get_leaves(frag_t(), ls); + return get_leaves_under_split(frag_t(), ls); } - void get_leaves(frag_t under, list& ls) const { + + /** + * get_leaves_under_split -- list all leaves under a known split point (or root) + */ + void get_leaves_under_split(frag_t under, list& ls) const { list q; q.push_back(under); while (!q.empty()) { @@ -152,23 +195,80 @@ class fragtree_t { ls.push_back(t); // not spit, it's a leaf. } } - bool contains(frag_t fg) const { + + /** + * get_branch -- get branch point for frag @x + * - may be @x itself, if @x is a split + * - may be root (frag_t()) + */ + frag_t get_branch(frag_t x) const { + while (1) { + if (x == frag_t()) return x; // root + if (get_split(x)) return x; // found it! 
+ x = x.parent(); + } + } + /** + * get_branch_or_leaf -- get branch or leaf point parent for frag @x + * - may be @x itself, if @x is a split or leaf + * - may be root (frag_t()) + */ + frag_t get_branch_or_leaf(frag_t x) const { + frag_t branch = get_branch(x); + int nb = get_split(branch); + if (nb > 0 && // if branch is a split, and + branch.bits() + nb <= x.bits()) // one of the children is or contains x + return frag_t(x.value() & ((1u << (branch.bits()+nb)) - 1), branch.bits()+nb); // then return that child (it's a leaf); frag_t takes (value, bits), value cut to the child's width + else + return branch; + } + + /** + * get_leaves_under(x, ls) -- search for any leaves fully contained by x + */ + void get_leaves_under(frag_t x, list& ls) const { list q; - q.push_back(frag_t()); + q.push_back(get_branch(x)); + while (!q.empty()) { + frag_t t = q.front(); + q.pop_front(); + if (t.bits() >= x.bits() && // if t is more specific than x, and + !x.contains(t)) // x does not contain t, + continue; // then skip + int nb = get_split(t); + if (nb) + t.split(nb, q); // queue up children + else + ls.push_back(t); // not split, it's a leaf. + } + } + + /** + * contains(fg) -- does fragtree contain the specific frag @x + */ + bool contains(frag_t x) const { + list q; + q.push_back(get_branch(x)); while (!q.empty()) { frag_t t = q.front(); q.pop_front(); + if (t.bits() >= x.bits() && // if t is more specific than x, and + !x.contains(t)) // x does not contain t, + continue; // then skip int nb = get_split(t); if (nb) { - if (t == fg) return false; // it's split. + if (t == x) return false; // it's split. t.split(nb, q); // queue up children } else { - if (t == fg) return true; // it's there. + if (t == x) return true; // it's there. } } return false; } + /** + * operator[] -- map a (hash?) 
value to a frag + */ frag_t operator[](unsigned v) const { frag_t t; while (1) { @@ -191,18 +291,62 @@ class fragtree_t { assert(i < nway); } } - + + + // --------------- // modifiers - void swap(fragtree_t& other) { - _splits.swap(other._splits); + void split(frag_t x, int b) { + assert(is_leaf(x)); + _splits[x] = b; } - void split(frag_t hb, int b) { - assert(_splits.count(hb) == 0); - _splits[hb] = b; + void merge(frag_t x, int b) { + assert(!is_leaf(x)); + assert(_splits[x] == b); + _splits.erase(x); } - void merge(frag_t hb, int b) { - assert(_splits[hb] == b); - _splits.erase(hb); + + void force_to_leaf(frag_t x) { + assert(!is_leaf(x)); + + frag_t parent = get_branch_or_leaf(x); + assert(parent.bits() <= x.bits()); + + // do we need to split from parent to x? + if (parent.bits() < x.bits()) { + int spread = x.bits() - parent.bits(); + int nb = get_split(parent); + if (nb == 0) { + // easy: split parent (a leaf) by the difference + split(parent, spread); + return; + } + assert(nb > spread); + + // add an intermediary split + merge(parent, nb); + split(parent, spread); + + list subs; + parent.split(spread, subs); + for (list::iterator p = subs.begin(); + p != subs.end(); + ++p) + split(*p, nb - spread); + } + + // x is now a leaf or split. + // hoover up any children. + list q; + q.push_back(x); + while (!q.empty()) { + frag_t t = q.front(); + q.pop_front(); + int nb = get_split(t); + if (nb) { + merge(t, nb); // merge this point, and + t.split(nb, q); // queue up children + } + } } // verify that we describe a legal partition of the namespace. @@ -230,13 +374,35 @@ class fragtree_t { void _decode(bufferlist& bl, int& off) { ::_decode(_splits, bl, off); } + + void print(ostream& out) { + out << "fragtree_t("; + list q; + q.push_back(frag_t()); + while (!q.empty()) { + frag_t t = q.front(); + q.pop_front(); + // newline + indent? 
+ if (t.bits()) { + out << endl; + for (unsigned i=0; i q; q.push_back(frag_t()); while (!q.empty()) { @@ -244,10 +410,7 @@ inline ostream& operator<<(ostream& out, fragtree_t& ft) q.pop_front(); int nb = ft.get_split(t); if (nb) { - if (first) - first = false; - else - out << ' '; + if (t.bits()) out << ' '; out << t << '%' << nb; t.split(nb, q); // queue up children } @@ -255,4 +418,56 @@ inline ostream& operator<<(ostream& out, fragtree_t& ft) return out << ")"; } + +/** + * fragset_t -- a set of fragments + */ +class fragset_t { + set _set; + +public: + set &get() { return _set; } + set::iterator begin() { return _set.begin(); } + set::iterator end() { return _set.end(); } + + bool empty() const { return _set.empty(); } + + bool contains(frag_t f) const { + while (1) { + if (_set.count(f)) return true; + if (f.bits() == 0) return false; + f = f.parent(); + } + } + + void insert(frag_t f) { + _set.insert(f); + simplify(); + } + + void simplify() { + while (1) { + bool clean = true; + set::iterator p = _set.begin(); + while (p != _set.end()) { + if (_set.count(p->get_sibling())) { + _set.erase(p->get_sibling()); + _set.insert(p->parent()); + _set.erase(p++); + clean = false; + } else { + p++; + } + } + if (clean) + break; + } + } +}; + +inline ostream& operator<<(ostream& out, fragset_t& fs) +{ + return out << "fragset_t(" << fs.get() << ")" << endl; +} + #endif diff --git a/trunk/ceph/mds/CDir.cc b/trunk/ceph/mds/CDir.cc index 99078ff563166..8c1f83db9aa81 100644 --- a/trunk/ceph/mds/CDir.cc +++ b/trunk/ceph/mds/CDir.cc @@ -363,6 +363,8 @@ void CDir::steal_dentry(CDentry *dn) items[dn->name] = dn; + if (nitems == 0) + get(PIN_CHILD); nitems++; if (dn->is_null()) nnull++; diff --git a/trunk/ceph/mds/CInode.cc b/trunk/ceph/mds/CInode.cc index 9ede5b6d5360a..402d50e679dfa 100644 --- a/trunk/ceph/mds/CInode.cc +++ b/trunk/ceph/mds/CInode.cc @@ -133,10 +133,10 @@ frag_t CInode::pick_dirfrag(const string& dn) return dirfragtree[H(dn)]; } -void 
CInode::get_dirfrags(frag_t fg, list& ls) +void CInode::get_dirfrags_under(frag_t fg, list& ls) { list fglist; - dirfragtree.get_leaves(fg, fglist); + dirfragtree.get_leaves_under(fg, fglist); for (list::iterator p = fglist.begin(); p != fglist.end(); ++p) @@ -301,14 +301,21 @@ void CInode::fragment_dir(frag_t basefrag, int bits) f->state_set(base->get_state() & (CDir::STATE_DIRTY | CDir::STATE_COMPLETE | - CDir::STATE_FROZENDIR)); + CDir::STATE_FROZENDIR | + CDir::STATE_EXPORT | + CDir::STATE_EXPORTBOUND | + CDir::STATE_IMPORTBOUND | + CDir::STATE_STICKY | + 0)); + if (f->state_test(CDir::STATE_DIRTY)) f->get(CDir::PIN_DIRTY); + if (f->state_test(CDir::STATE_FROZENDIR)) f->get(CDir::PIN_FROZEN); + if (f->state_test(CDir::STATE_EXPORT)) f->get(CDir::PIN_EXPORT); + if (f->state_test(CDir::STATE_EXPORTBOUND)) f->get(CDir::PIN_EXPORTBOUND); + if (f->state_test(CDir::STATE_IMPORTBOUND)) f->get(CDir::PIN_IMPORTBOUND); + if (f->state_test(CDir::STATE_STICKY)) f->get(CDir::PIN_STICKY); + f->set_version(base->get_version()); - if (base->state_test(CDir::STATE_EXPORT)) { - f->state_set(CDir::STATE_EXPORT); - f->get(CDir::PIN_EXPORT); - } - // dup replica map f->replica_map = base->replica_map; diff --git a/trunk/ceph/mds/CInode.h b/trunk/ceph/mds/CInode.h index b7adff9f69dbb..56e16e8346f5c 100644 --- a/trunk/ceph/mds/CInode.h +++ b/trunk/ceph/mds/CInode.h @@ -120,8 +120,8 @@ class CInode : public MDSCacheObject { // inode contents proper inode_t inode; // the inode itself string symlink; // symlink dest, if symlink - fragtree_t dirfragtree; // dir frag tree, if any - map dirfrag_size; // size of each dirfrag + fragtree_t dirfragtree; // dir frag tree, if any. always consistent with our dirfrag map. 
+ //map dirfrag_size; // size of each dirfrag off_t last_open_journaled; // log offset for the last journaled EOpen @@ -154,7 +154,7 @@ public: else return 0; } - void get_dirfrags(frag_t fg, list& ls); + void get_dirfrags_under(frag_t fg, list& ls); void get_dirfrags(list& ls); void get_nested_dirfrags(list& ls); void get_subtree_dirfrags(list& ls); diff --git a/trunk/ceph/mds/Locker.cc b/trunk/ceph/mds/Locker.cc index 418bbf3116b90..190bd5618faf8 100644 --- a/trunk/ceph/mds/Locker.cc +++ b/trunk/ceph/mds/Locker.cc @@ -747,12 +747,19 @@ SimpleLock *Locker::get_lock(int lock_type, MDSCacheObjectInfo &info) switch (lock_type) { case LOCK_OTYPE_DN: { - CDir *dir = mdcache->get_dirfrag(info.dirfrag); + // be careful; info.dirfrag may have incorrect frag; recalculate based on dname. + CInode *diri = mdcache->get_inode(info.dirfrag.ino); + frag_t fg; + CDir *dir = 0; CDentry *dn = 0; - if (dir) - dn = dir->lookup(info.dname); + if (diri) { + fg = diri->pick_dirfrag(info.dname); + dir = diri->get_dirfrag(fg); + if (dir) + dn = dir->lookup(info.dname); + } if (!dn) { - dout(7) << "get_lock don't have dn " << info.dirfrag << " " << info.dname << endl; + dout(7) << "get_lock don't have dn " << info.dirfrag.ino << " " << info.dname << endl; return 0; } return &dn->lock; diff --git a/trunk/ceph/mds/MDCache.cc b/trunk/ceph/mds/MDCache.cc index 1d432f0449536..8451c0836bc30 100644 --- a/trunk/ceph/mds/MDCache.cc +++ b/trunk/ceph/mds/MDCache.cc @@ -3989,14 +3989,20 @@ CDir *MDCache::path_traverse_to_dir(filepath& path) } - -void MDCache::open_remote_dir(CInode *diri, frag_t fg, Context *fin) +/** + * open_remote_dir -- open up a remote dirfrag + * + * @diri - base inode + * @approxfg - approximate fragment. 
+ * @fin - completion callback + */ +void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, Context *fin) { dout(10) << "open_remote_dir on " << *diri << endl; assert(diri->is_dir()); assert(!diri->is_auth()); - assert(diri->get_dirfrag(fg) == 0); + assert(diri->get_dirfrag(approxfg) == 0); int auth = diri->authority().first; @@ -4007,13 +4013,13 @@ void MDCache::open_remote_dir(CInode *diri, frag_t fg, Context *fin) diri->ino(), want, true); // need the base dir open - dis->set_base_dir_frag(fg); + dis->set_base_dir_frag(approxfg); mds->send_message_mds(dis, auth, MDS_PORT_CACHE); dir_discovers[diri->ino()].insert(auth); diri->add_waiter(CInode::WAIT_DIR, fin); } else { // mds is down or recovering. forge a replica! - forge_replica_dir(diri, fg, auth); + forge_replica_dir(diri, approxfg, auth); } } @@ -4046,6 +4052,19 @@ CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequest *mdr) } } +class C_MDC_RetryOpenRemoteIno : public Context { + MDCache *mdcache; + inodeno_t ino; + MDRequest *mdr; + Context *onfinish; +public: + C_MDC_RetryOpenRemoteIno(MDCache *mdc, inodeno_t i, MDRequest *r, Context *c) : + mdcache(mdc), ino(i), mdr(r), onfinish(c) {} + void finish(int r) { + mdcache->open_remote_ino(ino, mdr, onfinish); + } +}; + class C_MDC_OpenRemoteIno : public Context { MDCache *mdcache; @@ -4126,8 +4145,9 @@ void MDCache::open_remote_ino_2(inodeno_t ino, if (!in->is_auth()) { dout(10) << "opening remote dirfrag " << frag << " under " << *in << endl; - open_remote_dir(in, frag, - new C_MDC_OpenRemoteIno(this, ino, anchortrace, mdr, onfinish)); + /* FIXME: we re-query the anchortable just to avoid a fragtree update race */ + open_remote_dirfrag(in, frag, + new C_MDC_RetryOpenRemoteIno(this, ino, mdr, onfinish)); return; } @@ -5268,7 +5288,9 @@ void MDCache::handle_dir_update(MDirUpdate *m) // discover it? if (m->should_discover()) { - m->tried_discover(); // only once! + // only try once! 
+ // this is key to avoid a fragtree update race, among other things. + m->tried_discover(); vector trace; filepath path = m->get_path(); @@ -5283,8 +5305,8 @@ void MDCache::handle_dir_update(MDirUpdate *m) CInode *in = get_inode(m->get_dirfrag().ino); assert(in); - open_remote_dir(in, m->get_dirfrag().frag, - new C_MDS_RetryMessage(mds, m)); + open_remote_dirfrag(in, m->get_dirfrag().frag, + new C_MDS_RetryMessage(mds, m)); return; } diff --git a/trunk/ceph/mds/MDCache.h b/trunk/ceph/mds/MDCache.h index 6f2dbee4293ea..88513bd8c34c9 100644 --- a/trunk/ceph/mds/MDCache.h +++ b/trunk/ceph/mds/MDCache.h @@ -91,6 +91,7 @@ struct MDRequest { // -- my pins and locks -- // cache pins (so things don't expire) set< MDSCacheObject* > pins; + set stickydirs; // auth pins set< MDSCacheObject* > auth_pins; @@ -156,6 +157,12 @@ struct MDRequest { pins.insert(o); } } + void set_stickydirs(CInode *in) { + if (stickydirs.count(in) == 0) { + in->get_stickydirs(); + stickydirs.insert(in); + } + } // auth pins bool is_auth_pinned(MDSCacheObject *object) { @@ -438,10 +445,12 @@ public: if (!have_inode(df.ino)) return NULL; return inode_map[df.ino]->get_dirfrag(df.frag); } - void get_dirfrags(dirfrag_t df, list& ls) { + /* + void get_dirfrags_under(dirfrag_t df, list& ls) { if (have_inode(df.ino)) - inode_map[df.ino]->get_dirfrags(df.frag, ls); + inode_map[df.ino]->get_dirfrags_under(df.frag, ls); } + */ MDSCacheObject *get_object(MDSCacheObjectInfo &info); @@ -518,7 +527,7 @@ public: } CDir *path_traverse_to_dir(filepath& path); - void open_remote_dir(CInode *diri, frag_t fg, Context *fin); + void open_remote_dirfrag(CInode *diri, frag_t fg, Context *fin); CInode *get_dentry_inode(CDentry *dn, MDRequest *mdr); void open_remote_ino(inodeno_t ino, MDRequest *mdr, Context *fin); void open_remote_ino_2(inodeno_t ino, MDRequest *mdr, diff --git a/trunk/ceph/mds/MDS.cc b/trunk/ceph/mds/MDS.cc index cce42573f2e70..8c267271edfbf 100644 --- a/trunk/ceph/mds/MDS.cc +++ b/trunk/ceph/mds/MDS.cc @@ 
-1218,26 +1218,6 @@ void MDS::my_dispatch(Message *m) - // HACK to force export to test foreign renames - if (false && whoami == 0) { - /* - static bool didit = false; - - // 7 to 1 - CInode *in = mdcache->get_inode(1001); - if (in && in->is_dir() && !didit) { - CDir *dir = in->get_or_open_dir(mdcache); - if (dir->is_auth()) { - dout(1) << "FORCING EXPORT" << endl; - mdcache->migrator->export_dir(dir,1); - didit = true; - } - } - */ - } - - - // shut down? if (is_stopping()) { if (mdcache->shutdown_pass()) { diff --git a/trunk/ceph/mds/Migrator.cc b/trunk/ceph/mds/Migrator.cc index 050fa09a88271..0e47de3432421 100644 --- a/trunk/ceph/mds/Migrator.cc +++ b/trunk/ceph/mds/Migrator.cc @@ -1275,13 +1275,8 @@ void Migrator::handle_export_discover(MExportDirDiscover *m) } // yay - import_discovered(in, df); - delete m; -} - -void Migrator::import_discovered(CInode *in, dirfrag_t df) -{ - dout(7) << "import_discovered " << df << " inode " << *in << endl; + + dout(7) << "handle_export_discover have " << df << " inode " << *in << endl; // pin inode in the cache (for now) assert(in->is_dir()); @@ -1314,12 +1309,21 @@ void Migrator::handle_export_cancel(MExportDirCancel *m) void Migrator::handle_export_prep(MExportDirPrep *m) { - CInode *diri = cache->get_inode(m->get_dirfrag().ino); - assert(diri); - int oldauth = m->get_source().num(); assert(oldauth != mds->get_nodeid()); + // make sure we didn't abort + if (import_state.count(m->get_dirfrag()) == 0 || + import_state[m->get_dirfrag()] != IMPORT_DISCOVERED || + import_peer[m->get_dirfrag()] != oldauth) { + dout(10) << "handle_export_prep import has aborted, dropping" << endl; + delete m; + return; + } + + CInode *diri = cache->get_inode(m->get_dirfrag().ino); + assert(diri); + list finished; // assimilate root dir. 
@@ -1336,9 +1340,16 @@ void Migrator::handle_export_prep(MExportDirPrep *m) dout(7) << "handle_export_prep on " << *dir << " (subsequent pass)" << endl; } assert(dir->is_auth() == false); - + cache->show_subtrees(); + // build bound map + map bound_dirfragset; + for (list::iterator p = m->get_bounds().begin(); + p != m->get_bounds().end(); + ++p) + bound_dirfragset[p->ino].insert(p->frag); + // assimilate contents? if (!m->did_assim()) { dout(7) << "doing assim on " << *dir << endl; @@ -1390,85 +1401,82 @@ void Migrator::handle_export_prep(MExportDirPrep *m) } } - // open export dirs/bounds? - for (list::iterator it = m->get_bounds().begin(); - it != m->get_bounds().end(); - it++) { - dout(7) << " checking bound " << hex << *it << dec << endl; - CInode *in = cache->get_inode(it->ino); + // make bound sticky + for (map::iterator p = bound_dirfragset.begin(); + p != bound_dirfragset.end(); + ++p) { + CInode *in = cache->get_inode(p->first); assert(in); - - CDir *dir = cache->get_dirfrag(*it); - if (!dir) { - dout(7) << " opening nested export on " << *in << endl; - cache->open_remote_dir(in, it->frag, - new C_MDS_RetryMessage(mds, m)); - } + in->get_stickydirs(); + dout(7) << " set stickydirs on bound inode " << *in << endl; } + } else { dout(7) << " not doing assim on " << *dir << endl; } - - // verify we have all bounds + if (!finished.empty()) + mds->queue_waiters(finished); + + + // open all bounds set import_bounds; - int waiting_for = 0; - for (list::iterator it = m->get_bounds().begin(); - it != m->get_bounds().end(); - it++) { - dirfrag_t df = *it; - CInode *in = cache->get_inode(df.ino); + for (map::iterator p = bound_dirfragset.begin(); + p != bound_dirfragset.end(); + ++p) { + CInode *in = cache->get_inode(p->first); assert(in); + + // map fragset into a frag_t list, based on the inode fragtree list fglist; - in->dirfragtree.get_leaves(df.frag, fglist); + for (set::iterator q = p->second.begin(); q != p->second.end(); ++q) + 
in->dirfragtree.get_leaves_under(*q, fglist); + dout(10) << " bound inode " << p->first << " fragset " << p->second << " maps to " << fglist << endl; + for (list::iterator q = fglist.begin(); q != fglist.end(); ++q) { - CDir *bound = cache->get_dirfrag(dirfrag_t(df.ino, *q)); - if (bound) { - if (!bound->state_test(CDir::STATE_IMPORTBOUND)) { - dout(7) << " pinning import bound " << *bound << endl; - bound->get(CDir::PIN_IMPORTBOUND); - bound->state_set(CDir::STATE_IMPORTBOUND); - import_bounds.insert(bound); - } else { - dout(7) << " already pinned import bound " << *bound << endl; - } + CDir *bound = cache->get_dirfrag(dirfrag_t(p->first, *q)); + if (!bound) { + dout(7) << " opening bounding dirfrag " << *q << " on " << *in << endl; + cache->open_remote_dirfrag(in, *q, + new C_MDS_RetryMessage(mds, m)); + return; + } + + if (!bound->state_test(CDir::STATE_IMPORTBOUND)) { + dout(7) << " pinning import bound " << *bound << endl; + bound->get(CDir::PIN_IMPORTBOUND); + bound->state_set(CDir::STATE_IMPORTBOUND); + import_bounds.insert(bound); } else { - dout(7) << " waiting for nested export dir on " << *cache->get_inode(df.ino) << endl; - waiting_for++; + dout(7) << " already pinned import bound " << *bound << endl; } } } - if (waiting_for) { - dout(7) << " waiting for " << waiting_for << " nested export dir opens" << endl; - } else { - dout(7) << " all ready, noting auth and freezing import region" << endl; - - // note that i am an ambiguous auth for this subtree. - // specify bounds, since the exporter explicitly defines the region. - cache->adjust_bounded_subtree_auth(dir, import_bounds, - pair(oldauth, mds->get_nodeid())); - cache->verify_subtree_bounds(dir, import_bounds); - - // freeze. - dir->_freeze_tree(); - - // ok! 
- dout(7) << " sending export_prep_ack on " << *dir << endl; - mds->send_message_mds(new MExportDirPrepAck(dir->dirfrag()), - m->get_source().num(), MDS_PORT_MIGRATOR); - - // note new state - import_state[dir->dirfrag()] = IMPORT_PREPPED; - - // done - delete m; - } + dout(7) << " all ready, noting auth and freezing import region" << endl; + + // note that i am an ambiguous auth for this subtree. + // specify bounds, since the exporter explicitly defines the region. + cache->adjust_bounded_subtree_auth(dir, import_bounds, + pair(oldauth, mds->get_nodeid())); + cache->verify_subtree_bounds(dir, import_bounds); + + // freeze. + dir->_freeze_tree(); + + // ok! + dout(7) << " sending export_prep_ack on " << *dir << endl; + mds->send_message_mds(new MExportDirPrepAck(dir->dirfrag()), + m->get_source().num(), MDS_PORT_MIGRATOR); + + // note new state + import_state[dir->dirfrag()] = IMPORT_PREPPED; + + // done + delete m; - // finish waiters - finish_contexts(finished, 0); } @@ -1670,24 +1678,35 @@ void Migrator::import_reverse_unfreeze(CDir *dir) import_reverse_unpin(dir); } -void Migrator::import_reverse_unpin(CDir *dir) +void Migrator::import_remove_pins(CDir *dir) { - dout(7) << "import_reverse_unpin " << *dir << endl; - - // remove importing pin + // root dir->put(CDir::PIN_IMPORTING); dir->state_clear(CDir::STATE_IMPORTING); - // remove bound pins + // bounds set bounds; cache->get_subtree_bounds(dir, bounds); + set didinodes; for (set::iterator it = bounds.begin(); it != bounds.end(); it++) { CDir *bd = *it; bd->put(CDir::PIN_IMPORTBOUND); bd->state_clear(CDir::STATE_IMPORTBOUND); + CInode *bdi = bd->get_inode(); + if (didinodes.count(bdi) == 0) { + bdi->put_stickydirs(); + didinodes.insert(bdi); + } } +} + +void Migrator::import_reverse_unpin(CDir *dir) +{ + dout(7) << "import_reverse_unpin " << *dir << endl; + + import_remove_pins(dir); // clean up import_state.erase(dir->dirfrag()); @@ -1732,20 +1751,7 @@ void Migrator::import_finish(CDir *dir, bool now) 
mds->mdlog->submit_entry(new EImportFinish(dir, true)); // remove pins - dir->put(CDir::PIN_IMPORTING); - dir->state_clear(CDir::STATE_IMPORTING); - - set bounds; - cache->get_subtree_bounds(dir, bounds); - for (set::iterator it = bounds.begin(); - it != bounds.end(); - it++) { - CDir *bd = *it; - - // remove bound pin - bd->put(CDir::PIN_IMPORTBOUND); - bd->state_clear(CDir::STATE_IMPORTBOUND); - } + import_remove_pins(dir); // unfreeze dir->unfreeze_tree(); diff --git a/trunk/ceph/mds/Migrator.h b/trunk/ceph/mds/Migrator.h index 2a537738998d9..60db3d89a4832 100644 --- a/trunk/ceph/mds/Migrator.h +++ b/trunk/ceph/mds/Migrator.h @@ -215,7 +215,6 @@ public: // importer void handle_export_discover(MExportDirDiscover *m); void handle_export_cancel(MExportDirCancel *m); - void import_discovered(CInode *in, dirfrag_t df); void handle_export_prep(MExportDirPrep *m); void handle_export_dir(MExportDir *m); @@ -231,6 +230,7 @@ public: public: void import_reverse(CDir *dir, bool fix_dir_auth=true); protected: + void import_remove_pins(CDir *dir); void import_reverse_unfreeze(CDir *dir); void import_reverse_unpin(CDir *dir); void import_notify_abort(CDir *dir); diff --git a/trunk/ceph/mds/Server.cc b/trunk/ceph/mds/Server.cc index 3208a5b0a205c..086f9cadefece 100644 --- a/trunk/ceph/mds/Server.cc +++ b/trunk/ceph/mds/Server.cc @@ -868,8 +868,7 @@ CDir *Server::validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dn // which dirfrag? 
frag_t fg = diri->pick_dirfrag(dname); - - CDir *dir = try_open_auth_dir(diri, fg, mdr); + CDir *dir = try_open_auth_dirfrag(diri, fg, mdr); if (!dir) return 0; @@ -1161,37 +1160,41 @@ CDentry* Server::rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mus -CDir* Server::try_open_auth_dir(CInode *diri, frag_t fg, MDRequest *mdr) +/** + * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth + * + * @diri base inode + * @fg the exact frag we want + * @mdr request + */ +CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequest *mdr) { CDir *dir = diri->get_dirfrag(fg); // not open and inode not mine? if (!dir && !diri->is_auth()) { int inauth = diri->authority().first; - dout(7) << "try_open_auth_dir: not open, not inode auth, fw to mds" << inauth << endl; + dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds" << inauth << endl; mdcache->request_forward(mdr, inauth); return 0; } // not open and inode frozen? if (!dir && diri->is_frozen_dir()) { - dout(10) << "try_open_auth_dir: dir inode is frozen, waiting " << *diri << endl; + dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << endl; assert(diri->get_parent_dir()); diri->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); return 0; } // invent? - if (!dir) { - assert(diri->is_auth()); + if (!dir) dir = diri->get_or_open_dirfrag(mds->mdcache, fg); - } - assert(dir); // am i auth for the dirfrag? 
if (!dir->is_auth()) { int auth = dir->authority().first; - dout(7) << "try_open_auth_dir: not auth for " << *dir + dout(7) << "try_open_auth_dirfrag: not auth for " << *dir << ", fw to mds" << auth << endl; mdcache->request_forward(mdr, auth); return 0; @@ -1200,38 +1203,6 @@ CDir* Server::try_open_auth_dir(CInode *diri, frag_t fg, MDRequest *mdr) return dir; } -/* -CDir* Server::try_open_dir(CInode *diri, frag_t fg, MDRequest *mdr) -{ - CDir *dir = diri->get_dirfrag(fg); - if (dir) - return dir; - - if (diri->is_auth()) { - // auth - // not open and inode frozen? - if (!dir && diri->is_frozen_dir()) { - dout(10) << "try_open_dir: dir inode is auth+frozen, waiting " << *diri << endl; - assert(diri->get_parent_dir()); - diri->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // invent? - if (!dir) { - assert(diri->is_auth()); - dir = diri->get_or_open_dirfrag(mds->mdcache, fg); - } - assert(dir); - return dir; - } else { - // not auth - mdcache->open_remote_dir(diri, fg, - new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } -} -*/ /** predirty_dn_diri @@ -1522,14 +1493,14 @@ void Server::handle_client_readdir(MDRequest *mdr) // which frag? frag_t fg = req->args.readdir.frag; - // does it exist? + // does the frag exist? if (diri->dirfragtree[fg] != fg) { dout(10) << "frag " << fg << " doesn't appear in fragtree " << diri->dirfragtree << endl; reply_request(mdr, -EAGAIN); return; } - CDir *dir = try_open_auth_dir(diri, fg, mdr); + CDir *dir = try_open_auth_dirfrag(diri, fg, mdr); if (!dir) return; // ok! @@ -2517,9 +2488,7 @@ bool Server::_verify_rmdir(MDRequest *mdr, CInode *in) for (list::iterator p = frags.begin(); p != frags.end(); ++p) { - CDir *dir = in->get_dirfrag(*p); - if (!dir) - dir = in->get_or_open_dirfrag(mdcache, *p); + CDir *dir = in->get_or_open_dirfrag(mdcache, *p); assert(dir); // dir looks empty but incomplete? 
@@ -2708,18 +2677,18 @@ void Server::handle_client_rename(MDRequest *mdr) if (srcdn->is_primary() && !srcdn->is_auth() && srci->is_dir()) { - dout(10) << "srci is remote dir, opening all frags" << endl; + dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << endl; + mdr->set_stickydirs(srci); + list frags; srci->dirfragtree.get_leaves(frags); for (list::iterator p = frags.begin(); p != frags.end(); ++p) { CDir *dir = srci->get_dirfrag(*p); - if (dir) { - dout(10) << " opened " << *dir << endl; - mdr->pin(dir); - } else { - mdcache->open_remote_dir(srci, *p, new C_MDS_RetryRequest(mdcache, mdr)); + if (!dir) { + dout(10) << " opening " << *p << " on " << *srci << endl; /* dir is null on this path: log the frag and inode, never *dir */ + mdcache->open_remote_dirfrag(srci, *p, new C_MDS_RetryRequest(mdcache, mdr)); return; } } diff --git a/trunk/ceph/mds/Server.h b/trunk/ceph/mds/Server.h index 59d00e1fa777b..2a32c1b41b968 100644 --- a/trunk/ceph/mds/Server.h +++ b/trunk/ceph/mds/Server.h @@ -81,8 +81,7 @@ public: CInode* rdlock_path_pin_ref(MDRequest *mdr, bool want_auth); CDentry* rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mustexist); - CDir* try_open_auth_dir(CInode *diri, frag_t fg, MDRequest *mdr); - //CDir* try_open_dir(CInode *diri, frag_t fg, MDRequest *mdr); + CDir* try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequest *mdr); version_t predirty_dn_diri(MDRequest *mdr, CDentry *dn, class EMetaBlob *blob); void dirty_dn_diri(CDentry *dn, version_t dirpv, utime_t mtime); diff --git a/trunk/ceph/mds/journal.cc b/trunk/ceph/mds/journal.cc index b52f61b7f2c40..196c16f81156c 100644 --- a/trunk/ceph/mds/journal.cc +++ b/trunk/ceph/mds/journal.cc @@ -936,11 +936,11 @@ void EPurgeFinish::replay(MDS *mds) bool EExport::has_expired(MDS *mds) { CDir *dir = mds->mdcache->get_dirfrag(base); - if (!dir) return true; - if (!mds->mdcache->migrator->is_exporting(dir)) - return true; - dout(10) << "EExport.has_expired still exporting " << *dir << endl; - return false; + if (dir && 
mds->mdcache->migrator->is_exporting(dir)) { + dout(10) << "EExport.has_expired still exporting " << *dir << endl; + return false; + } + return true; } void EExport::expire(MDS *mds, Context *c) diff --git a/trunk/ceph/messages/MOSDPing.h b/trunk/ceph/messages/MOSDPing.h index 739875479749d..dda21888c31d7 100644 --- a/trunk/ceph/messages/MOSDPing.h +++ b/trunk/ceph/messages/MOSDPing.h @@ -36,23 +36,19 @@ class MOSDPing : public Message { virtual void decode_payload() { int off = 0; - payload.copy(off, sizeof(map_epoch), (char*)&map_epoch); - off += sizeof(map_epoch); - payload.copy(off, sizeof(ack), (char*)&ack); - off += sizeof(ack); - payload.copy(off, sizeof(avg_qlen), (char*)&avg_qlen); - off += sizeof(avg_qlen); - payload.copy(off, sizeof(read_mean_time), (char*)&read_mean_time); - off += sizeof(read_mean_time); + ::_decode(map_epoch, payload, off); + ::_decode(ack, payload, off); + ::_decode(avg_qlen, payload, off); + ::_decode(read_mean_time, payload, off); } virtual void encode_payload() { - payload.append((char*)&map_epoch, sizeof(map_epoch)); - payload.append((char*)&ack, sizeof(ack)); - payload.append((char*)&avg_qlen, sizeof(avg_qlen)); - payload.append((char*)&read_mean_time, sizeof(read_mean_time)); + ::_encode(map_epoch, payload); + ::_encode(ack, payload); + ::_encode(avg_qlen, payload); + ::_encode(read_mean_time, payload); } - virtual char *get_type_name() { return "oping"; } + virtual char *get_type_name() { return "osd_ping"; } }; #endif -- 2.39.5