From c23e3c15aeadd993d209220752f1fe824b323fd6 Mon Sep 17 00:00:00 2001 From: sageweil Date: Thu, 5 Jul 2007 16:26:26 +0000 Subject: [PATCH] * partial start on rejoin rework git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1478 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/cephmds2/TODO | 53 ++------ branches/sage/cephmds2/mds/MDCache.cc | 126 ++++++++++-------- .../sage/cephmds2/messages/MMDSCacheRejoin.h | 41 +++--- 3 files changed, 104 insertions(+), 116 deletions(-) diff --git a/branches/sage/cephmds2/TODO b/branches/sage/cephmds2/TODO index b08cfdaffd15c..6c79cd849c641 100644 --- a/branches/sage/cephmds2/TODO +++ b/branches/sage/cephmds2/TODO @@ -27,7 +27,7 @@ some smallish projects: code cleanup - endian portability - word size - - clean up all encoded structures + - clean up all encoded structures general kernel planning - soft consistency on lookup? @@ -48,6 +48,12 @@ sage doc sage mds +- fix rejoin + - validate dirfrag/dentry/inode connectivity + - carefully document rejoin + - cases + - confounding factors + - unlink needs to journal on witnesses (probably), since unlinked inodes may be in those journals - stray reintegration @@ -84,16 +90,13 @@ sage mds - failures during migration.. what about client stale/reap stuff and misplaced WR caps? - inode.max_size +- inode.allocated_size - real chdir (directory "open") - relative metadata ops - - - osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics. -- incremental mdsmaps? - - EMetablob should return 'expired' if they have higher versions (and are thus described by a newer journal entry) - dir version/committed/etc versus migration, log expires. - DOCUMENT. @@ -113,46 +116,6 @@ sage mds - - - - -foreign rename -- question: can we generalize foreign and local rename? -- initiated by dest. - - if we get into race with lock acquisition, drop locks and forward to new dest. -- how to do pre-auth pinning? - - is it sufficient to wait on, then grab, all local auth pins, _then_ do foreign locks? - - local auth pins can hold subtrees in freezing state, preventing exports, and additional auth_pins. - - so, wait, then grab all local auth_pins, - - then work on locks in proper order (*), - - if we detect we are missing a local auth_pin (i.e. migration race), drop all auth_pins and wait/restart - - need to more carefully look at lock dependencies to avoid deadlock... - - establish a complete full ordering on locks, based on any lock dependencies? - - is it possible to "leak" locks, e.g. get inode_hard lock, work on something else, but inode moves and we dont notice? - - pin paths for those locks? - - can we pin when we choose order, so that locks are sure to proceed? -- we can change active_requests to key of reqid (also unique), and use the same key for foreign locks - - clean up dentry_xlock_request.. just merge it into destroy_xlock_start, if !is_auth(). -- renamer will - - check preconditions (i.e. i am dest) - - grab all locks (avoiding deadlock) - - verify preconditions are still true, else forward/retry (actually, this already happens w/ the way we structure the lock acquisition code...) - - prepare foreign bits (using foreign request_auth_pins, locks, etc.) - - source unlink, - - anchortable update (if source is anchored), - - dest nlink-- (if dest is remote link on foreign host) - - make sure replicas have either both source+dest pinned in cache (or neither...) - - use foreign request_pins? - - log update - - do update locally - - async commit + unlock -- rejoin will need to explicitly resolve uncommitted items. - - fully implement link/unlink first, and use that as a model? - -monitor -- finish generic paxos - osdmon - distribute w/ paxos framework - allow fresh replacement osds. add osd_created in osdmap, probably diff --git a/branches/sage/cephmds2/mds/MDCache.cc b/branches/sage/cephmds2/mds/MDCache.cc index e6e0a48281544..6681a18d5ae89 100644 --- a/branches/sage/cephmds2/mds/MDCache.cc +++ b/branches/sage/cephmds2/mds/MDCache.cc @@ -1552,6 +1552,12 @@ void MDCache::recalc_auth_bits() show_cache(); } + + +// =========================================================================== +// REJOIN + + /* * rejoin phase! * we start out by sending rejoins to everyone in the recovery set. @@ -1659,33 +1665,35 @@ void MDCache::cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) { dout(10) << "cache_rejoin_walk " << *dir << endl; - //if (mds->is_rejoin()) - rejoin->add_weak_dirfrag(dir->dirfrag()); - //else - //rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce()); - + // walk dirfrag's dentries list nested; // finish this dir, then do nested items - // walk dentries + rejoin->add_weak_dirfrag(dir->dirfrag()); + for (map::iterator p = dir->items.begin(); p != dir->items.end(); ++p) { // dentry CDentry *dn = p->second; - if (mds->is_rejoin()) - rejoin->add_weak_dentry(dir->dirfrag(), p->first); - else { + if (mds->is_rejoin()) { + // weak + if (dn->is_null()) { + rejoin->add_weak_null_dentry(dir->dirfrag(), p->first); + } else if (dn->is_primary()) { + rejoin->add_weak_primary_dentry(dir->dirfrag(), p->first, dn->get_inode()->ino()); + } else { + rejoin->add_weak_remote_dentry(dir->dirfrag(), p->first, dn->get_remote_ino()); + } + } else { + // strong rejoin->add_strong_dentry(dir->dirfrag(), p->first, dn->get_replica_nonce(), dn->lock.get_state()); } - - // inode? - if (dn->is_primary() && dn->get_inode()) { - CInode *in = dn->get_inode(); - if (mds->is_rejoin() && in->get_caps_wanted() == 0) - rejoin->add_weak_inode(in->ino()); - else { + + if (dn->is_primary()) { + // strong inode? + if (!mds->is_rejoin() || dn->get_inode()->get_caps_wanted()) rejoin->add_strong_inode(in->ino(), in->get_replica_nonce(), in->get_caps_wanted(), in->authlock.get_state(), @@ -1693,7 +1701,6 @@ void MDCache::cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) in->dirfragtreelock.get_state(), in->filelock.get_state(), in->dirlock.get_state()); - } // dirfrags in this subtree? list dfs; @@ -1705,7 +1712,7 @@ void MDCache::cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) nested.push_back(*p); } } - + // recurse into nested dirs for (list::iterator p = nested.begin(); p != nested.end(); @@ -1760,37 +1767,53 @@ void MDCache::handle_cache_rejoin_rejoin(MMDSCacheRejoin *m) dout(10) << "i am active. removing stale cache replicas" << endl; // first, scour cache of unmentioned replica references + + // FIXME: what about root and stray inodes. + assert(0); // fixme this is broken re incorrectly corrected inode. + + for (hash_map::iterator p = inode_map.begin(); p != inode_map.end(); ++p) { - // inode CInode *in = p->second; - if (in->is_replica(from) && m->weak_inodes.count(p->first) == 0) { - inode_remove_replica(in, from); - dout(10) << " rem " << *in << endl; - } - - // dentry - if (in->parent) { - CDentry *dn = in->parent; - if (dn->is_replica(from) && - (m->weak_dentries.count(dn->get_dir()->dirfrag()) == 0 || - m->weak_dentries[dn->get_dir()->dirfrag()].count(dn->get_name()) == 0)) { - dentry_remove_replica(dn, from); - dout(10) << " rem " << *dn << endl; - } - } - - // dir + if (!in->is_dir()) continue; + list dfs; in->get_dirfrags(dfs); for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { CDir *dir = *p; - if (dir->is_replica(from) && m->weak_dirfrags.count(dir->dirfrag()) == 0) { - dir->remove_replica(from); - dout(10) << " rem " << *dir << endl; + + if (!dir->is_replica(from) || + m->weak_dirfrags.count(dir->dirfrag())) + continue; + + dir->remove_replica(from); + dout(10) << " rem " << *dir << endl; + + // dentries + for (hash_map::iterator p = dir->entries.begin(); + p != entries.end(); + ++p) { + CDentry *dn = p->second; + + if (dn->is_replica(from) && + (m->weak_dentries.count(dn->get_dir()->dirfrag()) == 0 || + m->weak_dentries[dn->get_dir()->dirfrag()].count(dn->get_name()) == 0)) { + dentry_remove_replica(dn, from); + dout(10) << " rem " << *dn << endl; + + // inode? + if (!dn->is_primary()) + continue; + + CInode *in = dn->get_inode(); + if (in->is_replica(from)) { + inode_remove_replica(in, from); + dout(10) << " rem " << *in << endl; + } + } } } } @@ -1800,10 +1823,10 @@ void MDCache::handle_cache_rejoin_rejoin(MMDSCacheRejoin *m) } // dirs - for (set::iterator p = m->weak_dirfrags.begin(); - p != m->weak_dirfrags.end(); + for (map >::iterator p = m->weak_dentires.begin(); + p != m->weak_dentries.end(); ++p) { - CDir *dir = get_dirfrag(*p); + CDir *dir = get_dirfrag(p->first); if (dir) { int nonce = dir->add_replica(from); dout(10) << " have " << *dir << endl; @@ -1811,10 +1834,10 @@ void MDCache::handle_cache_rejoin_rejoin(MMDSCacheRejoin *m) ack->add_strong_dirfrag(*p, nonce); // dentries - for (set::iterator q = m->weak_dentries[*p].begin(); - q != m->weak_dentries[*p].end(); + for (map::iterator q = p->second.begin(); + q != p->second.end(); ++q) { - CDentry *dn = dir->lookup(*q); + CDentry *dn = dir->lookup(q->first); if (dn) { int nonce = dn->add_replica(from); dout(10) << " have " << *dn << endl; @@ -1945,8 +1968,8 @@ void MDCache::handle_cache_rejoin_rejoin(MMDSCacheRejoin *m) mdr->locks.insert(lock); } } - for (map >::iterator p = m->xlocked_dentries.begin(); - p != m->xlocked_dentries.end(); + for (map >::iterator p = m->authpinned_dentries.begin(); + p != m->authpinned_dentries.end(); ++p) { CDir *dir = get_dirfrag(p->first); if (!dir) continue; // already missing, from above. @@ -3099,23 +3122,18 @@ void MDCache::dispatch(Message *m) { switch (m->get_type()) { + // RESOLVE case MSG_MDS_IMPORTMAP: handle_import_map((MMDSImportMap*)m); break; - case MSG_MDS_RESOLVEACK: handle_resolve_ack((MMDSResolveAck*)m); break; + // REJOIN case MSG_MDS_CACHEREJOIN: handle_cache_rejoin((MMDSCacheRejoin*)m); break; - /* - case MSG_MDS_CACHEREJOINACK: - handle_cache_rejoin_ack((MMDSCacheRejoinAck*)m); - break; - */ - case MSG_MDS_DISCOVER: handle_discover((MDiscover*)m); diff --git a/branches/sage/cephmds2/messages/MMDSCacheRejoin.h b/branches/sage/cephmds2/messages/MMDSCacheRejoin.h index 42f62244b996d..22f5c78235bcc 100644 --- a/branches/sage/cephmds2/messages/MMDSCacheRejoin.h +++ b/branches/sage/cephmds2/messages/MMDSCacheRejoin.h @@ -83,20 +83,28 @@ class MMDSCacheRejoin : public Message { dn_strong(int n, int l) : nonce(n), lock(l) {} }; + struct dn_weak { + inodeno_t ino; + inodeno_t remote_ino; + }; + // -- data -- int32_t op; - set weak_inodes; + // weak + map > weak_dentries; + + // strong map strong_inodes; + map strong_dirfrags; + map > strong_dentries; + + // full list full_inodes; + + // authpins, xlocks map authpinned_inodes; map > xlocked_inodes; - - set weak_dirfrags; - map strong_dirfrags; - - map > weak_dentries; - map > strong_dentries; map > authpinned_dentries; map > xlocked_dentries; @@ -112,9 +120,6 @@ class MMDSCacheRejoin : public Message { // -- builders -- // inodes - void add_weak_inode(inodeno_t ino) { - weak_inodes.insert(ino); - } void add_strong_inode(inodeno_t i, int n, int cw, int a, int l, int dft, int f, int dl) { strong_inodes[i] = inode_strong(n, cw, a, l, dft, f, dl); } @@ -130,15 +135,21 @@ class MMDSCacheRejoin : public Message { // dirfrags void add_weak_dirfrag(dirfrag_t df) { - weak_dirfrags.insert(df); + weak_dentires[df]; } void add_strong_dirfrag(dirfrag_t df, int n) { strong_dirfrags[df] = dirfrag_strong(n); } // dentries - void add_weak_dentry(dirfrag_t df, const string& dname) { - weak_dentries[df].insert(dname); + void add_weak_null_dentry(dirfrag_t df, const string& dname) { + weak_dentries[df][dname] = dn_weak(0, 0); + } + void add_weak_primary_dentry(dirfrag_t df, const string& dname, inodeno_t ino) { + weak_dentries[df][dname] = dn_weak(ino, 0); + } + void add_weak_remote_dentry(dirfrag_t df, const string& dname, inodeno_t ino) { + weak_dentries[df][dname] = dn_weak(0, ino); } void add_strong_dentry(dirfrag_t df, const string& dname, int n, int ls) { strong_dentries[df][dname] = dn_strong(n, ls); @@ -153,7 +164,6 @@ class MMDSCacheRejoin : public Message { // -- encoding -- void encode_payload() { ::_encode(op, payload); - ::_encode(weak_inodes, payload); ::_encode(strong_inodes, payload); uint32_t nfull = full_inodes.size(); @@ -163,7 +173,6 @@ class MMDSCacheRejoin : public Message { ::_encode(authpinned_inodes, payload); ::_encode(xlocked_inodes, payload); - ::_encode(weak_dirfrags, payload); ::_encode(strong_dirfrags, payload); ::_encode(weak_dentries, payload); ::_encode(strong_dentries, payload); @@ -173,7 +182,6 @@ class MMDSCacheRejoin : public Message { void decode_payload() { int off = 0; ::_decode(op, payload, off); - ::_decode(weak_inodes, payload, off); ::_decode(strong_inodes, payload, off); uint32_t nfull; @@ -183,7 +191,6 @@ class MMDSCacheRejoin : public Message { ::_decode(authpinned_inodes, payload, off); ::_decode(xlocked_inodes, payload, off); - ::_decode(weak_dirfrags, payload, off); ::_decode(strong_dirfrags, payload, off); ::_decode(weak_dentries, payload, off); ::_decode(strong_dentries, payload, off); -- 2.39.5