From ac311657639e33f3aeb0ec7263530c53d47bebcf Mon Sep 17 00:00:00 2001 From: sageweil Date: Fri, 12 Oct 2007 03:23:46 +0000 Subject: [PATCH] fixed bug in trim_non_auth that broke rejoin; fixed up handle_mds_map to be more robust/clean git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1925 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/mds/cmds.cc | 2 +- branches/sage/mds/mds/CDir.h | 2 + branches/sage/mds/mds/Locker.cc | 6 +- branches/sage/mds/mds/MDCache.cc | 38 +++++++++- branches/sage/mds/mds/MDLog.cc | 1 + branches/sage/mds/mds/MDS.cc | 93 +++++++++++------------- branches/sage/mds/msg/SimpleMessenger.cc | 1 + 7 files changed, 85 insertions(+), 58 deletions(-) diff --git a/branches/sage/mds/cmds.cc b/branches/sage/mds/cmds.cc index 2fe19901ff767..6e475ad4b588d 100644 --- a/branches/sage/mds/cmds.cc +++ b/branches/sage/mds/cmds.cc @@ -101,7 +101,7 @@ int main(int argc, char **argv) mds->mds_lock.Unlock(); // done - delete mds; + //delete mds; return 0; } diff --git a/branches/sage/mds/mds/CDir.h b/branches/sage/mds/mds/CDir.h index 0552cbb144024..9878438d24b60 100644 --- a/branches/sage/mds/mds/CDir.h +++ b/branches/sage/mds/mds/CDir.h @@ -61,6 +61,7 @@ class CDir : public MDSCacheObject { static const int PIN_IMPORTBOUND = 9; static const int PIN_EXPORTBOUND = 10; static const int PIN_STICKY = 11; + static const int PIN_SUBTREETEMP = 12; // used by MDCache::trim_non_auth() const char *pin_name(int p) { switch (p) { case PIN_DNWAITER: return "dnwaiter"; @@ -73,6 +74,7 @@ class CDir : public MDSCacheObject { case PIN_IMPORTBOUND: return "importbound"; case PIN_EXPORTBOUND: return "exportbound"; case PIN_STICKY: return "sticky"; + case PIN_SUBTREETEMP: return "subtreetemp"; default: return generic_pin_name(p); } } diff --git a/branches/sage/mds/mds/Locker.cc b/branches/sage/mds/mds/Locker.cc index 3cde2bb5e45e8..e7655cdd4d97d 100644 --- a/branches/sage/mds/mds/Locker.cc +++ b/branches/sage/mds/mds/Locker.cc @@ -1627,9 +1627,11 @@ void Locker::scatter_try_unscatter(ScatterLock *lock, Context *c) assert(!lock->get_parent()->is_ambiguous_auth()); // request unscatter? - if (lock->get_state() == LOCK_SCATTER) + int auth = lock->get_parent()->authority().first; + if (lock->get_state() == LOCK_SCATTER && + mds->mdsmap->get_state(auth) >= MDSMap::STATE_ACTIVE) mds->send_message_mds(new MLock(lock, LOCK_AC_REQUNSCATTER, mds->get_nodeid()), - lock->get_parent()->authority().first, MDS_PORT_LOCKER); + auth, MDS_PORT_LOCKER); // wait... lock->add_waiter(SimpleLock::WAIT_STABLE, c); diff --git a/branches/sage/mds/mds/MDCache.cc b/branches/sage/mds/mds/MDCache.cc index be6f046f7667c..b3a7c024750d9 100644 --- a/branches/sage/mds/mds/MDCache.cc +++ b/branches/sage/mds/mds/MDCache.cc @@ -428,6 +428,10 @@ void MDCache::adjust_subtree_auth(CDir *dir, pair auth) */ void MDCache::adjust_export_state(CDir *dir) { + dout(15) << "adjust_export_state, me " << mds->get_nodeid() << dendl; + dout(15) << " inode " << dir->get_inode()->authority().first << " " << *dir->get_inode() << dendl; + dout(15) << " dir " << dir->authority().first << " " << *dir << dendl; + // be auth bit agnostic, so that we work during recovery // (before recalc_auth_bits) if (dir->authority().first != mds->get_nodeid() && @@ -1145,6 +1149,13 @@ void MDCache::handle_mds_failure(int who) rejoin_sent.erase(who); // i need to send another rejoin_ack_gather.erase(who); // i'll need/get another. + + dout(10) << " wants_resolve " << wants_resolve << dendl; + dout(10) << " got_resolve " << got_resolve << dendl; + dout(10) << " rejoin_sent " << rejoin_sent << dendl; + dout(10) << " rejoin_gather " << rejoin_gather << dendl; + dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl; + // tell the migrator too. migrator->handle_mds_failure_or_stop(who); @@ -1393,9 +1404,9 @@ void MDCache::maybe_resolve_finish() else { dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl; disambiguate_imports(); - if (mds->is_resolve()) { recalc_auth_bits(); + show_subtrees(0); trim_non_auth(); mds->resolve_done(); } @@ -1670,6 +1681,8 @@ void MDCache::rejoin_send_rejoins() map rejoins; + show_subtrees(0); + // encode cap list once. bufferlist cap_export_bl; if (mds->is_rejoin()) { @@ -1841,13 +1854,14 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) p != dir->items.end(); ++p) { CDentry *dn = p->second; - assert(dn->is_primary()); dout(15) << " add_weak_primary_dentry " << *dn << dendl; + assert(dn->is_primary()); + assert(dn->inode->is_dir()); rejoin->add_weak_primary_dentry(dir->dirfrag(), p->first, dn->get_inode()->ino()); dn->get_inode()->get_nested_dirfrags(nested); if (dn->get_inode()->dirlock.is_updated()) { - // include full inode to shed our dirtyscattered state + // include full inode to shed any dirtyscattered state rejoin->add_full_inode(dn->get_inode()->inode, dn->get_inode()->symlink, dn->get_inode()->dirfragtree); @@ -2014,6 +2028,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) p != weak->weak.end(); ++p) { CDir *dir = get_dirfrag(p->first); + if (!dir) dout(0) << " missing dirfrag " << p->first << dendl; assert(dir); int nonce = dir->add_replica(from); @@ -3133,8 +3148,11 @@ void MDCache::trim_dirfrag(CDir *dir, CDir *con, map& expire } } - if (dir->is_subtree_root()) + if (dir->is_subtree_root()) { + assert(!dir->is_auth() || + (!dir->is_replicated() && dir->inode->is_base())); remove_subtree(dir); // remove from subtree map + } in->close_dirfrag(dir->dirfrag().frag); } @@ -3215,6 +3233,12 @@ void MDCache::trim_non_auth() { dout(7) << "trim_non_auth" << dendl; + // temporarily pin all subtree roots + for (map >::iterator p = subtrees.begin(); + p != subtrees.end(); + p++) + p->first->get(CDir::PIN_SUBTREETEMP); + // note first auth item we see. // when we see it the second time, stop. CDentry *first_auth = 0; @@ -3294,6 +3318,12 @@ void MDCache::trim_non_auth() // move everything in the pintail to the top bit of the lru. lru.lru_touch_entire_pintail(); + // unpin all subtrees + for (map >::iterator p = subtrees.begin(); + p != subtrees.end(); + p++) + p->first->put(CDir::PIN_SUBTREETEMP); + show_subtrees(); } diff --git a/branches/sage/mds/mds/MDLog.cc b/branches/sage/mds/mds/MDLog.cc index c4f9b73eb0647..fc7cdffbe6e10 100644 --- a/branches/sage/mds/mds/MDLog.cc +++ b/branches/sage/mds/mds/MDLog.cc @@ -361,6 +361,7 @@ void MDLog::_expired(LogSegment *ls) num_events -= ls->num_events; journaler->set_expire_pos(ls->offset); // this was the oldest segment, adjust expire pos + journaler->write_head(0); logger->set("expos", ls->offset); logger->inc("segtrm"); diff --git a/branches/sage/mds/mds/MDS.cc b/branches/sage/mds/mds/MDS.cc index 7790df97c239f..6fc8ef46d9039 100644 --- a/branches/sage/mds/mds/MDS.cc +++ b/branches/sage/mds/mds/MDS.cc @@ -498,23 +498,13 @@ void MDS::handle_mds_map(MMDSMap *m) return; } - // note some old state + // keep old map, for a moment + MDSMap *oldmap = mdsmap; int oldwhoami = whoami; int oldstate = state; - set oldresolve; - mdsmap->get_mds_set(oldresolve, MDSMap::STATE_RESOLVE); - bool wasrejoining = mdsmap->is_rejoining(); - set oldfailed; - mdsmap->get_mds_set(oldfailed, MDSMap::STATE_FAILED); - set oldactive; - mdsmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE); - set oldcreating; - mdsmap->get_mds_set(oldcreating, MDSMap::STATE_CREATING); - set oldstopped; - mdsmap->get_mds_set(oldstopped, MDSMap::STATE_STOPPED); - bool wasdegraded = mdsmap->is_degraded(); // decode and process + mdsmap = new MDSMap; mdsmap->decode(m->get_encoded()); // see who i am @@ -546,7 +536,6 @@ void MDS::handle_mds_map(MMDSMap *m) messenger->send_message(new MOSDGetMap(0), monmap->get_inst(mon)); } - } // tell objecter my incarnation @@ -596,20 +585,20 @@ void MDS::handle_mds_map(MMDSMap *m) return; } } - + // RESOLVE // is someone else newly resolving? if (is_resolve() || is_rejoin() || is_active() || is_stopping()) { - set resolve; + set oldresolve, resolve; + oldmap->get_mds_set(oldresolve, MDSMap::STATE_RESOLVE); mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE); if (oldresolve != resolve) { dout(10) << "resolve set is " << resolve << ", was " << oldresolve << dendl; - for (set::iterator p = resolve.begin(); p != resolve.end(); ++p) { - if (*p == whoami) continue; - if (oldresolve.count(*p)) continue; - mdcache->send_resolve(*p); // now or later. - } + for (set::iterator p = resolve.begin(); p != resolve.end(); ++p) + if (*p != whoami && + oldresolve.count(*p) == 0) + mdcache->send_resolve(*p); // now or later. } } @@ -617,55 +606,56 @@ void MDS::handle_mds_map(MMDSMap *m) // is everybody finally rejoining? if (is_rejoin() || is_active() || is_stopping()) { // did we start? - if (!wasrejoining && mdsmap->is_rejoining()) + if (!oldmap->is_rejoining() && mdsmap->is_rejoining()) rejoin_joint_start(); // did we finish? if (g_conf.mds_dump_cache_after_rejoin && - wasrejoining && !mdsmap->is_rejoining()) + oldmap->is_rejoining() && !mdsmap->is_rejoining()) mdcache->dump_cache(); // for DEBUG only } - if (wasdegraded && !mdsmap->is_degraded()) + if (oldmap->is_degraded() && !mdsmap->is_degraded() && state >= MDSMap::STATE_ACTIVE) dout(1) << "cluster recovered." << dendl; - + // did someone go active? if (is_active() || is_stopping()) { - set active; + set oldactive, active; + oldmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE); mdsmap->get_mds_set(active, MDSMap::STATE_ACTIVE); - for (set::iterator p = active.begin(); p != active.end(); ++p) { - if (*p == whoami) continue; // not me - if (oldactive.count(*p)) continue; // newly so? - handle_mds_recovery(*p); - } + for (set::iterator p = active.begin(); p != active.end(); ++p) + if (*p != whoami && // not me + oldactive.count(*p) == 0) // newly so? + handle_mds_recovery(*p); } + // did someone fail or stop? if (is_active() || is_stopping()) { - // did anyone go down? - set failed; + // new failed? + set oldfailed, failed; + oldmap->get_mds_set(oldfailed, MDSMap::STATE_FAILED); mdsmap->get_mds_set(failed, MDSMap::STATE_FAILED); - for (set::iterator p = failed.begin(); p != failed.end(); ++p) { - if (oldfailed.count(*p)) continue; // newly so? - mdcache->handle_mds_failure(*p); - } + for (set::iterator p = failed.begin(); p != failed.end(); ++p) + if (oldfailed.count(*p) == 0) + mdcache->handle_mds_failure(*p); + + // or down then up? + // did their addr/inst change? + set up; + mdsmap->get_up_mds_set(up); + for (set::iterator p = up.begin(); p != up.end(); ++p) + if (oldmap->have_inst(*p) && + oldmap->get_inst(*p) != mdsmap->get_inst(*p)) + mdcache->handle_mds_failure(*p); // did anyone stop? - set stopped; + set oldstopped, stopped; + oldmap->get_mds_set(oldstopped, MDSMap::STATE_STOPPED); mdsmap->get_mds_set(stopped, MDSMap::STATE_STOPPED); - for (set::iterator p = stopped.begin(); p != stopped.end(); ++p) { - if (oldstopped.count(*p)) continue; // newly so? - mdcache->migrator->handle_mds_failure_or_stop(*p); - } + for (set::iterator p = stopped.begin(); p != stopped.end(); ++p) + if (oldstopped.count(*p) == 0) // newly so? + mdcache->migrator->handle_mds_failure_or_stop(*p); } - - // in set set changed? - /* - if (state >= MDSMap::STATE_ACTIVE && // only if i'm active+. otherwise they'll get map during reconnect. - mdsmap->get_same_in_set_since() > last_client_mdsmap_bcast) { - bcast_mds_map(); - } - */ - // just got mdsmap+osdmap? if (hadepoch == 0 && mdsmap->get_epoch() > 0 && @@ -677,6 +667,7 @@ void MDS::handle_mds_map(MMDSMap *m) } delete m; + delete oldmap; } void MDS::bcast_mds_map() diff --git a/branches/sage/mds/msg/SimpleMessenger.cc b/branches/sage/mds/msg/SimpleMessenger.cc index 4b94297559315..f9ecd890f90ba 100644 --- a/branches/sage/mds/msg/SimpleMessenger.cc +++ b/branches/sage/mds/msg/SimpleMessenger.cc @@ -86,6 +86,7 @@ int Rank::Accepter::start() dout(10) << "accepter.start" << dendl; char hostname[100]; + memset(hostname, 0, 100); gethostname(hostname, 100); dout(2) << "accepter.start my hostname is " << hostname << dendl; -- 2.39.5