From acc862d454a30759a60d008be378bf5d81574ddd Mon Sep 17 00:00:00 2001 From: sageweil Date: Wed, 15 Aug 2007 21:48:58 +0000 Subject: [PATCH] lots of migrator bugfixes, refactoring, cleanup. needs more testing, esp bystander vs ambiguous imports git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1638 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/mds/TODO | 2 + branches/sage/mds/config.cc | 4 +- branches/sage/mds/mds/CDir.h | 4 +- branches/sage/mds/mds/MDCache.cc | 375 +++++++------------ branches/sage/mds/mds/MDS.cc | 6 +- branches/sage/mds/mds/Migrator.cc | 25 +- branches/sage/mds/mds/Migrator.h | 3 - branches/sage/mds/mds/Server.cc | 9 + branches/sage/mds/mds/events/EMetaBlob.h | 15 +- branches/sage/mds/mds/journal.cc | 14 +- branches/sage/mds/mds/mdstypes.h | 1 + branches/sage/mds/messages/MMDSCacheRejoin.h | 21 +- branches/sage/mds/mon/MDSMonitor.cc | 9 +- 13 files changed, 201 insertions(+), 287 deletions(-) diff --git a/branches/sage/mds/TODO b/branches/sage/mds/TODO index b2e8df3e69553..c93cd3f962b89 100644 --- a/branches/sage/mds/TODO +++ b/branches/sage/mds/TODO @@ -51,6 +51,8 @@ sage doc sage mds +- fix server unlink .. needs to use slave_requests to clean up any failures during the resolve stage + - the split/merge plan: - hmm, should we move ESubtreeMap out of the journal? diff --git a/branches/sage/mds/config.cc b/branches/sage/mds/config.cc index f4cae3d1522b7..7e7a93c8981df 100644 --- a/branches/sage/mds/config.cc +++ b/branches/sage/mds/config.cc @@ -179,7 +179,7 @@ md_config_t g_conf = { mds_decay_halflife: 10, mds_beacon_interval: 5, //30.0, - mds_beacon_grace: 30, //60*60.0, + mds_beacon_grace: 10, //60*60.0, mds_log: true, mds_log_max_len: MDS_CACHE_SIZE / 3, @@ -199,7 +199,7 @@ md_config_t g_conf = { mds_bal_merge_size: 50, mds_bal_merge_rd: 1000, mds_bal_merge_wr: 1000, - mds_bal_interval: 30, // seconds + mds_bal_interval: 3000, // seconds mds_bal_fragment_interval: 5, // seconds mds_bal_idle_threshold: .1, mds_bal_max: -1, diff --git a/branches/sage/mds/mds/CDir.h b/branches/sage/mds/mds/CDir.h index cc43beccfe261..4fb97e2055011 100644 --- a/branches/sage/mds/mds/CDir.h +++ b/branches/sage/mds/mds/CDir.h @@ -139,9 +139,7 @@ class CDir : public MDSCacheObject { // -- wait masks -- static const int WAIT_DENTRY = (1<<0); // wait for item to be in cache static const int WAIT_COMPLETE = (1<<1); // wait for complete dir contents - static const int WAIT_FREEZEABLE = (1<<2); // hard_pins removed - static const int WAIT_UNFREEZE = WAIT_AUTHPINNABLE; // unfreeze - static const int WAIT_IMPORTED = (1<<3); // import finish + static const int WAIT_FREEZEABLE = (1<<2); // auth pins removed static const int WAIT_DNLOCK_OFFSET = 4; diff --git a/branches/sage/mds/mds/MDCache.cc b/branches/sage/mds/mds/MDCache.cc index aa1353c61747d..4575345ee7026 100644 --- a/branches/sage/mds/mds/MDCache.cc +++ b/branches/sage/mds/mds/MDCache.cc @@ -1050,7 +1050,8 @@ void MDCache::_logged_subtree_map(off_t off) void MDCache::send_resolve(int who) { - if (migrator->is_exporting()) + if (migrator->is_importing() || + migrator->is_exporting()) send_resolve_later(who); else send_resolve_now(who); @@ -1071,6 +1072,8 @@ void MDCache::maybe_send_pending_resolves() if (migrator->is_exporting() || migrator->is_importing()) { dout(7) << "maybe_send_pending_resolves waiting, imports/exports still in progress" << endl; + migrator->show_importing(); + migrator->show_exporting(); return; // not now } @@ -1182,29 +1185,6 @@ void MDCache::handle_mds_failure(int who) got_resolve.erase(who); // i'll get another. rejoin_ack_gather.erase(who); // i'll need/get another. - // adjust subtree auth - list subs; - list_subtrees(subs); - for (list::iterator p = subs.begin(); - p != subs.end(); - ++p) { - CDir *dir = *p; - // only if we are a _bystander_. - if (dir->dir_auth.first == who && - dir->dir_auth.second >= 0 && - dir->dir_auth.second != mds->get_nodeid()) { - dout(7) << "disambiguating auth for " << *dir << endl; - adjust_subtree_auth(dir, dir->dir_auth.second); - try_subtree_merge(dir); - } - else if (dir->dir_auth.second == who && - dir->dir_auth.first != mds->get_nodeid()) { - dout(7) << "disambiguating auth for " << *dir << endl; - adjust_subtree_auth(dir, dir->dir_auth.first); - try_subtree_merge(dir); - } - } - // tell the migrator too. migrator->handle_mds_failure_or_stop(who); @@ -1445,21 +1425,18 @@ void MDCache::handle_resolve(MMDSResolve *m) show_subtrees(); - // resolving? - if (mds->is_resolve()) { - // note ambiguous imports too - for (map >::iterator pi = m->ambiguous_imports.begin(); - pi != m->ambiguous_imports.end(); - ++pi) { - dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << endl; - other_ambiguous_imports[from][pi->first].swap( pi->second ); - } - - // did i get them all? - got_resolve.insert(from); - - maybe_resolve_finish(); + // note ambiguous imports too + for (map >::iterator pi = m->ambiguous_imports.begin(); + pi != m->ambiguous_imports.end(); + ++pi) { + dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << endl; + other_ambiguous_imports[from][pi->first].swap( pi->second ); } + + // did i get them all? + got_resolve.insert(from); + + maybe_resolve_finish(); delete m; } @@ -1467,19 +1444,21 @@ void MDCache::handle_resolve(MMDSResolve *m) void MDCache::maybe_resolve_finish() { if (got_resolve != recovery_set) { - dout(10) << "still waiting for more resolves, got (" << got_resolve + dout(10) << "maybe_resolve_finish still waiting for more resolves, got (" << got_resolve << "), need (" << recovery_set << ")" << endl; } else if (!need_resolve_ack.empty()) { - dout(10) << "still waiting for resolve_ack from (" << need_resolve_ack << ")" << endl; + dout(10) << "maybe_resolve_finish still waiting for resolve_ack from (" << need_resolve_ack << ")" << endl; } else { - dout(10) << "got all import maps, resolve_acks, done resolving subtrees" << endl; + dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << endl; disambiguate_imports(); - recalc_auth_bits(); - trim_non_auth(); - mds->resolve_done(); + if (mds->is_resolve()) { + recalc_auth_bits(); + trim_non_auth(); + mds->resolve_done(); + } } } @@ -1544,8 +1523,6 @@ void MDCache::disambiguate_imports() { dout(10) << "disambiguate_imports" << endl; - // FIXME what about surviving bystanders - // other nodes' ambiguous imports for (map > >::iterator p = other_ambiguous_imports.begin(); p != other_ambiguous_imports.end(); @@ -1560,7 +1537,8 @@ void MDCache::disambiguate_imports() CDir *dir = get_dirfrag(q->first); if (!dir) continue; - if (dir->authority().first == CDIR_AUTH_UNKNOWN) { + if (dir->authority().first == CDIR_AUTH_UNKNOWN || // if i am resolving + dir->is_ambiguous_auth()) { // if i am a surviving bystander dout(10) << "mds" << who << " did import " << *dir << endl; adjust_bounded_subtree_auth(dir, q->second, who); try_subtree_merge(dir); @@ -1859,29 +1837,28 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) if (mds->is_rejoin()) { // WEAK + dout(15) << " add_weak_dirfrag " << *dir << endl; rejoin->add_weak_dirfrag(dir->dirfrag()); for (map::iterator p = dir->items.begin(); p != dir->items.end(); ++p) { CDentry *dn = p->second; - if (dn->is_primary()) { - rejoin->add_weak_primary_dentry(dir->dirfrag(), p->first, dn->get_inode()->ino()); - dn->get_inode()->get_nested_dirfrags(nested); - } else if (dn->is_remote()) - rejoin->add_weak_remote_dentry(dir->dirfrag(), p->first, - dn->get_remote_ino(), dn->get_remote_d_type()); - else - assert(0); // i shouldn't have a non-auth null dentry after replay + trim_non_auth() + assert(dn->is_primary()); + dout(15) << " add_weak_primary_dentry " << *dn << endl; + rejoin->add_weak_primary_dentry(dir->dirfrag(), p->first, dn->get_inode()->ino()); + dn->get_inode()->get_nested_dirfrags(nested); } } else { // STRONG + dout(15) << " add_strong_dirfrag " << *dir << endl; rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce()); for (map::iterator p = dir->items.begin(); p != dir->items.end(); ++p) { CDentry *dn = p->second; + dout(15) << " add_strong_dentry " << *dn << endl; rejoin->add_strong_dentry(dir->dirfrag(), p->first, dn->is_primary() ? dn->get_inode()->ino():inodeno_t(0), dn->is_remote() ? dn->get_remote_ino():inodeno_t(0), @@ -1890,6 +1867,7 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) dn->lock.get_state()); if (dn->is_primary()) { CInode *in = dn->get_inode(); + dout(15) << " add_strong_inode " << *in << endl; rejoin->add_strong_inode(in->ino(), in->get_replica_nonce(), in->get_caps_wanted(), in->authlock.get_state(), @@ -1932,9 +1910,6 @@ void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m) case MMDSCacheRejoin::OP_ACK: handle_cache_rejoin_ack(m); break; - case MMDSCacheRejoin::OP_PURGE: - handle_cache_rejoin_purge(m); - break; case MMDSCacheRejoin::OP_MISSING: handle_cache_rejoin_missing(m); break; @@ -1968,7 +1943,6 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) // possible response(s) MMDSCacheRejoin *ack = 0; // if survivor - MMDSCacheRejoin *purge = 0; // if i'm missing something, purge it from the (recovering) sender. bool survivor = false; // am i a survivor? if (mds->is_active() || mds->is_stopping()) { @@ -2020,12 +1994,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) p != weak->weak.end(); ++p) { CDir *dir = get_dirfrag(p->first); - if (!dir) { - dout(10) << " purge " << p->first << endl; - if (!purge) purge = new MMDSCacheRejoin(MMDSCacheRejoin::OP_PURGE); - purge->add_weak_dirfrag(p->first, p->second); - continue; - } + assert(dir); int nonce = dir->add_replica(from); dout(10) << " have " << *dir << endl; @@ -2037,49 +2006,39 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) q != p->second.end(); ++q) { CDentry *dn = dir->lookup(q->first); - if (!dn || - (dn->is_primary() && !q->second.is_primary())) { // make sure dn type matches, or purge - dout(10) << " purge " << p->first << " " << q->first << endl; - if (!purge) purge = new MMDSCacheRejoin(MMDSCacheRejoin::OP_PURGE); - purge->add_weak_null_dentry(p->first, q->first); - continue; - } + assert(dn); + assert(dn->is_primary()); if (survivor) dentry_remove_replica(dn, from); - int nonce = dn->add_replica(from); + int dnonce = dn->add_replica(from); dout(10) << " have " << *dn << endl; if (ack) ack->add_strong_dentry(p->first, q->first, - dn->is_primary() ? dn->get_inode()->ino():inodeno_t(0), - dn->is_remote() ? dn->get_remote_ino():inodeno_t(0), - dn->is_remote() ? dn->get_remote_d_type():0, - nonce, dn->lock.get_replica_state()); - - // inode? - if (dn->is_primary()) { - assert(q->second.is_primary()); // or we would have purged, above - CInode *in = dn->get_inode(); - assert(in); + dn->get_inode()->ino(), inodeno_t(0), 0, + dnonce, dn->lock.get_replica_state()); - if (survivor) inode_remove_replica(in, from); - int nonce = in->add_replica(from); - dout(10) << " have " << *in << endl; + // inode + CInode *in = dn->get_inode(); + assert(in); - // scatter the dirlock, just in case? - if (!survivor && in->is_dir()) - in->dirlock.set_state(LOCK_SCATTER); - - if (ack) { - ack->add_full_inode(in->inode, in->symlink, in->dirfragtree); - ack->add_strong_inode(in->ino(), - nonce, - 0, - in->authlock.get_replica_state(), - in->linklock.get_replica_state(), - in->dirfragtreelock.get_replica_state(), - in->filelock.get_replica_state(), - in->dirlock.get_replica_state()); - } + if (survivor) inode_remove_replica(in, from); + int inonce = in->add_replica(from); + dout(10) << " have " << *in << endl; + + // scatter the dirlock, just in case? + if (!survivor && in->is_dir()) + in->dirlock.set_state(LOCK_SCATTER); + + if (ack) { + ack->add_full_inode(in->inode, in->symlink, in->dirfragtree); + ack->add_strong_inode(in->ino(), + inonce, + 0, + in->authlock.get_replica_state(), + in->linklock.get_replica_state(), + in->dirfragtreelock.get_replica_state(), + in->filelock.get_replica_state(), + in->dirlock.get_replica_state()); } } } @@ -2087,13 +2046,6 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) if (survivor) rejoin_scour_survivor_replicas(from, ack); - // send purge? - // (before ack) - if (purge) { - assert(0); // not if sender did trim_non_auth(). - mds->send_message_mds(purge, from, MDS_PORT_CACHE); - } - if (survivor) { // send ack mds->send_message_mds(ack, from, MDS_PORT_CACHE); @@ -2199,7 +2151,7 @@ void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack) if (dir->is_auth() && dir->is_replica(from) && - ack->strong_dirfrags.count(dir->dirfrag())) { + ack->strong_dirfrags.count(dir->dirfrag()) == 0) { dir->remove_replica(from); dout(10) << " rem " << *dir << endl; } @@ -2319,49 +2271,53 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong) if (dn->is_primary()) { CInode *in = dn->get_inode(); assert(in); - assert(strong->strong_inodes.count(in->ino())); - MMDSCacheRejoin::inode_strong &is = strong->strong_inodes[in->ino()]; - - // caps_wanted - if (is.caps_wanted) { - in->mds_caps_wanted[from] = is.caps_wanted; - dout(15) << " inode caps_wanted " << cap_string(is.caps_wanted) - << " on " << *in << endl; - } - // scatterlock? - if (is.dirlock == LOCK_SCATTER || - is.dirlock == LOCK_GLOCKC) // replica still has wrlocks - in->dirlock.set_state(LOCK_SCATTER); - - // auth pin? - if (strong->authpinned_inodes.count(in->ino())) { - metareqid_t ri = strong->authpinned_inodes[in->ino()]; - dout(10) << " inode authpin by " << ri << " on " << *in << endl; - - // get/create slave mdrequest - MDRequest *mdr; - if (have_request(ri)) - mdr = request_get(ri); - else - mdr = request_start_slave(ri, from); - mdr->auth_pin(in); - } + if (strong->strong_inodes.count(in->ino())) { + MMDSCacheRejoin::inode_strong &is = strong->strong_inodes[in->ino()]; - // xlock(s)? - if (strong->xlocked_inodes.count(in->ino())) { - for (map::iterator r = strong->xlocked_inodes[in->ino()].begin(); - r != strong->xlocked_inodes[in->ino()].end(); - ++r) { - SimpleLock *lock = in->get_lock(r->first); - dout(10) << " inode xlock by " << r->second << " on " << *lock << " on " << *in << endl; - MDRequest *mdr = request_get(r->second); // should have this from auth_pin above. - assert(mdr->is_auth_pinned(in)); - lock->set_state(LOCK_LOCK); - lock->get_xlock(mdr); - mdr->xlocks.insert(lock); - mdr->locks.insert(lock); + // caps_wanted + if (is.caps_wanted) { + in->mds_caps_wanted[from] = is.caps_wanted; + dout(15) << " inode caps_wanted " << cap_string(is.caps_wanted) + << " on " << *in << endl; + } + + // scatterlock? + if (is.dirlock == LOCK_SCATTER || + is.dirlock == LOCK_GLOCKC) // replica still has wrlocks + in->dirlock.set_state(LOCK_SCATTER); + + // auth pin? + if (strong->authpinned_inodes.count(in->ino())) { + metareqid_t ri = strong->authpinned_inodes[in->ino()]; + dout(10) << " inode authpin by " << ri << " on " << *in << endl; + + // get/create slave mdrequest + MDRequest *mdr; + if (have_request(ri)) + mdr = request_get(ri); + else + mdr = request_start_slave(ri, from); + mdr->auth_pin(in); + } + + // xlock(s)? + if (strong->xlocked_inodes.count(in->ino())) { + for (map::iterator r = strong->xlocked_inodes[in->ino()].begin(); + r != strong->xlocked_inodes[in->ino()].end(); + ++r) { + SimpleLock *lock = in->get_lock(r->first); + dout(10) << " inode xlock by " << r->second << " on " << *lock << " on " << *in << endl; + MDRequest *mdr = request_get(r->second); // should have this from auth_pin above. + assert(mdr->is_auth_pinned(in)); + lock->set_state(LOCK_LOCK); + lock->get_xlock(mdr); + mdr->xlocks.insert(lock); + mdr->locks.insert(lock); + } } + } else { + dout(10) << " sender has dentry but not inode, adding them as a replica" << endl; } in->add_replica(from); @@ -2390,8 +2346,6 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << endl; int from = ack->get_source().num(); - bool rejoin = mds->is_rejoin(); - list waiters; // dirs @@ -2412,6 +2366,27 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) CDentry *dn = dir->lookup(q->first); if (!dn) continue; + // hmm, did we have the proper linkage here? + if (dn->is_null() && + !q->second.is_null()) { + dout(10) << " had bad (missing) linkage for " << *dn << endl; + if (q->second.is_remote()) { + dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type); + } else { + CInode *in = get_inode(q->second.ino); + assert(in == 0); // a rename would have been caught be the resolve stage. + // barebones inode; the full inode loop below will clean up. + in = new CInode(this, false); + in->inode.ino = q->second.ino; + add_inode(in); + dn->dir->link_primary_inode(dn, in); + } + } + else if (!dn->is_null() && + q->second.is_null()) { + dout(10) << " had bad linkage for " << *dn << endl; + assert(0); // hrmpf. unlink should use slave requests to clean this up during resolve. + } dn->set_replica_nonce(q->second.nonce); mds->locker->rejoin_set_state(&dn->lock, q->second.lock, waiters); dn->state_clear(CDentry::STATE_REJOINING); @@ -2420,17 +2395,15 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) } // full inodes - if (rejoin) { - for (list::iterator p = ack->full_inodes.begin(); - p != ack->full_inodes.end(); - ++p) { - CInode *in = get_inode(p->inode.ino); - if (!in) continue; - in->inode = p->inode; - in->symlink = p->symlink; - in->dirfragtree = p->dirfragtree; - dout(10) << " got inode content " << *in << endl; - } + for (list::iterator p = ack->full_inodes.begin(); + p != ack->full_inodes.end(); + ++p) { + CInode *in = get_inode(p->inode.ino); + if (!in) continue; + in->inode = p->inode; + in->symlink = p->symlink; + in->dirfragtree = p->dirfragtree; + dout(10) << " got inode content " << *in << endl; } // inodes @@ -2462,75 +2435,6 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) } -void MDCache::handle_cache_rejoin_purge(MMDSCacheRejoin *purge) -{ - dout(7) << "handle_cache_rejoin_purge from " << purge->get_source() << endl; - assert(mds->is_rejoin()); - - /* - * this is tricky, because we have to trim our cache - * in a particular order, and our input (purge->weak) is sorted - * by dirfrag_t. - * - * so, we carelessly trim, and assuming disconnected inodes will be - * clean in the end... - */ - set disconnected; - - for (map >::iterator p = purge->weak.begin(); - p != purge->weak.end(); - ++p) { - CDir *dir = get_dirfrag(p->first); - assert(dir); - - // dentries - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - CDentry *dn = dir->lookup(q->first); - assert(dn); - - if (dn->is_primary()) { - CInode *in = dn->get_inode(); - dir->unlink_inode(dn); - - if (in->has_dirfrags()) { - dout(10) << " disconnecting inode with dirfrags " << *in << endl; - disconnected.insert(in); - } else { - dout(10) << " removing " << *in << endl; - remove_inode(in); - } - } - - dout(10) << " removing " << *dn << endl; - dir->remove_dentry(dn); - } - - if (dir->items.empty()) { - // purge the dir, too. - CInode *diri = dir->get_inode(); - - dout(10) << " closing dirfrag " << *dir << endl; - diri->close_dirfrag(dir->dirfrag().frag); - - // FIXME: what about root, stray. - - if (!diri->get_parent_dn() && - !diri->has_dirfrags()) { - dout(10) << " removing " << *diri << endl; - remove_inode(diri); - disconnected.erase(diri); - } - } - } - - for (set::iterator p = disconnected.begin(); - p != disconnected.end(); - ++p) - dout(0) << " PROBLEM: still have disconnected dir inode " << **p << endl; - assert(disconnected.empty()); -} void MDCache::handle_cache_rejoin_missing(MMDSCacheRejoin *missing) @@ -2603,9 +2507,11 @@ void MDCache::rejoin_trim_undef_inodes() { dout(10) << "rejoin_trim_undef_inodes" << endl; - set::iterator p = rejoin_undef_inodes.begin(); - while (p != rejoin_undef_inodes.end()) { + while (!rejoin_undef_inodes.empty()) { + set::iterator p = rejoin_undef_inodes.begin(); CInode *in = *p; + rejoin_undef_inodes.erase(p); + in->clear_replica_map(); // close out dirfrags @@ -2644,8 +2550,7 @@ void MDCache::rejoin_trim_undef_inodes() } } - assert(rejoin_undef_inodes.empty()); // hmm: this shouldn't ever happen, actually! - rejoin_undef_inodes.clear(); + assert(rejoin_undef_inodes.empty()); } class C_MDC_RejoinGatherFinish : public Context { diff --git a/branches/sage/mds/mds/MDS.cc b/branches/sage/mds/mds/MDS.cc index 97bb9e2353ca2..d1e7f32f05dcd 100644 --- a/branches/sage/mds/mds/MDS.cc +++ b/branches/sage/mds/mds/MDS.cc @@ -655,8 +655,12 @@ void MDS::handle_mds_map(MMDSMap *m) // just got mdsmap+osdmap? if (hadepoch == 0 && mdsmap->get_epoch() > 0 && - osdmap->get_epoch() > 0) + osdmap->get_epoch() > 0) { boot(); + } else if (want_state != state) { + // resend beacon. + beacon_send(); + } delete m; } diff --git a/branches/sage/mds/mds/Migrator.cc b/branches/sage/mds/mds/Migrator.cc index e7f3168877346..f6ed961f40f1a 100644 --- a/branches/sage/mds/mds/Migrator.cc +++ b/branches/sage/mds/mds/Migrator.cc @@ -407,13 +407,13 @@ void Migrator::show_importing() p++) { CDir *dir = mds->mdcache->get_dirfrag(p->first); if (dir) { - dout(10) << " importing to " << import_peer[p->first] + dout(10) << " importing from " << import_peer[p->first] << ": (" << p->second << ") " << get_import_statename(p->second) << " " << p->first << " " << *dir << endl; } else { - dout(10) << " importing to " << import_peer[p->first] + dout(10) << " importing from " << import_peer[p->first] << ": (" << p->second << ") " << get_import_statename(p->second) << " " << p->first << endl; @@ -1067,14 +1067,14 @@ void Migrator::export_reverse(CDir *dir) // process delayed expires cache->process_delayed_expire(dir); - // unfreeze - dir->unfreeze_tree(); - // some clean up export_data.erase(dir); export_warning_ack_waiting.erase(dir); export_notify_ack_waiting.erase(dir); + // unfreeze + dir->unfreeze_tree(); + cache->show_cache(); } @@ -1732,6 +1732,9 @@ void Migrator::import_reverse_final(CDir *dir) import_bystanders.erase(dir); import_bound_ls.erase(dir); + // send pending import_maps? + mds->mdcache->maybe_send_pending_resolves(); + cache->show_subtrees(); //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase) } @@ -1774,9 +1777,6 @@ void Migrator::import_finish(CDir *dir) cache->get_subtree_bounds(dir, bounds); import_remove_pins(dir, bounds); - // unfreeze - dir->unfreeze_tree(); - // adjust auth, with possible subtree merge. cache->adjust_subtree_auth(dir, mds->get_nodeid()); cache->try_subtree_merge(dir); @@ -1790,13 +1790,14 @@ void Migrator::import_finish(CDir *dir) // process delayed expires cache->process_delayed_expire(dir); - // ok now finish contexts - dout(10) << "finishing any waiters on imported data" << endl; - dir->finish_waiting(CDir::WAIT_IMPORTED); + // ok now unfreeze (and thus kick waiters) + dir->unfreeze_tree(); cache->show_subtrees(); //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase) + // send pending import_maps? + mds->mdcache->maybe_send_pending_resolves(); // is it empty? if (dir->get_size() == 0 && @@ -1913,7 +1914,7 @@ int Migrator::decode_import_dir(bufferlist& bl, for (list::iterator it = waiters.begin(); it != waiters.end(); it++) - import_root->add_waiter(CDir::WAIT_IMPORTED, *it); + import_root->add_waiter(CDir::WAIT_UNFREEZE, *it); // UNFREEZE will get kicked both on success or failure dout(15) << "doing contents" << endl; diff --git a/branches/sage/mds/mds/Migrator.h b/branches/sage/mds/mds/Migrator.h index 135bf9415dfaa..421859bea3974 100644 --- a/branches/sage/mds/mds/Migrator.h +++ b/branches/sage/mds/mds/Migrator.h @@ -54,7 +54,6 @@ public: // export stages. used to clean up intelligently if there's a failure. const static int EXPORT_DISCOVERING = 1; // dest is disovering export dir const static int EXPORT_FREEZING = 2; // we're freezing the dir tree - //const static int EXPORT_LOGGINGSTART = 3; // we're logging EExportStart const static int EXPORT_PREPPING = 4; // sending dest spanning tree to export bounds const static int EXPORT_WARNING = 5; // warning bystanders of dir_auth_pending const static int EXPORT_EXPORTING = 6; // sent actual export, waiting for ack @@ -79,7 +78,6 @@ protected: // export fun map export_state; map export_peer; - //map > export_bounds; map > export_data; // only during EXPORTING state map > export_warning_ack_waiting; map > export_notify_ack_waiting; @@ -95,7 +93,6 @@ public: const static int IMPORT_PREPPED = 4; // opened bounds, waiting for import const static int IMPORT_LOGGINGSTART = 5; // got import, logging EImportStart const static int IMPORT_ACKING = 6; // logged EImportStart, sent ack, waiting for finish - //const static int IMPORT_LOGGINGFINISH = 7; // logging EImportFinish const static int IMPORT_ABORTING = 8; // notifying bystanders of an abort before unfreezing static const char *get_import_statename(int s) { switch (s) { diff --git a/branches/sage/mds/mds/Server.cc b/branches/sage/mds/mds/Server.cc index f37ace7f30488..36e117850dbef 100644 --- a/branches/sage/mds/mds/Server.cc +++ b/branches/sage/mds/mds/Server.cc @@ -2743,6 +2743,15 @@ void Server::handle_client_rename(MDRequest *mdr) } // -- prepare witnesses -- + /* + * NOTE: we use _all_ replicas as witnesses. + * this probably isn't totally necessary (esp for file renames), + * but if/when we change that, we have to make sure rejoin is + * sufficiently robust to handle strong rejoins from survivors + * with totally wrong dentry->inode linkage. + * (currently, it can ignore rename effects, because the resolve + * stage will sort them out.) + */ set witnesses = mdr->extra_witnesses; if (srcdn->is_auth()) srcdn->list_replicas(witnesses); diff --git a/branches/sage/mds/mds/events/EMetaBlob.h b/branches/sage/mds/mds/events/EMetaBlob.h index 6b0382aa43278..48b03fa7a4949 100644 --- a/branches/sage/mds/mds/events/EMetaBlob.h +++ b/branches/sage/mds/mds/events/EMetaBlob.h @@ -49,18 +49,20 @@ class EMetaBlob { string dn; // dentry version_t dnv; inode_t inode; // if it's not + fragtree_t dirfragtree; string symlink; bool dirty; - fullbit(const string& d, version_t v, inode_t& i, bool dr) : - dn(d), dnv(v), inode(i), dirty(dr) { } - fullbit(const string& d, version_t v, inode_t& i, string& sym, bool dr) : - dn(d), dnv(v), inode(i), symlink(sym), dirty(dr) { } + fullbit(const string& d, version_t v, inode_t& i, fragtree_t dft, bool dr) : + dn(d), dnv(v), inode(i), dirfragtree(dft), dirty(dr) { } + fullbit(const string& d, version_t v, inode_t& i, fragtree_t dft, string& sym, bool dr) : + dn(d), dnv(v), inode(i), dirfragtree(dft), symlink(sym), dirty(dr) { } fullbit(bufferlist& bl, int& off) { _decode(bl, off); } void _encode(bufferlist& bl) { ::_encode(dn, bl); ::_encode(dnv, bl); ::_encode(inode, bl); + ::_encode(dirfragtree, bl); if (inode.is_symlink()) ::_encode(symlink, bl); ::_encode(dirty, bl); @@ -69,6 +71,7 @@ class EMetaBlob { ::_decode(dn, bl, off); ::_decode(dnv, bl, off); ::_decode(inode, bl, off); + ::_decode(dirfragtree, bl, off); if (inode.is_symlink()) ::_decode(symlink, bl, off); ::_decode(dirty, bl, off); @@ -338,14 +341,14 @@ private: if (dirty) { lump.get_dfull().push_front(fullbit(dn->get_name(), dn->get_projected_version(), - in->inode, in->symlink, + in->inode, in->dirfragtree, in->symlink, dirty)); if (pi) lump.get_dfull().front().inode = *pi; return &lump.get_dfull().front().inode; } else { lump.get_dfull().push_back(fullbit(dn->get_name(), dn->get_projected_version(), - in->inode, in->symlink, + in->inode, in->dirfragtree, in->symlink, dirty)); if (pi) lump.get_dfull().back().inode = *pi; return &lump.get_dfull().back().inode; diff --git a/branches/sage/mds/mds/journal.cc b/branches/sage/mds/mds/journal.cc index e09a4f1eaf1e7..4d57008b584d9 100644 --- a/branches/sage/mds/mds/journal.cc +++ b/branches/sage/mds/mds/journal.cc @@ -276,7 +276,7 @@ void EMetaBlob::expire(MDS *mds, Context *c) for (list::iterator p = waitfor_import.begin(); p != waitfor_import.end(); ++p) - (*p)->add_waiter(CDir::WAIT_IMPORTED, gather->new_sub()); + (*p)->add_waiter(CDir::WAIT_UNFREEZE, gather->new_sub()); // have my anchortable ops committed? @@ -404,21 +404,27 @@ void EMetaBlob::replay(MDS *mds) if (!in) { in = new CInode(mds->mdcache); in->inode = p->inode; + in->dirfragtree = p->dirfragtree; if (in->inode.is_symlink()) in->symlink = p->symlink; mds->mdcache->add_inode(in); dir->link_primary_inode(dn, in); if (p->dirty) in->_mark_dirty(); dout(10) << "EMetaBlob.replay added " << *in << endl; } else { - if (in->get_parent_dn()) { + if (dn->get_inode() != in && in->get_parent_dn()) { dout(10) << "EMetaBlob.replay unlinking " << *in << endl; in->get_parent_dn()->get_dir()->unlink_inode(in->get_parent_dn()); } in->inode = p->inode; + in->dirfragtree = p->dirfragtree; if (in->inode.is_symlink()) in->symlink = p->symlink; - dir->link_primary_inode(dn, in); if (p->dirty) in->_mark_dirty(); - dout(10) << "EMetaBlob.replay linked " << *in << endl; + if (dn->get_inode() != in) { + dir->link_primary_inode(dn, in); + dout(10) << "EMetaBlob.replay linked " << *in << endl; + } else { + dout(10) << "EMetaBlob.replay had " << *in << endl; + } } } diff --git a/branches/sage/mds/mds/mdstypes.h b/branches/sage/mds/mds/mdstypes.h index aaf0ea33e47b0..b9ffe9fff65ea 100644 --- a/branches/sage/mds/mds/mdstypes.h +++ b/branches/sage/mds/mds/mdstypes.h @@ -404,6 +404,7 @@ class MDSCacheObject { // -- wait -- const static int WAIT_SINGLEAUTH = (1<<30); const static int WAIT_AUTHPINNABLE = (1<<29); + const static int WAIT_UNFREEZE = WAIT_AUTHPINNABLE; // ============================================ diff --git a/branches/sage/mds/messages/MMDSCacheRejoin.h b/branches/sage/mds/messages/MMDSCacheRejoin.h index 78d9072cc767c..aa862dcd97151 100644 --- a/branches/sage/mds/messages/MMDSCacheRejoin.h +++ b/branches/sage/mds/messages/MMDSCacheRejoin.h @@ -27,7 +27,7 @@ class MMDSCacheRejoin : public Message { static const int OP_WEAK = 1; // replica -> auth, i exist, + maybe open files. static const int OP_STRONG = 2; // replica -> auth, i exist, + open files and lock state. static const int OP_ACK = 3; // auth -> replica, here is your lock state. - static const int OP_PURGE = 4; // auth -> replica, remove these items, they are old/obsolete. + //static const int OP_PURGE = 4; // auth -> replica, remove these items, they are old/obsolete. static const int OP_MISSING = 5; // auth -> replica, i am missing these items static const int OP_FULL = 6; // replica -> auth, here is the full object. static const char *get_opname(int op) { @@ -98,15 +98,8 @@ class MMDSCacheRejoin : public Message { struct dn_weak { inodeno_t ino; - inodeno_t remote_ino; - unsigned char remote_d_type; - dn_weak() : - ino(0), remote_ino(0), remote_d_type(0) {} - dn_weak(inodeno_t pi, inodeno_t ri, unsigned char rdt) : - ino(pi), remote_ino(ri), remote_d_type(rdt) {} - bool is_primary() { return ino > 0; } - bool is_remote() { return remote_ino > 0; } - bool is_null() { return ino == 0 && remote_ino == 0; } + dn_weak() : ino(0) {} + dn_weak(inodeno_t pi) : ino(pi) {} }; // -- data -- @@ -182,14 +175,8 @@ class MMDSCacheRejoin : public Message { void add_weak_dentry(dirfrag_t df, const string& dname, dn_weak& dnw) { weak[df][dname] = dnw; } - void add_weak_null_dentry(dirfrag_t df, const string& dname) { - weak[df][dname] = dn_weak(0, 0, 0); - } void add_weak_primary_dentry(dirfrag_t df, const string& dname, inodeno_t ino) { - weak[df][dname] = dn_weak(ino, 0, 0); - } - void add_weak_remote_dentry(dirfrag_t df, const string& dname, inodeno_t ino, unsigned char rdt) { - weak[df][dname] = dn_weak(0, ino, rdt); + weak[df][dname] = dn_weak(ino); } void add_strong_dentry(dirfrag_t df, const string& dname, inodeno_t pi, inodeno_t ri, unsigned char rdt, int n, int ls) { strong_dentries[df][dname] = dn_strong(pi, ri, rdt, n, ls); diff --git a/branches/sage/mds/mon/MDSMonitor.cc b/branches/sage/mds/mon/MDSMonitor.cc index de451f5bf3202..89ba5dc435579 100644 --- a/branches/sage/mds/mon/MDSMonitor.cc +++ b/branches/sage/mds/mon/MDSMonitor.cc @@ -88,6 +88,7 @@ bool MDSMonitor::update_from_paxos() mdsmap.decode(mdsmap_bl); // new map + dout(7) << "new map:" << endl; print_map(mdsmap); // bcast map to mds, waiters @@ -109,7 +110,7 @@ void MDSMonitor::encode_pending(bufferlist &bl) { dout(10) << "encode_pending e" << pending_mdsmap.epoch << endl; - print_map(pending_mdsmap); + //print_map(pending_mdsmap); // apply to paxos assert(paxos->get_version() + 1 == pending_mdsmap.epoch); @@ -205,7 +206,7 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m) bool MDSMonitor::prepare_update(Message *m) { - dout(10) << "prepare_update " << *m << endl; + dout(7) << "prepare_update " << *m << endl; switch (m->get_type()) { @@ -552,9 +553,9 @@ void MDSMonitor::do_stop() return; } - dout(10) << "do_stop stopping active mds nodes" << endl; - + dout(7) << "do_stop stopping active mds nodes" << endl; print_map(mdsmap); + for (map::iterator p = mdsmap.mds_state.begin(); p != mdsmap.mds_state.end(); ++p) { -- 2.39.5