From 559d2bfa585495654ccd97dd16c767a7793326a9 Mon Sep 17 00:00:00 2001 From: sageweil Date: Tue, 27 Feb 2007 23:53:40 +0000 Subject: [PATCH] reverse_import works with 2 nodes. bystander subtree cleanup is going to be a little tricky. git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1135 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/cephmds2/TODO | 7 ++ branches/sage/cephmds2/mds/MDCache.cc | 134 +++++++++++++-------- branches/sage/cephmds2/mds/MDCache.h | 1 + branches/sage/cephmds2/mds/Migrator.cc | 143 ++++++++++++----------- branches/sage/cephmds2/mds/Migrator.h | 9 +- branches/sage/cephmds2/mon/MDSMonitor.cc | 14 ++- 6 files changed, 186 insertions(+), 122 deletions(-) diff --git a/branches/sage/cephmds2/TODO b/branches/sage/cephmds2/TODO index 7dc11e54d4f92..229da68491a7b 100644 --- a/branches/sage/cephmds2/TODO +++ b/branches/sage/cephmds2/TODO @@ -1,3 +1,10 @@ +monday + retest with 3+ + no failures + full failure + document cache + pg rewrite + doc - mdsmonitor beacon semantics - cache expiration, cache invariants diff --git a/branches/sage/cephmds2/mds/MDCache.cc b/branches/sage/cephmds2/mds/MDCache.cc index 8ec43292a4581..c767c294f7d34 100644 --- a/branches/sage/cephmds2/mds/MDCache.cc +++ b/branches/sage/cephmds2/mds/MDCache.cc @@ -367,35 +367,24 @@ void MDCache::adjust_export_state(CDir *dir) void MDCache::try_subtree_merge(CDir *dir) { dout(7) << "try_subtree_merge " << *dir << endl; + assert(subtrees.count(dir)); + set oldbounds = subtrees[dir]; - // try to merge bounds? - set::iterator p = subtrees[dir].begin(); - while (p != subtrees[dir].end()) { - set::iterator next = p; - next++; - CDir *bound = *p; - - if (bound->dir_auth == dir->dir_auth && // if auth matches, - dir->dir_auth.second == CDIR_AUTH_UNKNOWN && // auth is unambiguous, - !bound->state_test(CDir::STATE_EXPORTBOUND)) { // and not an exportbound, - // merge with child. - dout(10) << " merging with child bound " << *bound << endl; - bound->set_dir_auth(CDIR_AUTH_DEFAULT); - - // move child's children under dir. - for (set::iterator q = subtrees[bound].begin(); - q != subtrees[bound].end(); - ++q) - subtrees[dir].insert(*q); - - // bound is no longer a separate subtree. - subtrees[dir].erase(bound); - subtrees.erase(bound); - } + // try merge at my root + try_subtree_merge_at(dir); - // next! - p = next; - } + // try merge at my old bounds + for (set::iterator p = oldbounds.begin(); + p != oldbounds.end(); + ++p) + try_subtree_merge_at(*p); + +} + +void MDCache::try_subtree_merge_at(CDir *dir) +{ + dout(10) << "try_subtree_merge_at " << *dir << endl; + assert(subtrees.count(dir)); // merge with parent? CDir *parent = dir; @@ -407,7 +396,7 @@ void MDCache::try_subtree_merge(CDir *dir) dir->dir_auth.second == CDIR_AUTH_UNKNOWN && // auth is unambiguous, !dir->state_test(CDir::STATE_EXPORTBOUND)) { // not an exportbound, // merge with parent. - dout(10) << " merge with parent " << *parent << endl; + dout(10) << " subtree merge at " << *parent << endl; dir->set_dir_auth(CDIR_AUTH_DEFAULT); // move our bounds under the parent @@ -480,19 +469,54 @@ void MDCache::adjust_bounded_subtree_auth(CDir *dir, set& bounds, pair::iterator p = bounds.begin(); p != bounds.end(); ++p) { CDir *bound = *p; - if (subtrees[dir].count(bound)) { + + // new bound? + if (subtrees[dir].count(bound) == 0) { + if (get_subtree_root(bound) == dir) { + dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << endl; + adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound. + } + else { + dout(10) << " want bound " << *bound << endl; + // make sure it's nested beneath ambiguous subtree(s) + while (1) { + CDir *t = get_subtree_root(bound->get_parent_dir()); + if (t == dir) break; + while (subtrees[dir].count(t) == 0) + t = get_subtree_root(t->get_parent_dir()); + dout(10) << " swallowing intervening subtree at " << *t << endl; + adjust_subtree_auth(t, auth); + try_subtree_merge_at(t); + } + } + } + else { dout(10) << " already have bound " << *bound << endl; - } else { - dout(10) << " missing bound " << *bound << ", adjusting auth back to old " << oldauth << endl; - adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound. } } + // merge stray bounds? + set::iterator p = subtrees[dir].begin(); + while (p != subtrees[dir].end()) { + set::iterator n = p; + n++; + if (bounds.count(*p) == 0) { + CDir *stray = *p; + dout(10) << " swallowing extra subtree at " << *stray << endl; + assert(stray->auth_is_ambiguous()); + adjust_subtree_auth(stray, auth); + try_subtree_merge_at(stray); + } + p = n; + } + // bound should now match. verify_subtree_bounds(dir, bounds); show_subtrees(); @@ -780,9 +804,10 @@ void MDCache::send_pending_import_maps() void MDCache::send_import_map_now(int who) { dout(10) << "send_import_map_now to mds" << who << endl; - MMDSImportMap *m = new MMDSImportMap; + show_subtrees(); + // known for (map >::iterator p = subtrees.begin(); p != subtrees.end(); @@ -833,38 +858,37 @@ void MDCache::handle_import_map(MMDSImportMap *m) dout(7) << "handle_import_map from " << m->get_source() << endl; int from = m->get_source().num(); - // FIXME: check if we are a surviving ambiguous importer - // update my dir_auth values for (map >::iterator pi = m->imap.begin(); pi != m->imap.end(); ++pi) { CDir *im = get_dir(pi->first); - if (im) + if (im) { adjust_bounded_subtree_auth(im, pi->second, from); - - // ambiguous import failure? - if ((mds->is_active() || mds->is_stopping()) && - my_ambiguous_imports.count(pi->first)) { - assert(im); - dout(7) << "ambiguous import failed on " << *im << endl; - migrator->reverse_import(im); + try_subtree_merge(im); } } - // ambiguous import success? + // am i a surviving ambiguous importer? if (mds->is_active() || mds->is_stopping()) { + // check for any import success/failure (from this node) map >::iterator p = my_ambiguous_imports.begin(); while (p != my_ambiguous_imports.end()) { map >::iterator n = p; n++; CDir *dir = get_dir(p->first); assert(dir); + dout(10) << "checking ambiguous import " << *dir << endl; assert(migrator->is_importing(dir->ino())); assert(migrator->get_import_state(dir->ino()) == Migrator::IMPORT_ACKING); if (migrator->get_import_peer(dir->ino()) == from) { - dout(7) << "ambiguous import succeeded on " << *dir << endl; - migrator->import_finish(dir); // success, yay! + if (dir->auth_is_ambiguous()) { + dout(7) << "ambiguous import succeeded on " << *dir << endl; + migrator->import_finish(dir, true); // don't wait for log flush + } else { + dout(7) << "ambiguous import failed on " << *dir << endl; + migrator->import_reverse(dir, false); // don't adjust dir_auth. + } my_ambiguous_imports.erase(p); } p = n; @@ -874,8 +898,12 @@ void MDCache::handle_import_map(MMDSImportMap *m) // note ambiguous imports too for (map >::iterator pi = m->ambiguous_imap.begin(); pi != m->ambiguous_imap.end(); - ++pi) - mds->mdcache->other_ambiguous_imports[from][pi->first].swap( pi->second ); + ++pi) { + dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << endl; + other_ambiguous_imports[from][pi->first].swap( pi->second ); + } + + show_subtrees(); // did i get them all? got_import_map.insert(from); @@ -893,7 +921,7 @@ void MDCache::handle_import_map(MMDSImportMap *m) dout(10) << "still waiting for more importmaps, got " << got_import_map << ", need " << recovery_set << endl; } - + delete m; } @@ -902,15 +930,19 @@ void MDCache::disambiguate_imports() { dout(10) << "disambiguate_imports" << endl; + // FIXME what about surviving bystanders + // other nodes' ambiguous imports for (map > >::iterator p = other_ambiguous_imports.begin(); - p != other_ambiguous_imports.begin(); + p != other_ambiguous_imports.end(); ++p) { int who = p->first; + dout(10) << "ambiguous imports for mds" << who << endl; for (map >::iterator q = p->second.begin(); q != p->second.end(); ++q) { + dout(10) << " ambiguous import " << q->first << " bounds " << q->second << endl; CDir *dir = get_dir(q->first); if (!dir) continue; diff --git a/branches/sage/cephmds2/mds/MDCache.h b/branches/sage/cephmds2/mds/MDCache.h index 1031153a39b50..410cc6585f335 100644 --- a/branches/sage/cephmds2/mds/MDCache.h +++ b/branches/sage/cephmds2/mds/MDCache.h @@ -137,6 +137,7 @@ public: } void adjust_export_state(CDir *dir); void try_subtree_merge(CDir *root); + void try_subtree_merge_at(CDir *root); CDir *get_subtree_root(CDir *dir); void remove_subtree(CDir *dir); void get_subtree_bounds(CDir *root, set& bounds); diff --git a/branches/sage/cephmds2/mds/Migrator.cc b/branches/sage/cephmds2/mds/Migrator.cc index 1bc60ea84150b..1b8adcc35db4e 100644 --- a/branches/sage/cephmds2/mds/Migrator.cc +++ b/branches/sage/cephmds2/mds/Migrator.cc @@ -240,20 +240,20 @@ void Migrator::handle_mds_failure(int who) switch (p->second) { case EXPORT_DISCOVERING: - dout(10) << "state discovering : canceling freeze and removing auth_pin" << endl; + dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << endl; dir->unfreeze_tree(); // cancel the freeze dir->auth_unpin(); // remove the auth_pin (that was holding up the freeze) export_state.erase(dir); // clean up break; case EXPORT_FREEZING: - dout(10) << "state freezing : canceling freeze" << endl; + dout(10) << "export state=freezing : canceling freeze" << endl; dir->unfreeze_tree(); // cancel the freeze export_state.erase(dir); // clean up break; case EXPORT_WARNING: - dout(10) << "state warning : unpinning bounds, unfreezing, notifying" << endl; + dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << endl; export_notify_abort(dir); // tell peers about abort // fall-thru @@ -261,7 +261,7 @@ void Migrator::handle_mds_failure(int who) case EXPORT_LOGGINGSTART: case EXPORT_PREPPING: if (p->second != EXPORT_WARNING) - dout(10) << "state loggingstart|prepping : unpinning bounds, unfreezing" << endl; + dout(10) << "export state=loggingstart|prepping : unpinning bounds, unfreezing" << endl; // unpin bounds for (set::iterator p = export_bounds[dir].begin(); p != export_bounds[dir].end(); @@ -277,14 +277,14 @@ void Migrator::handle_mds_failure(int who) break; case EXPORT_EXPORTING: - dout(10) << "state exporting : reversing, and unfreezing" << endl; - reverse_export(dir); + dout(10) << "export state=exporting : reversing, and unfreezing" << endl; + export_reverse(dir); export_state.erase(dir); // clean up break; case EXPORT_LOGGINGFINISH: case EXPORT_NOTIFYING: - dout(10) << "state loggingfinish|notifying : ignoring dest failure, we were successful." << endl; + dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << endl; // leave export_state, don't clean up now. break; @@ -356,7 +356,7 @@ void Migrator::handle_mds_failure(int who) if (import_peer[dirino] == who) { switch (import_state[dirino]) { case IMPORT_DISCOVERED: - dout(10) << "state discovered : unpinning " << *diri << endl; + dout(10) << "import state=discovered : unpinning " << *diri << endl; assert(diri); // unpin base diri->put(CInode::PIN_IMPORTING); @@ -367,21 +367,23 @@ void Migrator::handle_mds_failure(int who) // NOTE: state order reversal + fall-thru, pay attention. case IMPORT_PREPPED: - dout(10) << "state prepping : unpinning base+bounds, unfreezing, " << *dir << endl; + dout(10) << "import state=prepping : unpinning base+bounds, unfreezing, " << *dir << endl; assert(dir); // unfreeze dir->unfreeze_tree(); // adjust auth back to me - cache->adjust_subtree_auth(dir, mds->get_nodeid()); + cache->adjust_subtree_auth(dir, import_peer[dirino]); cache->try_subtree_merge(dir); + // FIXME what about bystanders + // fall-thru to unpin base+bounds case IMPORT_PREPPING: if (import_state[dirino] == IMPORT_PREPPING) { - dout(10) << "state prepping : unpinning base+bounds " << *dir << endl; + dout(10) << "import state=prepping : unpinning base+bounds " << *dir << endl; } assert(dir); @@ -406,21 +408,17 @@ void Migrator::handle_mds_failure(int who) case IMPORT_LOGGINGSTART: - dout(10) << "state loggingstart : reversing import on " << *dir << endl; - assert(dir); - reverse_import(dir); + dout(10) << "import state=loggingstart : reversing import on " << *dir << endl; + import_reverse(dir); + + // FIXME what about bystanders break; case IMPORT_ACKING: // hrm. make this an ambiguous import, and wait for exporter recovery to disambiguate - dout(10) << "state acking : noting ambiguous import " << *dir << endl; + dout(10) << "import state=acking : noting ambiguous import " << *dir << endl; cache->add_ambiguous_import(dir, import_bounds[dir]); break; - - case IMPORT_LOGGINGFINISH: - // do nothing special, exporter is no longer involved. - // i will finish/clean up myself. - break; } } @@ -866,11 +864,6 @@ int Migrator::encode_export_dir(list& dirstatelist, dir->state_clear(CDir::STATE_AUTH); dir->replica_nonce = CDIR_NONCE_EXPORT; - // proxy - //dir->state_set(CDir::STATE_PROXY); - //dir->get(CDir::PIN_PROXY); - //export_proxy_dirinos[basedir].push_back(dir->ino()); - list subdirs; if (dir->is_hashed()) { @@ -1021,9 +1014,9 @@ void Migrator::handle_export_ack(MExportDirAck *m) * that is, we don't know they safely received and logged it, so we reverse our changes * and go on. */ -void Migrator::reverse_export(CDir *dir) +void Migrator::export_reverse(CDir *dir) { - dout(7) << "reverse_export " << *dir << endl; + dout(7) << "export_reverse " << *dir << endl; assert(export_state[dir] == EXPORT_EXPORTING); assert(export_bounds.count(dir)); @@ -1058,9 +1051,6 @@ void Migrator::reverse_export(CDir *dir) 0); } - // remove proxy bits - //clear_export_proxy_pins(dir); - // process delayed expires cache->process_delayed_expire(dir); @@ -1104,6 +1094,9 @@ void Migrator::export_logged_finish(CDir *dir) dout(7) << "export_logged_finish " << *dir << endl; dir->put(CDir::PIN_LOGGINGEXPORTFINISH); + if (mds->get_nodeid() == 0 && g_clock.now() > 20.0) assert(0); // hack fake death + + if (export_state.count(dir) == 0|| export_state[dir] != EXPORT_LOGGINGFINISH) { assert(0); // this won't happen. @@ -1423,6 +1416,7 @@ void Migrator::handle_export_prep(MExportDirPrep *m) // open export dirs/bounds? assert(import_bound_inos.count(diri->ino()) == 0); + import_bound_inos[diri->ino()].clear(); for (list::iterator it = m->get_exports().begin(); it != m->get_exports().end(); it++) { @@ -1594,9 +1588,9 @@ void Migrator::handle_export_dir(MExportDir *m) * called by both handle_mds_failure and by handle_import_map (if we are * a survivor coping with an exporter failure+recovery). */ -void Migrator::reverse_import(CDir *dir) +void Migrator::import_reverse(CDir *dir, bool fix_dir_auth) { - dout(7) << "reverse_import " << *dir << endl; + dout(7) << "import_reverse " << *dir << endl; // remove importing pin dir->put(CDir::PIN_IMPORTING); @@ -1611,18 +1605,56 @@ void Migrator::reverse_import(CDir *dir) } // update auth, with possible subtree merge. - cache->adjust_subtree_auth(dir, import_peer[dir->ino()]); - cache->try_subtree_merge(dir); - + if (fix_dir_auth) { + assert(dir->is_subtree_root()); + cache->adjust_subtree_auth(dir, import_peer[dir->ino()]); + cache->try_subtree_merge(dir); + } - - assert(0); // implement me. - cache->show_cache(); + // adjust auth bits. + list q; + q.push_back(dir); + while (!q.empty()) { + CDir *cur = q.front(); + q.pop_front(); + + // dir + assert(cur->is_auth()); + cur->state_clear(CDir::STATE_AUTH); + cur->clear_replicas(); + if (cur->is_dirty()) + cur->mark_clean(); + CDir_map_t::iterator it; + for (it = cur->begin(); it != cur->end(); it++) { + CDentry *dn = it->second; + + // dentry + dn->state_clear(CDentry::STATE_AUTH); + dn->clear_replicas(); + if (dn->is_dirty()) + dn->mark_clean(); + // inode? + if (dn->is_primary()) { + CInode *in = dn->get_inode(); + in->state_clear(CDentry::STATE_AUTH); + in->clear_replicas(); + if (in->is_dirty()) + in->mark_clean(); + in->hardlock.clear_gather(); + in->filelock.clear_gather(); + + // non-bounding dir? + if (in->dir && + !in->dir->state_test(CDir::STATE_IMPORTBOUND)) + q.push_back(in->dir); + } + } + } - // ... - // adjust auth/dirty bits + // unfreeze + dir->unfreeze_tree(); // discard expire crap cache->discard_delayed_expire(dir); @@ -1635,6 +1667,9 @@ void Migrator::reverse_import(CDir *dir) import_peer.erase(dir->ino()); import_bound_inos.erase(dir->ino()); import_bounds.erase(dir); + + cache->show_subtrees(); + cache->show_cache(); } @@ -1656,16 +1691,6 @@ void Migrator::import_logged_start(CDir *dir, int from, } -class C_MDS_ImportDirLoggedFinish : public Context { - Migrator *migrator; - CDir *dir; -public: - C_MDS_ImportDirLoggedFinish(Migrator *m, CDir *d) : migrator(m), dir(d) { } - void finish(int r) { - migrator->import_logged_finish(dir); - } -}; - void Migrator::handle_export_finish(MExportDirFinish *m) { CDir *dir = cache->get_dir(m->get_ino()); @@ -1675,22 +1700,12 @@ void Migrator::handle_export_finish(MExportDirFinish *m) delete m; } -void Migrator::import_finish(CDir *dir) +void Migrator::import_finish(CDir *dir, bool now) { - dout(7) << "import_finish logging import_finish on " << *dir << endl; + dout(7) << "import_finish on " << *dir << endl; - // note state - import_state[dir->ino()] = IMPORT_LOGGINGFINISH; - // log finish - mds->mdlog->submit_entry(new EImportFinish(dir, true), - new C_MDS_ImportDirLoggedFinish(this,dir)); -} - - -void Migrator::import_logged_finish(CDir *dir) -{ - dout(7) << "import_logged_finish " << *dir << endl; + mds->mdlog->submit_entry(new EImportFinish(dir, true)); // remove pins dir->put(CDir::PIN_IMPORTING); @@ -1957,8 +1972,6 @@ void Migrator::handle_export_warning(MExportDirWarning *m) delete m; - // hack: trim now, to flush out cacheexpire bugs - cache->trim(0); } diff --git a/branches/sage/cephmds2/mds/Migrator.h b/branches/sage/cephmds2/mds/Migrator.h index 12884ee5731ec..17b7e17eeb0e3 100644 --- a/branches/sage/cephmds2/mds/Migrator.h +++ b/branches/sage/cephmds2/mds/Migrator.h @@ -94,7 +94,7 @@ public: const static int IMPORT_PREPPED = 3; // opened bounds, waiting for import const static int IMPORT_LOGGINGSTART = 4; // got import, logging EImportStart const static int IMPORT_ACKING = 5; // logged EImportStart, sent ack, waiting for finish - const static int IMPORT_LOGGINGFINISH = 6; // logging EImportFinish + //const static int IMPORT_LOGGINGFINISH = 6; // logging EImportFinish protected: map import_state; @@ -179,7 +179,7 @@ public: CDir *basedir, CDir *dir, int newauth); - void reverse_export(CDir *dir); + void export_reverse(CDir *dir); void export_notify_abort(CDir* dir); void handle_export_ack(MExportDirAck *m); void export_logged_finish(CDir *dir); @@ -203,16 +203,15 @@ public: inodeno_t dir_ino, inodeno_t replica_ino); public: - void reverse_import(CDir *dir); + void import_reverse(CDir *dir, bool fix_dir_auth=true); protected: void import_logged_start(CDir *dir, int from, list &imported_subdirs, list &exports); void handle_export_finish(MExportDirFinish *m); public: - void import_finish(CDir *dir); + void import_finish(CDir *dir, bool now=false); protected: - void import_logged_finish(CDir *dir); friend class C_MDC_ExportDirDiscover; friend class C_MDS_ImportDirLoggedStart; diff --git a/branches/sage/cephmds2/mon/MDSMonitor.cc b/branches/sage/cephmds2/mon/MDSMonitor.cc index cc98ad24c0eb2..3d7900299abed 100644 --- a/branches/sage/cephmds2/mon/MDSMonitor.cc +++ b/branches/sage/cephmds2/mon/MDSMonitor.cc @@ -61,7 +61,7 @@ void MDSMonitor::dispatch(Message *m) void MDSMonitor::print_map() { - dout(7) << "print_map epoch " << mdsmap.get_epoch() << endl; + dout(7) << "print_map epoch " << mdsmap.get_epoch() << " num_mds " << g_conf.num_mds << endl; entity_inst_t blank; set all; mdsmap.get_mds_set(all); @@ -105,6 +105,12 @@ void MDSMonitor::handle_command(MMonCommand *m, int& r, string& rs) getline(ss,rs); } } + else if (m->cmd[1] == "setnum" && m->cmd.size() > 2) { + g_conf.num_mds = atoi(m->cmd[2].c_str()); + ss << "g_conf.num_mds = " << g_conf.num_mds << endl; + getline(ss,rs); + print_map(); + } } } @@ -184,7 +190,13 @@ void MDSMonitor::handle_mds_beacon(MMDSBeacon *m) !mdsmap.is_failed(from)) { dout(10) << "mds_beacon currently degraded, mds" << from << " will be standby" << endl; state = MDSMap::STATE_STANDBY; + } + /* + else if (from >= g_conf.num_mds) { + dout(10) << "mds_beacon already have " << g_conf.num_mds << " mds's, standby (increase with 'mds setnum xxx')" << endl; + state = MDSMap::STATE_STANDBY; } + */ else if (state == MDSMap::STATE_STARTING) { if (mdsmap.is_failed(from)) { dout(10) << "mds_beacon will recover mds" << from << endl; -- 2.39.5