From c3df762d44998d2400c43a99fbd244fd8b7e06fd Mon Sep 17 00:00:00 2001 From: sageweil Date: Fri, 2 Mar 2007 00:54:03 +0000 Subject: [PATCH] fixed cache rejoin. --mds_dump_cache_on_map git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1154 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/cephmds2/TODO | 12 +- branches/sage/cephmds2/config.cc | 3 + branches/sage/cephmds2/config.h | 1 + branches/sage/cephmds2/mds/MDBalancer.cc | 2 +- branches/sage/cephmds2/mds/MDCache.cc | 136 ++++++++++++------ branches/sage/cephmds2/mds/MDCache.h | 10 +- branches/sage/cephmds2/mds/MDS.cc | 9 ++ branches/sage/cephmds2/mds/Migrator.cc | 16 +-- branches/sage/cephmds2/mds/Renamer.cc | 6 +- branches/sage/cephmds2/mds/journal.cc | 2 +- branches/sage/cephmds2/messages/MExportDir.h | 3 + .../sage/cephmds2/messages/MExportDirAck.h | 5 +- .../sage/cephmds2/messages/MExportDirFinish.h | 4 +- .../sage/cephmds2/messages/MMDSCacheRejoin.h | 4 - .../cephmds2/messages/MMDSCacheRejoinAck.h | 4 - 15 files changed, 140 insertions(+), 77 deletions(-) diff --git a/branches/sage/cephmds2/TODO b/branches/sage/cephmds2/TODO index 4b3c87e4c0160..6e22da94feedf 100644 --- a/branches/sage/cephmds2/TODO +++ b/branches/sage/cephmds2/TODO @@ -18,11 +18,14 @@ doc mds - bystanders should avoid contacting auth when it is ambiguous. - CDIR_WAIT_UNAMBIGUOUS? -- openingdir pins should be handled by open_remote_dir, not explicitly by handle_export_dir_prep -- bystander recovery from exporter failure +- during export prep phase, + - if importer is trying to open dirs on a node that fails, what happens? + 1- could detect dependency and fail. + 2- could force a CDir replica, since it'll get rejoined into cache later. + - is this related to solving the larger problem of discover vs mds failure? +- locker vs node failure +- do i need openingdir pins? won't the DIR waiter on the inode be sufficient? - does inode need it's own replica list? no? -- dirslices. 
-- importer recovery if exporter fails - osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics. - failures during recovery stages (resolve, rejoin)... make sure rejoin still works! - fix mds initial osdmap weirdness (which will currently screw up on standby -> almost anything) @@ -40,6 +43,7 @@ mds - file capabilities i/o - link - rename +- dirslices. monitor diff --git a/branches/sage/cephmds2/config.cc b/branches/sage/cephmds2/config.cc index e826ff67117cc..b978002d18b26 100644 --- a/branches/sage/cephmds2/config.cc +++ b/branches/sage/cephmds2/config.cc @@ -199,6 +199,7 @@ md_config_t g_conf = { mds_local_osd: false, mds_thrash_exports: 0, + mds_dump_cache_on_map: false, // --- osd --- osd_rep: OSD_REP_PRIMARY, @@ -611,6 +612,8 @@ void parse_config_options(std::vector& args) g_conf.mds_local_osd = atoi(args[++i]); else if (strcmp(args[i], "--mds_thrash_exports") == 0) g_conf.mds_thrash_exports = atoi(args[++i]); + else if (strcmp(args[i], "--mds_dump_cache_on_map") == 0) + g_conf.mds_dump_cache_on_map = 1; else if (strcmp(args[i], "--client_use_random_mds") == 0) g_conf.client_use_random_mds = true; diff --git a/branches/sage/cephmds2/config.h b/branches/sage/cephmds2/config.h index 3b37b65d5d9f2..eaebd6d620176 100644 --- a/branches/sage/cephmds2/config.h +++ b/branches/sage/cephmds2/config.h @@ -190,6 +190,7 @@ struct md_config_t { bool mds_local_osd; int mds_thrash_exports; + bool mds_dump_cache_on_map; // osd int osd_rep; diff --git a/branches/sage/cephmds2/mds/MDBalancer.cc b/branches/sage/cephmds2/mds/MDBalancer.cc index 071cc06386422..002d17c9f6b8a 100644 --- a/branches/sage/cephmds2/mds/MDBalancer.cc +++ b/branches/sage/cephmds2/mds/MDBalancer.cc @@ -856,7 +856,7 @@ void MDBalancer::add_import(CDir *dir) void MDBalancer::show_imports(bool external) { - mds->mdcache->show_imports(); + mds->mdcache->show_subtrees(); } diff --git a/branches/sage/cephmds2/mds/MDCache.cc b/branches/sage/cephmds2/mds/MDCache.cc index 
cc35db8ca8cb8..ffb0237b54eae 100644 --- a/branches/sage/cephmds2/mds/MDCache.cc +++ b/branches/sage/cephmds2/mds/MDCache.cc @@ -130,7 +130,7 @@ bool MDCache::shutdown() if (lru.lru_get_size() > 0) { dout(7) << "WARNING: mdcache shutodwn with non-empty cache" << endl; //show_cache(); - show_imports(); + show_subtrees(); //dump(); } return true; @@ -227,7 +227,7 @@ int MDCache::open_root(Context *c) adjust_subtree_auth(root->dir, 0); root->dir->dir_rep = CDir::REP_ALL; //NONE; - show_imports(); + show_subtrees(); if (c) { c->finish(0); @@ -411,7 +411,7 @@ void MDCache::try_subtree_merge_at(CDir *dir) subtrees[parent].erase(dir); } - show_subtrees(); + show_subtrees(15); } @@ -851,6 +851,16 @@ void MDCache::handle_mds_failure(int who) { dout(7) << "handle_mds_failure mds" << who << endl; + // make note of recovery set + mds->mdsmap->get_recovery_mds_set(recovery_set); + recovery_set.erase(mds->get_nodeid()); + dout(1) << "my recovery peers will be " << recovery_set << endl; + + // adjust my recovery lists + wants_import_map.erase(who); // MDS will ask again + got_import_map.erase(who); // i'll get another. + rejoin_ack_gather.erase(who); // i'll need/get another. 
+ // adjust subtree auth for (map >::iterator p = subtrees.begin(); p != subtrees.end(); @@ -878,6 +888,11 @@ void MDCache::handle_mds_failure(int who) show_subtrees(); } +void MDCache::set_recovery_set(set& s) +{ + dout(7) << "set_recovery_set " << s << endl; + recovery_set = s; +} /* @@ -1009,24 +1024,30 @@ void MDCache::disambiguate_imports() } assert(my_ambiguous_imports.empty()); - show_imports(); + show_subtrees(); } void MDCache::add_ambiguous_import(inodeno_t base, list& bounds) { assert(my_ambiguous_imports.count(base) == 0); - my_ambiguous_imports[base].swap(bounds); + my_ambiguous_imports[base].swap( bounds ); } void MDCache::add_ambiguous_import(CDir *base, const set& bounds) { + // make a list list binos; for (set::iterator p = bounds.begin(); p != bounds.end(); ++p) binos.push_back((*p)->ino()); + + // note: this can get called twice if the exporter fails during recovery + if (my_ambiguous_imports.count(base->ino())) + my_ambiguous_imports.erase(base->ino()); + add_ambiguous_import(base->ino(), binos); } @@ -1098,7 +1119,7 @@ void MDCache::recalc_auth_bits() } } } - show_imports(); + show_subtrees(); show_cache(); } @@ -1111,7 +1132,7 @@ void MDCache::recalc_auth_bits() */ void MDCache::send_cache_rejoins() { - dout(10) << "send_cache_rejoins " << endl; + dout(10) << "send_cache_rejoins with recovery_set " << recovery_set << endl; map rejoins; @@ -1126,35 +1147,24 @@ void MDCache::send_cache_rejoins() rejoins[*p] = new MMDSCacheRejoin; } - // build list of dir_auth regions - list dir_auth_regions; - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); + assert(!migrator->is_importing()); + assert(!migrator->is_exporting()); + + // check all subtrees + for (map >::iterator p = subtrees.begin(); + p != subtrees.end(); ++p) { - if (!p->second->is_dir()) continue; - if (!p->second->dir) continue; - if (p->second->dir->get_dir_auth().first == CDIR_AUTH_PARENT) continue; + CDir *dir = p->first; + assert(dir->is_subtree_root()); + 
assert(!dir->auth_is_ambiguous()); - int auth = p->second->dir->get_dir_auth().first; + int auth = dir->get_dir_auth().first; assert(auth >= 0); - - if (auth == mds->get_nodeid()) continue; // skip my own regions! - - if (rejoins.count(auth) == 0) - continue; // don't care about this node's regions - // add to list - dout(10) << " on mds" << auth << " region " << *p->second << endl; - dir_auth_regions.push_back(p->second->dir); - } + if (auth == mds->get_nodeid()) continue; // skip my own regions! + if (rejoins.count(auth) == 0) continue; // don't care about this node's regions - // walk the regions - for (list::iterator p = dir_auth_regions.begin(); - p != dir_auth_regions.end(); - ++p) { - CDir *dir = *p; - int to = dir->authority().first; - cache_rejoin_walk(dir, rejoins[to]); + cache_rejoin_walk(dir, rejoins[auth]); } // send the messages @@ -1194,9 +1204,8 @@ void MDCache::cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) rejoin->add_inode(in->ino(), in->get_caps_wanted()); - // dir? - if (in->dir && - in->dir->get_dir_auth().first == CDIR_AUTH_PARENT) + // dir (in this subtree)? + if (in->dir && !in->dir->is_subtree_root()) nested.push_back(in->dir); } } @@ -1311,7 +1320,7 @@ void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m) void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoinAck *m) { - dout(7) << "handle_cache_rejoin from " << m->get_source() << endl; + dout(7) << "handle_cache_rejoin_ack from " << m->get_source() << endl; int from = m->get_source().num(); // dirs @@ -1354,7 +1363,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoinAck *m) rejoin_ack_gather.erase(from); if (rejoin_ack_gather.empty()) { dout(7) << "all done, going active!" 
<< endl; - show_imports(); + show_subtrees(); show_cache(); mds->set_want_state(MDSMap::STATE_ACTIVE); } else { @@ -1935,7 +1944,7 @@ bool MDCache::shutdown_pass() if (mds->is_out()) { dout(7) << " already shut down" << endl; show_cache(); - show_imports(); + show_subtrees(); return true; } @@ -2511,7 +2520,7 @@ int MDCache::path_traverse(filepath& origpath, } mds->send_message_mds(req, dauth.first, req->get_dest_port()); - //show_imports(); + //show_subtrees(); if (mds->logger) mds->logger->inc("cfw"); if (onfinish) delete onfinish; @@ -3921,9 +3930,9 @@ void MDCache::find_nested_exports_under(CDir *import, CDir *dir, set& s) // ============================================================== // debug crap -void MDCache::show_subtrees() +void MDCache::show_subtrees(int dbl) { - dout(10) << "show_subtrees:" << endl; + //dout(10) << "show_subtrees" << endl; list > q; string indent; @@ -3964,7 +3973,7 @@ void MDCache::show_subtrees() sprintf(s, "%2d,%2d", dir->get_dir_auth().first, dir->get_dir_auth().second); // print - dout(10) << indent << "|_" << pad << s << " " << auth << *dir << endl; + dout(dbl) << indent << "|_" << pad << s << " " << auth << *dir << endl; // nested items? 
if (!subtrees[dir].empty()) { @@ -3983,6 +3992,7 @@ void MDCache::show_subtrees() } +/* void MDCache::show_imports() { int db = 10; @@ -4008,7 +4018,8 @@ void MDCache::show_imports() show_subtrees(); return; - +} +*/ /// old @@ -4074,14 +4085,14 @@ void MDCache::show_imports() dout(1) << "***** stray item in exports: " << **it << endl; assert(ecopy.size() == 0); } - */ } + */ void MDCache::show_cache() { dout(7) << "show_cache" << endl; - + for (hash_map::iterator it = inode_map.begin(); it != inode_map.end(); it++) { @@ -4095,3 +4106,40 @@ void MDCache::show_cache() } } +/* dump_cache: for every inode_map entry whose ->dir is set, write the dir's inode, the dir, and each dentry in dir->items to the file cachedump.EPOCH.mdsNODEID */ +void MDCache::dump_cache() +{ + char fn[64]; /* fixed text is 14 chars; 64 leaves room for two full-width ints plus NUL (the old 20 overflowed on large epochs) */ + snprintf(fn, sizeof(fn), "cachedump.%d.mds%d", mds->mdsmap->get_epoch(), mds->get_nodeid()); + + dout(1) << "dump_cache to " << fn << endl; + + ofstream myfile; + myfile.open(fn); + + for (hash_map::iterator it = inode_map.begin(); + it != inode_map.end(); + it++) { + /* + myfile << *((*it).second) << endl; + CDentry *dn = (*it).second->get_parent_dn(); + if (dn) + myfile << *dn << endl; + */ + + if ((*it).second->dir) { + CDir *dir = (*it).second->dir; + myfile << *dir->inode << endl; + myfile << *dir << endl; + + for (CDir_map_t::iterator p = dir->items.begin(); + p != dir->items.end(); + ++p) { + CDentry *dn = p->second; + myfile << *dn << endl; + } + } + } + + myfile.close(); +} diff --git a/branches/sage/cephmds2/mds/MDCache.h b/branches/sage/cephmds2/mds/MDCache.h index 45ed437b1360d..051c9894cba86 100644 --- a/branches/sage/cephmds2/mds/MDCache.h +++ b/branches/sage/cephmds2/mds/MDCache.h @@ -179,9 +179,10 @@ protected: set recovery_set; set wants_import_map; // nodes i need to send my import map to - set got_import_map; // nodes i need to send my import map to (when exports finish) + set got_import_map; // nodes i got import_maps from set rejoin_ack_gather; // nodes i need a rejoin ack from + void set_recovery_set(set& s); void handle_mds_failure(int who); void handle_import_map(MMDSImportMap *m); void handle_cache_rejoin(MMDSCacheRejoin *m); @@ -196,9
+197,6 @@ public: void send_pending_import_maps(); // maybe. void send_cache_rejoins(); - void set_recovery_set(set& s) { - recovery_set = s; - } // ambiguous imports void add_ambiguous_import(inodeno_t base, list& bounds); @@ -396,9 +394,9 @@ public: if (root) root->dump(); } - void show_imports(); void show_cache(); - void show_subtrees(); + void dump_cache(); + void show_subtrees(int dbl=10); }; diff --git a/branches/sage/cephmds2/mds/MDS.cc b/branches/sage/cephmds2/mds/MDS.cc index 1acb1781a9e5c..20ad46b8c732f 100644 --- a/branches/sage/cephmds2/mds/MDS.cc +++ b/branches/sage/cephmds2/mds/MDS.cc @@ -461,6 +461,10 @@ void MDS::handle_mds_map(MMDSMap *m) } } + // for debug + if (g_conf.mds_dump_cache_on_map) + mdcache->dump_cache(); + // update my state state = mdsmap->get_state(whoami); @@ -560,9 +564,14 @@ void MDS::handle_mds_map(MMDSMap *m) // REJOIN // is everybody finally rejoining? if (is_rejoin() || is_active() || is_stopping()) { + // did we start? if (!wasrejoining && mdsmap->is_rejoining()) { mdcache->send_cache_rejoins(); } + // did we finish? + if (wasrejoining && !mdsmap->is_rejoining()) { + mdcache->dump_cache(); + } } // did anyone go down? diff --git a/branches/sage/cephmds2/mds/Migrator.cc b/branches/sage/cephmds2/mds/Migrator.cc index b3875efa6ae8f..0c0c308278c66 100644 --- a/branches/sage/cephmds2/mds/Migrator.cc +++ b/branches/sage/cephmds2/mds/Migrator.cc @@ -536,7 +536,7 @@ void Migrator::export_frozen(CDir *dir, // ok! //export_state[dir] = EXPORT_LOGGINGSTART; - cache->show_imports(); + cache->show_subtrees(); // note the bounds. // force it into a subtree by listing auth as . 
@@ -710,7 +710,7 @@ void Migrator::export_go(CDir *dir) int dest = export_peer[dir]; dout(7) << "export_go " << *dir << " to " << dest << endl; - cache->show_imports(); + cache->show_subtrees(); export_warning_ack_waiting.erase(dir); export_state[dir] = EXPORT_EXPORTING; @@ -754,7 +754,7 @@ void Migrator::export_go(CDir *dir) if (mds->logger) mds->logger->inc("ex"); if (mds->logger) mds->logger->inc("iex", num_exported_inodes); - cache->show_imports(); + cache->show_subtrees(); } @@ -1238,7 +1238,7 @@ void Migrator::export_finish(CDir *dir) // stats //if (mds->logger) mds->logger->set("nex", cache->exports.size()); - cache->show_imports(); + cache->show_subtrees(); // send pending import_maps? mds->mdcache->send_pending_import_maps(); @@ -1363,7 +1363,7 @@ void Migrator::handle_export_prep(MExportDirPrep *m) } assert(dir->is_auth() == false); - cache->show_imports(); + cache->show_subtrees(); // assimilate contents? if (!m->did_assim()) { @@ -1525,7 +1525,7 @@ void Migrator::handle_export_dir(MExportDir *m) dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << endl; assert(dir->is_auth() == false); - cache->show_imports(); + cache->show_subtrees(); // start the journal entry EImportStart *le = new EImportStart(dir->ino(), m->get_exports()); @@ -1734,7 +1734,7 @@ void Migrator::import_logged_start(CDir *dir, int from, mds->send_message_mds(new MExportDirAck(dir->inode->ino()), from, MDS_PORT_MIGRATOR); - cache->show_imports(); + cache->show_subtrees(); } @@ -1794,7 +1794,7 @@ void Migrator::import_finish(CDir *dir, bool now) //mds->logger->set("nex", cache->exports.size()); //mds->logger->set("nim", cache->imports.size()); } - cache->show_imports(); + cache->show_subtrees(); // is it empty? 
if (dir->get_size() == 0 && diff --git a/branches/sage/cephmds2/mds/Renamer.cc b/branches/sage/cephmds2/mds/Renamer.cc index df1993a7ce163..ec865ae8f69bc 100644 --- a/branches/sage/cephmds2/mds/Renamer.cc +++ b/branches/sage/cephmds2/mds/Renamer.cc @@ -272,7 +272,7 @@ void Renamer::fix_renamed_dir(CDir *srcdir, } } */ - cache->show_imports(); + cache->show_subtrees(); } /* @@ -359,7 +359,7 @@ void Renamer::file_rename(CDentry *srcdn, CDentry *destdn, Context *onfinish) srcauth); // tell dest who src is (maybe even me) mds->send_message_mds(m, destauth, MDS_PORT_CACHE); - cache->show_imports(); + cache->show_subtrees(); } @@ -503,7 +503,7 @@ void Renamer::file_rename_foreign_src(CDentry *srcdn, assert(in); assert(in->is_auth()); - if (in->is_dir()) cache->show_imports(); + if (in->is_dir()) cache->show_subtrees(); // encode and export inode state bufferlist inode_state; diff --git a/branches/sage/cephmds2/mds/journal.cc b/branches/sage/cephmds2/mds/journal.cc index 5b242b0e86f86..238ed6bbd719f 100644 --- a/branches/sage/cephmds2/mds/journal.cc +++ b/branches/sage/cephmds2/mds/journal.cc @@ -400,7 +400,7 @@ void EImportMap::replay(MDS *mds) mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid()); } } - mds->mdcache->show_imports(); + mds->mdcache->show_subtrees(); } diff --git a/branches/sage/cephmds2/messages/MExportDir.h b/branches/sage/cephmds2/messages/MExportDir.h index 8fdda89466b1e..39ecae2422a72 100644 --- a/branches/sage/cephmds2/messages/MExportDir.h +++ b/branches/sage/cephmds2/messages/MExportDir.h @@ -31,6 +31,9 @@ class MExportDir : public Message { ino(dirino) { } virtual char *get_type_name() { return "Ex"; } + void print(ostream& o) { + o << "export(" << ino << ")"; + } inodeno_t get_ino() { return ino; } list& get_dirstate() { return dirstate; } diff --git a/branches/sage/cephmds2/messages/MExportDirAck.h b/branches/sage/cephmds2/messages/MExportDirAck.h index c5006314ab050..dc0c5abdeb478 100644 --- 
a/branches/sage/cephmds2/messages/MExportDirAck.h +++ b/branches/sage/cephmds2/messages/MExportDirAck.h @@ -27,7 +27,10 @@ class MExportDirAck : public Message { Message(MSG_MDS_EXPORTDIRACK), ino(i) { } virtual char *get_type_name() { return "ExAck"; } - + void print(ostream& o) { + o << "export_ack(" << ino << ")"; + } + virtual void decode_payload() { int off = 0; payload.copy(off, sizeof(ino), (char*)&ino); diff --git a/branches/sage/cephmds2/messages/MExportDirFinish.h b/branches/sage/cephmds2/messages/MExportDirFinish.h index 0849aff2d7b2f..3e8b695e443f4 100644 --- a/branches/sage/cephmds2/messages/MExportDirFinish.h +++ b/branches/sage/cephmds2/messages/MExportDirFinish.h @@ -28,7 +28,9 @@ class MExportDirFinish : public Message { this->ino = ino; } virtual char *get_type_name() { return "ExFin"; } - + void print(ostream& o) { + o << "export_finish(" << ino << ")"; + } virtual void decode_payload() { int off = 0; payload.copy(off, sizeof(ino), (char*)&ino); diff --git a/branches/sage/cephmds2/messages/MMDSCacheRejoin.h b/branches/sage/cephmds2/messages/MMDSCacheRejoin.h index 2789e30844743..dab60dbdcbe9e 100644 --- a/branches/sage/cephmds2/messages/MMDSCacheRejoin.h +++ b/branches/sage/cephmds2/messages/MMDSCacheRejoin.h @@ -30,10 +30,6 @@ class MMDSCacheRejoin : public Message { char *get_type_name() { return "cache_rejoin"; } - void print(ostream& out) { - out << "cache_rejoin" << endl; - } - void add_dir(inodeno_t dirino) { dirs.insert(dirino); } diff --git a/branches/sage/cephmds2/messages/MMDSCacheRejoinAck.h b/branches/sage/cephmds2/messages/MMDSCacheRejoinAck.h index b8f0d23ebbba0..713709969b714 100644 --- a/branches/sage/cephmds2/messages/MMDSCacheRejoinAck.h +++ b/branches/sage/cephmds2/messages/MMDSCacheRejoinAck.h @@ -50,10 +50,6 @@ class MMDSCacheRejoinAck : public Message { char *get_type_name() { return "cache_rejoin_ack"; } - void print(ostream& out) { - out << "cache_rejoin" << endl; - } - void add_dir(inodeno_t dirino, int nonce) { 
dirs.push_back(dirinfo(dirino,nonce)); } -- 2.39.5