From 306373427836ca0c2418dbe6caab26d74d94d12e Mon Sep 17 00:00:00 2001
From: Kefu Chai
Date: Fri, 19 Jun 2015 22:57:57 +0800
Subject: [PATCH] tools/ceph-monstore-tools: add rewrite command

"rewrite" command will
 - add a new osdmap version to update current osdmap held by OSDMonitor
 - add a new paxos version, as a proposal it will
   * rewrite all osdmap epochs from specified epoch to the last_committed
     one with the specified crush map.
   * add the new osdmap which is added just now
so the leader monitor can trigger a recovery process to apply the transaction
to all monitors in quorum, and hence bring them back to normal after being
injected with a faulty crushmap.

Fixes: #11815
Signed-off-by: Kefu Chai
---
 Note: this copy differs from the commit named above. update_osdmap() now
 decodes the stored incremental map into the object that is encoded and
 stored afterwards (it used to decode into a shadowing local, which was then
 dropped), and the added functions are documented. The hunk sizes and the
 diffstat are adjusted accordingly; the "index" line still names the
 original blobs.

 src/tools/ceph_monstore_tool.cc | 235 ++++++++++++++++++++++++++++++++
 1 file changed, 235 insertions(+)

diff --git a/src/tools/ceph_monstore_tool.cc b/src/tools/ceph_monstore_tool.cc
index 7ea86ba992168..969340dd97b30 100644
--- a/src/tools/ceph_monstore_tool.cc
+++ b/src/tools/ceph_monstore_tool.cc
@@ -164,6 +164,7 @@ int parse_cmd_args(
  *  dump-trace < --trace-file arg >
  *  replay-trace
  *  random-gen
+ *  rewrite-crush
  *
  * wanted syntax:
  *
@@ -202,6 +203,8 @@ void usage(const char *n, po::options_description &d)
     << " (replay-trace -- --help for more info)\n"
     << " random-gen [-- options] add randomly generated ops to the store\n"
     << " (random-gen -- --help for more info)\n"
+    << " rewrite-crush [-- options] add a rewrite commit to the store\n"
+    << " (rewrite-crush -- --help for more info)\n"
     << std::endl;
   std::cerr << d << std::endl;
   std::cerr
@@ -213,6 +216,236 @@ void usage(const char *n, po::options_description &d)
     << std::endl;
 }
 
+/**
+ * queue the rewrite of one osdmap epoch with the given crush map.
+ *
+ * the full map and the incremental map of the epoch are re-encoded and put
+ * into @c t, the store itself is only read.
+ *
+ * @param store the store the existing maps are read from
+ * @param ver   the epoch whose maps are read
+ * @param copy  if false, rewrite epoch @c ver in place; if true, bump the
+ *              epoch of the full map with inc_epoch() and queue it as a
+ *              new epoch, along with a fresh incremental map carrying
+ *              only the new epoch and the fsid
+ * @param crush the crush map to put into the rewritten maps
+ * @param t     the transaction collecting the updates
+ * @return 0 on success, otherwise the error returned by store.get()
+ */
+int update_osdmap(MonitorDBStore& store, version_t ver, bool copy,
+                  ceph::shared_ptr<CrushWrapper> crush,
+                  MonitorDBStore::Transaction* t) {
+  const string prefix("osdmap");
+
+  // full
+  bufferlist bl;
+  int r = 0;
+  r = store.get(prefix, store.combine_strings("full", ver), bl);
+  if (r) {
+    std::cerr << "Error getting full map: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  OSDMap osdmap;
+  osdmap.decode(bl);
+  osdmap.crush = crush;
+  if (copy) {
+    osdmap.inc_epoch();
+  }
+  bl.clear();
+  // be consistent with OSDMonitor::update_from_paxos()
+  osdmap.encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED);
+  t->put(prefix, store.combine_strings("full", osdmap.get_epoch()), bl);
+
+  // incremental
+  OSDMap::Incremental inc;
+  if (copy) {
+    inc.epoch = osdmap.get_epoch();
+    inc.fsid = osdmap.get_fsid();
+  } else {
+    bl.clear();
+    r = store.get(prefix, ver, bl);
+    if (r) {
+      std::cerr << "Error getting inc map: " << cpp_strerror(r) << std::endl;
+      return r;
+    }
+    // decode into the "inc" declared above: it is the one encoded and stored
+    // below, a second local of the same name would shadow it and be dropped
+    bufferlist::iterator p = bl.begin();
+    inc.decode(p);
+    if (inc.crush.length()) {
+      inc.crush.clear();
+      crush->encode(inc.crush);
+    }
+    if (inc.fullmap.length()) {
+      OSDMap fullmap;
+      fullmap.decode(inc.fullmap);
+      fullmap.crush = crush;
+      inc.fullmap.clear();
+      fullmap.encode(inc.fullmap);
+    }
+  }
+  assert(osdmap.have_crc());
+  inc.full_crc = osdmap.get_crc();
+  bl.clear();
+  // be consistent with OSDMonitor::update_from_paxos()
+  inc.encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED);
+  t->put(prefix, inc.epoch, bl);
+  return 0;
+}
+
+/**
+ * fill @c t with the updates replacing the crush map of the osdmap epochs
+ * (good epoch, last_committed], and adding one new epoch on top of them.
+ *
+ * @param store      the store the existing maps are read from
+ * @param version    the known-good epoch; zero or a negative number -N
+ *                   stands for last_committed-N
+ * @param crush_file path to an encoded crush map; if empty, the crush map
+ *                   of the full osdmap of the known-good epoch is used
+ * @param t          the transaction collecting the updates
+ * @return 0 on success, and also if the known-good epoch is not older than
+ *         last_committed, in which case @c t is left untouched;
+ *         EINVAL if a negative @c version points before epoch 0;
+ *         otherwise the error returned while reading the maps or the file
+ */
+int rewrite_transaction(MonitorDBStore& store, int version,
+                        const string& crush_file,
+                        MonitorDBStore::Transaction* t) {
+  const string prefix("osdmap");
+
+  // calc the known-good epoch
+  version_t last_committed = store.get(prefix, "last_committed");
+  version_t good_version = 0;
+  if (version <= 0) {
+    if (last_committed >= (unsigned)-version) {
+      good_version = last_committed + version;
+    } else {
+      std::cerr << "osdmap-version is less than: -" << last_committed << std::endl;
+      return EINVAL;
+    }
+  } else {
+    good_version = version;
+  }
+  if (good_version >= last_committed) {
+    std::cout << "good epoch is greater or equal to the last committed one: "
+              << good_version << " >= " << last_committed << std::endl;
+    return 0;
+  }
+
+  // load/extract the crush map
+  int r = 0;
+  ceph::shared_ptr<CrushWrapper> crush(new CrushWrapper);
+  if (crush_file.empty()) {
+    bufferlist bl;
+    r = store.get(prefix, store.combine_strings("full", good_version), bl);
+    if (r) {
+      std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl;
+      return r;
+    }
+    OSDMap osdmap;
+    osdmap.decode(bl);
+    crush = osdmap.crush;
+  } else {
+    string err;
+    bufferlist bl;
+    r = bl.read_file(crush_file.c_str(), &err);
+    if (r) {
+      std::cerr << err << ": " << cpp_strerror(r) << std::endl;
+      return r;
+    }
+    bufferlist::iterator p = bl.begin();
+    crush->decode(p);
+  }
+
+  // prepare a transaction to rewrite the epochs
+  //   (good_version, last_committed]
+  // with the good crush map.
+  // XXX: may need to break this into several paxos versions?
+  assert(good_version < last_committed);
+  for (version_t v = good_version + 1; v <= last_committed; v++) {
+    cout << "rewriting epoch #" << v << "/" << last_committed << std::endl;
+    r = update_osdmap(store, v, false, crush, t);
+    if (r)
+      return r;
+  }
+
+  // add a new osdmap epoch to store, so monitors will update their current osdmap
+  // in addition to the ones stored in epochs.
+  cout << "adding a new epoch #" << last_committed+1 << std::endl;
+  // copy from the current last_committed, then advance it to the new epoch
+  r = update_osdmap(store, last_committed++, true, crush, t);
+  if (r)
+    return r;
+  t->put(prefix, store.combine_strings("full", "latest"), last_committed);
+  t->put(prefix, "last_committed", last_committed);
+  return 0;
+}
+
+/**
+ * create a new paxos version which carries a proposal to rewrite all epochs
+ * of incremental and full map of "osdmap" after a faulty crush map is injected.
+ * so the leader will trigger a recovery and propagate this fix to its peons,
+ * after the proposal is accepted, and the transaction in it is applied. all
+ * monitors will rewrite the bad crush map with the good one, and have a new
+ * osdmap epoch with the good crush map in it.
+ *
+ * @param progname the program name, only passed on to usage()
+ * @param subcmds  the command specific arguments, parsed by parse_cmd_args()
+ * @param store    the monitor store the pending proposal is written to
+ * @return 0 on success or when only the help is printed, otherwise the negated
+ *         result of parse_cmd_args() or the error from rewrite_transaction()
+ */
+int rewrite_crush(const char* progname,
+                  vector<string>& subcmds,
+                  MonitorDBStore& store) {
+  po::options_description op_desc("Allowed 'rewrite-crush' options");
+  int version = -1;
+  string crush_file;
+  op_desc.add_options()
+    ("help,h", "produce this help message")
+    ("crush", po::value<string>(&crush_file),
+     ("path to the crush map file "
+      "(default: will instead extract it from the known-good osdmap)"))
+    ("good-epoch", po::value<int>(&version),
+     "known-good epoch of osdmap, if a negative number '-N' is given, the "
+     "$last_committed-N is used instead (default: -1). "
+     "Please note, -1 is not necessarily a good epoch, because there are "
+     "good chance that we have more epochs slipped into the monstore after "
+     "the one where the crushmap is firstly injected.")
+    ;
+  po::variables_map op_vm;
+  int r = parse_cmd_args(&op_desc, NULL, NULL, subcmds, &op_vm);
+  if (r) {
+    return -r;
+  }
+  if (op_vm.count("help")) {
+    usage(progname, op_desc);
+    return 0;
+  }
+
+  MonitorDBStore::Transaction rewrite_txn;
+  r = rewrite_transaction(store, version, crush_file, &rewrite_txn);
+  if (r) {
+    return r;
+  }
+
+  // store the transaction into store as a proposal
+  const string prefix("paxos");
+  version_t pending_v = store.get(prefix, "last_committed") + 1;
+  MonitorDBStore::TransactionRef t(new MonitorDBStore::Transaction);
+  bufferlist bl;
+  rewrite_txn.encode(bl);
+  cout << "adding pending commit " << pending_v
+       << " " << bl.length() << " bytes" << std::endl;
+  t->put(prefix, pending_v, bl);
+  t->put(prefix, "pending_v", pending_v);
+  // a large enough yet unique proposal number will probably do the trick
+  version_t pending_pn = (store.get(prefix, "accepted_pn") / 100 + 4) * 100 + 1;
+  t->put(prefix, "pending_pn", pending_pn);
+  store.apply_transaction(t);
+  return 0;
+}
+
 int main(int argc, char **argv) {
   int err = 0;
   po::options_description desc("Allowed options");
@@ -692,6 +925,8 @@ int main(int argc, char **argv) {
               << stringify(si_t(total_size)) << std::endl;
     std::cout << "from '" << store_path << "' to '" << out_path << "'"
               << std::endl;
+  } else if (cmd == "rewrite-crush") {
+    err = rewrite_crush(argv[0], subcmds, st);
   } else {
     std::cerr << "Unrecognized command: " << cmd << std::endl;
     usage(argv[0], desc);
-- 
2.39.5