From b84803d6092667631527b8b37000cac6447e0b84 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Tue, 12 Sep 2017 17:17:13 -0700 Subject: [PATCH] osd: Only scan for omap corruption once Before state 2: Can have complete tables (some may be bad) state 3: Never had complete tables After state 2: Can have complete tables (some may be bad) state 3 with legacy: Can have complete tables (bad ones are cleared) state 3: Never had complete tables Once OSDs boot with this change you can't downgrade to a previous release. If someone does downgrade they could have unstable OSDs that hit assert(state.v < 3). The following command run after shutting down the cluster but before downgrading ceph packages would be a way to fix this. ceph-osdomap-tool --omap-path ... --command resetv2 Fixes: http://tracker.ceph.com/issues/21328 Signed-off-by: David Zafman (cherry picked from commit 8805ef53424e30fd3f24ee38f5a6bdd9e6dd8641) --- src/os/ObjectMap.h | 2 +- src/os/filestore/DBObjectMap.cc | 87 +++++++++++++++++++-------------- src/os/filestore/DBObjectMap.h | 20 ++++++-- src/tools/ceph_osdomap_tool.cc | 4 +- 4 files changed, 70 insertions(+), 43 deletions(-) diff --git a/src/os/ObjectMap.h b/src/os/ObjectMap.h index 67a5780ae8c28..32b423c286168 100644 --- a/src/os/ObjectMap.h +++ b/src/os/ObjectMap.h @@ -152,7 +152,7 @@ public: const SequencerPosition *spos=0 ///< [in] Sequencer ) { return 0; } - virtual int check(std::ostream &out, bool repair = false) { return 0; } + virtual int check(std::ostream &out, bool repair = false, bool force = false) { return 0; } virtual void compact() {} diff --git a/src/os/filestore/DBObjectMap.cc b/src/os/filestore/DBObjectMap.cc index 349fc15b862f9..b36d9fe5509d0 100644 --- a/src/os/filestore/DBObjectMap.cc +++ b/src/os/filestore/DBObjectMap.cc @@ -55,9 +55,9 @@ static void append_escaped(const string &in, string *out) } } -int DBObjectMap::check(std::ostream &out, bool repair) +int DBObjectMap::check(std::ostream &out, bool repair, bool force) { - int errors = 0; + int errors = 0, comp_errors = 0; bool repaired = false; map parent_to_num_children; map parent_to_actual_num_children; @@ -71,34 +71,37 @@ int DBObjectMap::check(std::ostream &out, bool repair) if (header.seq != 0) parent_to_actual_num_children[header.seq] = header.num_children; - // Check complete table - bool complete_error = false; - boost::optional prev; - KeyValueDB::Iterator complete_iter = db->get_iterator(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX); - for (complete_iter->seek_to_first(); complete_iter->valid(); - complete_iter->next()) { - if (prev && prev >= complete_iter->key()) { - out << "Bad complete for " << header.oid << std::endl; - complete_error = true; - break; - } - prev = string(complete_iter->value().c_str(), complete_iter->value().length() - 1); - } - if (complete_error) { - out << "Complete mapping for " << header.seq << " :" << std::endl; - for (complete_iter->seek_to_first(); complete_iter->valid(); - complete_iter->next()) { - out << complete_iter->key() << " -> " << string(complete_iter->value().c_str(), complete_iter->value().length() - 1) << std::endl; - } - if (repair) { - repaired = true; - KeyValueDB::Transaction t = db->get_transaction(); - t->rmkeys_by_prefix(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX); - db->submit_transaction(t); - out << "Cleared complete mapping to repair" << std::endl; - } else { - errors++; // Only count when not repaired - } + if (state.v == 2 || force) { + // Check complete table + bool complete_error = false; + boost::optional prev; + KeyValueDB::Iterator complete_iter = db->get_iterator(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX); + for (complete_iter->seek_to_first(); complete_iter->valid(); + complete_iter->next()) { + if (prev && prev >= complete_iter->key()) { + out << "Bad complete for " << header.oid << std::endl; + complete_error = true; + break; + } + prev = string(complete_iter->value().c_str(), complete_iter->value().length() - 1); + } + if (complete_error) { + out << "Complete mapping for " << header.seq << " :" << std::endl; + for (complete_iter->seek_to_first(); complete_iter->valid(); + complete_iter->next()) { + out << complete_iter->key() << " -> " << string(complete_iter->value().c_str(), complete_iter->value().length() - 1) << std::endl; + } + if (repair) { + repaired = true; + KeyValueDB::Transaction t = db->get_transaction(); + t->rmkeys_by_prefix(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX); + db->submit_transaction(t); + out << "Cleared complete mapping to repair" << std::endl; + } else { + errors++; // Only count when not repaired + comp_errors++; // Track errors here for version update + } + } } if (header.parent == 0) @@ -137,6 +140,17 @@ int DBObjectMap::check(std::ostream &out, bool repair) } parent_to_actual_num_children.erase(i->first); } + + // Only advance the version from 2 to 3 here + // Mark as legacy because there are still older structures + // we don't update. The value of legacy is only used + // for internal assertions. + if (comp_errors == 0 && state.v == 2 && repair) { + state.v = 3; + state.legacy = true; + set_state(); + } + if (errors == 0 && repaired) return -1; return errors; @@ -645,7 +659,7 @@ int DBObjectMap::rm_keys(const ghobject_t &oid, return db->submit_transaction(t); } - assert(state.v < 3); + assert(state.legacy); { // We only get here for legacy (v2) stores @@ -852,7 +866,7 @@ int DBObjectMap::legacy_clone(const ghobject_t &oid, const ghobject_t &target, const SequencerPosition *spos) { - state.v = 2; + state.legacy = true; if (oid == target) return 0; @@ -1030,7 +1044,8 @@ void DBObjectMap::set_state() Mutex::Locker l(header_lock); KeyValueDB::Transaction t = db->get_transaction(); write_state(t); - db->submit_transaction_sync(t); + int ret = db->submit_transaction_sync(t); + assert(ret == 0); dout(1) << __func__ << " done" << dendl; return; } @@ -1048,9 +1063,9 @@ int DBObjectMap::get_state() state.decode(bliter); } else { // New store - // Version 3 means that complete regions never used - state.v = 3; + state.v = State::CUR_VERSION; state.seq = 1; + state.legacy = false; } return 0; } @@ -1236,7 +1251,7 @@ void DBObjectMap::clear_header(Header header, KeyValueDB::Transaction t) dout(20) << "clear_header: clearing seq " << header->seq << dendl; t->rmkeys_by_prefix(user_prefix(header)); t->rmkeys_by_prefix(sys_prefix(header)); - if (state.v < 3) + if (state.legacy) t->rmkeys_by_prefix(complete_prefix(header)); // Needed when header.parent != 0 t->rmkeys_by_prefix(xattr_prefix(header)); set keys; diff --git a/src/os/filestore/DBObjectMap.h b/src/os/filestore/DBObjectMap.h index 4d95450fe1d01..fb1653e489d0e 100644 --- a/src/os/filestore/DBObjectMap.h +++ b/src/os/filestore/DBObjectMap.h @@ -229,7 +229,7 @@ public: int upgrade_to_v2(); /// Consistency check, debug, there must be no parallel writes - int check(std::ostream &out, bool repair = false) override; + int check(std::ostream &out, bool repair = false, bool force = false) override; /// Ensure that all previous operations are durable int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0) override; @@ -265,30 +265,40 @@ public: /// persistent state for store @see generate_header struct State { + static const __u8 CUR_VERSION = 3; __u8 v; uint64_t seq; - State() : v(0), seq(1) {} - explicit State(uint64_t seq) : v(0), seq(seq) {} + // legacy is false when complete regions never used + bool legacy; + State() : v(0), seq(1), legacy(false) {} + explicit State(uint64_t seq) : v(0), seq(seq), legacy(false) {} void encode(bufferlist &bl) const { - ENCODE_START(2, 1, bl); + ENCODE_START(3, 1, bl); ::encode(v, bl); ::encode(seq, bl); + ::encode(legacy, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator &bl) { - DECODE_START(2, bl); + DECODE_START(3, bl); if (struct_v >= 2) ::decode(v, bl); else v = 0; ::decode(seq, bl); + if (struct_v >= 3) + ::decode(legacy, bl); + else + legacy = false; DECODE_FINISH(bl); } void dump(Formatter *f) const { + f->dump_unsigned("v", v); f->dump_unsigned("seq", seq); + f->dump_bool("legacy", legacy); } static void generate_test_instances(list &o) { diff --git a/src/tools/ceph_osdomap_tool.cc b/src/tools/ceph_osdomap_tool.cc index d7c240d792311..52fa7ae710c64 100644 --- a/src/tools/ceph_osdomap_tool.cc +++ b/src/tools/ceph_osdomap_tool.cc @@ -126,6 +126,7 @@ int main(int argc, char **argv) { omap.get_state(); std::cout << "Version: " << (int)omap.state.v << std::endl; std::cout << "Seq: " << omap.state.seq << std::endl; + std::cout << "legacy: " << (omap.state.legacy ? "true" : "false") << std::endl; if (cmd == "dump-raw-keys") { KeyValueDB::WholeSpaceIterator i = store->get_iterator(); @@ -178,7 +179,7 @@ int main(int argc, char **argv) { } else if (cmd == "check" || cmd == "repair") { ostringstream ss; bool repair = (cmd == "repair"); - r = omap.check(ss, repair); + r = omap.check(ss, repair, true); if (r) { std::cerr << ss.str() << std::endl; if (r > 0) { @@ -200,6 +201,7 @@ int main(int argc, char **argv) { return 0; } else if (cmd == "resetv2") { omap.state.v = 2; + omap.state.legacy = false; omap.set_state(); } else { std::cerr << "Did not recognize command " << cmd << std::endl; -- 2.39.5