From 2c856473990fceaa4f75f50b75f2b99f4492c86b Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Tue, 18 Apr 2017 15:49:21 -0700 Subject: [PATCH] PGLog: store extra duplicate ops beyond the normal log entries This helps us avoid replaying non-idempotent client operations when the pg log is very short, e.g. in an effort to force OSDs to use backfill rather than regular recovery. This can be advantageous to avoid blocking i/o to objects, at the cost of longer total time to become clean (since backfill requires scanning the objects to see what is missing). Signed-off-by: Josh Durgin --- src/osd/PGLog.cc | 60 ++++++++++++++++++++++++++++++++--- src/osd/PGLog.h | 23 ++++++++++++-- src/osd/osd_types.cc | 74 +++++++++++++++++++++++++++++++++++++++++--- src/osd/osd_types.h | 44 +++++++++++++++++++++++--- 4 files changed, 187 insertions(+), 14 deletions(-) diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc index a54faafbe2e..cf50db16e6a 100644 --- a/src/osd/PGLog.cc +++ b/src/osd/PGLog.cc @@ -46,7 +46,8 @@ void PGLog::IndexedLog::split_out_child( void PGLog::IndexedLog::trim( CephContext* cct, eversion_t s, - set *trimmed) + set *trimmed, + set *trimmed_dups) { if (complete_to != log.end() && complete_to->version <= s) { @@ -67,6 +68,18 @@ void PGLog::IndexedLog::trim( unindex(e); // remove from index, + // add to dup list + if (e.version.version + 1000 > s.version) { + dirty_dups = true; + dups.push_back(pg_log_dup_t(e)); + dup_index[e.reqid] = &(dups.back()); + for (const auto& extra : e.extra_reqids) { + dups.push_back(pg_log_dup_t(e.version, extra.second, + extra.first, e.return_code)); + dup_index[extra.first] = &(dups.back()); + } + } + if (rollback_info_trimmed_to_riter == log.rend() || e.version == rollback_info_trimmed_to_riter->version) { log.pop_front(); @@ -76,6 +89,17 @@ void PGLog::IndexedLog::trim( } } + while (!dups.empty()) { + auto &e = *dups.begin(); + if (e.version.version + 1000 > s.version) + break; + generic_dout(20) << "trim dup " << e << dendl; + if
(trimmed_dups) + trimmed_dups->insert(e.get_key_name()); + dup_index.erase(e.reqid); + dups.pop_front(); + } + // raise tail? if (tail < s) tail = s; @@ -124,7 +148,7 @@ void PGLog::trim( assert(trim_to <= info.last_complete); dout(10) << "trim " << log << " to " << trim_to << dendl; - log.trim(cct, trim_to, &trimmed); + log.trim(cct, trim_to, &trimmed, &trimmed_dups); info.log_tail = log.tail; } } @@ -446,6 +470,7 @@ void PGLog::write_log_and_missing( << ", dirty_from: " << dirty_from << ", writeout_from: " << writeout_from << ", trimmed: " << trimmed + << ", trimmed_dups: " << trimmed_dups << ", clear_divergent_priors: " << clear_divergent_priors << dendl; _write_log_and_missing( @@ -454,6 +479,7 @@ void PGLog::write_log_and_missing( dirty_from, writeout_from, trimmed, + trimmed_dups, missing, !touched_log, require_rollback, @@ -511,13 +537,14 @@ void PGLog::_write_log_and_missing_wo_missing( eversion_t dirty_from, eversion_t writeout_from, const set &trimmed, + const set &trimmed_dups, bool dirty_divergent_priors, bool touch_log, bool require_rollback, set *log_keys_debug ) { - set to_remove; + set to_remove(trimmed_dups); for (set::const_iterator i = trimmed.begin(); i != trimmed.end(); ++i) { @@ -563,6 +590,18 @@ void PGLog::_write_log_and_missing_wo_missing( (*km)[p->get_key_name()].claim(bl); } + if (dirty_dups) { + pg_log_dup_t min; + t.omap_rmkeyrange( + coll, log_oid, + min.get_key_name(), log.dups.begin()->get_key_name()); + for (const auto& entry : log.dups) { + bufferlist bl; + ::encode(entry, bl); + (*km)[entry.get_key_name()].claim(bl); + } + } + if (log_keys_debug) { for (map::iterator i = (*km).begin(); i != (*km).end(); @@ -600,6 +639,7 @@ void PGLog::_write_log_and_missing( eversion_t dirty_from, eversion_t writeout_from, const set &trimmed, + const set &trimmed_dups, const pg_missing_tracker_t &missing, bool touch_log, bool require_rollback, @@ -607,7 +647,7 @@ void PGLog::_write_log_and_missing( bool *rebuilt_missing_with_deletes, // in/out 
param set *log_keys_debug ) { - set to_remove; + set to_remove(trimmed_dups); for (set::const_iterator i = trimmed.begin(); i != trimmed.end(); ++i) { @@ -652,6 +692,18 @@ void PGLog::_write_log_and_missing( (*km)[p->get_key_name()].claim(bl); } + if (dirty_dups) { + pg_log_dup_t min; + t.omap_rmkeyrange( + coll, log_oid, + min.get_key_name(), log.dups.begin()->get_key_name()); + for (const auto& entry : log.dups) { + bufferlist bl; + ::encode(entry, bl); + (*km)[entry.get_key_name()].claim(bl); + } + } + if (log_keys_debug) { for (map::iterator i = (*km).begin(); i != (*km).end(); diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h index 19aeadfbd06..d8bf60ee997 100644 --- a/src/osd/PGLog.h +++ b/src/osd/PGLog.h @@ -82,6 +82,7 @@ public: mutable ceph::unordered_map objects; // ptrs into log. be careful! mutable ceph::unordered_map caller_ops; mutable ceph::unordered_multimap extra_caller_ops; + mutable ceph::unordered_map dup_index; // recovery pointers list::iterator complete_to; // not inclusive of referenced item @@ -398,6 +399,7 @@ public: objects.clear(); caller_ops.clear(); extra_caller_ops.clear(); + dup_index.clear(); indexed_data = 0; } void unindex(pg_log_entry_t& e) { @@ -476,7 +478,8 @@ public: void trim( CephContext* cct, eversion_t s, - set *trimmed); + set *trimmed, + set *trimmed_dups); ostream& print(ostream& out) const; }; @@ -492,11 +495,13 @@ protected: eversion_t dirty_from; ///< must clear/writeout all keys >= dirty_from eversion_t writeout_from; ///< must writout keys >= writeout_from set trimmed; ///< must clear keys in trimmed + set trimmed_dups; ///< must clear keys in trimmed_dups CephContext *cct; bool pg_log_debug; /// Log is clean on [dirty_to, dirty_from) bool touched_log; bool clear_divergent_priors; + bool dirty_dups; /// log.dups is updated bool rebuilt_missing_with_deletes = false; void mark_dirty_to(eversion_t to) { @@ -519,6 +524,7 @@ public: (writeout_from != eversion_t::max()) || !(trimmed.empty()) || !missing.is_clean() || + 
!(trimmed_dups.empty()) || rebuilt_missing_with_deletes; } void mark_log_for_rewrite() { @@ -554,9 +560,11 @@ protected: dirty_from = eversion_t::max(); touched_log = true; trimmed.clear(); + trimmed_dups.clear(); writeout_from = eversion_t::max(); check(); missing.flush(); + dirty_dups = false; } public: // cppcheck-suppress noExplicitConstructor @@ -1111,6 +1119,7 @@ public: eversion_t dirty_from, eversion_t writeout_from, const set &trimmed, + const set &trimmed_dups, bool dirty_divergent_priors, bool touch_log, bool require_rollback, @@ -1126,6 +1135,7 @@ public: eversion_t dirty_from, eversion_t writeout_from, const set &trimmed, + const set &trimmed_dups, const pg_missing_tracker_t &missing, bool touch_log, bool require_rollback, @@ -1181,6 +1191,7 @@ public: bool has_divergent_priors = false; missing.may_include_deletes = false; list entries; + list dups; if (p) { for (p->seek_to_first(); p->valid() ; p->next(false)) { // non-log pgmeta_oid keys are prefixed with _; skip those @@ -1209,6 +1220,13 @@ public: assert(missing.may_include_deletes); } missing.add(oid, item.need, item.have, item.is_delete()); + } else if (p->key().substr(0, 4) == string("dup_")) { + pg_log_dup_t dup; + ::decode(dup, bp); + if (!dups.empty()) { + assert(dups.back().version < dup.version); + } + dups.push_back(dup); } else { pg_log_entry_t e; e.decode_with_checksum(bp); @@ -1229,7 +1247,8 @@ public: info.log_tail, on_disk_can_rollback_to, on_disk_rollback_info_trimmed_to, - std::move(entries)); + std::move(entries), + std::move(dups)); if (has_divergent_priors || debug_verify_stored_missing) { // build missing diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 645229cd1d5..49e6713e860 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -4118,6 +4118,57 @@ ostream& operator<<(ostream& out, const pg_log_entry_t& e) return out; } +// -- pg_log_dup_t -- + +string pg_log_dup_t::get_key_name() const +{ + return "dup_" + version.get_key_name(); +} + +void 
pg_log_dup_t::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + ::encode(reqid, bl); + ::encode(version, bl); + ::encode(user_version, bl); + ::encode(return_code, bl); + ENCODE_FINISH(bl); +} + +void pg_log_dup_t::decode(bufferlist::iterator &bl) +{ + DECODE_START(1, bl); + ::decode(reqid, bl); + ::decode(version, bl); + ::decode(user_version, bl); + ::decode(return_code, bl); + DECODE_FINISH(bl); +} + +void pg_log_dup_t::dump(Formatter *f) const +{ + f->dump_stream("reqid") << reqid; + f->dump_stream("version") << version; + f->dump_stream("user_version") << user_version; + f->dump_stream("return_code") << return_code; +} + +void pg_log_dup_t::generate_test_instances(list& o) +{ + o.push_back(new pg_log_dup_t()); + o.push_back(new pg_log_dup_t(eversion_t(1,2), 1, + osd_reqid_t(entity_name_t::CLIENT(777), 8, 999), 0)); + o.push_back(new pg_log_dup_t(eversion_t(1,2), 2, + osd_reqid_t(entity_name_t::CLIENT(777), 8, 999), -ENOENT)); +} + +ostream& operator<<(ostream& out, const pg_log_dup_t& e) +{ + out << e.reqid << " v" << e.version << " uv" << e.user_version + << " rc=" << e.return_code; + return out; +} + // -- pg_log_t -- @@ -4159,18 +4210,19 @@ void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap, void pg_log_t::encode(bufferlist& bl) const { - ENCODE_START(6, 3, bl); + ENCODE_START(7, 3, bl); ::encode(head, bl); ::encode(tail, bl); ::encode(log, bl); ::encode(can_rollback_to, bl); ::encode(rollback_info_trimmed_to, bl); + ::encode(dups, bl); ENCODE_FINISH(bl); } void pg_log_t::decode(bufferlist::iterator &bl, int64_t pool) { - DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl); + DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl); ::decode(head, bl); ::decode(tail, bl); if (struct_v < 2) { @@ -4185,6 +4237,10 @@ void pg_log_t::decode(bufferlist::iterator &bl, int64_t pool) ::decode(rollback_info_trimmed_to, bl); else rollback_info_trimmed_to = tail; + + if (struct_v >= 7) + ::decode(dups, bl); + DECODE_FINISH(bl); // handle hobject_t format change @@
-4209,6 +4265,13 @@ void pg_log_t::dump(Formatter *f) const f->close_section(); } f->close_section(); + f->open_array_section("dups"); + for (const auto& entry : dups) { + f->open_object_section("entry"); + entry.dump(f); + f->close_section(); + } + f->close_section(); } void pg_log_t::generate_test_instances(list& o) @@ -4280,13 +4343,16 @@ void pg_log_t::copy_up_to(const pg_log_t &other, int max) } } -ostream& pg_log_t::print(ostream& out) const +ostream& pg_log_t::print(ostream& out) const { out << *this << std::endl; for (list::const_iterator p = log.begin(); p != log.end(); - ++p) + ++p) out << *p << std::endl; + for (const auto& entry : dups) { + out << " dup entry: " << entry << std::endl; + } return out; } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 62448bbe368..5fdd784fe50 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -3405,7 +3405,30 @@ WRITE_CLASS_ENCODER(pg_log_entry_t) ostream& operator<<(ostream& out, const pg_log_entry_t& e); +struct pg_log_dup_t { + osd_reqid_t reqid; // caller+tid to uniquely identify request + eversion_t version; + version_t user_version; // the user version for this entry + int32_t return_code; // only stored for ERRORs for dup detection + pg_log_dup_t() + : user_version(0), return_code(0) {} + explicit pg_log_dup_t(const pg_log_entry_t &entry) + : reqid(entry.reqid), version(entry.version), + user_version(entry.user_version), return_code(entry.return_code) + {} + pg_log_dup_t(const eversion_t& v, version_t uv, + const osd_reqid_t& rid, int return_code) + : reqid(rid), version(v), user_version(uv), + return_code(return_code) + {} + string get_key_name() const; + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(pg_log_dup_t) /** * pg_log_t - incremental log of recent pg changes.
@@ -3432,32 +3455,39 @@ protected: public: mempool::osd_pglog::list log; // the actual log. + mempool::osd_pglog::list dups; // entries just for dup op detection pg_log_t() = default; pg_log_t(const eversion_t &last_update, const eversion_t &log_tail, const eversion_t &can_rollback_to, const eversion_t &rollback_info_trimmed_to, - mempool::osd_pglog::list &&entries) + mempool::osd_pglog::list &&entries, + mempool::osd_pglog::list &&dup_entries) : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to), rollback_info_trimmed_to(rollback_info_trimmed_to), - log(std::move(entries)) {} + log(std::move(entries)), dups(std::move(dup_entries)) {} pg_log_t(const eversion_t &last_update, const eversion_t &log_tail, const eversion_t &can_rollback_to, const eversion_t &rollback_info_trimmed_to, - const std::list &entries) + const std::list &entries, + const std::list &dup_entries) : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to), rollback_info_trimmed_to(rollback_info_trimmed_to) { for (auto &&entry: entries) { log.push_back(entry); } + for (auto &&entry: dup_entries) { + dups.push_back(entry); + } } void clear() { eversion_t z; rollback_info_trimmed_to = can_rollback_to = head = tail = z; log.clear(); + dups.clear(); } eversion_t get_rollback_info_trimmed_to() const { @@ -3485,12 +3515,18 @@ public: oldlog.erase(i++); } + // osd_reqid is unique, so it doesn't matter if there are extra + // dup entries in each pg. To avoid storing oid with the dup + // entries, just copy the whole list. + auto childdups(dups); + return pg_log_t( head, tail, can_rollback_to, rollback_info_trimmed_to, - std::move(childlog)); + std::move(childlog), + std::move(childdups)); } mempool::osd_pglog::list rewind_from_head(eversion_t newhead) { -- 2.39.5