From d84a49e262e1a66103a97730d8b09587e5694c44 Mon Sep 17 00:00:00 2001 From: Xiaoxi CHEN Date: Mon, 23 Apr 2018 12:57:52 -0500 Subject: [PATCH] mon/OSDMonitor: add feature into osdmap cache key. Change the cache from SimpleLRU<version_t, bufferlist> inc_osd_cache; to SimpleLRU<pair<version_t, uint64_t>, bufferlist> inc_osd_cache; By doing this, we can cache several encoded osdmaps for different features, so it is not necessary to re-encode for each client. Signed-off-by: Xiaoxi CHEN --- src/mon/OSDMonitor.cc | 159 ++++++++++++++++++++++++++++++++---------- src/mon/OSDMonitor.h | 22 ++++-- 2 files changed, 142 insertions(+), 39 deletions(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index dff2831f07fb0..ea050535cac70 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -262,8 +262,9 @@ void OSDMonitor::create_initial() } // encode into pending incremental + uint64_t features = newmap.get_encoding_features(); newmap.encode(pending_inc.fullmap, - mon->get_quorum_con_features() | CEPH_FEATURE_RESERVED); + features | CEPH_FEATURE_RESERVED); pending_inc.full_crc = newmap.get_crc(); dout(20) << " full crc " << pending_inc.full_crc << dendl; } @@ -1262,7 +1263,9 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t) // determine appropriate features features = tmp.get_encoding_features(); - dout(10) << __func__ << " encoding full map with " << features << dendl; + dout(10) << __func__ << " encoding full map with " + << ceph_release_name(tmp.require_osd_release) + << " features " << features << dendl; // the features should be a subset of the mon quorum's features! assert((features & ~mon->get_quorum_con_features()) == 0); @@ -1476,8 +1479,13 @@ void OSDMonitor::share_map_with_random_osd() } dout(10) << "committed, telling random " << s->inst << " all about it" << dendl; + + // get feature of the peer + // use quorum_con_features, if it's an anonymous connection. + uint64_t features = s->con_features ? 
s->con_features : + mon->get_quorum_con_features(); // whatev, they'll request more if they need it - MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch()); + MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features); s->con->send_message(m); // NOTE: do *not* record osd has up to this epoch (as we do // elsewhere) as they may still need to request older values. @@ -2041,6 +2049,11 @@ bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op) { op->mark_osdmon_event(__func__); MMonGetOSDMap *m = static_cast(op->get_req()); + + uint64_t features = mon->get_quorum_con_features(); + if (m->get_session() && m->get_session()->con_features) + features = m->get_session()->con_features; + dout(10) << __func__ << " " << *m << dendl; MOSDMap *reply = new MOSDMap(mon->monmap->fsid); epoch_t first = get_first_committed(); @@ -2049,13 +2062,13 @@ bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op) for (epoch_t e = std::max(first, m->get_full_first()); e <= std::min(last, m->get_full_last()) && max > 0; ++e, --max) { - int r = get_version_full(e, reply->maps[e]); + int r = get_version_full(e, features, reply->maps[e]); assert(r >= 0); } for (epoch_t e = std::max(first, m->get_inc_first()); e <= std::min(last, m->get_inc_last()) && max > 0; ++e, --max) { - int r = get_version(e, reply->incremental_maps[e]); + int r = get_version(e, features, reply->incremental_maps[e]); assert(r >= 0); } reply->oldest_map = first; @@ -3326,25 +3339,25 @@ void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start) } -MOSDMap *OSDMonitor::build_latest_full() +MOSDMap *OSDMonitor::build_latest_full(uint64_t features) { MOSDMap *r = new MOSDMap(mon->monmap->fsid); - get_version_full(osdmap.get_epoch(), r->maps[osdmap.get_epoch()]); + get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]); r->oldest_map = get_first_committed(); r->newest_map = osdmap.get_epoch(); return r; } -MOSDMap *OSDMonitor::build_incremental(epoch_t 
from, epoch_t to) +MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features) { - dout(10) << "build_incremental [" << from << ".." << to << "]" << dendl; + dout(10) << "build_incremental [" << from << ".." << to << "] with features " << std::hex << features << dendl; MOSDMap *m = new MOSDMap(mon->monmap->fsid); m->oldest_map = get_first_committed(); m->newest_map = osdmap.get_epoch(); for (epoch_t e = to; e >= from && e > 0; e--) { bufferlist bl; - int err = get_version(e, bl); + int err = get_version(e, features, bl); if (err == 0) { assert(bl.length()); // if (get_version(e, bl) > 0) { @@ -3354,7 +3367,7 @@ MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to) } else { assert(err == -ENOENT); assert(!bl.length()); - get_version_full(e, bl); + get_version_full(e, features, bl); if (bl.length() > 0) { //else if (get_version("full", e, bl) > 0) { dout(20) << "build_incremental full " << e << " " @@ -3372,7 +3385,7 @@ void OSDMonitor::send_full(MonOpRequestRef op) { op->mark_osdmon_event(__func__); dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl; - mon->send_reply(op, build_latest_full()); + mon->send_reply(op, build_latest_full(op->get_session()->con_features)); } void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first) @@ -3404,6 +3417,11 @@ void OSDMonitor::send_incremental(epoch_t first, dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]" << " to " << session->inst << dendl; + // get feature of the peer + // use quorum_con_features, if it's an anonymous connection. + uint64_t features = session->con_features ? 
session->con_features : + mon->get_quorum_con_features(); + if (first <= session->osd_epoch) { dout(10) << __func__ << " " << session->inst << " should already have epoch " << session->osd_epoch << dendl; @@ -3420,7 +3438,7 @@ void OSDMonitor::send_incremental(epoch_t first, first = get_first_committed(); bufferlist bl; - int err = get_version_full(first, bl); + int err = get_version_full(first, features, bl); assert(err == 0); assert(bl.length()); dout(20) << "send_incremental starting with base full " @@ -3441,7 +3459,7 @@ void OSDMonitor::send_incremental(epoch_t first, while (first <= osdmap.get_epoch()) { epoch_t last = std::min(first + g_conf->osd_map_message_max - 1, osdmap.get_epoch()); - MOSDMap *m = build_incremental(first, last); + MOSDMap *m = build_incremental(first, last, features); if (req) { // send some maps. it may not be all of them, but it will get them @@ -3481,14 +3499,70 @@ void OSDMonitor::get_removed_snaps_range( int OSDMonitor::get_version(version_t ver, bufferlist& bl) { - if (inc_osd_cache.lookup(ver, &bl)) { - return 0; - } - int ret = PaxosService::get_version(ver, bl); - if (!ret) { - inc_osd_cache.add(ver, bl); - } + return get_version(ver, mon->get_quorum_con_features(), bl); +} + +void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features) +{ + OSDMap::Incremental inc; + bufferlist::iterator q = bl.begin(); + inc.decode(q); + // always encode with subset of osdmap's canonical features + uint64_t f = features & inc.encode_features; + dout(20) << __func__ << " " << inc.epoch << " with features " << f + << dendl; + bl.clear(); + if (inc.fullmap.length()) { + // embedded full map? 
+ OSDMap m; + m.decode(inc.fullmap); + inc.fullmap.clear(); + m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED); + } + if (inc.crush.length()) { + // embedded crush map + CrushWrapper c; + auto p = inc.crush.begin(); + c.decode(p); + inc.crush.clear(); + c.encode(inc.crush, f); + } + inc.encode(bl, f | CEPH_FEATURE_RESERVED); +} + +void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features) +{ + OSDMap m; + bufferlist::iterator q = bl.begin(); + m.decode(q); + // always encode with subset of osdmap's canonical features + uint64_t f = features & m.get_encoding_features(); + dout(20) << __func__ << " " << m.get_epoch() << " with features " << f + << dendl; + bl.clear(); + m.encode(bl, f | CEPH_FEATURE_RESERVED); +} + +int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl) +{ + uint64_t significant_features = OSDMap::get_significant_features(features); + if (inc_osd_cache.lookup({ver, significant_features}, &bl)) { + return 0; + } + int ret = PaxosService::get_version(ver, bl); + if (ret < 0) { return ret; + } + // NOTE: this check is imprecise; the OSDMap encoding features may + // be a subset of the latest mon quorum features, but worst case we + // reencode once and then cache the (identical) result under both + // feature masks. 
+ if (significant_features != + OSDMap::get_significant_features(mon->get_quorum_con_features())) { + reencode_incremental_map(bl, features); + } + inc_osd_cache.add({ver, significant_features}, bl); + return 0; } int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc) @@ -3527,7 +3601,8 @@ int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl) bufferlist osdm_bl; bool has_cached_osdmap = false; for (version_t v = ver-1; v >= closest_pinned; --v) { - if (full_osd_cache.lookup(v, &osdm_bl)) { + if (full_osd_cache.lookup({v, mon->get_quorum_con_features()}, + &osdm_bl)) { dout(10) << __func__ << " found map in cache ver " << v << dendl; closest_pinned = v; has_cached_osdmap = true; @@ -3609,20 +3684,34 @@ int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl) int OSDMonitor::get_version_full(version_t ver, bufferlist& bl) { - if (full_osd_cache.lookup(ver, &bl)) { - return 0; - } - int ret = PaxosService::get_version_full(ver, bl); - if (ret == -ENOENT) { - // build map? - ret = get_full_from_pinned_map(ver, bl); - } - if (ret != 0) { - return ret; - } + return get_version_full(ver, mon->get_quorum_con_features(), bl); +} - full_osd_cache.add(ver, bl); +int OSDMonitor::get_version_full(version_t ver, uint64_t features, + bufferlist& bl) +{ + uint64_t significant_features = OSDMap::get_significant_features(features); + if (full_osd_cache.lookup({ver, significant_features}, &bl)) { return 0; + } + int ret = PaxosService::get_version_full(ver, bl); + if (ret == -ENOENT) { + // build map? + ret = get_full_from_pinned_map(ver, bl); + } + if (ret < 0) { + return ret; + } + // NOTE: this check is imprecise; the OSDMap encoding features may + // be a subset of the latest mon quorum features, but worst case we + // reencode once and then cache the (identical) result under both + // feature masks. 
+ if (significant_features != + OSDMap::get_significant_features(mon->get_quorum_con_features())) { + reencode_full_map(bl, features); + } + full_osd_cache.add({ver, significant_features}, bl); + return 0; } epoch_t OSDMonitor::blacklist(const entity_addr_t& a, utime_t until) @@ -3659,7 +3748,7 @@ void OSDMonitor::check_osdmap_sub(Subscription *sub) if (sub->next >= 1) send_incremental(sub->next, sub->session, sub->incremental_onetime); else - sub->session->con->send_message(build_latest_full()); + sub->session->con->send_message(build_latest_full(sub->session->con_features)); if (sub->onetime) mon->session_map.remove_sub(sub); else diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 902436ff1eb39..92f83ea475abf 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -42,6 +42,9 @@ class MOSDMap; #include "erasure-code/ErasureCodeInterface.h" #include "mon/MonOpRequest.h" +#include +// re-include our assert to clobber the system one; fix dout: +#include "include/assert.h" /// information about a particular peer's failure reports for one osd struct failure_reporter_t { @@ -219,8 +222,13 @@ public: map osd_weight; - SimpleLRU inc_osd_cache; - SimpleLRU full_osd_cache; + using osdmap_key_t = std::pair; + using osdmap_cache_t = SimpleLRU, + boost::hash>; + osdmap_cache_t inc_osd_cache; + osdmap_cache_t full_osd_cache; bool has_osdmap_manifest; osdmap_manifest_t osdmap_manifest; @@ -339,8 +347,8 @@ private: bool can_mark_in(int o); // ... 
- MOSDMap *build_latest_full(); - MOSDMap *build_incremental(epoch_t first, epoch_t last); + MOSDMap *build_latest_full(uint64_t features); + MOSDMap *build_incremental(epoch_t first, epoch_t last, uint64_t features); void send_full(MonOpRequestRef op); void send_incremental(MonOpRequestRef op, epoch_t first); public: @@ -532,6 +540,9 @@ private: int load_metadata(int osd, map& m, ostream *err); void count_metadata(const string& field, Formatter *f); + + void reencode_incremental_map(bufferlist& bl, uint64_t features); + void reencode_full_map(bufferlist& bl, uint64_t features); public: void count_metadata(const string& field, map *out); protected: @@ -637,6 +648,9 @@ public: mempool::osdmap::map *gap_removed_snaps); int get_version(version_t ver, bufferlist& bl) override; + int get_version(version_t ver, uint64_t feature, bufferlist& bl); + + int get_version_full(version_t ver, uint64_t feature, bufferlist& bl); int get_version_full(version_t ver, bufferlist& bl) override; int get_inc(version_t ver, OSDMap::Incremental& inc); int get_full_from_pinned_map(version_t ver, bufferlist& bl); -- 2.39.5