From: Xiaoxi CHEN Date: Mon, 23 Apr 2018 17:57:52 +0000 (-0500) Subject: mon/OSDMonitor: add feature into osdmap cache key. X-Git-Tag: v12.2.6~44^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=0b6c15aedc723e246a43571f3470019194da8142;p=ceph.git mon/OSDMonitor: add feature into osdmap cache key. Change the cache from SimpleLRU<version_t, bufferlist> inc_osd_cache; to SimpleLRU<pair<version_t, uint64_t>, bufferlist> inc_osd_cache; By doing this, we can cache several encoded osdmaps for different features, so it is not necessary to re-encode for each client. Signed-off-by: Xiaoxi CHEN (cherry picked from commit d84a49e262e1a66103a97730d8b09587e5694c44) Conflicts: src/mon/OSDMonitor.cc drop get_inc and get_full_from_pinned_map, which were introduced post luminous (commit bc5df2b4497104c2a8747daf0530bb5184f9fecb) --- diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 40896d336110..4b7c9da84791 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -259,8 +259,9 @@ void OSDMonitor::create_initial() } // encode into pending incremental + uint64_t features = newmap.get_encoding_features(); newmap.encode(pending_inc.fullmap, - mon->get_quorum_con_features() | CEPH_FEATURE_RESERVED); + features | CEPH_FEATURE_RESERVED); pending_inc.full_crc = newmap.get_crc(); dout(20) << " full crc " << pending_inc.full_crc << dendl; } @@ -1339,7 +1340,9 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t) // determine appropriate features features = tmp.get_encoding_features(); - dout(10) << __func__ << " encoding full map with " << features << dendl; + dout(10) << __func__ << " encoding full map with " + << ceph_release_name(tmp.require_osd_release) + << " features " << features << dendl; // the features should be a subset of the mon quorum's features! 
assert((features & ~mon->get_quorum_con_features()) == 0); @@ -1546,8 +1549,13 @@ void OSDMonitor::share_map_with_random_osd() } dout(10) << "committed, telling random " << s->inst << " all about it" << dendl; + + // get feature of the peer + // use quorum_con_features, if it's an anonymous connection. + uint64_t features = s->con_features ? s->con_features : + mon->get_quorum_con_features(); // whatev, they'll request more if they need it - MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch()); + MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features); s->con->send_message(m); // NOTE: do *not* record osd has up to this epoch (as we do // elsewhere) as they may still need to request older values. @@ -1754,6 +1762,11 @@ bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op) { op->mark_osdmon_event(__func__); MMonGetOSDMap *m = static_cast<MMonGetOSDMap*>(op->get_req()); + + uint64_t features = mon->get_quorum_con_features(); + if (m->get_session() && m->get_session()->con_features) + features = m->get_session()->con_features; + dout(10) << __func__ << " " << *m << dendl; MOSDMap *reply = new MOSDMap(mon->monmap->fsid); epoch_t first = get_first_committed(); @@ -1762,13 +1775,13 @@ bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op) for (epoch_t e = MAX(first, m->get_full_first()); e <= MIN(last, m->get_full_last()) && max > 0; ++e, --max) { - int r = get_version_full(e, reply->maps[e]); + int r = get_version_full(e, features, reply->maps[e]); assert(r >= 0); } for (epoch_t e = MAX(first, m->get_inc_first()); e <= MIN(last, m->get_inc_last()) && max > 0; ++e, --max) { - int r = get_version(e, reply->incremental_maps[e]); + int r = get_version(e, features, reply->incremental_maps[e]); assert(r >= 0); } reply->oldest_map = first; @@ -3115,25 +3128,25 @@ void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start) } -MOSDMap *OSDMonitor::build_latest_full() +MOSDMap *OSDMonitor::build_latest_full(uint64_t features) { 
MOSDMap *r = new MOSDMap(mon->monmap->fsid); - get_version_full(osdmap.get_epoch(), r->maps[osdmap.get_epoch()]); + get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]); r->oldest_map = get_first_committed(); r->newest_map = osdmap.get_epoch(); return r; } -MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to) +MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features) { - dout(10) << "build_incremental [" << from << ".." << to << "]" << dendl; + dout(10) << "build_incremental [" << from << ".." << to << "] with features " << std::hex << features << dendl; MOSDMap *m = new MOSDMap(mon->monmap->fsid); m->oldest_map = get_first_committed(); m->newest_map = osdmap.get_epoch(); for (epoch_t e = to; e >= from && e > 0; e--) { bufferlist bl; - int err = get_version(e, bl); + int err = get_version(e, features, bl); if (err == 0) { assert(bl.length()); // if (get_version(e, bl) > 0) { @@ -3143,7 +3156,7 @@ MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to) } else { assert(err == -ENOENT); assert(!bl.length()); - get_version_full(e, bl); + get_version_full(e, features, bl); if (bl.length() > 0) { //else if (get_version("full", e, bl) > 0) { dout(20) << "build_incremental full " << e << " " @@ -3161,7 +3174,7 @@ void OSDMonitor::send_full(MonOpRequestRef op) { op->mark_osdmon_event(__func__); dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl; - mon->send_reply(op, build_latest_full()); + mon->send_reply(op, build_latest_full(op->get_session()->con_features)); } void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first) @@ -3194,6 +3207,11 @@ void OSDMonitor::send_incremental(epoch_t first, dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]" << " to " << session->inst << dendl; + // get feature of the peer + // use quorum_con_features, if it's an anonymous connection. + uint64_t features = session->con_features ? 
session->con_features : + mon->get_quorum_con_features(); + if (first <= session->osd_epoch) { dout(10) << __func__ << " " << session->inst << " should already have epoch " << session->osd_epoch << dendl; @@ -3203,7 +3221,7 @@ void OSDMonitor::send_incremental(epoch_t first, if (first < get_first_committed()) { first = get_first_committed(); bufferlist bl; - int err = get_version_full(first, bl); + int err = get_version_full(first, features, bl); assert(err == 0); assert(bl.length()); @@ -3227,9 +3245,9 @@ void OSDMonitor::send_incremental(epoch_t first, } while (first <= osdmap.get_epoch()) { - epoch_t last = MIN(first + g_conf->osd_map_message_max - 1, - osdmap.get_epoch()); - MOSDMap *m = build_incremental(first, last); + epoch_t last = std::min(first + g_conf->osd_map_message_max - 1, + osdmap.get_epoch()); + MOSDMap *m = build_incremental(first, last, features); if (req) { // send some maps. it may not be all of them, but it will get them @@ -3247,26 +3265,98 @@ void OSDMonitor::send_incremental(epoch_t first, int OSDMonitor::get_version(version_t ver, bufferlist& bl) { - if (inc_osd_cache.lookup(ver, &bl)) { - return 0; - } - int ret = PaxosService::get_version(ver, bl); - if (!ret) { - inc_osd_cache.add(ver, bl); - } + return get_version(ver, mon->get_quorum_con_features(), bl); +} + +void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features) +{ + OSDMap::Incremental inc; + bufferlist::iterator q = bl.begin(); + inc.decode(q); + // always encode with subset of osdmap's canonical features + uint64_t f = features & inc.encode_features; + dout(20) << __func__ << " " << inc.epoch << " with features " << f + << dendl; + bl.clear(); + if (inc.fullmap.length()) { + // embedded full map? 
+ OSDMap m; + m.decode(inc.fullmap); + inc.fullmap.clear(); + m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED); + } + if (inc.crush.length()) { + // embedded crush map + CrushWrapper c; + auto p = inc.crush.begin(); + c.decode(p); + inc.crush.clear(); + c.encode(inc.crush, f); + } + inc.encode(bl, f | CEPH_FEATURE_RESERVED); +} + +void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features) +{ + OSDMap m; + bufferlist::iterator q = bl.begin(); + m.decode(q); + // always encode with subset of osdmap's canonical features + uint64_t f = features & m.get_encoding_features(); + dout(20) << __func__ << " " << m.get_epoch() << " with features " << f + << dendl; + bl.clear(); + m.encode(bl, f | CEPH_FEATURE_RESERVED); +} + +int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl) +{ + uint64_t significant_features = OSDMap::get_significant_features(features); + if (inc_osd_cache.lookup({ver, significant_features}, &bl)) { + return 0; + } + int ret = PaxosService::get_version(ver, bl); + if (ret < 0) { return ret; + } + // NOTE: this check is imprecise; the OSDMap encoding features may + // be a subset of the latest mon quorum features, but worst case we + // reencode once and then cache the (identical) result under both + // feature masks. 
+ if (significant_features != + OSDMap::get_significant_features(mon->get_quorum_con_features())) { + reencode_incremental_map(bl, features); + } + inc_osd_cache.add({ver, significant_features}, bl); + return 0; } int OSDMonitor::get_version_full(version_t ver, bufferlist& bl) { - if (full_osd_cache.lookup(ver, &bl)) { - return 0; - } - int ret = PaxosService::get_version_full(ver, bl); - if (!ret) { - full_osd_cache.add(ver, bl); - } + return get_version_full(ver, mon->get_quorum_con_features(), bl); +} + +int OSDMonitor::get_version_full(version_t ver, uint64_t features, + bufferlist& bl) +{ + uint64_t significant_features = OSDMap::get_significant_features(features); + if (full_osd_cache.lookup({ver, significant_features}, &bl)) { + return 0; + } + int ret = PaxosService::get_version_full(ver, bl); + if (ret < 0) { return ret; + } + // NOTE: this check is imprecise; the OSDMap encoding features may + // be a subset of the latest mon quorum features, but worst case we + // reencode once and then cache the (identical) result under both + // feature masks. 
+ if (significant_features != + OSDMap::get_significant_features(mon->get_quorum_con_features())) { + reencode_full_map(bl, features); + } + full_osd_cache.add({ver, significant_features}, bl); + return 0; } epoch_t OSDMonitor::blacklist(const entity_addr_t& a, utime_t until) @@ -3303,7 +3393,7 @@ void OSDMonitor::check_osdmap_sub(Subscription *sub) if (sub->next >= 1) send_incremental(sub->next, sub->session, sub->incremental_onetime); else - sub->session->con->send_message(build_latest_full()); + sub->session->con->send_message(build_latest_full(sub->session->con_features)); if (sub->onetime) mon->session_map.remove_sub(sub); else diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index c3db2332fd31..964f56732930 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -42,6 +42,9 @@ class MOSDMap; #include "erasure-code/ErasureCodeInterface.h" #include "mon/MonOpRequest.h" +#include <boost/functional/hash.hpp> +// re-include our assert to clobber the system one; fix dout: +#include "include/assert.h" /// information about a particular peer's failure reports for one osd struct failure_reporter_t { @@ -140,8 +143,13 @@ public: map osd_weight; - SimpleLRU<version_t, bufferlist> inc_osd_cache; - SimpleLRU<version_t, bufferlist> full_osd_cache; + using osdmap_key_t = std::pair<version_t, uint64_t>; + using osdmap_cache_t = SimpleLRU<osdmap_key_t, + bufferlist, + std::less<osdmap_key_t>, + boost::hash<osdmap_key_t>>; + osdmap_cache_t inc_osd_cache; + osdmap_cache_t full_osd_cache; bool check_failures(utime_t now); bool check_failure(utime_t now, int target_osd, failure_info_t& fi); @@ -244,8 +252,8 @@ private: bool can_mark_in(int o); // ... 
- MOSDMap *build_latest_full(); - MOSDMap *build_incremental(epoch_t first, epoch_t last); + MOSDMap *build_latest_full(uint64_t features); + MOSDMap *build_incremental(epoch_t first, epoch_t last, uint64_t features); void send_full(MonOpRequestRef op); void send_incremental(MonOpRequestRef op, epoch_t first); public: @@ -428,6 +436,9 @@ private: int load_metadata(int osd, map& m, ostream *err); void count_metadata(const string& field, Formatter *f); + + void reencode_incremental_map(bufferlist& bl, uint64_t features); + void reencode_full_map(bufferlist& bl, uint64_t features); public: void count_metadata(const string& field, map *out); protected: @@ -534,6 +545,9 @@ public: } int get_version(version_t ver, bufferlist& bl) override; + int get_version(version_t ver, uint64_t feature, bufferlist& bl); + + int get_version_full(version_t ver, uint64_t feature, bufferlist& bl); int get_version_full(version_t ver, bufferlist& bl) override; epoch_t blacklist(const entity_addr_t& a, utime_t until);