From: Sage Weil Date: Fri, 19 Apr 2019 21:33:04 +0000 (-0500) Subject: mon/OSDMonitor: track history and past_intervals for creating pgs X-Git-Tag: v15.1.0~2850^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=85d9dc6b107fa813bb56860e8b4c6c51e267e1f0;p=ceph.git mon/OSDMonitor: track history and past_intervals for creating pgs PG create messages from mons are the last case where the OSD may have to scan an unbounded number of old maps in order to construct a valid pg_history_t and PastIntervals. Try to avoid making that a difficult case by maintaining those structures on the monitor. It is still possible that the mon may send a pg create message to the OSD and it sits in a message queue for a very long time, but this would be a very difficult situation to get into, and is no different from inter-OSD messages that include history and past_intervals. Signed-off-by: Sage Weil --- diff --git a/src/messages/MOSDPGCreate2.h b/src/messages/MOSDPGCreate2.h index e2c6800e3b7f..a1413727c969 100644 --- a/src/messages/MOSDPGCreate2.h +++ b/src/messages/MOSDPGCreate2.h @@ -12,11 +12,12 @@ class MOSDPGCreate2 : public Message { public: - static constexpr int HEAD_VERSION = 1; + static constexpr int HEAD_VERSION = 2; static constexpr int COMPAT_VERSION = 1; epoch_t epoch = 0; map> pgs; + map> pg_extra; MOSDPGCreate2() : Message{MSG_OSD_PG_CREATE2, HEAD_VERSION, COMPAT_VERSION} {} @@ -38,12 +39,16 @@ public: using ceph::encode; encode(epoch, payload); encode(pgs, payload); + encode(pg_extra, payload); } void decode_payload() override { auto p = payload.cbegin(); using ceph::decode; decode(epoch, p); decode(pgs, p); + if (header.version >= 2) { + decode(pg_extra, p); + } } private: template diff --git a/src/mon/CreatingPGs.h b/src/mon/CreatingPGs.h index 396ccf1cac53..f46143c1d5ee 100644 --- a/src/mon/CreatingPGs.h +++ b/src/mon/CreatingPGs.h @@ -14,10 +14,88 @@ struct creating_pgs_t { epoch_t last_scan_epoch = 0; + struct pg_create_info { + epoch_t create_epoch; + utime_t create_stamp; + + // NOTE: pre-octopus instances of this class will have a + // zeroed-out history + vector up; + int up_primary = -1; + vector acting; + int acting_primary = -1; + pg_history_t history; + PastIntervals past_intervals; + + void encode(bufferlist& bl, uint64_t features) const { + using ceph::encode; + if (!HAVE_FEATURE(features, SERVER_OCTOPUS)) { + // was pair prior to octopus + encode(create_epoch, bl); + encode(create_stamp, bl); + return; + } + ENCODE_START(1, 1, bl); + encode(create_epoch, bl); + encode(create_stamp, bl); + encode(up, bl); + encode(up_primary, bl); + encode(acting, bl); + encode(acting_primary, bl); + encode(history, bl); + encode(past_intervals, bl); + ENCODE_FINISH(bl); + } + void decode_legacy(bufferlist::const_iterator& p) { + using ceph::decode; + decode(create_epoch, p); + decode(create_stamp, p); + } + void decode(bufferlist::const_iterator& p) { + using ceph::decode; + DECODE_START(1, p); + decode(create_epoch, p); + decode(create_stamp, p); + decode(up, p); + decode(up_primary, p); + decode(acting, p); + decode(acting_primary, p); + decode(history, p); + decode(past_intervals, p); + DECODE_FINISH(p); + } + void dump(Formatter *f) const { + f->dump_unsigned("create_epoch", create_epoch); + f->dump_stream("create_stamp") << create_stamp; + f->open_array_section("up"); + for (auto& i : up) { + f->dump_unsigned("osd", i); + } + f->close_section(); + f->dump_int("up_primary", up_primary); + f->open_array_section("acting"); + for (auto& i : acting) { + f->dump_unsigned("osd", i); + } + f->close_section(); + f->dump_int("acting_primary", up_primary); + f->dump_object("pg_history", history); + f->dump_object("past_intervals", past_intervals); + } + + pg_create_info() {} + pg_create_info(epoch_t e, utime_t t) + : create_epoch(e), + create_stamp(t) { + // NOTE: we don't initialize the other fields here; see + // OSDMonitor::update_pending_pgs() + } + }; + /// pgs we are currently creating - std::map > pgs; + std::map pgs; - struct create_info { + struct pool_create_info { epoch_t created; utime_t modified; uint64_t start = 0; @@ -42,7 +120,7 @@ struct creating_pgs_t { }; /// queue of pgs we still need to create (poolid -> ) - map queue; + map queue; /// pools that exist in the osdmap for which at least one pg has been created std::set created_pools; @@ -76,18 +154,34 @@ struct creating_pgs_t { queue.erase(removed_pool); return total - pgs.size(); } - void encode(bufferlist& bl) const { - ENCODE_START(2, 1, bl); + void encode(bufferlist& bl, uint64_t features) const { + unsigned v = 3; + if (!HAVE_FEATURE(features, SERVER_OCTOPUS)) { + v = 2; + } + ENCODE_START(v, 1, bl); encode(last_scan_epoch, bl); - encode(pgs, bl); + encode(pgs, bl, features); encode(created_pools, bl); encode(queue, bl); ENCODE_FINISH(bl); } void decode(bufferlist::const_iterator& bl) { - DECODE_START(2, bl); + DECODE_START(3, bl); decode(last_scan_epoch, bl); - decode(pgs, bl); + if (struct_v >= 3) { + decode(pgs, bl); + } else { + // legacy pg encoding + pgs.clear(); + uint32_t num; + decode(num, bl); + while (num--) { + pg_t pgid; + decode(pgid, bl); + pgs[pgid].decode_legacy(bl); + } + } decode(created_pools, bl); if (struct_v >= 2) decode(queue, bl); @@ -99,8 +193,7 @@ struct creating_pgs_t { for (auto& pg : pgs) { f->open_object_section("pg"); f->dump_stream("pgid") << pg.first; - f->dump_unsigned("epoch", pg.second.first); - f->dump_stream("ctime") << pg.second.second; + f->dump_object("pg_create_info", pg.second); f->close_section(); } f->close_section(); @@ -124,16 +217,17 @@ struct creating_pgs_t { static void generate_test_instances(list& o) { auto c = new creating_pgs_t; c->last_scan_epoch = 17; - c->pgs.emplace(pg_t{42, 2}, make_pair(31, utime_t{891, 113})); - c->pgs.emplace(pg_t{44, 2}, make_pair(31, utime_t{891, 113})); + c->pgs.emplace(pg_t{42, 2}, pg_create_info(31, utime_t{891, 113})); + c->pgs.emplace(pg_t{44, 2}, pg_create_info(31, utime_t{891, 113})); c->created_pools = {0, 1}; o.push_back(c); c = new creating_pgs_t; c->last_scan_epoch = 18; - c->pgs.emplace(pg_t{42, 3}, make_pair(31, utime_t{891, 113})); + c->pgs.emplace(pg_t{42, 3}, pg_create_info(31, utime_t{891, 113})); c->created_pools = {}; o.push_back(c); } }; -WRITE_CLASS_ENCODER(creating_pgs_t::create_info); -WRITE_CLASS_ENCODER(creating_pgs_t); +WRITE_CLASS_ENCODER_FEATURES(creating_pgs_t::pg_create_info) +WRITE_CLASS_ENCODER(creating_pgs_t::pool_create_info) +WRITE_CLASS_ENCODER_FEATURES(creating_pgs_t) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 08bffcca6403..74c1b66ec32b 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -843,8 +843,10 @@ OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc, const pg_t pgid{ps, static_cast(poolid)}; // NOTE: use the *current* epoch as the PG creation epoch so that the // OSD does not have to generate a long set of PastIntervals. - pending_creatings.pgs.emplace(pgid, make_pair(inc.epoch, - p->second.modified)); + pending_creatings.pgs.emplace( + pgid, + creating_pgs_t::pg_create_info(inc.epoch, + p->second.modified)); dout(10) << __func__ << " adding " << pgid << dendl; } p->second.start = end; @@ -859,6 +861,89 @@ OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc, } dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size() << " pools" << dendl; + + if (mon->monmap->min_mon_release >= CEPH_RELEASE_OCTOPUS) { + // walk creating pgs' history and past_intervals forward + for (auto& i : pending_creatings.pgs) { + // this mirrors PG::start_peering_interval() + pg_t pgid = i.first; + + // this is a bit imprecise, but sufficient? + struct min_size_predicate_t : public IsPGRecoverablePredicate { + const pg_pool_t *pi; + bool operator()(const set &have) const { + return have.size() >= pi->min_size; + } + explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {} + } min_size_predicate(nextmap.get_pg_pool(pgid.pool())); + + vector up, acting; + int up_primary, acting_primary; + nextmap.pg_to_up_acting_osds( + pgid, &up, &up_primary, &acting, &acting_primary); + if (i.second.history.epoch_created == 0) { + // new pg entry, set it up + i.second.up = up; + i.second.acting = acting; + i.second.up_primary = up_primary; + i.second.acting_primary = acting_primary; + i.second.history = pg_history_t(i.second.create_epoch, + i.second.create_stamp); + dout(10) << __func__ << " pg " << pgid << " just added, " + << " up " << i.second.up + << " p " << i.second.up_primary + << " acting " << i.second.acting + << " p " << i.second.acting_primary + << " history " << i.second.history + << " past_intervals " << i.second.past_intervals + << dendl; + } else { + std::stringstream debug; + if (PastIntervals::check_new_interval( + i.second.acting_primary, acting_primary, + i.second.acting, acting, + i.second.up_primary, up_primary, + i.second.up, up, + i.second.history.same_interval_since, + i.second.history.last_epoch_clean, + &nextmap, + &osdmap, + pgid, + &min_size_predicate, + &i.second.past_intervals, + &debug)) { + epoch_t e = inc.epoch; + i.second.history.same_interval_since = e; + if (i.second.up != up) { + i.second.history.same_up_since = e; + } + if (i.second.acting_primary != acting_primary) { + i.second.history.same_primary_since = e; + } + if (pgid.is_split( + osdmap.get_pg_num(pgid.pool()), + nextmap.get_pg_num(pgid.pool()), + nullptr)) { + i.second.history.last_epoch_split = e; + } + dout(10) << __func__ << " pg " << pgid << " new interval," + << " up " << i.second.up << " -> " << up + << " p " << i.second.up_primary << " -> " << up_primary + << " acting " << i.second.acting << " -> " << acting + << " p " << i.second.acting_primary << " -> " + << acting_primary + << " history " << i.second.history + << " past_intervals " << i.second.past_intervals + << dendl; + dout(20) << " debug: " << debug.str() << dendl; + i.second.up = up; + i.second.acting = acting; + i.second.up_primary = up_primary; + i.second.acting_primary = acting_primary; + } + } + } + } dout(10) << __func__ << " " << (pending_creatings.pgs.size() - total) << "/" << pending_creatings.pgs.size() @@ -1102,7 +1187,13 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t) // process the pool flag removal below in the same osdmap epoch. auto pending_creatings = update_pending_pgs(pending_inc, tmp); bufferlist creatings_bl; - encode(pending_creatings, creatings_bl); + uint64_t features = CEPH_FEATURES_ALL; + if (mon->monmap->min_mon_release < CEPH_RELEASE_OCTOPUS) { + dout(20) << __func__ << " encoding pending pgs without octopus features" + << dendl; + features &= ~CEPH_FEATURE_SERVER_OCTOPUS; + } + encode(pending_creatings, creatings_bl, features); t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl); // remove any old (or incompat) POOL_CREATING flags @@ -4240,7 +4331,7 @@ void OSDMonitor::update_creating_pgs() << dendl; continue; } - auto mapped = pg.second.first; + auto mapped = pg.second.create_epoch; dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl; spg_t spgid(pgid); mapping.get_primary_and_shard(pgid, &acting_primary, &spgid); @@ -4313,16 +4404,23 @@ epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) cons oldm = new MOSDPGCreate(creating_pgs_epoch); } oldm->mkpg.emplace(pg.pgid, - pg_create_t{create->second.first, pg.pgid, 0}); - oldm->ctimes.emplace(pg.pgid, create->second.second); + pg_create_t{create->second.create_epoch, pg.pgid, 0}); + oldm->ctimes.emplace(pg.pgid, create->second.create_stamp); } else { if (!m) { m = new MOSDPGCreate2(creating_pgs_epoch); } - m->pgs.emplace(pg, create->second); + m->pgs.emplace(pg, make_pair(create->second.create_epoch, + create->second.create_stamp)); + if (create->second.history.epoch_created) { + dout(20) << __func__ << " " << pg << " " << create->second.history + << " " << create->second.past_intervals << dendl; + m->pg_extra.emplace(pg, make_pair(create->second.history, + create->second.past_intervals)); + } } dout(20) << __func__ << " will create " << pg - << " at " << create->second.first << dendl; + << " at " << create->second.create_epoch << dendl; } } if (m) { @@ -12597,9 +12695,10 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, bool creating_now; { std::lock_guard l(creating_pgs_lock); - auto emplaced = creating_pgs.pgs.emplace(pgid, - make_pair(osdmap.get_epoch(), - ceph_clock_now())); + auto emplaced = creating_pgs.pgs.emplace( + pgid, + creating_pgs_t::pg_create_info(osdmap.get_epoch(), + ceph_clock_now())); creating_now = emplaced.second; } if (creating_now) { diff --git a/src/tools/ceph-dencoder/types.h b/src/tools/ceph-dencoder/types.h index 4f3206d4539c..d6707394b58b 100644 --- a/src/tools/ceph-dencoder/types.h +++ b/src/tools/ceph-dencoder/types.h @@ -188,7 +188,7 @@ TYPE(LevelDBStoreStats) TYPE(ScrubResult) #include "mon/CreatingPGs.h" -TYPE(creating_pgs_t) +TYPE_FEATUREFUL(creating_pgs_t) #include "mgr/ServiceMap.h" TYPE_FEATUREFUL(ServiceMap) diff --git a/src/tools/ceph_monstore_tool.cc b/src/tools/ceph_monstore_tool.cc index 80c3aa7538f0..030f8b437a90 100644 --- a/src/tools/ceph_monstore_tool.cc +++ b/src/tools/ceph_monstore_tool.cc @@ -572,7 +572,7 @@ static int update_creating_pgs(MonitorDBStore& st) creating.last_scan_epoch = last_osdmap_epoch; bufferlist newbl; - ::encode(creating, newbl); + encode(creating, newbl, CEPH_FEATURES_ALL); auto t = make_shared(); t->put("osd_pg_creating", "creating", newbl);