From 6da520e0756179052cc4f125b50d73cffcbac001 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 11 Mar 2019 17:31:54 -0500 Subject: [PATCH] osd,mon: include more pg merge metadata in pg_pool_t The ones we need are source_version and target_version. Include it in a nice containing structure to keep things tidy. Signed-off-by: Sage Weil --- src/messages/MOSDPGReadyToMerge.h | 12 +++++- src/mon/OSDMonitor.cc | 7 +++- src/osd/OSD.cc | 38 +++++++++--------- src/osd/OSD.h | 8 ++-- src/osd/PG.cc | 18 ++++----- src/osd/PG.h | 3 +- src/osd/osd_types.cc | 32 +++++++-------- src/osd/osd_types.h | 67 ++++++++++++++++++++++++------- 8 files changed, 120 insertions(+), 65 deletions(-) diff --git a/src/messages/MOSDPGReadyToMerge.h b/src/messages/MOSDPGReadyToMerge.h index cbe22af39b0..b8c18095675 100644 --- a/src/messages/MOSDPGReadyToMerge.h +++ b/src/messages/MOSDPGReadyToMerge.h @@ -7,6 +7,7 @@ class MOSDPGReadyToMerge : public MessageInstance { public: pg_t pgid; + eversion_t source_version, target_version; epoch_t last_epoch_started = 0; epoch_t last_epoch_clean = 0; bool ready = true; @@ -14,9 +15,12 @@ public: MOSDPGReadyToMerge() : MessageInstance(MSG_OSD_PG_READY_TO_MERGE, 0) {} - MOSDPGReadyToMerge(pg_t p, epoch_t les, epoch_t lec, bool r, epoch_t v) + MOSDPGReadyToMerge(pg_t p, eversion_t sv, eversion_t tv, + epoch_t les, epoch_t lec, bool r, epoch_t v) : MessageInstance(MSG_OSD_PG_READY_TO_MERGE, v), pgid(p), + source_version(sv), + target_version(tv), last_epoch_started(les), last_epoch_clean(lec), ready(r) @@ -25,6 +29,8 @@ public: using ceph::encode; paxos_encode(); encode(pgid, payload); + encode(source_version, payload); + encode(target_version, payload); encode(last_epoch_started, payload); encode(last_epoch_clean, payload); encode(ready, payload); @@ -33,6 +39,8 @@ public: bufferlist::const_iterator p = payload.begin(); paxos_decode(p); decode(pgid, p); + decode(source_version, p); + decode(target_version, p); decode(last_epoch_started, p); decode(last_epoch_clean, p); decode(ready, p); @@ -41,6 +49,8 @@ public: void print(ostream &out) const { out << get_type_name() << "(" << pgid + << " sv " << source_version + << " tv " << target_version << " les/c " << last_epoch_started << "/" << last_epoch_clean << (ready ? " ready" : " NOT READY") << " v" << version << ")"; diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index efbc8c1257c..0929d904f78 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -3318,7 +3318,12 @@ bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op) } if (m->ready) { - p.dec_pg_num(m->last_epoch_started, m->last_epoch_clean); + p.dec_pg_num(m->pgid, + pending_inc.epoch, + m->source_version, + m->target_version, + m->last_epoch_started, + m->last_epoch_clean); p.last_change = pending_inc.epoch; } else { // back off the merge attempt! diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 3b5f4e80bff..2d7945b1e88 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1857,24 +1857,26 @@ bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num) // --- -void OSDService::set_ready_to_merge_source(PG *pg) +void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version) { std::lock_guard l(merge_lock); dout(10) << __func__ << " " << pg->pg_id << dendl; - ready_to_merge_source.insert(pg->pg_id.pgid); + ready_to_merge_source[pg->pg_id.pgid] = version; assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0); _send_ready_to_merge(); } void OSDService::set_ready_to_merge_target(PG *pg, + eversion_t version, epoch_t last_epoch_started, epoch_t last_epoch_clean) { std::lock_guard l(merge_lock); dout(10) << __func__ << " " << pg->pg_id << dendl; ready_to_merge_target.insert(make_pair(pg->pg_id.pgid, - make_pair(last_epoch_started, - last_epoch_clean))); + make_tuple(version, + last_epoch_started, + last_epoch_clean))); assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0); _send_ready_to_merge(); } @@ -1916,8 +1918,7 @@ void OSDService::_send_ready_to_merge() if (sent_ready_to_merge_source.count(src) == 0) { monc->send_mon_message(new MOSDPGReadyToMerge( src, - 0, - 0, + {}, {}, 0, 0, false, osdmap->get_epoch())); sent_ready_to_merge_source.insert(src); @@ -1927,28 +1928,29 @@ void OSDService::_send_ready_to_merge() if (sent_ready_to_merge_source.count(p.second) == 0) { monc->send_mon_message(new MOSDPGReadyToMerge( p.second, - 0, - 0, + {}, {}, 0, 0, false, osdmap->get_epoch())); sent_ready_to_merge_source.insert(p.second); } } for (auto src : ready_to_merge_source) { - if (not_ready_to_merge_source.count(src) || - not_ready_to_merge_target.count(src.get_parent())) { + if (not_ready_to_merge_source.count(src.first) || + not_ready_to_merge_target.count(src.first.get_parent())) { continue; } - auto p = ready_to_merge_target.find(src.get_parent()); + auto p = ready_to_merge_target.find(src.first.get_parent()); if (p != ready_to_merge_target.end() && - sent_ready_to_merge_source.count(src) == 0) { + sent_ready_to_merge_source.count(src.first) == 0) { monc->send_mon_message(new MOSDPGReadyToMerge( - src, - p->second.first, // PG's last_epoch_started - p->second.second, // PG's last_epoch_clean + src.first, // source pgid + src.second, // src version + std::get<0>(p->second), // target version + std::get<1>(p->second), // PG's last_epoch_started + std::get<2>(p->second), // PG's last_epoch_clean true, osdmap->get_epoch())); - sent_ready_to_merge_source.insert(src); + sent_ready_to_merge_source.insert(src.first); } } } @@ -8613,9 +8615,7 @@ bool OSD::advance_pg( pg->merge_from( sources, rctx, split_bits, nextmap->get_pg_pool( - pg->pg_id.pool())->get_pg_num_dec_last_epoch_started(), - nextmap->get_pg_pool( - pg->pg_id.pool())->get_pg_num_dec_last_epoch_clean()); + pg->pg_id.pool())->last_pg_merge_meta); pg->pg_slot->waiting_for_merge_epoch = 0; } else { dout(20) << __func__ << " not ready to merge yet" << dendl; diff --git a/src/osd/OSD.h b/src/osd/OSD.h index bf054d70112..25a5bcc7dbf 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -711,14 +711,16 @@ public: // -- pg merge -- Mutex merge_lock = {"OSD::merge_lock"}; - set ready_to_merge_source; - map> ready_to_merge_target; // pg -> (les,lec) + map ready_to_merge_source; // pg -> version + map> ready_to_merge_target; // pg -> (version,les,lec) set not_ready_to_merge_source; map not_ready_to_merge_target; set sent_ready_to_merge_source; - void set_ready_to_merge_source(PG *pg); + void set_ready_to_merge_source(PG *pg, + eversion_t version); void set_ready_to_merge_target(PG *pg, + eversion_t version, epoch_t last_epoch_started, epoch_t last_epoch_clean); void set_not_ready_to_merge_source(pg_t source); diff --git a/src/osd/PG.cc b/src/osd/PG.cc index f7aeabea198..0fce841bafd 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2378,11 +2378,12 @@ void PG::try_mark_clean() if (target) { ldout(cct, 10) << "ready to merge (target)" << dendl; osd->set_ready_to_merge_target(this, + info.last_update, info.history.last_epoch_started, info.history.last_epoch_clean); } else { ldout(cct, 10) << "ready to merge (source)" << dendl; - osd->set_ready_to_merge_source(this); + osd->set_ready_to_merge_source(this, info.last_update); } } } else { @@ -2706,8 +2707,7 @@ void PG::finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transac void PG::merge_from(map& sources, RecoveryCtx *rctx, unsigned split_bits, - epoch_t dec_last_epoch_started, - epoch_t dec_last_epoch_clean) + const pg_merge_meta_t& last_pg_merge_meta) { dout(10) << __func__ << " from " << sources << " split_bits " << split_bits << dendl; @@ -2792,15 +2792,15 @@ void PG::merge_from(map& sources, RecoveryCtx *rctx, // remapped in concert with each other... info.history = sources.begin()->second->info.history; - // we use the pg_num_dec_last_epoch_{started,clean} we got from + // we use the last_epoch_{started,clean} we got from // the caller, which are the epochs that were reported by the PGs were // found to be ready for merge. - info.history.last_epoch_clean = dec_last_epoch_clean; - info.history.last_epoch_started = dec_last_epoch_started; - info.last_epoch_started = dec_last_epoch_started; + info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean; + info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started; + info.last_epoch_started = last_pg_merge_meta.last_epoch_started; dout(10) << __func__ - << " set les/c to " << dec_last_epoch_started << "/" - << dec_last_epoch_clean + << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/" + << last_pg_merge_meta.last_epoch_clean << " from pool last_dec_*, source pg history was " << sources.begin()->second->info.history << dendl; diff --git a/src/osd/PG.h b/src/osd/PG.h index b1f3db00650..52fba8b5b0b 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -414,8 +414,7 @@ public: void split_into(pg_t child_pgid, PG *child, unsigned split_bits); void merge_from(map& sources, RecoveryCtx *rctx, unsigned split_bits, - epoch_t dec_last_epoch_started, - epoch_t dec_last_epoch_clean); + const pg_merge_meta_t& last_pg_merge_meta); void finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction *t); void scrub(epoch_t queued, ThreadPool::TPHandle &handle); diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index fce9cbee788..9aeda158a79 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -1337,10 +1337,7 @@ void pg_pool_t::dump(Formatter *f) const f->dump_unsigned("pg_placement_num_target", get_pgp_num_target()); f->dump_unsigned("pg_num_target", get_pg_num_target()); f->dump_unsigned("pg_num_pending", get_pg_num_pending()); - f->dump_unsigned("pg_num_dec_last_epoch_started", - get_pg_num_dec_last_epoch_started()); - f->dump_unsigned("pg_num_dec_last_epoch_clean", - get_pg_num_dec_last_epoch_clean()); + f->dump_object("last_pg_merge_meta", last_pg_merge_meta); f->dump_stream("last_change") << get_last_change(); f->dump_stream("last_force_op_resend") << get_last_force_op_resend(); f->dump_stream("last_force_op_resend_prenautilus") @@ -1747,7 +1744,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const return; } - uint8_t v = 28; + uint8_t v = 29; // NOTE: any new encoding dependencies must be reflected by // SIGNIFICANT_FEATURES if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) { @@ -1842,17 +1839,20 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const encode(pg_num_target, bl); encode(pgp_num_target, bl); encode(pg_num_pending, bl); - encode(pg_num_dec_last_epoch_started, bl); - encode(pg_num_dec_last_epoch_clean, bl); + encode((epoch_t)0, bl); // pg_num_dec_last_epoch_started from 14.1.[01] + encode((epoch_t)0, bl); // pg_num_dec_last_epoch_clean from 14.1.[01] encode(last_force_op_resend, bl); encode(pg_autoscale_mode, bl); } + if (v >= 29) { + encode(last_pg_merge_meta, bl); + } ENCODE_FINISH(bl); } void pg_pool_t::decode(bufferlist::const_iterator& bl) { - DECODE_START_LEGACY_COMPAT_LEN(28, 5, 5, bl); + DECODE_START_LEGACY_COMPAT_LEN(29, 5, 5, bl); decode(type, bl); decode(size, bl); decode(crush_rule, bl); @@ -2009,10 +2009,14 @@ void pg_pool_t::decode(bufferlist::const_iterator& bl) decode(pg_num_target, bl); decode(pgp_num_target, bl); decode(pg_num_pending, bl); - decode(pg_num_dec_last_epoch_started, bl); - decode(pg_num_dec_last_epoch_clean, bl); + epoch_t e; + decode(e, bl); + decode(e, bl); decode(last_force_op_resend, bl); decode(pg_autoscale_mode, bl); + if (struct_v >= 29) { + decode(last_pg_merge_meta, bl); + } } else { pg_num_target = pg_num; pgp_num_target = pgp_num; @@ -2040,8 +2044,8 @@ void pg_pool_t::generate_test_instances(list& o) a.pgp_num_target = 4; a.pg_num_target = 5; a.pg_num_pending = 5; - a.pg_num_dec_last_epoch_started = 2; - a.pg_num_dec_last_epoch_clean = 3; + a.last_pg_merge_meta.last_epoch_started = 2; + a.last_pg_merge_meta.last_epoch_clean = 2; a.last_change = 9; a.last_force_op_resend = 123823; a.last_force_op_resend_preluminous = 123824; @@ -2112,10 +2116,6 @@ ostream& operator<<(ostream& out, const pg_pool_t& p) } if (p.get_pg_num_pending() != p.get_pg_num()) { out << " pg_num_pending " << p.get_pg_num_pending(); - if (p.get_pg_num_dec_last_epoch_started() || - p.get_pg_num_dec_last_epoch_clean()) - out << " dles/c " << p.get_pg_num_dec_last_epoch_started() - << "/" << p.get_pg_num_dec_last_epoch_clean(); } if (p.pg_autoscale_mode) { out << " autoscale_mode " << p.get_pg_autoscale_mode_name(p.pg_autoscale_mode); diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index bfde9710e0b..2edd539507e 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -1080,6 +1080,45 @@ private: }; WRITE_CLASS_ENCODER_FEATURES(pool_opts_t) +struct pg_merge_meta_t { + pg_t source_pgid; + epoch_t ready_epoch = 0; + epoch_t last_epoch_started = 0; + epoch_t last_epoch_clean = 0; + eversion_t source_version; + eversion_t target_version; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(source_pgid, bl); + encode(ready_epoch, bl); + encode(last_epoch_started, bl); + encode(last_epoch_clean, bl); + encode(source_version, bl); + encode(target_version, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& p) { + DECODE_START(1, p); + decode(source_pgid, p); + decode(ready_epoch, p); + decode(last_epoch_started, p); + decode(last_epoch_clean, p); + decode(source_version, p); + decode(target_version, p); + DECODE_FINISH(p); + } + void dump(Formatter *f) const { + f->dump_stream("source_pgid") << source_pgid; + f->dump_unsigned("ready_epoch", ready_epoch); + f->dump_unsigned("last_epoch_started", last_epoch_started); + f->dump_unsigned("last_epoch_clean", last_epoch_clean); + f->dump_stream("source_version") << source_version; + f->dump_stream("target_version") << target_version; + } +}; +WRITE_CLASS_ENCODER(pg_merge_meta_t) + /* * pg_pool */ @@ -1306,10 +1345,9 @@ public: /// last epoch that forced clients to resend (pre-luminous clients only) epoch_t last_force_op_resend_preluminous = 0; - /// last_epoch_started preceding pg_num decrement request - epoch_t pg_num_dec_last_epoch_started = 0; - /// last_epoch_clean preceding pg_num decrement request - epoch_t pg_num_dec_last_epoch_clean = 0; + /// metadata for the most recent PG merge + pg_merge_meta_t last_pg_merge_meta; + snapid_t snap_seq; ///< seq for per-pool snapshot epoch_t snap_epoch; ///< osdmap epoch of last snap uint64_t auid; ///< who owns the pg @@ -1565,13 +1603,6 @@ public: // pool size that it represents. unsigned get_pg_num_divisor(pg_t pgid) const; - epoch_t get_pg_num_dec_last_epoch_started() const { - return pg_num_dec_last_epoch_started; - } - epoch_t get_pg_num_dec_last_epoch_clean() const { - return pg_num_dec_last_epoch_clean; - } - bool is_pending_merge(pg_t pgid, bool *target) const; void set_pg_num(int p) { @@ -1593,11 +1624,19 @@ public: void set_pgp_num_target(int p) { pgp_num_target = p; } - void dec_pg_num(epoch_t last_epoch_started, + void dec_pg_num(pg_t source_pgid, + epoch_t ready_epoch, + eversion_t source_version, + eversion_t target_version, + epoch_t last_epoch_started, epoch_t last_epoch_clean) { --pg_num; - pg_num_dec_last_epoch_started = last_epoch_started; - pg_num_dec_last_epoch_clean = last_epoch_clean; + last_pg_merge_meta.source_pgid = source_pgid; + last_pg_merge_meta.ready_epoch = ready_epoch; + last_pg_merge_meta.source_version = source_version; + last_pg_merge_meta.target_version = target_version; + last_pg_merge_meta.last_epoch_started = last_epoch_started; + last_pg_merge_meta.last_epoch_clean = last_epoch_clean; calc_pg_masks(); } -- 2.39.5