From 450f337d6fd048c8c95a0ec0dec0d97f5474922e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 12 Oct 2018 16:10:45 -0500 Subject: [PATCH] osd,mon: keep last_epoch_started along with last_epoch_clean premerge We need to populate both the last_epoch_clean and last_epoch_started accurately when fabricating a pg merge target. In the case where the premerge PG peers, recovers (e.g., backfill), then becomes ready to merge, the last_epoch_clean may be > the last_epoch_started, and using the later last_epoch_clean will prevent the PG from subsequently peering (it'll go to incomplete state instead). Fix by keeping both values, separately. Signed-off-by: Sage Weil --- src/messages/MOSDPGReadyToMerge.h | 9 +++++++-- src/mon/OSDMonitor.cc | 2 +- src/osd/OSD.cc | 15 ++++++++++++--- src/osd/OSD.h | 6 ++++-- src/osd/PG.cc | 14 +++++++++----- src/osd/PG.h | 1 + src/osd/osd_types.cc | 13 ++++++++++--- src/osd/osd_types.h | 9 ++++++++- 8 files changed, 52 insertions(+), 17 deletions(-) diff --git a/src/messages/MOSDPGReadyToMerge.h b/src/messages/MOSDPGReadyToMerge.h index c07c58f4b3ba1..937eaa6723a5d 100644 --- a/src/messages/MOSDPGReadyToMerge.h +++ b/src/messages/MOSDPGReadyToMerge.h @@ -7,15 +7,17 @@ class MOSDPGReadyToMerge : public MessageInstance { public: pg_t pgid; + epoch_t last_epoch_started = 0; epoch_t last_epoch_clean = 0; bool ready = true; MOSDPGReadyToMerge() : MessageInstance(MSG_OSD_PG_READY_TO_MERGE, 0) {} - MOSDPGReadyToMerge(pg_t p, epoch_t lec, bool r, epoch_t v) + MOSDPGReadyToMerge(pg_t p, epoch_t les, epoch_t lec, bool r, epoch_t v) : MessageInstance(MSG_OSD_PG_READY_TO_MERGE, v), pgid(p), + last_epoch_started(les), last_epoch_clean(lec), ready(r) {} @@ -23,6 +25,7 @@ public: using ceph::encode; paxos_encode(); encode(pgid, payload); + encode(last_epoch_started, payload); encode(last_epoch_clean, payload); encode(ready, payload); } @@ -30,13 +33,15 @@ public: bufferlist::const_iterator p = payload.begin(); paxos_decode(p); decode(pgid, p); + decode(last_epoch_started, p); decode(last_epoch_clean, p); decode(ready, p); } const char *get_type_name() const override { return "osd_pg_ready_to_merge"; } void print(ostream &out) const { out << get_type_name() - << "(" << pgid << " lec " << last_epoch_clean + << "(" << pgid + << " les/c " << last_epoch_started << "/" << last_epoch_clean << (ready ? " ready" : " NOT READY") << " v" << version << ")"; } diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index baf1dcaf38200..d5f413ee317fa 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -3296,7 +3296,7 @@ bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op) } if (m->ready) { - p.dec_pg_num(m->last_epoch_clean); + p.dec_pg_num(m->last_epoch_started, m->last_epoch_clean); p.last_change = pending_inc.epoch; } else { // back off the merge attempt! diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 1bc9acff69284..6be8c5e938589 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1703,11 +1703,15 @@ void OSDService::set_ready_to_merge_source(PG *pg) _send_ready_to_merge(); } -void OSDService::set_ready_to_merge_target(PG *pg, epoch_t last_epoch_clean) +void OSDService::set_ready_to_merge_target(PG *pg, + epoch_t last_epoch_started, + epoch_t last_epoch_clean) { Mutex::Locker l(merge_lock); dout(10) << __func__ << " " << pg->pg_id << dendl; - ready_to_merge_target.insert(make_pair(pg->pg_id.pgid, last_epoch_clean)); + ready_to_merge_target.insert(make_pair(pg->pg_id.pgid, + make_pair(last_epoch_started, + last_epoch_clean))); assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0); _send_ready_to_merge(); } @@ -1750,6 +1754,7 @@ void OSDService::_send_ready_to_merge() monc->send_mon_message(new MOSDPGReadyToMerge( src, 0, + 0, false, osdmap->get_epoch())); sent_ready_to_merge_source.insert(src); @@ -1760,6 +1765,7 @@ void OSDService::_send_ready_to_merge() monc->send_mon_message(new MOSDPGReadyToMerge( p.second, 0, + 0, false, osdmap->get_epoch())); sent_ready_to_merge_source.insert(p.second); @@ -1775,7 +1781,8 @@ void OSDService::_send_ready_to_merge() sent_ready_to_merge_source.count(src) == 0) { monc->send_mon_message(new MOSDPGReadyToMerge( src, - p->second, // PG's last_epoch_clean + p->second.first, // PG's last_epoch_started + p->second.second, // PG's last_epoch_clean true, osdmap->get_epoch())); sent_ready_to_merge_source.insert(src); @@ -8207,6 +8214,8 @@ bool OSD::advance_pg( dout(1) << __func__ << " merging " << pg->pg_id << dendl; pg->merge_from( sources, rctx, split_bits, + nextmap->get_pg_pool( + pg->pg_id.pool())->get_pg_num_dec_last_epoch_started(), nextmap->get_pg_pool( pg->pg_id.pool())->get_pg_num_dec_last_epoch_clean()); pg->pg_slot->waiting_for_merge_epoch = 0; diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 9dfa3411c5b51..067f3b65b6ef2 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -717,13 +717,15 @@ public: // -- pg merge -- Mutex merge_lock = {"OSD::merge_lock"}; set ready_to_merge_source; - map ready_to_merge_target; // pg -> last_epoch_clean + map> ready_to_merge_target; // pg -> (les,lec) set not_ready_to_merge_source; map not_ready_to_merge_target; set sent_ready_to_merge_source; void set_ready_to_merge_source(PG *pg); - void set_ready_to_merge_target(PG *pg, epoch_t last_epoch_clean); + void set_ready_to_merge_target(PG *pg, + epoch_t last_epoch_started, + epoch_t last_epoch_clean); void set_not_ready_to_merge_source(pg_t source); void set_not_ready_to_merge_target(pg_t target, pg_t source); void clear_ready_to_merge(PG *pg); diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 3202abead4cb3..90b579a0b9f37 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2358,7 +2358,9 @@ void PG::try_mark_clean() if (pool.info.is_pending_merge(info.pgid.pgid, &target)) { if (target) { ldout(cct, 10) << "ready to merge (target)" << dendl; - osd->set_ready_to_merge_target(this, info.history.last_epoch_clean); + osd->set_ready_to_merge_target(this, + info.history.last_epoch_started, + info.history.last_epoch_clean); } else { ldout(cct, 10) << "ready to merge (source)" << dendl; osd->set_ready_to_merge_source(this); @@ -2677,6 +2679,7 @@ void PG::finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transac void PG::merge_from(map& sources, RecoveryCtx *rctx, unsigned split_bits, + epoch_t dec_last_epoch_started, epoch_t dec_last_epoch_clean) { dout(10) << __func__ << " from " << sources << " split_bits " << split_bits @@ -2769,11 +2772,12 @@ void PG::merge_from(map& sources, RecoveryCtx *rctx, // use last_epoch_clean value for last_epoch_started, though--we must be // conservative here to avoid breaking peering, calc_acting, etc. - info.history.last_epoch_started = dec_last_epoch_clean; - info.last_epoch_started = dec_last_epoch_clean; + info.history.last_epoch_started = dec_last_epoch_started; + info.last_epoch_started = dec_last_epoch_started; dout(10) << __func__ - << " set last_epoch_{started,clean} to " << dec_last_epoch_clean - << " from pg_num_dec_last_epoch_clean, source pg history was " + << " set les/c to " << dec_last_epoch_started << "/" + << dec_last_epoch_clean + << " from pool last_dec_*, source pg history was " << sources.begin()->second->info.history << dendl; } diff --git a/src/osd/PG.h b/src/osd/PG.h index 3aa6b180ea503..eacaa452c2ac9 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -400,6 +400,7 @@ public: void split_into(pg_t child_pgid, PG *child, unsigned split_bits); void merge_from(map& sources, RecoveryCtx *rctx, unsigned split_bits, + epoch_t dec_last_epoch_started, epoch_t dec_last_epoch_clean); void finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction *t); diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 443bb21e05504..79b0ab8482b7d 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -1231,6 +1231,8 @@ void pg_pool_t::dump(Formatter *f) const f->dump_unsigned("pg_placement_num_target", get_pgp_num_target()); f->dump_unsigned("pg_num_target", get_pg_num_target()); f->dump_unsigned("pg_num_pending", get_pg_num_pending()); + f->dump_unsigned("pg_num_dec_last_epoch_started", + get_pg_num_dec_last_epoch_started()); f->dump_unsigned("pg_num_dec_last_epoch_clean", get_pg_num_dec_last_epoch_clean()); f->dump_stream("last_change") << get_last_change(); @@ -1734,6 +1736,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const encode(pg_num_target, bl); encode(pgp_num_target, bl); encode(pg_num_pending, bl); + encode(pg_num_dec_last_epoch_started, bl); encode(pg_num_dec_last_epoch_clean, bl); encode(last_force_op_resend, bl); } @@ -1899,6 +1902,7 @@ void pg_pool_t::decode(bufferlist::const_iterator& bl) decode(pg_num_target, bl); decode(pgp_num_target, bl); decode(pg_num_pending, bl); + decode(pg_num_dec_last_epoch_started, bl); decode(pg_num_dec_last_epoch_clean, bl); decode(last_force_op_resend, bl); } else { @@ -1927,7 +1931,8 @@ void pg_pool_t::generate_test_instances(list& o) a.pgp_num_target = 4; a.pg_num_target = 5; a.pg_num_pending = 5; - a.pg_num_dec_last_epoch_clean = 2; + a.pg_num_dec_last_epoch_started = 2; + a.pg_num_dec_last_epoch_clean = 3; a.last_change = 9; a.last_force_op_resend = 123823; a.last_force_op_resend_preluminous = 123824; @@ -1998,8 +2003,10 @@ ostream& operator<<(ostream& out, const pg_pool_t& p) } if (p.get_pg_num_pending() != p.get_pg_num()) { out << " pg_num_pending " << p.get_pg_num_pending(); - if (p.get_pg_num_dec_last_epoch_clean()) - out << " dlec " << p.get_pg_num_dec_last_epoch_clean(); + if (p.get_pg_num_dec_last_epoch_started() || + p.get_pg_num_dec_last_epoch_clean()) + out << " dles/c " << p.get_pg_num_dec_last_epoch_started() + << "/" << p.get_pg_num_dec_last_epoch_clean(); } out << " last_change " << p.get_last_change(); if (p.get_last_force_op_resend() || diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index dd2c79aae0d14..c87e3bccf46c5 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -1361,6 +1361,8 @@ public: /// last epoch that forced clients to resend (pre-luminous clients only) epoch_t last_force_op_resend_preluminous = 0; + ///< last_epoch_started preceding pg_num decrement request + epoch_t pg_num_dec_last_epoch_started = 0; ///< last_epoch_clean preceding pg_num decrement request epoch_t pg_num_dec_last_epoch_clean = 0; snapid_t snap_seq; ///< seq for per-pool snapshot @@ -1618,6 +1620,9 @@ public: // pool size that it represents. unsigned get_pg_num_divisor(pg_t pgid) const; + epoch_t get_pg_num_dec_last_epoch_started() const { + return pg_num_dec_last_epoch_started; + } epoch_t get_pg_num_dec_last_epoch_clean() const { return pg_num_dec_last_epoch_clean; } @@ -1643,8 +1648,10 @@ public: void set_pgp_num_target(int p) { pgp_num_target = p; } - void dec_pg_num(epoch_t last_epoch_clean) { + void dec_pg_num(epoch_t last_epoch_started, + epoch_t last_epoch_clean) { --pg_num; + pg_num_dec_last_epoch_started = last_epoch_started; pg_num_dec_last_epoch_clean = last_epoch_clean; calc_pg_masks(); } -- 2.39.5