From 9c5cb4ea620da4ef14345ebc8018f9ed008c4345 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Tue, 10 Apr 2018 00:56:05 +0800 Subject: [PATCH] osd: resend osd_pgtemp if it's not acked If the osd_pgtemp message is dropped before the monitor receives it, we need to resend it. Otherwise a PG could be stuck in the activating state if its creation was withheld by the max-pg-per-osd limit on the replica and the replica OSD then removes some existing PGs. Fixes: http://tracker.ceph.com/issues/23610 Signed-off-by: Kefu Chai --- src/osd/OSD.cc | 52 +++++++++++++++++++++++++++++++++----------------- src/osd/OSD.h | 12 +++++++++--- src/osd/PG.cc | 5 ++++- 3 files changed, 48 insertions(+), 21 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index f86b3c25f60eb..546ad2d95473d 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -918,13 +918,16 @@ pair OSDService::get_con_osd_hb(int peer, epoch_t f } -void OSDService::queue_want_pg_temp(pg_t pgid, const vector& want) +void OSDService::queue_want_pg_temp(pg_t pgid, + const vector& want, + bool forced) { Mutex::Locker l(pg_temp_lock); - map >::iterator p = pg_temp_pending.find(pgid); + auto p = pg_temp_pending.find(pgid); if (p == pg_temp_pending.end() || - p->second != want) { - pg_temp_wanted[pgid] = want; + p->second.acting != want || + forced) { + pg_temp_wanted[pgid] = {want, forced}; } } @@ -955,15 +958,36 @@ void OSDService::requeue_pg_temp() << pg_temp_wanted.size() << dendl; } +std::ostream& operator<<(std::ostream& out, + const OSDService::pg_temp_t& pg_temp) +{ + out << pg_temp.acting; + if (pg_temp.forced) { + out << " (forced)"; + } + return out; +} + void OSDService::send_pg_temp() { Mutex::Locker l(pg_temp_lock); if (pg_temp_wanted.empty()) return; dout(10) << "send_pg_temp " << pg_temp_wanted << dendl; - MOSDPGTemp *m = new MOSDPGTemp(osdmap->get_epoch()); - m->pg_temp = pg_temp_wanted; - monc->send_mon_message(m); + MOSDPGTemp *ms[2] = {nullptr, nullptr}; + for (auto& [pgid, pg_temp] : pg_temp_wanted) { + auto& m = 
ms[pg_temp.forced]; + if (!m) { + m = new MOSDPGTemp(osdmap->get_epoch()); + m->forced = pg_temp.forced; + } + m->pg_temp.emplace(pgid, pg_temp.acting); + } + for (auto m : ms) { + if (m) { + monc->send_mon_message(m); + } + } _sent_pg_temp(); } @@ -4018,7 +4042,6 @@ void OSD::resume_creating_pg() { bool do_sub_pg_creates = false; bool have_pending_creates = false; - MOSDPGTemp *pgtemp = nullptr; { const auto max_pgs_per_osd = (cct->_conf->get_val("mon_max_pg_per_osd") * @@ -4043,13 +4066,11 @@ void OSD::resume_creating_pg() auto pg = pending_creates_from_osd.cbegin(); while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) { dout(20) << __func__ << " pg " << pg->first << dendl; - if (!pgtemp) { - pgtemp = new MOSDPGTemp{osdmap->get_epoch()}; - } vector acting; osdmap->pg_to_up_acting_osds(pg->first, nullptr, nullptr, &acting, nullptr); - pgtemp->pg_temp[pg->first] = twiddle(acting); + service.queue_want_pg_temp(pg->first, twiddle(acting), true); pg = pending_creates_from_osd.erase(pg); + do_sub_pg_creates = true; spare_pgs--; } have_pending_creates = (pending_creates_from_mon > 0 || @@ -4072,7 +4093,7 @@ void OSD::resume_creating_pg() << start << dendl; do_renew_subs = true; } - } else if (pgtemp || do_sub_pg_creates) { + } else if (do_sub_pg_creates) { // no need to subscribe the osdmap continuously anymore // once the pgtemp and/or mon_subscribe(pg_creates) is sent if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) { @@ -4086,10 +4107,7 @@ void OSD::resume_creating_pg() monc->renew_subs(); } - if (pgtemp) { - pgtemp->forced = true; - monc->send_mon_message(pgtemp); - } + service.send_pg_temp(); } void OSD::build_initial_pg_history( diff --git a/src/osd/OSD.h b/src/osd/OSD.h index f7f322f66d9b9..9e086d9afe1b7 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -718,11 +718,17 @@ public: // -- pg_temp -- private: Mutex pg_temp_lock; - map > pg_temp_wanted; - map > pg_temp_pending; + struct pg_temp_t { + vector acting; + bool forced = 
false; + }; + map pg_temp_wanted; + map pg_temp_pending; void _sent_pg_temp(); + friend std::ostream& operator<<(std::ostream&, const pg_temp_t&); public: - void queue_want_pg_temp(pg_t pgid, const vector& want); + void queue_want_pg_temp(pg_t pgid, const vector& want, + bool forced = false); void remove_want_pg_temp(pg_t pgid); void requeue_pg_temp(); void send_pg_temp(); diff --git a/src/osd/PG.cc b/src/osd/PG.cc index d14806a4c333a..3d6abdff24ed5 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -5828,6 +5828,7 @@ void PG::start_peering_interval( pg_shard_t old_acting_primary = get_primary(); pg_shard_t old_up_primary = up_primary; bool was_old_primary = is_primary(); + bool was_old_replica = is_replica(); acting.swap(oldacting); up.swap(oldup); @@ -5947,9 +5948,11 @@ void PG::start_peering_interval( acting_recovery_backfill.clear(); scrub_queued = false; - // reset primary state? + // reset primary/replica state? if (was_old_primary || is_primary()) { osd->remove_want_pg_temp(info.pgid.pgid); + } else if (was_old_replica || is_replica()) { + osd->remove_want_pg_temp(info.pgid.pgid); } clear_primary_state(); -- 2.39.5