From: Kefu Chai
Date: Mon, 9 Apr 2018 16:56:05 +0000 (+0800)
Subject: osd: resend osd_pgtemp if it's not acked
X-Git-Tag: v12.2.5~25^2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F21330%2Fhead;p=ceph.git

osd: resend osd_pgtemp if it's not acked

If the osd_pgtemp message is dropped before the monitor receives it, we
need to resend it. Otherwise a PG could be stuck in the "activating"
state if its creation was withheld by the max-pg-per-osd limit on the
replica and the replica OSD later removes some existing PG.

Fixes: http://tracker.ceph.com/issues/23610
Signed-off-by: Kefu Chai
(cherry picked from commit 9c5cb4ea620da4ef14345ebc8018f9ed008c4345)

Conflicts:
    src/osd/OSD.cc: structured bindings are a C++17 feature, while
      luminous is built with C++11, so implement this with C++11
      syntax.
    src/osd/OSD.h: trivial resolution
---

diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 04f3479742803..93bbaec5a9a05 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1034,13 +1034,16 @@ pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t f
 }
 
-void OSDService::queue_want_pg_temp(pg_t pgid, vector<int>& want)
+void OSDService::queue_want_pg_temp(pg_t pgid,
+                                    const vector<int>& want,
+                                    bool forced)
 {
   Mutex::Locker l(pg_temp_lock);
-  map<pg_t,vector<int> >::iterator p = pg_temp_pending.find(pgid);
+  auto p = pg_temp_pending.find(pgid);
   if (p == pg_temp_pending.end() ||
-      p->second != want) {
-    pg_temp_wanted[pgid] = want;
+      p->second.acting != want ||
+      forced) {
+    pg_temp_wanted[pgid] = pg_temp_t{want, forced};
   }
 }
 
@@ -1053,10 +1056,8 @@ void OSDService::remove_want_pg_temp(pg_t pgid)
 
 void OSDService::_sent_pg_temp()
 {
-  for (map<pg_t,vector<int> >::iterator p = pg_temp_wanted.begin();
-       p != pg_temp_wanted.end();
-       ++p)
-    pg_temp_pending[p->first] = p->second;
+  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
+                         make_move_iterator(end(pg_temp_wanted)));
   pg_temp_wanted.clear();
 }
 
@@ -1073,15 +1074,37 @@ void OSDService::requeue_pg_temp()
           << pg_temp_wanted.size() << dendl;
 }
 
+std::ostream& operator<<(std::ostream& out,
+                         const OSDService::pg_temp_t& pg_temp)
+{
+  out << pg_temp.acting;
+  if (pg_temp.forced) {
+    out << " (forced)";
+  }
+  return out;
+}
+
 void OSDService::send_pg_temp()
 {
   Mutex::Locker l(pg_temp_lock);
   if (pg_temp_wanted.empty())
     return;
   dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
-  MOSDPGTemp *m = new MOSDPGTemp(osdmap->get_epoch());
-  m->pg_temp = pg_temp_wanted;
-  monc->send_mon_message(m);
+  MOSDPGTemp *ms[2] = {nullptr, nullptr};
+  for (auto& pg_temp : pg_temp_wanted) {
+    auto& m = ms[pg_temp.second.forced];
+    if (!m) {
+      m = new MOSDPGTemp(osdmap->get_epoch());
+      m->forced = pg_temp.second.forced;
+    }
+    m->pg_temp.emplace(pg_temp.first,
+                       pg_temp.second.acting);
+  }
+  for (auto m : ms) {
+    if (m) {
+      monc->send_mon_message(m);
+    }
+  }
   _sent_pg_temp();
 }
 
@@ -4504,7 +4527,6 @@ void OSD::resume_creating_pg()
 {
   bool do_sub_pg_creates = false;
   bool have_pending_creates = false;
-  MOSDPGTemp *pgtemp = nullptr;
   {
     const auto max_pgs_per_osd =
       (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
@@ -4527,13 +4549,12 @@ void OSD::resume_creating_pg()
     }
     auto pg = pending_creates_from_osd.cbegin();
     while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
-      if (!pgtemp) {
-        pgtemp = new MOSDPGTemp{osdmap->get_epoch()};
-      }
+      dout(20) << __func__ << " pg " << pg->first << dendl;
       vector<int> acting;
       osdmap->pg_to_up_acting_osds(pg->first, nullptr, nullptr, &acting, nullptr);
-      pgtemp->pg_temp[pg->first] = twiddle(acting);
+      service.queue_want_pg_temp(pg->first, twiddle(acting), true);
       pg = pending_creates_from_osd.erase(pg);
+      do_sub_pg_creates = true;
       spare_pgs--;
     }
     have_pending_creates = (pending_creates_from_mon > 0 ||
@@ -4556,7 +4577,7 @@ void OSD::resume_creating_pg()
               << start << dendl;
       do_renew_subs = true;
     }
-  } else if (pgtemp || do_sub_pg_creates) {
+  } else if (do_sub_pg_creates) {
     // no need to subscribe the osdmap continuously anymore
     // once the pgtemp and/or mon_subscribe(pg_creates) is sent
     if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
@@ -4570,10 +4591,7 @@ void OSD::resume_creating_pg()
     monc->renew_subs();
   }
 
-  if (pgtemp) {
-    pgtemp->forced = true;
-    monc->send_mon_message(pgtemp);
-  }
+  service.send_pg_temp();
 }
 
 void OSD::build_initial_pg_history(
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 48711405b0c71..ea611cbae1cb7 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -838,11 +838,22 @@ public:
   // -- pg_temp --
 private:
   Mutex pg_temp_lock;
-  map<pg_t, vector<int> > pg_temp_wanted;
-  map<pg_t, vector<int> > pg_temp_pending;
+  struct pg_temp_t {
+    pg_temp_t()
+    {}
+    pg_temp_t(vector<int> v, bool f)
+      : acting{v}, forced{f}
+    {}
+    vector<int> acting;
+    bool forced = false;
+  };
+  map<pg_t, pg_temp_t> pg_temp_wanted;
+  map<pg_t, pg_temp_t> pg_temp_pending;
   void _sent_pg_temp();
+  friend std::ostream& operator<<(std::ostream&, const pg_temp_t&);
 public:
-  void queue_want_pg_temp(pg_t pgid, vector<int>& want);
+  void queue_want_pg_temp(pg_t pgid, const vector<int>& want,
+                          bool forced = false);
   void remove_want_pg_temp(pg_t pgid);
   void requeue_pg_temp();
   void send_pg_temp();
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 42a27983a5718..e47e4d8f826c9 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -5436,6 +5436,7 @@ void PG::start_peering_interval(
   pg_shard_t old_acting_primary = get_primary();
   pg_shard_t old_up_primary = up_primary;
   bool was_old_primary = is_primary();
+  bool was_old_replica = is_replica();
 
   acting.swap(oldacting);
   up.swap(oldup);
@@ -5554,9 +5555,11 @@ void PG::start_peering_interval(
   actingbackfill.clear();
   scrub_queued = false;
 
-  // reset primary state?
+  // reset primary/replica state?
   if (was_old_primary || is_primary()) {
     osd->remove_want_pg_temp(info.pgid.pgid);
+  } else if (was_old_replica || is_replica()) {
+    osd->remove_want_pg_temp(info.pgid.pgid);
   }
   clear_primary_state();
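
Note on the resend mechanism described in the commit message: the OSD.cc
hunks route the forced pg_temp request through the existing wanted/pending
bookkeeping instead of a one-off MOSDPGTemp, which is what makes it
resendable. Entries sit in pg_temp_wanted until send_pg_temp() ships them
and _sent_pg_temp() moves them to pg_temp_pending; requeue_pg_temp() can
move pending entries back to wanted so a later send_pg_temp() retransmits
them if the earlier message never reached the monitor. A rough standalone
model of that cycle follows; the names pg_id and pg_temp_entry are
simplifications for this sketch, not the real Ceph types.

  // Illustrative model only -- not Ceph code.
  #include <iostream>
  #include <map>
  #include <vector>

  using pg_id = int;
  struct pg_temp_entry {
    std::vector<int> acting;  // the acting set we want the mon to record
    bool forced;              // send even if it matches what is pending
  };

  std::map<pg_id, pg_temp_entry> wanted;   // queued, not yet sent
  std::map<pg_id, pg_temp_entry> pending;  // sent, not yet acknowledged

  void send_pg_temp() {
    // the real code batches the entries into MOSDPGTemp messages,
    // one message per value of the "forced" flag
    for (const auto& p : wanted) {
      std::cout << "send pg_temp for pg " << p.first
                << (p.second.forced ? " (forced)" : "") << "\n";
    }
    pending.insert(wanted.begin(), wanted.end());  // cf. _sent_pg_temp()
    wanted.clear();
  }

  void requeue_pg_temp() {
    // if the message was dropped, move pending entries back so the
    // next send_pg_temp() retransmits them
    wanted.insert(pending.begin(), pending.end());
    pending.clear();
  }

  int main() {
    wanted[1] = pg_temp_entry{{0, 2, 3}, true};
    send_pg_temp();     // first attempt
    requeue_pg_temp();  // no ack observed, requeue
    send_pg_temp();     // same request goes out again
  }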
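
Note on the Conflicts entry for src/osd/OSD.cc: the upstream commit
destructures the pg_temp map entries with a C++17 structured binding
(presumably in the loops rewritten above), while this backport, built
with C++11, names the std::pair and spells out .first / .second.acting,
as in send_pg_temp(). A minimal standalone comparison of the two
spellings; this is only an illustration, not the exact upstream hunk.

  #include <iostream>
  #include <map>
  #include <vector>

  int main() {
    std::map<int, std::vector<int>> pg_temp_wanted = {{1, {0, 2}}, {2, {1, 3}}};

  #if __cplusplus >= 201703L
    // C++17 (upstream master): bind key and value directly
    for (const auto& [pgid, acting] : pg_temp_wanted) {
      std::cout << pgid << ": " << acting.size() << " osds\n";
    }
  #else
    // C++11 (luminous): name the pair and use its members explicitly
    for (const auto& entry : pg_temp_wanted) {
      std::cout << entry.first << ": " << entry.second.size() << " osds\n";
    }
  #endif
  }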