From: Sage Weil Date: Wed, 8 Feb 2017 14:50:28 +0000 (-0500) Subject: mon/PGMonitor: move check_down_pgs into PGMapUpdater; use latest map X-Git-Tag: v12.0.1~343^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=dd70a93181d15ffd56ce6a8330ae725a6ddd72aa;p=ceph.git mon/PGMonitor: move check_down_pgs into PGMapUpdater; use latest map Two big changes here: 1) Move the updating into PGMapUpdater along with the other related checks. Cleaner! 2) We use the current OSDMap to mark things stale. Before, we had a weird two stage process: first we would apply the OSDMap to the PGMap, and then the *next* time the OSDMap updated we would look for stale PGs. This was silly. Instead, we mark the PGs stale when we process the OSDMap that made them stale. We have a be slightly careful because the PGMap stats primaries might be a bit old and, for example, might have a smaller max_osd than what we have now. This change is motivated right now because the new prime_pg_temp code is smarter about a failed OSD. While previously failing an OSD would install a pg_temp mapping to the old and new OSDs, even though the old OSD is now down, the new code does not. And previously failing both OSDs for a PG would always result in a subsequent OSDMap update removing those bad pg_temp values, giving check_down_pgs() a change to mark the PGs stale, while the new code has not subsequent OSDMap update, which would leave the PG unstale, causing the dump_stuck.py test to fail. Signed-off-by: Sage Weil --- diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index df7184378227..c73e75d5b95f 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -2367,3 +2367,70 @@ void PGMapUpdater::update_creating_pgs( } } +static void _try_mark_pg_stale( + const OSDMap& osdmap, + pg_t pgid, + const pg_stat_t& cur, + PGMap::Incremental *pending_inc) +{ + if ((cur.state & PG_STATE_STALE) == 0 && + cur.acting_primary != -1 && + osdmap.is_down(cur.acting_primary)) { + pg_stat_t *newstat; + auto q = pending_inc->pg_stat_updates.find(pgid); + if (q != pending_inc->pg_stat_updates.end()) { + if ((q->second.acting_primary == cur.acting_primary) || + ((q->second.state & PG_STATE_STALE) == 0 && + q->second.acting_primary != -1 && + osdmap.is_down(q->second.acting_primary))) { + newstat = &q->second; + } else { + // pending update is no longer down or already stale + return; + } + } else { + newstat = &pending_inc->pg_stat_updates[pgid]; + *newstat = cur; + } + dout(10) << __func__ << " marking pg " << pgid + << " stale (acting_primary " << newstat->acting_primary + << ")" << dendl; + newstat->state |= PG_STATE_STALE; + newstat->last_unstale = ceph_clock_now(); + } +} + +void PGMapUpdater::check_down_pgs( + const OSDMap &osdmap, + const PGMap *pg_map, + bool check_all, + const set& need_check_down_pg_osds, + PGMap::Incremental *pending_inc) +{ + // if a large number of osds changed state, just iterate over the whole + // pg map. + if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() * + g_conf->mon_pg_check_down_all_threshold) { + check_all = true; + } + + if (check_all) { + for (const auto& p : pg_map->pg_stat) { + _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc); + } + } else { + for (auto osd : need_check_down_pg_osds) { + if (osdmap.is_down(osd)) { + auto p = pg_map->pg_by_osd.find(osd); + if (p == pg_map->pg_by_osd.end()) { + continue; + } + for (auto pgid : p->second) { + const pg_stat_t &stat = pg_map->pg_stat.at(pgid); + assert(stat.acting_primary == osd); + _try_mark_pg_stale(osdmap, pgid, stat, pending_inc); + } + } + } + } +} diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h index 1c86ab3afffd..ad356506f311 100644 --- a/src/mon/PGMap.h +++ b/src/mon/PGMap.h @@ -418,6 +418,13 @@ public: bool new_pool, PGMap *pg_map, PGMap::Incremental *pending_inc); + + static void check_down_pgs( + const OSDMap &osd_map, + const PGMap *pg_map, + bool check_all, + const set& need_check_down_pg_osds, + PGMap::Incremental *pending_inc); }; #endif diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index 286e5c70fe9e..ca6334722b31 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -67,8 +67,8 @@ void PGMonitor::on_restart() void PGMonitor::on_active() { if (mon->is_leader()) { + check_all_pgs = true; check_osd_map(mon->osdmon()->osdmap.get_epoch()); - need_check_down_pgs = true; } update_logger(); @@ -861,6 +861,9 @@ void PGMonitor::check_osd_map(epoch_t epoch) return; } + // osds that went up or down + set need_check_down_pg_osds; + // apply latest map(s) for (epoch_t e = pg_map.last_osdmap_epoch+1; e <= epoch; @@ -877,14 +880,15 @@ void PGMonitor::check_osd_map(epoch_t epoch) &last_osd_report, &pg_map, &pending_inc); } + const OSDMap& osdmap = mon->osdmon()->osdmap; assert(pg_map.last_osdmap_epoch < epoch); pending_inc.osdmap_epoch = epoch; - PGMapUpdater::update_creating_pgs(mon->osdmon()->osdmap, - &pg_map, &pending_inc); - PGMapUpdater::register_new_pgs(mon->osdmon()->osdmap, &pg_map, &pending_inc); + PGMapUpdater::update_creating_pgs(osdmap, &pg_map, &pending_inc); + PGMapUpdater::register_new_pgs(osdmap, &pg_map, &pending_inc); - if (need_check_down_pgs || !need_check_down_pg_osds.empty()) - check_down_pgs(); + PGMapUpdater::check_down_pgs(osdmap, &pg_map, check_all_pgs, + need_check_down_pg_osds, &pending_inc); + check_all_pgs = false; propose_pending(); } @@ -933,76 +937,6 @@ epoch_t PGMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) return last + 1; } -void PGMonitor::_try_mark_pg_stale( - const OSDMap *osdmap, - pg_t pgid, - const pg_stat_t& cur_stat) -{ - map::iterator q = pending_inc.pg_stat_updates.find(pgid); - pg_stat_t *stat; - if (q == pending_inc.pg_stat_updates.end()) { - stat = &pending_inc.pg_stat_updates[pgid]; - *stat = cur_stat; - } else { - stat = &q->second; - } - if ((stat->acting_primary == cur_stat.acting_primary) || - ((stat->state & PG_STATE_STALE) == 0 && - stat->acting_primary != -1 && - osdmap->is_down(stat->acting_primary))) { - dout(10) << " marking pg " << pgid << " stale (acting_primary " - << stat->acting_primary << ")" << dendl; - stat->state |= PG_STATE_STALE; - stat->last_unstale = ceph_clock_now(); - } -} - -void PGMonitor::check_down_pgs() -{ - dout(10) << "check_down_pgs last_osdmap_epoch " - << pg_map.last_osdmap_epoch << dendl; - if (pg_map.last_osdmap_epoch == 0) - return; - - // use the OSDMap that matches the one pg_map has consumed. - std::unique_ptr osdmap; - bufferlist bl; - int err = mon->osdmon()->get_version_full(pg_map.last_osdmap_epoch, bl); - assert(err == 0); - osdmap.reset(new OSDMap); - osdmap->decode(bl); - - // if a large number of osds changed state, just iterate over the whole - // pg map. - if (need_check_down_pg_osds.size() > (unsigned)osdmap->get_num_osds() * - g_conf->mon_pg_check_down_all_threshold) - need_check_down_pgs = true; - - if (need_check_down_pgs) { - for (auto p : pg_map.pg_stat) { - if ((p.second.state & PG_STATE_STALE) == 0 && - p.second.acting_primary != -1 && - osdmap->is_down(p.second.acting_primary)) { - _try_mark_pg_stale(osdmap.get(), p.first, p.second); - } - } - } else { - for (auto osd : need_check_down_pg_osds) { - if (osdmap->is_down(osd)) { - for (auto pgid : pg_map.pg_by_osd[osd]) { - const pg_stat_t &stat = pg_map.pg_stat[pgid]; - assert(stat.acting_primary == osd); - if ((stat.state & PG_STATE_STALE) == 0) { - _try_mark_pg_stale(osdmap.get(), pgid, stat); - } - } - } - } - } - need_check_down_pgs = false; - need_check_down_pg_osds.clear(); -} - void PGMonitor::dump_info(Formatter *f) const { f->open_object_section("pgmap"); diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h index e1c9c822f98f..987079d47d37 100644 --- a/src/mon/PGMonitor.h +++ b/src/mon/PGMonitor.h @@ -43,12 +43,11 @@ class PGMonitor : public PaxosService { public: PGMap pg_map; - bool need_check_down_pgs; - set need_check_down_pg_osds; - private: PGMap::Incremental pending_inc; + bool check_all_pgs = false; + const char *pgmap_meta_prefix; const char *pgmap_pg_prefix; const char *pgmap_osd_prefix; @@ -94,18 +93,6 @@ private: epoch_t send_pg_creates(int osd, Connection *con, epoch_t next); - /** - * check pgs for down primary osds - * - * clears need_check_down_pgs - * clears need_check_down_pg_osds - * - */ - void check_down_pgs(); - void _try_mark_pg_stale(const OSDMap *osdmap, pg_t pgid, - const pg_stat_t& cur_stat); - - /** * Dump stats from pgs stuck in specified states. * @@ -118,7 +105,6 @@ private: public: PGMonitor(Monitor *mn, Paxos *p, const string& service_name) : PaxosService(mn, p, service_name), - need_check_down_pgs(false), pgmap_meta_prefix("pgmap_meta"), pgmap_pg_prefix("pgmap_pg"), pgmap_osd_prefix("pgmap_osd")