}
}
+static void _try_mark_pg_stale(
+ const OSDMap& osdmap,
+ pg_t pgid,
+ const pg_stat_t& cur,
+ PGMap::Incremental *pending_inc)
+{
+ if ((cur.state & PG_STATE_STALE) == 0 &&
+ cur.acting_primary != -1 &&
+ osdmap.is_down(cur.acting_primary)) {
+ pg_stat_t *newstat;
+ auto q = pending_inc->pg_stat_updates.find(pgid);
+ if (q != pending_inc->pg_stat_updates.end()) {
+ if ((q->second.acting_primary == cur.acting_primary) ||
+ ((q->second.state & PG_STATE_STALE) == 0 &&
+ q->second.acting_primary != -1 &&
+ osdmap.is_down(q->second.acting_primary))) {
+ newstat = &q->second;
+ } else {
+ // pending update is no longer down or already stale
+ return;
+ }
+ } else {
+ newstat = &pending_inc->pg_stat_updates[pgid];
+ *newstat = cur;
+ }
+ dout(10) << __func__ << " marking pg " << pgid
+ << " stale (acting_primary " << newstat->acting_primary
+ << ")" << dendl;
+ newstat->state |= PG_STATE_STALE;
+ newstat->last_unstale = ceph_clock_now();
+ }
+}
+
+void PGMapUpdater::check_down_pgs(
+ const OSDMap &osdmap,
+ const PGMap *pg_map,
+ bool check_all,
+ const set<int>& need_check_down_pg_osds,
+ PGMap::Incremental *pending_inc)
+{
+ // if a large number of osds changed state, just iterate over the whole
+ // pg map.
+ if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() *
+ g_conf->mon_pg_check_down_all_threshold) {
+ check_all = true;
+ }
+
+ if (check_all) {
+ for (const auto& p : pg_map->pg_stat) {
+ _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc);
+ }
+ } else {
+ for (auto osd : need_check_down_pg_osds) {
+ if (osdmap.is_down(osd)) {
+ auto p = pg_map->pg_by_osd.find(osd);
+ if (p == pg_map->pg_by_osd.end()) {
+ continue;
+ }
+ for (auto pgid : p->second) {
+ const pg_stat_t &stat = pg_map->pg_stat.at(pgid);
+ assert(stat.acting_primary == osd);
+ _try_mark_pg_stale(osdmap, pgid, stat, pending_inc);
+ }
+ }
+ }
+ }
+}
void PGMonitor::on_active()
{
if (mon->is_leader()) {
+ check_all_pgs = true;
check_osd_map(mon->osdmon()->osdmap.get_epoch());
- need_check_down_pgs = true;
}
update_logger();
return;
}
+ // osds that went up or down
+ set<int> need_check_down_pg_osds;
+
// apply latest map(s)
for (epoch_t e = pg_map.last_osdmap_epoch+1;
e <= epoch;
&last_osd_report, &pg_map, &pending_inc);
}
+ const OSDMap& osdmap = mon->osdmon()->osdmap;
assert(pg_map.last_osdmap_epoch < epoch);
pending_inc.osdmap_epoch = epoch;
- PGMapUpdater::update_creating_pgs(mon->osdmon()->osdmap,
- &pg_map, &pending_inc);
- PGMapUpdater::register_new_pgs(mon->osdmon()->osdmap, &pg_map, &pending_inc);
+ PGMapUpdater::update_creating_pgs(osdmap, &pg_map, &pending_inc);
+ PGMapUpdater::register_new_pgs(osdmap, &pg_map, &pending_inc);
- if (need_check_down_pgs || !need_check_down_pg_osds.empty())
- check_down_pgs();
+ PGMapUpdater::check_down_pgs(osdmap, &pg_map, check_all_pgs,
+ need_check_down_pg_osds, &pending_inc);
+ check_all_pgs = false;
propose_pending();
}
return last + 1;
}
-void PGMonitor::_try_mark_pg_stale(
- const OSDMap *osdmap,
- pg_t pgid,
- const pg_stat_t& cur_stat)
-{
- map<pg_t,pg_stat_t>::iterator q = pending_inc.pg_stat_updates.find(pgid);
- pg_stat_t *stat;
- if (q == pending_inc.pg_stat_updates.end()) {
- stat = &pending_inc.pg_stat_updates[pgid];
- *stat = cur_stat;
- } else {
- stat = &q->second;
- }
- if ((stat->acting_primary == cur_stat.acting_primary) ||
- ((stat->state & PG_STATE_STALE) == 0 &&
- stat->acting_primary != -1 &&
- osdmap->is_down(stat->acting_primary))) {
- dout(10) << " marking pg " << pgid << " stale (acting_primary "
- << stat->acting_primary << ")" << dendl;
- stat->state |= PG_STATE_STALE;
- stat->last_unstale = ceph_clock_now();
- }
-}
-
-void PGMonitor::check_down_pgs()
-{
- dout(10) << "check_down_pgs last_osdmap_epoch "
- << pg_map.last_osdmap_epoch << dendl;
- if (pg_map.last_osdmap_epoch == 0)
- return;
-
- // use the OSDMap that matches the one pg_map has consumed.
- std::unique_ptr<OSDMap> osdmap;
- bufferlist bl;
- int err = mon->osdmon()->get_version_full(pg_map.last_osdmap_epoch, bl);
- assert(err == 0);
- osdmap.reset(new OSDMap);
- osdmap->decode(bl);
-
- // if a large number of osds changed state, just iterate over the whole
- // pg map.
- if (need_check_down_pg_osds.size() > (unsigned)osdmap->get_num_osds() *
- g_conf->mon_pg_check_down_all_threshold)
- need_check_down_pgs = true;
-
- if (need_check_down_pgs) {
- for (auto p : pg_map.pg_stat) {
- if ((p.second.state & PG_STATE_STALE) == 0 &&
- p.second.acting_primary != -1 &&
- osdmap->is_down(p.second.acting_primary)) {
- _try_mark_pg_stale(osdmap.get(), p.first, p.second);
- }
- }
- } else {
- for (auto osd : need_check_down_pg_osds) {
- if (osdmap->is_down(osd)) {
- for (auto pgid : pg_map.pg_by_osd[osd]) {
- const pg_stat_t &stat = pg_map.pg_stat[pgid];
- assert(stat.acting_primary == osd);
- if ((stat.state & PG_STATE_STALE) == 0) {
- _try_mark_pg_stale(osdmap.get(), pgid, stat);
- }
- }
- }
- }
- }
- need_check_down_pgs = false;
- need_check_down_pg_osds.clear();
-}
-
void PGMonitor::dump_info(Formatter *f) const
{
f->open_object_section("pgmap");
public:
PGMap pg_map;
- bool need_check_down_pgs;
- set<int> need_check_down_pg_osds;
-
private:
PGMap::Incremental pending_inc;
+ bool check_all_pgs = false;
+
const char *pgmap_meta_prefix;
const char *pgmap_pg_prefix;
const char *pgmap_osd_prefix;
epoch_t send_pg_creates(int osd, Connection *con, epoch_t next);
- /**
- * check pgs for down primary osds
- *
- * clears need_check_down_pgs
- * clears need_check_down_pg_osds
- *
- */
- void check_down_pgs();
- void _try_mark_pg_stale(const OSDMap *osdmap, pg_t pgid,
- const pg_stat_t& cur_stat);
-
-
/**
* Dump stats from pgs stuck in specified states.
*
public:
PGMonitor(Monitor *mn, Paxos *p, const string& service_name)
: PaxosService(mn, p, service_name),
- need_check_down_pgs(false),
pgmap_meta_prefix("pgmap_meta"),
pgmap_pg_prefix("pgmap_pg"),
pgmap_osd_prefix("pgmap_osd")