git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mon/PGMonitor: move check_down_pgs into PGMapUpdater; use latest map
author Sage Weil <sage@redhat.com>
Wed, 8 Feb 2017 14:50:28 +0000 (09:50 -0500)
committer Sage Weil <sage@redhat.com>
Thu, 16 Feb 2017 17:04:50 +0000 (12:04 -0500)
Two big changes here:

1) Move the down-PG checking into PGMapUpdater along with the other
related checks.  Cleaner!

2) We use the current OSDMap to mark things stale.  Before, we had
a weird two-stage process: first we would apply the OSDMap to the
PGMap, and then the *next* time the OSDMap updated we would look
for stale PGs.  This was silly.  Instead, we mark the PGs stale
when we process the OSDMap that made them stale.
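
A minimal standalone sketch of that flow (simplified stand-in types;
OSDMap, PGStat, and apply_and_check here are illustrative, not the
Ceph tree's):

    // C++17. Apply one epoch and stale-mark in the same pass; the old
    // code deferred the check to the *next* epoch.
    #include <iostream>
    #include <map>
    #include <set>
    #include <vector>

    struct OSDMap {
      std::vector<bool> up;  // up/down per OSD id; size() plays max_osd
      bool is_down(int osd) const {
        return osd < 0 || osd >= (int)up.size() || !up[osd];
      }
    };

    struct PGStat { int acting_primary = -1; bool stale = false; };

    void apply_and_check(const OSDMap& osdmap,
                         std::map<int, PGStat>& pg_stat,
                         std::set<int>& changed_osds) {
      for (auto& [pgid, st] : pg_stat) {
        if (!st.stale && changed_osds.count(st.acting_primary) &&
            osdmap.is_down(st.acting_primary)) {
          st.stale = true;  // marked against the map that made it stale
          std::cout << "pg " << pgid << " -> stale\n";
        }
      }
      changed_osds.clear();
    }

    int main() {
      OSDMap m{{true, false}};                     // osd.1 just went down
      std::map<int, PGStat> pgs{{7, {1, false}}};  // pg 7's primary: osd.1
      std::set<int> changed{1};
      apply_and_check(m, pgs, changed);  // pg 7 goes stale immediately
    }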

We have to be slightly careful because the primaries in the PGMap
stats might be a bit old and, for example, might reflect a smaller
max_osd than what we have now.
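
For example (a tiny self-contained stand-in, assuming is_down treats
ids the map does not know as down — the behavior this caveat needs):
a recorded primary and the current map can disagree about max_osd, so
the down-check must tolerate ids outside the current range.

    #include <cassert>
    #include <vector>

    struct OSDMap {
      std::vector<bool> up;  // size() plays the role of max_osd
      bool is_down(int osd) const {
        // never index past the map for a primary recorded elsewhere
        return osd < 0 || osd >= (int)up.size() || !up[osd];
      }
    };

    int main() {
      OSDMap now{{true, true}};  // the current map knows only 2 OSDs
      assert(now.is_down(5));    // stats may still name osd.5 as primary
      assert(!now.is_down(0));
    }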

This change is motivated right now because the new prime_pg_temp
code is smarter about a failed OSD.  Previously, failing an OSD
would install a pg_temp mapping to both the old and new OSDs even
though the old OSD was down; the new code does not.  And previously,
failing both OSDs for a PG would always result in a subsequent
OSDMap update removing those bad pg_temp values, giving
check_down_pgs() a chance to mark the PGs stale, while the new code
produces no subsequent OSDMap update, which would leave the PG
unstale, causing the dump_stuck.py test to fail.

Signed-off-by: Sage Weil <sage@redhat.com>
src/mon/PGMap.cc
src/mon/PGMap.h
src/mon/PGMonitor.cc
src/mon/PGMonitor.h

diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index df71843782270d7c69ae00f8a6a7f7fad2807558..c73e75d5b95f3444fd18eefb1df3d6473f4e0b79 100644
@@ -2367,3 +2367,70 @@ void PGMapUpdater::update_creating_pgs(
   }
 }
 
+static void _try_mark_pg_stale(
+  const OSDMap& osdmap,
+  pg_t pgid,
+  const pg_stat_t& cur,
+  PGMap::Incremental *pending_inc)
+{
+  if ((cur.state & PG_STATE_STALE) == 0 &&
+      cur.acting_primary != -1 &&
+      osdmap.is_down(cur.acting_primary)) {
+    pg_stat_t *newstat;
+    auto q = pending_inc->pg_stat_updates.find(pgid);
+    if (q != pending_inc->pg_stat_updates.end()) {
+      if ((q->second.acting_primary == cur.acting_primary) ||
+         ((q->second.state & PG_STATE_STALE) == 0 &&
+          q->second.acting_primary != -1 &&
+          osdmap.is_down(q->second.acting_primary))) {
+       newstat = &q->second;
+      } else {
+       // pending update is no longer down or already stale
+       return;
+      }
+    } else {
+      newstat = &pending_inc->pg_stat_updates[pgid];
+      *newstat = cur;
+    }
+    dout(10) << __func__ << " marking pg " << pgid
+            << " stale (acting_primary " << newstat->acting_primary
+            << ")" << dendl;
+    newstat->state |= PG_STATE_STALE;
+    newstat->last_unstale = ceph_clock_now();
+  }
+}
+
+void PGMapUpdater::check_down_pgs(
+    const OSDMap &osdmap,
+    const PGMap *pg_map,
+    bool check_all,
+    const set<int>& need_check_down_pg_osds,
+    PGMap::Incremental *pending_inc)
+{
+  // if a large number of osds changed state, just iterate over the whole
+  // pg map.
+  if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() *
+      g_conf->mon_pg_check_down_all_threshold) {
+    check_all = true;
+  }
+
+  if (check_all) {
+    for (const auto& p : pg_map->pg_stat) {
+      _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc);
+    }
+  } else {
+    for (auto osd : need_check_down_pg_osds) {
+      if (osdmap.is_down(osd)) {
+       auto p = pg_map->pg_by_osd.find(osd);
+       if (p == pg_map->pg_by_osd.end()) {
+         continue;
+       }
+       for (auto pgid : p->second) {
+         const pg_stat_t &stat = pg_map->pg_stat.at(pgid);
+         assert(stat.acting_primary == osd);
+         _try_mark_pg_stale(osdmap, pgid, stat, pending_inc);
+       }
+      }
+    }
+  }
+}
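
As a toy illustration of the escalation at the top of check_down_pgs
above: once the changed-OSD set passes a fraction of the cluster, one
full pg_stat scan beats per-OSD walks of pg_by_osd.  (The 0.5 below is
an assumed value for mon_pg_check_down_all_threshold, not necessarily
the configured default.)

    #include <cstddef>
    #include <iostream>

    // Scan everything once the changed set exceeds a configured
    // fraction of the cluster; otherwise visit only the changed OSDs.
    bool should_check_all(std::size_t changed_osds, unsigned num_osds,
                          double threshold = 0.5) {
      return changed_osds > num_osds * threshold;
    }

    int main() {
      std::cout << should_check_all(3, 10) << "\n";  // 0: per-OSD walk
      std::cout << should_check_all(6, 10) << "\n";  // 1: full scan
    }
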
diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h
index 1c86ab3afffdb9e24440aeec6ffc660ee2b1bb83..ad356506f3119de572a88ef8a5fb7d5b5676a578 100644
@@ -418,6 +418,13 @@ public:
       bool new_pool,
       PGMap *pg_map,
       PGMap::Incremental *pending_inc);
+
+  static void check_down_pgs(
+      const OSDMap &osd_map,
+      const PGMap *pg_map,
+      bool check_all,
+      const set<int>& need_check_down_pg_osds,
+      PGMap::Incremental *pending_inc);
 };
 
 #endif
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index 286e5c70fe9e4ec4e95c411af9dab2f332c4f8ad..ca6334722b31227ccd93abdf36b2adbfbf7c451e 100644
@@ -67,8 +67,8 @@ void PGMonitor::on_restart()
 void PGMonitor::on_active()
 {
   if (mon->is_leader()) {
+    check_all_pgs = true;
     check_osd_map(mon->osdmon()->osdmap.get_epoch());
-    need_check_down_pgs = true;
   }
 
   update_logger();
@@ -861,6 +861,9 @@ void PGMonitor::check_osd_map(epoch_t epoch)
     return;
   }
 
+  // osds that went up or down
+  set<int> need_check_down_pg_osds;
+
   // apply latest map(s)
   for (epoch_t e = pg_map.last_osdmap_epoch+1;
        e <= epoch;
@@ -877,14 +880,15 @@ void PGMonitor::check_osd_map(epoch_t epoch)
                                 &last_osd_report, &pg_map, &pending_inc);
   }
 
+  const OSDMap& osdmap = mon->osdmon()->osdmap;
   assert(pg_map.last_osdmap_epoch < epoch);
   pending_inc.osdmap_epoch = epoch;
-  PGMapUpdater::update_creating_pgs(mon->osdmon()->osdmap,
-                                    &pg_map, &pending_inc);
-  PGMapUpdater::register_new_pgs(mon->osdmon()->osdmap, &pg_map, &pending_inc);
+  PGMapUpdater::update_creating_pgs(osdmap, &pg_map, &pending_inc);
+  PGMapUpdater::register_new_pgs(osdmap, &pg_map, &pending_inc);
 
-  if (need_check_down_pgs || !need_check_down_pg_osds.empty())
-    check_down_pgs();
+  PGMapUpdater::check_down_pgs(osdmap, &pg_map, check_all_pgs,
+                              need_check_down_pg_osds, &pending_inc);
+  check_all_pgs = false;
 
   propose_pending();
 }
@@ -933,76 +937,6 @@ epoch_t PGMonitor::send_pg_creates(int osd, Connection *con, epoch_t next)
   return last + 1;
 }
 
-void PGMonitor::_try_mark_pg_stale(
-  const OSDMap *osdmap,
-  pg_t pgid,
-  const pg_stat_t& cur_stat)
-{
-  map<pg_t,pg_stat_t>::iterator q = pending_inc.pg_stat_updates.find(pgid);
-  pg_stat_t *stat;
-  if (q == pending_inc.pg_stat_updates.end()) {
-    stat = &pending_inc.pg_stat_updates[pgid];
-    *stat = cur_stat;
-  } else {
-    stat = &q->second;
-  }
-  if ((stat->acting_primary == cur_stat.acting_primary) ||
-      ((stat->state & PG_STATE_STALE) == 0 &&
-       stat->acting_primary != -1 &&
-       osdmap->is_down(stat->acting_primary))) {
-    dout(10) << " marking pg " << pgid << " stale (acting_primary "
-            << stat->acting_primary << ")" << dendl;
-    stat->state |= PG_STATE_STALE;  
-    stat->last_unstale = ceph_clock_now();
-  }
-}
-
-void PGMonitor::check_down_pgs()
-{
-  dout(10) << "check_down_pgs last_osdmap_epoch "
-          << pg_map.last_osdmap_epoch << dendl;
-  if (pg_map.last_osdmap_epoch == 0)
-    return;
-
-  // use the OSDMap that matches the one pg_map has consumed.
-  std::unique_ptr<OSDMap> osdmap;
-  bufferlist bl;
-  int err = mon->osdmon()->get_version_full(pg_map.last_osdmap_epoch, bl);
-  assert(err == 0);
-  osdmap.reset(new OSDMap);
-  osdmap->decode(bl);
-
-  // if a large number of osds changed state, just iterate over the whole
-  // pg map.
-  if (need_check_down_pg_osds.size() > (unsigned)osdmap->get_num_osds() *
-      g_conf->mon_pg_check_down_all_threshold)
-    need_check_down_pgs = true;
-
-  if (need_check_down_pgs) {
-    for (auto p : pg_map.pg_stat) {
-      if ((p.second.state & PG_STATE_STALE) == 0 &&
-          p.second.acting_primary != -1 &&
-          osdmap->is_down(p.second.acting_primary)) {
-       _try_mark_pg_stale(osdmap.get(), p.first, p.second);
-      }
-    }
-  } else {
-    for (auto osd : need_check_down_pg_osds) {
-      if (osdmap->is_down(osd)) {
-       for (auto pgid : pg_map.pg_by_osd[osd]) {
-         const pg_stat_t &stat = pg_map.pg_stat[pgid];
-         assert(stat.acting_primary == osd);
-         if ((stat.state & PG_STATE_STALE) == 0) {
-           _try_mark_pg_stale(osdmap.get(), pgid, stat);
-         }
-       }
-      }
-    }
-  }
-  need_check_down_pgs = false;
-  need_check_down_pg_osds.clear();
-}
-
 void PGMonitor::dump_info(Formatter *f) const
 {
   f->open_object_section("pgmap");
diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h
index e1c9c822f98f4d2f6c5bd5f621e8b3ce153110a1..987079d47d376a5ef744999959064e5afe6386a3 100644
@@ -43,12 +43,11 @@ class PGMonitor : public PaxosService {
 public:
   PGMap pg_map;
 
-  bool need_check_down_pgs;
-  set<int> need_check_down_pg_osds;
-
 private:
   PGMap::Incremental pending_inc;
 
+  bool check_all_pgs = false;
+
   const char *pgmap_meta_prefix;
   const char *pgmap_pg_prefix;
   const char *pgmap_osd_prefix;
@@ -94,18 +93,6 @@ private:
 
   epoch_t send_pg_creates(int osd, Connection *con, epoch_t next);
 
-  /**
-   * check pgs for down primary osds
-   *
-   * clears need_check_down_pgs
-   * clears need_check_down_pg_osds
-   *
-   */
-  void check_down_pgs();
-  void _try_mark_pg_stale(const OSDMap *osdmap, pg_t pgid,
-                         const pg_stat_t& cur_stat);
-
-
   /**
    * Dump stats from pgs stuck in specified states.
    *
@@ -118,7 +105,6 @@ private:
 public:
   PGMonitor(Monitor *mn, Paxos *p, const string& service_name)
     : PaxosService(mn, p, service_name),
-      need_check_down_pgs(false),
       pgmap_meta_prefix("pgmap_meta"),
       pgmap_pg_prefix("pgmap_pg"),
       pgmap_osd_prefix("pgmap_osd")