]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: generate past intervals in parallel on boot
author: Sage Weil <sage@inktank.com>
Wed, 25 Jul 2012 17:57:35 +0000 (10:57 -0700)
committer: Sage Weil <sage@inktank.com>
Wed, 25 Jul 2012 20:28:55 +0000 (13:28 -0700)
Even though we aggressively share past_intervals with notifies etc, it is
still possible for an osd to get buried behind a pile of old maps and need
to generate these if it has been out of the cluster for a while.  This has
happened to us in the past but, sadly, we did not merge the work then.
On the bright side, this implementation is much much much cleaner than the
old one because of the pg_interval_t helper we've since switched to.

On bootup, we look at the intervals each pg needs and calculate the union,
and then iterate over that map range.  The inner bit of the loop is
functionally identical to PG::generate_past_intervals(), keeping the per-pg
state in the pistate struct.

Backport: argonaut
Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
src/osd/OSD.cc
src/osd/OSD.h

index b4a6591d7699c3ea9ac531caa84ae65155f22c89..e2e45351850c9b7711e7189efc7921da899728ac 100644 (file)
@@ -1403,8 +1403,113 @@ void OSD::load_pgs()
     pg->unlock();
   }
   dout(10) << "load_pgs done" << dendl;
+
+  build_past_intervals_parallel();
 }
+
+
+/*
+ * build past_intervals efficiently on old, degraded, and buried
+ * clusters.  this is important for efficiently catching up osds that
+ * are way behind on maps to the current cluster state.
+ *
+ * this is a parallel version of PG::generate_past_intervals().
+ * follow the same logic, but do all pgs at the same time so that we
+ * can make a single pass across the osdmap history.
+ */
+struct pistate {  // per-pg scratch state kept by OSD::build_past_intervals_parallel()
+  epoch_t start, end;  // inclusive range of epochs this pg needs walked
+  vector<int> old_acting, old_up;  // acting/up sets recorded when the current interval began
+  epoch_t same_interval_since;  // epoch the current interval began; 0 = no map examined yet
+};
+
+void OSD::build_past_intervals_parallel()  // walk the osdmap history once, updating past_intervals for every pg that needs it
+{
+  map<PG*,pistate> pis;  // only pgs that report a range to build get an entry
+
+  // calculate union of the map ranges needed by the pgs
+  epoch_t end_epoch = superblock.oldest_map;  // starts at oldest_map; raised to the largest 'end' below
+  epoch_t cur_epoch = superblock.newest_map;  // starts at newest_map; lowered to the smallest 'start' below
+  for (hash_map<pg_t, PG*>::iterator i = pg_map.begin();
+       i != pg_map.end();
+       i++) {
+    PG *pg = i->second;
+
+    epoch_t start, end;
+    if (!pg->_calc_past_interval_range(&start, &end))
+      continue;  // helper returned false: skip this pg (presumably nothing to build -- confirm)
+
+    dout(10) << pg->info.pgid << " needs " << start << "-" << end << dendl;
+    pistate& p = pis[pg];
+    p.start = start;
+    p.end = end;
+    p.same_interval_since = 0;  // sentinel: baseline not recorded yet
+
+    if (start < cur_epoch)
+      cur_epoch = start;  // track minimum start
+    if (end > end_epoch)
+      end_epoch = end;  // track maximum end
+  }
+  if (pis.empty()) {
+    dout(10) << __func__ << " nothing to build" << dendl;
+    return;
+  }
+
+  dout(1) << __func__ << " over " << cur_epoch << "-" << end_epoch << dendl;
+  assert(cur_epoch <= end_epoch);
+
+  OSDMapRef cur_map, last_map;  // map for this epoch and the map from the previous loop iteration
+  for ( ; cur_epoch <= end_epoch; cur_epoch++) {
+    dout(10) << __func__ << " epoch " << cur_epoch << dendl;
+    last_map = cur_map;
+    cur_map = get_map(cur_epoch);
+
+    ObjectStore::Transaction t;  // collects the write_info() calls made for this epoch
+
+    for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
+      PG *pg = i->first;
+      pistate& p = i->second;
+
+      if (cur_epoch < p.start || cur_epoch > p.end)
+       continue;  // this epoch is outside the pg's [start, end] range
+
+      vector<int> acting, up;
+      cur_map->pg_to_up_acting_osds(pg->info.pgid, up, acting);
+
+      if (p.same_interval_since == 0) {  // first map examined for this pg: record a baseline, nothing to compare yet
+       dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
+                << " first map, acting " << acting
+                << " up " << up << ", same_interval_since = " << cur_epoch << dendl;
+       p.same_interval_since = cur_epoch;
+       p.old_up = up;
+       p.old_acting = acting;
+       continue;
+      }
+      assert(last_map);  // a baseline was recorded in an earlier iteration, so a previous map was loaded
+
+      std::stringstream debug;  // filled by check_new_interval; logged below when an interval changes
+      bool new_interval = pg_interval_t::check_new_interval(p.old_acting, acting,
+                                                           p.old_up, up,
+                                                           p.same_interval_since,
+                                                           pg->info.history.last_epoch_clean,
+                                                           cur_map, last_map,
+                                                           &pg->past_intervals,  // presumably where the closed interval gets recorded -- confirm
+                                                           &debug);
+      if (new_interval) {  // mapping changed: start tracking a new interval from this epoch
+       dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
+                << " " << debug.str() << dendl;
+       p.old_up = up;
+       p.old_acting = acting;
+       p.same_interval_since = cur_epoch;
+       pg->write_info(t);  // add this pg's info write to the epoch's transaction
+      }
+    }
+
+    if (!t.empty())
+      store->apply_transaction(t);  // NOTE(review): return value is not checked here
+  }
+}
+
 
 /*
  * look up a pg.  if we have it, great.  if not, consider creating it IF the pg mapping
index 43643f7d629205bd224f2a73c20d3b01fa62e01a..94eaed60c49dca37f831f39a0789579f655ce395 100644 (file)
@@ -740,6 +740,8 @@ protected:
                       bool primary);
   
   void load_pgs();
+  void build_past_intervals_parallel();
+
   void calc_priors_during(pg_t pgid, epoch_t start, epoch_t end, set<int>& pset);
   void project_pg_history(pg_t pgid, pg_history_t& h, epoch_t from,
                          const vector<int>& lastup, const vector<int>& lastacting);