]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: filter osds removed from probe set from peer_info_requested
authorSage Weil <sage.weil@dreamhost.com>
Wed, 25 Apr 2012 20:07:34 +0000 (13:07 -0700)
committerSage Weil <sage.weil@dreamhost.com>
Wed, 25 Apr 2012 20:07:34 +0000 (13:07 -0700)
Peer_info_requested should be a strict subset of the probe set.  Filter
osds that are dropped from probe from peer_info_requested.  We could also
restart peering from scratch here, but this is less expensive, because we
don't have to re-probe everyone.

Once we adjust the probe and peer_info_requested sets, (re)check if we're
done: we may have been blocked on a previous peer_info_requested entry.

The situation I saw was:

  "recovery_state": [
        { "name": "Started\/Primary\/Peering\/GetInfo",
          "enter_time": "2012-04-25 14:39:56.905748",
          "requested_info_from": [
                { "osd": 193}]},
        { "name": "Started\/Primary\/Peering",
          "enter_time": "2012-04-25 14:39:56.905748",
          "probing_osds": [
                79,
                191,
                195],
          "down_osds_we_would_probe": [],
          "peering_blocked_by": []},
        { "name": "Started",
          "enter_time": "2012-04-25 14:39:56.905742"}]}

Once in this state, cycling osd.193 doesn't help, because the prior_set
is not affected.

Signed-off-by: Sage Weil <sage.weil@dreamhost.com>
Reviewed-by: Samuel Just <samuel.just@dreamhost.com>
src/osd/PG.cc

index ade015804adb5c5a3a4c7c58ece5e2a91306644e..3fc24b82d6b752f390b76c022b06e553969e6cd3 100644 (file)
@@ -4414,64 +4414,77 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in
     if (old_start < pg->info.history.last_epoch_started) {
       dout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
       pg->build_prior(prior_set);
+
+      // filter out any osds that got dropped from the probe set from
+      // peer_info_requested.  this is less expensive than restarting
+      // peering (which would re-probe everyone).
+      set<int>::iterator p = peer_info_requested.begin();
+      while (p != peer_info_requested.end()) {
+       if (prior_set->probe.count(*p) == 0) {
+         dout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
+         peer_info_requested.erase(++p);
+       } else {
+         ++p;
+       }
+      }
       get_infos();
-    } else {
-      // are we done getting everything?
-      if (peer_info_requested.empty() && !prior_set->pg_down) {
-       /*
-        * make sure we have at least one !incomplete() osd from the
-        * last rw interval.  the incomplete (backfilling) replicas
-        * get a copy of the log, but they don't get all the object
-        * updates, so they are insufficient to recover changes during
-        * that interval.
-        */
-       if (pg->info.history.last_epoch_started) {
-         for (map<epoch_t,PG::Interval>::reverse_iterator p = pg->past_intervals.rbegin();
-              p != pg->past_intervals.rend();
-              ++p) {
-           if (p->first < pg->info.history.last_epoch_started)
-             break;
-           if (!p->second.maybe_went_rw)
-             continue;
-           Interval& interval = p->second;
-           dout(10) << " last maybe_went_rw interval was " << interval << dendl;
-           OSDMapRef osdmap = pg->get_osdmap();
-
-           /*
-            * this mirrors the PriorSet calculation: we wait if we
-            * don't have an up (AND !incomplete) node AND there are
-            * nodes down that might be usable.
-            */
-           bool any_up_complete_now = false;
-           bool any_down_now = false;
-           for (unsigned i=0; i<interval.acting.size(); i++) {
-             int o = interval.acting[i];
-             if (!osdmap->exists(o) || osdmap->get_info(o).lost_at > interval.first)
-               continue;  // dne or lost
-             if (osdmap->is_up(o)) {
-               pg_info_t *pinfo;
-               if (o == pg->osd->whoami) {
-                 pinfo = &pg->info;
-               } else {
-                 assert(pg->peer_info.count(o));
-                 pinfo = &pg->peer_info[o];
-               }
-               if (!pinfo->is_incomplete())
-                 any_up_complete_now = true;
+    }
+
+    // are we done getting everything?
+    if (peer_info_requested.empty() && !prior_set->pg_down) {
+      /*
+       * make sure we have at least one !incomplete() osd from the
+       * last rw interval.  the incomplete (backfilling) replicas
+       * get a copy of the log, but they don't get all the object
+       * updates, so they are insufficient to recover changes during
+       * that interval.
+       */
+      if (pg->info.history.last_epoch_started) {
+       for (map<epoch_t,PG::Interval>::reverse_iterator p = pg->past_intervals.rbegin();
+            p != pg->past_intervals.rend();
+            ++p) {
+         if (p->first < pg->info.history.last_epoch_started)
+           break;
+         if (!p->second.maybe_went_rw)
+           continue;
+         Interval& interval = p->second;
+         dout(10) << " last maybe_went_rw interval was " << interval << dendl;
+         OSDMapRef osdmap = pg->get_osdmap();
+
+         /*
+          * this mirrors the PriorSet calculation: we wait if we
+          * don't have an up (AND !incomplete) node AND there are
+          * nodes down that might be usable.
+          */
+         bool any_up_complete_now = false;
+         bool any_down_now = false;
+         for (unsigned i=0; i<interval.acting.size(); i++) {
+           int o = interval.acting[i];
+           if (!osdmap->exists(o) || osdmap->get_info(o).lost_at > interval.first)
+             continue;  // dne or lost
+           if (osdmap->is_up(o)) {
+             pg_info_t *pinfo;
+             if (o == pg->osd->whoami) {
+               pinfo = &pg->info;
              } else {
-               any_down_now = true;
+               assert(pg->peer_info.count(o));
+               pinfo = &pg->peer_info[o];
              }
+             if (!pinfo->is_incomplete())
+               any_up_complete_now = true;
+           } else {
+             any_down_now = true;
            }
-           if (!any_up_complete_now && any_down_now) {
-             dout(10) << " no osds up+complete from interval " << interval << dendl;
-             pg->state_set(PG_STATE_DOWN);
-             return discard_event();
-           }
-           break;
          }
+         if (!any_up_complete_now && any_down_now) {
+           dout(10) << " no osds up+complete from interval " << interval << dendl;
+           pg->state_set(PG_STATE_DOWN);
+           return discard_event();
+         }
+         break;
        }
-       post_event(GotInfo());
       }
+      post_event(GotInfo());
     }
   }
   return discard_event();