git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: fail to peer if interval lacks any !incomplete replicas
author Sage Weil <sage@newdream.net>
Tue, 10 Jan 2012 21:23:00 +0000 (13:23 -0800)
committer Sage Weil <sage@newdream.net>
Tue, 10 Jan 2012 21:23:00 +0000 (13:23 -0800)
We need at least one non-incomplete replica during a rw interval in order
to peer.  The backfilling/incomplete replicas get log entries, but not
all object writes, so they are (mostly) excluded from the peering process
(find_best_info(), in particular).

We can't do this during the PriorSet calculation because we don't have
the replicas' PG::Info yet.  But once we have it, we need to make sure at
least one of the replicas from the last rw interval is not incomplete, or
else we should mark the pg DOWN (just like the PriorSet calculation does).

This logic mostly mirrors that of PriorSet, but additionally requires
that at least one up replica from that interval be !incomplete.

Signed-off-by: Sage Weil <sage@newdream.net>
src/osd/PG.cc

index 05a4f75bb95ce5f75e6ee03ec0b7c19b1a1e941f..e37da4b449752943a4811d378cd065dfa832032a 100644 (file)
@@ -4232,8 +4232,61 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in
       get_infos();
     } else {
       // are we done getting everything?
-      if (peer_info_requested.empty() && !prior_set->pg_down)
+      if (peer_info_requested.empty() && !prior_set->pg_down) {
+       /*
+        * make sure we have at least one !incomplete() osd from the
+        * last rw interval.  the incomplete (backfilling) replicas
+        * get a copy of the log, but they don't get all the object
+        * updates, so they are insufficient to recover changes during
+        * that interval.
+        */
+       if (pg->info.history.last_epoch_started) {
+         for (map<epoch_t,PG::Interval>::reverse_iterator p = pg->past_intervals.rbegin();
+              p != pg->past_intervals.rend();
+              ++p) {
+           if (p->first < pg->info.history.last_epoch_started)
+             break;
+           if (!p->second.maybe_went_rw)
+             continue;
+           Interval& interval = p->second;
+           dout(10) << " last maybe_went_rw interval was " << interval << dendl;
+           OSDMapRef osdmap = pg->get_osdmap();
+
+           /*
+            * this mirrors the PriorSet calculation: we wait if we
+            * don't have an up (AND !incomplete) node AND there are
+            * nodes down that might be usable.
+            */
+           bool any_up_complete_now = false;
+           bool any_down_now = false;
+           for (unsigned i=0; i<interval.acting.size(); i++) {
+             int o = interval.acting[i];
+             if (!osdmap->exists(o) || osdmap->get_info(o).lost_at > interval.first)
+               continue;  // dne or lost
+             if (osdmap->is_up(o)) {
+               PG::Info *pinfo;
+               if (o == pg->osd->whoami) {
+                 pinfo = &pg->info;
+               } else {
+                 assert(pg->peer_info.count(o));
+                 pinfo = &pg->peer_info[o];
+               }
+               if (!pinfo->is_incomplete())
+                 any_up_complete_now = true;
+             } else {
+               any_down_now = true;
+             }
+           }
+           if (!any_up_complete_now && any_down_now) {
+             dout(10) << " no osds up+complete from interval " << interval << dendl;
+             pg->state_set(PG_STATE_DOWN);
+             return discard_event();
+           }
+           break;
+         }
+       }
        post_event(GotInfo());
+      }
     }
   }
   return discard_event();