]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: move may_need_replay calculation out of PriorSet
authorSage Weil <sage@newdream.net>
Fri, 21 Oct 2011 22:23:51 +0000 (15:23 -0700)
committerSage Weil <sage@newdream.net>
Fri, 21 Oct 2011 22:24:36 +0000 (15:24 -0700)
Although they both depend on past intervals, they are unrelated.  Factor
out the may_need_replay calculation from PriorSet.  Instead, do it right
before we activate when we need to decide whether to do a replay window
or not.

Signed-off-by: Sage Weil <sage@newdream.net>
src/osd/PG.cc
src/osd/PG.h

index 83b48cce566f028197d9d26d98669939b8fd7ad8..96c49258e64da912760ebcab8b5460fad68bc55e 100644 (file)
@@ -1044,9 +1044,6 @@ void PG::build_prior(std::auto_ptr<PriorSet> &prior_set)
                                 this));
   PriorSet &prior(*prior_set.get());
                                 
-  if (prior.crashed) {
-    state_set(PG_STATE_REPLAY);
-  }
   if (prior.pg_down) {
     state_set(PG_STATE_DOWN);
   }
@@ -1329,12 +1326,13 @@ void PG::activate(ObjectStore::Transaction& t, list<Context*>& tfin,
   assert(!is_active());
 
   // -- crash recovery?
-  if (is_replay()) {
+  if (may_need_replay(osd->osdmap)) {
     if (g_conf->osd_replay_window > 0) {
       replay_until = ceph_clock_now(g_ceph_context);
       replay_until += g_conf->osd_replay_window;
       dout(10) << "crashed, allowing op replay for " << g_conf->osd_replay_window
               << " until " << replay_until << dendl;
+      state_set(PG_STATE_REPLAY);
       osd->replay_queue_lock.Lock();
       osd->replay_queue.push_back(pair<pg_t,utime_t>(info.pgid, replay_until));
       osd->replay_queue_lock.Unlock();
@@ -3329,6 +3327,85 @@ void PG::fulfill_log(int from, const Query &query, epoch_t query_epoch)
   }
 }
 
+
+// true if all OSDs in prior intervals may have crashed, and we need to replay
+// false positives are okay, false negatives are not.
+bool PG::may_need_replay(const OSDMap *osdmap) const
+{
+  bool crashed = false;
+
+  for (map<epoch_t,Interval>::const_reverse_iterator p = past_intervals.rbegin();
+       p != past_intervals.rend();
+       p++) {
+    const Interval &interval = p->second;
+    dout(10) << "may_need_replay " << interval << dendl;
+
+    if (interval.last < info.history.last_epoch_started)
+      break;  // we don't care
+
+    if (interval.acting.empty())
+      continue;
+
+    if (!interval.maybe_went_rw)
+      continue;
+
+    // look at whether any of the osds during this interval survived
+    // past the end of the interval (i.e., didn't crash and
+    // potentially fail to COMMIT a write that it ACKed).
+    bool any_survived_interval = false;
+
+    // consider ACTING osds
+    for (unsigned i=0; i<interval.acting.size(); i++) {
+      int o = interval.acting[i];
+
+      const osd_info_t *pinfo = 0;
+      if (osdmap->exists(o))
+       pinfo = &osdmap->get_info(o);
+
+      // does this osd appear to have survived through the end of the
+      // interval?
+      if (pinfo) {
+       if (pinfo->up_from <= interval.first && pinfo->up_thru > interval.last) {
+         dout(10) << "may_need_replay  osd." << o
+                  << " up_from " << pinfo->up_from << " up_thru " << pinfo->up_thru
+                  << " survived the interval" << dendl;
+         any_survived_interval = true;
+       }
+       else if (pinfo->up_from <= interval.first &&
+                (std::find(acting.begin(), acting.end(), o) != acting.end() ||
+                 std::find(up.begin(), up.end(), o) != up.end())) {
+         dout(10) << "may_need_replay  osd." << o
+                  << " up_from " << pinfo->up_from << " and is in acting|up,"
+                  << " assumed to have survived the interval" << dendl;
+         // (if it hasn't, we will rebuild PriorSet)
+         any_survived_interval = true;
+       }
+       else if (pinfo->up_from > interval.last &&
+                pinfo->last_clean_begin <= interval.first &&
+                pinfo->last_clean_end > interval.last) {
+         dout(10) << "may_need_replay  prior osd." << o
+                  << " up_from " << pinfo->up_from
+                  << " and last clean interval ["
+                  << pinfo->last_clean_begin << "," << pinfo->last_clean_end
+                  << ") survived the interval" << dendl;
+         any_survived_interval = true;
+       }
+      }
+    }
+
+    if (!any_survived_interval) {
+      dout(3) << "may_need_replay  no known survivors of interval "
+             << interval.first << "-" << interval.last
+             << ", may need replay" << dendl;
+      crashed = true;
+      break;
+    }
+  }
+
+  return crashed;
+}
+
+
 bool PG::acting_up_affected(const vector<int>& newup, const vector<int>& newacting)
 {
   if (acting != newacting || up != newup) {
@@ -4671,7 +4748,7 @@ PG::PriorSet::PriorSet(const OSDMap &osdmap,
                       const vector<int> &acting,
                       const PG::Info &info,
                       const PG *debug_pg)
-  : crashed(false), pg_down(false)
+  : pg_down(false)
 {
   /*
    * We have to be careful to gracefully deal with situations like
@@ -4739,6 +4816,9 @@ PG::PriorSet::PriorSet(const OSDMap &osdmap,
     if (interval.acting.empty())
       continue;
 
+    if (!interval.maybe_went_rw)
+      continue;
+
     // look at candidate osds during this interval.  each falls into
     // one of three categories: up, down (but potentially
     // interesting), or lost (down, but we won't wait for it).
@@ -4746,11 +4826,6 @@ PG::PriorSet::PriorSet(const OSDMap &osdmap,
     bool any_down_now = false;  // any candidates down now (that might have useful data)
     bool any_lost_now = false;  // any candidates lost now (that we will ignore)
 
-    // look at whether any of the osds during this interval survived
-    // past the end of the interval (i.e., didn't crash and
-    // potentially fail to COMMIT a write that it ACKed).
-    bool any_survived_interval = false;
-
     // consider ACTING osds
     for (unsigned i=0; i<interval.acting.size(); i++) {
       int o = interval.acting[i];
@@ -4759,36 +4834,6 @@ PG::PriorSet::PriorSet(const OSDMap &osdmap,
       if (osdmap.exists(o))
        pinfo = &osdmap.get_info(o);
 
-      // does this osd appear to have survived through the end of the
-      // interval?
-      if (pinfo) {
-       if (pinfo->up_from <= interval.first && pinfo->up_thru > interval.last) {
-         dout(10) << "build_prior  osd." << o
-                  << " up_from " << pinfo->up_from << " up_thru " << pinfo->up_thru
-                  << " survived the interval" << dendl;
-         any_survived_interval = true;
-       }
-       else if (pinfo->up_from <= interval.first &&
-                (std::find(acting.begin(), acting.end(), o) != acting.end() ||
-                 std::find(up.begin(), up.end(), o) != up.end())) {
-         dout(10) << "build_prior  osd." << o
-                  << " up_from " << pinfo->up_from << " and is in acting|up,"
-                  << " assumed to have survived the interval" << dendl;
-         // (if it hasn't, we will rebuild PriorSet)
-         any_survived_interval = true;
-       }
-       else if (pinfo->up_from > interval.last &&
-                pinfo->last_clean_begin <= interval.first &&
-                pinfo->last_clean_end > interval.last) {
-         dout(10) << "build_prior  prior osd." << o
-                  << " up_from " << pinfo->up_from
-                  << " and last clean interval ["
-                  << pinfo->last_clean_begin << "," << pinfo->last_clean_end
-                  << ") survived the interval" << dendl;
-         any_survived_interval = true;
-       }
-      }
-
       if (osdmap.is_up(o)) {
        // include past acting osds if they are up.
        probe.insert(o);
@@ -4811,7 +4856,7 @@ PG::PriorSet::PriorSet(const OSDMap &osdmap,
     // if nobody survived this interval, and we may have gone rw,
     // then we need to wait for one of those osds to recover to
     // ensure that we haven't lost any information.
-    if (!any_up_now && any_down_now && interval.maybe_went_rw) {
+    if (!any_up_now && any_down_now) {
       // fixme: how do we identify a "clean" shutdown anyway?
       dout(10) << "build_prior  possibly went active+rw, none up; including down osds" << dendl;
       for (vector<int>::const_iterator i = interval.acting.begin();
@@ -4828,21 +4873,11 @@ PG::PriorSet::PriorSet(const OSDMap &osdmap,
        }
       }
     }
-
-    // if we may have gone rw during this interval, but nobody is
-    // known to have survived through the completion of that interval,
-    // mark the PG crashed so that writers who got ACK but not COMMIT
-    // have a chance to replay their requests and preserve ordering.
-    if (interval.maybe_went_rw && !any_survived_interval) {
-      dout(10) << "build_prior  no acting survived past the interval, marking crashed" << dendl;
-      crashed = true;
-    }
   }
 
   dout(10) << "build_prior final: probe " << probe
           << " down " << down
           << " blocked_by " << blocked_by
-          << (crashed ? " crashed":"")
           << (pg_down ? " pg_down":"")
           << dendl;
 }
index aee0447c3fb729a70f6039e2bfab584e74bc7943..937a37bd30297f693a9c5ca011274db66b0d1eb8 100644 (file)
@@ -878,7 +878,6 @@ public:
     set<int> down;  /// down osds that would normally be in @probe and might be interesting.
     map<int,epoch_t> blocked_by;  /// current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
 
-    bool crashed;   /// true if past osd failures were such that clients may need to replay requests.
     bool pg_down;   /// some down osds are included in @cur; the DOWN pg state bit should be set.
     PriorSet(const OSDMap &osdmap,
             const map<epoch_t, Interval> &past_intervals,
@@ -893,6 +892,9 @@ public:
   friend std::ostream& operator<<(std::ostream& oss,
                                  const struct PriorSet &prior);
 
+  bool may_need_replay(const OSDMap *osdmap) const;
+
+
 public:    
   struct RecoveryCtx {
     utime_t start_time;