osd: Backfill peers should not be included in the acting set

author David Zafman <david.zafman@inktank.com>

Fri, 11 Oct 2013 22:53:49 +0000 (15:53 -0700)

committer David Zafman <david.zafman@inktank.com>

Tue, 19 Nov 2013 22:45:20 +0000 (14:45 -0800)
author David Zafman <david.zafman@inktank.com>
Fri, 11 Oct 2013 22:53:49 +0000 (15:53 -0700)
committer David Zafman <david.zafman@inktank.com>
Tue, 19 Nov 2013 22:45:20 +0000 (14:45 -0800)
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc

index 444fe33dcb8c1a558864e807162878fffe001ff9..efb83885c433af0415cfd890f533d6b420e4edea 100644 (file)
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -6011,8 +6011,11 @@ void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg)
  bool OSD::compat_must_dispatch_immediately(PG *pg)
  {
    assert(pg->is_locked());
-  for (vector<int>::iterator i = pg->acting.begin();
-       i != pg->acting.end();
+  vector<int> *tmpacting = &pg->acting;
+  if (pg->actingbackfill.size() > 0)
+    tmpacting = &pg->actingbackfill;
+  for (vector<int>::iterator i = tmpacting->begin();
+       i != tmpacting->end();
         ++i) {
      if (*i == whoami)
        continue;
diff --git a/src/osd/PG.cc b/src/osd/PG.cc

index 2f5026c2e2d9fd4bcbb08d5155ecae81b80aa91d..4bcd5051d027721204069b78e5d77107bec43b28 100644 (file)
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -169,7 +169,6 @@ PG::PG(OSDService *o, OSDMapRef curmap,
    need_up_thru(false),
    last_peering_reset(0),
    heartbeat_peer_lock("PG::heartbeat_peer_lock"),
-  backfill_target(-1),
    backfill_reserved(0),
    backfill_reserving(0),
    flushes_in_progress(0),
@@ -483,8 +482,9 @@ bool PG::needs_recovery() const
      ret = true;
    }
  
-  vector<int>::const_iterator end = acting.end();
-  vector<int>::const_iterator a = acting.begin();
+  assert(actingbackfill.size() > 0);
+  vector<int>::const_iterator end = actingbackfill.end();
+  vector<int>::const_iterator a = actingbackfill.begin();
    assert(a != end);
    ++a;
    for (; a != end; ++a) {
@@ -512,10 +512,10 @@ bool PG::needs_backfill() const
  
    bool ret = false;
  
-  vector<int>::const_iterator end = acting.end();
-  vector<int>::const_iterator a = acting.begin();
-  assert(a != end);
-  ++a;
+  // We can assume that the only osds that may need backfill
+  // are those on the backfill_targets vector.
+  vector<int>::const_iterator end = backfill_targets.end();
+  vector<int>::const_iterator a = backfill_targets.begin();
    for (; a != end; ++a) {
      int peer = *a;
      map<int,pg_info_t>::const_iterator pi = peer_info.find(peer);
@@ -843,7 +843,7 @@ map<int, pg_info_t>::const_iterator PG::find_best_info(const map<int, pg_info_t>
   * incomplete, or another osd has a longer tail that allows us to
   * bring other up nodes up to date.
   */
-bool PG::calc_acting(int& newest_update_osd_id, vector<int>& want) const
+bool PG::calc_acting(int& newest_update_osd_id, vector<int>& want, vector<int>& backfill) const
  {
    map<int, pg_info_t> all_info(peer_info.begin(), peer_info.end());
    all_info[osd->whoami] = info;
@@ -907,7 +907,6 @@ bool PG::calc_acting(int& newest_update_osd_id, vector<int>& want) const
            << " with " << primary->second << dendl;
    want.push_back(primary->first);
    unsigned usable = 1;
-  unsigned backfill = 0;
  
    // select replicas that have log contiguity with primary.
    // prefer up, then acting, then any peer_info osds 
@@ -918,13 +917,8 @@ bool PG::calc_acting(int& newest_update_osd_id, vector<int>& want) const
        continue;
      const pg_info_t &cur_info = all_info.find(*i)->second;
      if (cur_info.is_incomplete() || cur_info.last_update < primary->second.log_tail) {
-      if (backfill < 1) {
-       dout(10) << " osd." << *i << " (up) accepted (backfill) " << cur_info << dendl;
-       want.push_back(*i);
-       backfill++;
-      } else {
-       dout(10) << " osd." << *i << " (up) rejected" << cur_info << dendl;
-      }
+      dout(10) << " osd." << *i << " (up) backfill " << cur_info << dendl;
+      backfill.push_back(*i);
      } else {
        want.push_back(*i);
        usable++;
@@ -932,6 +926,7 @@ bool PG::calc_acting(int& newest_update_osd_id, vector<int>& want) const
      }
    }
  
+  // This no longer has backfill OSDs, but they are covered above.
    for (vector<int>::const_iterator i = acting.begin();
         i != acting.end();
         ++i) {
@@ -991,15 +986,23 @@ bool PG::calc_acting(int& newest_update_osd_id, vector<int>& want) const
   */
  bool PG::choose_acting(int& newest_update_osd)
  {
-  vector<int> want;
+  vector<int> want, backfill;
  
-  if (!calc_acting(newest_update_osd, want)) {
+  if (!calc_acting(newest_update_osd, want, backfill)) {
      dout(10) << "choose_acting failed" << dendl;
      assert(want_acting.empty());
      return false;
    }
  
-  if (want.size() < pool.info.min_size) {
+  // For now we only backfill 1 at a time as before
+  if (!backfill.empty())
+    backfill.resize(1);
+
+  // This might cause a problem if min_size is large
+  // and we need to backfill more than 1 osd.  Older
+  // code would only include 1 backfill osd and now we
+  // have the resize above.
+  if (want.size() + backfill.size() < pool.info.min_size) {
      want_acting.clear();
      return false;
    }
@@ -1008,16 +1011,39 @@ bool PG::choose_acting(int& newest_update_osd)
      dout(10) << "choose_acting want " << want << " != acting " << acting
              << ", requesting pg_temp change" << dendl;
      want_acting = want;
+
      if (want == up) {
+      // There can't be any pending backfill if
+      // want is the same as crush map up OSDs.
+      assert(backfill.empty());
        vector<int> empty;
        osd->queue_want_pg_temp(info.pgid, empty);
      } else
        osd->queue_want_pg_temp(info.pgid, want);
      return false;
+  }
+  want_acting.clear();
+  // We can only get here when a new interval has arrived and
+  // we've accepted the acting set.  Now we can create the
+  // actingbackfill and backfill_targets vectors.
+  actingbackfill = acting;
+  actingbackfill.insert(actingbackfill.end(), backfill.begin(), backfill.end());
+  assert(backfill_targets.empty() || backfill_targets == backfill);
+  if (backfill_targets.empty()) {
+    backfill_targets = backfill;
+    for (unsigned i = 0; i < backfill.size() ; ++i) {
+      stray_set.erase(backfill[i]);
+    }
    } else {
-    want_acting.clear();
+    // Will not change if already set because up would have had to change
+    assert(backfill_targets == backfill);
+    // Verify that nothing in backfill is in stray_set
+    for (unsigned i = 0; i < backfill.size() ; ++i) {
+      assert(stray_set.find(backfill[i]) == stray_set.end());
+    }
    }
-  dout(10) << "choose_acting want " << want << " (== acting)" << dendl;
+  dout(10) << "choose_acting want " << want << " (== acting) backfill_targets " 
+    << backfill << dendl;
    return true;
  }
  
@@ -1181,8 +1207,9 @@ void PG::activate(ObjectStore::Transaction& t,
      // count replicas that are not backfilling
      unsigned active = 1;
  
-    for (unsigned i=1; i<acting.size(); i++) {
-      int peer = acting[i];
+    assert(actingbackfill.size() > 0);
+    for (unsigned i=1; i<actingbackfill.size(); i++) {
+      int peer = actingbackfill[i];
        assert(peer_info.count(peer));
        pg_info_t& pi = peer_info[peer];
  
@@ -1272,8 +1299,10 @@ void PG::activate(ObjectStore::Transaction& t,
        }
      }
  
+    assert(active == acting.size());
+
      // degraded?
-    if (get_osdmap()->get_pg_size(info.pgid) > active)
+    if (get_osdmap()->get_pg_size(info.pgid) > acting.size())
        state_set(PG_STATE_DEGRADED);
  
      // all clean?
@@ -1422,7 +1451,8 @@ void PG::_activate_committed(epoch_t e)
      dout(10) << "_activate_committed " << e << " peer_activated now " << peer_activated 
              << " last_epoch_started " << info.history.last_epoch_started
              << " same_interval_since " << info.history.same_interval_since << dendl;
-    if (peer_activated.size() == acting.size())
+    assert(actingbackfill.size() > 0);
+    if (peer_activated.size() == actingbackfill.size())
        all_activated_and_committed();
    } else {
      dout(10) << "_activate_committed " << e << " telling primary" << dendl;
@@ -1454,7 +1484,8 @@ void PG::all_activated_and_committed()
  {
    dout(10) << "all_activated_and_committed" << dendl;
    assert(is_primary());
-  assert(peer_activated.size() == acting.size());
+  assert(peer_activated.size() == actingbackfill.size());
+  assert(actingbackfill.size() > 0);
  
    // info.last_epoch_started is set during activate()
    info.history.last_epoch_started = info.last_epoch_started;
@@ -1689,6 +1720,7 @@ void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
  
    child->snap_trimq = snap_trimq;
  
+  // There can't be recovery/backfill going on now
    get_osdmap()->pg_to_up_acting_osds(child->info.pgid, child->up, child->acting);
    child->role = get_osdmap()->calc_pg_role(osd->whoami, child->acting);
    if (get_primary() != child->get_primary())
@@ -1721,7 +1753,7 @@ void PG::clear_recovery_state()
      finish_recovery_op(soid, true);
    }
  
-  backfill_target = -1;
+  backfill_targets.clear();
    backfill_info.clear();
    peer_backfill_info.clear();
    waiting_on_backfill = false;
@@ -1860,7 +1892,7 @@ void PG::publish_stats_to_osd()
      pg_stats_publish.stats.add(unstable_stats);
  
      // calc copies, degraded
-    unsigned target = MAX(get_osdmap()->get_pg_size(info.pgid), acting.size());
+    unsigned target = MAX(get_osdmap()->get_pg_size(info.pgid), actingbackfill.size());
      pg_stats_publish.stats.calc_copies(target);
      pg_stats_publish.stats.sum.num_objects_degraded = 0;
      if ((is_degraded() || !is_clean()) && is_active()) {
@@ -1870,23 +1902,24 @@ void PG::publish_stats_to_osd()
  
        uint64_t degraded = 0;
  
-      // if the acting set is smaller than we want, add in those missing replicas
-      if (acting.size() < target)
-       degraded += (target - acting.size()) * num_objects;
+      // if the actingbackfill set is smaller than we want, add in those missing replicas
+      if (actingbackfill.size() < target)
+       degraded += (target - actingbackfill.size()) * num_objects;
  
        // missing on primary
        pg_stats_publish.stats.sum.num_objects_missing_on_primary =
         pg_log.get_missing().num_missing();
        degraded += pg_log.get_missing().num_missing();
        
-      for (unsigned i=1; i<acting.size(); i++) {
-       assert(peer_missing.count(acting[i]));
+      assert(actingbackfill.size() > 0);
+      for (unsigned i=1; i<actingbackfill.size(); i++) {
+       assert(peer_missing.count(actingbackfill[i]));
  
         // in missing set
-       degraded += peer_missing[acting[i]].num_missing();
+       degraded += peer_missing[actingbackfill[i]].num_missing();
  
         // not yet backfilled
-       degraded += num_objects - peer_info[acting[i]].stats.stats.sum.num_objects;
+       degraded += num_objects - peer_info[actingbackfill[i]].stats.stats.sum.num_objects;
        }
        pg_stats_publish.stats.sum.num_objects_degraded = degraded;
        pg_stats_publish.stats.sum.num_objects_unfound = get_num_unfound();
@@ -2198,11 +2231,13 @@ void PG::write_if_dirty(ObjectStore::Transaction& t)
  
  void PG::trim_peers()
  {
+  assert(is_primary());
    calc_trim_to();
    dout(10) << "trim_peers " << pg_trim_to << dendl;
    if (pg_trim_to != eversion_t()) {
-    for (unsigned i=1; i<acting.size(); i++)
-      osd->send_message_osd_cluster(acting[i],
+    assert(actingbackfill.size() > 0);
+    for (unsigned i=1; i<actingbackfill.size(); i++)
+      osd->send_message_osd_cluster(actingbackfill[i],
                                     new MOSDPGTrim(get_osdmap()->get_epoch(), info.pgid,
                                                    pg_trim_to),
                                     get_osdmap()->get_epoch());
@@ -2899,6 +2934,7 @@ void PG::clear_scrub_reserved()
  
  void PG::scrub_reserve_replicas()
  {
+  assert(backfill_targets.empty());
    for (unsigned i=1; i<acting.size(); i++) {
      dout(10) << "scrub requesting reserve from osd." << acting[i] << dendl;
      vector<OSDOp> scrub(1);
@@ -2915,6 +2951,7 @@ void PG::scrub_reserve_replicas()
  
  void PG::scrub_unreserve_replicas()
  {
+  assert(backfill_targets.empty());
    for (unsigned i=1; i<acting.size(); i++) {
      dout(10) << "scrub requesting unreserve from osd." << acting[i] << dendl;
      vector<OSDOp> scrub(1);
@@ -3253,6 +3290,7 @@ void PG::scrub(ThreadPool::TPHandle &handle)
    if (!scrubber.active) {
      OSDMapRef curmap = osd->get_osdmap();
      scrubber.is_chunky = true;
+    assert(backfill_targets.empty());
      for (unsigned i=1; i<acting.size(); i++) {
        ConnectionRef con = osd->get_con_osd_cluster(acting[i], get_osdmap()->get_epoch());
        if (!con)
@@ -4191,8 +4229,9 @@ void PG::share_pg_info()
    dout(10) << "share_pg_info" << dendl;
  
    // share new pg_info_t with replicas
-  for (unsigned i=1; i<acting.size(); i++) {
-    int peer = acting[i];
+  assert(actingbackfill.size() > 0);
+  for (unsigned i=1; i<actingbackfill.size(); i++) {
+    int peer = actingbackfill[i];
      if (peer_info.count(i)) {
        peer_info[i].last_epoch_started = info.last_epoch_started;
        peer_info[i].history.merge(info.history);
@@ -4219,9 +4258,9 @@ void PG::share_pg_log()
    dout(10) << __func__ << dendl;
    assert(is_primary());
  
-  vector<int>::const_iterator a = acting.begin();
-  assert(a != acting.end());
-  vector<int>::const_iterator end = acting.end();
+  vector<int>::const_iterator a = actingbackfill.begin();
+  assert(a != actingbackfill.end());
+  vector<int>::const_iterator end = actingbackfill.end();
    while (++a != end) {
      int peer(*a);
      pg_missing_t& pmissing(peer_missing[peer]);
@@ -4464,6 +4503,8 @@ void PG::start_peering_interval(const OSDMapRef lastmap,
      info.stats.mapping_epoch = info.history.same_interval_since;
    }
  
+  // The PG will now be marked remapped during a backfill in
+  // cases where it would not have been before.
    if (up != acting)
      state_set(PG_STATE_REMAPPED);
    else
@@ -4521,6 +4562,7 @@ void PG::start_peering_interval(const OSDMapRef lastmap,
  
    peer_missing.clear();
    peer_purged.clear();
+  actingbackfill.clear();
  
    // reset primary state?
    if (oldrole == 0 || get_role() == 0)
@@ -5410,7 +5452,7 @@ PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_con
    PG *pg = context< RecoveryMachine >().pg;
    pg->state_set(PG_STATE_BACKFILL_WAIT);
    ConnectionRef con = pg->osd->get_con_osd_cluster(
-    pg->backfill_target, pg->get_osdmap()->get_epoch());
+    pg->get_backfill_target(), pg->get_osdmap()->get_epoch());
    if (con) {
      if (con->has_feature(CEPH_FEATURE_BACKFILL_RESERVATION)) {
        unsigned priority = pg->is_degraded() ? OSDService::BACKFILL_HIGH
@@ -5795,7 +5837,8 @@ PG::RecoveryState::Recovered::Recovered(my_context ctx)
  
    // if we finished backfill, all acting are active; recheck if
    // DEGRADED is appropriate.
-  if (pg->get_osdmap()->get_pg_size(pg->info.pgid) <= pg->acting.size())
+  assert(pg->actingbackfill.size() > 0);
+  if (pg->get_osdmap()->get_pg_size(pg->info.pgid) <= pg->actingbackfill.size())
      pg->state_clear(PG_STATE_DEGRADED);
  
    // adjust acting set?  (e.g. because backfill completed...)
@@ -5898,10 +5941,7 @@ boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
     * this does not matter) */
    if (advmap.lastmap->get_pg_size(pg->info.pgid) !=
        pg->get_osdmap()->get_pg_size(pg->info.pgid)) {
-    unsigned active = pg->acting.size();
-    if (pg->backfill_target != -1)
-      --active;
-    if (pg->get_osdmap()->get_pg_size(pg->info.pgid) <= active)
+    if (pg->get_osdmap()->get_pg_size(pg->info.pgid) <= pg->acting.size())
        pg->state_clear(PG_STATE_DEGRADED);
      else
        pg->state_set(PG_STATE_DEGRADED);
@@ -5988,10 +6028,11 @@ boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoe
    assert(pg->is_active());
    assert(pg->is_primary());
  
+  assert(pg->actingbackfill.size() > 0);
    // don't update history (yet) if we are active and primary; the replica
    // may be telling us they have activated (and committed) but we can't
    // share that until _everyone_ does the same.
-  if (pg->is_acting(infoevt.from)) {
+  if (pg->is_actingbackfill(infoevt.from)) {
      assert(pg->info.history.last_epoch_started < 
            pg->info.history.same_interval_since);
      assert(infoevt.info.history.last_epoch_started >= 
@@ -6001,7 +6042,7 @@ boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoe
      pg->peer_activated.insert(infoevt.from);
    }
  
-  if (pg->peer_activated.size() == pg->acting.size()) {
+  if (pg->peer_activated.size() == pg->actingbackfill.size()) {
      pg->all_activated_and_committed();
    }
    return discard_event();
@@ -6505,7 +6546,9 @@ PG::RecoveryState::GetLog::GetLog(my_context ctx)
  
    // how much log to request?
    eversion_t request_log_from = pg->info.last_update;
-  for (vector<int>::iterator p = pg->acting.begin() + 1; p != pg->acting.end(); ++p) {
+  assert(pg->actingbackfill.size() > 0);
+  for (vector<int>::iterator p = pg->actingbackfill.begin() + 1;
+          p != pg->actingbackfill.end(); ++p) {
      pg_info_t& ri = pg->peer_info[*p];
      if (ri.last_update >= best.log_tail && ri.last_update < request_log_from)
        request_log_from = ri.last_update;
@@ -6689,8 +6732,9 @@ PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
    context< RecoveryMachine >().log_enter(state_name);
  
    PG *pg = context< RecoveryMachine >().pg;
-  for (vector<int>::iterator i = pg->acting.begin() + 1;
-       i != pg->acting.end();
+  assert(pg->actingbackfill.size() > 0);
+  for (vector<int>::iterator i = pg->actingbackfill.begin() + 1;
+       i != pg->actingbackfill.end();
         ++i) {
      const pg_info_t& pi = pg->peer_info[*i];
  
diff --git a/src/osd/PG.h b/src/osd/PG.h

index 2f54cd7c4c7c068ae94eee6102d6920b2651b0de..8106ea073b7b3934eb4c4895e20fe3fd21ee5bab 100644 (file)
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -333,7 +333,7 @@ public:
  
    // primary state
   public:
-  vector<int> up, acting, want_acting;
+  vector<int> up, acting, want_acting, actingbackfill;
    map<int,eversion_t> peer_last_complete_ondisk;
    eversion_t  min_last_complete_ondisk;  // up: min over last_complete_ondisk, peer_last_complete_ondisk
    eversion_t  pg_trim_to;
@@ -507,14 +507,18 @@ protected:
    
    BackfillInterval backfill_info;
    BackfillInterval peer_backfill_info;
-  int backfill_target;
+  vector<int> backfill_targets;
    bool backfill_reserved;
    bool backfill_reserving;
  
    friend class OSD;
  
  public:
+  // Compatibility with single backfill target code
    int get_backfill_target() const {
+    int backfill_target = -1;
+    if (backfill_targets.size() > 0)
+      backfill_target = backfill_targets[0];
      return backfill_target;
    }
  
@@ -568,6 +572,11 @@ public:
        if (up[i] == osd) return true;
      return false;
    }
+  bool is_actingbackfill(int osd) const {
+    for (unsigned i=0; i<actingbackfill.size(); i++)
+      if (actingbackfill[i] == osd) return true;
+    return false;
+  }
    
    bool needs_recovery() const;
    bool needs_backfill() const;
@@ -589,10 +598,11 @@ public:
  
    bool calc_min_last_complete_ondisk() {
      eversion_t min = last_complete_ondisk;
-    for (unsigned i=1; i<acting.size(); i++) {
-      if (peer_last_complete_ondisk.count(acting[i]) == 0)
+    assert(actingbackfill.size() > 0);
+    for (unsigned i=1; i<actingbackfill.size(); i++) {
+      if (peer_last_complete_ondisk.count(actingbackfill[i]) == 0)
         return false;   // we don't have complete info
-      eversion_t a = peer_last_complete_ondisk[acting[i]];
+      eversion_t a = peer_last_complete_ondisk[actingbackfill[i]];
        if (a < min)
         min = a;
      }
@@ -624,7 +634,7 @@ public:
    void trim_write_ahead();
  
    map<int, pg_info_t>::const_iterator find_best_info(const map<int, pg_info_t> &infos) const;
-  bool calc_acting(int& newest_update_osd, vector<int>& want) const;
+  bool calc_acting(int& newest_update_osd, vector<int>& want, vector<int>& backfill) const;
    bool choose_acting(int& newest_update_osd);
    void build_might_have_unfound();
    void replay_queued_ops();
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h

index 42959664ea89b3f8b3f8d90f12860b424072598c..07496898b5d485a1ddef6a8743c99976d9a8414c 100644 (file)
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -95,7 +95,7 @@
       virtual void send_message(int to_osd, Message *m) = 0;
       virtual void queue_transaction(ObjectStore::Transaction *t) = 0;
       virtual epoch_t get_epoch() = 0;
-     virtual const vector<int> &get_acting() = 0;
+     virtual const vector<int> &get_actingbackfill() = 0;
       virtual std::string gen_dbg_prefix() const = 0;
  
       virtual const map<hobject_t, set<int> > &get_missing_loc() = 0;
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc

index c91ed68505ac3ff7d0288206c148c39f2f698bba..4a2f40566752dc1c4fb4b48535cf9d4ba92ad8ba 100644 (file)
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -224,7 +224,7 @@ void ReplicatedPG::on_peer_recover(
    publish_stats_to_osd();
    // done!
    peer_missing[peer].got(soid, recovery_info.version);
-  if (peer == backfill_target && backfills_in_flight.count(soid)) {
+  if (peer == get_backfill_target() && backfills_in_flight.count(soid)) {
      map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
      assert(i != recovering.end());
      list<OpRequestRef> requeue_list;
@@ -305,15 +305,16 @@ bool ReplicatedPG::is_degraded_object(const hobject_t& soid)
  {
    if (pg_log.get_missing().missing.count(soid))
      return true;
-  for (unsigned i = 1; i < acting.size(); i++) {
-    int peer = acting[i];
+  assert(actingbackfill.size() > 0);
+  for (unsigned i = 1; i < actingbackfill.size(); i++) {
+    int peer = actingbackfill[i];
      if (peer_missing.count(peer) &&
         peer_missing[peer].missing.count(soid))
        return true;
  
      // Object is degraded if after last_backfill AND
      // we are backfilling it
-    if (peer == backfill_target &&
+    if (peer == get_backfill_target() &&
         peer_info[peer].last_backfill <= soid &&
         last_backfill_started >= soid &&
         backfills_in_flight.count(soid))
@@ -338,8 +339,9 @@ void ReplicatedPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef
             << ", recovering"
             << dendl;
      eversion_t v;
-    for (unsigned i = 1; i < acting.size(); i++) {
-      int peer = acting[i];
+    assert(actingbackfill.size() > 0);
+    for (unsigned i = 1; i < actingbackfill.size(); i++) {
+      int peer = actingbackfill[i];
        if (peer_missing.count(peer) &&
           peer_missing[peer].missing.count(soid)) {
         v = peer_missing[peer].missing[soid].need;
@@ -462,6 +464,18 @@ int ReplicatedPG::do_command(cmdmap_t cmdmap, ostream& ss,
      for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
        f->dump_unsigned("osd", *p);
      f->close_section();
+    if (backfill_targets.size() > 0) {
+      f->open_array_section("backfill_targets");
+      for (vector<int>::iterator p = backfill_targets.begin(); p != backfill_targets.end(); ++p)
+        f->dump_unsigned("osd", *p);
+      f->close_section();
+    }
+    if (actingbackfill.size() > 0) {
+      f->open_array_section("actingbackfill");
+      for (vector<int>::iterator p = actingbackfill.begin(); p != actingbackfill.end(); ++p)
+        f->dump_unsigned("osd", *p);
+      f->close_section();
+    }
      f->open_object_section("info");
      info.dump(f.get());
      f->close_section();
@@ -1029,6 +1043,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
    // opposite is not a problem; if the target is after the line, we
    // don't apply on the backfill_target and it doesn't matter.)
    pg_info_t *backfill_target_info = NULL;
+  int backfill_target = get_backfill_target();
    bool before_backfill = false;
    if (backfill_target >= 0) {
      backfill_target_info = &peer_info[backfill_target];
@@ -1600,7 +1615,7 @@ void ReplicatedPG::do_scan(
    case MOSDPGScan::OP_SCAN_DIGEST:
      {
        int from = m->get_source().num();
-      assert(from == backfill_target);
+      assert(from == get_backfill_target());
        BackfillInterval& bi = peer_backfill_info;
        bi.begin = m->begin;
        bi.end = m->end;
@@ -1778,7 +1793,6 @@ void ReplicatedPG::do_backfill(OpRequestRef op)
    switch (m->op) {
    case MOSDPGBackfill::OP_BACKFILL_FINISH:
      {
-      assert(is_replica());
        assert(cct->_conf->osd_kill_backfill_at != 1);
  
        MOSDPGBackfill *reply = new MOSDPGBackfill(MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
@@ -1797,7 +1811,6 @@ void ReplicatedPG::do_backfill(OpRequestRef op)
  
    case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
      {
-      assert(is_replica());
        assert(cct->_conf->osd_kill_backfill_at != 2);
  
        info.last_backfill = m->last_backfill;
@@ -4247,6 +4260,7 @@ int ReplicatedPG::prepare_transaction(OpContext *ctx)
    ctx->obc->ssc->snapset = ctx->new_snapset;
    info.stats.stats.add(ctx->delta_stats, ctx->obc->obs.oi.category);
  
+  int backfill_target = get_backfill_target();
    if (backfill_target >= 0) {
      pg_info_t& pinfo = peer_info[backfill_target];
      if (soid <= pinfo.last_backfill)
@@ -4903,13 +4917,14 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now)
  
    int acks_wanted = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
  
-  if (ctx->op && acting.size() > 1) {
+  assert(actingbackfill.size() > 0);
+  if (ctx->op && actingbackfill.size() > 1) {
      ostringstream ss;
-    ss << "waiting for subops from " << vector<int>(acting.begin() + 1, acting.end());
+    ss << "waiting for subops from " << vector<int>(actingbackfill.begin() + 1, actingbackfill.end());
      ctx->op->mark_sub_op_sent(ss.str());
    }
-  for (unsigned i=1; i<acting.size(); i++) {
-    int peer = acting[i];
+  for (unsigned i=1; i<actingbackfill.size(); i++) {
+    int peer = actingbackfill[i];
      pg_info_t &pinfo = peer_info[peer];
  
      repop->waitfor_ack.insert(peer);
@@ -4926,6 +4941,7 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now)
        assert(0 == "broken implementation, do not use");
      }
  
+    int backfill_target = get_backfill_target();
      // ship resulting transaction, log entries, and pg_stats
      if (peer == backfill_target && soid > last_backfill_started &&
          // only skip normal (not temp pool=-1) objects
@@ -5567,7 +5583,6 @@ void ReplicatedPG::sub_op_modify(OpRequestRef op)
    // sanity checks
    assert(m->map_epoch >= info.history.same_interval_since);
    assert(is_active());
-  assert(is_replica());
    
    // we better not be missing this.
    assert(!pg_log.get_missing().is_missing(soid));
@@ -6846,13 +6861,14 @@ void ReplicatedPG::_committed_pushed_object(
      last_complete_ondisk = last_complete;
  
      if (last_complete_ondisk == info.last_update) {
-      if (is_replica()) {
+      if (!is_primary()) {
+        // Either we are a replica or backfill target.
         // we are fully up to date.  tell the primary!
         osd->send_message_osd_cluster(get_primary(),
                                       new MOSDPGTrim(get_osdmap()->get_epoch(), info.pgid,
                                                      last_complete_ondisk),
                                       get_osdmap()->get_epoch());
-      } else if (is_primary()) {
+      } else {
         // we are the primary.  tell replicas to trim?
         if (calc_min_last_complete_ondisk())
           trim_peers();
@@ -7078,10 +7094,11 @@ eversion_t ReplicatedPG::pick_newest_available(const hobject_t& oid)
    v = pg_log.get_missing().missing.find(oid)->second.have;
    dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
  
-  for (unsigned i=1; i<acting.size(); ++i) {
-    int peer = acting[i];
+  assert(actingbackfill.size() > 0);
+  for (unsigned i=1; i<actingbackfill.size(); ++i) {
+    int peer = actingbackfill[i];
      if (!peer_missing[peer].is_missing(oid)) {
-      assert(peer == backfill_target);
+      assert(peer == get_backfill_target());
        continue;
      }
      eversion_t h = peer_missing[peer].missing[oid].have;
@@ -7364,17 +7381,16 @@ void ReplicatedPG::on_shutdown()
    cancel_recovery();
  }
  
+// For now only care about a single backfill at a time
  void ReplicatedPG::on_activate()
  {
-  for (unsigned i = 1; i<acting.size(); i++) {
-    if (peer_info[acting[i]].last_backfill != hobject_t::get_max()) {
-      assert(backfill_target == -1);
-      backfill_target = acting[i];
-      last_backfill_started = peer_info[acting[i]].last_backfill;
-      dout(10) << " chose backfill target osd." << backfill_target
-              << " from " << last_backfill_started << dendl;
-    }
-  }
+  int backfill_target = get_backfill_target();
+  if (backfill_target == -1)
+    return;
+  last_backfill_started = peer_info[backfill_target].last_backfill;
+  assert(last_backfill_started != hobject_t::get_max());
+  dout(10) << " chose backfill target osd." << backfill_target
+          << " from " << last_backfill_started << dendl;
  }
  
  void ReplicatedPG::on_change(ObjectStore::Transaction *t)
@@ -7583,6 +7599,7 @@ bool ReplicatedPG::start_recovery_ops(
      work_in_progress = true;
  
    bool deferred_backfill = false;
+  int backfill_target = get_backfill_target();
    if (recovering.empty() &&
        state_test(PG_STATE_BACKFILL) &&
        backfill_target >= 0 && started < max &&
@@ -7837,6 +7854,7 @@ int ReplicatedPG::prep_object_replica_pushes(
    const hobject_t& soid, eversion_t v,
    PGBackend::RecoveryHandle *h)
  {
+  assert(is_primary());
    dout(10) << __func__ << ": on " << soid << dendl;
  
    // NOTE: we know we will get a valid oloc off of disk here.
@@ -7844,8 +7862,9 @@ int ReplicatedPG::prep_object_replica_pushes(
    if (!obc) {
      pg_log.missing_add(soid, v, eversion_t());
      bool uhoh = true;
-    for (unsigned i=1; i<acting.size(); i++) {
-      int peer = acting[i];
+    assert(actingbackfill.size() > 0);
+    for (unsigned i=1; i<actingbackfill.size(); i++) {
+      int peer = actingbackfill[i];
        if (!peer_missing[peer].is_missing(soid, v)) {
         missing_loc[soid].insert(peer);
         missing_loc_sources.insert(peer);
@@ -7888,8 +7907,9 @@ int ReplicatedBackend::start_pushes(
  {
    int pushes = 0;
    // who needs it?  
-  for (unsigned i=1; i<get_parent()->get_acting().size(); i++) {
-    int peer = get_parent()->get_acting()[i];
+  assert(get_parent()->get_actingbackfill().size() > 0);
+  for (unsigned i=1; i<get_parent()->get_actingbackfill().size(); i++) {
+    int peer = get_parent()->get_actingbackfill()[i];
      map<int, pg_missing_t>::const_iterator j =
        get_parent()->get_peer_missing().find(peer);
      assert(j != get_parent()->get_peer_missing().end());
@@ -7912,8 +7932,9 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
  
    // this is FAR from an optimal recovery order.  pretty lame, really.
-  for (unsigned i=1; i<acting.size(); i++) {
-    int peer = acting[i];
+  assert(actingbackfill.size() > 0);
+  for (unsigned i=1; i<actingbackfill.size(); i++) {
+    int peer = actingbackfill[i];
      map<int, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
      assert(pm != peer_missing.end());
      map<int, pg_info_t>::const_iterator pi = peer_info.find(peer);
@@ -7995,6 +8016,7 @@ int ReplicatedPG::recover_backfill(
    ThreadPool::TPHandle &handle, bool *work_started)
  {
    dout(10) << "recover_backfill (" << max << ")" << dendl;
+  int backfill_target = get_backfill_target();
    assert(backfill_target >= 0);
  
    pg_info_t& pinfo = peer_info[backfill_target];
@@ -8239,7 +8261,7 @@ void ReplicatedPG::prep_backfill_object_push(
    dout(10) << "push_backfill_object " << oid << " v " << v << " to osd." << peer << dendl;
  
    backfills_in_flight.insert(oid);
-  map<int, pg_missing_t>::iterator bpm = peer_missing.find(backfill_target);
+  map<int, pg_missing_t>::iterator bpm = peer_missing.find(get_backfill_target());
    assert(bpm != peer_missing.end());
    bpm->second.add(oid, eversion_t(), eversion_t());
  
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h

index 439c6a9e5c079af53e3742b09304686016366717..817deaf0d8e234d8cc617ceb00b7ec02131c69e8 100644 (file)
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -280,8 +280,8 @@ public:
    epoch_t get_epoch() {
      return get_osdmap()->get_epoch();
    }
-  const vector<int> &get_acting() {
-    return acting;
+  const vector<int> &get_actingbackfill() {
+    return actingbackfill;
    }
    std::string gen_dbg_prefix() const { return gen_prefix(); }
author	David Zafman <david.zafman@inktank.com>
	Fri, 11 Oct 2013 22:53:49 +0000 (15:53 -0700)
committer	David Zafman <david.zafman@inktank.com>
	Tue, 19 Nov 2013 22:45:20 +0000 (14:45 -0800)
src/osd/OSD.cc		patch \| blob \| history
src/osd/PG.cc		patch \| blob \| history
src/osd/PG.h		patch \| blob \| history
src/osd/PGBackend.h		patch \| blob \| history
src/osd/ReplicatedPG.cc		patch \| blob \| history
src/osd/ReplicatedPG.h		patch \| blob \| history