is_readable.reset(_is_readable);
is_recoverable.reset(_is_recoverable);
}
- IsPGRecoverablePredicate &get_recoverable_predicate() {
+ const IsPGRecoverablePredicate &get_recoverable_predicate() const {
return *is_recoverable;
}
std::ostream& gen_prefix(std::ostream& out) const {
PG::Scrubber::~Scrubber() {}
-/**
- * find_best_info
- *
- * Returns an iterator to the best info in infos sorted by:
- * 1) Prefer newer last_update
- * 2) Prefer longer tail if it brings another info into contiguity
- * 3) Prefer current primary
- */
-map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
- const map<pg_shard_t, pg_info_t> &infos,
- bool restrict_to_up_acting,
- bool *history_les_bound) const
-{
- ceph_assert(history_les_bound);
- /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
- * to make changes to this process. Also, make sure to update it
- * when you find bugs! */
- eversion_t min_last_update_acceptable = eversion_t::max();
- epoch_t max_last_epoch_started_found = 0;
- for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
- i != infos.end();
- ++i) {
- if (!cct->_conf->osd_find_best_info_ignore_history_les &&
- max_last_epoch_started_found < i->second.history.last_epoch_started) {
- *history_les_bound = true;
- max_last_epoch_started_found = i->second.history.last_epoch_started;
- }
- if (!i->second.is_incomplete() &&
- max_last_epoch_started_found < i->second.last_epoch_started) {
- *history_les_bound = false;
- max_last_epoch_started_found = i->second.last_epoch_started;
- }
- }
- for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
- i != infos.end();
- ++i) {
- if (max_last_epoch_started_found <= i->second.last_epoch_started) {
- if (min_last_update_acceptable > i->second.last_update)
- min_last_update_acceptable = i->second.last_update;
- }
- }
- if (min_last_update_acceptable == eversion_t::max())
- return infos.end();
-
- map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
- // find osd with newest last_update (oldest for ec_pool).
- // if there are multiples, prefer
- // - a longer tail, if it brings another peer into log contiguity
- // - the current primary
- for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
- p != infos.end();
- ++p) {
- if (restrict_to_up_acting && !is_up(p->first) &&
- !is_acting(p->first))
- continue;
- // Only consider peers with last_update >= min_last_update_acceptable
- if (p->second.last_update < min_last_update_acceptable)
- continue;
- // Disqualify anyone with a too old last_epoch_started
- if (p->second.last_epoch_started < max_last_epoch_started_found)
- continue;
- // Disqualify anyone who is incomplete (not fully backfilled)
- if (p->second.is_incomplete())
- continue;
- if (best == infos.end()) {
- best = p;
- continue;
- }
- // Prefer newer last_update
- if (pool.info.require_rollback()) {
- if (p->second.last_update > best->second.last_update)
- continue;
- if (p->second.last_update < best->second.last_update) {
- best = p;
- continue;
- }
- } else {
- if (p->second.last_update < best->second.last_update)
- continue;
- if (p->second.last_update > best->second.last_update) {
- best = p;
- continue;
- }
- }
-
- // Prefer longer tail
- if (p->second.log_tail > best->second.log_tail) {
- continue;
- } else if (p->second.log_tail < best->second.log_tail) {
- best = p;
- continue;
- }
-
- if (!p->second.has_missing() && best->second.has_missing()) {
- dout(10) << __func__ << " prefer osd." << p->first
- << " because it is complete while best has missing"
- << dendl;
- best = p;
- continue;
- } else if (p->second.has_missing() && !best->second.has_missing()) {
- dout(10) << __func__ << " skipping osd." << p->first
- << " because it has missing while best is complete"
- << dendl;
- continue;
- } else {
- // both are complete or have missing
- // fall through
- }
-
- // prefer current primary (usually the caller), all things being equal
- if (p->first == pg_whoami) {
- dout(10) << "calc_acting prefer osd." << p->first
- << " because it is current primary" << dendl;
- best = p;
- continue;
- }
- }
- return best;
-}
-
-void PG::calc_ec_acting(
- map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
- unsigned size,
- const vector<int> &acting,
- const vector<int> &up,
- const map<pg_shard_t, pg_info_t> &all_info,
- bool restrict_to_up_acting,
- vector<int> *_want,
- set<pg_shard_t> *backfill,
- set<pg_shard_t> *acting_backfill,
- ostream &ss)
-{
- vector<int> want(size, CRUSH_ITEM_NONE);
- map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
- for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
- i != all_info.end();
- ++i) {
- all_info_by_shard[i->first.shard].insert(i->first);
- }
- for (uint8_t i = 0; i < want.size(); ++i) {
- ss << "For position " << (unsigned)i << ": ";
- if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
- !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
- all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
- auth_log_shard->second.log_tail) {
- ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
- want[i] = up[i];
- continue;
- }
- if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
- ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
- << " and ";
- backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
- }
-
- if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
- !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
- all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
- auth_log_shard->second.log_tail) {
- ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
- want[i] = acting[i];
- } else if (!restrict_to_up_acting) {
- for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
- j != all_info_by_shard[shard_id_t(i)].end();
- ++j) {
- ceph_assert(j->shard == i);
- if (!all_info.find(*j)->second.is_incomplete() &&
- all_info.find(*j)->second.last_update >=
- auth_log_shard->second.log_tail) {
- ss << " selecting stray: " << *j << std::endl;
- want[i] = j->osd;
- break;
- }
- }
- if (want[i] == CRUSH_ITEM_NONE)
- ss << " failed to fill position " << (int)i << std::endl;
- }
- }
-
- for (uint8_t i = 0; i < want.size(); ++i) {
- if (want[i] != CRUSH_ITEM_NONE) {
- acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
- }
- }
- acting_backfill->insert(backfill->begin(), backfill->end());
- _want->swap(want);
-}
-
-/**
- * calculate the desired acting set.
- *
- * Choose an appropriate acting set. Prefer up[0], unless it is
- * incomplete, or another osd has a longer tail that allows us to
- * bring other up nodes up to date.
- */
-void PG::calc_replicated_acting(
- map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
- uint64_t force_auth_primary_missing_objects,
- unsigned size,
- const vector<int> &acting,
- const vector<int> &up,
- pg_shard_t up_primary,
- const map<pg_shard_t, pg_info_t> &all_info,
- bool restrict_to_up_acting,
- vector<int> *want,
- set<pg_shard_t> *backfill,
- set<pg_shard_t> *acting_backfill,
- const OSDMapRef osdmap,
- ostream &ss)
-{
- pg_shard_t auth_log_shard_id = auth_log_shard->first;
-
- ss << __func__ << " newest update on osd." << auth_log_shard_id
- << " with " << auth_log_shard->second
- << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
-
- // select primary
- auto primary = all_info.find(up_primary);
- if (up.size() &&
- !primary->second.is_incomplete() &&
- primary->second.last_update >=
- auth_log_shard->second.log_tail) {
- if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
- auto approx_missing_objects =
- primary->second.stats.stats.sum.num_objects_missing;
- auto auth_version = auth_log_shard->second.last_update.version;
- auto primary_version = primary->second.last_update.version;
- if (auth_version > primary_version) {
- approx_missing_objects += auth_version - primary_version;
- } else {
- approx_missing_objects += primary_version - auth_version;
- }
- if ((uint64_t)approx_missing_objects >
- force_auth_primary_missing_objects) {
- primary = auth_log_shard;
- ss << "up_primary: " << up_primary << ") has approximate "
- << approx_missing_objects
- << "(>" << force_auth_primary_missing_objects <<") "
- << "missing objects, osd." << auth_log_shard_id
- << " selected as primary instead"
- << std::endl;
- } else {
- ss << "up_primary: " << up_primary << ") selected as primary"
- << std::endl;
- }
- } else {
- ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
- }
- } else {
- ceph_assert(!auth_log_shard->second.is_incomplete());
- ss << "up[0] needs backfill, osd." << auth_log_shard_id
- << " selected as primary instead" << std::endl;
- primary = auth_log_shard;
- }
-
- ss << __func__ << " primary is osd." << primary->first
- << " with " << primary->second << std::endl;
- want->push_back(primary->first.osd);
- acting_backfill->insert(primary->first);
-
- /* We include auth_log_shard->second.log_tail because in GetLog,
- * we will request logs back to the min last_update over our
- * acting_backfill set, which will result in our log being extended
- * as far backwards as necessary to pick up any peers which can
- * be log recovered by auth_log_shard's log */
- eversion_t oldest_auth_log_entry =
- std::min(primary->second.log_tail, auth_log_shard->second.log_tail);
-
- // select replicas that have log contiguity with primary.
- // prefer up, then acting, then any peer_info osds
- for (auto i : up) {
- pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
- if (up_cand == primary->first)
- continue;
- const pg_info_t &cur_info = all_info.find(up_cand)->second;
- if (cur_info.is_incomplete() ||
- cur_info.last_update < oldest_auth_log_entry) {
- ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
- backfill->insert(up_cand);
- acting_backfill->insert(up_cand);
- } else {
- want->push_back(i);
- acting_backfill->insert(up_cand);
- ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
- }
- if (want->size() >= size) {
- break;
- }
- }
-
- if (want->size() >= size) {
- return;
- }
-
- std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
- candidate_by_last_update.reserve(acting.size());
- // This no longer has backfill OSDs, but they are covered above.
- for (auto i : acting) {
- pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
- // skip up osds we already considered above
- if (acting_cand == primary->first)
- continue;
- vector<int>::const_iterator up_it = find(up.begin(), up.end(), i);
- if (up_it != up.end())
- continue;
-
- const pg_info_t &cur_info = all_info.find(acting_cand)->second;
- if (cur_info.is_incomplete() ||
- cur_info.last_update < oldest_auth_log_entry) {
- ss << " shard " << acting_cand << " (acting) REJECTED "
- << cur_info << std::endl;
- } else {
- candidate_by_last_update.emplace_back(cur_info.last_update, i);
- }
- }
-
- auto sort_by_eversion =[](const std::pair<eversion_t, int> &lhs,
- const std::pair<eversion_t, int> &rhs) {
- return lhs.first > rhs.first;
- };
- // sort by last_update, in descending order.
- std::sort(candidate_by_last_update.begin(),
- candidate_by_last_update.end(), sort_by_eversion);
- for (auto &p: candidate_by_last_update) {
- ceph_assert(want->size() < size);
- want->push_back(p.second);
- pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
- acting_backfill->insert(s);
- ss << " shard " << s << " (acting) accepted "
- << all_info.find(s)->second << std::endl;
- if (want->size() >= size) {
- return;
- }
- }
-
- if (restrict_to_up_acting) {
- return;
- }
- candidate_by_last_update.clear();
- candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
- // continue to search stray to find more suitable peers
- for (auto &i : all_info) {
- // skip up osds we already considered above
- if (i.first == primary->first)
- continue;
- vector<int>::const_iterator up_it = find(up.begin(), up.end(), i.first.osd);
- if (up_it != up.end())
- continue;
- vector<int>::const_iterator acting_it = find(
- acting.begin(), acting.end(), i.first.osd);
- if (acting_it != acting.end())
- continue;
-
- if (i.second.is_incomplete() ||
- i.second.last_update < oldest_auth_log_entry) {
- ss << " shard " << i.first << " (stray) REJECTED " << i.second
- << std::endl;
- } else {
- candidate_by_last_update.emplace_back(i.second.last_update, i.first.osd);
- }
- }
-
- if (candidate_by_last_update.empty()) {
- // save us some effort
- return;
- }
-
- // sort by last_update, in descending order.
- std::sort(candidate_by_last_update.begin(),
- candidate_by_last_update.end(), sort_by_eversion);
-
- for (auto &p: candidate_by_last_update) {
- ceph_assert(want->size() < size);
- want->push_back(p.second);
- pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
- acting_backfill->insert(s);
- ss << " shard " << s << " (stray) accepted "
- << all_info.find(s)->second << std::endl;
- if (want->size() >= size) {
- return;
- }
- }
-}
-
-bool PG::recoverable_and_ge_min_size(const vector<int> &want) const
-{
- unsigned num_want_acting = 0;
- set<pg_shard_t> have;
- for (int i = 0; i < (int)want.size(); ++i) {
- if (want[i] != CRUSH_ITEM_NONE) {
- ++num_want_acting;
- have.insert(
- pg_shard_t(
- want[i],
- pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
- }
- }
- // We go incomplete if below min_size for ec_pools since backfill
- // does not currently maintain rollbackability
- // Otherwise, we will go "peered", but not "active"
- if (num_want_acting < pool.info.min_size &&
- (pool.info.is_erasure() ||
- !cct->_conf->osd_allow_recovery_below_min_size)) {
- dout(10) << __func__ << " failed, below min size" << dendl;
- return false;
- }
-
- /* Check whether we have enough acting shards to later perform recovery */
- boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
- get_pgbackend()->get_is_recoverable_predicate());
- if (!(*recoverable_predicate)(have)) {
- dout(10) << __func__ << " failed, not recoverable" << dendl;
- return false;
- }
-
- return true;
-}
-
-void PG::choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info,
- const pg_info_t &auth_info,
- vector<int> *want,
- set<pg_shard_t> *async_recovery,
- const OSDMapRef osdmap) const
-{
- set<pair<int, pg_shard_t> > candidates_by_cost;
- for (uint8_t i = 0; i < want->size(); ++i) {
- if ((*want)[i] == CRUSH_ITEM_NONE)
- continue;
-
- // Considering log entries to recover is accurate enough for
- // now. We could use minimum_to_decode_with_cost() later if
- // necessary.
- pg_shard_t shard_i((*want)[i], shard_id_t(i));
- // do not include strays
- if (stray_set.find(shard_i) != stray_set.end())
- continue;
- // Do not include an osd that is not up, since choosing it as
- // an async_recovery_target will move it out of the acting set.
- // This results in it being identified as a stray during peering,
- // because it is no longer in the up or acting set.
- if (!is_up(shard_i))
- continue;
- auto shard_info = all_info.find(shard_i)->second;
- // for ec pools we rollback all entries past the authoritative
- // last_update *before* activation. This is relatively inexpensive
- // compared to recovery, since it is purely local, so treat shards
- // past the authoritative last_update the same as those equal to it.
- version_t auth_version = auth_info.last_update.version;
- version_t candidate_version = shard_info.last_update.version;
- if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
- auto approx_missing_objects =
- shard_info.stats.stats.sum.num_objects_missing;
- if (auth_version > candidate_version) {
- approx_missing_objects += auth_version - candidate_version;
- }
- if (static_cast<uint64_t>(approx_missing_objects) >
- cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
- candidates_by_cost.emplace(approx_missing_objects, shard_i);
- }
- } else {
- if (auth_version > candidate_version &&
- (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
- candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i));
- }
- }
- }
-
- dout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
- << dendl;
-
- // take out as many osds as we can for async recovery, in order of cost
- for (auto rit = candidates_by_cost.rbegin();
- rit != candidates_by_cost.rend(); ++rit) {
- pg_shard_t cur_shard = rit->second;
- vector<int> candidate_want(*want);
- candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
- if (recoverable_and_ge_min_size(candidate_want)) {
- want->swap(candidate_want);
- async_recovery->insert(cur_shard);
- }
- }
- dout(20) << __func__ << " result want=" << *want
- << " async_recovery=" << *async_recovery << dendl;
-}
-
-void PG::choose_async_recovery_replicated(const map<pg_shard_t, pg_info_t> &all_info,
- const pg_info_t &auth_info,
- vector<int> *want,
- set<pg_shard_t> *async_recovery,
- const OSDMapRef osdmap) const
-{
- set<pair<int, pg_shard_t> > candidates_by_cost;
- for (auto osd_num : *want) {
- pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
- // do not include strays
- if (stray_set.find(shard_i) != stray_set.end())
- continue;
- // Do not include an osd that is not up, since choosing it as
- // an async_recovery_target will move it out of the acting set.
- // This results in it being identified as a stray during peering,
- // because it is no longer in the up or acting set.
- if (!is_up(shard_i))
- continue;
- auto shard_info = all_info.find(shard_i)->second;
- // use the approximate magnitude of the difference in length of
- // logs plus historical missing objects as the cost of recovery
- version_t auth_version = auth_info.last_update.version;
- version_t candidate_version = shard_info.last_update.version;
- if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
- auto approx_missing_objects =
- shard_info.stats.stats.sum.num_objects_missing;
- if (auth_version > candidate_version) {
- approx_missing_objects += auth_version - candidate_version;
- } else {
- approx_missing_objects += candidate_version - auth_version;
- }
- if (static_cast<uint64_t>(approx_missing_objects) >
- cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
- candidates_by_cost.emplace(approx_missing_objects, shard_i);
- }
- } else {
- size_t approx_entries;
- if (auth_version > candidate_version) {
- approx_entries = auth_version - candidate_version;
- } else {
- approx_entries = candidate_version - auth_version;
- }
- if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
- candidates_by_cost.insert(make_pair(approx_entries, shard_i));
- }
- }
- }
-
- dout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
- << dendl;
- // take out as many osds as we can for async recovery, in order of cost
- for (auto rit = candidates_by_cost.rbegin();
- rit != candidates_by_cost.rend(); ++rit) {
- if (want->size() <= pool.info.min_size) {
- break;
- }
- pg_shard_t cur_shard = rit->second;
- vector<int> candidate_want(*want);
- for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
- if (*it == cur_shard.osd) {
- candidate_want.erase(it);
- want->swap(candidate_want);
- async_recovery->insert(cur_shard);
- break;
- }
- }
- }
- dout(20) << __func__ << " result want=" << *want
- << " async_recovery=" << *async_recovery << dendl;
-}
-
-/**
- * choose acting
- *
- * calculate the desired acting, and request a change with the monitor
- * if it differs from the current acting.
- *
- * if restrict_to_up_acting=true, we filter out anything that's not in
- * up/acting. in order to lift this restriction, we need to
- * 1) check whether it's worth switching the acting set any time we get
- * a new pg info (not just here, when recovery finishes)
- * 2) check whether anything in want_acting went down on each new map
- * (and, if so, calculate a new want_acting)
- * 3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
- * TODO!
- */
-bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
- bool restrict_to_up_acting,
- bool *history_les_bound)
-{
- map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
- all_info[pg_whoami] = info;
-
- if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
- for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
- p != all_info.end();
- ++p) {
- dout(10) << __func__ << " all_info osd." << p->first << " " << p->second << dendl;
- }
- }
-
- map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
- find_best_info(all_info, restrict_to_up_acting, history_les_bound);
-
- if (auth_log_shard == all_info.end()) {
- if (up != acting) {
- dout(10) << __func__ << " no suitable info found (incomplete backfills?),"
- << " reverting to up" << dendl;
- want_acting = up;
- vector<int> empty;
- osd->queue_want_pg_temp(info.pgid.pgid, empty);
- } else {
- dout(10) << __func__ << " failed" << dendl;
- ceph_assert(want_acting.empty());
- }
- return false;
- }
-
- ceph_assert(!auth_log_shard->second.is_incomplete());
- auth_log_shard_id = auth_log_shard->first;
-
- set<pg_shard_t> want_backfill, want_acting_backfill;
- vector<int> want;
- stringstream ss;
- if (pool.info.is_replicated())
- calc_replicated_acting(
- auth_log_shard,
- cct->_conf.get_val<uint64_t>(
- "osd_force_auth_primary_missing_objects"),
- get_osdmap()->get_pg_size(info.pgid.pgid),
- acting,
- up,
- up_primary,
- all_info,
- restrict_to_up_acting,
- &want,
- &want_backfill,
- &want_acting_backfill,
- get_osdmap(),
- ss);
- else
- calc_ec_acting(
- auth_log_shard,
- get_osdmap()->get_pg_size(info.pgid.pgid),
- acting,
- up,
- all_info,
- restrict_to_up_acting,
- &want,
- &want_backfill,
- &want_acting_backfill,
- ss);
- dout(10) << ss.str() << dendl;
-
- if (!recoverable_and_ge_min_size(want)) {
- want_acting.clear();
- return false;
- }
-
- set<pg_shard_t> want_async_recovery;
- if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
- if (pool.info.is_erasure()) {
- choose_async_recovery_ec(all_info, auth_log_shard->second, &want, &want_async_recovery, get_osdmap());
- } else {
- choose_async_recovery_replicated(all_info, auth_log_shard->second, &want, &want_async_recovery, get_osdmap());
- }
- }
- if (want != acting) {
- dout(10) << __func__ << " want " << want << " != acting " << acting
- << ", requesting pg_temp change" << dendl;
- want_acting = want;
-
- if (!cct->_conf->osd_debug_no_acting_change) {
- if (want_acting == up) {
- // There can't be any pending backfill if
- // want is the same as crush map up OSDs.
- ceph_assert(want_backfill.empty());
- vector<int> empty;
- osd->queue_want_pg_temp(info.pgid.pgid, empty);
- } else
- osd->queue_want_pg_temp(info.pgid.pgid, want);
- }
- return false;
- }
- want_acting.clear();
- acting_recovery_backfill = want_acting_backfill;
- dout(10) << "acting_recovery_backfill is " << acting_recovery_backfill << dendl;
- ceph_assert(backfill_targets.empty() || backfill_targets == want_backfill);
- if (backfill_targets.empty()) {
- // Caller is GetInfo
- backfill_targets = want_backfill;
- }
- // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
- ceph_assert(async_recovery_targets.empty() || async_recovery_targets == want_async_recovery || !needs_recovery());
- if (async_recovery_targets.empty() || !needs_recovery()) {
- async_recovery_targets = want_async_recovery;
- }
- // Will not change if already set because up would have had to change
- // Verify that nothing in backfill is in stray_set
- for (set<pg_shard_t>::iterator i = want_backfill.begin();
- i != want_backfill.end();
- ++i) {
- ceph_assert(stray_set.find(*i) == stray_set.end());
- }
- dout(10) << "choose_acting want=" << want << " backfill_targets="
- << want_backfill << " async_recovery_targets="
- << async_recovery_targets << dendl;
- return true;
-}
-
/* Build the might_have_unfound set.
*
* This is used by the primary OSD during recovery.
void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);
- map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
- const map<pg_shard_t, pg_info_t> &infos,
- bool restrict_to_up_acting,
- bool *history_les_bound) const;
- static void calc_ec_acting(
- map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
- unsigned size,
- const vector<int> &acting,
- const vector<int> &up,
- const map<pg_shard_t, pg_info_t> &all_info,
- bool restrict_to_up_acting,
- vector<int> *want,
- set<pg_shard_t> *backfill,
- set<pg_shard_t> *acting_backfill,
- ostream &ss);
- static void calc_replicated_acting(
- map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
- uint64_t force_auth_primary_missing_objects,
- unsigned size,
- const vector<int> &acting,
- const vector<int> &up,
- pg_shard_t up_primary,
- const map<pg_shard_t, pg_info_t> &all_info,
- bool restrict_to_up_acting,
- vector<int> *want,
- set<pg_shard_t> *backfill,
- set<pg_shard_t> *acting_backfill,
- const OSDMapRef osdmap,
- ostream &ss);
- void choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info,
- const pg_info_t &auth_info,
- vector<int> *want,
- set<pg_shard_t> *async_recovery,
- const OSDMapRef osdmap) const;
- void choose_async_recovery_replicated(const map<pg_shard_t, pg_info_t> &all_info,
- const pg_info_t &auth_info,
- vector<int> *want,
- set<pg_shard_t> *async_recovery,
- const OSDMapRef osdmap) const;
-
- bool recoverable_and_ge_min_size(const vector<int> &want) const;
- bool choose_acting(pg_shard_t &auth_log_shard,
- bool restrict_to_up_acting,
- bool *history_les_bound);
void build_might_have_unfound();
void activate(
ObjectStore::Transaction& t,
get_osdmap_epoch());
}
+/**
+ * find_best_info
+ *
+ * Returns an iterator to the best info in infos, chosen by:
+ * 1) Prefer newer last_update
+ * 2) Prefer longer tail if it brings another info into contiguity
+ * 3) Prefer current primary
+ */
+map<pg_shard_t, pg_info_t>::const_iterator PeeringState::find_best_info(
+ const map<pg_shard_t, pg_info_t> &infos,
+ bool restrict_to_up_acting,
+ bool *history_les_bound) const
+{
+ ceph_assert(history_les_bound);
+ /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
+ * to make changes to this process. Also, make sure to update it
+ * when you find bugs! */
+ eversion_t min_last_update_acceptable = eversion_t::max();
+ epoch_t max_last_epoch_started_found = 0;
+ for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
+ i != infos.end();
+ ++i) {
+ if (!cct->_conf->osd_find_best_info_ignore_history_les &&
+ max_last_epoch_started_found < i->second.history.last_epoch_started) {
+ *history_les_bound = true;
+ max_last_epoch_started_found = i->second.history.last_epoch_started;
+ }
+ if (!i->second.is_incomplete() &&
+ max_last_epoch_started_found < i->second.last_epoch_started) {
+ *history_les_bound = false;
+ max_last_epoch_started_found = i->second.last_epoch_started;
+ }
+ }
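+ // min_last_update_acceptable (computed below) is the oldest last_update
+ // among peers that reached the newest last_epoch_started; only peers at or
+ // past it are considered as the authoritative log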
+ for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
+ i != infos.end();
+ ++i) {
+ if (max_last_epoch_started_found <= i->second.last_epoch_started) {
+ if (min_last_update_acceptable > i->second.last_update)
+ min_last_update_acceptable = i->second.last_update;
+ }
+ }
+ if (min_last_update_acceptable == eversion_t::max())
+ return infos.end();
+
+ map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
+ // find osd with newest last_update (oldest for ec_pool).
+ // if there are multiples, prefer
+ // - a longer tail, if it brings another peer into log contiguity
+ // - the current primary
+ for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
+ p != infos.end();
+ ++p) {
+ if (restrict_to_up_acting && !is_up(p->first) &&
+ !is_acting(p->first))
+ continue;
+ // Only consider peers with last_update >= min_last_update_acceptable
+ if (p->second.last_update < min_last_update_acceptable)
+ continue;
+ // Disqualify anyone with a too old last_epoch_started
+ if (p->second.last_epoch_started < max_last_epoch_started_found)
+ continue;
+ // Disqualify anyone who is incomplete (not fully backfilled)
+ if (p->second.is_incomplete())
+ continue;
+ if (best == infos.end()) {
+ best = p;
+ continue;
+ }
+ // Prefer newer last_update (oldest for ec/require_rollback pools)
+ if (pool.info.require_rollback()) {
+ if (p->second.last_update > best->second.last_update)
+ continue;
+ if (p->second.last_update < best->second.last_update) {
+ best = p;
+ continue;
+ }
+ } else {
+ if (p->second.last_update < best->second.last_update)
+ continue;
+ if (p->second.last_update > best->second.last_update) {
+ best = p;
+ continue;
+ }
+ }
+
+ // Prefer longer tail
+ if (p->second.log_tail > best->second.log_tail) {
+ continue;
+ } else if (p->second.log_tail < best->second.log_tail) {
+ best = p;
+ continue;
+ }
+
+ if (!p->second.has_missing() && best->second.has_missing()) {
+ psdout(10) << __func__ << " prefer osd." << p->first
+ << " because it is complete while best has missing"
+ << dendl;
+ best = p;
+ continue;
+ } else if (p->second.has_missing() && !best->second.has_missing()) {
+ psdout(10) << __func__ << " skipping osd." << p->first
+ << " because it has missing while best is complete"
+ << dendl;
+ continue;
+ } else {
+ // both are complete or have missing
+ // fall through
+ }
+
+ // prefer current primary (usually the caller), all things being equal
+ if (p->first == pg_whoami) {
+ psdout(10) << "calc_acting prefer osd." << p->first
+ << " because it is current primary" << dendl;
+ best = p;
+ continue;
+ }
+ }
+ return best;
+}
+
+void PeeringState::calc_ec_acting(
+ map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+ unsigned size,
+ const vector<int> &acting,
+ const vector<int> &up,
+ const map<pg_shard_t, pg_info_t> &all_info,
+ bool restrict_to_up_acting,
+ vector<int> *_want,
+ set<pg_shard_t> *backfill,
+ set<pg_shard_t> *acting_backfill,
+ ostream &ss)
+{
+ vector<int> want(size, CRUSH_ITEM_NONE);
+ map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
+ for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
+ i != all_info.end();
+ ++i) {
+ all_info_by_shard[i->first.shard].insert(i->first);
+ }
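+ // fill each shard position independently: prefer up[i]; otherwise mark
+ // up[i] for backfill and fall back to acting[i], then (if allowed) to any
+ // stray holding a complete, log-contiguous copy of that shard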
+ for (uint8_t i = 0; i < want.size(); ++i) {
+ ss << "For position " << (unsigned)i << ": ";
+ if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
+ !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
+ all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
+ auth_log_shard->second.log_tail) {
+ ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
+ want[i] = up[i];
+ continue;
+ }
+ if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
+ ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
+ << " and ";
+ backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
+ }
+
+ if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
+ !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
+ all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
+ auth_log_shard->second.log_tail) {
+ ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
+ want[i] = acting[i];
+ } else if (!restrict_to_up_acting) {
+ for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
+ j != all_info_by_shard[shard_id_t(i)].end();
+ ++j) {
+ ceph_assert(j->shard == i);
+ if (!all_info.find(*j)->second.is_incomplete() &&
+ all_info.find(*j)->second.last_update >=
+ auth_log_shard->second.log_tail) {
+ ss << " selecting stray: " << *j << std::endl;
+ want[i] = j->osd;
+ break;
+ }
+ }
+ if (want[i] == CRUSH_ITEM_NONE)
+ ss << " failed to fill position " << (int)i << std::endl;
+ }
+ }
+
+ for (uint8_t i = 0; i < want.size(); ++i) {
+ if (want[i] != CRUSH_ITEM_NONE) {
+ acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
+ }
+ }
+ acting_backfill->insert(backfill->begin(), backfill->end());
+ _want->swap(want);
+}
+
+/**
+ * calculate the desired acting set.
+ *
+ * Choose an appropriate acting set. Prefer up[0], unless it is
+ * incomplete, or another osd has a longer tail that allows us to
+ * bring other up nodes up to date.
+ */
+void PeeringState::calc_replicated_acting(
+ map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+ uint64_t force_auth_primary_missing_objects,
+ unsigned size,
+ const vector<int> &acting,
+ const vector<int> &up,
+ pg_shard_t up_primary,
+ const map<pg_shard_t, pg_info_t> &all_info,
+ bool restrict_to_up_acting,
+ vector<int> *want,
+ set<pg_shard_t> *backfill,
+ set<pg_shard_t> *acting_backfill,
+ const OSDMapRef osdmap,
+ ostream &ss)
+{
+ pg_shard_t auth_log_shard_id = auth_log_shard->first;
+
+ ss << __func__ << " newest update on osd." << auth_log_shard_id
+ << " with " << auth_log_shard->second
+ << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
+
+ // select primary
+ auto primary = all_info.find(up_primary);
+ if (up.size() &&
+ !primary->second.is_incomplete() &&
+ primary->second.last_update >=
+ auth_log_shard->second.log_tail) {
+ if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
+ auto approx_missing_objects =
+ primary->second.stats.stats.sum.num_objects_missing;
+ auto auth_version = auth_log_shard->second.last_update.version;
+ auto primary_version = primary->second.last_update.version;
+ if (auth_version > primary_version) {
+ approx_missing_objects += auth_version - primary_version;
+ } else {
+ approx_missing_objects += primary_version - auth_version;
+ }
+ if ((uint64_t)approx_missing_objects >
+ force_auth_primary_missing_objects) {
+ primary = auth_log_shard;
+ ss << "up_primary: " << up_primary << ") has approximate "
+ << approx_missing_objects
+ << "(>" << force_auth_primary_missing_objects <<") "
+ << "missing objects, osd." << auth_log_shard_id
+ << " selected as primary instead"
+ << std::endl;
+ } else {
+ ss << "up_primary: " << up_primary << ") selected as primary"
+ << std::endl;
+ }
+ } else {
+ ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
+ }
+ } else {
+ ceph_assert(!auth_log_shard->second.is_incomplete());
+ ss << "up[0] needs backfill, osd." << auth_log_shard_id
+ << " selected as primary instead" << std::endl;
+ primary = auth_log_shard;
+ }
+
+ ss << __func__ << " primary is osd." << primary->first
+ << " with " << primary->second << std::endl;
+ want->push_back(primary->first.osd);
+ acting_backfill->insert(primary->first);
+
+ /* We include auth_log_shard->second.log_tail because in GetLog,
+ * we will request logs back to the min last_update over our
+ * acting_backfill set, which will result in our log being extended
+ * as far backwards as necessary to pick up any peers which can
+ * be log recovered by auth_log_shard's log */
+ eversion_t oldest_auth_log_entry =
+ std::min(primary->second.log_tail, auth_log_shard->second.log_tail);
+
+ // select replicas that have log contiguity with primary.
+ // prefer up, then acting, then any peer_info osds
+ for (auto i : up) {
+ pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
+ if (up_cand == primary->first)
+ continue;
+ const pg_info_t &cur_info = all_info.find(up_cand)->second;
+ if (cur_info.is_incomplete() ||
+ cur_info.last_update < oldest_auth_log_entry) {
+ ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
+ backfill->insert(up_cand);
+ acting_backfill->insert(up_cand);
+ } else {
+ want->push_back(i);
+ acting_backfill->insert(up_cand);
+ ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
+ }
+ if (want->size() >= size) {
+ break;
+ }
+ }
+
+ if (want->size() >= size) {
+ return;
+ }
+
+ std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
+ candidate_by_last_update.reserve(acting.size());
+ // This no longer has backfill OSDs, but they are covered above.
+ for (auto i : acting) {
+ pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
+ // skip up osds we already considered above
+ if (acting_cand == primary->first)
+ continue;
+ vector<int>::const_iterator up_it = find(up.begin(), up.end(), i);
+ if (up_it != up.end())
+ continue;
+
+ const pg_info_t &cur_info = all_info.find(acting_cand)->second;
+ if (cur_info.is_incomplete() ||
+ cur_info.last_update < oldest_auth_log_entry) {
+ ss << " shard " << acting_cand << " (acting) REJECTED "
+ << cur_info << std::endl;
+ } else {
+ candidate_by_last_update.emplace_back(cur_info.last_update, i);
+ }
+ }
+
+ auto sort_by_eversion = [](const std::pair<eversion_t, int> &lhs,
+ const std::pair<eversion_t, int> &rhs) {
+ return lhs.first > rhs.first;
+ };
+ // sort by last_update, in descending order.
+ std::sort(candidate_by_last_update.begin(),
+ candidate_by_last_update.end(), sort_by_eversion);
+ for (auto &p: candidate_by_last_update) {
+ ceph_assert(want->size() < size);
+ want->push_back(p.second);
+ pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
+ acting_backfill->insert(s);
+ ss << " shard " << s << " (acting) accepted "
+ << all_info.find(s)->second << std::endl;
+ if (want->size() >= size) {
+ return;
+ }
+ }
+
+ if (restrict_to_up_acting) {
+ return;
+ }
+ candidate_by_last_update.clear();
+ candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
+ // continue searching the strays for more suitable peers
+ for (auto &i : all_info) {
+ // skip up osds we already considered above
+ if (i.first == primary->first)
+ continue;
+ vector<int>::const_iterator up_it = find(up.begin(), up.end(), i.first.osd);
+ if (up_it != up.end())
+ continue;
+ vector<int>::const_iterator acting_it = find(
+ acting.begin(), acting.end(), i.first.osd);
+ if (acting_it != acting.end())
+ continue;
+
+ if (i.second.is_incomplete() ||
+ i.second.last_update < oldest_auth_log_entry) {
+ ss << " shard " << i.first << " (stray) REJECTED " << i.second
+ << std::endl;
+ } else {
+ candidate_by_last_update.emplace_back(
+ i.second.last_update, i.first.osd);
+ }
+ }
+
+ if (candidate_by_last_update.empty()) {
+ // save us some effort
+ return;
+ }
+
+ // sort by last_update, in descending order.
+ std::sort(candidate_by_last_update.begin(),
+ candidate_by_last_update.end(), sort_by_eversion);
+
+ for (auto &p: candidate_by_last_update) {
+ ceph_assert(want->size() < size);
+ want->push_back(p.second);
+ pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
+ acting_backfill->insert(s);
+ ss << " shard " << s << " (stray) accepted "
+ << all_info.find(s)->second << std::endl;
+ if (want->size() >= size) {
+ return;
+ }
+ }
+}
+
+bool PeeringState::recoverable_and_ge_min_size(const vector<int> &want) const
+{
+ unsigned num_want_acting = 0;
+ set<pg_shard_t> have;
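+ // collect the shards we would end up with; erasure-coded pools record the
+ // shard id so the recoverability predicate sees which shards are present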
+ for (int i = 0; i < (int)want.size(); ++i) {
+ if (want[i] != CRUSH_ITEM_NONE) {
+ ++num_want_acting;
+ have.insert(
+ pg_shard_t(
+ want[i],
+ pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
+ }
+ }
+ // We go incomplete if below min_size for ec_pools since backfill
+ // does not currently maintain rollbackability
+ // Otherwise, we will go "peered", but not "active"
+ if (num_want_acting < pool.info.min_size &&
+ (pool.info.is_erasure() ||
+ !cct->_conf->osd_allow_recovery_below_min_size)) {
+ psdout(10) << __func__ << " failed, below min size" << dendl;
+ return false;
+ }
+
+ /* Check whether we have enough acting shards to later perform recovery */
+ if (!missing_loc.get_recoverable_predicate()(have)) {
+ psdout(10) << __func__ << " failed, not recoverable" << dendl;
+ return false;
+ }
+
+ return true;
+}
+
+void PeeringState::choose_async_recovery_ec(
+ const map<pg_shard_t, pg_info_t> &all_info,
+ const pg_info_t &auth_info,
+ vector<int> *want,
+ set<pg_shard_t> *async_recovery,
+ const OSDMapRef osdmap) const
+{
+ set<pair<int, pg_shard_t> > candidates_by_cost;
+ for (uint8_t i = 0; i < want->size(); ++i) {
+ if ((*want)[i] == CRUSH_ITEM_NONE)
+ continue;
+
+ // Considering log entries to recover is accurate enough for
+ // now. We could use minimum_to_decode_with_cost() later if
+ // necessary.
+ pg_shard_t shard_i((*want)[i], shard_id_t(i));
+ // do not include strays
+ if (stray_set.find(shard_i) != stray_set.end())
+ continue;
+ // Do not include an osd that is not up, since choosing it as
+ // an async_recovery_target will move it out of the acting set.
+ // This results in it being identified as a stray during peering,
+ // because it is no longer in the up or acting set.
+ if (!is_up(shard_i))
+ continue;
+ auto shard_info = all_info.find(shard_i)->second;
+ // for ec pools we rollback all entries past the authoritative
+ // last_update *before* activation. This is relatively inexpensive
+ // compared to recovery, since it is purely local, so treat shards
+ // past the authoritative last_update the same as those equal to it.
+ version_t auth_version = auth_info.last_update.version;
+ version_t candidate_version = shard_info.last_update.version;
+ if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
+ auto approx_missing_objects =
+ shard_info.stats.stats.sum.num_objects_missing;
+ if (auth_version > candidate_version) {
+ approx_missing_objects += auth_version - candidate_version;
+ }
+ if (static_cast<uint64_t>(approx_missing_objects) >
+ cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
+ candidates_by_cost.emplace(approx_missing_objects, shard_i);
+ }
+ } else {
+ if (auth_version > candidate_version &&
+ (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
+ candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i));
+ }
+ }
+ }
+
+ psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
+ << dendl;
+
+ // take out as many osds as we can for async recovery, in order of cost
+ for (auto rit = candidates_by_cost.rbegin();
+ rit != candidates_by_cost.rend(); ++rit) {
+ pg_shard_t cur_shard = rit->second;
+ vector<int> candidate_want(*want);
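+ // leave a hole (CRUSH_ITEM_NONE) at this shard's slot instead of erasing
+ // it, so the remaining shards keep their positions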
+ candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
+ if (recoverable_and_ge_min_size(candidate_want)) {
+ want->swap(candidate_want);
+ async_recovery->insert(cur_shard);
+ }
+ }
+ psdout(20) << __func__ << " result want=" << *want
+ << " async_recovery=" << *async_recovery << dendl;
+}
+
+void PeeringState::choose_async_recovery_replicated(
+ const map<pg_shard_t, pg_info_t> &all_info,
+ const pg_info_t &auth_info,
+ vector<int> *want,
+ set<pg_shard_t> *async_recovery,
+ const OSDMapRef osdmap) const
+{
+ set<pair<int, pg_shard_t> > candidates_by_cost;
+ for (auto osd_num : *want) {
+ pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
+ // do not include strays
+ if (stray_set.find(shard_i) != stray_set.end())
+ continue;
+ // Do not include an osd that is not up, since choosing it as
+ // an async_recovery_target will move it out of the acting set.
+ // This results in it being identified as a stray during peering,
+ // because it is no longer in the up or acting set.
+ if (!is_up(shard_i))
+ continue;
+ auto shard_info = all_info.find(shard_i)->second;
+ // use the approximate magnitude of the difference in length of
+ // logs plus historical missing objects as the cost of recovery
+ version_t auth_version = auth_info.last_update.version;
+ version_t candidate_version = shard_info.last_update.version;
+ if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
+ auto approx_missing_objects =
+ shard_info.stats.stats.sum.num_objects_missing;
+ if (auth_version > candidate_version) {
+ approx_missing_objects += auth_version - candidate_version;
+ } else {
+ approx_missing_objects += candidate_version - auth_version;
+ }
+ if (static_cast<uint64_t>(approx_missing_objects) >
+ cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
+ candidates_by_cost.emplace(approx_missing_objects, shard_i);
+ }
+ } else {
+ size_t approx_entries;
+ if (auth_version > candidate_version) {
+ approx_entries = auth_version - candidate_version;
+ } else {
+ approx_entries = candidate_version - auth_version;
+ }
+ if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
+ candidates_by_cost.insert(make_pair(approx_entries, shard_i));
+ }
+ }
+ }
+
+ psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
+ << dendl;
+ // take out as many osds as we can for async recovery, in order of cost
+ for (auto rit = candidates_by_cost.rbegin();
+ rit != candidates_by_cost.rend(); ++rit) {
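+ // stop once removing another osd would drop the acting set below min_size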
+ if (want->size() <= pool.info.min_size) {
+ break;
+ }
+ pg_shard_t cur_shard = rit->second;
+ vector<int> candidate_want(*want);
+ for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
+ if (*it == cur_shard.osd) {
+ candidate_want.erase(it);
+ want->swap(candidate_want);
+ async_recovery->insert(cur_shard);
+ break;
+ }
+ }
+ }
+ psdout(20) << __func__ << " result want=" << *want
+ << " async_recovery=" << *async_recovery << dendl;
+}
+
+/**
+ * choose acting
+ *
+ * calculate the desired acting, and request a change with the monitor
+ * if it differs from the current acting.
+ *
+ * if restrict_to_up_acting=true, we filter out anything that's not in
+ * up/acting. in order to lift this restriction, we need to
+ * 1) check whether it's worth switching the acting set any time we get
+ * a new pg info (not just here, when recovery finishes)
+ * 2) check whether anything in want_acting went down on each new map
+ * (and, if so, calculate a new want_acting)
+ * 3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
+ * TODO!
+ */
+bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
+ bool restrict_to_up_acting,
+ bool *history_les_bound)
+{
+ map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
+ all_info[pg_whoami] = info;
+
+ if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
+ for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
+ p != all_info.end();
+ ++p) {
+ psdout(10) << __func__ << " all_info osd." << p->first << " "
+ << p->second << dendl;
+ }
+ }
+
+ map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
+ find_best_info(all_info, restrict_to_up_acting, history_les_bound);
+
+ if (auth_log_shard == all_info.end()) {
+ if (up != acting) {
+ psdout(10) << __func__ << " no suitable info found (incomplete backfills?),"
+ << " reverting to up" << dendl;
+ want_acting = up;
+ vector<int> empty;
+ pl->queue_want_pg_temp(empty);
+ } else {
+ psdout(10) << __func__ << " failed" << dendl;
+ ceph_assert(want_acting.empty());
+ }
+ return false;
+ }
+
+ ceph_assert(!auth_log_shard->second.is_incomplete());
+ auth_log_shard_id = auth_log_shard->first;
+
+ set<pg_shard_t> want_backfill, want_acting_backfill;
+ vector<int> want;
+ stringstream ss;
+ if (pool.info.is_replicated())
+ calc_replicated_acting(
+ auth_log_shard,
+ cct->_conf.get_val<uint64_t>(
+ "osd_force_auth_primary_missing_objects"),
+ get_osdmap()->get_pg_size(info.pgid.pgid),
+ acting,
+ up,
+ up_primary,
+ all_info,
+ restrict_to_up_acting,
+ &want,
+ &want_backfill,
+ &want_acting_backfill,
+ get_osdmap(),
+ ss);
+ else
+ calc_ec_acting(
+ auth_log_shard,
+ get_osdmap()->get_pg_size(info.pgid.pgid),
+ acting,
+ up,
+ all_info,
+ restrict_to_up_acting,
+ &want,
+ &want_backfill,
+ &want_acting_backfill,
+ ss);
+ psdout(10) << ss.str() << dendl;
+
+ if (!recoverable_and_ge_min_size(want)) {
+ want_acting.clear();
+ return false;
+ }
+
+ set<pg_shard_t> want_async_recovery;
+ if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
+ if (pool.info.is_erasure()) {
+ choose_async_recovery_ec(
+ all_info, auth_log_shard->second, &want, &want_async_recovery,
+ get_osdmap());
+ } else {
+ choose_async_recovery_replicated(
+ all_info, auth_log_shard->second, &want, &want_async_recovery,
+ get_osdmap());
+ }
+ }
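+ // the computed set differs from the current acting set: record it in
+ // want_acting and ask the monitor for a pg_temp change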
+ if (want != acting) {
+ psdout(10) << __func__ << " want " << want << " != acting " << acting
+ << ", requesting pg_temp change" << dendl;
+ want_acting = want;
+
+ if (!cct->_conf->osd_debug_no_acting_change) {
+ if (want_acting == up) {
+ // There can't be any pending backfill if
+ // want is the same as crush map up OSDs.
+ ceph_assert(want_backfill.empty());
+ vector<int> empty;
+ pl->queue_want_pg_temp(empty);
+ } else
+ pl->queue_want_pg_temp(want);
+ }
+ return false;
+ }
+ want_acting.clear();
+ acting_recovery_backfill = want_acting_backfill;
+ psdout(10) << "acting_recovery_backfill is "
+ << acting_recovery_backfill << dendl;
+ ceph_assert(
+ backfill_targets.empty() ||
+ backfill_targets == want_backfill);
+ if (backfill_targets.empty()) {
+ // Caller is GetInfo
+ backfill_targets = want_backfill;
+ }
+ // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
+ ceph_assert(
+ async_recovery_targets.empty() ||
+ async_recovery_targets == want_async_recovery ||
+ !needs_recovery());
+ if (async_recovery_targets.empty() || !needs_recovery()) {
+ async_recovery_targets = want_async_recovery;
+ }
+ // Will not change if already set because up would have had to change
+ // Verify that nothing in backfill is in stray_set
+ for (set<pg_shard_t>::iterator i = want_backfill.begin();
+ i != want_backfill.end();
+ ++i) {
+ ceph_assert(stray_set.find(*i) == stray_set.end());
+ }
+ psdout(10) << "choose_acting want=" << want << " backfill_targets="
+ << want_backfill << " async_recovery_targets="
+ << async_recovery_targets << dendl;
+ return true;
+}
+
/*------------ Peering State Machine----------------*/
// adjust acting set? (e.g. because backfill completed...)
bool history_les_bound = false;
- if (ps->acting != ps->up && !pg->choose_acting(auth_log_shard,
+ if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard,
true, &history_les_bound)) {
ceph_assert(ps->want_acting.size());
} else if (!ps->async_recovery_targets.empty()) {
- pg->choose_acting(auth_log_shard, true, &history_les_bound);
+ ps->choose_acting(auth_log_shard, true, &history_les_bound);
}
if (context< Active >().all_replicas_activated &&
DECLARE_LOCALS
// adjust acting?
- if (!pg->choose_acting(auth_log_shard, false,
+ if (!ps->choose_acting(auth_log_shard, false,
&context< Peering >().history_les_bound)) {
if (!ps->want_acting.empty()) {
post_event(NeedActingChange());
void reject_reservation();
+ // acting set
+ map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
+ const map<pg_shard_t, pg_info_t> &infos,
+ bool restrict_to_up_acting,
+ bool *history_les_bound) const;
+ static void calc_ec_acting(
+ map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+ unsigned size,
+ const vector<int> &acting,
+ const vector<int> &up,
+ const map<pg_shard_t, pg_info_t> &all_info,
+ bool restrict_to_up_acting,
+ vector<int> *want,
+ set<pg_shard_t> *backfill,
+ set<pg_shard_t> *acting_backfill,
+ ostream &ss);
+ static void calc_replicated_acting(
+ map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+ uint64_t force_auth_primary_missing_objects,
+ unsigned size,
+ const vector<int> &acting,
+ const vector<int> &up,
+ pg_shard_t up_primary,
+ const map<pg_shard_t, pg_info_t> &all_info,
+ bool restrict_to_up_acting,
+ vector<int> *want,
+ set<pg_shard_t> *backfill,
+ set<pg_shard_t> *acting_backfill,
+ const OSDMapRef osdmap,
+ ostream &ss);
+ void choose_async_recovery_ec(
+ const map<pg_shard_t, pg_info_t> &all_info,
+ const pg_info_t &auth_info,
+ vector<int> *want,
+ set<pg_shard_t> *async_recovery,
+ const OSDMapRef osdmap) const;
+ void choose_async_recovery_replicated(
+ const map<pg_shard_t, pg_info_t> &all_info,
+ const pg_info_t &auth_info,
+ vector<int> *want,
+ set<pg_shard_t> *async_recovery,
+ const OSDMapRef osdmap) const;
+
+ bool recoverable_and_ge_min_size(const vector<int> &want) const;
+ bool choose_acting(pg_shard_t &auth_log_shard,
+ bool restrict_to_up_acting,
+ bool *history_les_bound);
+
public:
PeeringState(
CephContext *cct,
const OSDMap *osdmap,
const OSDMap *lastmap,
pg_t pgid,
- IsPGRecoverablePredicate &could_have_gone_active,
+ const IsPGRecoverablePredicate &could_have_gone_active,
PastIntervals *past_intervals,
std::ostream *out)
{
std::shared_ptr<const OSDMap> osdmap, ///< [in] current map
std::shared_ptr<const OSDMap> lastmap, ///< [in] last map
pg_t pgid, ///< [in] pgid for pg
- IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active
+ const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active
PastIntervals *past_intervals, ///< [out] intervals
std::ostream *out = 0 ///< [out] debug ostream
) {
std::map<int, epoch_t> blocked_by; ///< current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
bool pg_down = false; ///< some down osds are included in @a cur; the DOWN pg state bit should be set.
- std::unique_ptr<IsPGRecoverablePredicate> pcontdec;
+ const IsPGRecoverablePredicate* pcontdec = nullptr;
PriorSet() = default;
PriorSet(PriorSet &&) = default;
std::set<int> down,
std::map<int, epoch_t> blocked_by,
bool pg_down,
- IsPGRecoverablePredicate *pcontdec)
+ const IsPGRecoverablePredicate *pcontdec)
: ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by),
pg_down(pg_down), pcontdec(pcontdec) {}
const PastIntervals &past_intervals,
bool ec_pool,
epoch_t last_epoch_started,
- IsPGRecoverablePredicate *c,
+ const IsPGRecoverablePredicate *c,
F f,
const std::vector<int> &up,
const std::vector<int> &acting,
const PastIntervals &past_intervals,
bool ec_pool,
epoch_t last_epoch_started,
- IsPGRecoverablePredicate *c,
+ const IsPGRecoverablePredicate *c,
F f,
const std::vector<int> &up,
const std::vector<int> &acting,