From bc839f453d7079be80586eb09bae2ef4bc025e8b Mon Sep 17 00:00:00 2001
From: "sjust@redhat.com" <sjust@redhat.com>
Date: Mon, 29 Apr 2019 13:55:20 -0700
Subject: [PATCH] osd/: move choose_acting and callees into PeeringState

Also marks IsPGRecoverablePredicate users const to allow simply reusing
the instance on MissingLoc.

Signed-off-by: Samuel Just <sjust@redhat.com>
---
 src/osd/MissingLoc.h    |   2 +-
 src/osd/PG.cc           | 695 --------------------------------------
 src/osd/PG.h            |  44 ---
 src/osd/PeeringState.cc | 715 +++++++++++++++++++++++++++++++++++++++-
 src/osd/PeeringState.h  |  48 +++
 src/osd/osd_types.cc    |   2 +-
 src/osd/osd_types.h     |  10 +-
 7 files changed, 767 insertions(+), 749 deletions(-)

diff --git a/src/osd/MissingLoc.h b/src/osd/MissingLoc.h
index 87c0d3c4cea..4ebc5520c76 100644
--- a/src/osd/MissingLoc.h
+++ b/src/osd/MissingLoc.h
@@ -120,7 +120,7 @@ class MissingLoc {
     is_readable.reset(_is_readable);
     is_recoverable.reset(_is_recoverable);
   }
-  IsPGRecoverablePredicate &get_recoverable_predicate() {
+  const IsPGRecoverablePredicate &get_recoverable_predicate() const {
     return *is_recoverable;
   }
   std::ostream& gen_prefix(std::ostream& out) const {
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 06d2da98647..489669666ef 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -551,701 +551,6 @@ PG::Scrubber::Scrubber()
 
 PG::Scrubber::~Scrubber() {}
 
-/**
- * find_best_info
- *
- * Returns an iterator to the best info in infos sorted by:
- *  1) Prefer newer last_update
- *  2) Prefer longer tail if it brings another info into contiguity
- *  3) Prefer current primary
- */
-map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
-  const map<pg_shard_t, pg_info_t> &infos,
-  bool restrict_to_up_acting,
-  bool *history_les_bound) const
-{
-  ceph_assert(history_les_bound);
-  /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
-   * to make changes to this process.  Also, make sure to update it
-   * when you find bugs! */
-  eversion_t min_last_update_acceptable = eversion_t::max();
-  epoch_t max_last_epoch_started_found = 0;
-  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
-       i != infos.end();
-       ++i) {
-    if (!cct->_conf->osd_find_best_info_ignore_history_les &&
-        max_last_epoch_started_found < i->second.history.last_epoch_started) {
-      *history_les_bound = true;
-      max_last_epoch_started_found = i->second.history.last_epoch_started;
-    }
-    if (!i->second.is_incomplete() &&
-        max_last_epoch_started_found < i->second.last_epoch_started) {
-      *history_les_bound = false;
-      max_last_epoch_started_found = i->second.last_epoch_started;
-    }
-  }
-  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
-       i != infos.end();
-       ++i) {
-    if (max_last_epoch_started_found <= i->second.last_epoch_started) {
-      if (min_last_update_acceptable > i->second.last_update)
-        min_last_update_acceptable = i->second.last_update;
-    }
-  }
-  if (min_last_update_acceptable == eversion_t::max())
-    return infos.end();
-
-  map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
-  // find osd with newest last_update (oldest for ec_pool).
-  // if there are multiples, prefer
-  //  - a longer tail, if it brings another peer into log contiguity
-  //  - the current primary
-  for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
-       p != infos.end();
-       ++p) {
-    if (restrict_to_up_acting && !is_up(p->first) &&
-        !is_acting(p->first))
-      continue;
-    // Only consider peers with last_update >= min_last_update_acceptable
-    if (p->second.last_update < min_last_update_acceptable)
-      continue;
-    // Disqualify anyone with a too old last_epoch_started
-    if (p->second.last_epoch_started < max_last_epoch_started_found)
-      continue;
-    // Disqualify anyone who is incomplete (not fully backfilled)
-    if (p->second.is_incomplete())
-      continue;
-    if (best == infos.end()) {
-      best = p;
-      continue;
-    }
-    // Prefer newer last_update
-    if (pool.info.require_rollback()) {
-      if (p->second.last_update > best->second.last_update)
-        continue;
-      if (p->second.last_update < best->second.last_update) {
-        best = p;
-        continue;
-      }
-    } else {
-      if (p->second.last_update < best->second.last_update)
-        continue;
-      if (p->second.last_update > best->second.last_update) {
-        best = p;
-        continue;
-      }
-    }
-
-    // Prefer longer tail
-    if (p->second.log_tail > best->second.log_tail) {
-      continue;
-    } else if (p->second.log_tail < best->second.log_tail) {
-      best = p;
-      continue;
-    }
-
-    if (!p->second.has_missing() && best->second.has_missing()) {
-      dout(10) << __func__ << " prefer osd." << p->first
-               << " because it is complete while best has missing"
-               << dendl;
-      best = p;
-      continue;
-    } else if (p->second.has_missing() && !best->second.has_missing()) {
-      dout(10) << __func__ << " skipping osd." << p->first
-               << " because it has missing while best is complete"
-               << dendl;
-      continue;
-    } else {
-      // both are complete or have missing
-      // fall through
-    }
-
-    // prefer current primary (usually the caller), all things being equal
-    if (p->first == pg_whoami) {
-      dout(10) << "calc_acting prefer osd." << p->first
-               << " because it is current primary" << dendl;
-      best = p;
-      continue;
-    }
-  }
-  return best;
-}
-
-void PG::calc_ec_acting(
-  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
-  unsigned size,
-  const vector<int> &acting,
-  const vector<int> &up,
-  const map<pg_shard_t, pg_info_t> &all_info,
-  bool restrict_to_up_acting,
-  vector<int> *_want,
-  set<pg_shard_t> *backfill,
-  set<pg_shard_t> *acting_backfill,
-  ostream &ss)
-{
-  vector<int> want(size, CRUSH_ITEM_NONE);
-  map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
-  for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
-       i != all_info.end();
-       ++i) {
-    all_info_by_shard[i->first.shard].insert(i->first);
-  }
-  for (uint8_t i = 0; i < want.size(); ++i) {
-    ss << "For position " << (unsigned)i << ": ";
-    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
-        !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
-        all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
-        auth_log_shard->second.log_tail) {
-      ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
-      want[i] = up[i];
-      continue;
-    }
-    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
-      ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
-         << " and ";
-      backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
-    }
-
-    if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
-        !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
-        all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
-        auth_log_shard->second.log_tail) {
-      ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
-      want[i] = acting[i];
-    } else if (!restrict_to_up_acting) {
-      for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
-           j != all_info_by_shard[shard_id_t(i)].end();
-           ++j) {
-        ceph_assert(j->shard == i);
-        if (!all_info.find(*j)->second.is_incomplete() &&
-            all_info.find(*j)->second.last_update >=
-            auth_log_shard->second.log_tail) {
-          ss << " selecting stray: " << *j << std::endl;
-          want[i] = j->osd;
-          break;
-        }
-      }
-      if (want[i] == CRUSH_ITEM_NONE)
-        ss << " failed to fill position " << (int)i << std::endl;
-    }
-  }
-
-  for (uint8_t i = 0; i < want.size(); ++i) {
-    if (want[i] != CRUSH_ITEM_NONE) {
-      acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
-    }
-  }
-  acting_backfill->insert(backfill->begin(), backfill->end());
-  _want->swap(want);
-}
-
-/**
- * calculate the desired acting set.
- *
- * Choose an appropriate acting set.  Prefer up[0], unless it is
- * incomplete, or another osd has a longer tail that allows us to
- * bring other up nodes up to date.
- */
-void PG::calc_replicated_acting(
-  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
-  uint64_t force_auth_primary_missing_objects,
-  unsigned size,
-  const vector<int> &acting,
-  const vector<int> &up,
-  pg_shard_t up_primary,
-  const map<pg_shard_t, pg_info_t> &all_info,
-  bool restrict_to_up_acting,
-  vector<int> *want,
-  set<pg_shard_t> *backfill,
-  set<pg_shard_t> *acting_backfill,
-  const OSDMapRef osdmap,
-  ostream &ss)
-{
-  pg_shard_t auth_log_shard_id = auth_log_shard->first;
-
-  ss << __func__ << " newest update on osd." << auth_log_shard_id
-     << " with " << auth_log_shard->second
-     << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
-
-  // select primary
-  auto primary = all_info.find(up_primary);
-  if (up.size() &&
-      !primary->second.is_incomplete() &&
-      primary->second.last_update >=
-        auth_log_shard->second.log_tail) {
-    if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
-      auto approx_missing_objects =
-        primary->second.stats.stats.sum.num_objects_missing;
-      auto auth_version = auth_log_shard->second.last_update.version;
-      auto primary_version = primary->second.last_update.version;
-      if (auth_version > primary_version) {
-        approx_missing_objects += auth_version - primary_version;
-      } else {
-        approx_missing_objects += primary_version - auth_version;
-      }
-      if ((uint64_t)approx_missing_objects >
-          force_auth_primary_missing_objects) {
-        primary = auth_log_shard;
-        ss << "up_primary: " << up_primary << ") has approximate "
-           << approx_missing_objects
-           << "(>" << force_auth_primary_missing_objects << ") "
-           << "missing objects, osd." << auth_log_shard_id
-           << " selected as primary instead"
-           << std::endl;
-      } else {
-        ss << "up_primary: " << up_primary << ") selected as primary"
-           << std::endl;
-      }
-    } else {
-      ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
-    }
-  } else {
-    ceph_assert(!auth_log_shard->second.is_incomplete());
-    ss << "up[0] needs backfill, osd." << auth_log_shard_id
-       << " selected as primary instead" << std::endl;
-    primary = auth_log_shard;
-  }
-
-  ss << __func__ << " primary is osd." << primary->first
-     << " with " << primary->second << std::endl;
-  want->push_back(primary->first.osd);
-  acting_backfill->insert(primary->first);
-
-  /* We include auth_log_shard->second.log_tail because in GetLog,
-   * we will request logs back to the min last_update over our
-   * acting_backfill set, which will result in our log being extended
-   * as far backwards as necessary to pick up any peers which can
-   * be log recovered by auth_log_shard's log */
-  eversion_t oldest_auth_log_entry =
-    std::min(primary->second.log_tail, auth_log_shard->second.log_tail);
-
-  // select replicas that have log contiguity with primary.
-  // prefer up, then acting, then any peer_info osds
-  for (auto i : up) {
-    pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
-    if (up_cand == primary->first)
-      continue;
-    const pg_info_t &cur_info = all_info.find(up_cand)->second;
-    if (cur_info.is_incomplete() ||
-        cur_info.last_update < oldest_auth_log_entry) {
-      ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
-      backfill->insert(up_cand);
-      acting_backfill->insert(up_cand);
-    } else {
-      want->push_back(i);
-      acting_backfill->insert(up_cand);
-      ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
-    }
-    if (want->size() >= size) {
-      break;
-    }
-  }
-
-  if (want->size() >= size) {
-    return;
-  }
-
-  std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
-  candidate_by_last_update.reserve(acting.size());
-  // This no longer has backfill OSDs, but they are covered above.
-  for (auto i : acting) {
-    pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
-    // skip up osds we already considered above
-    if (acting_cand == primary->first)
-      continue;
-    vector<int>::const_iterator up_it = find(up.begin(), up.end(), i);
-    if (up_it != up.end())
-      continue;
-
-    const pg_info_t &cur_info = all_info.find(acting_cand)->second;
-    if (cur_info.is_incomplete() ||
-        cur_info.last_update < oldest_auth_log_entry) {
-      ss << " shard " << acting_cand << " (acting) REJECTED "
-         << cur_info << std::endl;
-    } else {
-      candidate_by_last_update.emplace_back(cur_info.last_update, i);
-    }
-  }
-
-  auto sort_by_eversion = [](const std::pair<eversion_t, int> &lhs,
-                             const std::pair<eversion_t, int> &rhs) {
-    return lhs.first > rhs.first;
-  };
-  // sort by last_update, in descending order.
-  std::sort(candidate_by_last_update.begin(),
-            candidate_by_last_update.end(), sort_by_eversion);
-  for (auto &p : candidate_by_last_update) {
-    ceph_assert(want->size() < size);
-    want->push_back(p.second);
-    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
-    acting_backfill->insert(s);
-    ss << " shard " << s << " (acting) accepted "
-       << all_info.find(s)->second << std::endl;
-    if (want->size() >= size) {
-      return;
-    }
-  }
-
-  if (restrict_to_up_acting) {
-    return;
-  }
-  candidate_by_last_update.clear();
-  candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
-  // continue to search stray to find more suitable peers
-  for (auto &i : all_info) {
-    // skip up osds we already considered above
-    if (i.first == primary->first)
-      continue;
-    vector<int>::const_iterator up_it = find(up.begin(), up.end(), i.first.osd);
-    if (up_it != up.end())
-      continue;
-    vector<int>::const_iterator acting_it = find(
-      acting.begin(), acting.end(), i.first.osd);
-    if (acting_it != acting.end())
-      continue;
-
-    if (i.second.is_incomplete() ||
-        i.second.last_update < oldest_auth_log_entry) {
-      ss << " shard " << i.first << " (stray) REJECTED " << i.second
-         << std::endl;
-    } else {
-      candidate_by_last_update.emplace_back(i.second.last_update, i.first.osd);
-    }
-  }
-
-  if (candidate_by_last_update.empty()) {
-    // save us some effort
-    return;
-  }
-
-  // sort by last_update, in descending order.
-  std::sort(candidate_by_last_update.begin(),
-            candidate_by_last_update.end(), sort_by_eversion);
-
-  for (auto &p : candidate_by_last_update) {
-    ceph_assert(want->size() < size);
-    want->push_back(p.second);
-    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
-    acting_backfill->insert(s);
-    ss << " shard " << s << " (stray) accepted "
-       << all_info.find(s)->second << std::endl;
-    if (want->size() >= size) {
-      return;
-    }
-  }
-}
-
-bool PG::recoverable_and_ge_min_size(const vector<int> &want) const
-{
-  unsigned num_want_acting = 0;
-  set<pg_shard_t> have;
-  for (int i = 0; i < (int)want.size(); ++i) {
-    if (want[i] != CRUSH_ITEM_NONE) {
-      ++num_want_acting;
-      have.insert(
-        pg_shard_t(
-          want[i],
-          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
-    }
-  }
-  // We go incomplete if below min_size for ec_pools since backfill
-  // does not currently maintain rollbackability
-  // Otherwise, we will go "peered", but not "active"
-  if (num_want_acting < pool.info.min_size &&
-      (pool.info.is_erasure() ||
-       !cct->_conf->osd_allow_recovery_below_min_size)) {
-    dout(10) << __func__ << " failed, below min size" << dendl;
-    return false;
-  }
-
-  /* Check whether we have enough acting shards to later perform recovery */
-  boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
-    get_pgbackend()->get_is_recoverable_predicate());
-  if (!(*recoverable_predicate)(have)) {
-    dout(10) << __func__ << " failed, not recoverable" << dendl;
-    return false;
-  }
-
-  return true;
-}
-
-void PG::choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info,
-                                  const pg_info_t &auth_info,
-                                  vector<int> *want,
-                                  set<pg_shard_t> *async_recovery,
-                                  const OSDMapRef osdmap) const
-{
-  set<pair<version_t, pg_shard_t> > candidates_by_cost;
-  for (uint8_t i = 0; i < want->size(); ++i) {
-    if ((*want)[i] == CRUSH_ITEM_NONE)
-      continue;
-
-    // Considering log entries to recover is accurate enough for
-    // now. We could use minimum_to_decode_with_cost() later if
-    // necessary.
-    pg_shard_t shard_i((*want)[i], shard_id_t(i));
-    // do not include strays
-    if (stray_set.find(shard_i) != stray_set.end())
-      continue;
-    // Do not include an osd that is not up, since choosing it as
-    // an async_recovery_target will move it out of the acting set.
-    // This results in it being identified as a stray during peering,
-    // because it is no longer in the up or acting set.
-    if (!is_up(shard_i))
-      continue;
-    auto shard_info = all_info.find(shard_i)->second;
-    // for ec pools we rollback all entries past the authoritative
-    // last_update *before* activation. This is relatively inexpensive
-    // compared to recovery, since it is purely local, so treat shards
-    // past the authoritative last_update the same as those equal to it.
-    version_t auth_version = auth_info.last_update.version;
-    version_t candidate_version = shard_info.last_update.version;
-    if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
-      auto approx_missing_objects =
-        shard_info.stats.stats.sum.num_objects_missing;
-      if (auth_version > candidate_version) {
-        approx_missing_objects += auth_version - candidate_version;
-      }
-      if (static_cast<uint64_t>(approx_missing_objects) >
-          cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
-        candidates_by_cost.emplace(approx_missing_objects, shard_i);
-      }
-    } else {
-      if (auth_version > candidate_version &&
-          (auth_version - candidate_version) >
-          cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
-        candidates_by_cost.insert(
-          make_pair(auth_version - candidate_version, shard_i));
-      }
-    }
-  }
-
-  dout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
-           << dendl;
-
-  // take out as many osds as we can for async recovery, in order of cost
-  for (auto rit = candidates_by_cost.rbegin();
-       rit != candidates_by_cost.rend(); ++rit) {
-    pg_shard_t cur_shard = rit->second;
-    vector<int> candidate_want(*want);
-    candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
-    if (recoverable_and_ge_min_size(candidate_want)) {
-      want->swap(candidate_want);
-      async_recovery->insert(cur_shard);
-    }
-  }
-  dout(20) << __func__ << " result want=" << *want
-           << " async_recovery=" << *async_recovery << dendl;
-}
-
-void PG::choose_async_recovery_replicated(const map<pg_shard_t, pg_info_t> &all_info,
-                                          const pg_info_t &auth_info,
-                                          vector<int> *want,
-                                          set<pg_shard_t> *async_recovery,
-                                          const OSDMapRef osdmap) const
-{
-  set<pair<version_t, pg_shard_t> > candidates_by_cost;
-  for (auto osd_num : *want) {
-    pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
-    // do not include strays
-    if (stray_set.find(shard_i) != stray_set.end())
-      continue;
-    // Do not include an osd that is not up, since choosing it as
-    // an async_recovery_target will move it out of the acting set.
-    // This results in it being identified as a stray during peering,
-    // because it is no longer in the up or acting set.
-    if (!is_up(shard_i))
-      continue;
-    auto shard_info = all_info.find(shard_i)->second;
-    // use the approximate magnitude of the difference in length of
-    // logs plus historical missing objects as the cost of recovery
-    version_t auth_version = auth_info.last_update.version;
-    version_t candidate_version = shard_info.last_update.version;
-    if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
-      auto approx_missing_objects =
-        shard_info.stats.stats.sum.num_objects_missing;
-      if (auth_version > candidate_version) {
-        approx_missing_objects += auth_version - candidate_version;
-      } else {
-        approx_missing_objects += candidate_version - auth_version;
-      }
-      if (static_cast<uint64_t>(approx_missing_objects) >
-          cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
-        candidates_by_cost.emplace(approx_missing_objects, shard_i);
-      }
-    } else {
-      size_t approx_entries;
-      if (auth_version > candidate_version) {
-        approx_entries = auth_version - candidate_version;
-      } else {
-        approx_entries = candidate_version - auth_version;
-      }
-      if (approx_entries >
-          cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
-        candidates_by_cost.insert(make_pair(approx_entries, shard_i));
-      }
-    }
-  }
-
-  dout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
-           << dendl;
-  // take out as many osds as we can for async recovery, in order of cost
-  for (auto rit = candidates_by_cost.rbegin();
-       rit != candidates_by_cost.rend(); ++rit) {
-    if (want->size() <= pool.info.min_size) {
-      break;
-    }
-    pg_shard_t cur_shard = rit->second;
-    vector<int> candidate_want(*want);
-    for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
-      if (*it == cur_shard.osd) {
-        candidate_want.erase(it);
-        want->swap(candidate_want);
-        async_recovery->insert(cur_shard);
-        break;
-      }
-    }
-  }
-  dout(20) << __func__ << " result want=" << *want
-           << " async_recovery=" << *async_recovery << dendl;
-}
-
-/**
- * choose acting
- *
- * calculate the desired acting, and request a change with the monitor
- * if it differs from the current acting.
- *
- * if restrict_to_up_acting=true, we filter out anything that's not in
- * up/acting.  in order to lift this restriction, we need to
- *  1) check whether it's worth switching the acting set any time we get
- *     a new pg info (not just here, when recovery finishes)
- *  2) check whether anything in want_acting went down on each new map
- *     (and, if so, calculate a new want_acting)
- *  3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
- * TODO!
- */
-bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
-                       bool restrict_to_up_acting,
-                       bool *history_les_bound)
-{
-  map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
-  all_info[pg_whoami] = info;
-
-  if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
-    for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
-         p != all_info.end();
-         ++p) {
-      dout(10) << __func__ << " all_info osd." << p->first << " " << p->second << dendl;
-    }
-  }
-
-  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
-    find_best_info(all_info, restrict_to_up_acting, history_les_bound);
-
-  if (auth_log_shard == all_info.end()) {
-    if (up != acting) {
-      dout(10) << __func__ << " no suitable info found (incomplete backfills?),"
-               << " reverting to up" << dendl;
-      want_acting = up;
-      vector<int> empty;
-      osd->queue_want_pg_temp(info.pgid.pgid, empty);
-    } else {
-      dout(10) << __func__ << " failed" << dendl;
-      ceph_assert(want_acting.empty());
-    }
-    return false;
-  }
-
-  ceph_assert(!auth_log_shard->second.is_incomplete());
-  auth_log_shard_id = auth_log_shard->first;
-
-  set<pg_shard_t> want_backfill, want_acting_backfill;
-  vector<int> want;
-  stringstream ss;
-  if (pool.info.is_replicated())
-    calc_replicated_acting(
-      auth_log_shard,
-      cct->_conf.get_val<uint64_t>(
-        "osd_force_auth_primary_missing_objects"),
-      get_osdmap()->get_pg_size(info.pgid.pgid),
-      acting,
-      up,
-      up_primary,
-      all_info,
-      restrict_to_up_acting,
-      &want,
-      &want_backfill,
-      &want_acting_backfill,
-      get_osdmap(),
-      ss);
-  else
-    calc_ec_acting(
-      auth_log_shard,
-      get_osdmap()->get_pg_size(info.pgid.pgid),
-      acting,
-      up,
-      all_info,
-      restrict_to_up_acting,
-      &want,
-      &want_backfill,
-      &want_acting_backfill,
-      ss);
-  dout(10) << ss.str() << dendl;
-
-  if (!recoverable_and_ge_min_size(want)) {
-    want_acting.clear();
-    return false;
-  }
-
-  set<pg_shard_t> want_async_recovery;
-  if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
-    if (pool.info.is_erasure()) {
-      choose_async_recovery_ec(all_info, auth_log_shard->second, &want, &want_async_recovery, get_osdmap());
-    } else {
-      choose_async_recovery_replicated(all_info, auth_log_shard->second, &want, &want_async_recovery, get_osdmap());
-    }
-  }
-  if (want != acting) {
-    dout(10) << __func__ << " want " << want << " != acting " << acting
-             << ", requesting pg_temp change" << dendl;
-    want_acting = want;
-
-    if (!cct->_conf->osd_debug_no_acting_change) {
-      if (want_acting == up) {
-        // There can't be any pending backfill if
-        // want is the same as crush map up OSDs.
-        ceph_assert(want_backfill.empty());
-        vector<int> empty;
-        osd->queue_want_pg_temp(info.pgid.pgid, empty);
-      } else
-        osd->queue_want_pg_temp(info.pgid.pgid, want);
-    }
-    return false;
-  }
-  want_acting.clear();
-  acting_recovery_backfill = want_acting_backfill;
-  dout(10) << "acting_recovery_backfill is " << acting_recovery_backfill << dendl;
-  ceph_assert(backfill_targets.empty() || backfill_targets == want_backfill);
-  if (backfill_targets.empty()) {
-    // Caller is GetInfo
-    backfill_targets = want_backfill;
-  }
-  // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
-  ceph_assert(async_recovery_targets.empty() || async_recovery_targets == want_async_recovery || !needs_recovery());
-  if (async_recovery_targets.empty() || !needs_recovery()) {
-    async_recovery_targets = want_async_recovery;
-  }
-  // Will not change if already set because up would have had to change
-  // Verify that nothing in backfill is in stray_set
-  for (set<pg_shard_t>::iterator i = want_backfill.begin();
-       i != want_backfill.end();
-       ++i) {
-    ceph_assert(stray_set.find(*i) == stray_set.end());
-  }
-  dout(10) << "choose_acting want=" << want << " backfill_targets="
-           << want_backfill << " async_recovery_targets="
-           << async_recovery_targets << dendl;
-  return true;
-}
-
 /* Build the might_have_unfound set.
  *
  * This is used by the primary OSD during recovery.
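Taken together, find_best_info reduces to a lexicographic preference over the surviving candidates: newest last_update first (inverted to oldest for pools that require rollback, i.e. EC pools), then the longer log tail, then the current primary as a tie-breaker. The standalone sketch below illustrates only that three-key comparison; all type and function names in it (PeerInfo, better_info, pick_best) are simplified stand-ins invented for illustration, not Ceph code, and the real function additionally filters candidates by min_last_update_acceptable, last_epoch_started, and incompleteness before comparing.

// Minimal standalone sketch, not Ceph code: the preference order used by
// find_best_info, over simplified stand-in types.
#include <vector>

struct PeerInfo {
  int id;                    // stand-in for pg_shard_t
  unsigned last_update;      // stand-in for eversion_t
  unsigned log_tail;         // smaller tail = log extends further back
  bool is_current_primary;
};

// Returns true if a is a strictly better authoritative-log candidate than b
// (replicated-pool ordering; EC pools invert the last_update comparison).
bool better_info(const PeerInfo &a, const PeerInfo &b) {
  if (a.last_update != b.last_update)
    return a.last_update > b.last_update;            // 1) newer last_update
  if (a.log_tail != b.log_tail)
    return a.log_tail < b.log_tail;                  // 2) longer tail
  return a.is_current_primary && !b.is_current_primary;  // 3) current primary
}

const PeerInfo *pick_best(const std::vector<PeerInfo> &peers) {
  const PeerInfo *best = nullptr;
  for (const auto &p : peers)
    if (!best || better_info(p, *best))
      best = &p;   // keep the first candidate that wins every comparison
  return best;
}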
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 785a742436a..6e21b99ee23 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -1045,50 +1045,6 @@ protected:
 
   void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);
 
-  map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
-    const map<pg_shard_t, pg_info_t> &infos,
-    bool restrict_to_up_acting,
-    bool *history_les_bound) const;
-  static void calc_ec_acting(
-    map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
-    unsigned size,
-    const vector<int> &acting,
-    const vector<int> &up,
-    const map<pg_shard_t, pg_info_t> &all_info,
-    bool restrict_to_up_acting,
-    vector<int> *want,
-    set<pg_shard_t> *backfill,
-    set<pg_shard_t> *acting_backfill,
-    ostream &ss);
-  static void calc_replicated_acting(
-    map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
-    uint64_t force_auth_primary_missing_objects,
-    unsigned size,
-    const vector<int> &acting,
-    const vector<int> &up,
-    pg_shard_t up_primary,
-    const map<pg_shard_t, pg_info_t> &all_info,
-    bool restrict_to_up_acting,
-    vector<int> *want,
-    set<pg_shard_t> *backfill,
-    set<pg_shard_t> *acting_backfill,
-    const OSDMapRef osdmap,
-    ostream &ss);
-  void choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info,
-                                const pg_info_t &auth_info,
-                                vector<int> *want,
-                                set<pg_shard_t> *async_recovery,
-                                const OSDMapRef osdmap) const;
-  void choose_async_recovery_replicated(const map<pg_shard_t, pg_info_t> &all_info,
-                                        const pg_info_t &auth_info,
-                                        vector<int> *want,
-                                        set<pg_shard_t> *async_recovery,
-                                        const OSDMapRef osdmap) const;
-
-  bool recoverable_and_ge_min_size(const vector<int> &want) const;
-  bool choose_acting(pg_shard_t &auth_log_shard,
-                     bool restrict_to_up_acting,
-                     bool *history_les_bound);
   void build_might_have_unfound();
   void activate(
     ObjectStore::Transaction& t,
diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc
index a546ea0754b..e19a179d03a 100644
--- a/src/osd/PeeringState.cc
+++ b/src/osd/PeeringState.cc
@@ -1127,6 +1127,715 @@ void PeeringState::reject_reservation()
     get_osdmap_epoch());
 }
 
+/**
+ * find_best_info
+ *
+ * Returns an iterator to the best info in infos sorted by:
+ *  1) Prefer newer last_update
+ *  2) Prefer longer tail if it brings another info into contiguity
+ *  3) Prefer current primary
+ */
+map<pg_shard_t, pg_info_t>::const_iterator PeeringState::find_best_info(
+  const map<pg_shard_t, pg_info_t> &infos,
+  bool restrict_to_up_acting,
+  bool *history_les_bound) const
+{
+  ceph_assert(history_les_bound);
+  /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
+   * to make changes to this process.  Also, make sure to update it
+   * when you find bugs! */
+  eversion_t min_last_update_acceptable = eversion_t::max();
+  epoch_t max_last_epoch_started_found = 0;
+  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
+       i != infos.end();
+       ++i) {
+    if (!cct->_conf->osd_find_best_info_ignore_history_les &&
+        max_last_epoch_started_found < i->second.history.last_epoch_started) {
+      *history_les_bound = true;
+      max_last_epoch_started_found = i->second.history.last_epoch_started;
+    }
+    if (!i->second.is_incomplete() &&
+        max_last_epoch_started_found < i->second.last_epoch_started) {
+      *history_les_bound = false;
+      max_last_epoch_started_found = i->second.last_epoch_started;
+    }
+  }
+  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
+       i != infos.end();
+       ++i) {
+    if (max_last_epoch_started_found <= i->second.last_epoch_started) {
+      if (min_last_update_acceptable > i->second.last_update)
+        min_last_update_acceptable = i->second.last_update;
+    }
+  }
+  if (min_last_update_acceptable == eversion_t::max())
+    return infos.end();
+
+  map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
+  // find osd with newest last_update (oldest for ec_pool).
+  // if there are multiples, prefer
+  //  - a longer tail, if it brings another peer into log contiguity
+  //  - the current primary
+  for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
+       p != infos.end();
+       ++p) {
+    if (restrict_to_up_acting && !is_up(p->first) &&
+        !is_acting(p->first))
+      continue;
+    // Only consider peers with last_update >= min_last_update_acceptable
+    if (p->second.last_update < min_last_update_acceptable)
+      continue;
+    // Disqualify anyone with a too old last_epoch_started
+    if (p->second.last_epoch_started < max_last_epoch_started_found)
+      continue;
+    // Disqualify anyone who is incomplete (not fully backfilled)
+    if (p->second.is_incomplete())
+      continue;
+    if (best == infos.end()) {
+      best = p;
+      continue;
+    }
+    // Prefer newer last_update
+    if (pool.info.require_rollback()) {
+      if (p->second.last_update > best->second.last_update)
+        continue;
+      if (p->second.last_update < best->second.last_update) {
+        best = p;
+        continue;
+      }
+    } else {
+      if (p->second.last_update < best->second.last_update)
+        continue;
+      if (p->second.last_update > best->second.last_update) {
+        best = p;
+        continue;
+      }
+    }
+
+    // Prefer longer tail
+    if (p->second.log_tail > best->second.log_tail) {
+      continue;
+    } else if (p->second.log_tail < best->second.log_tail) {
+      best = p;
+      continue;
+    }
+
+    if (!p->second.has_missing() && best->second.has_missing()) {
+      psdout(10) << __func__ << " prefer osd." << p->first
+                 << " because it is complete while best has missing"
+                 << dendl;
+      best = p;
+      continue;
+    } else if (p->second.has_missing() && !best->second.has_missing()) {
+      psdout(10) << __func__ << " skipping osd." << p->first
+                 << " because it has missing while best is complete"
+                 << dendl;
+      continue;
+    } else {
+      // both are complete or have missing
+      // fall through
+    }
+
+    // prefer current primary (usually the caller), all things being equal
+    if (p->first == pg_whoami) {
+      psdout(10) << "calc_acting prefer osd." << p->first
+                 << " because it is current primary" << dendl;
+      best = p;
+      continue;
+    }
+  }
+  return best;
+}
+
+void PeeringState::calc_ec_acting(
+  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+  unsigned size,
+  const vector<int> &acting,
+  const vector<int> &up,
+  const map<pg_shard_t, pg_info_t> &all_info,
+  bool restrict_to_up_acting,
+  vector<int> *_want,
+  set<pg_shard_t> *backfill,
+  set<pg_shard_t> *acting_backfill,
+  ostream &ss)
+{
+  vector<int> want(size, CRUSH_ITEM_NONE);
+  map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
+  for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
+       i != all_info.end();
+       ++i) {
+    all_info_by_shard[i->first.shard].insert(i->first);
+  }
+  for (uint8_t i = 0; i < want.size(); ++i) {
+    ss << "For position " << (unsigned)i << ": ";
+    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
+        !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
+        all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
+        auth_log_shard->second.log_tail) {
+      ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
+      want[i] = up[i];
+      continue;
+    }
+    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
+      ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
+         << " and ";
+      backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
+    }
+
+    if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
+        !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
+        all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
+        auth_log_shard->second.log_tail) {
+      ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
+      want[i] = acting[i];
+    } else if (!restrict_to_up_acting) {
+      for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
+           j != all_info_by_shard[shard_id_t(i)].end();
+           ++j) {
+        ceph_assert(j->shard == i);
+        if (!all_info.find(*j)->second.is_incomplete() &&
+            all_info.find(*j)->second.last_update >=
+            auth_log_shard->second.log_tail) {
+          ss << " selecting stray: " << *j << std::endl;
+          want[i] = j->osd;
+          break;
+        }
+      }
+      if (want[i] == CRUSH_ITEM_NONE)
+        ss << " failed to fill position " << (int)i << std::endl;
+    }
+  }
+
+  for (uint8_t i = 0; i < want.size(); ++i) {
+    if (want[i] != CRUSH_ITEM_NONE) {
+      acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
+    }
+  }
+  acting_backfill->insert(backfill->begin(), backfill->end());
+  _want->swap(want);
+}
+
+/**
+ * calculate the desired acting set.
+ *
+ * Choose an appropriate acting set.  Prefer up[0], unless it is
+ * incomplete, or another osd has a longer tail that allows us to
+ * bring other up nodes up to date.
+ */
+void PeeringState::calc_replicated_acting(
+  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+  uint64_t force_auth_primary_missing_objects,
+  unsigned size,
+  const vector<int> &acting,
+  const vector<int> &up,
+  pg_shard_t up_primary,
+  const map<pg_shard_t, pg_info_t> &all_info,
+  bool restrict_to_up_acting,
+  vector<int> *want,
+  set<pg_shard_t> *backfill,
+  set<pg_shard_t> *acting_backfill,
+  const OSDMapRef osdmap,
+  ostream &ss)
+{
+  pg_shard_t auth_log_shard_id = auth_log_shard->first;
+
+  ss << __func__ << " newest update on osd." << auth_log_shard_id
+     << " with " << auth_log_shard->second
+     << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
+
+  // select primary
+  auto primary = all_info.find(up_primary);
+  if (up.size() &&
+      !primary->second.is_incomplete() &&
+      primary->second.last_update >=
+        auth_log_shard->second.log_tail) {
+    if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
+      auto approx_missing_objects =
+        primary->second.stats.stats.sum.num_objects_missing;
+      auto auth_version = auth_log_shard->second.last_update.version;
+      auto primary_version = primary->second.last_update.version;
+      if (auth_version > primary_version) {
+        approx_missing_objects += auth_version - primary_version;
+      } else {
+        approx_missing_objects += primary_version - auth_version;
+      }
+      if ((uint64_t)approx_missing_objects >
+          force_auth_primary_missing_objects) {
+        primary = auth_log_shard;
+        ss << "up_primary: " << up_primary << ") has approximate "
+           << approx_missing_objects
+           << "(>" << force_auth_primary_missing_objects << ") "
+           << "missing objects, osd." << auth_log_shard_id
+           << " selected as primary instead"
+           << std::endl;
+      } else {
+        ss << "up_primary: " << up_primary << ") selected as primary"
+           << std::endl;
+      }
+    } else {
+      ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
+    }
+  } else {
+    ceph_assert(!auth_log_shard->second.is_incomplete());
+    ss << "up[0] needs backfill, osd." << auth_log_shard_id
+       << " selected as primary instead" << std::endl;
+    primary = auth_log_shard;
+  }
+
+  ss << __func__ << " primary is osd." << primary->first
+     << " with " << primary->second << std::endl;
+  want->push_back(primary->first.osd);
+  acting_backfill->insert(primary->first);
+
+  /* We include auth_log_shard->second.log_tail because in GetLog,
+   * we will request logs back to the min last_update over our
+   * acting_backfill set, which will result in our log being extended
+   * as far backwards as necessary to pick up any peers which can
+   * be log recovered by auth_log_shard's log */
+  eversion_t oldest_auth_log_entry =
+    std::min(primary->second.log_tail, auth_log_shard->second.log_tail);
+
+  // select replicas that have log contiguity with primary.
+  // prefer up, then acting, then any peer_info osds
+  for (auto i : up) {
+    pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
+    if (up_cand == primary->first)
+      continue;
+    const pg_info_t &cur_info = all_info.find(up_cand)->second;
+    if (cur_info.is_incomplete() ||
+        cur_info.last_update < oldest_auth_log_entry) {
+      ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
+      backfill->insert(up_cand);
+      acting_backfill->insert(up_cand);
+    } else {
+      want->push_back(i);
+      acting_backfill->insert(up_cand);
+      ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
+    }
+    if (want->size() >= size) {
+      break;
+    }
+  }
+
+  if (want->size() >= size) {
+    return;
+  }
+
+  std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
+  candidate_by_last_update.reserve(acting.size());
+  // This no longer has backfill OSDs, but they are covered above.
+  for (auto i : acting) {
+    pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
+    // skip up osds we already considered above
+    if (acting_cand == primary->first)
+      continue;
+    vector<int>::const_iterator up_it = find(up.begin(), up.end(), i);
+    if (up_it != up.end())
+      continue;
+
+    const pg_info_t &cur_info = all_info.find(acting_cand)->second;
+    if (cur_info.is_incomplete() ||
+        cur_info.last_update < oldest_auth_log_entry) {
+      ss << " shard " << acting_cand << " (acting) REJECTED "
+         << cur_info << std::endl;
+    } else {
+      candidate_by_last_update.emplace_back(cur_info.last_update, i);
+    }
+  }
+
+  auto sort_by_eversion = [](const std::pair<eversion_t, int> &lhs,
+                             const std::pair<eversion_t, int> &rhs) {
+    return lhs.first > rhs.first;
+  };
+  // sort by last_update, in descending order.
+  std::sort(candidate_by_last_update.begin(),
+            candidate_by_last_update.end(), sort_by_eversion);
+  for (auto &p : candidate_by_last_update) {
+    ceph_assert(want->size() < size);
+    want->push_back(p.second);
+    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
+    acting_backfill->insert(s);
+    ss << " shard " << s << " (acting) accepted "
+       << all_info.find(s)->second << std::endl;
+    if (want->size() >= size) {
+      return;
+    }
+  }
+
+  if (restrict_to_up_acting) {
+    return;
+  }
+  candidate_by_last_update.clear();
+  candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
+  // continue to search stray to find more suitable peers
+  for (auto &i : all_info) {
+    // skip up osds we already considered above
+    if (i.first == primary->first)
+      continue;
+    vector<int>::const_iterator up_it = find(up.begin(), up.end(), i.first.osd);
+    if (up_it != up.end())
+      continue;
+    vector<int>::const_iterator acting_it = find(
+      acting.begin(), acting.end(), i.first.osd);
+    if (acting_it != acting.end())
+      continue;
+
+    if (i.second.is_incomplete() ||
+        i.second.last_update < oldest_auth_log_entry) {
+      ss << " shard " << i.first << " (stray) REJECTED " << i.second
+         << std::endl;
+    } else {
+      candidate_by_last_update.emplace_back(
+        i.second.last_update, i.first.osd);
+    }
+  }
+
+  if (candidate_by_last_update.empty()) {
+    // save us some effort
+    return;
+  }
+
+  // sort by last_update, in descending order.
+  std::sort(candidate_by_last_update.begin(),
+            candidate_by_last_update.end(), sort_by_eversion);
+
+  for (auto &p : candidate_by_last_update) {
+    ceph_assert(want->size() < size);
+    want->push_back(p.second);
+    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
+    acting_backfill->insert(s);
+    ss << " shard " << s << " (stray) accepted "
+       << all_info.find(s)->second << std::endl;
+    if (want->size() >= size) {
+      return;
+    }
+  }
+}
+
+bool PeeringState::recoverable_and_ge_min_size(const vector<int> &want) const
+{
+  unsigned num_want_acting = 0;
+  set<pg_shard_t> have;
+  for (int i = 0; i < (int)want.size(); ++i) {
+    if (want[i] != CRUSH_ITEM_NONE) {
+      ++num_want_acting;
+      have.insert(
+        pg_shard_t(
+          want[i],
+          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
+    }
+  }
+  // We go incomplete if below min_size for ec_pools since backfill
+  // does not currently maintain rollbackability
+  // Otherwise, we will go "peered", but not "active"
+  if (num_want_acting < pool.info.min_size &&
+      (pool.info.is_erasure() ||
+       !cct->_conf->osd_allow_recovery_below_min_size)) {
+    psdout(10) << __func__ << " failed, below min size" << dendl;
+    return false;
+  }
+
+  /* Check whether we have enough acting shards to later perform recovery */
+  if (!missing_loc.get_recoverable_predicate()(have)) {
+    psdout(10) << __func__ << " failed, not recoverable" << dendl;
+    return false;
+  }
+
+  return true;
+}
+
+void PeeringState::choose_async_recovery_ec(
+  const map<pg_shard_t, pg_info_t> &all_info,
+  const pg_info_t &auth_info,
+  vector<int> *want,
+  set<pg_shard_t> *async_recovery,
+  const OSDMapRef osdmap) const
+{
+  set<pair<version_t, pg_shard_t> > candidates_by_cost;
+  for (uint8_t i = 0; i < want->size(); ++i) {
+    if ((*want)[i] == CRUSH_ITEM_NONE)
+      continue;
+
+    // Considering log entries to recover is accurate enough for
+    // now. We could use minimum_to_decode_with_cost() later if
+    // necessary.
+    pg_shard_t shard_i((*want)[i], shard_id_t(i));
+    // do not include strays
+    if (stray_set.find(shard_i) != stray_set.end())
+      continue;
+    // Do not include an osd that is not up, since choosing it as
+    // an async_recovery_target will move it out of the acting set.
+    // This results in it being identified as a stray during peering,
+    // because it is no longer in the up or acting set.
+    if (!is_up(shard_i))
+      continue;
+    auto shard_info = all_info.find(shard_i)->second;
+    // for ec pools we rollback all entries past the authoritative
+    // last_update *before* activation. This is relatively inexpensive
+    // compared to recovery, since it is purely local, so treat shards
+    // past the authoritative last_update the same as those equal to it.
+    version_t auth_version = auth_info.last_update.version;
+    version_t candidate_version = shard_info.last_update.version;
+    if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
+      auto approx_missing_objects =
+        shard_info.stats.stats.sum.num_objects_missing;
+      if (auth_version > candidate_version) {
+        approx_missing_objects += auth_version - candidate_version;
+      }
+      if (static_cast<uint64_t>(approx_missing_objects) >
+          cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
+        candidates_by_cost.emplace(approx_missing_objects, shard_i);
+      }
+    } else {
+      if (auth_version > candidate_version &&
+          (auth_version - candidate_version) >
+          cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
+        candidates_by_cost.insert(
+          make_pair(auth_version - candidate_version, shard_i));
+      }
+    }
+  }
+
+  psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
+             << dendl;
+
+  // take out as many osds as we can for async recovery, in order of cost
+  for (auto rit = candidates_by_cost.rbegin();
+       rit != candidates_by_cost.rend(); ++rit) {
+    pg_shard_t cur_shard = rit->second;
+    vector<int> candidate_want(*want);
+    candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
+    if (recoverable_and_ge_min_size(candidate_want)) {
+      want->swap(candidate_want);
+      async_recovery->insert(cur_shard);
+    }
+  }
+  psdout(20) << __func__ << " result want=" << *want
+             << " async_recovery=" << *async_recovery << dendl;
+}
+
+void PeeringState::choose_async_recovery_replicated(
+  const map<pg_shard_t, pg_info_t> &all_info,
+  const pg_info_t &auth_info,
+  vector<int> *want,
+  set<pg_shard_t> *async_recovery,
+  const OSDMapRef osdmap) const
+{
+  set<pair<version_t, pg_shard_t> > candidates_by_cost;
+  for (auto osd_num : *want) {
+    pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
+    // do not include strays
+    if (stray_set.find(shard_i) != stray_set.end())
+      continue;
+    // Do not include an osd that is not up, since choosing it as
+    // an async_recovery_target will move it out of the acting set.
+    // This results in it being identified as a stray during peering,
+    // because it is no longer in the up or acting set.
+    if (!is_up(shard_i))
+      continue;
+    auto shard_info = all_info.find(shard_i)->second;
+    // use the approximate magnitude of the difference in length of
+    // logs plus historical missing objects as the cost of recovery
+    version_t auth_version = auth_info.last_update.version;
+    version_t candidate_version = shard_info.last_update.version;
+    if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
+      auto approx_missing_objects =
+        shard_info.stats.stats.sum.num_objects_missing;
+      if (auth_version > candidate_version) {
+        approx_missing_objects += auth_version - candidate_version;
+      } else {
+        approx_missing_objects += candidate_version - auth_version;
+      }
+      if (static_cast<uint64_t>(approx_missing_objects) >
+          cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
+        candidates_by_cost.emplace(approx_missing_objects, shard_i);
+      }
+    } else {
+      size_t approx_entries;
+      if (auth_version > candidate_version) {
+        approx_entries = auth_version - candidate_version;
+      } else {
+        approx_entries = candidate_version - auth_version;
+      }
+      if (approx_entries >
+          cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
+        candidates_by_cost.insert(make_pair(approx_entries, shard_i));
+      }
+    }
+  }
+
+  psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
+             << dendl;
+  // take out as many osds as we can for async recovery, in order of cost
+  for (auto rit = candidates_by_cost.rbegin();
+       rit != candidates_by_cost.rend(); ++rit) {
+    if (want->size() <= pool.info.min_size) {
+      break;
+    }
+    pg_shard_t cur_shard = rit->second;
+    vector<int> candidate_want(*want);
+    for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
+      if (*it == cur_shard.osd) {
+        candidate_want.erase(it);
+        want->swap(candidate_want);
+        async_recovery->insert(cur_shard);
+        break;
+      }
+    }
+  }
+  psdout(20) << __func__ << " result want=" << *want
+             << " async_recovery=" << *async_recovery << dendl;
+}
+
+
+
+/**
+ * choose acting
+ *
+ * calculate the desired acting, and request a change with the monitor
+ * if it differs from the current acting.
+ *
+ * if restrict_to_up_acting=true, we filter out anything that's not in
+ * up/acting.  in order to lift this restriction, we need to
+ *  1) check whether it's worth switching the acting set any time we get
+ *     a new pg info (not just here, when recovery finishes)
+ *  2) check whether anything in want_acting went down on each new map
+ *     (and, if so, calculate a new want_acting)
+ *  3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
+ * TODO!
+ */
+bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
+                                 bool restrict_to_up_acting,
+                                 bool *history_les_bound)
+{
+  map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
+  all_info[pg_whoami] = info;
+
+  if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
+    for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
+         p != all_info.end();
+         ++p) {
+      psdout(10) << __func__ << " all_info osd." << p->first << " "
+                 << p->second << dendl;
+    }
+  }
+
+  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
+    find_best_info(all_info, restrict_to_up_acting, history_les_bound);
+
+  if (auth_log_shard == all_info.end()) {
+    if (up != acting) {
+      psdout(10) << __func__ << " no suitable info found (incomplete backfills?),"
+                 << " reverting to up" << dendl;
+      want_acting = up;
+      vector<int> empty;
+      pl->queue_want_pg_temp(empty);
+    } else {
+      psdout(10) << __func__ << " failed" << dendl;
+      ceph_assert(want_acting.empty());
+    }
+    return false;
+  }
+
+  ceph_assert(!auth_log_shard->second.is_incomplete());
+  auth_log_shard_id = auth_log_shard->first;
+
+  set<pg_shard_t> want_backfill, want_acting_backfill;
+  vector<int> want;
+  stringstream ss;
+  if (pool.info.is_replicated())
+    calc_replicated_acting(
+      auth_log_shard,
+      cct->_conf.get_val<uint64_t>(
+        "osd_force_auth_primary_missing_objects"),
+      get_osdmap()->get_pg_size(info.pgid.pgid),
+      acting,
+      up,
+      up_primary,
+      all_info,
+      restrict_to_up_acting,
+      &want,
+      &want_backfill,
+      &want_acting_backfill,
+      get_osdmap(),
+      ss);
+  else
+    calc_ec_acting(
+      auth_log_shard,
+      get_osdmap()->get_pg_size(info.pgid.pgid),
+      acting,
+      up,
+      all_info,
+      restrict_to_up_acting,
+      &want,
+      &want_backfill,
+      &want_acting_backfill,
+      ss);
+  psdout(10) << ss.str() << dendl;
+
+  if (!recoverable_and_ge_min_size(want)) {
+    want_acting.clear();
+    return false;
+  }
+
+  set<pg_shard_t> want_async_recovery;
+  if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
+    if (pool.info.is_erasure()) {
+      choose_async_recovery_ec(
+        all_info, auth_log_shard->second, &want, &want_async_recovery,
+        get_osdmap());
+    } else {
+      choose_async_recovery_replicated(
+        all_info, auth_log_shard->second, &want, &want_async_recovery,
+        get_osdmap());
+    }
+  }
+  if (want != acting) {
+    psdout(10) << __func__ << " want " << want << " != acting " << acting
+               << ", requesting pg_temp change" << dendl;
+    want_acting = want;
+
+    if (!cct->_conf->osd_debug_no_acting_change) {
+      if (want_acting == up) {
+        // There can't be any pending backfill if
+        // want is the same as crush map up OSDs.
+        ceph_assert(want_backfill.empty());
+        vector<int> empty;
+        pl->queue_want_pg_temp(empty);
+      } else
+        pl->queue_want_pg_temp(want);
+    }
+    return false;
+  }
+  want_acting.clear();
+  acting_recovery_backfill = want_acting_backfill;
+  psdout(10) << "acting_recovery_backfill is "
+             << acting_recovery_backfill << dendl;
+  ceph_assert(
+    backfill_targets.empty() ||
+    backfill_targets == want_backfill);
+  if (backfill_targets.empty()) {
+    // Caller is GetInfo
+    backfill_targets = want_backfill;
+  }
+  // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
+  ceph_assert(
+    async_recovery_targets.empty() ||
+    async_recovery_targets == want_async_recovery ||
+    !needs_recovery());
+  if (async_recovery_targets.empty() || !needs_recovery()) {
+    async_recovery_targets = want_async_recovery;
+  }
+  // Will not change if already set because up would have had to change
+  // Verify that nothing in backfill is in stray_set
+  for (set<pg_shard_t>::iterator i = want_backfill.begin();
+       i != want_backfill.end();
+       ++i) {
+    ceph_assert(stray_set.find(*i) == stray_set.end());
+  }
+  psdout(10) << "choose_acting want=" << want << " backfill_targets="
+             << want_backfill << " async_recovery_targets="
+             << async_recovery_targets << dendl;
+  return true;
+}
+
 
 /*------------ Peering State Machine----------------*/
 
@@ -2357,11 +3066,11 @@ PeeringState::Recovered::Recovered(my_context ctx)
 
   // adjust acting set?  (e.g. because backfill completed...)
   bool history_les_bound = false;
-  if (ps->acting != ps->up && !pg->choose_acting(auth_log_shard,
+  if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard,
                                                  true, &history_les_bound)) {
     ceph_assert(ps->want_acting.size());
   } else if (!ps->async_recovery_targets.empty()) {
-    pg->choose_acting(auth_log_shard, true, &history_les_bound);
+    ps->choose_acting(auth_log_shard, true, &history_les_bound);
   }
 
   if (context< Active >().all_replicas_activated &&
@@ -3205,7 +3914,7 @@ PeeringState::GetLog::GetLog(my_context ctx)
   DECLARE_LOCALS
 
   // adjust acting?
-  if (!pg->choose_acting(auth_log_shard, false,
+  if (!ps->choose_acting(auth_log_shard, false,
                          &context< Peering >().history_les_bound)) {
     if (!ps->want_acting.empty()) {
       post_event(NeedActingChange());
diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h
index 18e30ac8fa1..66b26f86f0e 100644
--- a/src/osd/PeeringState.h
+++ b/src/osd/PeeringState.h
@@ -1256,6 +1256,54 @@ public:
 
   void reject_reservation();
 
+  // acting set
+  map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
+    const map<pg_shard_t, pg_info_t> &infos,
+    bool restrict_to_up_acting,
+    bool *history_les_bound) const;
+  static void calc_ec_acting(
+    map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+    unsigned size,
+    const vector<int> &acting,
+    const vector<int> &up,
+    const map<pg_shard_t, pg_info_t> &all_info,
+    bool restrict_to_up_acting,
+    vector<int> *want,
+    set<pg_shard_t> *backfill,
+    set<pg_shard_t> *acting_backfill,
+    ostream &ss);
+  static void calc_replicated_acting(
+    map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+    uint64_t force_auth_primary_missing_objects,
+    unsigned size,
+    const vector<int> &acting,
+    const vector<int> &up,
+    pg_shard_t up_primary,
+    const map<pg_shard_t, pg_info_t> &all_info,
+    bool restrict_to_up_acting,
+    vector<int> *want,
+    set<pg_shard_t> *backfill,
+    set<pg_shard_t> *acting_backfill,
+    const OSDMapRef osdmap,
+    ostream &ss);
+  void choose_async_recovery_ec(
+    const map<pg_shard_t, pg_info_t> &all_info,
+    const pg_info_t &auth_info,
+    vector<int> *want,
+    set<pg_shard_t> *async_recovery,
+    const OSDMapRef osdmap) const;
+  void choose_async_recovery_replicated(
+    const map<pg_shard_t, pg_info_t> &all_info,
+    const pg_info_t &auth_info,
+    vector<int> *want,
+    set<pg_shard_t> *async_recovery,
+    const OSDMapRef osdmap) const;
+
+  bool recoverable_and_ge_min_size(const vector<int> &want) const;
+  bool choose_acting(pg_shard_t &auth_log_shard,
+                     bool restrict_to_up_acting,
+                     bool *history_les_bound);
+
 public:
   PeeringState(
     CephContext *cct,
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 5908a3e4824..5d329b3a40c 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -3850,7 +3850,7 @@ bool PastIntervals::check_new_interval(
   const OSDMap *osdmap,
   const OSDMap *lastmap,
   pg_t pgid,
-  IsPGRecoverablePredicate &could_have_gone_active,
+  const IsPGRecoverablePredicate &could_have_gone_active,
   PastIntervals *past_intervals,
   std::ostream *out)
 {
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 4fe9c352c19..6dc86267e4a 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -3194,7 +3194,7 @@ public:
     std::shared_ptr<const OSDMap> osdmap,  ///< [in] current map
     std::shared_ptr<const OSDMap> lastmap, ///< [in] last map
     pg_t pgid,                             ///< [in] pgid for pg
-    IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active
+    const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active
     PastIntervals *past_intervals,         ///< [out] intervals
     std::ostream *out = 0                  ///< [out] debug ostream
   ) {
@@ -3292,7 +3292,7 @@ public:
     std::map<int, epoch_t> blocked_by;  ///< current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
 
     bool pg_down = false;  ///< some down osds are included in @a cur; the DOWN pg state bit should be set.
-    std::unique_ptr<IsPGRecoverablePredicate> pcontdec;
+    const IsPGRecoverablePredicate* pcontdec = nullptr;
 
     PriorSet() = default;
     PriorSet(PriorSet &&) = default;
@@ -3320,7 +3320,7 @@ public:
       std::set<int> down,
       std::map<int, epoch_t> blocked_by,
       bool pg_down,
-      IsPGRecoverablePredicate *pcontdec)
+      const IsPGRecoverablePredicate *pcontdec)
       : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by),
         pg_down(pg_down), pcontdec(pcontdec) {}
 
@@ -3330,7 +3330,7 @@ public:
       const PastIntervals &past_intervals,
       bool ec_pool,
      epoch_t last_epoch_started,
-      IsPGRecoverablePredicate *c,
+      const IsPGRecoverablePredicate *c,
       F f,
       const std::vector<int> &up,
       const std::vector<int> &acting,
@@ -3355,7 +3355,7 @@ PastIntervals::PriorSet::PriorSet(
   const PastIntervals &past_intervals,
   bool ec_pool,
   epoch_t last_epoch_started,
-  IsPGRecoverablePredicate *c,
+  const IsPGRecoverablePredicate *c,
   F f,
   const std::vector<int> &up,
   const std::vector<int> &acting,
-- 
2.39.5
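Both choose_async_recovery_ec and choose_async_recovery_replicated in the patch above rest on one heuristic (on the post-Nautilus feature path): the cost of recovering a shard is approximated as its historical missing-object count plus the distance between its last_update version and the authoritative shard's, and a shard is only deferred to async recovery when that cost exceeds the osd_async_recovery_min_cost threshold. The standalone sketch below restates that computation; all names in it (ShardStats, approx_recovery_cost, defer_to_async_recovery) are invented stand-ins for illustration, not the Ceph API, and the EC variant additionally only counts the gap when the candidate is behind, since entries past the authoritative last_update are rolled back locally before activation.

// Minimal standalone sketch, not Ceph code: the async-recovery cost
// heuristic used by choose_async_recovery_{ec,replicated} above.
#include <cstdint>

struct ShardStats {
  uint64_t num_objects_missing;  // historical missing objects on the shard
  uint64_t last_update_version;  // version component of the shard's eversion
};

// Cost = historical missing objects + approximate log-version distance
// from the authoritative shard (replicated-pool form, which counts the
// gap in either direction).
uint64_t approx_recovery_cost(const ShardStats &auth, const ShardStats &cand) {
  uint64_t version_gap = auth.last_update_version > cand.last_update_version
    ? auth.last_update_version - cand.last_update_version
    : cand.last_update_version - auth.last_update_version;
  return cand.num_objects_missing + version_gap;
}

// A shard is taken out of the acting set for async recovery only when its
// estimated cost exceeds the configured threshold (osd_async_recovery_min_cost).
bool defer_to_async_recovery(const ShardStats &auth, const ShardStats &cand,
                             uint64_t async_recovery_min_cost) {
  return approx_recovery_cost(auth, cand) > async_recovery_min_cost;
}

Note that the patch applies this test per candidate, then removes candidates in descending cost order only while the resulting set still passes recoverable_and_ge_min_size (EC) or stays above pool min_size (replicated), so the highest-cost shards are the first to be recovered asynchronously.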