From ead5d2a8138552ff4745a409d893471950a806da Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 25 Apr 2012 13:07:34 -0700 Subject: [PATCH] osd: filter osds removed from probe set from peer_info_requested Peer_info_requested should be a strict subset of the probe set. Filter osds that are dropped from probe from peer_info_requested. We could also restart peering from scratch here, but this is less expensive, because we don't have to re-probe everyone. Once we adjust the probe and peer_info_requested sets, (re)check if we're done: we may have been blocked on a previous peer_info_requested entry. The situation I saw was: "recovery_state": [ { "name": "Started\/Primary\/Peering\/GetInfo", "enter_time": "2012-04-25 14:39:56.905748", "requested_info_from": [ { "osd": 193}]}, { "name": "Started\/Primary\/Peering", "enter_time": "2012-04-25 14:39:56.905748", "probing_osds": [ 79, 191, 195], "down_osds_we_would_probe": [], "peering_blocked_by": []}, { "name": "Started", "enter_time": "2012-04-25 14:39:56.905742"}]} Once in this state, cycling osd.193 doesn't help, because the prior_set is not affected. Signed-off-by: Sage Weil Reviewed-by: Samuel Just --- src/osd/PG.cc | 115 ++++++++++++++++++++++++++++---------------------- 1 file changed, 64 insertions(+), 51 deletions(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 504049a3c58c4..a6a7fc5b628d4 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -4413,64 +4413,77 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in if (old_start < pg->info.history.last_epoch_started) { dout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl; pg->build_prior(prior_set); + + // filter out any osds that got dropped from the probe set from + // peer_info_requested. this is less expensive than restarting + // peering (which would re-probe everyone). 
+ set::iterator p = peer_info_requested.begin(); + while (p != peer_info_requested.end()) { + if (prior_set->probe.count(*p) == 0) { + dout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl; + peer_info_requested.erase(p++); // post-increment: erase the current entry while p already points at the next one + } else { + ++p; + } + } get_infos(); - } else { - // are we done getting everything? - if (peer_info_requested.empty() && !prior_set->pg_down) { - /* - * make sure we have at least one !incomplete() osd from the - * last rw interval. the incomplete (backfilling) replicas - * get a copy of the log, but they don't get all the object - * updates, so they are insufficient to recover changes during - * that interval. - */ - if (pg->info.history.last_epoch_started) { - for (map::reverse_iterator p = pg->past_intervals.rbegin(); - p != pg->past_intervals.rend(); - ++p) { - if (p->first < pg->info.history.last_epoch_started) - break; - if (!p->second.maybe_went_rw) - continue; - Interval& interval = p->second; - dout(10) << " last maybe_went_rw interval was " << interval << dendl; - OSDMapRef osdmap = pg->get_osdmap(); - - /* - * this mirrors the PriorSet calculation: we wait if we - * don't have an up (AND !incomplete) node AND there are - * nodes down that might be usable. - */ - bool any_up_complete_now = false; - bool any_down_now = false; - for (unsigned i=0; iexists(o) || osdmap->get_info(o).lost_at > interval.first) - continue; // dne or lost - if (osdmap->is_up(o)) { - pg_info_t *pinfo; - if (o == pg->osd->whoami) { - pinfo = &pg->info; - } else { - assert(pg->peer_info.count(o)); - pinfo = &pg->peer_info[o]; - } - if (!pinfo->is_incomplete()) - any_up_complete_now = true; + } + + // are we done getting everything? + if (peer_info_requested.empty() && !prior_set->pg_down) { + /* + * make sure we have at least one !incomplete() osd from the + * last rw interval. 
the incomplete (backfilling) replicas + * get a copy of the log, but they don't get all the object + * updates, so they are insufficient to recover changes during + * that interval. + */ + if (pg->info.history.last_epoch_started) { + for (map::reverse_iterator p = pg->past_intervals.rbegin(); + p != pg->past_intervals.rend(); + ++p) { + if (p->first < pg->info.history.last_epoch_started) + break; + if (!p->second.maybe_went_rw) + continue; + Interval& interval = p->second; + dout(10) << " last maybe_went_rw interval was " << interval << dendl; + OSDMapRef osdmap = pg->get_osdmap(); + + /* + * this mirrors the PriorSet calculation: we wait if we + * don't have an up (AND !incomplete) node AND there are + * nodes down that might be usable. + */ + bool any_up_complete_now = false; + bool any_down_now = false; + for (unsigned i=0; iexists(o) || osdmap->get_info(o).lost_at > interval.first) + continue; // dne or lost + if (osdmap->is_up(o)) { + pg_info_t *pinfo; + if (o == pg->osd->whoami) { + pinfo = &pg->info; } else { - any_down_now = true; + assert(pg->peer_info.count(o)); + pinfo = &pg->peer_info[o]; } + if (!pinfo->is_incomplete()) + any_up_complete_now = true; + } else { + any_down_now = true; } - if (!any_up_complete_now && any_down_now) { - dout(10) << " no osds up+complete from interval " << interval << dendl; - pg->state_set(PG_STATE_DOWN); - return discard_event(); - } - break; } + if (!any_up_complete_now && any_down_now) { + dout(10) << " no osds up+complete from interval " << interval << dendl; + pg->state_set(PG_STATE_DOWN); + return discard_event(); + } + break; } - post_event(GotInfo()); } + post_event(GotInfo()); } } return discard_event(); -- 2.39.5