From: Samuel Just Date: Mon, 4 Nov 2013 05:02:36 +0000 (-0800) Subject: OSD: allow project_pg_history to handle a missing map X-Git-Tag: v0.67.5~14 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=bd08d332f5d23b6097a7e8344a6183896b2303a9;p=ceph.git OSD: allow project_pg_history to handle a missing map If we get a peering message for an old map we don't have, we can throw it out: the sending OSD will learn about the newer maps and update itself accordingly, and we don't have the information to know if the message is valid. This situation can only happen if the sender was down for a long enough time to create a map gap and its PGs have not yet advanced from their boot-up maps to the current ones, so we can rely on the sender to update itself. Fixes: #6712 Signed-off-by: Samuel Just Reviewed-by: Greg Farnum (cherry picked from commit cd0d612e1abdf5c87082eeeccd4ca09dd14fd737) --- diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index d9d9ab3477b..bac7415157e 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2191,9 +2191,10 @@ void OSD::handle_pg_peering_evt( int role = osdmap->calc_pg_role(whoami, acting, acting.size()); pg_history_t history = info.history; - project_pg_history(info.pgid, history, epoch, up, acting); + bool valid_history = project_pg_history( + info.pgid, history, epoch, up, acting); - if (epoch < history.same_interval_since) { + if (!valid_history || epoch < history.same_interval_since) { dout(10) << "get_or_create_pg " << info.pgid << " acting changed in " << history.same_interval_since << " (msg from " << epoch << ")" << dendl; return; @@ -2378,7 +2379,7 @@ void OSD::calc_priors_during(pg_t pgid, epoch_t start, epoch_t end, set& ps * Fill in the passed history so you know same_interval_since, same_up_since, * and same_primary_since. 
*/ -void OSD::project_pg_history(pg_t pgid, pg_history_t& h, epoch_t from, +bool OSD::project_pg_history(pg_t pgid, pg_history_t& h, epoch_t from, const vector& currentup, const vector& currentacting) { @@ -2392,7 +2393,11 @@ void OSD::project_pg_history(pg_t pgid, pg_history_t& h, epoch_t from, e > from; e--) { // verify during intermediate epoch (e-1) - OSDMapRef oldmap = get_map(e-1); + OSDMapRef oldmap = service.try_get_map(e-1); + if (!oldmap) { + dout(15) << __func__ << ": found map gap, returning false" << dendl; + return false; + } assert(oldmap->have_pg_pool(pgid.pool())); vector up, acting; @@ -2442,6 +2447,7 @@ void OSD::project_pg_history(pg_t pgid, pg_history_t& h, epoch_t from, } dout(15) << "project_pg_history end " << h << dendl; + return true; } // ------------------------------------- @@ -5865,7 +5871,12 @@ void OSD::handle_pg_create(OpRequestRef op) utime_t now = ceph_clock_now(NULL); history.last_scrub_stamp = now; history.last_deep_scrub_stamp = now; - project_pg_history(pgid, history, created, up, acting); + bool valid_history = + project_pg_history(pgid, history, created, up, acting); + /* the pg creation message must have come from a mon and therefore + * cannot be on the other side of a map gap + */ + assert(valid_history); // register. creating_pgs[pgid].history = history; @@ -6468,9 +6479,11 @@ void OSD::handle_pg_query(OpRequestRef op) // same primary? 
pg_history_t history = it->second.history; - project_pg_history(pgid, history, it->second.epoch_sent, up, acting); + bool valid_history = + project_pg_history(pgid, history, it->second.epoch_sent, up, acting); - if (it->second.epoch_sent < history.same_interval_since) { + if (!valid_history || + it->second.epoch_sent < history.same_interval_since) { dout(10) << " pg " << pgid << " dne, and pg has changed in " << history.same_interval_since << " (msg from " << it->second.epoch_sent << ")" << dendl; @@ -6534,9 +6547,11 @@ void OSD::handle_pg_remove(OpRequestRef op) pg_history_t history = pg->info.history; vector up, acting; osdmap->pg_to_up_acting_osds(pgid, up, acting); - project_pg_history(pg->info.pgid, history, pg->get_osdmap()->get_epoch(), - up, acting); - if (history.same_interval_since <= m->get_epoch()) { + bool valid_history = + project_pg_history(pg->info.pgid, history, pg->get_osdmap()->get_epoch(), + up, acting); + if (valid_history && + history.same_interval_since <= m->get_epoch()) { assert(pg->get_primary() == m->get_source().num()); PGRef _pg(pg); _remove_pg(pg); diff --git a/src/osd/OSD.h b/src/osd/OSD.h index f2bf5c6f55b..c14636c5154 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1132,8 +1132,12 @@ protected: void build_past_intervals_parallel(); void calc_priors_during(pg_t pgid, epoch_t start, epoch_t end, set& pset); - void project_pg_history(pg_t pgid, pg_history_t& h, epoch_t from, - const vector& lastup, const vector& lastacting); + + /// project pg history from from to now + bool project_pg_history( + pg_t pgid, pg_history_t& h, epoch_t from, + const vector& lastup, const vector& lastacting + ); ///< @return false if there was a map gap between from and now void wake_pg_waiters(pg_t pgid) { if (waiting_for_pg.count(pgid)) {