From: Sage Weil Date: Sat, 14 Nov 2015 03:11:17 +0000 (-0500) Subject: osd: simplify pg creation X-Git-Tag: v10.0.1~26^2~5 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=53f2c7f291d94774dda7182d00fd26af4ee65f6f;p=ceph.git osd: simplify pg creation We used to have a complicated pg creation process in which we would query any previous mappings for the pg before we created the new 'empty' pg locally. The tracking of the prior mappings was very simple (and broken), but it didn't really matter because the mon would resend pg create messages periodically. Now it doesn't, so that broke. However, none of this is necessary: the PG peering process does all of the same things. Namely, it - enumerates past intervals - determines which ones may have been rw - queries OSDs from each one to gather any potential changes This is a more robust version of what the creation code was (or should have been) doing. So, let's rip it all out and let peering handle it. As long as the newly instantiated PG sets last_epoch_started and _clean to the created epoch we will probe and consider all of these prior mappings and find any previous instance of the PG (if one existed). Yay for removing unnecessary code! 
Signed-off-by: Sage Weil --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 2a2d28d50f6..0730d5c4e8d 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -713,8 +713,6 @@ OPTION(osd_op_log_threshold, OPT_INT, 5) // how many op log messages to show in OPTION(osd_verify_sparse_read_holes, OPT_BOOL, false) // read fiemap-reported holes and verify they are zeros OPTION(osd_debug_drop_ping_probability, OPT_DOUBLE, 0) OPTION(osd_debug_drop_ping_duration, OPT_INT, 0) -OPTION(osd_debug_drop_pg_create_probability, OPT_DOUBLE, 0) -OPTION(osd_debug_drop_pg_create_duration, OPT_INT, 1) OPTION(osd_debug_drop_op_probability, OPT_DOUBLE, 0) // probability of stalling/dropping a client op OPTION(osd_debug_op_order, OPT_BOOL, false) OPTION(osd_debug_scrub_chance_rewrite_digest, OPT_U64, 0) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 2f6a0dd49de..b6152904876 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1598,9 +1598,6 @@ OSD::OSD(CephContext *cct_, ObjectStore *store_, map_lock("OSD::map_lock"), pg_map_lock("OSD::pg_map_lock"), last_pg_create_epoch(0), - debug_drop_pg_create_probability(cct->_conf->osd_debug_drop_pg_create_probability), - debug_drop_pg_create_duration(cct->_conf->osd_debug_drop_pg_create_duration), - debug_drop_pg_create_left(-1), mon_report_lock("OSD::mon_report_lock"), stats_ack_timeout(cct->_conf->osd_mon_ack_timeout), up_thru_wanted(0), up_thru_pending(0), @@ -2771,7 +2768,6 @@ OSD::res_result OSD::_try_resurrect_pg( PG *OSD::_create_lock_pg( OSDMapRef createmap, spg_t pgid, - bool newly_created, bool hold_map_lock, bool backfill, int role, @@ -3202,7 +3198,8 @@ void OSD::handle_pg_peering_evt( if (!valid_history || epoch < history.same_interval_since) { dout(10) << "get_or_create_pg " << pgid << " acting changed in " - << history.same_interval_since << " (msg from " << epoch << ")" << dendl; + << history.same_interval_since << " (msg from " << epoch << ")" + << dendl; return; } @@ -3210,28 +3207,6 @@ void 
OSD::handle_pg_peering_evt( assert(0); } - bool create = false; - if (primary) { - // DNE on source? - if (info.dne()) { - // is there a creation pending on this pg? - if (creating_pgs.count(pgid)) { - creating_pgs[pgid].prior.erase(from); - if (!can_create_pg(pgid)) - return; - history = creating_pgs[pgid].history; - create = true; - } else { - dout(10) << "get_or_create_pg " << pgid - << " DNE on source, but creation probe, ignoring" << dendl; - return; - } - } - creating_pgs.erase(pgid); - } else { - assert(!info.dne()); // pg exists if we are hearing about it - } - // do we need to resurrect a deleting pg? spg_t resurrected; PGRef old_pg_state; @@ -3250,7 +3225,7 @@ void OSD::handle_pg_peering_evt( PG *pg = _create_lock_pg( get_map(epoch), - pgid, create, false, result == RES_SELF, + pgid, false, result == RES_SELF, role, up, up_primary, acting, acting_primary, @@ -3268,7 +3243,7 @@ void OSD::handle_pg_peering_evt( return; } case RES_SELF: { - old_pg_state->lock(); + old_pg_state->lock(); OSDMapRef old_osd_map = old_pg_state->get_osdmap(); int old_role = old_pg_state->role; vector old_up = old_pg_state->up; @@ -3282,7 +3257,6 @@ void OSD::handle_pg_peering_evt( old_osd_map, resurrected, false, - false, true, old_role, old_up, @@ -3319,7 +3293,6 @@ void OSD::handle_pg_peering_evt( old_osd_map, resurrected, false, - false, true, old_role, old_up, @@ -3363,56 +3336,6 @@ void OSD::handle_pg_peering_evt( } -/* - * calculate prior pg members during an epoch interval [start,end) - * - from each epoch, include all osds up then AND now - * - if no osds from then are up now, include them all, even tho they're not reachable now - */ -void OSD::calc_priors_during( - spg_t pgid, epoch_t start, epoch_t end, set& pset) -{ - dout(15) << "calc_priors_during " << pgid << " [" << start - << "," << end << ")" << dendl; - - for (epoch_t e = start; e < end; e++) { - OSDMapRef oldmap = get_map(e); - vector acting; - oldmap->pg_to_acting_osds(pgid.pgid, acting); - dout(20) << " " << 
pgid << " in epoch " << e << " was " << acting << dendl; - int up = 0; - int actual_osds = 0; - for (unsigned i=0; iis_up(acting[i])) { - if (acting[i] != whoami) { - pset.insert( - pg_shard_t( - acting[i], - osdmap->pg_is_ec(pgid.pgid) ? shard_id_t(i) : shard_id_t::NO_SHARD)); - } - up++; - } - actual_osds++; - } - } - if (!up && actual_osds) { - // sucky. add down osds, even tho we can't reach them right now. - for (unsigned i=0; ipg_is_ec(pgid.pgid) ? shard_id_t(i) : shard_id_t::NO_SHARD)); - } - } - } - } - dout(10) << "calc_priors_during " << pgid - << " [" << start << "," << end - << ") = " << pset << dendl; -} - - /** * Fill in the passed history so you know same_interval_since, same_up_since, * and same_primary_since. @@ -6819,28 +6742,6 @@ void OSD::advance_map() } service.set_epochs(&boot_epoch, &up_epoch, NULL); } - - // scan pg creations - ceph::unordered_map::iterator n = creating_pgs.begin(); - while (n != creating_pgs.end()) { - ceph::unordered_map::iterator p = n++; - spg_t pgid = p->first; - - // am i still primary? - vector acting; - int primary; - osdmap->pg_to_acting_osds(pgid.pgid, &acting, &primary); - if (primary != whoami) { - dout(10) << " no longer primary for " << pgid << ", stopping creation" << dendl; - creating_pgs.erase(p); - } else { - /* - * adding new ppl to our pg has no effect, since we're still primary, - * and obviously haven't given the new nodes any data. - */ - p->second.acting.swap(acting); // keep the latest - } - } } void OSD::consume_map() @@ -7081,22 +6982,6 @@ bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch, // ---------------------------------------- // pg creation - -bool OSD::can_create_pg(spg_t pgid) -{ - assert(creating_pgs.count(pgid)); - - // priors empty? 
- if (!creating_pgs[pgid].prior.empty()) { - dout(10) << "can_create_pg " << pgid - << " - waiting for priors " << creating_pgs[pgid].prior << dendl; - return false; - } - - dout(10) << "can_create_pg " << pgid << " - can create now" << dendl; - return true; -} - void OSD::split_pgs( PG *parent, const set &childpgids, set > *out_pgs, @@ -7159,21 +7044,6 @@ void OSD::handle_pg_create(OpRequestRef op) dout(10) << "handle_pg_create " << *m << dendl; - // drop the next N pg_creates in a row? - if (debug_drop_pg_create_left < 0 && - cct->_conf->osd_debug_drop_pg_create_probability > - ((((double)(rand()%100))/100.0))) { - debug_drop_pg_create_left = debug_drop_pg_create_duration; - } - if (debug_drop_pg_create_left >= 0) { - --debug_drop_pg_create_left; - if (debug_drop_pg_create_left >= 0) { - dout(0) << "DEBUG dropping/ignoring pg_create, will drop the next " - << debug_drop_pg_create_left << " too" << dendl; - return; - } - } - /* we have to hack around require_mon_peer's interface limits, so * grab an extra reference before going in. If the peer isn't * a Monitor, the reference is put for us (and then cleared @@ -7190,15 +7060,12 @@ void OSD::handle_pg_create(OpRequestRef op) op->mark_started(); - int num_created = 0; - map::iterator ci = m->ctimes.begin(); for (map::iterator p = m->mkpg.begin(); p != m->mkpg.end(); ++p, ++ci) { assert(ci != m->ctimes.end() && ci->first == p->first); epoch_t created = p->second.created; - pg_t parent = p->second.parent; if (p->second.split_bits) // Skip split pgs continue; pg_t on = p->first; @@ -7245,73 +7112,35 @@ void OSD::handle_pg_create(OpRequestRef op) continue; } - // figure history pg_history_t history; history.epoch_created = created; - history.last_epoch_clean = created; - // Newly created PGs don't need to scrub immediately, so mark them - // as scrubbed at creation time. 
- if (ci->second == utime_t()) { - // Older OSD doesn't send ctime, so just do what we did before - // The repair_test.py can fail in a mixed cluster - utime_t now = ceph_clock_now(NULL); - history.last_scrub_stamp = now; - history.last_deep_scrub_stamp = now; - } else { - history.last_scrub_stamp = ci->second; - history.last_deep_scrub_stamp = ci->second; - } + history.last_scrub_stamp = ci->second; + history.last_deep_scrub_stamp = ci->second; bool valid_history = project_pg_history( pgid, history, created, up, up_primary, acting, acting_primary); /* the pg creation message must have come from a mon and therefore * cannot be on the other side of a map gap */ assert(valid_history); - - // register. - creating_pgs[pgid].history = history; - creating_pgs[pgid].parent = parent; - creating_pgs[pgid].acting.swap(acting); - calc_priors_during(pgid, created, history.same_interval_since, - creating_pgs[pgid].prior); PG::RecoveryCtx rctx = create_context(); - // poll priors - set& pset = creating_pgs[pgid].prior; - dout(10) << "mkpg " << pgid << " e" << created - << " h " << history - << " : querying priors " << pset << dendl; - for (set::iterator p = pset.begin(); p != pset.end(); ++p) - if (osdmap->is_up(p->osd)) - (*rctx.query_map)[p->osd][spg_t(pgid.pgid, p->shard)] = - pg_query_t( - pg_query_t::INFO, - p->shard, pgid.shard, - history, - osdmap->get_epoch()); - - PG *pg = NULL; - if (can_create_pg(pgid)) { - const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool()); - PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num())); - PG::_init(*rctx.transaction, pgid, pp); - - pg_interval_map_t pi; - pg = _create_lock_pg( - osdmap, pgid, true, false, false, - 0, creating_pgs[pgid].acting, whoami, - creating_pgs[pgid].acting, whoami, - history, pi, - *rctx.transaction); - pg->info.last_epoch_started = pg->info.history.last_epoch_started; - creating_pgs.erase(pgid); - pg->handle_create(&rctx); - pg->write_if_dirty(*rctx.transaction); - 
pg->publish_stats_to_osd(); - pg->unlock(); - num_created++; - wake_pg_waiters(pg, pgid); - } + const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool()); + PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num())); + PG::_init(*rctx.transaction, pgid, pp); + + pg_interval_map_t pi; + PG *pg = _create_lock_pg( + osdmap, pgid, false, false, + 0, up, up_primary, + acting, acting_primary, + history, pi, + *rctx.transaction); + pg->info.last_epoch_started = created; + pg->handle_create(&rctx); + pg->write_if_dirty(*rctx.transaction); + pg->publish_stats_to_osd(); + pg->unlock(); + wake_pg_waiters(pg, pgid); dispatch_context(rctx, pg, osdmap); } diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 7ee15a75f93..481a781764e 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1887,7 +1887,6 @@ protected: PG *_create_lock_pg( OSDMapRef createmap, spg_t pgid, - bool newly_created, bool hold_map_lock, bool backfill, int role, @@ -1914,9 +1913,6 @@ protected: void load_pgs(); void build_past_intervals_parallel(); - void calc_priors_during( - spg_t pgid, epoch_t start, epoch_t end, set& pset); - /// project pg history from from to now bool project_pg_history( spg_t pgid, pg_history_t& h, epoch_t from, @@ -1944,20 +1940,9 @@ protected: } } - // -- pg creation -- - struct create_pg_info { - pg_history_t history; - vector acting; - set prior; - pg_t parent; - }; - ceph::unordered_map creating_pgs; + epoch_t last_pg_create_epoch; - double debug_drop_pg_create_probability; - int debug_drop_pg_create_duration; - int debug_drop_pg_create_left; // 0 if we just dropped the last one, -1 if we can drop more - bool can_create_pg(spg_t pgid); void handle_pg_create(OpRequestRef op); void split_pgs(