From: Sage Weil Date: Thu, 8 Oct 2015 16:13:40 +0000 (-0400) Subject: mon/PGMonitor: revamp how pg creates are tracked X-Git-Tag: v10.0.1~26^2~9 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=1f4b7141c5a381a0da759bd5773501f1fbaaa078;p=ceph.git mon/PGMonitor: revamp how pg creates are tracked Previously we were calculating and managing in-core state that wasn't committed as part of the pg_map, leading to all sorts of ugliness that didn't really work. Instead, * set mapping in all creating pgs in the committed pg_map * make all pg create message sending be based on committed state * update mappings for creating pgs every time we consume a new osdmap, so that we have a reliable/stable epoch to attach to it. In particular, having that stable epoch means we have a reference we can put in the pg create message that will also be used for the subscription version. That way OSDs get consistent creates from any mon. Signed-off-by: Sage Weil --- diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index 0e305d9fc9ea..0640b1a6e8c3 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -305,7 +305,6 @@ void PGMonitor::post_paxos_update() { dout(10) << __func__ << dendl; if (mon->osdmon()->osdmap.get_epoch()) { - map_pg_creates(); send_pg_creates(); } } @@ -358,8 +357,6 @@ void PGMonitor::read_pgmap_meta() if (last_pg_scan != pg_map.get_last_pg_scan()) { pg_map.set_last_pg_scan(last_pg_scan); - // clear our osdmap epoch so that map_pg_creates() will re-run - last_map_pg_create_osd_epoch = 0; } float full_ratio, nearfull_ratio; @@ -941,7 +938,8 @@ void PGMonitor::check_osd_map(epoch_t epoch) propose = true; } - // scan pg space? + if (map_pg_creates()) + propose = true; if (register_new_pgs()) propose = true; @@ -953,7 +951,9 @@ void PGMonitor::check_osd_map(epoch_t epoch) propose_pending(); } -void PGMonitor::register_pg(pg_pool_t& pool, pg_t pgid, epoch_t epoch, bool new_pool) +void PGMonitor::register_pg(OSDMap *osdmap, + pg_pool_t& pool, pg_t pgid, epoch_t epoch, + bool new_pool) { pg_t parent; int split_bits = 0; @@ -963,11 +963,11 @@ void PGMonitor::register_pg(pg_pool_t& pool, pg_t pgid, epoch_t epoch, bool new_ while (1) { // remove most significant bit int msb = pool.calc_bits_of(parent.ps()); - if (!msb) break; + if (!msb) + break; parent.set_ps(parent.ps() & ~(1<<(msb-1))); split_bits++; - dout(10) << " is " << pgid << " parent " << parent << " ?" << dendl; - //if (parent.u.pg.ps < mon->osdmon->osdmap.get_pgp_num()) { + dout(30) << " is " << pgid << " parent " << parent << " ?" << dendl; if (pg_map.pg_stat.count(parent) && pg_map.pg_stat[parent].state != PG_STATE_CREATING) { dout(10) << " parent is " << parent << dendl; @@ -995,6 +995,12 @@ void PGMonitor::register_pg(pg_pool_t& pool, pg_t pgid, epoch_t epoch, bool new_ stats.last_clean_scrub_stamp = now; } + osdmap->pg_to_up_acting_osds( + pgid, + &stats.up, + &stats.up_primary, + &stats.acting, + &stats.acting_primary); if (split_bits == 0) { dout(10) << __func__ << " will create " << pgid @@ -1025,7 +1031,8 @@ bool PGMonitor::register_new_pgs() ++p) { int64_t poolid = p->first; pg_pool_t &pool = p->second; - int ruleno = osdmap->crush->find_rule(pool.get_crush_ruleset(), pool.get_type(), pool.get_size()); + int ruleno = osdmap->crush->find_rule(pool.get_crush_ruleset(), + pool.get_type(), pool.get_size()); if (ruleno < 0 || !osdmap->crush->rule_exists(ruleno)) continue; @@ -1048,7 +1055,7 @@ bool PGMonitor::register_new_pgs() continue; } created++; - register_pg(pool, pgid, pool.get_last_change(), new_pool); + register_pg(osdmap, pool, pgid, pool.get_last_change(), new_pool); } } @@ -1095,32 +1102,24 @@ bool PGMonitor::register_new_pgs() return (created || removed); } -void PGMonitor::map_pg_creates() +bool PGMonitor::map_pg_creates() { OSDMap *osdmap = &mon->osdmon()->osdmap; - if (osdmap->get_epoch() == last_map_pg_create_osd_epoch) { - dout(10) << "map_pg_creates to " << pg_map.creating_pgs.size() - << " pgs -- no change" << dendl; - return; - } - dout(10) << "map_pg_creates to " << pg_map.creating_pgs.size() - << " pgs osdmap epoch " << osdmap->get_epoch() << dendl; - last_map_pg_create_osd_epoch = osdmap->get_epoch(); + dout(10) << __func__ << " to " << pg_map.creating_pgs.size() + << " pgs, osdmap epoch " << osdmap->get_epoch() + << dendl; - for (set::iterator p = pg_map.creating_pgs.begin(); + unsigned changed = 0; + for (set::const_iterator p = pg_map.creating_pgs.begin(); p != pg_map.creating_pgs.end(); ++p) { pg_t pgid = *p; pg_t on = pgid; - pg_stat_t *s = NULL; - ceph::unordered_map::iterator q = pg_map.pg_stat.find(pgid); - if (q == pg_map.pg_stat.end()) { - s = &pg_map.pg_stat[pgid]; - } else { - s = &q->second; - pg_map.stat_pg_sub(pgid, *s, true); - } + ceph::unordered_map::const_iterator q = + pg_map.pg_stat.find(pgid); + assert(q != pg_map.pg_stat.end()); + const pg_stat_t *s = &q->second; if (s->parent_split_bits) on = s->parent; @@ -1134,50 +1133,38 @@ void PGMonitor::map_pg_creates() &acting, &acting_primary); - bool changed_primary = false; if (up != s->up || up_primary != s->up_primary || acting != s->acting || acting_primary != s->acting_primary) { - if (acting_primary != s->acting_primary) { - changed_primary = true; - s->mapping_epoch = pg_map.last_pg_scan; - if (s->acting_primary != -1) { - map >& r = - pg_map.creating_pgs_by_osd_epoch[s->acting_primary]; - r[s->mapping_epoch].erase(pgid); - if (r[s->mapping_epoch].empty()) - r.erase(s->mapping_epoch); - if (r.empty()) - pg_map.creating_pgs_by_osd_epoch.erase(s->acting_primary); - } - } - s->up = up; - s->up_primary = up_primary; - s->acting = acting; - s->acting_primary = acting_primary; - } - pg_map.stat_pg_add(pgid, *s, true); + dout(20) << __func__ << " " << pgid << " " + << " acting_primary: " << s->acting_primary + << " -> " << acting_primary + << " acting: " << s->acting << " -> " << acting + << " up_primary: " << s->up_primary << " -> " << up_primary + << " up: " << s->up << " -> " << up + << dendl; - // don't send creates for localized pgs - if (pgid.preferred() >= 0) - continue; + pg_stat_t *ns = &pending_inc.pg_stat_updates[pgid]; + *ns = *s; - // don't send creates for splits - if (s->parent_split_bits) - continue; + // note epoch if the target of the create message changed + if (acting_primary != ns->acting_primary) + ns->mapping_epoch = osdmap->get_epoch(); - if (acting_primary != -1) { - if (changed_primary) { - pg_map.creating_pgs_by_osd_epoch[acting_primary][s->mapping_epoch].insert( - pgid); - } - } else { - dout(20) << "map_pg_creates " << pgid << " -> no osds in epoch " - << mon->osdmon()->osdmap.get_epoch() << ", skipping" << dendl; - continue; // blarney! + ns->up = up; + ns->up_primary = up_primary; + ns->acting = acting; + ns->acting_primary = acting_primary; + + ++changed; } } + if (changed) { + dout(10) << __func__ << " " << changed << " pgs changed primary" << dendl; + return true; + } + return false; } void PGMonitor::send_pg_creates() @@ -1216,6 +1203,7 @@ void PGMonitor::send_pg_creates() epoch_t PGMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) { + dout(30) << __func__ << " " << pg_map.creating_pgs_by_osd_epoch << dendl; map > >::iterator p = pg_map.creating_pgs_by_osd_epoch.find(osd); if (p == pg_map.creating_pgs_by_osd_epoch.end()) @@ -1233,7 +1221,7 @@ epoch_t PGMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) last = q->first; for (set::iterator r = q->second.begin(); r != q->second.end(); ++r) { if (!m) - m = new MOSDPGCreate(last_map_pg_create_osd_epoch); + m = new MOSDPGCreate(pg_map.last_osdmap_epoch); m->mkpg[*r] = pg_create_t(pg_map.pg_stat[*r].created, pg_map.pg_stat[*r].parent, pg_map.pg_stat[*r].parent_split_bits); diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h index 50f650b11b3b..29b2e03a7973 100644 --- a/src/mon/PGMonitor.h +++ b/src/mon/PGMonitor.h @@ -49,9 +49,6 @@ public: bool need_check_down_pgs; set need_check_down_pg_osds; - epoch_t last_map_pg_create_osd_epoch; - - private: PGMap::Incremental pending_inc; @@ -115,7 +112,8 @@ private: // when we last received PG stats from each osd map last_osd_report; - void register_pg(pg_pool_t& pool, pg_t pgid, epoch_t epoch, bool new_pool); + void register_pg(OSDMap *osdmap, pg_pool_t& pool, pg_t pgid, + epoch_t epoch, bool new_pool); /** * check latest osdmap for new pgs to register @@ -124,7 +122,13 @@ private: */ bool register_new_pgs(); - void map_pg_creates(); + /** + * recalculate creating pg mappings + * + * @return true if we updated pending_inc + */ + bool map_pg_creates(); + void send_pg_creates(); epoch_t send_pg_creates(int osd, Connection *con, epoch_t next); @@ -160,7 +164,6 @@ public: PGMonitor(Monitor *mn, Paxos *p, const string& service_name) : PaxosService(mn, p, service_name), need_check_down_pgs(false), - last_map_pg_create_osd_epoch(0), pgmap_meta_prefix("pgmap_meta"), pgmap_pg_prefix("pgmap_pg"), pgmap_osd_prefix("pgmap_osd")