From: Sage Weil
Date: Wed, 21 Jan 2015 02:35:13 +0000 (-0800)
Subject: mon: prime pg_temp
X-Git-Tag: v9.0.1~34^2~3
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=7a1305b6611426cb65ebfb4aaa8a5ffc524f9309;p=ceph.git

mon: prime pg_temp

Prime pg_temp values for

 - any osd that goes up/down or has a reweight change
 - all osds on crush map change

We're ignoring primary_affinity and primary_temp at the moment.

No attempt is made (yet) to limit the time or CPU we burn doing this.

Signed-off-by: Sage Weil
---

diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 937aee6a75ec..46396fa91284 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -193,6 +193,7 @@ OPTION(mon_osd_max_op_age, OPT_DOUBLE, 32) // max op age before we get conce
 OPTION(mon_osd_max_split_count, OPT_INT, 32) // largest number of PGs per "involved" OSD to let split create
 OPTION(mon_osd_allow_primary_temp, OPT_BOOL, false) // allow primary_temp to be set in the osdmap
 OPTION(mon_osd_allow_primary_affinity, OPT_BOOL, false) // allow primary_affinity to be set in the osdmap
+OPTION(mon_osd_prime_pg_temp, OPT_BOOL, false) // prime osdmap with pg mapping changes
 OPTION(mon_stat_smooth_intervals, OPT_INT, 2) // smooth stats over last N PGMap maps
 OPTION(mon_lease, OPT_FLOAT, 5) // lease interval
 OPTION(mon_lease_renew_interval, OPT_FLOAT, 3) // on leader, to renew the lease
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 10e49a17c7f0..b43511aa89d7 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -620,6 +620,93 @@ void OSDMonitor::create_pending()
   OSDMap::remove_down_temps(g_ceph_context, osdmap, &pending_inc);
 }
 
+void OSDMonitor::maybe_prime_pg_temp()
+{
+  if (pending_inc.crush.length()) {
+    dout(10) << __func__ << " new crush map" << dendl;
+    OSDMap next;
+    next.deepish_copy_from(osdmap);
+    next.apply_incremental(pending_inc);
+    prime_pg_temp(next, &mon->pgmon()->pg_map);
+    return;
+  }
+
+  // check for interesting OSDs
+  set<int> osds;
+  for (map<int32_t,uint8_t>::iterator p = pending_inc.new_state.begin();
+       p != pending_inc.new_state.end();
+       ++p) {
+    if (p->second & CEPH_OSD_UP) {
+      osds.insert(p->first);
+    }
+  }
+  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
+       p != pending_inc.new_weight.end();
+       ++p) {
+    osds.insert(p->first);
+  }
+  if (!osds.empty()) {
+    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
+    OSDMap next;
+    next.deepish_copy_from(osdmap);
+    next.apply_incremental(pending_inc);
+    for (set<int>::iterator p = osds.begin(); p != osds.end(); ++p) {
+      prime_pg_temp(next, &mon->pgmon()->pg_map, *p);
+    }
+  }
+}
+
+void OSDMonitor::prime_pg_temp(OSDMap& next,
+                               ceph::unordered_map<pg_t, pg_stat_t>::iterator pp)
+{
+  // do not touch a mapping if a change is pending
+  if (pending_inc.new_pg_temp.count(pp->first))
+    return;
+  vector<int> up, acting;
+  int up_primary, acting_primary;
+  next.pg_to_up_acting_osds(pp->first, &up, &up_primary, &acting, &acting_primary);
+  if (acting == pp->second.acting)
+    return; // no change since last pg update, skip
+  vector<int> cur_up, cur_acting;
+  osdmap.pg_to_up_acting_osds(pp->first, &cur_up, &up_primary,
+                              &cur_acting, &acting_primary);
+  if (cur_acting == acting)
+    return; // no change this epoch; must be stale pg_stat
+
+  dout(20) << __func__ << " " << pp->first << " " << cur_up << "/" << cur_acting
+           << " -> " << up << "/" << acting
+           << ", priming " << cur_acting
+           << dendl;
+  pending_inc.new_pg_temp[pp->first] = cur_acting;
+}
+
+void OSDMonitor::prime_pg_temp(OSDMap& next, PGMap *pg_map, int osd)
+{
+  dout(10) << __func__ << " osd." << osd << dendl;
+  ceph::unordered_map<int, set<pg_t> >::iterator po = pg_map->pg_by_osd.find(osd);
+  if (po != pg_map->pg_by_osd.end()) {
+    for (set<pg_t>::iterator p = po->second.begin();
+         p != po->second.end();
+         ++p) {
+      ceph::unordered_map<pg_t, pg_stat_t>::iterator pp = pg_map->pg_stat.find(*p);
+      if (pp == pg_map->pg_stat.end())
+        continue;
+      prime_pg_temp(next, pp);
+    }
+  }
+}
+
+void OSDMonitor::prime_pg_temp(OSDMap& next, PGMap *pg_map)
+{
+  dout(10) << __func__ << dendl;
+  for (ceph::unordered_map<pg_t, pg_stat_t>::iterator pp = pg_map->pg_stat.begin();
+       pp != pg_map->pg_stat.end();
+       ++pp) {
+    prime_pg_temp(next, pp);
+  }
+}
+
+
 /**
  * @note receiving a transaction in this function gives a fair amount of
  * freedom to the service implementation if it does need it. It shouldn't.
@@ -635,6 +722,9 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
   int r = pending_inc.propagate_snaps_to_tiers(g_ceph_context, osdmap);
   assert(r == 0);
 
+  if (g_conf->mon_osd_prime_pg_temp)
+    maybe_prime_pg_temp();
+
   bufferlist bl;
 
   // tell me about it
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
index fa45ab985811..ea6926b7df69 100644
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -34,6 +34,8 @@ using namespace std;
 #include "Session.h"
 
 class Monitor;
+class PGMap;
+
 #include "messages/MOSDBoot.h"
 #include "messages/MMonCommand.h"
 #include "messages/MOSDMap.h"
@@ -198,6 +200,12 @@ private:
 
   void share_map_with_random_osd();
 
+  void maybe_prime_pg_temp();
+  void prime_pg_temp(OSDMap& next,
+                     ceph::unordered_map<pg_t, pg_stat_t>::iterator pp);
+  void prime_pg_temp(OSDMap& next, PGMap *pg_map, int osd);
+  void prime_pg_temp(OSDMap& next, PGMap *pg_map);
+
   void update_logger();
 
   void handle_query(PaxosServiceMessage *m);
diff --git a/src/vstart.sh b/src/vstart.sh
index 469180d7a235..df57669e50a7 100755
--- a/src/vstart.sh
+++ b/src/vstart.sh
@@ -401,6 +401,7 @@ $extra_conf
         mon pg warn min per osd = 3
         mon osd allow primary affinity = true
         mon reweight min pgs per osd = 4
+        mon osd prime pg temp = true
 $DAEMONOPTS
 $CMONDEBUG
 $extra_conf