git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mon: prime pg_temp
author: Sage Weil <sage@redhat.com>
Wed, 21 Jan 2015 02:35:13 +0000 (18:35 -0800)
committer: Sage Weil <sage@redhat.com>
Wed, 21 Jan 2015 03:20:24 +0000 (19:20 -0800)
Prime pg_temp values for

 - any osd that goes up/down or has a reweight change
 - all osds on crush map change

We're ignoring primary_affinity and primary_temp at the moment.

No attempt is made (yet) to limit the time or CPU we burn doing this.

Signed-off-by: Sage Weil <sage@redhat.com>
src/common/config_opts.h
src/mon/OSDMonitor.cc
src/mon/OSDMonitor.h
src/vstart.sh

index 937aee6a75ec1bb5efbbfe24dd686933af73f6c0..46396fa91284fe94cd6c6a23a0ae1bab3f355d31 100644 (file)
@@ -193,6 +193,7 @@ OPTION(mon_osd_max_op_age, OPT_DOUBLE, 32)     // max op age before we get conce
 OPTION(mon_osd_max_split_count, OPT_INT, 32) // largest number of PGs per "involved" OSD to let split create
 OPTION(mon_osd_allow_primary_temp, OPT_BOOL, false)  // allow primary_temp to be set in the osdmap
 OPTION(mon_osd_allow_primary_affinity, OPT_BOOL, false)  // allow primary_affinity to be set in the osdmap
+OPTION(mon_osd_prime_pg_temp, OPT_BOOL, false)  // when true, OSDMonitor::encode_pending() primes pg_temp in the pending osdmap for pgs whose mapping is about to change (default: off)
 OPTION(mon_stat_smooth_intervals, OPT_INT, 2)  // smooth stats over last N PGMap maps
 OPTION(mon_lease, OPT_FLOAT, 5)       // lease interval
 OPTION(mon_lease_renew_interval, OPT_FLOAT, 3) // on leader, to renew the lease
index 10e49a17c7f02d95a4fe35ac67cb5cd9ae141a00..b43511aa89d7aec910f06ad8e9e5d4d513f8a1c7 100644 (file)
@@ -620,6 +620,93 @@ void OSDMonitor::create_pending()
   OSDMap::remove_down_temps(g_ceph_context, osdmap, &pending_inc);
 }
 
+void OSDMonitor::maybe_prime_pg_temp()  // add pg_temp entries to pending_inc for pgs that the pending changes may remap
+{
+  if (pending_inc.crush.length()) {  // a new crush map is pending: consider every pg and stop
+    dout(10) << __func__ << " new crush map" << dendl;
+    OSDMap next;
+    next.deepish_copy_from(osdmap);
+    next.apply_incremental(pending_inc);  // next = current osdmap with the pending incremental applied
+    prime_pg_temp(next, &mon->pgmon()->pg_map);
+    return;
+  }
+
+  // collect the "interesting" osds: CEPH_OSD_UP set in pending new_state (osd goes up/down), or a pending new_weight entry
+  set<int> osds;
+  for (map<int32_t,uint8_t>::iterator p = pending_inc.new_state.begin();
+       p != pending_inc.new_state.end();
+       ++p) {
+    if (p->second & CEPH_OSD_UP) {  // only entries that touch the up bit; other state bits are ignored here
+      osds.insert(p->first);
+    }
+  }
+  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
+       p != pending_inc.new_weight.end();
+       ++p) {
+    osds.insert(p->first);  // any pending weight entry qualifies, whatever the new value is
+  }
+  if (!osds.empty()) {  // build the next map only when there is something to examine
+    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
+    OSDMap next;
+    next.deepish_copy_from(osdmap);
+    next.apply_incremental(pending_inc);
+    for (set<int>::iterator p = osds.begin(); p != osds.end(); ++p) {
+      prime_pg_temp(next, &mon->pgmon()->pg_map, *p);  // per-osd variant: only pgs listed for this osd
+    }
+  }
+}
+
+void OSDMonitor::prime_pg_temp(OSDMap& next,  // next: the callers in this file pass the osdmap with pending_inc applied
+                              ceph::unordered_map<pg_t, pg_stat_t>::iterator pp)  // pp: pg_stat entry (pgid -> stats) of the pg to consider
+{
+  // do not touch a mapping if a pg_temp change for this pg is already pending
+  if (pending_inc.new_pg_temp.count(pp->first))
+    return;
+  vector<int> up, acting;
+  int up_primary, acting_primary;  // out-params required by pg_to_up_acting_osds; never read in this function
+  next.pg_to_up_acting_osds(pp->first, &up, &up_primary, &acting, &acting_primary);  // mapping under the next map
+  if (acting == pp->second.acting)
+    return;  // next acting set equals the one in the pg's stats: no change since last pg update, skip
+  vector<int> cur_up, cur_acting;
+  osdmap.pg_to_up_acting_osds(pp->first, &cur_up, &up_primary,  // mapping under the current osdmap
+                             &cur_acting, &acting_primary);
+  if (cur_acting == acting)
+    return;  // current and next maps agree, so no change this epoch; must be stale pg_stat
+
+  dout(20) << __func__ << " " << pp->first << " " << cur_up << "/" << cur_acting
+          << " -> " << up << "/" << acting
+          << ", priming " << cur_acting
+          << dendl;
+  pending_inc.new_pg_temp[pp->first] = cur_acting;  // record the current acting set as this pg's pending pg_temp
+}
+
+void OSDMonitor::prime_pg_temp(OSDMap& next, PGMap *pg_map, int osd)  // consider each pg that pg_map->pg_by_osd lists for this osd
+{
+  dout(10) << __func__ << " osd." << osd << dendl;
+  ceph::unordered_map<int, set<pg_t> >::iterator po = pg_map->pg_by_osd.find(osd);
+  if (po != pg_map->pg_by_osd.end()) {  // an osd with no pg_by_osd entry is a no-op
+    for (set<pg_t>::iterator p = po->second.begin();
+        p != po->second.end();
+        ++p) {
+      ceph::unordered_map<pg_t, pg_stat_t>::iterator pp = pg_map->pg_stat.find(*p);
+      if (pp == pg_map->pg_stat.end())
+       continue;  // no pg_stat entry for this pg; nothing to compare against
+      prime_pg_temp(next, pp);  // per-pg variant decides whether a pg_temp entry is added
+    }
+  }
+}
+
+void OSDMonitor::prime_pg_temp(OSDMap& next, PGMap *pg_map)  // consider every pg that has an entry in pg_map->pg_stat
+{
+  dout(10) << __func__ << dendl;
+  for (ceph::unordered_map<pg_t, pg_stat_t>::iterator pp = pg_map->pg_stat.begin();
+       pp != pg_map->pg_stat.end();
+       ++pp) {
+    prime_pg_temp(next, pp);  // per-pg variant decides whether a pg_temp entry is added
+  }
+}
+
+
 /**
  * @note receiving a transaction in this function gives a fair amount of
  * freedom to the service implementation if it does need it. It shouldn't.
@@ -635,6 +722,9 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
   int r = pending_inc.propagate_snaps_to_tiers(g_ceph_context, osdmap);
   assert(r == 0);
 
+  if (g_conf->mon_osd_prime_pg_temp)
+    maybe_prime_pg_temp();
+
   bufferlist bl;
 
   // tell me about it
index fa45ab98581145ed458b23601f0b497a96f03c77..ea6926b7df697aa90c73b42072d0594fa376c242 100644 (file)
@@ -34,6 +34,8 @@ using namespace std;
 #include "Session.h"
 
 class Monitor;
+class PGMap;
+
 #include "messages/MOSDBoot.h"
 #include "messages/MMonCommand.h"
 #include "messages/MOSDMap.h"
@@ -198,6 +200,12 @@ private:
 
   void share_map_with_random_osd();
 
+  void maybe_prime_pg_temp();  // entry point: picks the pgs to examine based on what pending_inc changes
+  void prime_pg_temp(OSDMap& next,  // single pg: may add a pending pg_temp entry for it
+                    ceph::unordered_map<pg_t, pg_stat_t>::iterator pp);
+  void prime_pg_temp(OSDMap& next, PGMap *pg_map, int osd);  // all pgs listed in pg_map->pg_by_osd for one osd
+  void prime_pg_temp(OSDMap& next, PGMap *pg_map);  // all pgs in pg_map->pg_stat
+
+
   void update_logger();
 
   void handle_query(PaxosServiceMessage *m);
index 469180d7a235d31422c691d38f6427379231e6a5..df57669e50a7776cc4f76d284c3fa25697cbd224 100755 (executable)
@@ -401,6 +401,7 @@ $extra_conf
         mon pg warn min per osd = 3
         mon osd allow primary affinity = true
         mon reweight min pgs per osd = 4
+        mon osd prime pg temp = true
 $DAEMONOPTS
 $CMONDEBUG
 $extra_conf