git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mon/OSDMonitor: extract reweight_by_utilization into PGMap
author Kefu Chai <kchai@redhat.com>
Sat, 8 Apr 2017 04:29:43 +0000 (12:29 +0800)
committer Kefu Chai <kchai@redhat.com>
Mon, 10 Apr 2017 15:20:16 +0000 (23:20 +0800)
Signed-off-by: Kefu Chai <kchai@redhat.com>
src/mon/OSDMonitor.cc
src/mon/OSDMonitor.h
src/mon/PGMap.cc
src/mon/PGMap.h

index b4060740e82f83c44cc9e7c6c85e52238153add7..bb708fa4e7921404c7d3148587b66eb50c55f1ae 100644 (file)
@@ -448,237 +448,6 @@ void OSDMonitor::update_logger()
   mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
 }
 
-/* Assign a lower weight to overloaded OSDs.
- *
- * The osds that will get a lower weight are those with with a utilization
- * percentage 'oload' percent greater than the average utilization.
- */
-int OSDMonitor::reweight_by_utilization(int oload,
-                                       double max_changef,
-                                       int max_osds,
-                                       bool by_pg, const set<int64_t> *pools,
-                                       bool no_increasing,
-                                       bool dry_run,
-                                       std::stringstream *ss,
-                                       std::string *out_str,
-                                       Formatter *f)
-{
-  if (oload <= 100) {
-    *ss << "You must give a percentage higher than 100. "
-      "The reweighting threshold will be calculated as <average-utilization> "
-      "times <input-percentage>. For example, an argument of 200 would "
-      "reweight OSDs which are twice as utilized as the average OSD.\n";
-    return -EINVAL;
-  }
-
-  const PGMap &pgm = mon->pgmon()->pg_map;
-  vector<int> pgs_by_osd(osdmap.get_max_osd());
-
-  // Avoid putting a small number (or 0) in the denominator when calculating
-  // average_util
-  double average_util;
-  if (by_pg) {
-    // by pg mapping
-    double weight_sum = 0.0;      // sum up the crush weights
-    unsigned num_pg_copies = 0;
-    int num_osds = 0;
-    for (ceph::unordered_map<pg_t,pg_stat_t>::const_iterator p =
-          pgm.pg_stat.begin();
-        p != pgm.pg_stat.end();
-        ++p) {
-      if (pools && pools->count(p->first.pool()) == 0)
-       continue;
-      for (vector<int>::const_iterator q = p->second.acting.begin();
-          q != p->second.acting.end();
-          ++q) {
-       if (*q >= (int)pgs_by_osd.size())
-         pgs_by_osd.resize(*q);
-       if (pgs_by_osd[*q] == 0) {
-          if (osdmap.crush->get_item_weightf(*q) <= 0) {
-            //skip if we currently can not identify item
-            continue;
-          }
-         weight_sum += osdmap.crush->get_item_weightf(*q);
-         ++num_osds;
-       }
-       ++pgs_by_osd[*q];
-       ++num_pg_copies;
-      }
-    }
-
-    if (!num_osds || (num_pg_copies / num_osds < g_conf->mon_reweight_min_pgs_per_osd)) {
-      *ss << "Refusing to reweight: we only have " << num_pg_copies
-         << " PGs across " << num_osds << " osds!\n";
-      return -EDOM;
-    }
-
-    average_util = (double)num_pg_copies / weight_sum;
-  } else {
-    // by osd utilization
-    int num_osd = MAX(1, pgm.osd_stat.size());
-    if ((uint64_t)pgm.osd_sum.kb * 1024 / num_osd
-       < g_conf->mon_reweight_min_bytes_per_osd) {
-      *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb
-         << " kb across all osds!\n";
-      return -EDOM;
-    }
-    if ((uint64_t)pgm.osd_sum.kb_used * 1024 / num_osd
-       < g_conf->mon_reweight_min_bytes_per_osd) {
-      *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb_used
-         << " kb used across all osds!\n";
-      return -EDOM;
-    }
-
-    average_util = (double)pgm.osd_sum.kb_used / (double)pgm.osd_sum.kb;
-  }
-
-  // adjust down only if we are above the threshold
-  double overload_util = average_util * (double)oload / 100.0;
-
-  // but aggressively adjust weights up whenever possible.
-  double underload_util = average_util;
-
-  unsigned max_change = (unsigned)(max_changef * (double)0x10000);
-
-  ostringstream oss;
-  if (f) {
-    f->open_object_section("reweight_by_utilization");
-    f->dump_int("overload_min", oload);
-    f->dump_float("max_change", max_changef);
-    f->dump_int("max_change_osds", max_osds);
-    f->dump_float("average_utilization", average_util);
-    f->dump_float("overload_utilization", overload_util);
-  } else {
-    oss << "oload " << oload << "\n";
-    oss << "max_change " << max_changef << "\n";
-    oss << "max_change_osds " << max_osds << "\n";
-    oss.precision(4);
-    oss << "average_utilization " << std::fixed << average_util << "\n";
-    oss << "overload_utilization " << overload_util << "\n";
-  }
-  bool changed = false;
-  int num_changed = 0;
-
-  // precompute util for each OSD
-  std::vector<std::pair<int, float> > util_by_osd;
-  for (ceph::unordered_map<int,osd_stat_t>::const_iterator p =
-       pgm.osd_stat.begin();
-       p != pgm.osd_stat.end();
-       ++p) {
-    std::pair<int, float> osd_util;
-    osd_util.first = p->first;
-    if (by_pg) {
-      if (p->first >= (int)pgs_by_osd.size() ||
-        pgs_by_osd[p->first] == 0) {
-        // skip if this OSD does not contain any pg
-        // belonging to the specified pool(s).
-        continue;
-      }
-
-      if (osdmap.crush->get_item_weightf(p->first) <= 0) {
-        // skip if we are unable to locate item.
-        continue;
-      }
-
-      osd_util.second = pgs_by_osd[p->first] / osdmap.crush->get_item_weightf(p->first);
-    } else {
-      osd_util.second = (double)p->second.kb_used / (double)p->second.kb;
-    }
-    util_by_osd.push_back(osd_util);
-  }
-
-  // sort by absolute deviation from the mean utilization,
-  // in descending order.
-  std::sort(util_by_osd.begin(), util_by_osd.end(),
-    [average_util](std::pair<int, float> l, std::pair<int, float> r) {
-      return abs(l.second - average_util) > abs(r.second - average_util);
-    }
-  );
-
-  OSDMap::Incremental newinc;
-
-  if (f)
-    f->open_array_section("reweights");
-
-  for (std::vector<std::pair<int, float> >::const_iterator p =
-        util_by_osd.begin();
-       p != util_by_osd.end();
-       ++p) {
-    unsigned weight = osdmap.get_weight(p->first);
-    if (weight == 0) {
-      // skip if OSD is currently out
-      continue;
-    }
-    float util = p->second;
-
-    if (util >= overload_util) {
-      // Assign a lower weight to overloaded OSDs. The current weight
-      // is a factor to take into account the original weights,
-      // to represent e.g. differing storage capacities
-      unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
-      if (weight > max_change)
-       new_weight = MAX(new_weight, weight - max_change);
-      newinc.new_weight[p->first] = new_weight;
-      if (!dry_run) {
-       pending_inc.new_weight[p->first] = new_weight;
-       changed = true;
-      }
-      if (f) {
-       f->open_object_section("osd");
-       f->dump_int("osd", p->first);
-       f->dump_float("weight", (float)weight / (float)0x10000);
-       f->dump_float("new_weight", (float)new_weight / (float)0x10000);
-       f->close_section();
-      } else {
-        oss << "osd." << p->first << " weight "
-            << (float)weight / (float)0x10000 << " -> "
-            << (float)new_weight / (float)0x10000 << "\n";
-      }
-      if (++num_changed >= max_osds)
-       break;
-    }
-    if (!no_increasing && util <= underload_util) {
-      // assign a higher weight.. if we can.
-      unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
-      new_weight = MIN(new_weight, weight + max_change);
-      if (new_weight > 0x10000)
-       new_weight = 0x10000;
-      if (new_weight > weight) {
-       newinc.new_weight[p->first] = new_weight;
-       if (!dry_run) {
-         pending_inc.new_weight[p->first] = new_weight;
-         changed = true;
-       }
-        oss << "osd." << p->first << " weight "
-            << (float)weight / (float)0x10000 << " -> "
-            << (float)new_weight / (float)0x10000 << "\n";
-       if (++num_changed >= max_osds)
-         break;
-      }
-    }
-  }
-  if (f) {
-    f->close_section();
-  }
-
-  OSDMap newmap;
-  newmap.deepish_copy_from(osdmap);
-  newinc.fsid = newmap.fsid;
-  newinc.epoch = newmap.get_epoch() + 1;
-  newmap.apply_incremental(newinc);
-
-  osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
-
-  if (f) {
-    f->close_section();
-  } else {
-    *out_str += "\n";
-    *out_str += oss.str();
-  }
-  dout(10) << "reweight_by_utilization: finished with " << out_str << dendl;
-  return changed;
-}
-
 template <typename F>
 class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
 public:
@@ -8628,24 +8397,31 @@ done:
     string no_increasing;
     cmd_getval(g_ceph_context, cmdmap, "no_increasing", no_increasing);
     string out_str;
-    err = reweight_by_utilization(oload,
-                                 max_change,
-                                 max_osds,
-                                 by_pg,
-                                 pools.empty() ? NULL : &pools,
-                                 no_increasing == "--no-increasing",
-                                 dry_run,
-                                 &ss, &out_str, f.get());
+    map<int32_t, uint32_t> new_weights;
+    err = reweight::by_utilization(osdmap,
+                                  mon->pgmon()->pg_map,
+                                  oload,
+                                  max_change,
+                                  max_osds,
+                                  by_pg,
+                                  pools.empty() ? NULL : &pools,
+                                  no_increasing == "--no-increasing",
+                                  &new_weights,
+                                  &ss, &out_str, f.get());
+    if (err >= 0) {
+      dout(10) << "reweight::by_utilization: finished with " << out_str << dendl;
+    }
     if (f)
       f->flush(rdata);
     else
       rdata.append(out_str);
     if (err < 0) {
       ss << "FAILED reweight-by-pg";
-    } else if (err == 0) {
+    } else if (err == 0 || dry_run) {
       ss << "no change";
     } else {
       ss << "SUCCESSFUL reweight-by-pg";
+      pending_inc.new_weight = std::move(new_weights);
       wait_for_finished_proposal(
        op,
        new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
index b02067ac5f24e9b2e3b0d2809d73e644a121c87b..32d08f78d3e07fc6ea5c5bcf141215648938012e 100644 (file)
@@ -240,16 +240,6 @@ public:
                        MonOpRequestRef req = MonOpRequestRef());
 
 private:
-  int reweight_by_utilization(int oload,
-                             double max_change,
-                             int max_osds,
-                             bool by_pg,
-                             const set<int64_t> *pools,
-                             bool no_increasing,
-                             bool dry_run,
-                             std::stringstream *ss,
-                             std::string *out_str,
-                             Formatter *f);
   void print_utilization(ostream &out, Formatter *f, bool tree) const;
 
   bool check_source(PaxosServiceMessage *m, uuid_d fsid);
index bd0bab0a6355d05ac191c035c28396b9f46a15f6..320309edd6c4578756375a79d8b916674729d1f5 100644 (file)
@@ -2884,3 +2884,258 @@ void PGMapUpdater::check_down_pgs(
     }
   }
 }
+
+/*
+ * Compute new reweight values for OSDs that are over- or under-utilized.
+ *
+ * Utilization is PG copies per unit of crush weight (by_pg) or kb_used / kb.
+ * An OSD at or above average * oload / 100 gets a lower weight; unless
+ * no_increasing is set, an OSD at or below the average gets a higher one,
+ * capped at 0x10000.  OSDs are visited in order of decreasing distance from
+ * the average, and the loop stops once max_osds weights have been changed.
+ *
+ * @param osdmap        source of current weights and crush weights
+ * @param pgm           PG and OSD statistics used to measure utilization
+ * @param oload         overload threshold in percent; must exceed 100
+ * @param max_changef   limit on a single weight change, scaled by 0x10000
+ * @param max_osds      maximum number of OSDs to adjust
+ * @param by_pg         measure by PG count instead of disk usage
+ * @param pools         if non-null, only PGs of these pools are counted; also
+ *                      handed to the mapping summary
+ * @param no_increasing never raise a weight
+ * @param new_weights   receives osd id -> new weight for every adjusted OSD
+ * @param ss            receives the reason when the request is refused
+ * @param out_str       receives the mapping summary and, when f is null, the
+ *                      plain-text report
+ * @param f             optional formatter for structured output
+ * @return number of OSDs adjusted, -EINVAL if oload <= 100, or -EDOM if
+ *         there is too little data to reweight
+ */
+int reweight::by_utilization(
+    const OSDMap &osdmap,
+    const PGMap &pgm,
+    int oload,
+    double max_changef,
+    int max_osds,
+    bool by_pg, const set<int64_t> *pools,
+    bool no_increasing,
+    map<int32_t, uint32_t>* new_weights,
+    std::stringstream *ss,
+    std::string *out_str,
+    Formatter *f)
+{
+  if (oload <= 100) {
+    *ss << "You must give a percentage higher than 100. "
+      "The reweighting threshold will be calculated as <average-utilization> "
+      "times <input-percentage>. For example, an argument of 200 would "
+      "reweight OSDs which are twice as utilized as the average OSD.\n";
+    return -EINVAL;
+  }
+
+  vector<int> pgs_by_osd(osdmap.get_max_osd());
+
+  // Avoid putting a small number (or 0) in the denominator when calculating
+  // average_util
+  double average_util;
+  if (by_pg) {
+    // by pg mapping
+    double weight_sum = 0.0;      // sum up the crush weights
+    unsigned num_pg_copies = 0;
+    int num_osds = 0;
+    for (const auto& pg : pgm.pg_stat) {
+      if (pools && pools->count(pg.first.pool()) == 0)
+        continue;
+      for (const auto acting : pg.second.acting) {
+        if (acting < 0) {
+          // a negative id cannot index pgs_by_osd
+          continue;
+        }
+        const bool tracked = acting < (int)pgs_by_osd.size();
+        if (!tracked || pgs_by_osd[acting] == 0) {
+          // first PG copy seen on this OSD
+          const auto crush_weight = osdmap.crush->get_item_weightf(acting);
+          if (crush_weight <= 0) {
+            //skip if we currently can not identify item
+            continue;
+          }
+          if (!tracked) {
+            // size must be acting + 1 for index 'acting' to be valid
+            pgs_by_osd.resize((size_t)acting + 1);
+          }
+          weight_sum += crush_weight;
+          ++num_osds;
+        }
+        ++pgs_by_osd[acting];
+        ++num_pg_copies;
+      }
+    }
+
+    if (!num_osds || (num_pg_copies / num_osds < g_conf->mon_reweight_min_pgs_per_osd)) {
+      *ss << "Refusing to reweight: we only have " << num_pg_copies
+          << " PGs across " << num_osds << " osds!\n";
+      return -EDOM;
+    }
+
+    average_util = (double)num_pg_copies / weight_sum;
+  } else {
+    // by osd utilization
+    int num_osd = MAX(1, pgm.osd_stat.size());
+    if ((uint64_t)pgm.osd_sum.kb * 1024 / num_osd
+        < g_conf->mon_reweight_min_bytes_per_osd) {
+      *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb
+          << " kb across all osds!\n";
+      return -EDOM;
+    }
+    if ((uint64_t)pgm.osd_sum.kb_used * 1024 / num_osd
+        < g_conf->mon_reweight_min_bytes_per_osd) {
+      *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb_used
+          << " kb used across all osds!\n";
+      return -EDOM;
+    }
+
+    average_util = (double)pgm.osd_sum.kb_used / (double)pgm.osd_sum.kb;
+  }
+
+  // adjust down only if we are above the threshold
+  const double overload_util = average_util * (double)oload / 100.0;
+
+  // but aggressively adjust weights up whenever possible.
+  const double underload_util = average_util;
+
+  const unsigned max_change = (unsigned)(max_changef * (double)0x10000);
+
+  ostringstream oss;
+  if (f) {
+    f->open_object_section("reweight_by_utilization");
+    f->dump_int("overload_min", oload);
+    f->dump_float("max_change", max_changef);
+    f->dump_int("max_change_osds", max_osds);
+    f->dump_float("average_utilization", average_util);
+    f->dump_float("overload_utilization", overload_util);
+  } else {
+    oss << "oload " << oload << "\n";
+    oss << "max_change " << max_changef << "\n";
+    oss << "max_change_osds " << max_osds << "\n";
+    oss.precision(4);
+    oss << "average_utilization " << std::fixed << average_util << "\n";
+    oss << "overload_utilization " << overload_util << "\n";
+  }
+  int num_changed = 0;
+
+  // precompute util for each OSD
+  std::vector<std::pair<int, float> > util_by_osd;
+  for (const auto& p : pgm.osd_stat) {
+    std::pair<int, float> osd_util;
+    osd_util.first = p.first;
+    if (by_pg) {
+      if (p.first < 0 || p.first >= (int)pgs_by_osd.size() ||
+          pgs_by_osd[p.first] == 0) {
+        // skip if this OSD does not contain any pg
+        // belonging to the specified pool(s).
+        continue;
+      }
+
+      const auto crush_weight = osdmap.crush->get_item_weightf(p.first);
+      if (crush_weight <= 0) {
+        // skip if we are unable to locate item.
+        continue;
+      }
+
+      osd_util.second = pgs_by_osd[p.first] / crush_weight;
+    } else {
+      if (p.second.kb <= 0) {
+        // no capacity reported: the ratio would be inf or NaN
+        continue;
+      }
+      osd_util.second = (double)p.second.kb_used / (double)p.second.kb;
+    }
+    util_by_osd.push_back(osd_util);
+  }
+
+  // sort by absolute deviation from the mean utilization,
+  // in descending order.  The deviation is computed by hand so the result
+  // stays floating point no matter which abs() overloads are visible.
+  const auto deviation = [average_util](float util) {
+    const double delta = util - average_util;
+    return delta < 0 ? -delta : delta;
+  };
+  std::sort(util_by_osd.begin(), util_by_osd.end(),
+    [&deviation](const std::pair<int, float>& l,
+                 const std::pair<int, float>& r) {
+      return deviation(l.second) > deviation(r.second);
+    }
+  );
+
+  // emit one report entry per changed OSD, to the formatter if there is one
+  const auto report = [f, &oss](int osd, unsigned weight, unsigned new_weight) {
+    if (f) {
+      f->open_object_section("osd");
+      f->dump_int("osd", osd);
+      f->dump_float("weight", (float)weight / (float)0x10000);
+      f->dump_float("new_weight", (float)new_weight / (float)0x10000);
+      f->close_section();
+    } else {
+      oss << "osd." << osd << " weight "
+          << (float)weight / (float)0x10000 << " -> "
+          << (float)new_weight / (float)0x10000 << "\n";
+    }
+  };
+
+  if (f)
+    f->open_array_section("reweights");
+
+  for (const auto& p : util_by_osd) {
+    unsigned weight = osdmap.get_weight(p.first);
+    if (weight == 0) {
+      // skip if OSD is currently out
+      continue;
+    }
+    float util = p.second;
+
+    if (util >= overload_util) {
+      // Assign a lower weight to overloaded OSDs. The current weight
+      // is a factor to take into account the original weights,
+      // to represent e.g. differing storage capacities
+      unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
+      if (weight > max_change)
+        new_weight = MAX(new_weight, weight - max_change);
+      new_weights->insert({p.first, new_weight});
+      report(p.first, weight, new_weight);
+      if (++num_changed >= max_osds)
+        break;
+    } else if (!no_increasing && util <= underload_util) {
+      // assign a higher weight.. if we can.  Clamp before converting to
+      // unsigned: a utilization of 0 makes the target weight infinite.
+      unsigned ceiling = MIN(weight + max_change, (unsigned)0x10000);
+      const double target = (average_util / util) * (float)weight;
+      unsigned new_weight = target < (double)ceiling ? (unsigned)target : ceiling;
+      if (new_weight > weight) {
+        new_weights->insert({p.first, new_weight});
+        report(p.first, weight, new_weight);
+        if (++num_changed >= max_osds)
+          break;
+      }
+    }
+  }
+  if (f) {
+    f->close_section();
+  }
+
+  OSDMap newmap;
+  newmap.deepish_copy_from(osdmap);
+  OSDMap::Incremental newinc;
+  newinc.fsid = newmap.get_fsid();
+  newinc.epoch = newmap.get_epoch() + 1;
+  newinc.new_weight = *new_weights;
+  newmap.apply_incremental(newinc);
+
+  osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
+
+  if (f) {
+    f->close_section();
+  } else {
+    *out_str += "\n";
+    *out_str += oss.str();
+  }
+  return num_changed;
+}
index 80b41b93110f01df77fa58762a58bd08e6942eab..05ccea3f03fc69f9454c8199af3e31a2538d8975 100644 (file)
@@ -440,4 +440,23 @@ public:
       PGMap::Incremental *pending_inc);
 };
 
+namespace reweight {
+/* Assign a lower weight to overloaded OSDs.
+ *
+ * The osds that will get a lower weight are those with a utilization
+ * percentage 'oload' percent greater than the average utilization.
+ */
+  int by_utilization(const OSDMap &osd_map,
+                    const PGMap &pg_map,
+                    int oload,
+                    double max_changef,
+                    int max_osds,
+                    bool by_pg, const set<int64_t> *pools,
+                    bool no_increasing,
+                    map<int32_t, uint32_t>* new_weights,
+                    std::stringstream *ss,
+                    std::string *out_str,
+                    Formatter *f);
+}
+
 #endif