From: Kefu Chai Date: Sat, 8 Apr 2017 04:29:43 +0000 (+0800) Subject: mon/OSDMonitor: extract reweight_by_utilization into PGMap X-Git-Tag: v12.0.2~83^2~8 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=82dccc4e746116511548064027ce871bb54eedcf;p=ceph.git mon/OSDMonitor: extract reweight_by_utilization into PGMap Signed-off-by: Kefu Chai --- diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index b4060740e82f..bb708fa4e792 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -448,237 +448,6 @@ void OSDMonitor::update_logger() mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch()); } -/* Assign a lower weight to overloaded OSDs. - * - * The osds that will get a lower weight are those with with a utilization - * percentage 'oload' percent greater than the average utilization. - */ -int OSDMonitor::reweight_by_utilization(int oload, - double max_changef, - int max_osds, - bool by_pg, const set<int64_t> *pools, - bool no_increasing, - bool dry_run, - std::stringstream *ss, - std::string *out_str, - Formatter *f) -{ - if (oload <= 100) { - *ss << "You must give a percentage higher than 100. " - "The reweighting threshold will be calculated as <average-utilization> " - "times <input-percentage>. 
For example, an argument of 200 would " - "reweight OSDs which are twice as utilized as the average OSD.\n"; - return -EINVAL; - } - - const PGMap &pgm = mon->pgmon()->pg_map; - vector pgs_by_osd(osdmap.get_max_osd()); - - // Avoid putting a small number (or 0) in the denominator when calculating - // average_util - double average_util; - if (by_pg) { - // by pg mapping - double weight_sum = 0.0; // sum up the crush weights - unsigned num_pg_copies = 0; - int num_osds = 0; - for (ceph::unordered_map::const_iterator p = - pgm.pg_stat.begin(); - p != pgm.pg_stat.end(); - ++p) { - if (pools && pools->count(p->first.pool()) == 0) - continue; - for (vector::const_iterator q = p->second.acting.begin(); - q != p->second.acting.end(); - ++q) { - if (*q >= (int)pgs_by_osd.size()) - pgs_by_osd.resize(*q); - if (pgs_by_osd[*q] == 0) { - if (osdmap.crush->get_item_weightf(*q) <= 0) { - //skip if we currently can not identify item - continue; - } - weight_sum += osdmap.crush->get_item_weightf(*q); - ++num_osds; - } - ++pgs_by_osd[*q]; - ++num_pg_copies; - } - } - - if (!num_osds || (num_pg_copies / num_osds < g_conf->mon_reweight_min_pgs_per_osd)) { - *ss << "Refusing to reweight: we only have " << num_pg_copies - << " PGs across " << num_osds << " osds!\n"; - return -EDOM; - } - - average_util = (double)num_pg_copies / weight_sum; - } else { - // by osd utilization - int num_osd = MAX(1, pgm.osd_stat.size()); - if ((uint64_t)pgm.osd_sum.kb * 1024 / num_osd - < g_conf->mon_reweight_min_bytes_per_osd) { - *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb - << " kb across all osds!\n"; - return -EDOM; - } - if ((uint64_t)pgm.osd_sum.kb_used * 1024 / num_osd - < g_conf->mon_reweight_min_bytes_per_osd) { - *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb_used - << " kb used across all osds!\n"; - return -EDOM; - } - - average_util = (double)pgm.osd_sum.kb_used / (double)pgm.osd_sum.kb; - } - - // adjust down only if we are above the threshold - double 
overload_util = average_util * (double)oload / 100.0; - - // but aggressively adjust weights up whenever possible. - double underload_util = average_util; - - unsigned max_change = (unsigned)(max_changef * (double)0x10000); - - ostringstream oss; - if (f) { - f->open_object_section("reweight_by_utilization"); - f->dump_int("overload_min", oload); - f->dump_float("max_change", max_changef); - f->dump_int("max_change_osds", max_osds); - f->dump_float("average_utilization", average_util); - f->dump_float("overload_utilization", overload_util); - } else { - oss << "oload " << oload << "\n"; - oss << "max_change " << max_changef << "\n"; - oss << "max_change_osds " << max_osds << "\n"; - oss.precision(4); - oss << "average_utilization " << std::fixed << average_util << "\n"; - oss << "overload_utilization " << overload_util << "\n"; - } - bool changed = false; - int num_changed = 0; - - // precompute util for each OSD - std::vector > util_by_osd; - for (ceph::unordered_map::const_iterator p = - pgm.osd_stat.begin(); - p != pgm.osd_stat.end(); - ++p) { - std::pair osd_util; - osd_util.first = p->first; - if (by_pg) { - if (p->first >= (int)pgs_by_osd.size() || - pgs_by_osd[p->first] == 0) { - // skip if this OSD does not contain any pg - // belonging to the specified pool(s). - continue; - } - - if (osdmap.crush->get_item_weightf(p->first) <= 0) { - // skip if we are unable to locate item. - continue; - } - - osd_util.second = pgs_by_osd[p->first] / osdmap.crush->get_item_weightf(p->first); - } else { - osd_util.second = (double)p->second.kb_used / (double)p->second.kb; - } - util_by_osd.push_back(osd_util); - } - - // sort by absolute deviation from the mean utilization, - // in descending order. 
- std::sort(util_by_osd.begin(), util_by_osd.end(), - [average_util](std::pair l, std::pair r) { - return abs(l.second - average_util) > abs(r.second - average_util); - } - ); - - OSDMap::Incremental newinc; - - if (f) - f->open_array_section("reweights"); - - for (std::vector >::const_iterator p = - util_by_osd.begin(); - p != util_by_osd.end(); - ++p) { - unsigned weight = osdmap.get_weight(p->first); - if (weight == 0) { - // skip if OSD is currently out - continue; - } - float util = p->second; - - if (util >= overload_util) { - // Assign a lower weight to overloaded OSDs. The current weight - // is a factor to take into account the original weights, - // to represent e.g. differing storage capacities - unsigned new_weight = (unsigned)((average_util / util) * (float)weight); - if (weight > max_change) - new_weight = MAX(new_weight, weight - max_change); - newinc.new_weight[p->first] = new_weight; - if (!dry_run) { - pending_inc.new_weight[p->first] = new_weight; - changed = true; - } - if (f) { - f->open_object_section("osd"); - f->dump_int("osd", p->first); - f->dump_float("weight", (float)weight / (float)0x10000); - f->dump_float("new_weight", (float)new_weight / (float)0x10000); - f->close_section(); - } else { - oss << "osd." << p->first << " weight " - << (float)weight / (float)0x10000 << " -> " - << (float)new_weight / (float)0x10000 << "\n"; - } - if (++num_changed >= max_osds) - break; - } - if (!no_increasing && util <= underload_util) { - // assign a higher weight.. if we can. - unsigned new_weight = (unsigned)((average_util / util) * (float)weight); - new_weight = MIN(new_weight, weight + max_change); - if (new_weight > 0x10000) - new_weight = 0x10000; - if (new_weight > weight) { - newinc.new_weight[p->first] = new_weight; - if (!dry_run) { - pending_inc.new_weight[p->first] = new_weight; - changed = true; - } - oss << "osd." 
<< p->first << " weight " - << (float)weight / (float)0x10000 << " -> " - << (float)new_weight / (float)0x10000 << "\n"; - if (++num_changed >= max_osds) - break; - } - } - } - if (f) { - f->close_section(); - } - - OSDMap newmap; - newmap.deepish_copy_from(osdmap); - newinc.fsid = newmap.fsid; - newinc.epoch = newmap.get_epoch() + 1; - newmap.apply_incremental(newinc); - - osdmap.summarize_mapping_stats(&newmap, pools, out_str, f); - - if (f) { - f->close_section(); - } else { - *out_str += "\n"; - *out_str += oss.str(); - } - dout(10) << "reweight_by_utilization: finished with " << out_str << dendl; - return changed; -} - template class OSDUtilizationDumper : public CrushTreeDumper::Dumper { public: @@ -8628,24 +8397,31 @@ done: string no_increasing; cmd_getval(g_ceph_context, cmdmap, "no_increasing", no_increasing); string out_str; - err = reweight_by_utilization(oload, - max_change, - max_osds, - by_pg, - pools.empty() ? NULL : &pools, - no_increasing == "--no-increasing", - dry_run, - &ss, &out_str, f.get()); + map new_weights; + err = reweight::by_utilization(osdmap, + mon->pgmon()->pg_map, + oload, + max_change, + max_osds, + by_pg, + pools.empty() ? 
NULL : &pools, + no_increasing == "--no-increasing", + &new_weights, + &ss, &out_str, f.get()); + if (err >= 0) { + dout(10) << "reweight::by_utilization: finished with " << out_str << dendl; + } if (f) f->flush(rdata); else rdata.append(out_str); if (err < 0) { ss << "FAILED reweight-by-pg"; - } else if (err == 0) { + } else if (err == 0 || dry_run) { ss << "no change"; } else { ss << "SUCCESSFUL reweight-by-pg"; + pending_inc.new_weight = std::move(new_weights); wait_for_finished_proposal( op, new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1)); diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index b02067ac5f24..32d08f78d3e0 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -240,16 +240,6 @@ public: MonOpRequestRef req = MonOpRequestRef()); private: - int reweight_by_utilization(int oload, - double max_change, - int max_osds, - bool by_pg, - const set<int64_t> *pools, - bool no_increasing, - bool dry_run, - std::stringstream *ss, - std::string *out_str, - Formatter *f); void print_utilization(ostream &out, Formatter *f, bool tree) const; bool check_source(PaxosServiceMessage *m, uuid_d fsid); diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index bd0bab0a6355..320309edd6c4 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -2884,3 +2884,210 @@ void PGMapUpdater::check_down_pgs( } } } + +int reweight::by_utilization( + const OSDMap &osdmap, + const PGMap &pgm, + int oload, + double max_changef, + int max_osds, + bool by_pg, const set<int64_t> *pools, + bool no_increasing, + map<int32_t, uint32_t>* new_weights, + std::stringstream *ss, + std::string *out_str, + Formatter *f) +{ + if (oload <= 100) { + *ss << "You must give a percentage higher than 100. " + "The reweighting threshold will be calculated as <average-utilization> " + "times <input-percentage>. 
For example, an argument of 200 would " + "reweight OSDs which are twice as utilized as the average OSD.\n"; + return -EINVAL; + } + + vector pgs_by_osd(osdmap.get_max_osd()); + + // Avoid putting a small number (or 0) in the denominator when calculating + // average_util + double average_util; + if (by_pg) { + // by pg mapping + double weight_sum = 0.0; // sum up the crush weights + unsigned num_pg_copies = 0; + int num_osds = 0; + for (const auto& pg : pgm.pg_stat) { + if (pools && pools->count(pg.first.pool()) == 0) + continue; + for (const auto acting : pg.second.acting) { + if (acting >= (int)pgs_by_osd.size()) + pgs_by_osd.resize(acting); + if (pgs_by_osd[acting] == 0) { + if (osdmap.crush->get_item_weightf(acting) <= 0) { + //skip if we currently can not identify item + continue; + } + weight_sum += osdmap.crush->get_item_weightf(acting); + ++num_osds; + } + ++pgs_by_osd[acting]; + ++num_pg_copies; + } + } + + if (!num_osds || (num_pg_copies / num_osds < g_conf->mon_reweight_min_pgs_per_osd)) { + *ss << "Refusing to reweight: we only have " << num_pg_copies + << " PGs across " << num_osds << " osds!\n"; + return -EDOM; + } + + average_util = (double)num_pg_copies / weight_sum; + } else { + // by osd utilization + int num_osd = MAX(1, pgm.osd_stat.size()); + if ((uint64_t)pgm.osd_sum.kb * 1024 / num_osd + < g_conf->mon_reweight_min_bytes_per_osd) { + *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb + << " kb across all osds!\n"; + return -EDOM; + } + if ((uint64_t)pgm.osd_sum.kb_used * 1024 / num_osd + < g_conf->mon_reweight_min_bytes_per_osd) { + *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb_used + << " kb used across all osds!\n"; + return -EDOM; + } + + average_util = (double)pgm.osd_sum.kb_used / (double)pgm.osd_sum.kb; + } + + // adjust down only if we are above the threshold + const double overload_util = average_util * (double)oload / 100.0; + + // but aggressively adjust weights up whenever possible. 
+ const double underload_util = average_util; + + const unsigned max_change = (unsigned)(max_changef * (double)0x10000); + + ostringstream oss; + if (f) { + f->open_object_section("reweight_by_utilization"); + f->dump_int("overload_min", oload); + f->dump_float("max_change", max_changef); + f->dump_int("max_change_osds", max_osds); + f->dump_float("average_utilization", average_util); + f->dump_float("overload_utilization", overload_util); + } else { + oss << "oload " << oload << "\n"; + oss << "max_change " << max_changef << "\n"; + oss << "max_change_osds " << max_osds << "\n"; + oss.precision(4); + oss << "average_utilization " << std::fixed << average_util << "\n"; + oss << "overload_utilization " << overload_util << "\n"; + } + int num_changed = 0; + + // precompute util for each OSD + std::vector > util_by_osd; + for (const auto& p : pgm.osd_stat) { + std::pair osd_util; + osd_util.first = p.first; + if (by_pg) { + if (p.first >= (int)pgs_by_osd.size() || + pgs_by_osd[p.first] == 0) { + // skip if this OSD does not contain any pg + // belonging to the specified pool(s). + continue; + } + + if (osdmap.crush->get_item_weightf(p.first) <= 0) { + // skip if we are unable to locate item. + continue; + } + + osd_util.second = pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first); + } else { + osd_util.second = (double)p.second.kb_used / (double)p.second.kb; + } + util_by_osd.push_back(osd_util); + } + + // sort by absolute deviation from the mean utilization, + // in descending order. 
+ std::sort(util_by_osd.begin(), util_by_osd.end(), + [average_util](std::pair l, std::pair r) { + return abs(l.second - average_util) > abs(r.second - average_util); + } + ); + + if (f) + f->open_array_section("reweights"); + + for (const auto& p : util_by_osd) { + unsigned weight = osdmap.get_weight(p.first); + if (weight == 0) { + // skip if OSD is currently out + continue; + } + float util = p.second; + + if (util >= overload_util) { + // Assign a lower weight to overloaded OSDs. The current weight + // is a factor to take into account the original weights, + // to represent e.g. differing storage capacities + unsigned new_weight = (unsigned)((average_util / util) * (float)weight); + if (weight > max_change) + new_weight = MAX(new_weight, weight - max_change); + new_weights->insert({p.first, new_weight}); + if (f) { + f->open_object_section("osd"); + f->dump_int("osd", p.first); + f->dump_float("weight", (float)weight / (float)0x10000); + f->dump_float("new_weight", (float)new_weight / (float)0x10000); + f->close_section(); + } else { + oss << "osd." << p.first << " weight " + << (float)weight / (float)0x10000 << " -> " + << (float)new_weight / (float)0x10000 << "\n"; + } + if (++num_changed >= max_osds) + break; + } + if (!no_increasing && util <= underload_util) { + // assign a higher weight.. if we can. + unsigned new_weight = (unsigned)((average_util / util) * (float)weight); + new_weight = MIN(new_weight, weight + max_change); + if (new_weight > 0x10000) + new_weight = 0x10000; + if (new_weight > weight) { + new_weights->insert({p.first, new_weight}); + oss << "osd." 
<< p.first << " weight " + << (float)weight / (float)0x10000 << " -> " + << (float)new_weight / (float)0x10000 << "\n"; + if (++num_changed >= max_osds) + break; + } + } + } + if (f) { + f->close_section(); + } + + OSDMap newmap; + newmap.deepish_copy_from(osdmap); + OSDMap::Incremental newinc; + newinc.fsid = newmap.get_fsid(); + newinc.epoch = newmap.get_epoch() + 1; + newinc.new_weight = *new_weights; + newmap.apply_incremental(newinc); + + osdmap.summarize_mapping_stats(&newmap, pools, out_str, f); + + if (f) { + f->close_section(); + } else { + *out_str += "\n"; + *out_str += oss.str(); + } + return num_changed; +} diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h index 80b41b93110f..05ccea3f03fc 100644 --- a/src/mon/PGMap.h +++ b/src/mon/PGMap.h @@ -440,4 +440,23 @@ public: PGMap::Incremental *pending_inc); }; +namespace reweight { +/* Assign a lower weight to overloaded OSDs. + * + * The osds that will get a lower weight are those with a utilization + * percentage 'oload' percent greater than the average utilization. + */ + int by_utilization(const OSDMap &osd_map, + const PGMap &pg_map, + int oload, + double max_changef, + int max_osds, + bool by_pg, const set<int64_t> *pools, + bool no_increasing, + map<int32_t, uint32_t>* new_weights, + std::stringstream *ss, + std::string *out_str, + Formatter *f); +} + #endif