From 8ed66d3087ef9437bf406569e825d3115db1968d Mon Sep 17 00:00:00 2001 From: Josh Salomon Date: Mon, 18 Apr 2022 11:37:13 +0300 Subject: [PATCH] osd, mon: add read balance score to command `ceph osd pool ls detail` for replicated pools. osd: Added workload balance score to the command ceph osd pool ls detail (different flavors for console output and json/xml output) mon: Added command 'osd rm-primary-temp' (developers only) rm was not working vstart: Added osd debug messages into mon log with -d flag For commands that execute methods in OSD module Signed-off-by: Josh Salomon --- src/mgr/ActivePyModules.cc | 2 +- src/mgr/PyOSDMap.cc | 2 +- src/mon/MonCommands.h | 6 +- src/mon/OSDMonitor.cc | 54 ++-- src/osd/OSDMap.cc | 490 +++++++++++++++++++++++++++++--- src/osd/OSDMap.h | 79 ++++- src/test/osd/TestOSDMap.cc | 195 +++++++++++-- src/tools/ceph_monstore_tool.cc | 2 +- src/tools/osdmaptool.cc | 2 +- src/vstart.sh | 1 + 10 files changed, 742 insertions(+), 91 deletions(-) diff --git a/src/mgr/ActivePyModules.cc b/src/mgr/ActivePyModules.cc index 58c3d9ee4d6..76e9b41f62c 100644 --- a/src/mgr/ActivePyModules.cc +++ b/src/mgr/ActivePyModules.cc @@ -235,7 +235,7 @@ PyObject *ActivePyModules::get_python(const std::string &what) cluster_state.with_osdmap([&](const OSDMap &osd_map){ no_gil.acquire_gil(); if (what == "osd_map") { - osd_map.dump(&f); + osd_map.dump(&f, g_ceph_context); } else if (what == "osd_map_tree") { osd_map.print_tree(&f, nullptr); } else if (what == "osd_map_crush") { diff --git a/src/mgr/PyOSDMap.cc b/src/mgr/PyOSDMap.cc index ad188afccbc..83475f5ee5f 100644 --- a/src/mgr/PyOSDMap.cc +++ b/src/mgr/PyOSDMap.cc @@ -50,7 +50,7 @@ static PyObject *osdmap_get_crush_version(BasePyOSDMap* self, PyObject *obj) static PyObject *osdmap_dump(BasePyOSDMap* self, PyObject *obj) { PyFormatter f; - self->osdmap->dump(&f); + self->osdmap->dump(&f, g_ceph_context); return f.get(); } diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 
29e7b21275a..2f3fc83379e 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -993,7 +993,11 @@ COMMAND("osd rm-pg-upmap-primary " COMMAND("osd primary-temp " "name=pgid,type=CephPgid " "name=id,type=CephOsdName", - "set primary_temp mapping pgid:|-1 (developers only)", + "set primary_temp mapping pgid: (developers only)", + "osd", "rw") +COMMAND("osd rm-primary-temp " + "name=pgid,type=CephPgid ", + "clear primary_temp mapping pgid (developers only)", "osd", "rw") COMMAND("osd primary-affinity " "name=id,type=CephOsdName " diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index bb8a5488de8..cdeeb49c138 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -5350,7 +5350,7 @@ static void dump_cpu_list(Formatter *f, const char *name, void OSDMonitor::dump_info(Formatter *f) { f->open_object_section("osdmap"); - osdmap.dump(f); + osdmap.dump(f, cct); f->close_section(); f->open_array_section("osd_metadata"); @@ -5509,11 +5509,11 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) stringstream ds; if (f) { f->open_object_section("osdmap"); - p->dump(f.get()); + p->dump(f.get(), cct); f->close_section(); f->flush(ds); } else { - p->print(ds); + p->print(cct, ds); } rdata.append(ds); if (!f) @@ -6051,26 +6051,25 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) cmd_getval(cmdmap, "detail", detail); if (!f && detail == "detail") { ostringstream ss; - osdmap.print_pools(ss); + osdmap.print_pools(cct, ss); rdata.append(ss.str()); } else { if (f) f->open_array_section("pools"); - for (map::const_iterator it = osdmap.get_pools().begin(); - it != osdmap.get_pools().end(); - ++it) { + for (auto &[pid, pdata] : osdmap.get_pools()) { if (f) { if (detail == "detail") { f->open_object_section("pool"); - f->dump_int("pool_id", it->first); - f->dump_string("pool_name", osdmap.get_pool_name(it->first)); - it->second.dump(f.get()); + f->dump_int("pool_id", pid); + f->dump_string("pool_name", osdmap.get_pool_name(pid)); + 
pdata.dump(f.get()); + osdmap.dump_read_balance_score(cct, pid, pdata, f.get()); f->close_section(); } else { - f->dump_string("pool_name", osdmap.get_pool_name(it->first)); + f->dump_string("pool_name", osdmap.get_pool_name(pid)); } } else { - rdata.append(osdmap.get_pool_name(it->first) + "\n"); + rdata.append(osdmap.get_pool_name(pid) + "\n"); } } if (f) { @@ -10294,7 +10293,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, set osds; newcrush.get_devices_by_class(device_class, &osds); for (auto& p: osds) { - err = newcrush.remove_device_class(g_ceph_context, p, &ss); + err = newcrush.remove_device_class(cct, p, &ss); if (err < 0) { // ss has reason for failure goto reply; @@ -12027,23 +12026,32 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, new_pg_temp.begin(), new_pg_temp.end()); ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp; goto update; - } else if (prefix == "osd primary-temp") { + } else if (prefix == "osd primary-temp" || + prefix == "osd rm-primary-temp") { pg_t pgid; err = parse_pgid(cmdmap, ss, pgid); if (err < 0) goto reply; int64_t osd; - if (!cmd_getval(cmdmap, "id", osd)) { - ss << "unable to parse 'id' value '" - << cmd_vartype_stringify(cmdmap.at("id")) << "'"; - err = -EINVAL; - goto reply; + if (prefix == "osd primary-temp") { + if (!cmd_getval(cmdmap, "id", osd)) { + ss << "unable to parse 'id' value '" + << cmd_vartype_stringify(cmdmap.at("id")) << "'"; + err = -EINVAL; + goto reply; + } + if (!osdmap.exists(osd)) { + ss << "osd." << osd << " does not exist"; + err = -ENOENT; + goto reply; + } } - if (osd != -1 && !osdmap.exists(osd)) { - ss << "osd." 
<< osd << " does not exist"; - err = -ENOENT; - goto reply; + else if (prefix == "osd rm-primary-temp") { + osd = -1; + } + else { + ceph_assert(0 == "Unreachable!"); } if (osdmap.require_min_compat_client != ceph_release_t::unknown && diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 699ccb21665..c36a926245c 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -3754,7 +3755,61 @@ void OSDMap::dump_osd(int id, Formatter *f) const f->close_section(); } -void OSDMap::dump(Formatter *f) const +void OSDMap::dump_pool(CephContext *cct, + int64_t pid, + const pg_pool_t &pdata, + ceph::Formatter *f) const +{ + std::string name(""); + const auto &pni = pool_name.find(pid); + if (pni != pool_name.end()) + name = pni->second; + f->open_object_section("pool"); + f->dump_int("pool", pid); + f->dump_string("pool_name", name); + pdata.dump(f); + dump_read_balance_score(cct, pid, pdata, f); + f->close_section(); // pool +} + +void OSDMap::dump_read_balance_score(CephContext *cct, + int64_t pid, + const pg_pool_t &pdata, + ceph::Formatter *f) const +{ + if (pdata.is_replicated()) { + // Add rb section with values for score, optimal score, raw score + // // and primary_affinity average + OSDMap::read_balance_info_t rb_info; + auto rc = calc_read_balance_score(cct, pid, &rb_info); + if (rc >= 0) { + f->open_object_section("read_balance"); + f->dump_float("score_acting", rb_info.acting_adj_score); + f->dump_float("score_stable", rb_info.adjusted_score); + f->dump_float("optimal_score", rb_info.optimal_score); + f->dump_float("raw_score_acting", rb_info.acting_raw_score); + f->dump_float("raw_score_stable", rb_info.raw_score); + f->dump_float("primary_affinity_weighted", rb_info.pa_weighted); + f->dump_float("average_primary_affinity", rb_info.pa_avg); + f->dump_float("average_primary_affinity_weighted", rb_info.pa_weighted_avg); + if (rb_info.err_msg.length() > 0) { + f->dump_string("error_message", 
rb_info.err_msg); + } + f->close_section(); // read_balance + } + else { + if (rb_info.err_msg.length() > 0) { + f->open_object_section("read_balance"); + f->dump_string("error_message", rb_info.err_msg); + f->dump_float("score_acting", rb_info.acting_adj_score); + f->dump_float("score_stable", rb_info.adjusted_score); + f->close_section(); // read_balance + } + } + } +} + +void OSDMap::dump(Formatter *f, CephContext *cct) const { f->dump_int("epoch", get_epoch()); f->dump_stream("fsid") << get_fsid(); @@ -3786,16 +3841,8 @@ void OSDMap::dump(Formatter *f) const to_string(require_osd_release)); f->open_array_section("pools"); - for (const auto &pool : pools) { - std::string name(""); - const auto &pni = pool_name.find(pool.first); - if (pni != pool_name.end()) - name = pni->second; - f->open_object_section("pool"); - f->dump_int("pool", pool.first); - f->dump_string("pool_name", name); - pool.second.dump(f); - f->close_section(); + for (const auto &[pid, pdata] : pools) { + dump_pool(cct, pid, pdata, f); } f->close_section(); @@ -4028,23 +4075,39 @@ string OSDMap::get_flag_string() const return get_flag_string(flags); } -void OSDMap::print_pools(ostream& out) const +void OSDMap::print_pools(CephContext *cct, ostream& out) const { - for (const auto &pool : pools) { + for (const auto &[pid, pdata] : pools) { std::string name(""); - const auto &pni = pool_name.find(pool.first); + const auto &pni = pool_name.find(pid); if (pni != pool_name.end()) name = pni->second; - out << "pool " << pool.first + char rb_score_str[32] = ""; + int rc = 0; + read_balance_info_t rb_info; + if (pdata.is_replicated()) { + rc = calc_read_balance_score(cct, pid, &rb_info); + if (rc >= 0) + snprintf (rb_score_str, sizeof(rb_score_str), + " read_balance_score %.2f", rb_info.acting_adj_score); + } + + out << "pool " << pid << " '" << name - << "' " << pool.second << "\n"; + << "' " << pdata + << rb_score_str << "\n"; + if (rb_info.err_msg.length() > 0) { + out << (rc < 0 ? 
" ERROR: " : " Warning: ") << rb_info.err_msg << "\n"; + } + + //TODO - print error messages here. - for (const auto &snap : pool.second.snaps) + for (const auto &snap : pdata.snaps) out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n"; - if (!pool.second.removed_snaps.empty()) - out << "\tremoved_snaps " << pool.second.removed_snaps << "\n"; - auto p = removed_snaps_queue.find(pool.first); + if (!pdata.removed_snaps.empty()) + out << "\tremoved_snaps " << pdata.removed_snaps << "\n"; + auto p = removed_snaps_queue.find(pid); if (p != removed_snaps_queue.end()) { out << "\tremoved_snaps_queue " << p->second << "\n"; } @@ -4085,7 +4148,7 @@ void OSDMap::print_osd(int id, ostream& out) const out << "\n"; } -void OSDMap::print(ostream& out) const +void OSDMap::print(CephContext *cct, ostream& out) const { out << "epoch " << get_epoch() << "\n" << "fsid " << get_fsid() << "\n" @@ -4118,7 +4181,7 @@ void OSDMap::print(ostream& out) const out << "cluster_snapshot " << get_cluster_snapshot() << "\n"; out << "\n"; - print_pools(out); + print_pools(cct, out); out << "max_osd " << get_max_osd() << "\n"; print_osds(out); @@ -4692,13 +4755,13 @@ int OSDMap::summarize_mapping_stats( if (osd >= 0 && osd < get_max_osd()) ++new_by_osd[osd]; } - if (pi->type == pg_pool_t::TYPE_ERASURE) { + if (pi->is_erasure()) { for (unsigned i=0; itype == pg_pool_t::TYPE_REPLICATED) { + } else if (pi->is_replicated()) { for (int osd : up) { if (std::find(up2.begin(), up2.end(), osd) == up2.end()) { ++moved_pg; @@ -5182,20 +5245,92 @@ int OSDMap::calc_pg_upmaps( return num_changed; } +map> OSDMap::get_pgs_by_osd( + CephContext *cct, + int64_t pid, + map> *p_primaries_by_osd, + map> *p_acting_primaries_by_osd) const +{ + // Set up the OSDMap + OSDMap tmp_osd_map; + tmp_osd_map.deepish_copy_from(*this); + + // Get the pool from the provided pool id + const pg_pool_t* pool = get_pg_pool(pid); + + // build array of pgs from the pool + map> pgs_by_osd; 
+ for (unsigned ps = 0; ps < pool->get_pg_num(); ++ps) { + pg_t pg(ps, pid); + vector up; + int primary; + int acting_prim; + tmp_osd_map.pg_to_up_acting_osds(pg, &up, &primary, nullptr, &acting_prim); + if (cct != nullptr) + ldout(cct, 20) << __func__ << " " << pg + << " up " << up + << " primary " << primary + << " acting_primary " << acting_prim + << dendl; + + if (!up.empty()) { // up can be empty in test generated files + // in this case, we return empty result + for (auto osd : up) { + if (osd != CRUSH_ITEM_NONE) + pgs_by_osd[osd].insert(pg); + } + if (p_primaries_by_osd != nullptr) { + if (primary != CRUSH_ITEM_NONE) + (*p_primaries_by_osd)[primary].insert(pg); + } + if (p_acting_primaries_by_osd != nullptr) { + if (acting_prim != CRUSH_ITEM_NONE) + (*p_acting_primaries_by_osd)[acting_prim].insert(pg); + } + } + } + return pgs_by_osd; +} + +float OSDMap::get_osds_weight( + CephContext *cct, + const OSDMap& tmp_osd_map, + int64_t pid, + map& osds_weight) const +{ + map pmap; + ceph_assert(pools.count(pid)); + int ruleno = pools.at(pid).get_crush_rule(); + tmp_osd_map.crush->get_rule_weight_osd_map(ruleno, &pmap); + ldout(cct,20) << __func__ << " pool " << pid + << " ruleno " << ruleno + << " weight-map " << pmap + << dendl; + float osds_weight_total = 0; + for (auto [oid, oweight] : pmap) { + auto adjusted_weight = tmp_osd_map.get_weightf(oid) * oweight; + if (adjusted_weight != 0) { + osds_weight[oid] += adjusted_weight; + osds_weight_total += adjusted_weight; + } + } + return osds_weight_total; +} + float OSDMap::build_pool_pgs_info ( CephContext *cct, const std::set& only_pools, ///< [optional] restrict to pool const OSDMap& tmp_osd_map, int& total_pgs, map>& pgs_by_osd, - map& osd_weight) + map& osds_weight) { // // This function builds some data structures that are used by calc_pg_upmaps. 
// Specifically it builds pgs_by_osd and osd_weight maps, updates total_pgs // and returns the osd_weight_total // - float osd_weight_total = 0.0; + float osds_weight_total = 0.0; for (auto& [pid, pdata] : pools) { if (!only_pools.empty() && !only_pools.count(pid)) continue; @@ -5211,23 +5346,9 @@ float OSDMap::build_pool_pgs_info ( } total_pgs += pdata.get_size() * pdata.get_pg_num(); - map pmap; - int ruleno = pdata.get_crush_rule(); - tmp_osd_map.crush->get_rule_weight_osd_map(ruleno, &pmap); - ldout(cct,20) << __func__ << " pool " << pid - << " ruleno " << ruleno - << " weight-map " << pmap - << dendl; - for (auto [oid, oweight] : pmap) { - auto adjusted_weight = tmp_osd_map.get_weightf(oid) * oweight; - if (adjusted_weight == 0) { - continue; - } - osd_weight[oid] += adjusted_weight; - osd_weight_total += adjusted_weight; - } + osds_weight_total = get_osds_weight(cct, tmp_osd_map, pid, osds_weight); } - for (auto& [oid, oweight] : osd_weight) { + for (auto& [oid, oweight] : osds_weight) { int pgs = 0; auto p = pgs_by_osd.find(oid); if (p != pgs_by_osd.end()) @@ -5237,7 +5358,7 @@ float OSDMap::build_pool_pgs_info ( ldout(cct, 20) << " osd." 
<< oid << " weight " << oweight << " pgs " << pgs << dendl; } - return osd_weight_total; + return osds_weight_total; } // return total weight of all OSDs @@ -5582,6 +5703,289 @@ OSDMap::candidates_t OSDMap::build_candidates( return candidates; } +// return -1 if all PGs are OK, else the first PG which includes only zero PA OSDs +int64_t OSDMap::has_zero_pa_pgs(CephContext *cct, int64_t pool_id) const +{ + const pg_pool_t* pool = get_pg_pool(pool_id); + for (unsigned ps = 0; ps < pool->get_pg_num(); ++ps) { + pg_t pg(ps, pool_id); + vector acting; + pg_to_up_acting_osds(pg, nullptr, nullptr, &acting, nullptr); + if (cct != nullptr) { + ldout(cct, 30) << __func__ << " " << pg << " acting " << acting << dendl; + } + bool pg_zero_pa = true; + for (auto osd : acting) { + if (get_primary_affinityf(osd) != 0) { + pg_zero_pa = false; + break; + } + } + if (pg_zero_pa) { + if (cct != nullptr) { + ldout(cct, 20) << __func__ << " " << pg << " - maps only to OSDs with primary affinity 0" << dendl; + } + return (int64_t)ps; + } + } + return -1; +} + +void OSDMap::zero_rbi(read_balance_info_t &rbi) const { + rbi.pa_avg = 0.; + rbi.pa_weighted = 0.; + rbi.pa_weighted_avg = 0.; + rbi.raw_score = 0.; + rbi.optimal_score = 0.; + rbi.adjusted_score = 0.; + rbi.acting_raw_score = 0.; + rbi.acting_adj_score = 0.; + rbi.err_msg = ""; +} + +int OSDMap::set_rbi( + CephContext *cct, + read_balance_info_t &rbi, + int64_t pool_id, + float total_w_pa, + float pa_sum, + int num_osds, + int osd_pa_count, + float total_osd_weight, + uint max_prims_per_osd, + uint max_acting_prims_per_osd, + float avg_prims_per_osd, + bool prim_on_zero_pa, + bool acting_on_zero_pa, + float max_osd_score) const +{ + // put all the ugly code here, so rest of code is nicer. + const pg_pool_t* pool = get_pg_pool(pool_id); + zero_rbi(rbi); + + if (total_w_pa / total_osd_weight < 1. / float(pool->get_size())) { + ldout(cct, 20) << __func__ << " pool " << pool_id << " average primary affinity is lower than" + << 1. 
/ float(pool->get_size()) << dendl; + rbi.err_msg = fmt::format( + "pool {} average primary affinity is lower than {:.2f}, read balance score is not reliable", + pool_id, 1. / float(pool->get_size())); + return -EINVAL; + } + rbi.pa_weighted = total_w_pa; + + // weighted_prim_affinity_avg + rbi.pa_weighted_avg = rbi_round(rbi.pa_weighted / total_osd_weight); // in [0..1] + // p_rbi->pa_weighted / osd_pa_count; // in [0..1] + + rbi.raw_score = rbi_round((float)max_prims_per_osd / avg_prims_per_osd); // >=1 + if (acting_on_zero_pa) { + rbi.acting_raw_score = rbi_round(max_osd_score); + rbi.err_msg = fmt::format( + "pool {} has acting primaries on OSD(s) with primary affinity 0, read balance score is not accurate", + pool_id); + } else { + rbi.acting_raw_score = rbi_round((float)max_acting_prims_per_osd / avg_prims_per_osd); + } + + if (osd_pa_count != 0) { + // this implies that pa_sum > 0 + rbi.pa_avg = rbi_round(pa_sum / osd_pa_count); // in [0..1] + } else { + rbi.pa_avg = 0.; + } + + if (rbi.pa_avg != 0.) { + int64_t zpg; + if ((zpg = has_zero_pa_pgs(cct, pool_id)) >= 0) { + pg_t pg(zpg, pool_id); + std::stringstream ss; + ss << pg; + ldout(cct, 10) << __func__ << " pool " << pool_id << " has some PGs where all OSDs are with primary_affinity 0 (" << pg << ",...)" << dendl; + rbi.err_msg = fmt::format( + "pool {} has some PGs where all OSDs are with primary_affinity 0 (at least pg {}), read balance score may not be reliable", + pool_id, ss.str()); + return -EINVAL; + } + rbi.optimal_score = rbi_round(float(num_osds) / float(osd_pa_count)); // >= 1 + // adjust the score to the primary affinity setting (if prim affinity is set + // the raw score can't be 1 and the optimal (perfect) score is higher than 1) + // When total system primary affinity is too low (average < 1 / pool replica count) + // the score is negative in order to grab the user's attention. 
+ rbi.adjusted_score = rbi_round(rbi.raw_score / rbi.optimal_score); // >= 1 if PA is not low + rbi.acting_adj_score = rbi_round(rbi.acting_raw_score / rbi.optimal_score); // >= 1 if PA is not low + + } else { + // We should never get here - this condition is checked before calling this function - this is just sanity check code. + rbi.err_msg = fmt::format( + "pool {} all OSDs have zero primary affinity, can't calculate a reliable read balance score", + pool_id); + return -EINVAL; + } + + return 0; +} + +int OSDMap::calc_read_balance_score(CephContext *cct, int64_t pool_id, + read_balance_info_t *p_rbi) const +{ + //BUG: wrong score with one PG replica 3 and 4 OSDs + if (cct != nullptr) + ldout(cct,20) << __func__ << " pool " << get_pool_name(pool_id) << dendl; + + OSDMap tmp_osd_map; + tmp_osd_map.deepish_copy_from(*this); + if (p_rbi == nullptr) { + // The only case where error message is not set - this is not tested in the unit test. + if (cct != nullptr) + ldout(cct,30) << __func__ << " p_rbi is nullptr." << dendl; + return -EINVAL; + } + + if (tmp_osd_map.pools.count(pool_id) == 0) { + if (cct != nullptr) + ldout(cct,30) << __func__ << " pool " << pool_id << " not found." << dendl; + zero_rbi(*p_rbi); + p_rbi->err_msg = fmt::format("pool {} not found", pool_id); + return -ENOENT; + } + int rc = 0; + const pg_pool_t* pool = tmp_osd_map.get_pg_pool(pool_id); + auto num_pgs = pool->get_pg_num(); + + map> pgs_by_osd; + map> prim_pgs_by_osd; + map> acting_prims_by_osd; + + pgs_by_osd = tmp_osd_map.get_pgs_by_osd(cct, pool_id, &prim_pgs_by_osd, &acting_prims_by_osd); + + if (cct != nullptr) + ldout(cct,30) << __func__ << " Primaries for pool: " + << prim_pgs_by_osd << dendl; + + if (pgs_by_osd.empty()) { + //p_rbi->err_msg = fmt::format("pool {} has no PGs mapped to OSDs", pool_id); + return -EINVAL; + } + if (cct != nullptr) { + for (auto& [osd,pgs] : prim_pgs_by_osd) { + ldout(cct,20) << __func__ << " Pool " << pool_id << " OSD." 
<< osd + << " has " << pgs.size() << " primary PGs, " + << acting_prims_by_osd[osd].size() << " acting primaries." + << dendl; + } + } + + auto num_osds = pgs_by_osd.size(); + + float avg_prims_per_osd = (float)num_pgs / (float)num_osds; + uint64_t max_prims_per_osd = 0; + uint64_t max_acting_prims_per_osd = 0; + float max_osd_score = 0.; + bool prim_on_zero_pa = false; + bool acting_on_zero_pa = false; + + float prim_affinity_sum = 0.; + float total_osd_weight = 0.; + float total_weighted_pa = 0.; + + map osds_crush_weight; + // Set up the OSDMap + int ruleno = tmp_osd_map.pools.at(pool_id).get_crush_rule(); + tmp_osd_map.crush->get_rule_weight_osd_map(ruleno, &osds_crush_weight); + + if (cct != nullptr) { + ldout(cct,20) << __func__ << " pool " << pool_id + << " ruleno " << ruleno + << " weight-map " << osds_crush_weight + << dendl; + } + uint osd_pa_count = 0; + + for (auto [osd, oweight] : osds_crush_weight) { // loop over all OSDs + total_osd_weight += oweight; + float osd_pa = tmp_osd_map.get_primary_affinityf(osd); + total_weighted_pa += oweight * osd_pa; + if (osd_pa != 0.) { + osd_pa_count++; + } + if (prim_pgs_by_osd.count(osd)) { + auto n_prims = prim_pgs_by_osd.at(osd).size(); + max_prims_per_osd = std::max(max_prims_per_osd, n_prims); + if (osd_pa == 0.) { + prim_on_zero_pa = true; + } + } + if (acting_prims_by_osd.count(osd)) { + auto n_aprims = acting_prims_by_osd.at(osd).size(); + max_acting_prims_per_osd = std::max(max_acting_prims_per_osd, n_aprims); + if (osd_pa != 0.) { + max_osd_score = std::max(max_osd_score, float(n_aprims) / osd_pa); + } + else { + acting_on_zero_pa = true; + } + } + + prim_affinity_sum += osd_pa; + if (cct != nullptr) { + auto np = prim_pgs_by_osd.count(osd) ? prim_pgs_by_osd.at(osd).size() : 0; + auto nap = acting_prims_by_osd.count(osd) ? acting_prims_by_osd.at(osd).size() : 0; + auto wt = osds_crush_weight.count(osd) ? osds_crush_weight.at(osd) : 0.; + ldout(cct,30) << __func__ << " OSD." 
<< osd << " info: " + << " num_primaries " << np + << " num_acting_prims " << nap + << " prim_affinity " << tmp_osd_map.get_primary_affinityf(osd) + << " weight " << wt + << dendl; + } + } + if (cct != nullptr) { + ldout(cct,30) << __func__ << " pool " << pool_id + << " total_osd_weight " << total_osd_weight + << " total_weighted_pa " << total_weighted_pa + << dendl; + } + + if (prim_affinity_sum == 0.0) { + if (cct != nullptr) { + ldout(cct, 10) << __func__ << " pool " << pool_id + << " has primary_affinity set to zero on all OSDs" << dendl; + } + zero_rbi(*p_rbi); + p_rbi->err_msg = fmt::format("pool {} has primary_affinity set to zero on all OSDs", pool_id); + + return -ERANGE; // score has a different meaning now. + } + else { + max_osd_score *= prim_affinity_sum / num_osds; + } + + rc = tmp_osd_map.set_rbi(cct, *p_rbi, pool_id, total_weighted_pa, + prim_affinity_sum, num_osds, osd_pa_count, + total_osd_weight, max_prims_per_osd, + max_acting_prims_per_osd, avg_prims_per_osd, + prim_on_zero_pa, acting_on_zero_pa, max_osd_score); + + if (cct != nullptr) { + ldout(cct,30) << __func__ << " pool " << get_pool_name(pool_id) + << " pa_avg " << p_rbi->pa_avg + << " pa_weighted " << p_rbi->pa_weighted + << " pa_weighted_avg " << p_rbi->pa_weighted_avg + << " optimal_score " << p_rbi->optimal_score + << " adjusted_score " << p_rbi->adjusted_score + << " acting_adj_score " << p_rbi->acting_adj_score + << dendl; + ldout(cct,20) << __func__ << " pool " << get_pool_name(pool_id) + << " raw_score: " << p_rbi->raw_score + << " acting_raw_score: " << p_rbi->acting_raw_score + << dendl; + ldout(cct,10) << __func__ << " pool " << get_pool_name(pool_id) + << " wl_score: " << p_rbi->acting_adj_score << dendl; + } + + return rc; +} + int OSDMap::get_osds_by_bucket_name(const string &name, set *osds) const { return crush->get_leaves(name, osds); diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index 54474c2893e..b0ae3bc4e60 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ 
-1465,13 +1465,29 @@ public: ); private: // Bunch of internal functions used only by calc_pg_upmaps (result of code refactoring) + + std::map> get_pgs_by_osd( + CephContext *cct, + int64_t pid, + std::map> *p_primaries_by_osd = nullptr, + std::map> *p_acting_primaries_by_osd = nullptr + ) const; // used in calc_desired_primary_distribution() + +private: + float get_osds_weight( + CephContext *cct, + const OSDMap& tmp_osd_map, + int64_t pid, + std::map& osds_weight + ) const; + float build_pool_pgs_info ( CephContext *cct, const std::set& pools, ///< [optional] restrict to pool const OSDMap& tmp_osd_map, int& total_pgs, std::map>& pgs_by_osd, - std::map& osd_weight + std::map& osds_weight ); // return total weight of all OSDs float calc_deviations ( @@ -1559,6 +1575,59 @@ bool try_drop_remap_underfull( std::random_device::result_type *p_seed ); +public: + typedef struct { + float pa_avg; + float pa_weighted; + float pa_weighted_avg; + float raw_score; + float optimal_score; // based on primary_affinity values + float adjusted_score; // based on raw_score and pa_avg 1 is optimal + float acting_raw_score; // based on active_primaries (temporary) + float acting_adj_score; // based on raw_active_score and pa_avg 1 is optimal + std::string err_msg; + } read_balance_info_t; + // + // This function calculates scores about the cluster read balance state + // p_rb_info->acting_adj_score is the current read balance score (acting) + // p_rb_info->adjusted_score is the stable read balance score + // Return value of 0 is OK, negative means an error (may happen with + // some artificially generated osdmap files) + // + int calc_read_balance_score( + CephContext *cct, + int64_t pool_id, + read_balance_info_t *p_rb_info) const; + +private: + float rbi_round(float f) const { + return (f > 0.0) ? 
floor(f * 100 + 0.5) / 100 : ceil(f * 100 - 0.5) / 100; + } + + int64_t has_zero_pa_pgs( + CephContext *cct, + int64_t pool_id) const; + + void zero_rbi( + read_balance_info_t &rbi + ) const; + + int set_rbi( + CephContext *cct, + read_balance_info_t &rbi, + int64_t pool_id, + float total_w_pa, + float pa_sum, + int num_osds, + int osd_pa_count, + float total_osd_weight, + uint max_prims_per_osd, + uint max_acting_prims_per_osd, + float avg_prims_per_osd, + bool prim_on_zero_pa, + bool acting_on_zero_pa, + float max_osd_score) const; + public: int get_osds_by_bucket_name(const std::string &name, std::set *osds) const; @@ -1627,10 +1696,10 @@ public: private: void print_osd_line(int cur, std::ostream *out, ceph::Formatter *f) const; public: - void print(std::ostream& out) const; + void print(CephContext *cct, std::ostream& out) const; void print_osd(int id, std::ostream& out) const; void print_osds(std::ostream& out) const; - void print_pools(std::ostream& out) const; + void print_pools(CephContext *cct, std::ostream& out) const; void print_summary(ceph::Formatter *f, std::ostream& out, const std::string& prefix, bool extra=false) const; void print_oneline_summary(std::ostream& out) const; @@ -1656,9 +1725,11 @@ public: static void dump_erasure_code_profiles( const mempool::osdmap::map > &profiles, ceph::Formatter *f); - void dump(ceph::Formatter *f) const; + void dump(ceph::Formatter *f, CephContext *cct = nullptr) const; void dump_osd(int id, ceph::Formatter *f) const; void dump_osds(ceph::Formatter *f) const; + void dump_pool(CephContext *cct, int64_t pid, const pg_pool_t &pdata, ceph::Formatter *f) const; + void dump_read_balance_score(CephContext *cct, int64_t pid, const pg_pool_t &pdata, ceph::Formatter *f) const; static void generate_test_instances(std::list& o); bool check_new_blocklist_entries() const { return new_blocklist_entries; } diff --git a/src/test/osd/TestOSDMap.cc b/src/test/osd/TestOSDMap.cc index 5435cefb5af..197352c1cf5 100644 --- 
a/src/test/osd/TestOSDMap.cc +++ b/src/test/osd/TestOSDMap.cc @@ -46,6 +46,7 @@ public: static const string range_addrs[]; static const string ip_addrs[]; static const string unblocked_ip_addrs[]; + const string EC_RULE_NAME = "erasure"; OSDMapTest() {} @@ -73,39 +74,57 @@ public: if (no_default_pools) // do not create any default pool(s) return; - // Create an EC rule and a pool using it - int r = osdmap.crush->add_simple_rule( - "erasure", "default", "osd", "", - "indep", pg_pool_t::TYPE_ERASURE, - &cerr); - OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1); new_pool_inc.new_pool_max = osdmap.get_pool_max(); new_pool_inc.fsid = osdmap.get_fsid(); - pg_pool_t empty; // make an ec pool + set_ec_pool("ec", new_pool_inc); + // and a replicated pool + set_rep_pool("reppool",new_pool_inc); + osdmap.apply_incremental(new_pool_inc); + } + int get_ec_crush_rule() { + int r = osdmap.crush->get_rule_id(EC_RULE_NAME); + if (r < 0) { + r = osdmap.crush->add_simple_rule( + EC_RULE_NAME, "default", "osd", "", + "indep", pg_pool_t::TYPE_ERASURE, + &cerr); + } + return r; + } + uint64_t set_ec_pool(const string &name, OSDMap::Incremental &new_pool_inc, + bool assert_pool_id = true) { + pg_pool_t empty; uint64_t pool_id = ++new_pool_inc.new_pool_max; - ceph_assert(pool_id == my_ec_pool); + if (assert_pool_id) + ceph_assert(pool_id == my_ec_pool); pg_pool_t *p = new_pool_inc.get_new_pool(pool_id, &empty); p->size = 3; p->set_pg_num(64); p->set_pgp_num(64); p->type = pg_pool_t::TYPE_ERASURE; - p->crush_rule = r; - new_pool_inc.new_pool_names[pool_id] = "ec"; - // and a replicated pool - pool_id = ++new_pool_inc.new_pool_max; - ceph_assert(pool_id == my_rep_pool); - p = new_pool_inc.get_new_pool(pool_id, &empty); + p->crush_rule = get_ec_crush_rule(); + new_pool_inc.new_pool_names[pool_id] = name;//"ec"; + return pool_id; + } + uint64_t set_rep_pool(const string name, OSDMap::Incremental &new_pool_inc, + bool assert_pool_id = true) { + pg_pool_t empty; + uint64_t pool_id = 
++new_pool_inc.new_pool_max; + if (assert_pool_id) + ceph_assert(pool_id == my_rep_pool); + pg_pool_t *p = new_pool_inc.get_new_pool(pool_id, &empty); p->size = 3; p->set_pg_num(64); p->set_pgp_num(64); p->type = pg_pool_t::TYPE_REPLICATED; p->crush_rule = 0; p->set_flag(pg_pool_t::FLAG_HASHPSPOOL); - new_pool_inc.new_pool_names[pool_id] = "reppool"; - osdmap.apply_incremental(new_pool_inc); + new_pool_inc.new_pool_names[pool_id] = name;//"reppool"; + return pool_id; } + unsigned int get_num_osds() { return num_osds; } void get_crush(const OSDMap& tmap, CrushWrapper& newcrush) { bufferlist bl; @@ -211,6 +230,17 @@ public: job.wait(); tp.stop(); } + void set_primary_affinity_all(float pa) { + for (uint i = 0 ; i < get_num_osds() ; i++) { + osdmap.set_primary_affinity(i, int(pa * CEPH_OSD_MAX_PRIMARY_AFFINITY)); + } + } + bool score_in_range(float score, uint nosds = 0) { + if (nosds == 0) { + nosds = get_num_osds(); + } + return score >= 1.0 && score <= float(nosds); + } }; TEST_F(OSDMapTest, Create) { @@ -2280,6 +2310,139 @@ TEST_F(OSDMapTest, blocklisting_everything) { } } +TEST_F(OSDMapTest, ReadBalanceScore1) { + std::srand ( unsigned ( std::time(0) ) ); + uint osd_rand = rand() % 13; + set_up_map(6 + osd_rand); //whatever + auto pools = osdmap.get_pools(); + for (auto &[pid, pg_pool] : pools) { + const pg_pool_t *pi = osdmap.get_pg_pool(pid); + if (pi->is_replicated()) { + //cout << "pool " << pid << " " << pg_pool << std::endl; + auto replica_count = pi->get_size(); + OSDMap::read_balance_info_t rbi; + auto rc = osdmap.calc_read_balance_score(g_ceph_context, pid, &rbi); + + // "Normal" score is between 1 and num_osds + ASSERT_TRUE(rc == 0); + ASSERT_TRUE(score_in_range(rbi.adjusted_score)); + ASSERT_TRUE(score_in_range(rbi.acting_adj_score)); + ASSERT_TRUE(rbi.err_msg.empty()); + + // When all OSDs have primary_affinity 0, score should be 0 + auto num_osds = get_num_osds(); + set_primary_affinity_all(0.); + + rc = osdmap.calc_read_balance_score(g_ceph_context, 
pid, &rbi); + ASSERT_TRUE(rc < 0); + ASSERT_TRUE(rbi.adjusted_score == 0.); + ASSERT_TRUE(rbi.acting_adj_score == 0.); + ASSERT_FALSE(rbi.err_msg.empty()); + + std::vector<uint> osds; + for (uint i = 0 ; i < num_osds ; i++) { + osds.push_back(i); + } + + // Change primary_affinity of some OSDs to 1 others are 0 + float fratio = 1. / (float)replica_count; + for (int iter = 0 ; iter < 100 ; iter++) { // run the test 100 times + // Create random shuffle of OSDs + std::random_shuffle (osds.begin(), osds.end()); + for (uint i = 0 ; i < num_osds ; i++) { + if ((float(i + 1) / float(num_osds)) < fratio) { + ASSERT_TRUE(osds[i] < num_osds); + osdmap.set_primary_affinity(osds[i], CEPH_OSD_MAX_PRIMARY_AFFINITY); + rc = osdmap.calc_read_balance_score(g_ceph_context, pid, &rbi); + + ASSERT_TRUE(rc < 0); + ASSERT_TRUE(rbi.adjusted_score == 0.); + ASSERT_TRUE(rbi.acting_adj_score == 0.); + ASSERT_FALSE(rbi.err_msg.empty()); + } + else { + if (rc < 0) { + ASSERT_TRUE(rbi.adjusted_score == 0.); + ASSERT_TRUE(rbi.acting_adj_score == 0.); + ASSERT_FALSE(rbi.err_msg.empty()); + } + else { + ASSERT_TRUE(score_in_range(rbi.acting_adj_score, i + 1)); + ASSERT_TRUE(rbi.err_msg.empty()); + } + } + } + set_primary_affinity_all(0.); + } + } + } + + } + +TEST_F(OSDMapTest, ReadBalanceScore2) { + std::srand ( unsigned ( std::time(0) ) ); + uint osd_num = 6 + rand() % 13; + set_up_map(osd_num, true); + for (int i = 0 ; i < 100 ; i++) { //running 100 random tests + uint num_pa_osds = 0; + float pa_sum = 0.; + OSDMap::read_balance_info_t rbi; + + // set pa for all osds + for (uint j = 0 ; j < osd_num ; j++) { + uint pa = 1 + rand() % 100; + if (pa > 80) + pa = 100; + if (pa < 20) + pa = 0; + float fpa = (float)pa / 100.; + if (pa > 0) { + num_pa_osds++; + pa_sum += fpa; + } + osdmap.set_primary_affinity(j, int(fpa * CEPH_OSD_MAX_PRIMARY_AFFINITY)); + } + float pa_ratio = pa_sum / (float) osd_num; + + // create a pool with the current osdmap configuration + OSDMap::Incremental
new_pool_inc(osdmap.get_epoch() + 1); + new_pool_inc.new_pool_max = osdmap.get_pool_max(); + new_pool_inc.fsid = osdmap.get_fsid(); + string pool_name = "rep_pool" + stringify(i); + uint64_t new_pid = set_rep_pool(pool_name, new_pool_inc, false); + ASSERT_TRUE(new_pid > 0); + osdmap.apply_incremental(new_pool_inc); + + // now run the test on the pool. + const pg_pool_t *pi = osdmap.get_pg_pool(new_pid); + ASSERT_NE(pi, nullptr); + ASSERT_TRUE(pi->is_replicated()); + float fratio = 1. / (float)pi->get_size(); + auto rc = osdmap.calc_read_balance_score(g_ceph_context, new_pid, &rbi); + if (pa_ratio < fratio) { + ASSERT_TRUE(rc < 0); + ASSERT_FALSE(rbi.err_msg.empty()); + ASSERT_TRUE(rbi.acting_adj_score == 0.); + ASSERT_TRUE(rbi.adjusted_score == 0.); + } + else { + if (rc < 0) { + ASSERT_TRUE(rbi.adjusted_score == 0.); + ASSERT_TRUE(rbi.acting_adj_score == 0.); + ASSERT_FALSE(rbi.err_msg.empty()); + } + else { + if (rbi.err_msg.empty()) { + ASSERT_TRUE(score_in_range(rbi.acting_adj_score, num_pa_osds)); + } + } + } + + } + //TODO add ReadBalanceScore3 - with weighted osds. 
+ + } + INSTANTIATE_TEST_SUITE_P( OSDMap, OSDMapTest, diff --git a/src/tools/ceph_monstore_tool.cc b/src/tools/ceph_monstore_tool.cc index 87b84386ed1..9da7f5f5c40 100644 --- a/src/tools/ceph_monstore_tool.cc +++ b/src/tools/ceph_monstore_tool.cc @@ -959,7 +959,7 @@ int main(int argc, char **argv) { } else if (map_type == "osdmap") { OSDMap osdmap; osdmap.decode(bl); - osdmap.print(ss); + osdmap.print(cct.get(), ss); } else if (map_type == "mdsmap") { FSMap fs_map; fs_map.decode(bl); diff --git a/src/tools/osdmaptool.cc b/src/tools/osdmaptool.cc index 15264645b94..bd50c0869ee 100644 --- a/src/tools/osdmaptool.cc +++ b/src/tools/osdmaptool.cc @@ -820,7 +820,7 @@ skip_upmap: print_formatter->close_section(); print_formatter->flush(cout); } else { - osdmap.print(cout); + osdmap.print(cct.get(), cout); } } diff --git a/src/vstart.sh b/src/vstart.sh index e18184ed1e8..02d9d981872 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -1335,6 +1335,7 @@ else CMONDEBUG=' debug osd = 20 debug mon = 20 + debug osd = 20 debug paxos = 20 debug auth = 20 debug mgrc = 20 -- 2.39.5