From 10b89f96d9632f703083c0b01244de959582636a Mon Sep 17 00:00:00 2001 From: Laura Flores Date: Wed, 1 Feb 2023 23:52:17 +0000 Subject: [PATCH] tools, test/cli: add read balancer to osdmaptool This commit adds the capability to balance reads on a given osdmap with the osdmaptool. The user has the option of performing a "dry run" of read balancing OR taking it a step further and applying the results to a live cluster. Performing a "dry run" would involve simply running an osdmaptool command and inspecting the results. The template for the command is: `osdmaptool <osdmap-file> --read <out-file> --read-pool <pool-name>` An example command a user might run is: `osdmaptool om --read out.txt --read-pool default.rgw.control` This commit also adds a `--vstart` flag that allows a user to print ceph commands in the outfile formatted for a vstart cluster. An example command a user might run is: `./bin/osdmaptool om --vstart --read out.txt --read-pool default.rgw.control` The out.txt file would contain ceph commands prefixed with `./bin/`. The `--vstart` flag may also be applied to an `--upmap` osdmaptool command. If the user wants to apply read balancing results from their dry run to a live cluster, they may either manually apply the ceph commands from the out file, or run `source <out-file>`. 
Signed-off-by: Laura Flores --- src/test/cli/osdmaptool/help.t | 3 + src/tools/osdmaptool.cc | 111 ++++++++++++++++++++++++++++++--- 2 files changed, 105 insertions(+), 9 deletions(-) diff --git a/src/test/cli/osdmaptool/help.t b/src/test/cli/osdmaptool/help.t index 7ee3d0aed85..624fe9102e6 100644 --- a/src/test/cli/osdmaptool/help.t +++ b/src/test/cli/osdmaptool/help.t @@ -36,4 +36,7 @@ --test-crush [--range-first --range-last ] map pgs to acting osds --adjust-crush-weight [,,<...>] change CRUSH (but do not persist) --save write modified osdmap with upmap or crush-adjust changes + --read calculate pg upmap entries to balance pg primaries + --read-pool specify which pool the read balancer should adjust + --vstart prefix upmap and read output with './bin/' [1] diff --git a/src/tools/osdmaptool.cc b/src/tools/osdmaptool.cc index bd50c0869ee..ddaf2e65a0c 100644 --- a/src/tools/osdmaptool.cc +++ b/src/tools/osdmaptool.cc @@ -67,32 +67,49 @@ void usage() cout << " --test-crush [--range-first --range-last ] map pgs to acting osds" << std::endl; cout << " --adjust-crush-weight [,,<...>] change CRUSH (but do not persist)" << std::endl; cout << " --save write modified osdmap with upmap or crush-adjust changes" << std::endl; + cout << " --read calculate pg upmap entries to balance pg primaries" << std::endl; + cout << " --read-pool specify which pool the read balancer should adjust" << std::endl; + cout << " --vstart prefix upmap and read output with './bin/'" << std::endl; exit(1); } -void print_inc_upmaps(const OSDMap::Incremental& pending_inc, int fd) +void print_inc_upmaps(const OSDMap::Incremental& pending_inc, int fd, bool vstart, std::string cmd="ceph") { ostringstream ss; + std::string prefix = "./bin/"; for (auto& i : pending_inc.old_pg_upmap) { - ss << "ceph osd rm-pg-upmap " << i << std::endl; + if (vstart) + ss << prefix; + ss << cmd + " osd rm-pg-upmap " << i << std::endl; } for (auto& i : pending_inc.new_pg_upmap) { - ss << "ceph osd pg-upmap " << i.first; + if 
(vstart) + ss << prefix; + ss << cmd + " osd pg-upmap " << i.first; for (auto osd : i.second) { ss << " " << osd; } ss << std::endl; } for (auto& i : pending_inc.old_pg_upmap_items) { - ss << "ceph osd rm-pg-upmap-items " << i << std::endl; + if (vstart) + ss << prefix; + ss << cmd + " osd rm-pg-upmap-items " << i << std::endl; } for (auto& i : pending_inc.new_pg_upmap_items) { - ss << "ceph osd pg-upmap-items " << i.first; + if (vstart) + ss << prefix; + ss << cmd + " osd pg-upmap-items " << i.first; for (auto p : i.second) { ss << " " << p.first << " " << p.second; } ss << std::endl; } + for (auto& i : pending_inc.new_pg_upmap_primary) { + if (vstart) + ss << prefix; + ss << cmd + " osd pg-upmap-primary " << i.first << " " << i.second << std::endl; + } string s = ss.str(); int r = safe_write(fd, s.c_str(), s.size()); if (r < 0) { @@ -157,10 +174,13 @@ int main(int argc, const char **argv) std::set upmap_pools; std::random_device::result_type upmap_seed; std::random_device::result_type *upmap_p_seed = nullptr; + bool read = false; + std::string read_pool; int64_t pg_num = -1; bool test_map_pgs_dump_all = false; bool save = false; + bool vstart = false; std::string val; std::ostringstream err; @@ -186,12 +206,16 @@ int main(int argc, const char **argv) } else if (ceph_argparse_witharg(args, i, &upmap_file, "--upmap", (char*)NULL)) { upmap_cleanup = true; upmap = true; + } else if (ceph_argparse_witharg(args, i, &upmap_file, "--read", (char*)NULL)) { + read = true; } else if (ceph_argparse_witharg(args, i, &upmap_max, err, "--upmap-max", (char*)NULL)) { } else if (ceph_argparse_witharg(args, i, &upmap_deviation, err, "--upmap-deviation", (char*)NULL)) { } else if (ceph_argparse_witharg(args, i, (int *)&upmap_seed, err, "--upmap-seed", (char*)NULL)) { upmap_p_seed = &upmap_seed; } else if (ceph_argparse_witharg(args, i, &val, "--upmap-pool", (char*)NULL)) { upmap_pools.insert(val); + } else if (ceph_argparse_witharg(args, i, &val, "--read-pool", (char*)NULL)) { + 
read_pool = val; } else if (ceph_argparse_witharg(args, i, &num_osd, err, "--createsimple", (char*)NULL)) { if (!err.str().empty()) { cerr << err.str() << std::endl; @@ -266,6 +290,8 @@ int main(int argc, const char **argv) adjust_crush_weight = val; } else if (ceph_argparse_flag(args, i, "--save", (char*)NULL)) { save = true; + } else if (ceph_argparse_flag(args, i, "--vstart", (char*)NULL)) { + vstart = true; } else { ++i; } @@ -422,7 +448,7 @@ int main(int argc, const char **argv) OSDMap::clean_temps(g_ceph_context, osdmap, tmpmap, &pending_inc); } int upmap_fd = STDOUT_FILENO; - if (upmap || upmap_cleanup) { + if (upmap || upmap_cleanup || read) { if (upmap_file != "-") { upmap_fd = ::open(upmap_file.c_str(), O_CREAT|O_WRONLY|O_TRUNC, 0644); if (upmap_fd < 0) { @@ -439,11 +465,78 @@ int main(int argc, const char **argv) pending_inc.fsid = osdmap.get_fsid(); int r = osdmap.clean_pg_upmaps(g_ceph_context, &pending_inc); if (r > 0) { - print_inc_upmaps(pending_inc, upmap_fd); + print_inc_upmaps(pending_inc, upmap_fd, vstart); r = osdmap.apply_incremental(pending_inc); ceph_assert(r == 0); } } + if (read) { + int64_t pid = osdmap.lookup_pg_pool_name(read_pool); + if (pid < 0) { + cerr << " pool " << read_pool << " does not exist" << std::endl; + exit(1); + } + + const pg_pool_t* pool = osdmap.get_pg_pool(pid); + if (! pool->is_replicated()) { + cerr << read_pool << " is an erasure coded pool; " + << "please try again with a replicated pool." 
<< std::endl; + exit(1); + } + + OSDMap tmp_osd_map; + tmp_osd_map.deepish_copy_from(osdmap); + + // Gather BEFORE info + map> pgs_by_osd; + map> prim_pgs_by_osd; + map> acting_prims_by_osd; + pgs_by_osd = tmp_osd_map.get_pgs_by_osd(g_ceph_context, pid, &prim_pgs_by_osd, &acting_prims_by_osd); + OSDMap::read_balance_info_t rb_info; + tmp_osd_map.calc_read_balance_score(g_ceph_context, pid, &rb_info); + float read_balance_score_before = rb_info.adjusted_score; + ceph_assert(read_balance_score_before >= 0); + + // Calculate read balancer + OSDMap::Incremental pending_inc(osdmap.get_epoch()+1); + int num_changes = osdmap.balance_primaries(g_ceph_context, pid, &pending_inc, tmp_osd_map); + + if (num_changes < 0) { + cerr << "Error balancing primaries. Rerun with at least --debug-osd=10 for more details." << std::endl; + exit(1); + } + + // Gather AFTER info + map> pgs_by_osd_2; + map> prim_pgs_by_osd_2; + map> acting_prims_by_osd_2; + pgs_by_osd_2 = tmp_osd_map.get_pgs_by_osd(g_ceph_context, pid, &prim_pgs_by_osd_2, &acting_prims_by_osd_2); + tmp_osd_map.calc_read_balance_score(g_ceph_context, pid, &rb_info); + float read_balance_score_after = rb_info.adjusted_score; + ceph_assert(read_balance_score_after >= 0); + + if (num_changes > 0) { + cout << " \n"; + cout << "---------- BEFORE ------------ \n"; + for (auto & [osd, pgs] : prim_pgs_by_osd) { + cout << " osd." << osd << " | primary affinity: " << tmp_osd_map.get_primary_affinityf(osd) << " | number of prims: " << pgs.size() << "\n"; + } + cout << " \n"; + cout << "read_balance_score of '" << read_pool << "': " << read_balance_score_before << "\n\n\n"; + + cout << "---------- AFTER ------------ \n"; + for (auto & [osd, pgs] : prim_pgs_by_osd_2) { + cout << " osd." 
<< osd << " | primary affinity: " << tmp_osd_map.get_primary_affinityf(osd) << " | number of prims: " << pgs.size() << "\n"; + } + cout << " \n"; + cout << "read_balance_score of '" << read_pool << "': " << read_balance_score_after << "\n\n\n"; + cout << "num changes: " << num_changes << "\n"; + + print_inc_upmaps(pending_inc, upmap_fd, vstart); + } else { + cout << " Unable to find further optimization, or distribution is already perfect\n"; + } + } if (upmap) { cout << "upmap, max-count " << upmap_max << ", max deviation " << upmap_deviation @@ -513,7 +606,7 @@ int main(int argc, const char **argv) if (upmap_active) cout << "Time elapsed " << elapsed_time << " secs" << std::endl; if (total_did > 0) { - print_inc_upmaps(pending_inc, upmap_fd); + print_inc_upmaps(pending_inc, upmap_fd, vstart); if (save || upmap_active) { int r = osdmap.apply_incremental(pending_inc); ceph_assert(r == 0); @@ -798,7 +891,7 @@ skip_upmap: export_crush.empty() && import_crush.empty() && test_map_pg.empty() && test_map_object.empty() && !test_map_pgs && !test_map_pgs_dump && !test_map_pgs_dump_all && - adjust_crush_weight.empty() && !upmap && !upmap_cleanup) { + adjust_crush_weight.empty() && !upmap && !upmap_cleanup && !read) { cerr << me << ": no action specified?" << std::endl; usage(); } -- 2.39.5