From: Yongseok Oh Date: Tue, 11 Oct 2022 11:47:32 +0000 (+0900) Subject: mds: add bal_rank_mask option for ceph fs set X-Git-Tag: v18.1.0~959^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=e134c8907013f228152912980fedc7eb5dd48c23;p=ceph.git mds: add bal_rank_mask option for ceph fs set fixes: https://tracker.ceph.com/issues/52720 Signed-off-by: Yongseok Oh --- diff --git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc index 1a4100605c0..bd00db9f53f 100644 --- a/src/mds/MDBalancer.cc +++ b/src/mds/MDBalancer.cc @@ -87,10 +87,17 @@ MDBalancer::MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) : void MDBalancer::handle_conf_change(const std::set& changed, const MDSMap& mds_map) { - if (changed.count("mds_bal_fragment_dirs")) + if (changed.count("mds_bal_fragment_dirs")) { bal_fragment_dirs = g_conf().get_val("mds_bal_fragment_dirs"); - if (changed.count("mds_bal_fragment_interval")) + } + if (changed.count("mds_bal_fragment_interval")) { bal_fragment_interval = g_conf().get_val("mds_bal_fragment_interval"); + } +} + +bool MDBalancer::test_rank_mask(mds_rank_t rank) +{ + return mds->mdsmap->get_bal_rank_mask_bitset().test(rank); } void MDBalancer::handle_export_pins(void) @@ -513,6 +520,9 @@ void MDBalancer::handle_heartbeat(const cref_t &m) } mds_import_map[who] = m->get_import_map(); + mds->mdsmap->update_num_mdss_in_rank_mask_bitset(); + + if (mds->mdsmap->get_num_mdss_in_rank_mask_bitset() > 0) { unsigned cluster_size = mds->get_mds_map()->get_num_in_mds(); if (mds_load.size() == cluster_size) { @@ -735,7 +745,7 @@ void MDBalancer::prep_rebalance(int beat) } // target load - target_load = total_load / (double)cluster_size; + target_load = total_load / (double)mds->mdsmap->get_num_mdss_in_rank_mask_bitset(); dout(7) << "my load " << my_load << " target " << target_load << " total " << total_load @@ -743,7 +753,8 @@ void MDBalancer::prep_rebalance(int beat) // under or over? for (const auto& [load, rank] : load_map) { - if (load < target_load * (1.0 + g_conf()->mds_bal_min_rebalance)) { + if (test_rank_mask(rank) && + load < target_load * (1.0 + g_conf()->mds_bal_min_rebalance)) { dout(7) << " mds." << rank << " is underloaded or barely overloaded." << dendl; mds_last_epoch_under_map[rank] = beat_epoch; } @@ -772,7 +783,7 @@ void MDBalancer::prep_rebalance(int beat) for (multimap::iterator it = load_map.begin(); it != load_map.end(); ++it) { - if (it->first < target_load) { + if (it->first < target_load && test_rank_mask(it->second)) { dout(15) << " mds." << it->second << " is importer" << dendl; importers.insert(pair(it->first,it->second)); importer_set.insert(it->second); @@ -798,14 +809,15 @@ void MDBalancer::prep_rebalance(int beat) for (multimap::reverse_iterator ex = exporters.rbegin(); ex != exporters.rend(); ++ex) { - double maxex = get_maxex(state, ex->second); + double ex_target_load = test_rank_mask(ex->second) ? target_load : 0.0; + double maxex = get_maxex(state, ex->second, ex_target_load); if (maxex <= .001) continue; // check importers. for now, just in arbitrary order (no intelligent matching). for (map::iterator im = mds_import_map[ex->second].begin(); im != mds_import_map[ex->second].end(); ++im) { - double maxim = get_maxim(state, im->first); + double maxim = get_maxim(state, im->first, target_load); if (maxim <= .001) continue; try_match(state, ex->second, maxex, im->first, maxim); if (maxex <= .001) break; @@ -821,8 +833,9 @@ void MDBalancer::prep_rebalance(int beat) multimap::iterator im = importers.begin(); while (ex != exporters.rend() && im != importers.end()) { - double maxex = get_maxex(state, ex->second); - double maxim = get_maxim(state, im->second); + double ex_target_load = test_rank_mask(ex->second) ? target_load : 0.0; + double maxex = get_maxex(state, ex->second, ex_target_load); + double maxim = get_maxim(state, im->second, target_load); if (maxex < .001 || maxim < .001) break; try_match(state, ex->second, maxex, im->second, maxim); if (maxex <= .001) ++ex; @@ -835,8 +848,9 @@ void MDBalancer::prep_rebalance(int beat) multimap::iterator im = importers.begin(); while (ex != exporters.end() && im != importers.end()) { - double maxex = get_maxex(state, ex->second); - double maxim = get_maxim(state, im->second); + double ex_target_load = test_rank_mask(ex->second) ? target_load : 0.0; + double maxex = get_maxex(state, ex->second, ex_target_load); + double maxim = get_maxim(state, im->second, target_load); if (maxex < .001 || maxim < .001) break; try_match(state, ex->second, maxex, im->second, maxim); if (maxex <= .001) ++ex; @@ -941,10 +955,12 @@ void MDBalancer::try_rebalance(balance_state_t& state) mds_rank_t target = it.first; double amount = it.second; - if (amount < MIN_OFFLOAD) + if (amount < MIN_OFFLOAD) { continue; - if (amount * 10 * state.targets.size() < target_load) + } + if (amount * 10 * state.targets.size() < target_load) { continue; + } dout(5) << "want to send " << amount << " to mds." << target //<< " .. " << (*it).second << " * " << load_fac diff --git a/src/mds/MDBalancer.h b/src/mds/MDBalancer.h index 1ed0b8531bb..69a6402b17e 100644 --- a/src/mds/MDBalancer.h +++ b/src/mds/MDBalancer.h @@ -102,11 +102,11 @@ private: mds_rank_t ex, double& maxex, mds_rank_t im, double& maxim); - double get_maxim(balance_state_t &state, mds_rank_t im) { - return target_load - mds_meta_load[im] - state.imported[im]; + double get_maxim(balance_state_t &state, mds_rank_t im, double im_target_load) { + return im_target_load - mds_meta_load[im] - state.imported[im]; } - double get_maxex(balance_state_t &state, mds_rank_t ex) { - return mds_meta_load[ex] - target_load - state.exported[ex]; + double get_maxex(balance_state_t &state, mds_rank_t ex, double ex_target_load) { + return mds_meta_load[ex] - ex_target_load - state.exported[ex]; } /** @@ -117,6 +117,7 @@ private: * export targets message again. */ void try_rebalance(balance_state_t& state); + bool test_rank_mask(mds_rank_t rank); bool bal_fragment_dirs; int64_t bal_fragment_interval; diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc index 955a1e8618a..d2a0a83ecc2 100644 --- a/src/mds/MDSMap.cc +++ b/src/mds/MDSMap.cc @@ -223,6 +223,7 @@ void MDSMap::dump(Formatter *f) const f->dump_bool("enabled", enabled); f->dump_string("fs_name", fs_name); f->dump_string("balancer", balancer); + f->dump_string("bal_rank_mask", bal_rank_mask); f->dump_int("standby_count_wanted", std::max(0, standby_count_wanted)); } @@ -280,6 +281,7 @@ void MDSMap::print(ostream& out) const out << "metadata_pool\t" << metadata_pool << "\n"; out << "inline_data\t" << (inline_data_enabled ? "enabled" : "disabled") << "\n"; out << "balancer\t" << balancer << "\n"; + out << "bal_rank_mask\t" << bal_rank_mask << "\n"; out << "standby_count_wanted\t" << std::max(0, standby_count_wanted) << "\n"; multimap< pair, mds_gid_t > foo; @@ -758,7 +760,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const encode(data_pools, bl); encode(cas_pool, bl); - __u16 ev = 16; + __u16 ev = 17; encode(ev, bl); encode(compat, bl); encode(metadata_pool, bl); @@ -785,6 +787,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const encode(min_compat_client, bl); } encode(required_client_features, bl); + encode(bal_rank_mask, bl); ENCODE_FINISH(bl); } @@ -932,6 +935,10 @@ void MDSMap::decode(bufferlist::const_iterator& p) } } + if (ev >= 17) { + decode(bal_rank_mask, p); + } + /* All MDS since at least v14.0.0 understand INLINE */ /* TODO: remove after R is released */ compat.incompat.insert(MDS_FEATURE_INCOMPAT_INLINE); @@ -1169,3 +1176,99 @@ void MDSMap::set_min_compat_client(ceph_release_t version) std::sort(bits.begin(), bits.end()); required_client_features = feature_bitset_t(bits); } + +const std::bitset& MDSMap::get_bal_rank_mask_bitset() const { + return bal_rank_mask_bitset; +} + +void MDSMap::set_bal_rank_mask(std::string val) +{ + bal_rank_mask = val; + dout(10) << "set bal_rank_mask to \"" << bal_rank_mask << "\""<< dendl; +} + +const bool MDSMap::check_special_bal_rank_mask(std::string val, bal_rank_mask_type_t type) const +{ + if ((type == BAL_RANK_MASK_TYPE_ANY || type == BAL_RANK_MASK_TYPE_ALL) && (val == "-1" || val == "all")) { + return true; + } + if ((type == BAL_RANK_MASK_TYPE_ANY || type == BAL_RANK_MASK_TYPE_NONE) && (val == "0x0" || val == "0")) { + return true; + } + return false; +} + +void MDSMap::update_num_mdss_in_rank_mask_bitset() +{ + int r = -EINVAL; + + if (bal_rank_mask.length() && !check_special_bal_rank_mask(bal_rank_mask, BAL_RANK_MASK_TYPE_ANY)) { + std::string bin_string; + CachedStackStringStream css; + + r = hex2bin(bal_rank_mask, bin_string, MAX_MDS, *css); + if (r == 0) { + auto _mds_bal_mask_bitset = std::bitset(bin_string); + bal_rank_mask_bitset = _mds_bal_mask_bitset; + num_mdss_in_rank_mask_bitset = _mds_bal_mask_bitset.count(); + } else { + dout(10) << css->str() << dendl; + } + } + + if (r == -EINVAL) { + if (check_special_bal_rank_mask(bal_rank_mask, BAL_RANK_MASK_TYPE_NONE)) { + dout(10) << "Balancer is disabled with bal_rank_mask " << bal_rank_mask << dendl; + bal_rank_mask_bitset.reset(); + num_mdss_in_rank_mask_bitset = 0; + } else { + dout(10) << "Balancer distributes mds workloads to all ranks as bal_rank_mask is empty or invalid" << dendl; + bal_rank_mask_bitset.set(); + num_mdss_in_rank_mask_bitset = get_max_mds(); + } + } + + dout(10) << "update num_mdss_in_rank_mask_bitset to " << num_mdss_in_rank_mask_bitset << dendl; +} + +int MDSMap::hex2bin(std::string hex_string, std::string &bin_string, unsigned int max_bits, std::ostream& ss) const +{ + static const unsigned int BITS_PER_QUARTET = CHAR_BIT / 2; + static const unsigned int BITS_PER_ULLONG = sizeof(unsigned long long) * CHAR_BIT ; + static const unsigned int QUARTETS_PER_ULLONG = BITS_PER_ULLONG/BITS_PER_QUARTET; + unsigned int offset = 0; + + std::transform(hex_string.begin(), hex_string.end(), hex_string.begin(), ::tolower); + + if (hex_string.substr(0, 2) == "0x") { + offset = 2; + } + + for (unsigned int i = offset; i < hex_string.size(); i += QUARTETS_PER_ULLONG) { + unsigned long long value; + try { + value = stoull(hex_string.substr(i, QUARTETS_PER_ULLONG), nullptr, 16); + } catch (std::invalid_argument const& ex) { + ss << "invalid hex value "; + return -EINVAL; + } + auto bit_str = std::bitset(value); + bin_string += bit_str.to_string(); + } + + if (bin_string.length() > max_bits) { + ss << "a value exceeds max_mds " << max_bits; + return -EINVAL; + } + + if (bin_string.find('1') == std::string::npos) { + ss << "at least one rank must be set"; + return -EINVAL; + } + + if (bin_string.length() < max_bits) { + bin_string.insert(0, max_bits - bin_string.length(), '0'); + } + + return 0; +} diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index 3c630ce9c3a..0ce99bd3e15 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -275,6 +275,21 @@ public: const std::string get_balancer() const { return balancer; } void set_balancer(std::string val) { balancer.assign(val); } + const std::bitset& get_bal_rank_mask_bitset() const; + void set_bal_rank_mask(std::string val); + unsigned get_num_mdss_in_rank_mask_bitset() const { return num_mdss_in_rank_mask_bitset; } + void update_num_mdss_in_rank_mask_bitset(); + int hex2bin(std::string hex_string, std::string &bin_string, unsigned int max_bits, std::ostream& ss) const; + + typedef enum + { + BAL_RANK_MASK_TYPE_ANY = 0, + BAL_RANK_MASK_TYPE_ALL = 1, + BAL_RANK_MASK_TYPE_NONE = 2, + } bal_rank_mask_type_t; + + const bool check_special_bal_rank_mask(std::string val, bal_rank_mask_type_t type) const; + mds_rank_t get_tableserver() const { return tableserver; } mds_rank_t get_root() const { return root; } @@ -626,6 +641,10 @@ protected: mds_rank_t standby_count_wanted = -1; std::string balancer; /* The name/version of the mantle balancer (i.e. the rados obj name) */ + std::string bal_rank_mask = "-1"; + std::bitset bal_rank_mask_bitset; + uint32_t num_mdss_in_rank_mask_bitset; + std::set in; // currently defined cluster // which ranks are failed, stopped, damaged (i.e. not held by a daemon) diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc index 8e16f75cdd4..5fd1f4a21de 100644 --- a/src/mon/FSCommands.cc +++ b/src/mon/FSCommands.cc @@ -424,6 +424,28 @@ public: fs->mds_map.set_balancer(val); }); return true; + } else if (var == "bal_rank_mask") { + if (val.empty()) { + ss << "bal_rank_mask may not be empty"; + return -EINVAL; + } + + if (fs->mds_map.check_special_bal_rank_mask(val, MDSMap::BAL_RANK_MASK_TYPE_ANY) == false) { + std::string bin_string; + int r = fs->mds_map.hex2bin(val, bin_string, MAX_MDS, ss); + if (r != 0) { + return r; + } + } + ss << "setting the metadata balancer rank mask to " << val; + + fsmap.modify_filesystem( + fs->fscid, + [val](std::shared_ptr fs) + { + fs->mds_map.set_bal_rank_mask(val); + }); + return true; } else if (var == "max_file_size") { if (interr.length()) { ss << var << " requires an integer value"; diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 0ed51e9f46d..dfb301b7539 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -365,7 +365,7 @@ COMMAND("fs set " "name=var,type=CephChoices,strings=max_mds|max_file_size" "|allow_new_snaps|inline_data|cluster_down|allow_dirfrags|balancer" "|standby_count_wanted|session_timeout|session_autoclose" - "|allow_standby_replay|down|joinable|min_compat_client " + "|allow_standby_replay|down|joinable|min_compat_client|bal_rank_mask " "name=val,type=CephString " "name=yes_i_really_mean_it,type=CephBool,req=false " "name=yes_i_really_really_mean_it,type=CephBool,req=false",