void MDBalancer::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map)
{
- if (changed.count("mds_bal_fragment_dirs"))
+ if (changed.count("mds_bal_fragment_dirs")) { // refresh the cached option only when it is named in `changed`
bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
- if (changed.count("mds_bal_fragment_interval"))
+ }
+ if (changed.count("mds_bal_fragment_interval")) { // same pattern for the fragment interval; mds_map is not consulted here
bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
+ }
+}
+
+/**
+ * Report whether @p rank is selected by the balancer rank mask held in the
+ * current MDSMap.
+ *
+ * @param rank rank to look up
+ * @return true if the bit for @p rank is set; false if it is clear or if
+ *         @p rank does not address a bit of the mask at all
+ */
+bool MDBalancer::test_rank_mask(mds_rank_t rank)
+{
+  const auto& mask = mds->mdsmap->get_bal_rank_mask_bitset();
+  // std::bitset::test() throws std::out_of_range for positions >= size(), and
+  // a negative rank would wrap around to such a position. Treat both cases as
+  // "not in the mask" instead of letting an exception escape the balancer.
+  if (rank < 0 || static_cast<size_t>(rank) >= mask.size()) {
+    return false;
+  }
+  return mask.test(static_cast<size_t>(rank));
}
void MDBalancer::handle_export_pins(void)
}
mds_import_map[who] = m->get_import_map();
+ mds->mdsmap->update_num_mdss_in_rank_mask_bitset();
+
+ if (mds->mdsmap->get_num_mdss_in_rank_mask_bitset() > 0)
{
unsigned cluster_size = mds->get_mds_map()->get_num_in_mds();
if (mds_load.size() == cluster_size) {
}
// target load
- target_load = total_load / (double)cluster_size;
+ target_load = total_load / (double)mds->mdsmap->get_num_mdss_in_rank_mask_bitset();
dout(7) << "my load " << my_load
<< " target " << target_load
<< " total " << total_load
// under or over?
for (const auto& [load, rank] : load_map) {
- if (load < target_load * (1.0 + g_conf()->mds_bal_min_rebalance)) {
+ if (test_rank_mask(rank) &&
+ load < target_load * (1.0 + g_conf()->mds_bal_min_rebalance)) {
dout(7) << " mds." << rank << " is underloaded or barely overloaded." << dendl;
mds_last_epoch_under_map[rank] = beat_epoch;
}
for (multimap<double,mds_rank_t>::iterator it = load_map.begin();
it != load_map.end();
++it) {
- if (it->first < target_load) {
+ if (it->first < target_load && test_rank_mask(it->second)) {
dout(15) << " mds." << it->second << " is importer" << dendl;
importers.insert(pair<double,mds_rank_t>(it->first,it->second));
importer_set.insert(it->second);
for (multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
ex != exporters.rend();
++ex) {
- double maxex = get_maxex(state, ex->second);
+ double ex_target_load = test_rank_mask(ex->second) ? target_load : 0.0;
+ double maxex = get_maxex(state, ex->second, ex_target_load);
if (maxex <= .001) continue;
// check importers. for now, just in arbitrary order (no intelligent matching).
for (map<mds_rank_t, float>::iterator im = mds_import_map[ex->second].begin();
im != mds_import_map[ex->second].end();
++im) {
- double maxim = get_maxim(state, im->first);
+ double maxim = get_maxim(state, im->first, target_load);
if (maxim <= .001) continue;
try_match(state, ex->second, maxex, im->first, maxim);
if (maxex <= .001) break;
multimap<double,mds_rank_t>::iterator im = importers.begin();
while (ex != exporters.rend() &&
im != importers.end()) {
- double maxex = get_maxex(state, ex->second);
- double maxim = get_maxim(state, im->second);
+ double ex_target_load = test_rank_mask(ex->second) ? target_load : 0.0;
+ double maxex = get_maxex(state, ex->second, ex_target_load);
+ double maxim = get_maxim(state, im->second, target_load);
if (maxex < .001 || maxim < .001) break;
try_match(state, ex->second, maxex, im->second, maxim);
if (maxex <= .001) ++ex;
multimap<double,mds_rank_t>::iterator im = importers.begin();
while (ex != exporters.end() &&
im != importers.end()) {
- double maxex = get_maxex(state, ex->second);
- double maxim = get_maxim(state, im->second);
+ double ex_target_load = test_rank_mask(ex->second) ? target_load : 0.0;
+ double maxex = get_maxex(state, ex->second, ex_target_load);
+ double maxim = get_maxim(state, im->second, target_load);
if (maxex < .001 || maxim < .001) break;
try_match(state, ex->second, maxex, im->second, maxim);
if (maxex <= .001) ++ex;
mds_rank_t target = it.first;
double amount = it.second;
- if (amount < MIN_OFFLOAD)
+ if (amount < MIN_OFFLOAD) {
continue;
- if (amount * 10 * state.targets.size() < target_load)
+ }
+ if (amount * 10 * state.targets.size() < target_load) {
continue;
+ }
dout(5) << "want to send " << amount << " to mds." << target
//<< " .. " << (*it).second << " * " << load_fac
mds_rank_t ex, double& maxex,
mds_rank_t im, double& maxim);
- double get_maxim(balance_state_t &state, mds_rank_t im) {
- return target_load - mds_meta_load[im] - state.imported[im];
+ double get_maxim(balance_state_t &state, mds_rank_t im, double im_target_load) { // headroom of rank `im` relative to the caller-supplied target
+ return im_target_load - mds_meta_load[im] - state.imported[im]; // callers skip the rank when this is <= .001
}
- double get_maxex(balance_state_t &state, mds_rank_t ex) {
- return mds_meta_load[ex] - target_load - state.exported[ex];
+ double get_maxex(balance_state_t &state, mds_rank_t ex, double ex_target_load) { // excess of rank `ex` above the caller-supplied target
+ return mds_meta_load[ex] - ex_target_load - state.exported[ex]; // callers pass 0.0 as the target for ranks outside the rank mask
}
/**
* export targets message again.
*/
void try_rebalance(balance_state_t& state);
+ bool test_rank_mask(mds_rank_t rank);
bool bal_fragment_dirs;
int64_t bal_fragment_interval;
f->dump_bool("enabled", enabled);
f->dump_string("fs_name", fs_name);
f->dump_string("balancer", balancer);
+ f->dump_string("bal_rank_mask", bal_rank_mask);
f->dump_int("standby_count_wanted", std::max(0, standby_count_wanted));
}
out << "metadata_pool\t" << metadata_pool << "\n";
out << "inline_data\t" << (inline_data_enabled ? "enabled" : "disabled") << "\n";
out << "balancer\t" << balancer << "\n";
+ out << "bal_rank_mask\t" << bal_rank_mask << "\n";
out << "standby_count_wanted\t" << std::max(0, standby_count_wanted) << "\n";
multimap< pair<mds_rank_t, unsigned>, mds_gid_t > foo;
encode(data_pools, bl);
encode(cas_pool, bl);
- __u16 ev = 16;
+ __u16 ev = 17;
encode(ev, bl);
encode(compat, bl);
encode(metadata_pool, bl);
encode(min_compat_client, bl);
}
encode(required_client_features, bl);
+ encode(bal_rank_mask, bl);
ENCODE_FINISH(bl);
}
}
}
+ if (ev >= 17) {
+ decode(bal_rank_mask, p);
+ }
+
/* All MDS since at least v14.0.0 understand INLINE */
/* TODO: remove after R is released */
compat.incompat.insert(MDS_FEATURE_INCOMPAT_INLINE);
std::sort(bits.begin(), bits.end());
required_client_features = feature_bitset_t(bits);
}
+
+const std::bitset<MAX_MDS>& MDSMap::get_bal_rank_mask_bitset() const { // read-only view of the parsed rank mask
+ return bal_rank_mask_bitset; // only rewritten by update_num_mdss_in_rank_mask_bitset(); set_bal_rank_mask() does not touch it
+}
+
+/**
+ * Store the textual balancer rank mask.
+ *
+ * Only the string is recorded here; the parsed bitset and the rank count are
+ * left as they are until update_num_mdss_in_rank_mask_bitset() is called.
+ *
+ * @param val new mask text, stored verbatim without validation
+ */
+void MDSMap::set_bal_rank_mask(std::string val)
+{
+  bal_rank_mask.assign(val);
+  dout(10) << "set bal_rank_mask to \"" << bal_rank_mask << "\""<< dendl;
+}
+
+/**
+ * Test whether @p val is one of the reserved (non-hex) rank mask spellings.
+ *
+ * @param val  mask text to classify
+ * @param type which reserved spellings to accept: ALL accepts "-1" and "all",
+ *             NONE accepts "0x0" and "0", ANY accepts either group
+ * @return true if @p val is a reserved spelling admitted by @p type
+ */
+const bool MDSMap::check_special_bal_rank_mask(std::string val, bal_rank_mask_type_t type) const
+{
+  const bool selects_all = (val == "-1" || val == "all");
+  const bool selects_none = (val == "0x0" || val == "0");
+
+  switch (type) {
+  case BAL_RANK_MASK_TYPE_ANY:
+    return selects_all || selects_none;
+  case BAL_RANK_MASK_TYPE_ALL:
+    return selects_all;
+  case BAL_RANK_MASK_TYPE_NONE:
+    return selects_none;
+  }
+  // Any value outside the enumerators matches nothing.
+  return false;
+}
+
+/**
+ * Re-derive bal_rank_mask_bitset and num_mdss_in_rank_mask_bitset from the
+ * textual bal_rank_mask.
+ *
+ *  - a hex mask that hex2bin() accepts: the bitset mirrors the mask and the
+ *    count is the number of set bits;
+ *  - a reserved "none" spelling: the bitset is cleared and the count is 0;
+ *  - anything else (reserved "all" spelling, empty, or rejected by hex2bin()):
+ *    every bit is set and the count is get_max_mds().
+ */
+void MDSMap::update_num_mdss_in_rank_mask_bitset()
+{
+  const bool is_explicit_mask =
+    !bal_rank_mask.empty() &&
+    !check_special_bal_rank_mask(bal_rank_mask, BAL_RANK_MASK_TYPE_ANY);
+  bool mask_applied = false;
+
+  if (is_explicit_mask) {
+    std::string bits;
+    CachedStackStringStream err;
+
+    if (hex2bin(bal_rank_mask, bits, MAX_MDS, *err) == 0) {
+      bal_rank_mask_bitset = std::bitset<MAX_MDS>(bits);
+      num_mdss_in_rank_mask_bitset = bal_rank_mask_bitset.count();
+      mask_applied = true;
+    } else {
+      // Log why the mask was rejected, then fall back below.
+      dout(10) << err->str() << dendl;
+    }
+  }
+
+  if (!mask_applied) {
+    const bool disabled =
+      check_special_bal_rank_mask(bal_rank_mask, BAL_RANK_MASK_TYPE_NONE);
+    if (disabled) {
+      dout(10) << "Balancer is disabled with bal_rank_mask " << bal_rank_mask << dendl;
+      bal_rank_mask_bitset.reset();
+      num_mdss_in_rank_mask_bitset = 0;
+    } else {
+      dout(10) << "Balancer distributes mds workloads to all ranks as bal_rank_mask is empty or invalid" << dendl;
+      bal_rank_mask_bitset.set();
+      num_mdss_in_rank_mask_bitset = get_max_mds();
+    }
+  }
+
+  dout(10) << "update num_mdss_in_rank_mask_bitset to " << num_mdss_in_rank_mask_bitset << dendl;
+}
+
+/**
+ * Expand a hexadecimal rank mask into a string of '0'/'1' characters.
+ *
+ * Each hex digit becomes exactly four bits, most significant digit first, so
+ * the numeric value of the mask is preserved for any input length. (Parsing
+ * in fixed 16-digit chunks from the left would widen a short trailing chunk
+ * to 64 bits and shift every preceding digit too far.)
+ *
+ * @param hex_string hex digits in either case, optionally prefixed by "0x"/"0X"
+ * @param bin_string output; any previous content is discarded. On success it
+ *                   is left-padded with '0' to exactly max_bits characters.
+ * @param max_bits   largest number of bits the expanded mask may occupy
+ * @param ss         receives a short reason whenever -EINVAL is returned
+ * @return 0 on success; -EINVAL if a character is not a hex digit, if the
+ *         expansion is longer than max_bits, or if no bit is set
+ */
+int MDSMap::hex2bin(std::string hex_string, std::string &bin_string, unsigned int max_bits, std::ostream& ss) const
+{
+  static const unsigned int BITS_PER_HEX_DIGIT = 4;
+  size_t offset = 0;
+
+  if (hex_string.size() >= 2 && hex_string[0] == '0' &&
+      (hex_string[1] == 'x' || hex_string[1] == 'X')) {
+    offset = 2;
+  }
+
+  // Start from a clean slate so a reused output string cannot leak stale bits.
+  bin_string.clear();
+  bin_string.reserve((hex_string.size() - offset) * BITS_PER_HEX_DIGIT);
+
+  for (size_t i = offset; i < hex_string.size(); ++i) {
+    const char c = hex_string[i];
+    unsigned int nibble;
+    if (c >= '0' && c <= '9') {
+      nibble = c - '0';
+    } else if (c >= 'a' && c <= 'f') {
+      nibble = c - 'a' + 10;
+    } else if (c >= 'A' && c <= 'F') {
+      nibble = c - 'A' + 10;
+    } else {
+      // std::stoull() would silently stop at the first bad character (and
+      // accept signs/whitespace), letting masks such as "0x1g" through.
+      ss << "invalid hex value ";
+      return -EINVAL;
+    }
+    bin_string += std::bitset<BITS_PER_HEX_DIGIT>(nibble).to_string();
+  }
+
+  if (bin_string.length() > max_bits) {
+    ss << "a value exceeds max_mds " << max_bits;
+    return -EINVAL;
+  }
+
+  // Also covers an empty digit string (e.g. a bare "0x").
+  if (bin_string.find('1') == std::string::npos) {
+    ss << "at least one rank must be set";
+    return -EINVAL;
+  }
+
+  if (bin_string.length() < max_bits) {
+    bin_string.insert(0, max_bits - bin_string.length(), '0');
+  }
+
+  return 0;
+}
const std::string get_balancer() const { return balancer; }
void set_balancer(std::string val) { balancer.assign(val); }
+ const std::bitset<MAX_MDS>& get_bal_rank_mask_bitset() const; // parsed mask as last computed by update_num_mdss_in_rank_mask_bitset()
+ void set_bal_rank_mask(std::string val); // stores the mask text only
+ unsigned get_num_mdss_in_rank_mask_bitset() const { return num_mdss_in_rank_mask_bitset; } // rank count as last computed by the update call below
+ void update_num_mdss_in_rank_mask_bitset(); // re-derives the bitset and the count from the stored mask text
+ int hex2bin(std::string hex_string, std::string &bin_string, unsigned int max_bits, std::ostream& ss) const; // returns 0, or -EINVAL with a reason written to ss
+
+ typedef enum // selects which reserved mask spellings check_special_bal_rank_mask() accepts
+ {
+ BAL_RANK_MASK_TYPE_ANY = 0, // accept either group below
+ BAL_RANK_MASK_TYPE_ALL = 1, // accept "-1" or "all"
+ BAL_RANK_MASK_TYPE_NONE = 2, // accept "0x0" or "0"
+ } bal_rank_mask_type_t;
+
+ const bool check_special_bal_rank_mask(std::string val, bal_rank_mask_type_t type) const;
+
mds_rank_t get_tableserver() const { return tableserver; }
mds_rank_t get_root() const { return root; }
mds_rank_t standby_count_wanted = -1;
std::string balancer; /* The name/version of the mantle balancer (i.e. the rados obj name) */
+ std::string bal_rank_mask = "-1"; /* Textual balancer rank mask; "-1" is one of the reserved "all ranks" spellings */
+ std::bitset<MAX_MDS> bal_rank_mask_bitset; /* Parsed form of bal_rank_mask; rewritten by update_num_mdss_in_rank_mask_bitset() */
+ uint32_t num_mdss_in_rank_mask_bitset = 0; /* Rank count from the same update; zero-initialised so the getter never reads an indeterminate value before the first update */
+
std::set<mds_rank_t> in; // currently defined cluster
// which ranks are failed, stopped, damaged (i.e. not held by a daemon)
fs->mds_map.set_balancer(val);
});
return true;
+ } else if (var == "bal_rank_mask") {
+ if (val.empty()) {
+ ss << "bal_rank_mask may not be empty";
+ return -EINVAL;
+ }
+
+ if (fs->mds_map.check_special_bal_rank_mask(val, MDSMap::BAL_RANK_MASK_TYPE_ANY) == false) {
+ std::string bin_string;
+ int r = fs->mds_map.hex2bin(val, bin_string, MAX_MDS, ss);
+ if (r != 0) {
+ return r;
+ }
+ }
+ ss << "setting the metadata balancer rank mask to " << val;
+
+ fsmap.modify_filesystem(
+ fs->fscid,
+ [val](std::shared_ptr<Filesystem> fs)
+ {
+ fs->mds_map.set_bal_rank_mask(val);
+ });
+ return true;
} else if (var == "max_file_size") {
if (interr.length()) {
ss << var << " requires an integer value";
"name=var,type=CephChoices,strings=max_mds|max_file_size"
"|allow_new_snaps|inline_data|cluster_down|allow_dirfrags|balancer"
"|standby_count_wanted|session_timeout|session_autoclose"
- "|allow_standby_replay|down|joinable|min_compat_client "
+ "|allow_standby_replay|down|joinable|min_compat_client|bal_rank_mask "
"name=val,type=CephString "
"name=yes_i_really_mean_it,type=CephBool,req=false "
"name=yes_i_really_really_mean_it,type=CephBool,req=false",