From 47342a8b3ce1ee998ac70687468253135e29f46b Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Fri, 23 Jun 2023 17:01:00 -0400 Subject: [PATCH] mds: add balance_automate fs setting To turn off the automatic ("default") balancer in multiple MDS clusters. The new default is "off" as the balancer is a constant source of problems and surprise for administrators trying multiple actives. Instead, it should be a deliberate decision to turn it on and usually with customization like the "bal_rank_mask" setting or pinning. Fixes: https://tracker.ceph.com/issues/61378 Signed-off-by: Patrick Donnelly --- PendingReleaseNotes | 4 ++++ src/include/ceph_fs.h | 1 + src/mds/MDBalancer.cc | 4 +++- src/mds/MDSMap.cc | 3 +++ src/mds/MDSMap.h | 12 +++++++++++- src/mon/FSCommands.cc | 15 +++++++++++++++ src/mon/MonCommands.h | 25 ++++++++++++++++++++----- 7 files changed, 57 insertions(+), 7 deletions(-) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index c44851a5eafbb..8e344dba79b94 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -95,6 +95,10 @@ CephFS: Disallow delegating preallocated inode ranges to clients. Config mirroring policies between RGW and AWS, you may wish to set "rgw policy reject invalid principals" to "false". This affects only newly set policies, not policies that are already in place. +* The CephFS automatic metadata load (sometimes called "default") balancer is + now disabled by default. The new file system flag `balance_automate` + can be used to toggle it on or off. It can be enabled or disabled via + `ceph fs set <fs_name> balance_automate <bool>`. * RGW's default backend for `rgw_enable_ops_log` changed from RADOS to file. The default value of `rgw_ops_log_rados` is now false, and `rgw_ops_log_file_path` defaults to "/var/log/ceph/ops-log-$cluster-$name.log". 
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index f567a26f41106..2454216802651 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -292,6 +292,7 @@ struct ceph_mon_subscribe_ack { request */ #define CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS (1<<7) /* fs is forbidden to use standby for another fs */ +#define CEPH_MDSMAP_BALANCE_AUTOMATE (1<<8) /* automate metadata balancing */ #define CEPH_MDSMAP_DEFAULTS (CEPH_MDSMAP_ALLOW_SNAPS | \ CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS) diff --git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc index cd2e7d9bd002b..b48c59f2606ab 100644 --- a/src/mds/MDBalancer.cc +++ b/src/mds/MDBalancer.cc @@ -230,6 +230,7 @@ void MDBalancer::handle_export_pins(void) void MDBalancer::tick() { static int num_bal_times = g_conf()->mds_bal_max; + bool balance_automate = mds->mdsmap->allows_balance_automate(); auto bal_interval = g_conf().get_val("mds_bal_interval"); auto bal_max_until = g_conf().get_val("mds_bal_max_until"); time now = clock::now(); @@ -248,7 +249,8 @@ void MDBalancer::tick() // We can use duration_cast below, although the result is an int, // because the values from g_conf are also integers. // balance? 
- if (mds->get_nodeid() == 0 + if (balance_automate + && mds->get_nodeid() == 0 && mds->is_active() && bal_interval > 0 && chrono::duration_cast(now - last_heartbeat).count() >= bal_interval diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc index 47c823bf76356..699765ebe9176 100644 --- a/src/mds/MDSMap.cc +++ b/src/mds/MDSMap.cc @@ -239,6 +239,7 @@ void MDSMap::dump_flags_state(Formatter *f) const f->dump_bool(flag_display.at(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY), allows_standby_replay()); f->dump_bool(flag_display.at(CEPH_MDSMAP_REFUSE_CLIENT_SESSION), test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION)); f->dump_bool(flag_display.at(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS), test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS)); + f->dump_bool(flag_display.at(CEPH_MDSMAP_BALANCE_AUTOMATE), test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE)); f->close_section(); } @@ -383,6 +384,8 @@ void MDSMap::print_flags(std::ostream& out) const { out << " " << flag_display.at(CEPH_MDSMAP_REFUSE_CLIENT_SESSION); if (test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS)) out << " " << flag_display.at(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS); + if (test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE)) + out << " " << flag_display.at(CEPH_MDSMAP_BALANCE_AUTOMATE); } void MDSMap::get_health(list >& summary, diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index 9057f05a8cedb..746ae8597151b 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -244,6 +244,15 @@ public: bool allows_standby_replay() const { return test_flag(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY); } bool was_standby_replay_ever_allowed() const { return ever_allowed_features & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY; } + void set_balance_automate() { + set_flag(CEPH_MDSMAP_BALANCE_AUTOMATE); + ever_allowed_features |= CEPH_MDSMAP_BALANCE_AUTOMATE; + explicitly_allowed_features |= CEPH_MDSMAP_BALANCE_AUTOMATE; + } + void clear_balance_automate() { clear_flag(CEPH_MDSMAP_BALANCE_AUTOMATE); } + bool allows_balance_automate() const { return 
test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE); } + bool was_balance_automate_ever_allowed() const { return ever_allowed_features & CEPH_MDSMAP_BALANCE_AUTOMATE; } + void set_multimds_snaps_allowed() { set_flag(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS); ever_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS; @@ -676,7 +685,8 @@ private: {CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS, "allow_multimds_snaps"}, {CEPH_MDSMAP_ALLOW_STANDBY_REPLAY, "allow_standby_replay"}, {CEPH_MDSMAP_REFUSE_CLIENT_SESSION, "refuse_client_session"}, - {CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS, "refuse_standby_for_another_fs"} + {CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS, "refuse_standby_for_another_fs"}, + {CEPH_MDSMAP_BALANCE_AUTOMATE, "balance_automate"} }; }; WRITE_CLASS_ENCODER_FEATURES(MDSMap::mds_info_t) diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc index 2faba87f73de4..2606199690788 100644 --- a/src/mon/FSCommands.cc +++ b/src/mon/FSCommands.cc @@ -665,6 +665,21 @@ public: } }; fsmap.modify_filesystem(fsp->get_fscid(), std::move(f)); + } else if (var == "balance_automate") { + bool allow = false; + int r = parse_bool(val, &allow, ss); + if (r != 0) { + return r; + } + + auto f = [allow](auto&& fs) { + if (allow) { + fs.get_mds_map().set_balance_automate(); + } else { + fs.get_mds_map().clear_balance_automate(); + } + }; + fsmap.modify_filesystem(fsp->get_fscid(), std::move(f)); } else if (var == "min_compat_client") { auto vno = ceph_release_from_name(val.c_str()); if (!vno) { diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 71a8bec76ac0c..6f8f25e050af7 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -374,11 +374,26 @@ COMMAND("fs get name=fs_name,type=CephString", "fs", "r") COMMAND("fs set " "name=fs_name,type=CephString " - "name=var,type=CephChoices,strings=max_mds|max_file_size" - "|allow_new_snaps|inline_data|cluster_down|allow_dirfrags|balancer" - "|standby_count_wanted|session_timeout|session_autoclose" - 
"|allow_standby_replay|down|joinable|min_compat_client|bal_rank_mask" - "|refuse_client_session|max_xattr_size|refuse_standby_for_another_fs " + "name=var,type=CephChoices,strings=max_mds" + "|allow_dirfrags" + "|allow_new_snaps" + "|allow_standby_replay" + "|bal_rank_mask" + "|balance_automate" + "|balancer" + "|cluster_down" + "|down" + "|inline_data" + "|joinable" + "|max_file_size" + "|max_xattr_size" + "|min_compat_client" + "|refuse_client_session" + "|refuse_standby_for_another_fs" + "|session_autoclose" + "|session_timeout" + "|standby_count_wanted" + " " "name=val,type=CephString " "name=yes_i_really_mean_it,type=CephBool,req=false " "name=yes_i_really_really_mean_it,type=CephBool,req=false", -- 2.39.5