From: Patrick Donnelly Date: Fri, 23 Jun 2023 21:01:00 +0000 (-0400) Subject: mds: add balance_automate fs setting X-Git-Tag: v19.3.0~363^2~6 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=47342a8b3ce1ee998ac70687468253135e29f46b;p=ceph.git mds: add balance_automate fs setting To turn off the automatic ("default") balancer in multiple MDS clusters. The new default is "off" as the balancer is a constant source of problems and surprise for administrators trying multiple actives. Instead, it should be a deliberate decision to turn it on and usually with customization like the "bal_rank_mask" setting or pinning. Fixes: https://tracker.ceph.com/issues/61378 Signed-off-by: Patrick Donnelly --- diff --git a/PendingReleaseNotes b/PendingReleaseNotes index c44851a5eafb..8e344dba79b9 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -95,6 +95,10 @@ CephFS: Disallow delegating preallocated inode ranges to clients. Config mirroring policies between RGW and AWS, you may wish to set "rgw policy reject invalid principals" to "false". This affects only newly set policies, not policies that are already in place. +* The CephFS automatic metadata load (sometimes called "default") balancer is + now disabled by default. The new file system flag `balance_automate` + can be used to toggle it on or off. It can be enabled or disabled via + `ceph fs set <fs_name> balance_automate <bool>`. * RGW's default backend for `rgw_enable_ops_log` changed from RADOS to file. The default value of `rgw_ops_log_rados` is now false, and `rgw_ops_log_file_path` defaults to "/var/log/ceph/ops-log-$cluster-$name.log". 
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index f567a26f4110..245421680265 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -292,6 +292,7 @@ struct ceph_mon_subscribe_ack { request */ #define CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS (1<<7) /* fs is forbidden to use standby for another fs */ +#define CEPH_MDSMAP_BALANCE_AUTOMATE (1<<8) /* automate metadata balancing */ #define CEPH_MDSMAP_DEFAULTS (CEPH_MDSMAP_ALLOW_SNAPS | \ CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS) diff --git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc index cd2e7d9bd002..b48c59f2606a 100644 --- a/src/mds/MDBalancer.cc +++ b/src/mds/MDBalancer.cc @@ -230,6 +230,7 @@ void MDBalancer::handle_export_pins(void) void MDBalancer::tick() { static int num_bal_times = g_conf()->mds_bal_max; + bool balance_automate = mds->mdsmap->allows_balance_automate(); auto bal_interval = g_conf().get_val("mds_bal_interval"); auto bal_max_until = g_conf().get_val("mds_bal_max_until"); time now = clock::now(); @@ -248,7 +249,8 @@ void MDBalancer::tick() // We can use duration_cast below, although the result is an int, // because the values from g_conf are also integers. // balance? 
- if (mds->get_nodeid() == 0 + if (balance_automate + && mds->get_nodeid() == 0 && mds->is_active() && bal_interval > 0 && chrono::duration_cast(now - last_heartbeat).count() >= bal_interval diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc index 47c823bf7635..699765ebe917 100644 --- a/src/mds/MDSMap.cc +++ b/src/mds/MDSMap.cc @@ -239,6 +239,7 @@ void MDSMap::dump_flags_state(Formatter *f) const f->dump_bool(flag_display.at(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY), allows_standby_replay()); f->dump_bool(flag_display.at(CEPH_MDSMAP_REFUSE_CLIENT_SESSION), test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION)); f->dump_bool(flag_display.at(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS), test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS)); + f->dump_bool(flag_display.at(CEPH_MDSMAP_BALANCE_AUTOMATE), test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE)); f->close_section(); } @@ -383,6 +384,8 @@ void MDSMap::print_flags(std::ostream& out) const { out << " " << flag_display.at(CEPH_MDSMAP_REFUSE_CLIENT_SESSION); if (test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS)) out << " " << flag_display.at(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS); + if (test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE)) + out << " " << flag_display.at(CEPH_MDSMAP_BALANCE_AUTOMATE); } void MDSMap::get_health(list >& summary, diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index 9057f05a8ced..746ae8597151 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -244,6 +244,15 @@ public: bool allows_standby_replay() const { return test_flag(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY); } bool was_standby_replay_ever_allowed() const { return ever_allowed_features & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY; } + void set_balance_automate() { + set_flag(CEPH_MDSMAP_BALANCE_AUTOMATE); + ever_allowed_features |= CEPH_MDSMAP_BALANCE_AUTOMATE; + explicitly_allowed_features |= CEPH_MDSMAP_BALANCE_AUTOMATE; + } + void clear_balance_automate() { clear_flag(CEPH_MDSMAP_BALANCE_AUTOMATE); } + bool allows_balance_automate() const { return 
test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE); } + bool was_balance_automate_ever_allowed() const { return ever_allowed_features & CEPH_MDSMAP_BALANCE_AUTOMATE; } + void set_multimds_snaps_allowed() { set_flag(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS); ever_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS; @@ -676,7 +685,8 @@ private: {CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS, "allow_multimds_snaps"}, {CEPH_MDSMAP_ALLOW_STANDBY_REPLAY, "allow_standby_replay"}, {CEPH_MDSMAP_REFUSE_CLIENT_SESSION, "refuse_client_session"}, - {CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS, "refuse_standby_for_another_fs"} + {CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS, "refuse_standby_for_another_fs"}, + {CEPH_MDSMAP_BALANCE_AUTOMATE, "balance_automate"} }; }; WRITE_CLASS_ENCODER_FEATURES(MDSMap::mds_info_t) diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc index 2faba87f73de..260619969078 100644 --- a/src/mon/FSCommands.cc +++ b/src/mon/FSCommands.cc @@ -665,6 +665,21 @@ public: } }; fsmap.modify_filesystem(fsp->get_fscid(), std::move(f)); + } else if (var == "balance_automate") { + bool allow = false; + int r = parse_bool(val, &allow, ss); + if (r != 0) { + return r; + } + + auto f = [allow](auto&& fs) { + if (allow) { + fs.get_mds_map().set_balance_automate(); + } else { + fs.get_mds_map().clear_balance_automate(); + } + }; + fsmap.modify_filesystem(fsp->get_fscid(), std::move(f)); } else if (var == "min_compat_client") { auto vno = ceph_release_from_name(val.c_str()); if (!vno) { diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 71a8bec76ac0..6f8f25e050af 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -374,11 +374,26 @@ COMMAND("fs get name=fs_name,type=CephString", "fs", "r") COMMAND("fs set " "name=fs_name,type=CephString " - "name=var,type=CephChoices,strings=max_mds|max_file_size" - "|allow_new_snaps|inline_data|cluster_down|allow_dirfrags|balancer" - "|standby_count_wanted|session_timeout|session_autoclose" - 
"|allow_standby_replay|down|joinable|min_compat_client|bal_rank_mask" - "|refuse_client_session|max_xattr_size|refuse_standby_for_another_fs " + "name=var,type=CephChoices,strings=max_mds" + "|allow_dirfrags" + "|allow_new_snaps" + "|allow_standby_replay" + "|bal_rank_mask" + "|balance_automate" + "|balancer" + "|cluster_down" + "|down" + "|inline_data" + "|joinable" + "|max_file_size" + "|max_xattr_size" + "|min_compat_client" + "|refuse_client_session" + "|refuse_standby_for_another_fs" + "|session_autoclose" + "|session_timeout" + "|standby_count_wanted" + " " "name=val,type=CephString " "name=yes_i_really_mean_it,type=CephBool,req=false " "name=yes_i_really_really_mean_it,type=CephBool,req=false",