git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: add balance_automate fs setting
author Patrick Donnelly <pdonnell@redhat.com>
Fri, 23 Jun 2023 21:01:00 +0000 (17:01 -0400)
committer Patrick Donnelly <pdonnell@redhat.com>
Tue, 12 Dec 2023 16:23:29 +0000 (11:23 -0500)
Add a file system flag to turn off the automatic ("default") balancer in
multi-MDS clusters. The new default is "off", as the balancer is a constant
source of problems and surprise for administrators trying multiple active
MDS daemons. Instead, turning it on should be a deliberate decision, usually
accompanied by customization such as the "bal_rank_mask" setting or pinning.

Fixes: https://tracker.ceph.com/issues/61378
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
PendingReleaseNotes
src/include/ceph_fs.h
src/mds/MDBalancer.cc
src/mds/MDSMap.cc
src/mds/MDSMap.h
src/mon/FSCommands.cc
src/mon/MonCommands.h

index c44851a5eafbbb522d1b5068f633b29829e52ca0..8e344dba79b94080422ea672f072be09d41c2ca1 100644 (file)
@@ -95,6 +95,10 @@ CephFS: Disallow delegating preallocated inode ranges to clients. Config
   mirroring policies between RGW and AWS, you may wish to set
   "rgw policy reject invalid principals" to "false". This affects only newly set
   policies, not policies that are already in place.
+* The CephFS automatic metadata load (sometimes called "default") balancer is
+  now disabled by default. The new file system flag `balance_automate`
+  can be used to toggle it on or off. It can be enabled or disabled via
+  `ceph fs set <fs_name> balance_automate <bool>`.
 * RGW's default backend for `rgw_enable_ops_log` changed from RADOS to file.
   The default value of `rgw_ops_log_rados` is now false, and `rgw_ops_log_file_path`
   defaults to "/var/log/ceph/ops-log-$cluster-$name.log".
index f567a26f411066f239f80bb70cb9d8eeaf9b1555..2454216802651b133682809aaea604174219f7f6 100644 (file)
@@ -292,6 +292,7 @@ struct ceph_mon_subscribe_ack {
                                                             request */
 #define CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS (1<<7) /* fs is forbidden to use standby
                                                             for another fs */
+#define CEPH_MDSMAP_BALANCE_AUTOMATE             (1<<8)  /* automate metadata balancing */
 #define CEPH_MDSMAP_DEFAULTS (CEPH_MDSMAP_ALLOW_SNAPS | \
                              CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS)
 
index cd2e7d9bd002b945c29a9500997050541ad1dbde..b48c59f2606ab203a09cf1a034e25294f01c6569 100644 (file)
@@ -230,6 +230,7 @@ void MDBalancer::handle_export_pins(void)
 void MDBalancer::tick()
 {
   static int num_bal_times = g_conf()->mds_bal_max;
+  bool balance_automate = mds->mdsmap->allows_balance_automate();
   auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
   auto bal_max_until = g_conf().get_val<int64_t>("mds_bal_max_until");
   time now = clock::now();
@@ -248,7 +249,8 @@ void MDBalancer::tick()
   // We can use duration_cast below, although the result is an int,
   // because the values from g_conf are also integers.
   // balance?
-  if (mds->get_nodeid() == 0
+  if (balance_automate
+      && mds->get_nodeid() == 0
       && mds->is_active()
       && bal_interval > 0
       && chrono::duration_cast<chrono::seconds>(now - last_heartbeat).count() >= bal_interval
index 47c823bf76356e233c17bf0775e74f9d4f25be7d..699765ebe917638db457bdc18f5c9b18cb9d41ed 100644 (file)
@@ -239,6 +239,7 @@ void MDSMap::dump_flags_state(Formatter *f) const
     f->dump_bool(flag_display.at(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY), allows_standby_replay());
     f->dump_bool(flag_display.at(CEPH_MDSMAP_REFUSE_CLIENT_SESSION), test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION));
     f->dump_bool(flag_display.at(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS), test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS));
+    f->dump_bool(flag_display.at(CEPH_MDSMAP_BALANCE_AUTOMATE), test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE));
     f->close_section();
 }
 
@@ -383,6 +384,8 @@ void MDSMap::print_flags(std::ostream& out) const {
     out << " " << flag_display.at(CEPH_MDSMAP_REFUSE_CLIENT_SESSION);
   if (test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS))
     out << " " << flag_display.at(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS);
+  if (test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE))
+    out << " " << flag_display.at(CEPH_MDSMAP_BALANCE_AUTOMATE);
 }
 
 void MDSMap::get_health(list<pair<health_status_t,string> >& summary,
index 9057f05a8cedbcfda8a0474c9bee8960ac958be6..746ae8597151bc90a28eae43579c69c73fa44209 100644 (file)
@@ -244,6 +244,15 @@ public:
   bool allows_standby_replay() const { return test_flag(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY); }
   bool was_standby_replay_ever_allowed() const { return ever_allowed_features & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY; }
 
+  void set_balance_automate() {
+    set_flag(CEPH_MDSMAP_BALANCE_AUTOMATE);
+    ever_allowed_features |= CEPH_MDSMAP_BALANCE_AUTOMATE;
+    explicitly_allowed_features |= CEPH_MDSMAP_BALANCE_AUTOMATE;
+  }
+  void clear_balance_automate() { clear_flag(CEPH_MDSMAP_BALANCE_AUTOMATE); }
+  bool allows_balance_automate() const { return test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE); }
+  bool was_balance_automate_ever_allowed() const { return ever_allowed_features & CEPH_MDSMAP_BALANCE_AUTOMATE; }
+
   void set_multimds_snaps_allowed() {
     set_flag(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS);
     ever_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS;
@@ -676,7 +685,8 @@ private:
     {CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS, "allow_multimds_snaps"},
     {CEPH_MDSMAP_ALLOW_STANDBY_REPLAY, "allow_standby_replay"},
     {CEPH_MDSMAP_REFUSE_CLIENT_SESSION, "refuse_client_session"},
-    {CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS, "refuse_standby_for_another_fs"}
+    {CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS, "refuse_standby_for_another_fs"},
+    {CEPH_MDSMAP_BALANCE_AUTOMATE, "balance_automate"}
   };
 };
 WRITE_CLASS_ENCODER_FEATURES(MDSMap::mds_info_t)
index 2faba87f73de40859d29d9de87f0c3949bfaadb1..2606199690788d4fa97492b2c2affa3bf68f1cf1 100644 (file)
@@ -665,6 +665,21 @@ public:
         }
       };
       fsmap.modify_filesystem(fsp->get_fscid(), std::move(f));
+    } else if (var == "balance_automate") {
+      bool allow = false;
+      int r = parse_bool(val, &allow, ss);
+      if (r != 0) {
+        return r;
+      }
+
+      auto f = [allow](auto&& fs) {
+        if (allow) {
+          fs.get_mds_map().set_balance_automate();
+        } else {
+          fs.get_mds_map().clear_balance_automate();
+        }
+      };
+      fsmap.modify_filesystem(fsp->get_fscid(), std::move(f));
     } else if (var == "min_compat_client") {
       auto vno = ceph_release_from_name(val.c_str());
       if (!vno) {
index 71a8bec76ac0c547c852f2d7d26ad48c14be842a..6f8f25e050af73f47e1e3d7aa732749579d8c856 100644 (file)
@@ -374,11 +374,26 @@ COMMAND("fs get name=fs_name,type=CephString",
        "fs", "r")
 COMMAND("fs set "
        "name=fs_name,type=CephString "
-       "name=var,type=CephChoices,strings=max_mds|max_file_size"
-        "|allow_new_snaps|inline_data|cluster_down|allow_dirfrags|balancer"
-        "|standby_count_wanted|session_timeout|session_autoclose"
-        "|allow_standby_replay|down|joinable|min_compat_client|bal_rank_mask"
-       "|refuse_client_session|max_xattr_size|refuse_standby_for_another_fs "
+       "name=var,type=CephChoices,strings=max_mds"
+          "|allow_dirfrags"
+          "|allow_new_snaps"
+          "|allow_standby_replay"
+          "|bal_rank_mask"
+          "|balance_automate"
+          "|balancer"
+          "|cluster_down"
+          "|down"
+          "|inline_data"
+          "|joinable"
+          "|max_file_size"
+          "|max_xattr_size"
+          "|min_compat_client"
+          "|refuse_client_session"
+          "|refuse_standby_for_another_fs"
+          "|session_autoclose"
+          "|session_timeout"
+          "|standby_count_wanted"
+          " "
        "name=val,type=CephString "
        "name=yes_i_really_mean_it,type=CephBool,req=false "
        "name=yes_i_really_really_mean_it,type=CephBool,req=false",