]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mon [stretch-mode]: Allow a max bucket weight diff threshold 67790/head
author: Kamoltat Sirivadhna <ksirivad@redhat.com>
Tue, 9 Dec 2025 21:00:38 +0000 (21:00 +0000)
committer: Kamoltat (Junior) Sirivadhna <ksirivad@redhat.com>
Tue, 17 Mar 2026 21:36:20 +0000 (21:36 +0000)
Problem:
Users ran into a problem where the crush bucket
weight difference check in stretch mode is too strict, e.g.,
one of the disks added to one of the nodes had a slight variation
in capacity, and this caused ceph to fail to enable the stretch
cluster because the crush weights were not balanced. The difference was very small.

Solution:
- Introducing: mon_stretch_max_bucket_weight_delta in mon.yaml.in.
  This config var defaults to 0.1 and is used as a threshold
  that allows the weights of the two crush buckets in stretch mode
  to differ by no more than 10% of the smaller bucket's weight.
- Introducing: STRETCH_MODE_BUCKET_WEIGHT_IMBALANCE as a health warning
  raised when the weight delta between the two sites exceeds 10%.
- Updated the documentation.
- Modified the tests that exercise this code path.

Fixes: https://tracker.ceph.com/issues/72994
Signed-off-by: Kamoltat Sirivadhna <ksirivad@redhat.com>
(cherry picked from commit d58de5174d05ad2df1f1d6771abf504b25e62c54)

Conflicts:
doc/rados/operations/health-checks.rst - Trivial Fix
PendingReleaseNotes - Remove this
Signed-off-by: Kamoltat (Junior) Sirivadhna <ksirivad@redhat.com>
doc/rados/operations/health-checks.rst
qa/standalone/mon-stretch/mon-stretch-uneven-crush-weights.sh
src/common/options/mon.yaml.in
src/mon/OSDMonitor.cc
src/osd/OSDMap.cc

index 30a9bd64405fdfb5c590a78ed910eb8eecc63ef7..013553c7f1dfdd35a0339b3abe0b1cf89e72de5a 100644 (file)
@@ -1660,17 +1660,21 @@ until the condition is fixed.
 We encourage you to fix this by removing additional dividing buckets or bump the
 number of dividing buckets to 2.
 
-UNEVEN_WEIGHTS_STRETCH_MODE
-___________________________
+STRETCH_MODE_BUCKET_WEIGHT_IMBALANCE
+____________________________________
+
+The two dividing buckets must have weights within a fractional difference 
+when stretch mode is enabled. This is determined by the configuration option
+``mon_stretch_max_bucket_weight_delta`` (default: 0.1).
 
-The 2 dividing buckets must have equal weights when stretch mode is enabled.
-This warning suggests that the 2 dividing buckets have uneven weights after
-stretch mode is enabled. This is not immediately fatal, however, you can expect
-Ceph to be confused when trying to process transitions between dividing buckets.
+This is not immediately fatal; however, you can expect Ceph to experience performance bottlenecks
+and imbalanced PG distribution if the aggregate CRUSH weights of the buckets differ significantly,
+as the smaller bucket will carry a higher I/O load per OSD.
 
-We encourage you to fix this by making the weights even on both dividing buckets.
+We encourage you to fix this by making the weights of the dividing buckets more even.
 This can be done by making sure the combined weight of the OSDs on each dividing
-bucket are the same.
+bucket are within the fractional difference defined by
+``mon_stretch_max_bucket_weight_delta``.
 
 NONEXISTENT_MON_CRUSH_LOC_STRETCH_MODE
 ______________________________________
index 7e13f4076812f04ed27b20f7f426e8a6c82851ef..518a221bdd4be41223590d22003a6a9bc742442a 100755 (executable)
@@ -23,7 +23,7 @@ function run() {
         teardown $dir || return 1
     done
 }
-TEST_stretched_cluster_uneven_weight() {
+TEST_stretch_cluster_uneven_crush_weights() {
     local dir=$1
     local OSDS=4
     local weight=0.09000
@@ -130,16 +130,20 @@ EOF
     ceph osd crush rm sham # clear the health warn
     wait_for_health_gone "INCORRECT_NUM_BUCKETS_STRETCH_MODE" || return 1
 
-    # Next, we test for uneven weights across buckets
+    # Next, we test for STRETCH_MODE_BUCKET_WEIGHT_IMBALANCE
 
-    ceph osd crush reweight osd.0 0.07000
+    ceph osd crush reweight osd.0 0.08999 # make weights uneven below threshold
+    sleep 5 # sleep to allow monitor to process the weight change or health check
+    wait_for_health_ok || return 1 # we should not see any health warning
 
-    wait_for_health "UNEVEN_WEIGHTS_STRETCH_MODE" || return 1
+    ceph osd crush reweight osd.0 0.00000 # now make the weights uneven above threshold
+    ceph osd crush reweight osd.1 0.00000 # now make the weights uneven above threshold
+    wait_for_health "STRETCH_MODE_BUCKET_WEIGHT_IMBALANCE" || return 1 # we should see the health warning
 
-    ceph osd crush reweight osd.0 $weight # clear the health warn
-
-    wait_for_health_gone "UNEVEN_WEIGHTS_STRETCH_MODE" || return 1
+    ceph osd crush reweight osd.0 $weight # make weights even again
+    ceph osd crush reweight osd.1 $weight # make weights even again
 
+    wait_for_health_gone "STRETCH_MODE_BUCKET_WEIGHT_IMBALANCE" || return 1 # health warning should be cleared
     teardown $dir || return 1
 }
-main mon-stretched-cluster-uneven-weight "$@"
\ No newline at end of file
+main mon-stretch-cluster-uneven-crush-weights "$@"
\ No newline at end of file
index f3e3f889505cb5bf10e4eea16a9188c108fb9bb7..1e2f1d74da11f72be982de5502057995c2109188 100644 (file)
@@ -652,6 +652,14 @@ options:
   - mon
   min: 2
   max: 4
+- name: mon_stretch_max_bucket_weight_delta
+  type: float
+  level: dev
+  desc: Max difference allowed among CRUSH bucket weights when in stretch mode.
+    The value is a percentage expressed as a real number between 0.0 and 1.0.
+  default: 0.1
+  services:
+  - mon
 - name: mon_clock_drift_allowed
   type: float
   level: advanced
index 530a329930961346694996288fdda1b5ebce29ad..abb3b6251883339da36c681141e5e01a8b93a830 100644 (file)
@@ -15510,23 +15510,26 @@ void OSDMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
     return;
   }
   __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
-
+  if (bucket_count != 2) {
+    ss << "currently we only support 2-site stretch clusters!";
+    *errcode = -EINVAL;
+    ceph_assert(!commit || bucket_count == 2);
+    return;
+  }
+  double stretch_max_weight_delta = g_conf().get_val<double>("mon_stretch_max_bucket_weight_delta");
   int weight1 = crush.get_item_weight(subtrees[0]);
   int weight2 = crush.get_item_weight(subtrees[1]);
-  if (weight1 != weight2) {
-    // TODO: I'm really not sure this is a good idea?
+  bool exceeds_threshold = abs(weight1 - weight2) >
+      (stretch_max_weight_delta * std::min(weight1, weight2));
+  if (exceeds_threshold) {
     ss << "the 2 " << dividing_bucket
        << "instances in the cluster have differing weights "
        << weight1 << " and " << weight2
-       <<" but stretch mode currently requires they be the same!";
+       << " but stretch mode currently" 
+       <<" requires the difference to be no greater than "
+       << stretch_max_weight_delta * 100 << "%";
     *errcode = -EINVAL;
-    ceph_assert(!commit || (weight1 == weight2));
-    return;
-  }
-  if (bucket_count != 2) {
-    ss << "currently we only support 2-site stretch clusters!";
-    *errcode = -EINVAL;
-    ceph_assert(!commit || bucket_count == 2);
+    ceph_assert(!commit || !exceeds_threshold);
     return;
   }
   // TODO: check CRUSH rules for pools so that we are appropriately divided
index 4030507ac3f0917f920c7095e30a9ec916ef3648..efa7d77f046c1bd24be26d23e6fa6cc36fe63c63 100644 (file)
@@ -7850,7 +7850,7 @@ void OSDMap::check_health(CephContext *cct,
                            ss.str(), 0);
     }
   }
-  // UNEQUAL_WEIGHT
+  // INCORRECT_NUM_BUCKETS_STRETCH_MODE
   if (stretch_mode_enabled) {
     vector<int> subtrees;
     crush->get_subtree_of_type(stretch_mode_bucket, &subtrees);
@@ -7860,12 +7860,15 @@ void OSDMap::check_health(CephContext *cct,
       checks->add("INCORRECT_NUM_BUCKETS_STRETCH_MODE", HEALTH_WARN, ss.str(), 0);
       return;
     }
+    // STRETCH_MODE_BUCKET_WEIGHT_IMBALANCE
     int weight1 = crush->get_item_weight(subtrees[0]);
     int weight2 = crush->get_item_weight(subtrees[1]);
+    double stretch_max_weight_delta = cct->_conf.get_val<double>("mon_stretch_max_bucket_weight_delta");
     stringstream ss;
-    if (weight1 != weight2) {
-      ss << "Stretch mode buckets have different weights!";
-      checks->add("UNEVEN_WEIGHTS_STRETCH_MODE", HEALTH_WARN, ss.str(), 0);
+    if (abs(weight1 - weight2) >
+      (stretch_max_weight_delta * std::min(weight1, weight2))) {
+      ss << "Stretch mode buckets differ in weight by more than " << (stretch_max_weight_delta * 100) << "%";
+      checks->add("STRETCH_MODE_BUCKET_WEIGHT_IMBALANCE", HEALTH_WARN, ss.str(), 0);
     }
   }