From: Kamoltat Sirivadhna Date: Tue, 9 Dec 2025 21:00:38 +0000 (+0000) Subject: mon [stretch-mode]: Allow a max bucket weight diff threshold X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=d58de5174d05ad2df1f1d6771abf504b25e62c54;p=ceph.git mon [stretch-mode]: Allow a max bucket weight diff threshold Problem: Users ran into a problem where the CRUSH bucket weight difference check in stretch mode is too strict, e.g., one of the disks that was added to one of the nodes had a slight variation in capacity, and this caused Ceph to fail to enable the stretch cluster because the CRUSH weights were not balanced. The difference was very small. Solution: - Introducing: mon_stretch_max_bucket_weight_delta in mon.yaml.in; this config option defaults to 0.1 and is used as a threshold to allow the difference between the two CRUSH buckets in stretch mode to be no greater than 10%. - Introducing: STRETCH_MODE_BUCKET_WEIGHT_IMBALANCE as a health warning when the weight delta between the two sites exceeds 10% - Modified documentation - Modified tests that exercise this code path Fixes: https://tracker.ceph.com/issues/72994 Signed-off-by: Kamoltat Sirivadhna --- diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 446297dad61..98d310f1d9c 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -18,6 +18,10 @@ the I/O capacity of the OSD. The default stride size (``osd_deep_scrub_stride``) was 512 KBytes, and is now 4 MBytes. +* RADOS: Stretch mode can now be entered even if the two dividing buckets differ + in weight by a small fraction (default 0.1). This is tunable via + `mon_stretch_max_bucket_weight_delta`. + * CephFS: The offline CephFS tools (cephfs-data-scan, cephfs-journal-tool, and cephfs-table-tool) now include progress tracking with ETA (Estimated Time of Arrival) for long-running operations. 
Progress updates are displayed automatically diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst index aece6171436..a5d9fd962b9 100644 --- a/doc/rados/operations/health-checks.rst +++ b/doc/rados/operations/health-checks.rst @@ -1702,17 +1702,21 @@ until the condition is fixed. We encourage you to fix this by removing additional dividing CRUSH buckets or by increasing the number of dividing buckets to two. For more information, see :ref:`stretch_mode`. -UNEVEN_WEIGHTS_STRETCH_MODE -___________________________ +STRETCH_MODE_BUCKET_WEIGHT_IMBALANCE +____________________________________ + +The two dividing buckets must have weights within a fractional difference +when stretch mode is enabled. This is determined by the configuration option +``mon_stretch_max_bucket_weight_delta`` (default: 0.1). -The two dividing CRUSH buckets must have equal weights when stretch mode is enabled. -This warning suggests that the two dividing buckets have uneven weights after -stretch mode is enabled. This is not immediately fatal, however, you can expect -Ceph to be confused when trying to process transitions between dividing buckets. +This is not immediately fatal, however, you can expect Ceph to experience performance bottlenecks +and imbalanced PG distribution if the aggregate CRUSH weights of the buckets differ significantly, +as the smaller bucket will carry a higher I/O load per OSD. -We encourage you to fix this by making the weights even on both dividing CRUSH buckets. +We encourage you to fix this by making the weights of the dividing buckets more even. This can be done by making sure the combined weight of the OSDs on each dividing -bucket are the same. For more information, see :ref:`stretch_mode`. +bucket are within the fractional difference defined by +``mon_stretch_max_bucket_weight_delta``. 
NONEXISTENT_MON_CRUSH_LOC_STRETCH_MODE ______________________________________ diff --git a/qa/standalone/mon-stretch/mon-stretch-uneven-crush-weights.sh b/qa/standalone/mon-stretch/mon-stretch-uneven-crush-weights.sh index 7e13f407681..518a221bdd4 100755 --- a/qa/standalone/mon-stretch/mon-stretch-uneven-crush-weights.sh +++ b/qa/standalone/mon-stretch/mon-stretch-uneven-crush-weights.sh @@ -23,7 +23,7 @@ function run() { teardown $dir || return 1 done } -TEST_stretched_cluster_uneven_weight() { +TEST_stretch_cluster_uneven_crush_weights() { local dir=$1 local OSDS=4 local weight=0.09000 @@ -130,16 +130,20 @@ EOF ceph osd crush rm sham # clear the health warn wait_for_health_gone "INCORRECT_NUM_BUCKETS_STRETCH_MODE" || return 1 - # Next, we test for uneven weights across buckets + # Next, we test for STRETCH_BUCKET_WEIGHT_IMBALANCE - ceph osd crush reweight osd.0 0.07000 + ceph osd crush reweight osd.0 0.08999 # make weights uneven below threshold + sleep 5 # sleep to allow monitor to process the weight change or health check + wait_for_health_ok || return 1 # we should not see any health warning - wait_for_health "UNEVEN_WEIGHTS_STRETCH_MODE" || return 1 + ceph osd crush reweight osd.0 0.00000 # now make the weights uneven above threshold + ceph osd crush reweight osd.1 0.00000 # now make the weights uneven above threshold + wait_for_health "STRETCH_MODE_BUCKET_WEIGHT_IMBALANCE" || return 1 # we should see the health warning - ceph osd crush reweight osd.0 $weight # clear the health warn - - wait_for_health_gone "UNEVEN_WEIGHTS_STRETCH_MODE" || return 1 + ceph osd crush reweight osd.0 $weight # make weights even again + ceph osd crush reweight osd.1 $weight # make weights even again + wait_for_health_gone "STRETCH_MODE_BUCKET_WEIGHT_IMBALANCE" || return 1 # health warning should be cleared teardown $dir || return 1 } -main mon-stretched-cluster-uneven-weight "$@" \ No newline at end of file +main mon-stretch-cluster-uneven-crush-weights "$@" \ No newline at end of 
file diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in index 9e9554b60df..34dcc3d0382 100644 --- a/src/common/options/mon.yaml.in +++ b/src/common/options/mon.yaml.in @@ -668,6 +668,14 @@ options: - mon min: 2 max: 4 +- name: mon_stretch_max_bucket_weight_delta + type: float + level: dev + desc: Max difference allowed among CRUSH bucket weights when in stretch mode. + The value is a percentage expressed as a real number between 0.0 and 1.0. + default: 0.1 + services: + - mon - name: mon_clock_drift_allowed type: float level: advanced diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index c71a66c853a..7535602f40a 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -15575,23 +15575,26 @@ void OSDMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay, return; } __u8 new_rule = static_cast<__u8>(new_crush_rule_result); - + if (bucket_count != 2) { + ss << "currently we only support 2-site stretch clusters!"; + *errcode = -EINVAL; + ceph_assert(!commit || bucket_count == 2); + return; + } + double stretch_max_weight_delta = g_conf().get_val("mon_stretch_max_bucket_weight_delta"); int weight1 = crush.get_item_weight(subtrees[0]); int weight2 = crush.get_item_weight(subtrees[1]); - if (weight1 != weight2) { - // TODO: I'm really not sure this is a good idea? 
+ bool exceeds_threshold = abs(weight1 - weight2) > + (stretch_max_weight_delta * std::min(weight1, weight2)); + if (exceeds_threshold) { ss << "the 2 " << dividing_bucket << "instances in the cluster have differing weights " << weight1 << " and " << weight2 - <<" but stretch mode currently requires they be the same!"; + << " but stretch mode currently" + <<" requires the difference to be no greater than " + << stretch_max_weight_delta * 100 << "%"; *errcode = -EINVAL; - ceph_assert(!commit || (weight1 == weight2)); - return; - } - if (bucket_count != 2) { - ss << "currently we only support 2-site stretch clusters!"; - *errcode = -EINVAL; - ceph_assert(!commit || bucket_count == 2); + ceph_assert(!commit || !exceeds_threshold); return; } // TODO: check CRUSH rules for pools so that we are appropriately divided diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 8f6a8c08b87..99b34e92907 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -7858,7 +7858,7 @@ void OSDMap::check_health(CephContext *cct, ss.str(), 0); } } - // UNEQUAL_WEIGHT + // INCORRECT_NUM_BUCKETS_STRETCH_MODE if (stretch_mode_enabled) { vector subtrees; crush->get_subtree_of_type(stretch_mode_bucket, &subtrees); @@ -7868,12 +7868,15 @@ void OSDMap::check_health(CephContext *cct, checks->add("INCORRECT_NUM_BUCKETS_STRETCH_MODE", HEALTH_WARN, ss.str(), 0); return; } + // STRETCH_MODE_BUCKET_WEIGHT_IMBALANCE int weight1 = crush->get_item_weight(subtrees[0]); int weight2 = crush->get_item_weight(subtrees[1]); + double stretch_max_weight_delta = cct->_conf.get_val("mon_stretch_max_bucket_weight_delta"); stringstream ss; - if (weight1 != weight2) { - ss << "Stretch mode buckets have different weights!"; - checks->add("UNEVEN_WEIGHTS_STRETCH_MODE", HEALTH_WARN, ss.str(), 0); + if (abs(weight1 - weight2) > + (stretch_max_weight_delta * std::min(weight1, weight2))) { + ss << "Stretch mode buckets differ in weight by more than " << (stretch_max_weight_delta * 100) << "%"; + 
checks->add("STRETCH_MODE_BUCKET_WEIGHT_IMBALANCE", HEALTH_WARN, ss.str(), 0); } }