From 3bd34683e7977acb84d2921b36487be6f306ca1d Mon Sep 17 00:00:00 2001
From: Prashant D
Date: Wed, 20 Jul 2022 05:00:54 -0400
Subject: [PATCH] crush/CrushWrapper: Merge weights if rule contains multiple
 takes

If there are multiple takes in a crush rule that place a different
number of objects, we do not take that into account. This creates a
MAX AVAIL stat inconsistency for a given pool whose crush rule has
multiple takes.

Merge the weights before normalizing the weight map.

Fixes: https://tracker.ceph.com/issues/56650
Signed-off-by: Prashant D
---
 PendingReleaseNotes              |  10 +++
 qa/standalone/mon/osd-pool-df.sh | 107 +++++++++++++++++++++++++++++++
 src/crush/CrushWrapper.cc        |  12 ++--
 3 files changed, 124 insertions(+), 5 deletions(-)

diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index d25acfa9c6d77..9f03b2b35341f 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -69,6 +69,16 @@
   headers only when the `read-stats` querystring is explicitly included in the
   API request.
 
+* RADOS: The `ceph df` command reports an incorrect MAX AVAIL for stretch mode
+  pools when CRUSH rules use multiple `take` steps for datacenters.
+  PGMap::get_rule_avail incorrectly calculates the available space from only one
+  datacenter. As a workaround, define CRUSH rules with `take default` and
+  `choose firstn 0 type datacenter`. See https://tracker.ceph.com/issues/56650#note-6
+  for details. Upgrading a cluster configured with a CRUSH rule that has multiple
+  take steps can lead to data shuffling, as the new CRUSH changes may necessitate
+  data redistribution. In contrast, a stretch rule with a single-take
+  configuration will not cause any data movement during the upgrade process.
+
 >=19.2.1
 
 * CephFS: Command `fs subvolume create` now allows tagging subvolumes through option
diff --git a/qa/standalone/mon/osd-pool-df.sh b/qa/standalone/mon/osd-pool-df.sh
index d2b80ec729c3c..261cf58f4a27c 100755
--- a/qa/standalone/mon/osd-pool-df.sh
+++ b/qa/standalone/mon/osd-pool-df.sh
@@ -73,4 +73,111 @@ function TEST_ceph_df() {
     teardown $dir || return 1
 }
 
+function TEST_stretched_cluster_ceph_df() {
+    local dir=$1
+    local OSDS=12
+    local nodes=6
+    setup $dir || return 1
+
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    export CEPH_ARGS
+
+    for osd in $(seq 0 $(expr $OSDS - 1))
+    do
+        run_osd $dir $osd || return 1
+    done
+
+    for dc in DC1 DC2
+    do
+        ceph osd crush add-bucket $dc datacenter
+        ceph osd crush move $dc root=default
+    done
+
+    for node in $(seq 0 $(expr $nodes - 1))
+    do
+        ceph osd crush add-bucket osd-$node host
+    done
+
+    for p in $(seq 0 2)
+    do
+        ceph osd crush move osd-$p datacenter=DC1
+    done
+
+    for q in $(seq 3 5)
+    do
+        ceph osd crush move osd-$q datacenter=DC2
+    done
+
+    ceph osd crush move osd.0 host=osd-0
+    ceph osd crush move osd.3 host=osd-0
+    ceph osd crush move osd.1 host=osd-1
+    ceph osd crush move osd.2 host=osd-1
+    ceph osd crush move osd.7 host=osd-2
+    ceph osd crush move osd.11 host=osd-2
+    ceph osd crush move osd.4 host=osd-3
+    ceph osd crush move osd.8 host=osd-3
+    ceph osd crush move osd.5 host=osd-4
+    ceph osd crush move osd.9 host=osd-4
+    ceph osd crush move osd.6 host=osd-5
+    ceph osd crush move osd.10 host=osd-5
+
+    hostname=$(hostname -s)
+    ceph osd crush remove $hostname
+
+    ceph osd getcrushmap > $dir/crushmap || return 1
+    crushtool --decompile $dir/crushmap > $dir/crushmap.txt || return 1
+    sed 's/^# end crush map$//' $dir/crushmap.txt > $dir/crushmap_modified.txt || return 1
+    cat >> $dir/crushmap_modified.txt << EOF
+rule stretch_rule {
+        id 1
+        type replicated
+        min_size 1
+        max_size 10
+        step take DC1
+        step chooseleaf firstn 2 type host
+        step emit
+        step take DC2
+        step chooseleaf firstn 2 type host
+        step emit
+}
+
+rule stretch_replicated_rule {
+        id 2
+        type replicated
+        min_size 1
+        max_size 10
+        step take default
+        step choose firstn 0 type datacenter
+        step chooseleaf firstn 2 type host
+        step emit
+}
+
+# end crush map
+EOF
+
+    crushtool --compile $dir/crushmap_modified.txt -o $dir/crushmap.bin || return 1
+    ceph osd setcrushmap -i $dir/crushmap.bin || return 1
+
+    ceph osd crush rule ls
+
+    local stretched_poolname=stretched_rbdpool
+    local stretched_rep_poolname=stretched_replicated_rbdpool
+    ceph osd pool create $stretched_poolname 32 32 stretch_rule
+    ceph osd pool create $stretched_rep_poolname 32 32 stretch_replicated_rule
+    ceph osd pool set $stretched_poolname size 4
+    ceph osd pool set $stretched_rep_poolname size 4
+    wait_for_clean || return 1
+
+    local global_avail=`ceph df -f json | jq '.stats.total_avail_bytes'`
+    local stretched_avail=`ceph df -f json | jq '.pools | map(select(.name == "'$stretched_poolname'"))[0].stats.max_avail'`
+    local stretched_rep_avail=`ceph df -f json | jq '.pools | map(select(.name == "'$stretched_rep_poolname'"))[0].stats.max_avail'`
+
+    test ${stretched_avail} == ${stretched_rep_avail} || return 1
+
+    ceph osd pool delete $stretched_poolname $stretched_poolname --yes-i-really-really-mean-it
+    ceph osd pool delete $stretched_rep_poolname $stretched_rep_poolname --yes-i-really-really-mean-it
+    teardown $dir || return 1
+}
+
 main osd-pool-df "$@"
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index 32da5251da4f0..44fd84e400e05 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -2541,13 +2541,15 @@ int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno,
 
   // build a weight map for each TAKE in the rule, and then merge them
 
-  // FIXME: if there are multiple takes that place a different number of
-  // objects we do not take that into account. (Also, note that doing this
+  // We handle varying object counts across multiple takes.
+  // However, the ultimate object placement is subject to the
+  // pool size constraint, which might restrict the number of
+  // objects selected by the crush rule. (Also, note that doing this
   // right is also a function of the pool, since the crush rule
   // might choose 2 + choose 2 but pool size may only be 3.)
+  float sum = 0;
+  map<int,float> m;
   for (unsigned i=0; i<rule->len; ++i) {
-    map<int,float> m;
-    float sum = 0;
     if (rule->steps[i].op == CRUSH_RULE_TAKE) {
       int n = rule->steps[i].arg1;
       if (n >= 0) {
@@ -2557,8 +2559,8 @@ int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno,
         sum += _get_take_weight_osd_map(n, &m);
       }
     }
-    _normalize_weight_map(sum, m, pmap);
   }
+  _normalize_weight_map(sum, m, pmap);
 
   return 0;
 }
-- 
2.39.5
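
Reviewer note (not part of the patch): below is a minimal standalone C++ sketch of
the merge-then-normalize behaviour this change introduces in
get_rule_weight_osd_map(). The helper merge_and_normalize and the sample weights
are illustrative assumptions only; in the real code the per-take weights come from
_get_take_weight_osd_map() and the final step is _normalize_weight_map().

// Standalone sketch: accumulate the weights from every TAKE into one map and
// normalize once over the combined sum, instead of normalizing each TAKE
// separately (which is what made MAX AVAIL inconsistent for multi-take rules).
#include <iostream>
#include <map>
#include <vector>

using WeightMap = std::map<int, float>;  // OSD id -> weight (illustrative)

WeightMap merge_and_normalize(const std::vector<WeightMap>& takes) {
  WeightMap merged;
  float sum = 0;
  for (const auto& take : takes) {
    for (const auto& [osd, w] : take) {
      merged[osd] += w;   // merge weights across all takes
      sum += w;           // keep one combined sum, as in the patched loop
    }
  }
  if (sum > 0) {
    for (auto& [osd, w] : merged)
      w /= sum;           // normalize once, after the loop
  }
  return merged;
}

int main() {
  // Two takes of different total weight, e.g. "step take DC1" and "step take DC2".
  std::vector<WeightMap> takes = {
    {{0, 1.0f}, {1, 1.0f}},   // DC1: two OSDs, weight 1 each
    {{2, 2.0f}, {3, 2.0f}},   // DC2: two OSDs, weight 2 each
  };
  for (const auto& [osd, w] : merge_and_normalize(takes))
    std::cout << "osd." << osd << " -> " << w << "\n";
  // Normalizing over the merged sum (6.0) preserves the relative capacity of the
  // two datacenters, which is what keeps MAX AVAIL consistent between the
  // multi-take stretch_rule and the single-take stretch_replicated_rule in the test.
}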