From 6761549c5a719f541a850f243318a1212b76c0a6 Mon Sep 17 00:00:00 2001 From: Sridhar Seshasayee Date: Wed, 6 May 2026 20:41:33 +0530 Subject: [PATCH] mgr/DaemonServer: Aggregate and globally sort OSDs for ok-to-upgrade The 'ok-to-upgrade' command output sorting did not scale accurately when target CRUSH buckets contained multiple child buckets (e.g., a chassis containing multiple hosts). OSDs were previously sorted individually per child bucket and appended sequentially. This created fragmented, per-host sort segments rather than a globally sorted list for the parent bucket. Changes: 1. Fix the issue above by aggregating all child OSDs into a single vector prior to executing a single, global sort operation based on PG counts. Additionally, optimize memory efficiency and future-proof the logic by reserving continuous vector blocks to avoid dynamic heap reallocations. 2. Add integration tests with chassis and rack based CRUSH hierarchies which verifies the ok-to-upgrade functionality. In addition, the tests crucially verify the order of OSDs returned is according to the ascending order of acting PG count. Additionally, make minor fix-ups to lines that determine the length of a list in JSON response by removing the redundant "| bc". Fixes: https://tracker.ceph.com/issues/77272 Signed-off-by: Sridhar Seshasayee --- qa/standalone/misc/ok-to-upgrade.sh | 436 ++++++++++++++++++++++++++-- src/mgr/DaemonServer.cc | 47 ++- 2 files changed, 423 insertions(+), 60 deletions(-) diff --git a/qa/standalone/misc/ok-to-upgrade.sh b/qa/standalone/misc/ok-to-upgrade.sh index 5be0eee49fb..2bdb5d7497b 100755 --- a/qa/standalone/misc/ok-to-upgrade.sh +++ b/qa/standalone/misc/ok-to-upgrade.sh @@ -35,6 +35,33 @@ function run() { done } +# Populate the global pg_count associative array with each OSD's acting +# PG count from 'ceph osd df'. +function snapshot_osd_pg_counts() { + declare -gA pg_count + pg_count=() + while read -r osd_id count; do + pg_count[$osd_id]=$count + done < <(ceph osd df --format=json | \ + jq -r '.nodes[] | select(.id >= 0) | "\(.id) \(.pgs)"') +} + +# Verify that the named OSD list in a JSON result is ordered in non-decreasing +# acting PG count. Uses the global pg_count array populated by +# snapshot_osd_pg_counts(). +# Usage: assert_osds_sorted_by_pg_count +function assert_osds_sorted_by_pg_count() { + local field=$1 + local json=$2 + local prev_count=-1 + local osd_id count + for osd_id in $(echo "$json" | jq -r ".${field}[]"); do + count=${pg_count[$osd_id]} + test "$count" -ge "$prev_count" || return 1 + prev_count=$count + done +} + function TEST_ok_to_upgrade_invalid_args() { local dir=$1 CEPH_ARGS="$ORIG_CEPH_ARGS --mon-host=$CEPH_MON " @@ -103,9 +130,9 @@ function TEST_ok_to_upgrade_replicated_pool() { # and verifies that at least the expected minimum OSDs are returned. test $(echo $res | jq '.all_osds_upgraded') = false || return 1 test $(echo $res | jq '.ok_to_upgrade') = true || return 1 - local num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc) + local num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') test $num_osds_upgradable -ge $exp_osds_upgradable || return 1 - local num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc) + local num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length') test $num_osds_upgraded -eq 0 || return 1 # Test the same command as above, but exercise the 'max' parameter. @@ -116,9 +143,9 @@ function TEST_ok_to_upgrade_replicated_pool() { res=$(ceph osd ok-to-upgrade $crush_bucket $ceph_version $max --format=json) test $(echo $res | jq '.all_osds_upgraded') = false || return 1 test $(echo $res | jq '.ok_to_upgrade') = true || return 1 - num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc) + num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') test $num_osds_upgradable -eq $exp_osds_upgradable || return 1 - num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc) + num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length') test $num_osds_upgraded -eq 0 || return 1 # Test same command above with verbose syntax @@ -126,9 +153,9 @@ function TEST_ok_to_upgrade_replicated_pool() { --ceph_version $ceph_version --max $max --format=json) test $(echo $res | jq '.all_osds_upgraded') = false || return 1 test $(echo $res | jq '.ok_to_upgrade') = true || return 1 - num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc) + num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') test $num_osds_upgradable -eq $exp_osds_upgradable || return 1 - num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc) + num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length') test $num_osds_upgraded -eq 0 || return 1 # Test for upgradability with min_size=1 and 1 OSD to upgrade. The outcome @@ -139,9 +166,9 @@ function TEST_ok_to_upgrade_replicated_pool() { res=$(ceph osd ok-to-upgrade $crush_bucket $ceph_version --format=json) test $(echo $res | jq '.all_osds_upgraded') = false || return 1 test $(echo $res | jq '.ok_to_upgrade') = true || return 1 - num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc) + num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') test $exp_osds_upgradable = $num_osds_upgradable || return 1 - num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc) + num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length') test $num_osds_upgraded -eq 0 || return 1 # Test for upgradability with min_size=2 @@ -154,9 +181,9 @@ function TEST_ok_to_upgrade_replicated_pool() { # 3 OSDs should be reported as upgradable. test $(echo $res | jq '.all_osds_upgraded') = false || return 1 test $(echo $res | jq '.ok_to_upgrade') = true || return 1 - num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc) + num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') test $num_osds_upgradable -ge $exp_osds_upgradable || return 1 - num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc) + num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length') test $num_osds_upgraded -eq 0 || return 1 # Test for upgradability with min_size=3 @@ -168,9 +195,9 @@ function TEST_ok_to_upgrade_replicated_pool() { # No OSD should be reported as upgradable. test $(echo $res | jq '.all_osds_upgraded') = false || return 1 test $(echo $res | jq '.ok_to_upgrade') = false || return 1 - num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc) + num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') test $exp_osds_upgradable = $num_osds_upgradable || return 1 - num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc) + num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length') test $num_osds_upgraded -eq 0 || return 1 # Test for condition when all OSDs are running desired version. @@ -179,9 +206,9 @@ function TEST_ok_to_upgrade_replicated_pool() { res=$(ceph osd ok-to-upgrade $crush_bucket $upgrade_version --format=json) test $(echo $res | jq '.all_osds_upgraded') = true || return 1 test $(echo $res | jq '.ok_to_upgrade') = false || return 1 - num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc) + num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') test $num_osds_upgradable -eq 0 || return 1 - num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc) + num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length') test $num_osds_upgraded -eq $OSDS || return 1 } @@ -217,9 +244,9 @@ function TEST_ok_to_upgrade_erasure_pool() { # in 3 OSDs being reported as upgradable. test $(echo $res | jq '.all_osds_upgraded') = false || return 1 test $(echo $res | jq '.ok_to_upgrade') = true || return 1 - local num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc) + local num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') test $exp_osds_upgradable = $num_osds_upgradable || return 1 - local num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc) + local num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length') test $num_osds_upgraded -eq 0 || return 1 # Test the same command as above, but exercise the 'max' parameter. @@ -230,9 +257,9 @@ function TEST_ok_to_upgrade_erasure_pool() { res=$(ceph osd ok-to-upgrade $crush_bucket $ceph_version $max --format=json) test $(echo $res | jq '.all_osds_upgraded') = false || return 1 test $(echo $res | jq '.ok_to_upgrade') = true || return 1 - num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc) + num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') test $num_osds_upgradable -eq $exp_osds_upgradable || return 1 - num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc) + num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length') test $num_osds_upgraded -eq 0 || return 1 # Test command above with verbose syntax @@ -240,9 +267,9 @@ function TEST_ok_to_upgrade_erasure_pool() { --ceph_version $ceph_version --max $max --format=json) test $(echo $res | jq '.all_osds_upgraded') = false || return 1 test $(echo $res | jq '.ok_to_upgrade') = true || return 1 - num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc) + num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') test $num_osds_upgradable -eq $exp_osds_upgradable || return 1 - num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc) + num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length') test $num_osds_upgraded -eq 0 || return 1 # Test for upgradability with min_size=5 and 1 OSD to upgrade. The outcome @@ -253,9 +280,9 @@ function TEST_ok_to_upgrade_erasure_pool() { res=$(ceph osd ok-to-upgrade $crush_bucket $ceph_version --format=json) test $(echo $res | jq '.all_osds_upgraded') = false || return 1 test $(echo $res | jq '.ok_to_upgrade') = true || return 1 - num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc) + num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') test $exp_osds_upgradable = $num_osds_upgradable || return 1 - num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc) + num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length') test $num_osds_upgraded -eq 0 || return 1 # Test for upgradability with min_size=6 @@ -268,9 +295,9 @@ function TEST_ok_to_upgrade_erasure_pool() { # 2 OSDs should be reported as upgradable. test $(echo $res | jq '.all_osds_upgraded') = false || return 1 test $(echo $res | jq '.ok_to_upgrade') = true || return 1 - num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc) + num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') test $exp_osds_upgradable = $num_osds_upgradable || return 1 - num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc) + num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length') test $num_osds_upgraded -eq 0 || return 1 # Test for upgradability with min_size=8 @@ -282,9 +309,9 @@ function TEST_ok_to_upgrade_erasure_pool() { # No OSD should be reported as upgradable. test $(echo $res | jq '.all_osds_upgraded') = false || return 1 test $(echo $res | jq '.ok_to_upgrade') = false || return 1 - num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc) + num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') test $exp_osds_upgradable = $num_osds_upgradable || return 1 - num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc) + num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length') test $num_osds_upgraded -eq 0 || return 1 # Test for condition when all OSDs are running desired version. @@ -293,9 +320,9 @@ function TEST_ok_to_upgrade_erasure_pool() { res=$(ceph osd ok-to-upgrade $crush_bucket $ceph_version --format=json) test $(echo $res | jq '.all_osds_upgraded') = true || return 1 test $(echo $res | jq '.ok_to_upgrade') = false || return 1 - num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc) + num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') test $num_osds_upgradable -eq 0 || return 1 - num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc) + num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length') test $num_osds_upgraded -eq $OSDS || return 1 } @@ -331,11 +358,360 @@ function TEST_ok_to_upgrade_bad_osd_version() { local res=$(ceph osd ok-to-upgrade $crush_bucket $ceph_version --format=json) test $(echo $res | jq '.all_osds_upgraded') = false || return 1 test $(echo $res | jq '.ok_to_upgrade') = false || return 1 - local num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc) + local num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') test $exp_osds_upgradable = $num_osds_upgradable || return 1 - local num_osds_bad_version=$(echo $res | jq '.bad_no_version | length' | bc) + local num_osds_bad_version=$(echo $res | jq '.bad_no_version | length') test $num_osds_bad_version -eq 3 || return 1 } +function TEST_ok_to_upgrade_chassis_bucket() { + local dir=$1 + local poolname="test" + local OSDS=4 + local ceph_version="01.2.3-1234-g1234deed" + + CEPH_ARGS="$ORIG_CEPH_ARGS --mon-host=$CEPH_MON " + + run_mon $dir a --public-addr=$CEPH_MON || return 1 + run_mgr $dir x || return 1 + + for osd in $(seq 0 $(expr $OSDS - 1)); do + run_osd $dir $osd --osd-mclock-skip-benchmark=true || return 1 + done + + # Use size=2 so the pool can be satisfied with 2 distinct host buckets. + # min_size=1 allows upgrading all OSDs on one host simultaneously. + create_pool $poolname 8 8 + ceph osd pool set $poolname pg_autoscale_mode off + ceph osd pool set $poolname size 2 + ceph osd pool set $poolname min_size 1 + sleep 5 + wait_for_clean || return 1 + + # Build a chassis CRUSH hierarchy: + # root "default" + # chassis "chassis-0" + # host "host-0-0" -> osd.0, osd.1 + # host "host-0-1" -> osd.2, osd.3 + # + # OSDs start in auto-generated host buckets; crush set replaces each + # OSD's location with the new custom hierarchy. + ceph osd crush add-bucket chassis-0 chassis || return 1 + ceph osd crush add-bucket host-0-0 host || return 1 + ceph osd crush add-bucket host-0-1 host || return 1 + ceph osd crush move host-0-0 chassis=chassis-0 || return 1 + ceph osd crush move host-0-1 chassis=chassis-0 || return 1 + ceph osd crush move chassis-0 root=default || return 1 + ceph osd crush set 0 1.0 root=default chassis=chassis-0 host=host-0-0 || return 1 + ceph osd crush set 1 1.0 root=default chassis=chassis-0 host=host-0-0 || return 1 + ceph osd crush set 2 1.0 root=default chassis=chassis-0 host=host-0-1 || return 1 + ceph osd crush set 3 1.0 root=default chassis=chassis-0 host=host-0-1 || return 1 + sleep 5 + wait_for_clean || return 1 + + # Snapshot acting PG counts per OSD. In a clean cluster 'pgs' in + # osd df equals the acting count the mgr's PGMap records internally. + snapshot_osd_pg_counts + + # With min_size=1 some OSDs inside chassis-0 must be reported upgradable. + local res=$(ceph osd ok-to-upgrade chassis-0 $ceph_version --format=json) + test $(echo $res | jq '.all_osds_upgraded') = false || return 1 + test $(echo $res | jq '.ok_to_upgrade') = true || return 1 + local num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') + test $num_osds_upgradable -ge 1 || return 1 + local num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length') + test $num_osds_upgraded -eq 0 || return 1 + + # Verify that osds_in_crush_bucket lists all chassis OSDs in non-decreasing + # order of acting PG count. This directly tests the global sort inside + # _populate_crush_bucket_osds(): consecutive entries must never decrease. + assert_osds_sorted_by_pg_count osds_in_crush_bucket "$res" || return 1 + + # With max=1, exactly one OSD is returned. + local max=1 + res=$(ceph osd ok-to-upgrade chassis-0 $ceph_version $max --format=json) + test $(echo $res | jq '.ok_to_upgrade') = true || return 1 + num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') + test $num_osds_upgradable -eq 1 || return 1 + + # With min_size=2 on a size=2 pool, taking any OSD offline drops a PG + # below the minimum replica count -- no OSD should be upgradable. + ceph osd pool set $poolname min_size 2 + sleep 5 + wait_for_clean || return 1 + res=$(ceph osd ok-to-upgrade chassis-0 $ceph_version --format=json) + test $(echo $res | jq '.all_osds_upgraded') = false || return 1 + test $(echo $res | jq '.ok_to_upgrade') = false || return 1 + num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') + test $num_osds_upgradable -eq 0 || return 1 +} + +function TEST_ok_to_upgrade_rack_bucket() { + local dir=$1 + local poolname="test" + local OSDS=4 + local ceph_version="01.2.3-1234-g1234deed" + + CEPH_ARGS="$ORIG_CEPH_ARGS --mon-host=$CEPH_MON " + + run_mon $dir a --public-addr=$CEPH_MON || return 1 + run_mgr $dir x || return 1 + + for osd in $(seq 0 $(expr $OSDS - 1)); do + run_osd $dir $osd --osd-mclock-skip-benchmark=true || return 1 + done + + # Use size=2 so the pool can be satisfied with 2 distinct host buckets. + # min_size=1 allows upgrading all OSDs on one host simultaneously. + create_pool $poolname 8 8 + ceph osd pool set $poolname pg_autoscale_mode off + ceph osd pool set $poolname size 2 + ceph osd pool set $poolname min_size 1 + sleep 5 + wait_for_clean || return 1 + + # Build a rack CRUSH hierarchy: + # root "default" + # rack "rack-0" + # host "rack-host-0" -> osd.0, osd.1 + # host "rack-host-1" -> osd.2, osd.3 + ceph osd crush add-bucket rack-0 rack || return 1 + ceph osd crush add-bucket rack-host-0 host || return 1 + ceph osd crush add-bucket rack-host-1 host || return 1 + ceph osd crush move rack-host-0 rack=rack-0 || return 1 + ceph osd crush move rack-host-1 rack=rack-0 || return 1 + ceph osd crush move rack-0 root=default || return 1 + ceph osd crush set 0 1.0 root=default rack=rack-0 host=rack-host-0 || return 1 + ceph osd crush set 1 1.0 root=default rack=rack-0 host=rack-host-0 || return 1 + ceph osd crush set 2 1.0 root=default rack=rack-0 host=rack-host-1 || return 1 + ceph osd crush set 3 1.0 root=default rack=rack-0 host=rack-host-1 || return 1 + sleep 5 + wait_for_clean || return 1 + + # Snapshot acting PG counts per OSD. In a clean cluster 'pgs' in + # osd df equals the acting count the mgr's PGMap records internally. + snapshot_osd_pg_counts + + # With min_size=1 some OSDs inside rack-0 must be reported upgradable. + local res=$(ceph osd ok-to-upgrade rack-0 $ceph_version --format=json) + test $(echo $res | jq '.all_osds_upgraded') = false || return 1 + test $(echo $res | jq '.ok_to_upgrade') = true || return 1 + local num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') + test $num_osds_upgradable -ge 1 || return 1 + local num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length') + test $num_osds_upgraded -eq 0 || return 1 + + # Verify that osds_in_crush_bucket lists all rack OSDs in non-decreasing + # order of acting PG count. This directly tests the global sort inside + # _populate_crush_bucket_osds(): consecutive entries must never decrease. + assert_osds_sorted_by_pg_count osds_in_crush_bucket "$res" || return 1 + + # With max=1, exactly one OSD is returned. + local max=1 + res=$(ceph osd ok-to-upgrade rack-0 $ceph_version $max --format=json) + test $(echo $res | jq '.ok_to_upgrade') = true || return 1 + num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') + test $num_osds_upgradable -eq 1 || return 1 + + # With min_size=2 on a size=2 pool, no OSD can go offline without + # violating the pool's minimum replica count. + ceph osd pool set $poolname min_size 2 + sleep 5 + wait_for_clean || return 1 + res=$(ceph osd ok-to-upgrade rack-0 $ceph_version --format=json) + test $(echo $res | jq '.all_osds_upgraded') = false || return 1 + test $(echo $res | jq '.ok_to_upgrade') = false || return 1 + num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') + test $num_osds_upgradable -eq 0 || return 1 +} + +function TEST_ok_to_upgrade_rack_chassis_bucket() { + local dir=$1 + local poolname="test" + local OSDS=4 + local ceph_version="01.2.3-1234-g1234deed" + + CEPH_ARGS="$ORIG_CEPH_ARGS --mon-host=$CEPH_MON " + + run_mon $dir a --public-addr=$CEPH_MON || return 1 + run_mgr $dir x || return 1 + + for osd in $(seq 0 $(expr $OSDS - 1)); do + run_osd $dir $osd --osd-mclock-skip-benchmark=true || return 1 + done + + # Use size=2 so the pool can be satisfied with 2 distinct host buckets. + # min_size=1 allows upgrading all OSDs on one host simultaneously. + create_pool $poolname 8 8 + ceph osd pool set $poolname pg_autoscale_mode off + ceph osd pool set $poolname size 2 + ceph osd pool set $poolname min_size 1 + sleep 5 + wait_for_clean || return 1 + + # Build a rack CRUSH hierarchy: + # root "default" + # rack "rack-0" + # chassis "chassis-0" + # host "chassis-host-0" -> osd.0, osd.1 + # host "chassis-host-1" -> osd.2, osd.3 + ceph osd crush add-bucket rack-0 rack || return 1 + ceph osd crush add-bucket chassis-0 chassis || return 1 + ceph osd crush add-bucket chassis-host-0 host || return 1 + ceph osd crush add-bucket chassis-host-1 host || return 1 + ceph osd crush move chassis-host-0 chassis=chassis-0 || return 1 + ceph osd crush move chassis-host-1 chassis=chassis-0 || return 1 + ceph osd crush move chassis-0 rack=rack-0 || return 1 + ceph osd crush move rack-0 root=default || return 1 + ceph osd crush set 0 1.0 root=default rack=rack-0 host=chassis-host-0 || return 1 + ceph osd crush set 1 1.0 root=default rack=rack-0 host=chassis-host-0 || return 1 + ceph osd crush set 2 1.0 root=default rack=rack-0 host=chassis-host-1 || return 1 + ceph osd crush set 3 1.0 root=default rack=rack-0 host=chassis-host-1 || return 1 + sleep 5 + wait_for_clean || return 1 + + # Snapshot acting PG counts per OSD. In a clean cluster 'pgs' in + # osd df equals the acting count the mgr's PGMap records internally. + snapshot_osd_pg_counts + + # With min_size=1 some OSDs inside chassis-0 must be reported upgradable. + local res=$(ceph osd ok-to-upgrade chassis-0 $ceph_version --format=json) + test $(echo $res | jq '.all_osds_upgraded') = false || return 1 + test $(echo $res | jq '.ok_to_upgrade') = true || return 1 + local num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') + test $num_osds_upgradable -ge 1 || return 1 + local num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length') + test $num_osds_upgraded -eq 0 || return 1 + + # Verify that osds_in_crush_bucket lists all chassis OSDs in non-decreasing + # order of acting PG count. This directly tests the global sort inside + # _populate_crush_bucket_osds(): consecutive entries must never decrease. + assert_osds_sorted_by_pg_count osds_in_crush_bucket "$res" || return 1 + + # With max=1, exactly one OSD is returned. + local max=1 + res=$(ceph osd ok-to-upgrade chassis-0 $ceph_version $max --format=json) + test $(echo $res | jq '.ok_to_upgrade') = true || return 1 + num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') + test $num_osds_upgradable -eq 1 || return 1 + + # With min_size=2 on a size=2 pool, no OSD can go offline without + # violating the pool's minimum replica count. + ceph osd pool set $poolname min_size 2 + sleep 5 + wait_for_clean || return 1 + res=$(ceph osd ok-to-upgrade chassis-0 $ceph_version --format=json) + test $(echo $res | jq '.all_osds_upgraded') = false || return 1 + test $(echo $res | jq '.ok_to_upgrade') = false || return 1 + num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') + test $num_osds_upgradable -eq 0 || return 1 +} + +# Verify that ok-to-upgrade returns OSDs in ascending order of acting PG +# count globally across the entire chassis (not per-host). +# +# _populate_crush_bucket_osds collects OSDs from every child host bucket, +# then sorts them once globally by (num_pgs, osd_id) ascending before +# returning to _maximize_ok_to_upgrade_set. The convergence algorithm +# prunes from the tail (osds.resize), preserving this global order in the +# final osds_ok_to_upgrade output. +# +# Test design +# ----------- +# 6 OSDs in a chassis (3 per host), pool with 7 PGs and size=2: CRUSH +# maps each PG to one OSD per host. 7 % 3 != 0 produces an unequal +# per-OSD acting count across the chassis, making the global ordering +# non-trivial to predict without actually measuring it. Using 7 (rather +# than a larger number such as 16) widens the relative spread between +# OSD PG counts, reducing ties and making the ordering invariant more +# discriminating. +# +# The test snapshots the acting PG count per OSD from 'ceph osd df', then +# checks two invariants: +# 1. The returned list is non-decreasing by acting PG count (global sort). +# 2. The first returned OSD holds the minimum acting PG count among all +# OSDs in chassis-0, confirming that the globally-least-loaded OSD +# is always offered for upgrade first. +function TEST_ok_to_upgrade_osd_order_by_pg_count() { + local dir=$1 + local poolname="test" + local OSDS=6 + local ALL_CHASSIS_OSDS="0 1 2 3 4 5" + local ceph_version="01.2.3-1234-g1234deed" + + CEPH_ARGS="$ORIG_CEPH_ARGS --mon-host=$CEPH_MON " + + run_mon $dir a --public-addr=$CEPH_MON || return 1 + run_mgr $dir x || return 1 + + for osd in $(seq 0 $(expr $OSDS - 1)); do + run_osd $dir $osd --osd-mclock-skip-benchmark=true || return 1 + done + + # 7 PGs, size=2, min_size=1. 7 % 3 != 0 guarantees an unequal + # per-OSD acting count within the chassis. + create_pool $poolname 7 7 + ceph osd pool set $poolname pg_autoscale_mode off + ceph osd pool set $poolname size 2 + ceph osd pool set $poolname min_size 1 + sleep 5 + wait_for_clean || return 1 + + # Build a chassis hierarchy: + # root "default" + # chassis "chassis-0" + # host "host-0-0" -> osd.0, osd.1, osd.2 + # host "host-0-1" -> osd.3, osd.4, osd.5 + ceph osd crush add-bucket chassis-0 chassis || return 1 + ceph osd crush add-bucket host-0-0 host || return 1 + ceph osd crush add-bucket host-0-1 host || return 1 + ceph osd crush move host-0-0 chassis=chassis-0 || return 1 + ceph osd crush move host-0-1 chassis=chassis-0 || return 1 + ceph osd crush move chassis-0 root=default || return 1 + ceph osd crush set 0 1.0 root=default chassis=chassis-0 host=host-0-0 || return 1 + ceph osd crush set 1 1.0 root=default chassis=chassis-0 host=host-0-0 || return 1 + ceph osd crush set 2 1.0 root=default chassis=chassis-0 host=host-0-0 || return 1 + ceph osd crush set 3 1.0 root=default chassis=chassis-0 host=host-0-1 || return 1 + ceph osd crush set 4 1.0 root=default chassis=chassis-0 host=host-0-1 || return 1 + ceph osd crush set 5 1.0 root=default chassis=chassis-0 host=host-0-1 || return 1 + sleep 5 + wait_for_clean || return 1 + + # Snapshot acting PG counts per OSD. In a clean cluster 'pgs' in + # osd df equals the acting count the mgr's PGMap records internally. + snapshot_osd_pg_counts + + local res=$(ceph osd ok-to-upgrade chassis-0 $ceph_version --format=json) + test $(echo $res | jq '.all_osds_upgraded') = false || return 1 + test $(echo $res | jq '.ok_to_upgrade') = true || return 1 + local num_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length') + test $num_upgradable -ge 1 || return 1 + + # Verify that osds_in_crush_bucket lists all chassis OSDs in non-decreasing + # order of acting PG count. osds_in_crush_bucket is the direct output of + # _populate_crush_bucket_osds() before the convergence algorithm runs, so + # this check covers every OSD in the bucket, not just the upgrade subset. + assert_osds_sorted_by_pg_count osds_in_crush_bucket "$res" || return 1 + + # Invariant 1: the returned OSD list must be globally non-decreasing by + # acting PG count. _populate_crush_bucket_osds sorts all chassis OSDs + # together; the convergence algorithm only prunes from the tail, so the + # ascending order is always preserved in the output. + local prev_count=-1 + for osd_id in $(echo $res | jq -r '.osds_ok_to_upgrade[]'); do + local count=${pg_count[$osd_id]} + test "$count" -ge "$prev_count" || return 1 + prev_count=$count + done + + # Invariant 2: the first OSD in the list must have the globally minimum + # acting PG count across all OSDs in chassis-0. This confirms that the + # globally-least-loaded OSD is always offered for upgrade first. + local first_osd=$(echo $res | jq -r '.osds_ok_to_upgrade[0]') + local first_count=${pg_count[$first_osd]} + for osd_id in $ALL_CHASSIS_OSDS; do + test "${pg_count[$osd_id]}" -ge "$first_count" || return 1 + done +} main ok-to-upgrade "$@" diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc index afe0a0c37fa..a7ddac0868d 100644 --- a/src/mgr/DaemonServer.cc +++ b/src/mgr/DaemonServer.cc @@ -1276,6 +1276,7 @@ int DaemonServer::_populate_crush_bucket_osds( } else if (bucket_type_str == "host" || bucket_type_str == "osd") { bucket_names.push_back(item_name); } + // The following struct is to help re-order the // osds based on the number of pgs on them. struct pgs_per_osd { @@ -1283,10 +1284,8 @@ int DaemonServer::_populate_crush_bucket_osds( size_t num_pgs; }; std::vector child_bucket_pgs_per_osd; - // get osds under each child bucket + // get osds under each child bucket and associate with their PG counts for (const auto &name : bucket_names) { - // Clear the items for the current child bucket - child_bucket_pgs_per_osd.clear(); std::set tmp_bucket_osds; r = osdmap.get_osds_by_bucket_name(name, &tmp_bucket_osds); if (r < 0) { @@ -1300,39 +1299,27 @@ int DaemonServer::_populate_crush_bucket_osds( dout(20) << os.str() << dendl; return r; } - - // Special case when bucket contains only 1 osd - if (tmp_bucket_osds.size() == 1) { - for (const auto &osd : tmp_bucket_osds) { - crush_bucket_osds.push_back(osd); - } - dout(20) << "picked osd: " << tmp_bucket_osds - << " from bucket: " << name << dendl; - continue; - } - /** - * The osds in this bucket are further re-ordered based on the - * number of pgs (ascending) they host. This helps optimize - * the result of _check_offlines_pgs() down the line. - */ for (const auto &osd : tmp_bucket_osds) { child_bucket_pgs_per_osd.push_back({osd, pgmap.get_num_pg_by_osd(osd)}); } - // Sort once after all data is added - std::sort(child_bucket_pgs_per_osd.begin(), child_bucket_pgs_per_osd.end(), - [](const pgs_per_osd& a, const pgs_per_osd& b) { - return std::tie(a.num_pgs, a.osd_id) < std::tie(b.num_pgs, b.osd_id); - }); - /** - * The sorted osds are finally pushed to the passed crush_bucket_osds - * vector where osds are maintained according to the child order. - */ - for (const auto &item : child_bucket_pgs_per_osd) { - crush_bucket_osds.push_back(item.osd_id); - } dout(20) << "picked osds: " << tmp_bucket_osds << " from bucket: " << name << dendl; } + + /** + * Sort all collected osds globally based on the number of pgs (ascending) + * they host and update the crush_bucket_osds vector with the same order. + */ + std::sort(child_bucket_pgs_per_osd.begin(), child_bucket_pgs_per_osd.end(), + [](const pgs_per_osd& a, const pgs_per_osd& b) { + return std::tie(a.num_pgs, a.osd_id) < std::tie(b.num_pgs, b.osd_id); + }); + crush_bucket_osds.reserve( + crush_bucket_osds.size() + child_bucket_pgs_per_osd.size()); + for (const auto &item : child_bucket_pgs_per_osd) { + crush_bucket_osds.push_back(item.osd_id); + } + return r; } -- 2.47.3