From 6761549c5a719f541a850f243318a1212b76c0a6 Mon Sep 17 00:00:00 2001
From: Sridhar Seshasayee <sridhar.seshasayee@ibm.com>
Date: Wed, 6 May 2026 20:41:33 +0530
Subject: [PATCH] mgr/DaemonServer: Aggregate and globally sort OSDs for
 ok-to-upgrade

The 'ok-to-upgrade' command output sorting did not scale accurately
when target CRUSH buckets contained multiple child buckets (e.g., a
chassis containing multiple hosts). OSDs were previously sorted
individually per child bucket and appended sequentially. This created
fragmented, per-host sort segments rather than a globally sorted list
for the parent bucket.

Changes:

1. Fix the issue above by aggregating all child OSDs into a single vector prior
to executing a single, global sort operation based on PG counts. Additionally,
optimize memory efficiency and future-proof the logic by reserving continuous
vector blocks to avoid dynamic heap reallocations.

2. Add integration tests with chassis and rack based CRUSH hierarchies which
verifies the ok-to-upgrade functionality. In addition, the tests crucially
verify the order of OSDs returned is according to the ascending order of
acting PG count. Additionally, make minor fix-ups to lines that determine the
length of a list in JSON response by removing the redundant "| bc".

Fixes: https://tracker.ceph.com/issues/77272
Signed-off-by: Sridhar Seshasayee <sridhar.seshasayee@ibm.com>
---
 qa/standalone/misc/ok-to-upgrade.sh | 436 ++++++++++++++++++++++++++--
 src/mgr/DaemonServer.cc             |  47 ++-
 2 files changed, 423 insertions(+), 60 deletions(-)
diff --git a/qa/standalone/misc/ok-to-upgrade.sh b/qa/standalone/misc/ok-to-upgrade.sh
index 5be0eee49fb..2bdb5d7497b 100755
--- a/qa/standalone/misc/ok-to-upgrade.sh
+++ b/qa/standalone/misc/ok-to-upgrade.sh
@@ -35,6 +35,33 @@ function run() {
     done
 }
 
+# Populate the global pg_count associative array with each OSD's acting
+# PG count from 'ceph osd df'.
+function snapshot_osd_pg_counts() {
+    declare -gA pg_count
+    pg_count=()
+    while read -r osd_id count; do
+        pg_count[$osd_id]=$count
+    done < <(ceph osd df --format=json | \
+             jq -r '.nodes[] | select(.id >= 0) | "\(.id) \(.pgs)"')
+}
+
+# Verify that the named OSD list in a JSON result is ordered in non-decreasing
+# acting PG count. Uses the global pg_count array populated by
+# snapshot_osd_pg_counts().
+# Usage: assert_osds_sorted_by_pg_count <field> <json>
+function assert_osds_sorted_by_pg_count() {
+    local field=$1
+    local json=$2
+    local prev_count=-1
+    local osd_id count
+    for osd_id in $(echo "$json" | jq -r ".${field}[]"); do
+        count=${pg_count[$osd_id]}
+        test "$count" -ge "$prev_count" || return 1
+        prev_count=$count
+    done
+}
+
 function TEST_ok_to_upgrade_invalid_args() {
     local dir=$1
     CEPH_ARGS="$ORIG_CEPH_ARGS --mon-host=$CEPH_MON "
@@ -103,9 +130,9 @@ function TEST_ok_to_upgrade_replicated_pool() {
     # and verifies that at least the expected minimum OSDs are returned.
     test $(echo $res | jq '.all_osds_upgraded') = false || return 1
     test $(echo $res | jq '.ok_to_upgrade') = true || return 1
-    local num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc)
+    local num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
     test $num_osds_upgradable -ge $exp_osds_upgradable || return 1
-    local num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc)
+    local num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length')
     test $num_osds_upgraded -eq 0 || return 1
 
     # Test the same command as above, but exercise the 'max' parameter.
@@ -116,9 +143,9 @@ function TEST_ok_to_upgrade_replicated_pool() {
     res=$(ceph osd ok-to-upgrade $crush_bucket $ceph_version $max --format=json)
     test $(echo $res | jq '.all_osds_upgraded') = false || return 1
     test $(echo $res | jq '.ok_to_upgrade') = true || return 1
-    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc)
+    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
     test $num_osds_upgradable -eq $exp_osds_upgradable || return 1
-    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc)
+    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length')
     test $num_osds_upgraded -eq 0 || return 1
 
     # Test same command above with verbose syntax
@@ -126,9 +153,9 @@ function TEST_ok_to_upgrade_replicated_pool() {
         --ceph_version $ceph_version --max $max --format=json)
     test $(echo $res | jq '.all_osds_upgraded') = false || return 1
     test $(echo $res | jq '.ok_to_upgrade') = true || return 1
-    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc)
+    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
     test $num_osds_upgradable -eq $exp_osds_upgradable || return 1
-    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc)
+    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length')
     test $num_osds_upgraded -eq 0 || return 1
 
     # Test for upgradability with min_size=1 and 1 OSD to upgrade. The outcome
@@ -139,9 +166,9 @@ function TEST_ok_to_upgrade_replicated_pool() {
     res=$(ceph osd ok-to-upgrade $crush_bucket $ceph_version --format=json)
     test $(echo $res | jq '.all_osds_upgraded') = false || return 1
     test $(echo $res | jq '.ok_to_upgrade') = true || return 1
-    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc)
+    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
     test $exp_osds_upgradable = $num_osds_upgradable || return 1
-    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc)
+    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length')
     test $num_osds_upgraded -eq 0 || return 1
 
     # Test for upgradability with min_size=2
@@ -154,9 +181,9 @@ function TEST_ok_to_upgrade_replicated_pool() {
     # 3 OSDs should be reported as upgradable.
     test $(echo $res | jq '.all_osds_upgraded') = false || return 1
     test $(echo $res | jq '.ok_to_upgrade') = true || return 1
-    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc)
+    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
     test $num_osds_upgradable -ge $exp_osds_upgradable || return 1
-    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc)
+    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length')
     test $num_osds_upgraded -eq 0 || return 1
 
     # Test for upgradability with min_size=3
@@ -168,9 +195,9 @@ function TEST_ok_to_upgrade_replicated_pool() {
     # No OSD should be reported as upgradable.
     test $(echo $res | jq '.all_osds_upgraded') = false || return 1
     test $(echo $res | jq '.ok_to_upgrade') = false || return 1
-    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc)
+    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
     test $exp_osds_upgradable = $num_osds_upgradable || return 1
-    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc)
+    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length')
     test $num_osds_upgraded -eq 0 || return 1
 
     # Test for condition when all OSDs are running desired version.
@@ -179,9 +206,9 @@ function TEST_ok_to_upgrade_replicated_pool() {
     res=$(ceph osd ok-to-upgrade $crush_bucket $upgrade_version --format=json)
     test $(echo $res | jq '.all_osds_upgraded') = true || return 1
     test $(echo $res | jq '.ok_to_upgrade') = false || return 1
-    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc)
+    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
     test $num_osds_upgradable -eq 0 || return 1
-    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc)
+    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length')
     test $num_osds_upgraded -eq $OSDS || return 1
 }
 
@@ -217,9 +244,9 @@ function TEST_ok_to_upgrade_erasure_pool() {
     # in 3 OSDs being reported as upgradable.
     test $(echo $res | jq '.all_osds_upgraded') = false || return 1
     test $(echo $res | jq '.ok_to_upgrade') = true || return 1
-    local num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc)
+    local num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
     test $exp_osds_upgradable = $num_osds_upgradable || return 1
-    local num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc)
+    local num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length')
     test $num_osds_upgraded -eq 0 || return 1
 
     # Test the same command as above, but exercise the 'max' parameter.
@@ -230,9 +257,9 @@ function TEST_ok_to_upgrade_erasure_pool() {
     res=$(ceph osd ok-to-upgrade $crush_bucket $ceph_version $max --format=json)
     test $(echo $res | jq '.all_osds_upgraded') = false || return 1
     test $(echo $res | jq '.ok_to_upgrade') = true || return 1
-    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc)
+    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
     test $num_osds_upgradable -eq $exp_osds_upgradable || return 1
-    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc)
+    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length')
     test $num_osds_upgraded -eq 0 || return 1
 
     # Test command above with verbose syntax
@@ -240,9 +267,9 @@ function TEST_ok_to_upgrade_erasure_pool() {
         --ceph_version $ceph_version --max $max --format=json)
     test $(echo $res | jq '.all_osds_upgraded') = false || return 1
     test $(echo $res | jq '.ok_to_upgrade') = true || return 1
-    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc)
+    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
     test $num_osds_upgradable -eq $exp_osds_upgradable || return 1
-    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc)
+    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length')
     test $num_osds_upgraded -eq 0 || return 1
 
     # Test for upgradability with min_size=5 and 1 OSD to upgrade. The outcome
@@ -253,9 +280,9 @@ function TEST_ok_to_upgrade_erasure_pool() {
     res=$(ceph osd ok-to-upgrade $crush_bucket $ceph_version --format=json)
     test $(echo $res | jq '.all_osds_upgraded') = false || return 1
     test $(echo $res | jq '.ok_to_upgrade') = true || return 1
-    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc)
+    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
     test $exp_osds_upgradable = $num_osds_upgradable || return 1
-    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc)
+    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length')
     test $num_osds_upgraded -eq 0 || return 1
 
     # Test for upgradability with min_size=6
@@ -268,9 +295,9 @@ function TEST_ok_to_upgrade_erasure_pool() {
     # 2 OSDs should be reported as upgradable.
     test $(echo $res | jq '.all_osds_upgraded') = false || return 1
     test $(echo $res | jq '.ok_to_upgrade') = true || return 1
-    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc)
+    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
     test $exp_osds_upgradable = $num_osds_upgradable || return 1
-    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc)
+    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length')
     test $num_osds_upgraded -eq 0 || return 1
 
     # Test for upgradability with min_size=8
@@ -282,9 +309,9 @@ function TEST_ok_to_upgrade_erasure_pool() {
     # No OSD should be reported as upgradable.
     test $(echo $res | jq '.all_osds_upgraded') = false || return 1
     test $(echo $res | jq '.ok_to_upgrade') = false || return 1
-    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc)
+    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
     test $exp_osds_upgradable = $num_osds_upgradable || return 1
-    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc)
+    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length')
     test $num_osds_upgraded -eq 0 || return 1
 
     # Test for condition when all OSDs are running desired version.
@@ -293,9 +320,9 @@ function TEST_ok_to_upgrade_erasure_pool() {
     res=$(ceph osd ok-to-upgrade $crush_bucket $ceph_version --format=json)
     test $(echo $res | jq '.all_osds_upgraded') = true || return 1
     test $(echo $res | jq '.ok_to_upgrade') = false || return 1
-    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc)
+    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
     test $num_osds_upgradable -eq 0 || return 1
-    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length' | bc)
+    num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length')
     test $num_osds_upgraded -eq $OSDS || return 1
 }
 
@@ -331,11 +358,360 @@ function TEST_ok_to_upgrade_bad_osd_version() {
     local res=$(ceph osd ok-to-upgrade $crush_bucket $ceph_version --format=json)
     test $(echo $res | jq '.all_osds_upgraded') = false || return 1
     test $(echo $res | jq '.ok_to_upgrade') = false || return 1
-    local num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length' | bc)
+    local num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
     test $exp_osds_upgradable = $num_osds_upgradable || return 1
-    local num_osds_bad_version=$(echo $res | jq '.bad_no_version | length' | bc)
+    local num_osds_bad_version=$(echo $res | jq '.bad_no_version | length')
     test $num_osds_bad_version -eq 3 || return 1
 }
 
+function TEST_ok_to_upgrade_chassis_bucket() {
+    local dir=$1
+    local poolname="test"
+    local OSDS=4
+    local ceph_version="01.2.3-1234-g1234deed"
+
+    CEPH_ARGS="$ORIG_CEPH_ARGS --mon-host=$CEPH_MON "
+
+    run_mon $dir a --public-addr=$CEPH_MON || return 1
+    run_mgr $dir x || return 1
+
+    for osd in $(seq 0 $(expr $OSDS - 1)); do
+      run_osd $dir $osd --osd-mclock-skip-benchmark=true || return 1
+    done
+
+    # Use size=2 so the pool can be satisfied with 2 distinct host buckets.
+    # min_size=1 allows upgrading all OSDs on one host simultaneously.
+    create_pool $poolname 8 8
+    ceph osd pool set $poolname pg_autoscale_mode off
+    ceph osd pool set $poolname size 2
+    ceph osd pool set $poolname min_size 1
+    sleep 5
+    wait_for_clean || return 1
+
+    # Build a chassis CRUSH hierarchy:
+    #   root "default"
+    #     chassis "chassis-0"
+    #       host "host-0-0" -> osd.0, osd.1
+    #       host "host-0-1" -> osd.2, osd.3
+    #
+    # OSDs start in auto-generated host buckets; crush set replaces each
+    # OSD's location with the new custom hierarchy.
+    ceph osd crush add-bucket chassis-0 chassis || return 1
+    ceph osd crush add-bucket host-0-0 host || return 1
+    ceph osd crush add-bucket host-0-1 host || return 1
+    ceph osd crush move host-0-0 chassis=chassis-0 || return 1
+    ceph osd crush move host-0-1 chassis=chassis-0 || return 1
+    ceph osd crush move chassis-0 root=default || return 1
+    ceph osd crush set 0 1.0 root=default chassis=chassis-0 host=host-0-0 || return 1
+    ceph osd crush set 1 1.0 root=default chassis=chassis-0 host=host-0-0 || return 1
+    ceph osd crush set 2 1.0 root=default chassis=chassis-0 host=host-0-1 || return 1
+    ceph osd crush set 3 1.0 root=default chassis=chassis-0 host=host-0-1 || return 1
+    sleep 5
+    wait_for_clean || return 1
+
+    # Snapshot acting PG counts per OSD. In a clean cluster 'pgs' in
+    # osd df equals the acting count the mgr's PGMap records internally.
+    snapshot_osd_pg_counts
+
+    # With min_size=1 some OSDs inside chassis-0 must be reported upgradable.
+    local res=$(ceph osd ok-to-upgrade chassis-0 $ceph_version --format=json)
+    test $(echo $res | jq '.all_osds_upgraded') = false || return 1
+    test $(echo $res | jq '.ok_to_upgrade') = true || return 1
+    local num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
+    test $num_osds_upgradable -ge 1 || return 1
+    local num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length')
+    test $num_osds_upgraded -eq 0 || return 1
+
+    # Verify that osds_in_crush_bucket lists all chassis OSDs in non-decreasing
+    # order of acting PG count.  This directly tests the global sort inside
+    # _populate_crush_bucket_osds(): consecutive entries must never decrease.
+    assert_osds_sorted_by_pg_count osds_in_crush_bucket "$res" || return 1
+
+    # With max=1, exactly one OSD is returned.
+    local max=1
+    res=$(ceph osd ok-to-upgrade chassis-0 $ceph_version $max --format=json)
+    test $(echo $res | jq '.ok_to_upgrade') = true || return 1
+    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
+    test $num_osds_upgradable -eq 1 || return 1
+
+    # With min_size=2 on a size=2 pool, taking any OSD offline drops a PG
+    # below the minimum replica count -- no OSD should be upgradable.
+    ceph osd pool set $poolname min_size 2
+    sleep 5
+    wait_for_clean || return 1
+    res=$(ceph osd ok-to-upgrade chassis-0 $ceph_version --format=json)
+    test $(echo $res | jq '.all_osds_upgraded') = false || return 1
+    test $(echo $res | jq '.ok_to_upgrade') = false || return 1
+    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
+    test $num_osds_upgradable -eq 0 || return 1
+}
+
+function TEST_ok_to_upgrade_rack_bucket() {
+    local dir=$1
+    local poolname="test"
+    local OSDS=4
+    local ceph_version="01.2.3-1234-g1234deed"
+
+    CEPH_ARGS="$ORIG_CEPH_ARGS --mon-host=$CEPH_MON "
+
+    run_mon $dir a --public-addr=$CEPH_MON || return 1
+    run_mgr $dir x || return 1
+
+    for osd in $(seq 0 $(expr $OSDS - 1)); do
+      run_osd $dir $osd --osd-mclock-skip-benchmark=true || return 1
+    done
+
+    # Use size=2 so the pool can be satisfied with 2 distinct host buckets.
+    # min_size=1 allows upgrading all OSDs on one host simultaneously.
+    create_pool $poolname 8 8
+    ceph osd pool set $poolname pg_autoscale_mode off
+    ceph osd pool set $poolname size 2
+    ceph osd pool set $poolname min_size 1
+    sleep 5
+    wait_for_clean || return 1
+
+    # Build a rack CRUSH hierarchy:
+    #   root "default"
+    #     rack "rack-0"
+    #       host "rack-host-0" -> osd.0, osd.1
+    #       host "rack-host-1" -> osd.2, osd.3
+    ceph osd crush add-bucket rack-0 rack || return 1
+    ceph osd crush add-bucket rack-host-0 host || return 1
+    ceph osd crush add-bucket rack-host-1 host || return 1
+    ceph osd crush move rack-host-0 rack=rack-0 || return 1
+    ceph osd crush move rack-host-1 rack=rack-0 || return 1
+    ceph osd crush move rack-0 root=default || return 1
+    ceph osd crush set 0 1.0 root=default rack=rack-0 host=rack-host-0 || return 1
+    ceph osd crush set 1 1.0 root=default rack=rack-0 host=rack-host-0 || return 1
+    ceph osd crush set 2 1.0 root=default rack=rack-0 host=rack-host-1 || return 1
+    ceph osd crush set 3 1.0 root=default rack=rack-0 host=rack-host-1 || return 1
+    sleep 5
+    wait_for_clean || return 1
+
+    # Snapshot acting PG counts per OSD. In a clean cluster 'pgs' in
+    # osd df equals the acting count the mgr's PGMap records internally.
+    snapshot_osd_pg_counts
+
+    # With min_size=1 some OSDs inside rack-0 must be reported upgradable.
+    local res=$(ceph osd ok-to-upgrade rack-0 $ceph_version --format=json)
+    test $(echo $res | jq '.all_osds_upgraded') = false || return 1
+    test $(echo $res | jq '.ok_to_upgrade') = true || return 1
+    local num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
+    test $num_osds_upgradable -ge 1 || return 1
+    local num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length')
+    test $num_osds_upgraded -eq 0 || return 1
+
+    # Verify that osds_in_crush_bucket lists all rack OSDs in non-decreasing
+    # order of acting PG count.  This directly tests the global sort inside
+    # _populate_crush_bucket_osds(): consecutive entries must never decrease.
+    assert_osds_sorted_by_pg_count osds_in_crush_bucket "$res" || return 1
+
+    # With max=1, exactly one OSD is returned.
+    local max=1
+    res=$(ceph osd ok-to-upgrade rack-0 $ceph_version $max --format=json)
+    test $(echo $res | jq '.ok_to_upgrade') = true || return 1
+    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
+    test $num_osds_upgradable -eq 1 || return 1
+
+    # With min_size=2 on a size=2 pool, no OSD can go offline without
+    # violating the pool's minimum replica count.
+    ceph osd pool set $poolname min_size 2
+    sleep 5
+    wait_for_clean || return 1
+    res=$(ceph osd ok-to-upgrade rack-0 $ceph_version --format=json)
+    test $(echo $res | jq '.all_osds_upgraded') = false || return 1
+    test $(echo $res | jq '.ok_to_upgrade') = false || return 1
+    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
+    test $num_osds_upgradable -eq 0 || return 1
+}
+
+function TEST_ok_to_upgrade_rack_chassis_bucket() {
+    local dir=$1
+    local poolname="test"
+    local OSDS=4
+    local ceph_version="01.2.3-1234-g1234deed"
+
+    CEPH_ARGS="$ORIG_CEPH_ARGS --mon-host=$CEPH_MON "
+
+    run_mon $dir a --public-addr=$CEPH_MON || return 1
+    run_mgr $dir x || return 1
+
+    for osd in $(seq 0 $(expr $OSDS - 1)); do
+      run_osd $dir $osd --osd-mclock-skip-benchmark=true || return 1
+    done
+
+    # Use size=2 so the pool can be satisfied with 2 distinct host buckets.
+    # min_size=1 allows upgrading all OSDs on one host simultaneously.
+    create_pool $poolname 8 8
+    ceph osd pool set $poolname pg_autoscale_mode off
+    ceph osd pool set $poolname size 2
+    ceph osd pool set $poolname min_size 1
+    sleep 5
+    wait_for_clean || return 1
+
+    # Build a rack CRUSH hierarchy:
+    #   root "default"
+    #     rack "rack-0"
+    #       chassis "chassis-0"
+    #         host "chassis-host-0" -> osd.0, osd.1
+    #         host "chassis-host-1" -> osd.2, osd.3
+    ceph osd crush add-bucket rack-0 rack || return 1
+    ceph osd crush add-bucket chassis-0 chassis || return 1
+    ceph osd crush add-bucket chassis-host-0 host || return 1
+    ceph osd crush add-bucket chassis-host-1 host || return 1
+    ceph osd crush move chassis-host-0 chassis=chassis-0 || return 1
+    ceph osd crush move chassis-host-1 chassis=chassis-0 || return 1
+    ceph osd crush move chassis-0 rack=rack-0 || return 1
+    ceph osd crush move rack-0 root=default || return 1
+    ceph osd crush set 0 1.0 root=default rack=rack-0 host=chassis-host-0 || return 1
+    ceph osd crush set 1 1.0 root=default rack=rack-0 host=chassis-host-0 || return 1
+    ceph osd crush set 2 1.0 root=default rack=rack-0 host=chassis-host-1 || return 1
+    ceph osd crush set 3 1.0 root=default rack=rack-0 host=chassis-host-1 || return 1
+    sleep 5
+    wait_for_clean || return 1
+
+    # Snapshot acting PG counts per OSD. In a clean cluster 'pgs' in
+    # osd df equals the acting count the mgr's PGMap records internally.
+    snapshot_osd_pg_counts
+
+    # With min_size=1 some OSDs inside chassis-0 must be reported upgradable.
+    local res=$(ceph osd ok-to-upgrade chassis-0 $ceph_version --format=json)
+    test $(echo $res | jq '.all_osds_upgraded') = false || return 1
+    test $(echo $res | jq '.ok_to_upgrade') = true || return 1
+    local num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
+    test $num_osds_upgradable -ge 1 || return 1
+    local num_osds_upgraded=$(echo $res | jq '.osds_upgraded | length')
+    test $num_osds_upgraded -eq 0 || return 1
+
+    # Verify that osds_in_crush_bucket lists all chassis OSDs in non-decreasing
+    # order of acting PG count.  This directly tests the global sort inside
+    # _populate_crush_bucket_osds(): consecutive entries must never decrease.
+    assert_osds_sorted_by_pg_count osds_in_crush_bucket "$res" || return 1
+
+    # With max=1, exactly one OSD is returned.
+    local max=1
+    res=$(ceph osd ok-to-upgrade chassis-0 $ceph_version $max --format=json)
+    test $(echo $res | jq '.ok_to_upgrade') = true || return 1
+    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
+    test $num_osds_upgradable -eq 1 || return 1
+
+    # With min_size=2 on a size=2 pool, no OSD can go offline without
+    # violating the pool's minimum replica count.
+    ceph osd pool set $poolname min_size 2
+    sleep 5
+    wait_for_clean || return 1
+    res=$(ceph osd ok-to-upgrade chassis-0 $ceph_version --format=json)
+    test $(echo $res | jq '.all_osds_upgraded') = false || return 1
+    test $(echo $res | jq '.ok_to_upgrade') = false || return 1
+    num_osds_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
+    test $num_osds_upgradable -eq 0 || return 1
+}
+
+# Verify that ok-to-upgrade returns OSDs in ascending order of acting PG
+# count globally across the entire chassis (not per-host).
+#
+# _populate_crush_bucket_osds collects OSDs from every child host bucket,
+# then sorts them once globally by (num_pgs, osd_id) ascending before
+# returning to _maximize_ok_to_upgrade_set.  The convergence algorithm
+# prunes from the tail (osds.resize), preserving this global order in the
+# final osds_ok_to_upgrade output.
+#
+# Test design
+# -----------
+# 6 OSDs in a chassis (3 per host), pool with 7 PGs and size=2: CRUSH
+# maps each PG to one OSD per host. 7 % 3 != 0 produces an unequal
+# per-OSD acting count across the chassis, making the global ordering
+# non-trivial to predict without actually measuring it.  Using 7 (rather
+# than a larger number such as 16) widens the relative spread between
+# OSD PG counts, reducing ties and making the ordering invariant more
+# discriminating.
+#
+# The test snapshots the acting PG count per OSD from 'ceph osd df', then
+# checks two invariants:
+#   1. The returned list is non-decreasing by acting PG count (global sort).
+#   2. The first returned OSD holds the minimum acting PG count among all
+#      OSDs in chassis-0, confirming that the globally-least-loaded OSD
+#      is always offered for upgrade first.
+function TEST_ok_to_upgrade_osd_order_by_pg_count() {
+    local dir=$1
+    local poolname="test"
+    local OSDS=6
+    local ALL_CHASSIS_OSDS="0 1 2 3 4 5"
+    local ceph_version="01.2.3-1234-g1234deed"
+
+    CEPH_ARGS="$ORIG_CEPH_ARGS --mon-host=$CEPH_MON "
+
+    run_mon $dir a --public-addr=$CEPH_MON || return 1
+    run_mgr $dir x || return 1
+
+    for osd in $(seq 0 $(expr $OSDS - 1)); do
+      run_osd $dir $osd --osd-mclock-skip-benchmark=true || return 1
+    done
+
+    # 7 PGs, size=2, min_size=1. 7 % 3 != 0 guarantees an unequal
+    # per-OSD acting count within the chassis.
+    create_pool $poolname 7 7
+    ceph osd pool set $poolname pg_autoscale_mode off
+    ceph osd pool set $poolname size 2
+    ceph osd pool set $poolname min_size 1
+    sleep 5
+    wait_for_clean || return 1
+
+    # Build a chassis hierarchy:
+    #   root "default"
+    #     chassis "chassis-0"
+    #       host "host-0-0" -> osd.0, osd.1, osd.2
+    #       host "host-0-1" -> osd.3, osd.4, osd.5
+    ceph osd crush add-bucket chassis-0 chassis || return 1
+    ceph osd crush add-bucket host-0-0 host || return 1
+    ceph osd crush add-bucket host-0-1 host || return 1
+    ceph osd crush move host-0-0 chassis=chassis-0 || return 1
+    ceph osd crush move host-0-1 chassis=chassis-0 || return 1
+    ceph osd crush move chassis-0 root=default || return 1
+    ceph osd crush set 0 1.0 root=default chassis=chassis-0 host=host-0-0 || return 1
+    ceph osd crush set 1 1.0 root=default chassis=chassis-0 host=host-0-0 || return 1
+    ceph osd crush set 2 1.0 root=default chassis=chassis-0 host=host-0-0 || return 1
+    ceph osd crush set 3 1.0 root=default chassis=chassis-0 host=host-0-1 || return 1
+    ceph osd crush set 4 1.0 root=default chassis=chassis-0 host=host-0-1 || return 1
+    ceph osd crush set 5 1.0 root=default chassis=chassis-0 host=host-0-1 || return 1
+    sleep 5
+    wait_for_clean || return 1
+
+    # Snapshot acting PG counts per OSD. In a clean cluster 'pgs' in
+    # osd df equals the acting count the mgr's PGMap records internally.
+    snapshot_osd_pg_counts
+
+    local res=$(ceph osd ok-to-upgrade chassis-0 $ceph_version --format=json)
+    test $(echo $res | jq '.all_osds_upgraded') = false || return 1
+    test $(echo $res | jq '.ok_to_upgrade') = true || return 1
+    local num_upgradable=$(echo $res | jq '.osds_ok_to_upgrade | length')
+    test $num_upgradable -ge 1 || return 1
+
+    # Verify that osds_in_crush_bucket lists all chassis OSDs in non-decreasing
+    # order of acting PG count.  osds_in_crush_bucket is the direct output of
+    # _populate_crush_bucket_osds() before the convergence algorithm runs, so
+    # this check covers every OSD in the bucket, not just the upgrade subset.
+    assert_osds_sorted_by_pg_count osds_in_crush_bucket "$res" || return 1
+
+    # Invariant 1: the returned OSD list must be globally non-decreasing by
+    # acting PG count.  _populate_crush_bucket_osds sorts all chassis OSDs
+    # together; the convergence algorithm only prunes from the tail, so the
+    # ascending order is always preserved in the output.
+    local prev_count=-1
+    for osd_id in $(echo $res | jq -r '.osds_ok_to_upgrade[]'); do
+        local count=${pg_count[$osd_id]}
+        test "$count" -ge "$prev_count" || return 1
+        prev_count=$count
+    done
+
+    # Invariant 2: the first OSD in the list must have the globally minimum
+    # acting PG count across all OSDs in chassis-0.  This confirms that the
+    # globally-least-loaded OSD is always offered for upgrade first.
+    local first_osd=$(echo $res | jq -r '.osds_ok_to_upgrade[0]')
+    local first_count=${pg_count[$first_osd]}
+    for osd_id in $ALL_CHASSIS_OSDS; do
+        test "${pg_count[$osd_id]}" -ge "$first_count" || return 1
+    done
+}
 
 main ok-to-upgrade "$@"
diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc
index afe0a0c37fa..a7ddac0868d 100644
--- a/src/mgr/DaemonServer.cc
+++ b/src/mgr/DaemonServer.cc
@@ -1276,6 +1276,7 @@ int DaemonServer::_populate_crush_bucket_osds(
   } else if (bucket_type_str == "host" || bucket_type_str == "osd") {
     bucket_names.push_back(item_name);
   }
+
   // The following struct is to help re-order the
   // osds based on the number of pgs on them.
   struct pgs_per_osd {
@@ -1283,10 +1284,8 @@ int DaemonServer::_populate_crush_bucket_osds(
     size_t num_pgs;
   };
   std::vector<pgs_per_osd> child_bucket_pgs_per_osd;
-  // get osds under each child bucket
+  // get osds under each child bucket and associate with their PG counts
   for (const auto &name : bucket_names) {
-    // Clear the items for the current child bucket
-    child_bucket_pgs_per_osd.clear();
     std::set<int> tmp_bucket_osds;
     r = osdmap.get_osds_by_bucket_name(name, &tmp_bucket_osds);
     if (r < 0) {
@@ -1300,39 +1299,27 @@ int DaemonServer::_populate_crush_bucket_osds(
       dout(20) << os.str() << dendl;
       return r;
     }
-
-    // Special case when bucket contains only 1 osd
-    if (tmp_bucket_osds.size() == 1) {
-      for (const auto &osd : tmp_bucket_osds) {
-        crush_bucket_osds.push_back(osd);
-      }
-      dout(20) << "picked osd: " << tmp_bucket_osds
-               << " from bucket: " << name << dendl;
-      continue;
-    }
-    /**
-     * The osds in this bucket are further re-ordered based on the
-     * number of pgs (ascending) they host. This helps optimize
-     * the result of _check_offlines_pgs() down the line.
-     */
     for (const auto &osd : tmp_bucket_osds) {
       child_bucket_pgs_per_osd.push_back({osd, pgmap.get_num_pg_by_osd(osd)});
     }
-    // Sort once after all data is added
-    std::sort(child_bucket_pgs_per_osd.begin(), child_bucket_pgs_per_osd.end(),
-              [](const pgs_per_osd& a, const pgs_per_osd& b) {
-        return std::tie(a.num_pgs, a.osd_id) < std::tie(b.num_pgs, b.osd_id);
-    });
-    /**
-     * The sorted osds are finally pushed to the passed crush_bucket_osds
-     * vector where osds are maintained according to the child order.
-     */
-    for (const auto &item : child_bucket_pgs_per_osd) {
-      crush_bucket_osds.push_back(item.osd_id);
-    }
     dout(20) << "picked osds: " << tmp_bucket_osds
              << " from bucket: " << name << dendl;
   }
+
+  /**
+   * Sort all collected osds globally based on the number of pgs (ascending)
+   * they host and update the crush_bucket_osds vector with the same order.
+   */
+  std::sort(child_bucket_pgs_per_osd.begin(), child_bucket_pgs_per_osd.end(),
+            [](const pgs_per_osd& a, const pgs_per_osd& b) {
+      return std::tie(a.num_pgs, a.osd_id) < std::tie(b.num_pgs, b.osd_id);
+  });
+  crush_bucket_osds.reserve(
+    crush_bucket_osds.size() + child_bucket_pgs_per_osd.size());
+  for (const auto &item : child_bucket_pgs_per_osd) {
+    crush_bucket_osds.push_back(item.osd_id);
+  }
+
   return r;
 }
 
-- 
2.47.3