]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
vstart: Add new --crimson-balance-cpu option
authorJose J Palacios-Perez <perezjos@uk.ibm.com>
Wed, 27 Nov 2024 11:34:15 +0000 (11:34 +0000)
committerJose J Palacios-Perez <perezjos@uk.ibm.com>
Thu, 30 Jan 2025 10:51:01 +0000 (10:51 +0000)
Signed-off-by: Jose J Palacios-Perez <perezjos@uk.ibm.com>
src/stop.sh
src/vstart.sh

index bec67cf5a24fd0905193023e1fc7105751df1bd9..e88ee063e68d5a8639b718e9be4bd6346eeef731 100755 (executable)
@@ -234,3 +234,8 @@ else
     [ $stop_rgw -eq 1 ] && do_killall radosgw lt-radosgw apache2
     [ $stop_cephadm -eq 1 ] && do_killcephadm
 fi
+
+# Check whether the --crimson-balance-cpu option was used, if so remove any auxiliary files left:
+if [ "$ceph_osd" == "crimson-osd" ] && [ -f /tmp/numa_args_*.out ]; then
+    rm -f /tmp/numa_args_*.out
+fi
index 29fbd0ae59c22c3da6e9d2ee1d17845e12ae6452..adeeab9bb9a7a43553c9ebfd0f54b7ff1cfda21a 100755 (executable)
@@ -199,6 +199,7 @@ declare -a block_devs
 declare -a bluestore_db_devs
 declare -a bluestore_wal_devs
 declare -a secondary_block_devs
+declare -a cpu_table
 secondary_block_devs_type="SSD"
 
 VSTART_SEC="client.vstart.sh"
@@ -275,6 +276,7 @@ options:
        --crimson-smp: number of cores to use for crimson
        --crimson-alien-num-threads: number of alien-tp threads
        --crimson-alien-num-cores: number of cores to use for alien-tp
+       --crimson-balance-cpu: distribute the Seastar reactors uniformly across OSDs (osd) or NUMA (socket)
        --osds-per-host: populate crush_location as each host holds the specified number of osds if set
        --require-osd-and-client-version: if supplied, do set-require-min-compat-client and require-osd-release to specified value
        --use-crush-tunables: if supplied, set tunables to specified value
@@ -346,10 +348,43 @@ parse_secondary_devs() {
     done
 }
 
+# Auxiliar function to prepare the CPU cores to pin Seastar reactors
+prep_balance_cpu() {
+    local crimson_smp=$1
+    local balance_strategy=$2
+    local in_file_name="/tmp/numa_args_${balance_strategy}.out"
+    local out_file_name="/tmp/numa_nodes.json"
+    local log_file_name="/tmp/numa_bal_${balance_strategy}.log"
+    local cmd
+
+    # Check the table is empty
+    if [ "${#cpu_table[@]}" -eq 0 ]; then
+        # Ensure the file with the CPU mappings exist
+        if [ ! -f ${in_file_name} ]; then
+            debug echo "lscpu --json > ${out_file_name}"
+            lscpu --json > ${out_file_name}
+            MY_CPUS=$(taskset -acp $$ | awk -F : '{print $2}')
+            cmd="python3 ${CEPH_DIR}/../src/tools/contrib/balance_cpu.py -o ${CEPH_NUM_OSD}\
+                -r ${crimson_smp}  -b ${balance_strategy} -u ${out_file_name} -t ${MY_CPUS} > ${in_file_name}"
+            debug echo "$cmd"
+            eval "$cmd" >> ${log_file_name}
+        fi
+
+        readarray -t cpu_table < ${in_file_name}
+        # Check the table is not empty, bail out otherwise
+        if [ "${#cpu_table[@]}" -ne 0 ]; then
+            debug echo "CPU table not empty with ${#cpu_table[@]} entries"
+        else
+            debug echo "CPU table empty, bailing out. Check  ${log_file_name}"
+        fi
+    fi
+}
+
 # Default values for the crimson options
 crimson_smp=1
 crimson_alien_num_threads=0
 crimson_alien_num_cores=0
+crimson_balance_cpu="" # "osd", "socket"
 
 while [ $# -ge 1 ]; do
 case $1 in
@@ -589,6 +624,10 @@ case $1 in
         crimson_alien_num_cores=$2
         shift
         ;;
+    --crimson-balance-cpu)
+        crimson_balance_cpu=$2
+        shift
+        ;;
     --bluestore-spdk)
         [ -z "$2" ] && usage_exit
         IFS=',' read -r -a bluestore_spdk_dev <<< "$2"
@@ -1156,6 +1195,14 @@ start_cephexporter() {
         --addrs "$IP"
 }
 
+do_balance_cpu() {
+    local osd=$1
+
+    interval=${cpu_table[${osd}]}
+    echo "$CEPH_BIN/ceph -c $conf_fn config set osd.$osd crimson_seastar_cpu_cores $interval"
+    $CEPH_BIN/ceph -c $conf_fn config set "osd.$osd" crimson_seastar_cpu_cores "$interval"
+}
+
 start_osd() {
     if [ $inc_osd_num -gt 0 ]; then
         old_maxosd=$($CEPH_BIN/ceph osd getmaxosd | sed -e 's/max_osd = //' -e 's/ in epoch.*//')
@@ -1167,15 +1214,24 @@ start_osd() {
         end=$(($CEPH_NUM_OSD-1))
     fi
     local osds_wait
+    # If the type of OSD is Crimson and the option to balance the Seastar reactors is true
+       if [ "$ceph_osd" == "crimson-osd" ] && [ ! -z "$crimson_balance_cpu" ]; then
+        debug echo "Preparing balance CPU for Crimson"
+        prep_balance_cpu $crimson_smp $crimson_balance_cpu
+    fi
     for osd in `seq $start $end`
     do
        if [ "$ceph_osd" == "crimson-osd" ]; then
-        bottom_cpu=$(( osd * crimson_smp ))
-        top_cpu=$(( bottom_cpu + crimson_smp - 1 ))
-           # set exclusive CPU nodes for each osd
-           echo "$CEPH_BIN/ceph -c $conf_fn config set osd.$osd crimson_seastar_cpu_cores $bottom_cpu-$top_cpu"
-           $CEPH_BIN/ceph -c $conf_fn config set "osd.$osd" crimson_seastar_cpu_cores "$bottom_cpu-$top_cpu"
-       fi
+        if [ ! -z "$crimson_balance_cpu" ]; then
+            do_balance_cpu $osd
+        else
+            bottom_cpu=$(( osd * crimson_smp ))
+            top_cpu=$(( bottom_cpu + crimson_smp - 1 ))
+            # set exclusive CPU nodes for each osd
+            echo "$CEPH_BIN/ceph -c $conf_fn config set osd.$osd crimson_seastar_cpu_cores $bottom_cpu-$top_cpu"
+            $CEPH_BIN/ceph -c $conf_fn config set "osd.$osd" crimson_seastar_cpu_cores "$bottom_cpu-$top_cpu"
+        fi
+    fi
        if [ "$new" -eq 1 -o $inc_osd_num -gt 0 ]; then
             wconf <<EOF
 [osd.$osd]
@@ -1703,21 +1759,31 @@ if [ "$ceph_osd" == "crimson-osd" ]; then
         extra_seastar_args=" --trace"
     fi
     if [ "$objectstore" == "bluestore" ]; then
+        # This condition verifies the number of logical CPU cores
         if [ "$(expr $(nproc) - 1)" -gt "$(($CEPH_NUM_OSD * crimson_smp))" ]; then
-            if [ $crimson_alien_num_cores -gt 0 ]; then
-                alien_bottom_cpu=$(($CEPH_NUM_OSD * crimson_smp))
-                alien_top_cpu=$(( alien_bottom_cpu + crimson_alien_num_cores - 1 ))
-                # Ensure top value within range:
-                if [ "$(($alien_top_cpu))" -gt "$(expr $(nproc) - 1)" ]; then
-                    alien_top_cpu=$(expr $(nproc) - 1)
-                fi
-                echo "crimson_alien_thread_cpu_cores: $alien_bottom_cpu-$alien_top_cpu"
-                # This is a (logical) processor id range, it could be refined to encompass only physical processor ids
-                # (equivalently, ignore hyperthreading sibling processor ids)
-                $CEPH_BIN/ceph -c $conf_fn config set osd crimson_alien_thread_cpu_cores "$alien_bottom_cpu-$alien_top_cpu"
+            if [ ! -z "$crimson_balance_cpu" ]; then
+                debug echo "Preparing balance CPU for Crimson"
+                prep_balance_cpu $crimson_smp $crimson_balance_cpu
+                available_cpus="${cpu_table[-1]}"
+                echo "crimson_alien_thread_cpu_cores: '$available_cpus'"
+                $CEPH_BIN/ceph -c $conf_fn config set osd crimson_alien_thread_cpu_cores "$available_cpus"
             else
-                echo "crimson_alien_thread_cpu_cores:" $(($CEPH_NUM_OSD * crimson_smp))-"$(expr $(nproc) - 1)"
-                $CEPH_BIN/ceph -c $conf_fn config set osd crimson_alien_thread_cpu_cores $(($CEPH_NUM_OSD * crimson_smp))-"$(expr $(nproc) - 1)"
+                if [ $crimson_alien_num_cores -gt 0 ]; then
+                    alien_bottom_cpu=$(($CEPH_NUM_OSD * crimson_smp))
+                    alien_top_cpu=$(( alien_bottom_cpu + crimson_alien_num_cores - 1 ))
+                    # Ensure top value within range:
+                    if [ "$(($alien_top_cpu))" -gt "$(expr $(nproc) - 1)" ]; then
+                        alien_top_cpu=$(expr $(nproc) - 1)
+                    fi
+                    echo "crimson_alien_thread_cpu_cores: $alien_bottom_cpu-$alien_top_cpu"
+                    # This is a (logical) processor id range, it could be refined to encompass only physical processor ids
+                    # (equivalently, ignore hyperthreading sibling processor ids)
+                    $CEPH_BIN/ceph -c $conf_fn config set osd crimson_alien_thread_cpu_cores "$alien_bottom_cpu-$alien_top_cpu"
+                else
+                    # This is the legacy default case
+                    echo "crimson_alien_thread_cpu_cores:" $(($CEPH_NUM_OSD * crimson_smp))-"$(expr $(nproc) - 1)"
+                    $CEPH_BIN/ceph -c $conf_fn config set osd crimson_alien_thread_cpu_cores $(($CEPH_NUM_OSD * crimson_smp))-"$(expr $(nproc) - 1)"
+                fi
             fi
             if [ $crimson_alien_num_threads -gt 0 ]; then
                 echo "$CEPH_BIN/ceph -c $conf_fn config set osd crimson_alien_op_num_threads $crimson_alien_num_threads"