From d04b907d319ff6801859dd3852d7e4805a755258 Mon Sep 17 00:00:00 2001
From: Sridhar Seshasayee
Date: Thu, 9 Feb 2023 20:47:44 +0530
Subject: [PATCH] osd: Modify mClock scheduler's cost model to represent cost in bytes

The mClock scheduler's cost model for HDDs/SSDs is modified and now
represents the cost of an IO in terms of bytes.

The cost parameters osd_mclock_cost_per_io_usec_[hdd|ssd] and
osd_mclock_cost_per_byte_usec_[hdd|ssd], which represent the cost of an
IO in seconds, are inaccurate and are therefore removed.

The new model considers the following aspects of an OSD to calculate
the cost of an IO:

  - osd_mclock_max_capacity_iops_[hdd|ssd] (existing option)
    The measured random write IOPS at 4 KiB block size. This is
    measured during OSD boot-up using the OSD bench tool.

  - osd_mclock_max_sequential_bandwidth_[hdd|ssd] (new config option)
    The maximum sequential bandwidth of the underlying device. For
    HDDs, 150 MiB/s is considered in the cost calculation, and for
    SSDs, 750 MiB/s.

The following important changes are made to arrive at the overall cost
of an IO:

1. Represent the QoS reservation and limit config parameters as
   proportions: The reservation and limit parameters are now set in
   terms of a proportion of the OSD's max IOPS capacity. The earlier
   representation was in terms of IOPS per OSD shard, which required
   the user to perform calculations before setting the parameter.
   Representing the reservation and limit as proportions is much more
   intuitive and simpler for a user.

2. Cost per IO calculation: Using the above config options,
   osd_bandwidth_cost_per_io for the OSD is calculated and set. It is
   the ratio of the OSD's max sequential bandwidth to its max random
   write IOPS. It is a constant and represents the base cost of an IO
   in terms of bytes. This is added to the actual size of the IO (in
   bytes) to represent the overall cost of the IO operation. See
   mClockScheduler::calc_scaled_cost().

3. Cost calculation in bytes: The settings for reservation and limit,
   expressed as a fraction of the OSD's maximum IOPS capacity, are
   converted to bytes/sec before updating the mClock server's
   ClientInfo structure. This is done for each OSD op shard using
   osd_bandwidth_capacity_per_shard as shown below:

     (res|lim)   =  (IOPS proportion) * osd_bandwidth_capacity_per_shard
     (bytes/sec)        (unitless)               (bytes/sec)

   The above result is updated within the mClock server's ClientInfo
   structure for the different op_scheduler_class operations. See
   mClockScheduler::ClientRegistry::update_from_config().

   The overall cost of an IO operation (in seconds) is finally
   determined during the tag calculations performed in the mClock
   server. See crimson::dmclock::RequestTag::tag_calc() for more
   details.

4. Profile allocations: Optimize the mClock profile allocations due to
   the change in the cost model and the lower recovery cost.

5. Modify the standalone tests to reflect the change in the QoS config
   parameter representation of the reservation and limit options.
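To make the new cost model concrete, the following is a minimal, self-contained
sketch (illustrative only, not code from this patch) that reproduces the
calculations from points 2 and 3 above using the HDD defaults introduced here
(150 MiB/s sequential bandwidth, 315 random write IOPS). The shard count of 5
and the local variable names are assumptions made purely for this example.

  // Worked example of the byte-based cost model (assumed values noted below).
  #include <algorithm>
  #include <cstdint>
  #include <iostream>

  int main() {
    // Defaults introduced by this patch for rotational media.
    const uint64_t max_seq_bandwidth = 150 * 1024 * 1024; // osd_mclock_max_sequential_bandwidth_hdd, bytes/sec
    const double   max_iops          = 315.0;             // osd_mclock_max_capacity_iops_hdd
    const uint32_t num_shards        = 5;                 // assumed number of op shards for an HDD OSD

    // Base cost of one random IO in bytes: ratio of sequential bandwidth to IOPS.
    const double bandwidth_cost_per_io =
        static_cast<double>(max_seq_bandwidth) / max_iops;          // ~499322.5 bytes

    // Per-shard bandwidth capacity used to convert QoS fractions to bytes/sec.
    const double bandwidth_capacity_per_shard =
        static_cast<double>(max_seq_bandwidth) / num_shards;        // 31457280 bytes/sec

    // Overall (scaled) cost of a 4 KiB IO: base per-IO cost plus the IO size.
    const int item_cost = 4096;
    const uint32_t scaled_cost =
        static_cast<uint32_t>(bandwidth_cost_per_io) +
        static_cast<uint32_t>(std::max(1, item_cost));              // 503418 bytes

    // A reservation fraction (e.g. 0.4 for the client class in the 'balanced'
    // profile) converted to the bytes/sec value handed to the mClock server.
    const double client_res = 0.4 * bandwidth_capacity_per_shard;   // 12582912 bytes/sec

    std::cout << "cost per IO: " << bandwidth_cost_per_io << " bytes\n"
              << "capacity per shard: " << bandwidth_capacity_per_shard << " bytes/sec\n"
              << "scaled cost of a 4 KiB IO: " << scaled_cost << " bytes\n"
              << "client reservation: " << client_res << " bytes/sec\n";
    return 0;
  }

With the reservation and limit resolved to bytes/sec per shard, the dmclock
server derives the time-based tags (cost in seconds) internally, which is why
no seconds-based cost parameters need to be configured anymore.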
Fixes: https://tracker.ceph.com/issues/58529 Fixes: https://tracker.ceph.com/issues/59080 Signed-off-by: Samuel Just Signed-off-by: Sridhar Seshasayee --- qa/standalone/misc/mclock-config.sh | 100 ++++--- src/common/options/osd.yaml.in | 199 +++++++------- src/osd/scheduler/OpSchedulerItem.h | 6 +- src/osd/scheduler/mClockScheduler.cc | 383 ++++++++++++--------------- src/osd/scheduler/mClockScheduler.h | 110 ++++++-- 5 files changed, 421 insertions(+), 377 deletions(-) diff --git a/qa/standalone/misc/mclock-config.sh b/qa/standalone/misc/mclock-config.sh index 17260b100ae..d16cd45f43f 100755 --- a/qa/standalone/misc/mclock-config.sh +++ b/qa/standalone/misc/mclock-config.sh @@ -37,7 +37,7 @@ function run() { function TEST_profile_builtin_to_custom() { local dir=$1 - local OSDS=3 + local OSDS=1 setup $dir || return 1 run_mon $dir a --osd_pool_default_size=$OSDS || return 1 @@ -69,7 +69,7 @@ function TEST_profile_builtin_to_custom() { osd.$id) config get osd_mclock_scheduler_client_res | \ jq .osd_mclock_scheduler_client_res | bc) echo "client_res = $client_res" - local client_res_new=$(expr $client_res + 10) + local client_res_new=$(echo "$client_res + 0.1" | bc -l) echo "client_res_new = $client_res_new" ceph config set osd osd_mclock_scheduler_client_res \ $client_res_new || return 1 @@ -78,12 +78,16 @@ function TEST_profile_builtin_to_custom() { # Check value in config monitor db local res=$(ceph config get osd.$id \ osd_mclock_scheduler_client_res) || return 1 - test $res -eq $client_res_new || return 1 + if (( $(echo "$res != $client_res_new" | bc -l) )); then + return 1 + fi # Check value in the in-memory 'values' map res=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path \ osd.$id) config get osd_mclock_scheduler_client_res | \ jq .osd_mclock_scheduler_client_res | bc) - test $res -eq $client_res_new || return 1 + if (( $(echo "$res != $client_res_new" | bc -l) )); then + return 1 + fi done teardown $dir || return 1 @@ -91,7 +95,7 @@ function TEST_profile_builtin_to_custom() { function TEST_profile_custom_to_builtin() { local dir=$1 - local OSDS=3 + local OSDS=1 setup $dir || return 1 run_mon $dir a --osd_pool_default_size=$OSDS || return 1 @@ -129,7 +133,7 @@ function TEST_profile_custom_to_builtin() { done # Change a mclock config param and confirm the change - local client_res_new=$(expr ${client_res[0]} + 10) + local client_res_new=$(echo "${client_res[0]} + 0.1" | bc -l) echo "client_res_new = $client_res_new" ceph config set osd osd_mclock_scheduler_client_res \ $client_res_new || return 1 @@ -138,12 +142,16 @@ function TEST_profile_custom_to_builtin() { # Check value in config monitor db local res=$(ceph config get osd.$id \ osd_mclock_scheduler_client_res) || return 1 - test $res -eq $client_res_new || return 1 + if (( $(echo "$res != $client_res_new" | bc -l) )); then + return 1 + fi # Check value in the in-memory 'values' map res=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path \ osd.$id) config get osd_mclock_scheduler_client_res | \ jq .osd_mclock_scheduler_client_res | bc) - test $res -eq $client_res_new || return 1 + if (( $(echo "$res != $client_res_new" | bc -l) )); then + return 1 + fi done # Switch the mclock profile back to the original built-in profile. 
@@ -166,12 +174,16 @@ function TEST_profile_custom_to_builtin() { # Check value in config monitor db local res=$(ceph config get osd.$id \ osd_mclock_scheduler_client_res) || return 1 - test $res -eq $client_res_new || return 1 + if (( $(echo "$res != $client_res_new" | bc -l) )); then + return 1 + fi # Check value in the in-memory 'values' map res=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path \ osd.$id) config get osd_mclock_scheduler_client_res | \ jq .osd_mclock_scheduler_client_res | bc) - test $res -eq $client_res_new || return 1 + if (( $(echo "$res != $client_res_new" | bc -l) )); then + return 1 + fi done # Remove the changed QoS config option from monitor db @@ -184,7 +196,9 @@ function TEST_profile_custom_to_builtin() { res=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path \ osd.$id) config get osd_mclock_scheduler_client_res | \ jq .osd_mclock_scheduler_client_res | bc) - test $res -eq ${client_res[$id]} || return 1 + if (( $(echo "$res != ${client_res[$id]}" | bc -l) )); then + return 1 + fi done teardown $dir || return 1 @@ -274,33 +288,57 @@ function TEST_profile_disallow_builtin_params_modify() { declare -a options=("osd_mclock_scheduler_background_recovery_res" "osd_mclock_scheduler_client_res") + local retries=10 + local errors=0 for opt in "${options[@]}" do # Try and change a mclock config param and confirm that no change occurred local opt_val_orig=$(CEPH_ARGS='' ceph --format=json daemon \ $(get_asok_path osd.0) config get $opt | jq .$opt | bc) - local opt_val_new=$(expr $opt_val_orig + 10) + local opt_val_new=$(echo "$opt_val_orig + 0.1" | bc -l) ceph config set osd.0 $opt $opt_val_new || return 1 - sleep 2 # Allow time for changes to take effect - # Check configuration value on Mon store (or the default) for the osd - local res=$(ceph config get osd.0 $opt) || return 1 - echo "Mon db (or default): osd.0 $opt = $res" - test $res -ne $opt_val_new || return 1 - - # Check running configuration value using "config show" cmd - res=$(ceph config show osd.0 | grep $opt |\ - awk '{ print $2 }' | bc ) || return 1 - echo "Running config: osd.0 $opt = $res" - test $res -ne $opt_val_new || return 1 - test $res -eq $opt_val_orig || return 1 - - # Check value in the in-memory 'values' map is unmodified - res=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path \ - osd.0) config get $opt | jq .$opt | bc) - echo "Values map: osd.0 $opt = $res" - test $res -ne $opt_val_new || return 1 - test $res -eq $opt_val_orig || return 1 + # Check configuration values + for count in $(seq 0 $(expr $retries - 1)) + do + errors=0 + sleep 2 # Allow time for changes to take effect + + echo "Check configuration values - Attempt#: $count" + # Check configuration value on Mon store (or the default) for the osd + local res=$(ceph config get osd.0 $opt) || return 1 + echo "Mon db (or default): osd.0 $opt = $res" + if (( $(echo "$res == $opt_val_new" | bc -l) )); then + errors=$(expr $errors + 1) + fi + + # Check running configuration value using "config show" cmd + res=$(ceph config show osd.0 | grep $opt |\ + awk '{ print $2 }' | bc ) || return 1 + echo "Running config: osd.0 $opt = $res" + if (( $(echo "$res == $opt_val_new" | bc -l) || \ + $(echo "$res != $opt_val_orig" | bc -l) )); then + errors=$(expr $errors + 1) + fi + + # Check value in the in-memory 'values' map is unmodified + res=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path \ + osd.0) config get $opt | jq .$opt | bc) + echo "Values map: osd.0 $opt = $res" + if (( $(echo "$res == $opt_val_new" | bc -l) || \ + 
$(echo "$res != $opt_val_orig" | bc -l) )); then + errors=$(expr $errors + 1) + fi + + # Check if we succeeded or exhausted retry count + if [ $errors -eq 0 ] + then + break + elif [ $count -eq $(expr $retries - 1) ] + then + return 1 + fi + done done teardown $dir || return 1 diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in index a1c01374a1e..dbd461aa578 100644 --- a/src/common/options/osd.yaml.in +++ b/src/common/options/osd.yaml.in @@ -951,12 +951,17 @@ options: - debug_random with_legacy: true - name: osd_mclock_scheduler_client_res - type: uint + type: float level: advanced - desc: IO proportion reserved for each client (default) + desc: IO proportion reserved for each client (default). The default value + of 0 specifies the lowest possible reservation. Any value greater than + 0 and up to 1.0 specifies the minimum IO proportion to reserve for each + client in terms of a fraction of the OSD's maximum IOPS capacity. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO proportion reserved for each client (default). - default: 1 + default: 0 + min: 0 + max: 1.0 see_also: - osd_op_queue - name: osd_mclock_scheduler_client_wgt @@ -969,21 +974,34 @@ options: see_also: - osd_op_queue - name: osd_mclock_scheduler_client_lim - type: uint + type: float level: advanced - desc: IO limit for each client (default) over reservation + desc: IO limit for each client (default) over reservation. The default + value of 0 specifies no limit enforcement, which means each client can + use the maximum possible IOPS capacity of the OSD. Any value greater + than 0 and up to 1.0 specifies the upper IO limit over reservation + that each client receives in terms of a fraction of the OSD's + maximum IOPS capacity. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO limit for each client (default) over reservation. - default: 999999 + default: 0 + min: 0 + max: 1.0 see_also: - osd_op_queue - name: osd_mclock_scheduler_background_recovery_res - type: uint + type: float level: advanced - desc: IO proportion reserved for background recovery (default) + desc: IO proportion reserved for background recovery (default). The + default value of 0 specifies the lowest possible reservation. Any value + greater than 0 and up to 1.0 specifies the minimum IO proportion to + reserve for background recovery operations in terms of a fraction of + the OSD's maximum IOPS capacity. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO proportion reserved for background recovery (default). - default: 1 + default: 0 + min: 0 + max: 1.0 see_also: - osd_op_queue - name: osd_mclock_scheduler_background_recovery_wgt @@ -996,21 +1014,34 @@ options: see_also: - osd_op_queue - name: osd_mclock_scheduler_background_recovery_lim - type: uint + type: float level: advanced - desc: IO limit for background recovery over reservation + desc: IO limit for background recovery over reservation. The default + value of 0 specifies no limit enforcement, which means background + recovery operation can use the maximum possible IOPS capacity of the + OSD. Any value greater than 0 and up to 1.0 specifies the upper IO + limit over reservation that background recovery operation receives in + terms of a fraction of the OSD's maximum IOPS capacity. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO limit for background recovery over reservation. 
- default: 999999 + default: 0 + min: 0 + max: 1.0 see_also: - osd_op_queue - name: osd_mclock_scheduler_background_best_effort_res - type: uint + type: float level: advanced - desc: IO proportion reserved for background best_effort (default) + desc: IO proportion reserved for background best_effort (default). The + default value of 0 specifies the lowest possible reservation. Any value + greater than 0 and up to 1.0 specifies the minimum IO proportion to + reserve for background best_effort operations in terms of a fraction + of the OSD's maximum IOPS capacity. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO proportion reserved for background best_effort (default). - default: 1 + default: 0 + min: 0 + max: 1.0 see_also: - osd_op_queue - name: osd_mclock_scheduler_background_best_effort_wgt @@ -1023,12 +1054,19 @@ options: see_also: - osd_op_queue - name: osd_mclock_scheduler_background_best_effort_lim - type: uint + type: float level: advanced - desc: IO limit for background best_effort over reservation + desc: IO limit for background best_effort over reservation. The default + value of 0 specifies no limit enforcement, which means background + best_effort operation can use the maximum possible IOPS capacity of the + OSD. Any value greater than 0 and up to 1.0 specifies the upper IO + limit over reservation that background best_effort operation receives + in terms of a fraction of the OSD's maximum IOPS capacity. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO limit for background best_effort over reservation. - default: 999999 + default: 0 + min: 0 + max: 1.0 see_also: - osd_op_queue - name: osd_mclock_scheduler_anticipation_timeout @@ -1037,106 +1075,57 @@ options: desc: mclock anticipation timeout in seconds long_desc: the amount of time that mclock waits until the unused resource is forfeited default: 0 -- name: osd_mclock_cost_per_io_usec - type: float - level: dev - desc: Cost per IO in microseconds to consider per OSD (overrides _ssd and _hdd if - non-zero) - long_desc: This option specifies the cost factor to consider in usec per OSD. This - is considered by the mclock scheduler to set an additional cost factor in QoS - calculations. Only considered for osd_op_queue = mclock_scheduler - fmt_desc: Cost per IO in microseconds to consider per OSD (overrides _ssd - and _hdd if non-zero) - default: 0 - flags: - - runtime -- name: osd_mclock_cost_per_io_usec_hdd - type: float - level: dev - desc: Cost per IO in microseconds to consider per OSD (for rotational media) - long_desc: This option specifies the cost factor to consider in usec per OSD for - rotational device type. This is considered by the mclock_scheduler to set an additional - cost factor in QoS calculations. Only considered for osd_op_queue = mclock_scheduler - fmt_desc: Cost per IO in microseconds to consider per OSD (for rotational - media) - default: 11400 - flags: - - runtime -- name: osd_mclock_cost_per_io_usec_ssd - type: float - level: dev - desc: Cost per IO in microseconds to consider per OSD (for solid state media) - long_desc: This option specifies the cost factor to consider in usec per OSD for - solid state device type. This is considered by the mclock_scheduler to set an - additional cost factor in QoS calculations. 
Only considered for osd_op_queue = - mclock_scheduler - fmt_desc: Cost per IO in microseconds to consider per OSD (for solid state - media) - default: 50 - flags: - - runtime -- name: osd_mclock_cost_per_byte_usec - type: float - level: dev - desc: Cost per byte in microseconds to consider per OSD (overrides _ssd and _hdd - if non-zero) - long_desc: This option specifies the cost per byte to consider in microseconds per - OSD. This is considered by the mclock scheduler to set an additional cost factor - in QoS calculations. Only considered for osd_op_queue = mclock_scheduler - fmt_desc: Cost per byte in microseconds to consider per OSD (overrides _ssd - and _hdd if non-zero) - default: 0 - flags: - - runtime -- name: osd_mclock_cost_per_byte_usec_hdd - type: float - level: dev - desc: Cost per byte in microseconds to consider per OSD (for rotational media) - long_desc: This option specifies the cost per byte to consider in microseconds per - OSD for rotational device type. This is considered by the mclock_scheduler to - set an additional cost factor in QoS calculations. Only considered for osd_op_queue - = mclock_scheduler - fmt_desc: Cost per byte in microseconds to consider per OSD (for rotational - media) - default: 2.6 +- name: osd_mclock_max_sequential_bandwidth_hdd + type: size + level: basic + desc: The maximum sequential bandwidth in bytes/second of the OSD (for + rotational media) + long_desc: This option specifies the maximum sequential bandwidth to consider + for an OSD whose underlying device type is rotational media. This is + considered by the mclock scheduler to derive the cost factor to be used in + QoS calculations. Only considered for osd_op_queue = mclock_scheduler + fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the + OSD (for rotational media) + default: 150_M flags: - runtime -- name: osd_mclock_cost_per_byte_usec_ssd - type: float - level: dev - desc: Cost per byte in microseconds to consider per OSD (for solid state media) - long_desc: This option specifies the cost per byte to consider in microseconds per - OSD for solid state device type. This is considered by the mclock_scheduler to - set an additional cost factor in QoS calculations. Only considered for osd_op_queue - = mclock_scheduler - fmt_desc: Cost per byte in microseconds to consider per OSD (for solid state - media) - default: 0.011 +- name: osd_mclock_max_sequential_bandwidth_ssd + type: size + level: basic + desc: The maximum sequential bandwidth in bytes/second of the OSD (for + solid state media) + long_desc: This option specifies the maximum sequential bandwidth to consider + for an OSD whose underlying device type is solid state media. This is + considered by the mclock scheduler to derive the cost factor to be used in + QoS calculations. Only considered for osd_op_queue = mclock_scheduler + fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the + OSD (for solid state media) + default: 750_M flags: - runtime - name: osd_mclock_max_capacity_iops_hdd type: float level: basic - desc: Max IOPs capacity (at 4KiB block size) to consider per OSD (for rotational - media) - long_desc: This option specifies the max OSD capacity in iops per OSD. Helps in - QoS calculations when enabling a dmclock profile. 
Only considered for osd_op_queue - = mclock_scheduler - fmt_desc: Max IOPS capacity (at 4KiB block size) to consider per OSD (for - rotational media) + desc: Max random write IOPS capacity (at 4KiB block size) to consider per OSD + (for rotational media) + long_desc: This option specifies the max OSD random write IOPS capacity per + OSD. Contributes in QoS calculations when enabling a dmclock profile. Only + considered for osd_op_queue = mclock_scheduler + fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per + OSD (for rotational media) default: 315 flags: - runtime - name: osd_mclock_max_capacity_iops_ssd type: float level: basic - desc: Max IOPs capacity (at 4KiB block size) to consider per OSD (for solid state - media) - long_desc: This option specifies the max OSD capacity in iops per OSD. Helps in - QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue - = mclock_scheduler - fmt_desc: Max IOPS capacity (at 4KiB block size) to consider per OSD (for - solid state media) + desc: Max random write IOPS capacity (at 4 KiB block size) to consider per OSD + (for solid state media) + long_desc: This option specifies the max OSD random write IOPS capacity per + OSD. Contributes in QoS calculations when enabling a dmclock profile. Only + considered for osd_op_queue = mclock_scheduler + fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per + OSD (for solid state media) default: 21500 flags: - runtime diff --git a/src/osd/scheduler/OpSchedulerItem.h b/src/osd/scheduler/OpSchedulerItem.h index 2a4c00d0dcd..dbe61b05120 100644 --- a/src/osd/scheduler/OpSchedulerItem.h +++ b/src/osd/scheduler/OpSchedulerItem.h @@ -106,7 +106,7 @@ private: utime_t start_time; uint64_t owner; ///< global id (e.g., client.XXX) epoch_t map_epoch; ///< an epoch we expect the PG to exist in - int qos_cost; ///< scaled cost calculated by the mclock scheduler + uint32_t qos_cost; ///< scaled cost calculated by the mclock scheduler bool qos_item; ///< set to true if item is scheduled by mclock scheduler public: @@ -183,11 +183,11 @@ public: return qos_item; } - void set_qos_cost(int scaled_cost) { + void set_qos_cost(uint32_t scaled_cost) { qos_cost = scaled_cost; } - int get_qos_cost() const { + uint32_t get_qos_cost() const { return qos_cost; } diff --git a/src/osd/scheduler/mClockScheduler.cc b/src/osd/scheduler/mClockScheduler.cc index 9b07ca33421..abb17571a4e 100644 --- a/src/osd/scheduler/mClockScheduler.cc +++ b/src/osd/scheduler/mClockScheduler.cc @@ -51,32 +51,85 @@ mClockScheduler::mClockScheduler(CephContext *cct, { cct->_conf.add_observer(this); ceph_assert(num_shards > 0); - set_max_osd_capacity(); - set_osd_mclock_cost_per_io(); - set_osd_mclock_cost_per_byte(); + set_osd_capacity_params_from_config(); set_mclock_profile(); enable_mclock_profile_settings(); - client_registry.update_from_config(cct->_conf); + client_registry.update_from_config( + cct->_conf, osd_bandwidth_capacity_per_shard); } -void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf) +/* ClientRegistry holds the dmclock::ClientInfo configuration parameters + * (reservation (bytes/second), weight (unitless), limit (bytes/second)) + * for each IO class in the OSD (client, background_recovery, + * background_best_effort). + * + * mclock expects limit and reservation to have units of /second + * (bytes/second), but osd_mclock_scheduler_client_(lim|res) are provided + * as ratios of the OSD's capacity. 
We convert from the one to the other + * using the capacity_per_shard parameter. + * + * Note, mclock profile information will already have been set as a default + * for the osd_mclock_scheduler_client_* parameters prior to calling + * update_from_config -- see set_config_defaults_from_profile(). + */ +void mClockScheduler::ClientRegistry::update_from_config( + const ConfigProxy &conf, + const double capacity_per_shard) { - default_external_client_info.update( - conf.get_val("osd_mclock_scheduler_client_res"), - conf.get_val("osd_mclock_scheduler_client_wgt"), - conf.get_val("osd_mclock_scheduler_client_lim")); + auto get_res = [&](double res) { + if (res) { + return res * capacity_per_shard; + } else { + return default_min; // min reservation + } + }; + + auto get_lim = [&](double lim) { + if (lim) { + return lim * capacity_per_shard; + } else { + return default_max; // high limit + } + }; + + // Set external client infos + double res = conf.get_val( + "osd_mclock_scheduler_client_res"); + double lim = conf.get_val( + "osd_mclock_scheduler_client_lim"); + uint64_t wgt = conf.get_val( + "osd_mclock_scheduler_client_wgt"); + default_external_client_info.update( + get_res(res), + wgt, + get_lim(lim)); + + // Set background recovery client infos + res = conf.get_val( + "osd_mclock_scheduler_background_recovery_res"); + lim = conf.get_val( + "osd_mclock_scheduler_background_recovery_lim"); + wgt = conf.get_val( + "osd_mclock_scheduler_background_recovery_wgt"); internal_client_infos[ static_cast(op_scheduler_class::background_recovery)].update( - conf.get_val("osd_mclock_scheduler_background_recovery_res"), - conf.get_val("osd_mclock_scheduler_background_recovery_wgt"), - conf.get_val("osd_mclock_scheduler_background_recovery_lim")); - + get_res(res), + wgt, + get_lim(lim)); + + // Set background best effort client infos + res = conf.get_val( + "osd_mclock_scheduler_background_best_effort_res"); + lim = conf.get_val( + "osd_mclock_scheduler_background_best_effort_lim"); + wgt = conf.get_val( + "osd_mclock_scheduler_background_best_effort_wgt"); internal_client_infos[ static_cast(op_scheduler_class::background_best_effort)].update( - conf.get_val("osd_mclock_scheduler_background_best_effort_res"), - conf.get_val("osd_mclock_scheduler_background_best_effort_wgt"), - conf.get_val("osd_mclock_scheduler_background_best_effort_lim")); + get_res(res), + wgt, + get_lim(lim)); } const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client( @@ -103,70 +156,38 @@ const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info( } } -void mClockScheduler::set_max_osd_capacity() +void mClockScheduler::set_osd_capacity_params_from_config() { - if (is_rotational) { - max_osd_capacity = - cct->_conf.get_val("osd_mclock_max_capacity_iops_hdd"); - cct->_conf.set_val("osd_mclock_max_capacity_iops_ssd", "0"); - } else { - max_osd_capacity = - cct->_conf.get_val("osd_mclock_max_capacity_iops_ssd"); - cct->_conf.set_val("osd_mclock_max_capacity_iops_hdd", "0"); - } - // Set per op-shard iops limit - max_osd_capacity /= num_shards; - dout(1) << __func__ << " #op shards: " << num_shards - << std::fixed << std::setprecision(2) - << " max osd capacity(iops) per shard: " << max_osd_capacity - << dendl; -} + uint64_t osd_bandwidth_capacity; + double osd_iop_capacity; -void mClockScheduler::set_osd_mclock_cost_per_io() -{ - std::chrono::seconds sec(1); - if (cct->_conf.get_val("osd_mclock_cost_per_io_usec")) { - osd_mclock_cost_per_io = - cct->_conf.get_val("osd_mclock_cost_per_io_usec"); - } else { + 
std::tie(osd_bandwidth_capacity, osd_iop_capacity) = [&, this] { if (is_rotational) { - osd_mclock_cost_per_io = - cct->_conf.get_val("osd_mclock_cost_per_io_usec_hdd"); - // For HDDs, convert value to seconds - osd_mclock_cost_per_io /= std::chrono::microseconds(sec).count(); + return std::make_tuple( + cct->_conf.get_val( + "osd_mclock_max_sequential_bandwidth_hdd"), + cct->_conf.get_val("osd_mclock_max_capacity_iops_hdd")); } else { - // For SSDs, convert value to milliseconds - osd_mclock_cost_per_io = - cct->_conf.get_val("osd_mclock_cost_per_io_usec_ssd"); - osd_mclock_cost_per_io /= std::chrono::milliseconds(sec).count(); + return std::make_tuple( + cct->_conf.get_val( + "osd_mclock_max_sequential_bandwidth_ssd"), + cct->_conf.get_val("osd_mclock_max_capacity_iops_ssd")); } - } - dout(1) << __func__ << " osd_mclock_cost_per_io: " - << std::fixed << std::setprecision(7) << osd_mclock_cost_per_io - << dendl; -} + }(); -void mClockScheduler::set_osd_mclock_cost_per_byte() -{ - std::chrono::seconds sec(1); - if (cct->_conf.get_val("osd_mclock_cost_per_byte_usec")) { - osd_mclock_cost_per_byte = - cct->_conf.get_val("osd_mclock_cost_per_byte_usec"); - } else { - if (is_rotational) { - osd_mclock_cost_per_byte = - cct->_conf.get_val("osd_mclock_cost_per_byte_usec_hdd"); - // For HDDs, convert value to seconds - osd_mclock_cost_per_byte /= std::chrono::microseconds(sec).count(); - } else { - osd_mclock_cost_per_byte = - cct->_conf.get_val("osd_mclock_cost_per_byte_usec_ssd"); - // For SSDs, convert value to milliseconds - osd_mclock_cost_per_byte /= std::chrono::milliseconds(sec).count(); - } - } - dout(1) << __func__ << " osd_mclock_cost_per_byte: " - << std::fixed << std::setprecision(7) << osd_mclock_cost_per_byte + osd_bandwidth_capacity = std::max(1, osd_bandwidth_capacity); + osd_iop_capacity = std::max(1.0, osd_iop_capacity); + + osd_bandwidth_cost_per_io = + static_cast(osd_bandwidth_capacity) / osd_iop_capacity; + osd_bandwidth_capacity_per_shard = static_cast(osd_bandwidth_capacity) + / static_cast(num_shards); + + dout(1) << __func__ << ": osd_bandwidth_cost_per_io: " + << std::fixed << std::setprecision(2) + << osd_bandwidth_cost_per_io << " bytes/io" + << ", osd_bandwidth_capacity_per_shard " + << osd_bandwidth_capacity_per_shard << " bytes/second" << dendl; } @@ -181,143 +202,82 @@ std::string mClockScheduler::get_mclock_profile() return mclock_profile; } +// Sets allocations for 'balanced' mClock profile +// +// min and max specification: +// 0 (min): specifies no minimum reservation +// 0 (max): specifies no upper limit +// +// Client Allocation: +// reservation: 40% | weight: 1 | limit: 100% | +// Background Recovery Allocation: +// reservation: 40% | weight: 1 | limit: 70% | +// Background Best Effort Allocation: +// reservation: 20% | weight: 1 | limit: 0 (max) | void mClockScheduler::set_balanced_profile_allocations() { - // Client Allocation: - // reservation: 40% | weight: 1 | limit: 100% | - // Background Recovery Allocation: - // reservation: 40% | weight: 1 | limit: 150% | - // Background Best Effort Allocation: - // reservation: 20% | weight: 2 | limit: max | - - // Client - uint64_t client_res = static_cast( - std::round(0.40 * max_osd_capacity)); - uint64_t client_lim = static_cast( - std::round(max_osd_capacity)); - uint64_t client_wgt = default_min; - - // Background Recovery - uint64_t rec_res = static_cast( - std::round(0.40 * max_osd_capacity)); - uint64_t rec_lim = static_cast( - std::round(1.5 * max_osd_capacity)); - uint64_t rec_wgt = default_min; - - // 
Background Best Effort - uint64_t best_effort_res = static_cast( - std::round(0.20 * max_osd_capacity)); - uint64_t best_effort_lim = default_max; - uint64_t best_effort_wgt = 2; - - // Set the allocations for the mclock clients + // Set [res, wgt, lim] in that order for each mClock client class. client_allocs[ static_cast(op_scheduler_class::client)].update( - client_res, - client_wgt, - client_lim); + 0.4, 1.0, 1.0); client_allocs[ static_cast(op_scheduler_class::background_recovery)].update( - rec_res, - rec_wgt, - rec_lim); + 0.4, 1.0, 0.7); client_allocs[ static_cast(op_scheduler_class::background_best_effort)].update( - best_effort_res, - best_effort_wgt, - best_effort_lim); + 0.2, 1.0, 0.0); } +// Sets allocations for 'high_recovery_ops' mClock profile +// +// min and max specification: +// 0 (min): specifies no minimum reservation +// 0 (max): specifies no upper limit +// +// Client Allocation: +// reservation: 30% | weight: 1 | limit: 80% | +// Background Recovery Allocation: +// reservation: 60% | weight: 2 | limit: 0 (max) | +// Background Best Effort Allocation: +// reservation: 0 (min) | weight: 1 | limit: 0 (max) | void mClockScheduler::set_high_recovery_ops_profile_allocations() { - // Client Allocation: - // reservation: 30% | weight: 1 | limit: 80% | - // Background Recovery Allocation: - // reservation: 60% | weight: 2 | limit: 200% | - // Background Best Effort Allocation: - // reservation: 1 | weight: 2 | limit: max | - - // Client - uint64_t client_res = static_cast( - std::round(0.30 * max_osd_capacity)); - uint64_t client_lim = static_cast( - std::round(0.80 * max_osd_capacity)); - uint64_t client_wgt = default_min; - - // Background Recovery - uint64_t rec_res = static_cast( - std::round(0.60 * max_osd_capacity)); - uint64_t rec_lim = static_cast( - std::round(2.0 * max_osd_capacity)); - uint64_t rec_wgt = 2; - - // Background Best Effort - uint64_t best_effort_res = default_min; - uint64_t best_effort_lim = default_max; - uint64_t best_effort_wgt = 2; - - // Set the allocations for the mclock clients + // Set [res, wgt, lim] in that order for each mClock client class. 
client_allocs[ static_cast(op_scheduler_class::client)].update( - client_res, - client_wgt, - client_lim); + 0.3, 1.0, 0.8); client_allocs[ static_cast(op_scheduler_class::background_recovery)].update( - rec_res, - rec_wgt, - rec_lim); + 0.6, 2.0, 0.0); client_allocs[ static_cast(op_scheduler_class::background_best_effort)].update( - best_effort_res, - best_effort_wgt, - best_effort_lim); + 0.0, 1.0, 0.0); } +// Sets allocations for 'high_client_ops' mClock profile +// +// min and max specification: +// 0 (min): specifies no minimum reservation +// 0 (max): specifies no upper limit +// +// Client Allocation: +// reservation: 60% | weight: 5 | limit: 0 (max) | +// Background Recovery Allocation: +// reservation: 20% | weight: 1 | limit: 50% | +// Background Best Effort Allocation: +// reservation: 20% | weight: 1 | limit: 0 (max) | void mClockScheduler::set_high_client_ops_profile_allocations() { - // Client Allocation: - // reservation: 50% | weight: 2 | limit: max | - // Background Recovery Allocation: - // reservation: 25% | weight: 1 | limit: 100% | - // Background Best Effort Allocation: - // reservation: 25% | weight: 2 | limit: max | - - // Client - uint64_t client_res = static_cast( - std::round(0.50 * max_osd_capacity)); - uint64_t client_wgt = 2; - uint64_t client_lim = default_max; - - // Background Recovery - uint64_t rec_res = static_cast( - std::round(0.25 * max_osd_capacity)); - uint64_t rec_lim = static_cast( - std::round(max_osd_capacity)); - uint64_t rec_wgt = default_min; - - // Background Best Effort - uint64_t best_effort_res = static_cast( - std::round(0.25 * max_osd_capacity)); - uint64_t best_effort_lim = default_max; - uint64_t best_effort_wgt = 2; - - // Set the allocations for the mclock clients + // Set [res, wgt, lim] in that order for each mClock client class. 
client_allocs[ static_cast(op_scheduler_class::client)].update( - client_res, - client_wgt, - client_lim); + 0.6, 5.0, 0.0); client_allocs[ static_cast(op_scheduler_class::background_recovery)].update( - rec_res, - rec_wgt, - rec_lim); + 0.2, 1.0, 0.5); client_allocs[ static_cast(op_scheduler_class::background_best_effort)].update( - best_effort_res, - best_effort_wgt, - best_effort_lim); + 0.2, 1.0, 0.0); } void mClockScheduler::enable_mclock_profile_settings() @@ -361,7 +321,7 @@ void mClockScheduler::set_profile_config() cct->_conf.set_val_default("osd_mclock_scheduler_client_res", std::to_string(client.res)); cct->_conf.set_val_default("osd_mclock_scheduler_client_wgt", - std::to_string(client.wgt)); + std::to_string(uint64_t(client.wgt))); cct->_conf.set_val_default("osd_mclock_scheduler_client_lim", std::to_string(client.lim)); dout(10) << __func__ << " client QoS params: " << "[" @@ -372,7 +332,7 @@ void mClockScheduler::set_profile_config() cct->_conf.set_val_default("osd_mclock_scheduler_background_recovery_res", std::to_string(rec.res)); cct->_conf.set_val_default("osd_mclock_scheduler_background_recovery_wgt", - std::to_string(rec.wgt)); + std::to_string(uint64_t(rec.wgt))); cct->_conf.set_val_default("osd_mclock_scheduler_background_recovery_lim", std::to_string(rec.lim)); dout(10) << __func__ << " Recovery QoS params: " << "[" @@ -383,7 +343,7 @@ void mClockScheduler::set_profile_config() cct->_conf.set_val_default("osd_mclock_scheduler_background_best_effort_res", std::to_string(best_effort.res)); cct->_conf.set_val_default("osd_mclock_scheduler_background_best_effort_wgt", - std::to_string(best_effort.wgt)); + std::to_string(uint64_t(best_effort.wgt))); cct->_conf.set_val_default("osd_mclock_scheduler_background_best_effort_lim", std::to_string(best_effort.lim)); dout(10) << __func__ << " Best effort QoS params: " << "[" @@ -394,12 +354,16 @@ void mClockScheduler::set_profile_config() update_configuration(); } -int mClockScheduler::calc_scaled_cost(int item_cost) +uint32_t mClockScheduler::calc_scaled_cost(int item_cost) { - // Calculate total scaled cost in secs - int scaled_cost = - std::round(osd_mclock_cost_per_io + (osd_mclock_cost_per_byte * item_cost)); - return std::max(scaled_cost, 1); + auto cost = static_cast( + std::max( + 1, // ensure cost is non-zero and positive + item_cost)); + auto cost_per_io = static_cast(osd_bandwidth_cost_per_io); + + // Calculate total scaled cost in bytes + return cost_per_io + cost; } void mClockScheduler::update_configuration() @@ -452,7 +416,7 @@ void mClockScheduler::enqueue(OpSchedulerItem&& item) } else if (priority >= cutoff) { enqueue_high(priority, std::move(item)); } else { - int cost = calc_scaled_cost(item.get_cost()); + auto cost = calc_scaled_cost(item.get_cost()); item.set_qos_cost(cost); dout(20) << __func__ << " " << id << " item_cost: " << item.get_cost() @@ -557,14 +521,10 @@ const char** mClockScheduler::get_tracked_conf_keys() const "osd_mclock_scheduler_background_best_effort_res", "osd_mclock_scheduler_background_best_effort_wgt", "osd_mclock_scheduler_background_best_effort_lim", - "osd_mclock_cost_per_io_usec", - "osd_mclock_cost_per_io_usec_hdd", - "osd_mclock_cost_per_io_usec_ssd", - "osd_mclock_cost_per_byte_usec", - "osd_mclock_cost_per_byte_usec_hdd", - "osd_mclock_cost_per_byte_usec_ssd", "osd_mclock_max_capacity_iops_hdd", "osd_mclock_max_capacity_iops_ssd", + "osd_mclock_max_sequential_bandwidth_hdd", + "osd_mclock_max_sequential_bandwidth_ssd", "osd_mclock_profile", NULL }; @@ -575,29 +535,27 @@ void 
mClockScheduler::handle_conf_change( const ConfigProxy& conf, const std::set &changed) { - if (changed.count("osd_mclock_cost_per_io_usec") || - changed.count("osd_mclock_cost_per_io_usec_hdd") || - changed.count("osd_mclock_cost_per_io_usec_ssd")) { - set_osd_mclock_cost_per_io(); - } - if (changed.count("osd_mclock_cost_per_byte_usec") || - changed.count("osd_mclock_cost_per_byte_usec_hdd") || - changed.count("osd_mclock_cost_per_byte_usec_ssd")) { - set_osd_mclock_cost_per_byte(); - } if (changed.count("osd_mclock_max_capacity_iops_hdd") || changed.count("osd_mclock_max_capacity_iops_ssd")) { - set_max_osd_capacity(); + set_osd_capacity_params_from_config(); if (mclock_profile != "custom") { enable_mclock_profile_settings(); - client_registry.update_from_config(conf); } + client_registry.update_from_config( + conf, osd_bandwidth_capacity_per_shard); + } + if (changed.count("osd_mclock_max_sequential_bandwidth_hdd") || + changed.count("osd_mclock_max_sequential_bandwidth_ssd")) { + set_osd_capacity_params_from_config(); + client_registry.update_from_config( + conf, osd_bandwidth_capacity_per_shard); } if (changed.count("osd_mclock_profile")) { set_mclock_profile(); if (mclock_profile != "custom") { enable_mclock_profile_settings(); - client_registry.update_from_config(conf); + client_registry.update_from_config( + conf, osd_bandwidth_capacity_per_shard); } } @@ -624,7 +582,8 @@ void mClockScheduler::handle_conf_change( if (auto key = get_changed_key(); key.has_value()) { if (mclock_profile == "custom") { - client_registry.update_from_config(conf); + client_registry.update_from_config( + conf, osd_bandwidth_capacity_per_shard); } else { // Attempt to change QoS parameter for a built-in profile. Restore the // profile defaults by making one of the OSD shards remove the key from diff --git a/src/osd/scheduler/mClockScheduler.h b/src/osd/scheduler/mClockScheduler.h index 053dd1e87fd..9af97830ca7 100644 --- a/src/osd/scheduler/mClockScheduler.h +++ b/src/osd/scheduler/mClockScheduler.h @@ -33,8 +33,10 @@ namespace ceph::osd::scheduler { -constexpr uint64_t default_min = 1; -constexpr uint64_t default_max = 999999; +constexpr double default_min = 1.0; +constexpr double default_max = std::numeric_limits::is_iec559 ? 
+ std::numeric_limits::infinity() : + std::numeric_limits::max(); using client_id_t = uint64_t; using profile_id_t = uint64_t; @@ -78,20 +80,18 @@ class mClockScheduler : public OpScheduler, md_config_obs_t { const int shard_id; bool is_rotational; MonClient *monc; - double max_osd_capacity; - double osd_mclock_cost_per_io; - double osd_mclock_cost_per_byte; + std::string mclock_profile = "high_client_ops"; struct ClientAllocs { - uint64_t res; - uint64_t wgt; - uint64_t lim; + double res; + double wgt; + double lim; - ClientAllocs(uint64_t _res, uint64_t _wgt, uint64_t _lim) { + ClientAllocs(double _res, double _wgt, double _lim) { update(_res, _wgt, _lim); } - inline void update(uint64_t _res, uint64_t _wgt, uint64_t _lim) { + inline void update(double _res, double _wgt, double _lim) { res = _res; wgt = _wgt; lim = _lim; @@ -102,11 +102,55 @@ class mClockScheduler : public OpScheduler, md_config_obs_t { static_cast(op_scheduler_class::client) + 1 > client_allocs = { // Placeholder, get replaced with configured values - ClientAllocs(1, 1, 1), // background_recovery - ClientAllocs(1, 1, 1), // background_best_effort - ClientAllocs(1, 1, 1), // immediate (not used) - ClientAllocs(1, 1, 1) // client + ClientAllocs(0, 1, 0), // background_recovery + ClientAllocs(0, 1, 0), // background_best_effort + ClientAllocs(0, 1, 0), // immediate (not used) + ClientAllocs(0, 1, 0) // client }; + + /** + * osd_bandwidth_cost_per_io + * + * mClock expects all queued items to have a uniform expression of + * "cost". However, IO devices generally have quite different capacity + * for sequential IO vs small random IO. This implementation handles this + * by expressing all costs as a number of sequential bytes written, adding + * additional cost for each random IO equal to osd_bandwidth_cost_per_io. + * + * Thus, an IO operation requiring a total of <size> bytes to be written + * across <num_ios> different locations will have a cost of + * <size> + (osd_bandwidth_cost_per_io * <num_ios>) bytes. + * + * Set in set_osd_capacity_params_from_config in the constructor and upon + * config change. + * + * Has units bytes/io. + */ + double osd_bandwidth_cost_per_io; + + /** + * osd_bandwidth_capacity_per_shard + * + * mClock expects reservation and limit parameters to be expressed in units + * of cost/second -- which means bytes/second for this implementation. + * + * Rather than expecting users to compute appropriate limit and reservation + * values for each class of OSDs in their cluster, we instead express + * reservation and limit parameters as ratios of the OSD's maximum capacity. + * osd_bandwidth_capacity_per_shard is that capacity divided by the number + * of shards. + * + * Set in set_osd_capacity_params_from_config in the constructor and upon + * config change. + * + * This value gets passed to ClientRegistry::update_from_config in order + * to resolve the full reservation and limit parameters for mclock from + * the configured ratios. + * + * Has units bytes/second. + */ + double osd_bandwidth_capacity_per_shard; + class ClientRegistry { std::array< crimson::dmclock::ClientInfo, @@ -123,7 +167,16 @@ class mClockScheduler : public OpScheduler, md_config_obs_t { const crimson::dmclock::ClientInfo *get_external_client( const client_profile_id_t &client) const; public: - void update_from_config(const ConfigProxy &conf); + /** + * update_from_config + * + * Sets the mclock parameters (reservation, weight, and limit) + * for each class of IO (background_recovery, background_best_effort, + * and client).
+ */ + void update_from_config( + const ConfigProxy &conf, + double capacity_per_shard); const crimson::dmclock::ClientInfo *get_info( const scheduler_id_t &id) const; } client_registry; @@ -171,20 +224,25 @@ class mClockScheduler : public OpScheduler, md_config_obs_t { } } + /** + * set_osd_capacity_params_from_config + * + * mClockScheduler uses two parameters, osd_bandwidth_cost_per_io + * and osd_bandwidth_capacity_per_shard, internally. These two + * parameters are derived from config parameters + * osd_mclock_max_capacity_iops_(hdd|ssd) and + * osd_mclock_max_sequential_bandwidth_(hdd|ssd) as well as num_shards. + * Invoking set_osd_capacity_params_from_config() resets those derived + * params based on the current config and should be invoked any time they + * are modified as well as in the constructor. See handle_conf_change(). + */ + void set_osd_capacity_params_from_config(); + public: mClockScheduler(CephContext *cct, int whoami, uint32_t num_shards, int shard_id, bool is_rotational, MonClient *monc); ~mClockScheduler() override; - // Set the max osd capacity in iops - void set_max_osd_capacity(); - - // Set the cost per io for the osd - void set_osd_mclock_cost_per_io(); - - // Set the cost per byte for the osd - void set_osd_mclock_cost_per_byte(); - // Set the mclock profile type to enable void set_mclock_profile(); @@ -206,8 +264,8 @@ public: // Set mclock config parameter based on allocations void set_profile_config(); - // Calculate scale cost per item - int calc_scaled_cost(int cost); + /// Calculate scaled cost per item + uint32_t calc_scaled_cost(int cost); // Helper method to display mclock queues std::string display_queues() const; -- 2.39.5