function TEST_profile_builtin_to_custom() {
local dir=$1
- local OSDS=3
+ local OSDS=1
setup $dir || return 1
run_mon $dir a --osd_pool_default_size=$OSDS || return 1
osd.$id) config get osd_mclock_scheduler_client_res | \
jq .osd_mclock_scheduler_client_res | bc)
echo "client_res = $client_res"
- local client_res_new=$(expr $client_res + 10)
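+ # client_res is a fractional (float) value now, so use bc for the arithmetic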
+ local client_res_new=$(echo "$client_res + 0.1" | bc -l)
echo "client_res_new = $client_res_new"
ceph config set osd osd_mclock_scheduler_client_res \
$client_res_new || return 1
# Check value in config monitor db
local res=$(ceph config get osd.$id \
osd_mclock_scheduler_client_res) || return 1
- test $res -eq $client_res_new || return 1
+ if (( $(echo "$res != $client_res_new" | bc -l) )); then
+ return 1
+ fi
# Check value in the in-memory 'values' map
res=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path \
osd.$id) config get osd_mclock_scheduler_client_res | \
jq .osd_mclock_scheduler_client_res | bc)
- test $res -eq $client_res_new || return 1
+ if (( $(echo "$res != $client_res_new" | bc -l) )); then
+ return 1
+ fi
done
teardown $dir || return 1
function TEST_profile_custom_to_builtin() {
local dir=$1
- local OSDS=3
+ local OSDS=1
setup $dir || return 1
run_mon $dir a --osd_pool_default_size=$OSDS || return 1
done
# Change a mclock config param and confirm the change
- local client_res_new=$(expr ${client_res[0]} + 10)
+ local client_res_new=$(echo "${client_res[0]} + 0.1" | bc -l)
echo "client_res_new = $client_res_new"
ceph config set osd osd_mclock_scheduler_client_res \
$client_res_new || return 1
# Check value in config monitor db
local res=$(ceph config get osd.$id \
osd_mclock_scheduler_client_res) || return 1
- test $res -eq $client_res_new || return 1
+ if (( $(echo "$res != $client_res_new" | bc -l) )); then
+ return 1
+ fi
# Check value in the in-memory 'values' map
res=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path \
osd.$id) config get osd_mclock_scheduler_client_res | \
jq .osd_mclock_scheduler_client_res | bc)
- test $res -eq $client_res_new || return 1
+ if (( $(echo "$res != $client_res_new" | bc -l) )); then
+ return 1
+ fi
done
# Switch the mclock profile back to the original built-in profile.
# Check value in config monitor db
local res=$(ceph config get osd.$id \
osd_mclock_scheduler_client_res) || return 1
- test $res -eq $client_res_new || return 1
+ if (( $(echo "$res != $client_res_new" | bc -l) )); then
+ return 1
+ fi
# Check value in the in-memory 'values' map
res=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path \
osd.$id) config get osd_mclock_scheduler_client_res | \
jq .osd_mclock_scheduler_client_res | bc)
- test $res -eq $client_res_new || return 1
+ if (( $(echo "$res != $client_res_new" | bc -l) )); then
+ return 1
+ fi
done
# Remove the changed QoS config option from monitor db
res=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path \
osd.$id) config get osd_mclock_scheduler_client_res | \
jq .osd_mclock_scheduler_client_res | bc)
- test $res -eq ${client_res[$id]} || return 1
+ if (( $(echo "$res != ${client_res[$id]}" | bc -l) )); then
+ return 1
+ fi
done
teardown $dir || return 1
declare -a options=("osd_mclock_scheduler_background_recovery_res"
"osd_mclock_scheduler_client_res")
+ local retries=10
+ local errors=0
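+ # A built-in profile is active, so any attempted change should be reverted.
+ # The revert is applied asynchronously, so poll the values a few times
+ # before declaring failure.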
for opt in "${options[@]}"
do
# Try and change a mclock config param and confirm that no change occurred
local opt_val_orig=$(CEPH_ARGS='' ceph --format=json daemon \
$(get_asok_path osd.0) config get $opt | jq .$opt | bc)
- local opt_val_new=$(expr $opt_val_orig + 10)
+ local opt_val_new=$(echo "$opt_val_orig + 0.1" | bc -l)
ceph config set osd.0 $opt $opt_val_new || return 1
- sleep 2 # Allow time for changes to take effect
- # Check configuration value on Mon store (or the default) for the osd
- local res=$(ceph config get osd.0 $opt) || return 1
- echo "Mon db (or default): osd.0 $opt = $res"
- test $res -ne $opt_val_new || return 1
-
- # Check running configuration value using "config show" cmd
- res=$(ceph config show osd.0 | grep $opt |\
- awk '{ print $2 }' | bc ) || return 1
- echo "Running config: osd.0 $opt = $res"
- test $res -ne $opt_val_new || return 1
- test $res -eq $opt_val_orig || return 1
-
- # Check value in the in-memory 'values' map is unmodified
- res=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path \
- osd.0) config get $opt | jq .$opt | bc)
- echo "Values map: osd.0 $opt = $res"
- test $res -ne $opt_val_new || return 1
- test $res -eq $opt_val_orig || return 1
+ # Check configuration values
+ for count in $(seq 0 $(expr $retries - 1))
+ do
+ errors=0
+ sleep 2 # Allow time for changes to take effect
+
+ echo "Check configuration values - Attempt#: $count"
+ # Check configuration value on Mon store (or the default) for the osd
+ local res=$(ceph config get osd.0 $opt) || return 1
+ echo "Mon db (or default): osd.0 $opt = $res"
+ if (( $(echo "$res == $opt_val_new" | bc -l) )); then
+ errors=$(expr $errors + 1)
+ fi
+
+ # Check running configuration value using "config show" cmd
+ res=$(ceph config show osd.0 | grep $opt |\
+ awk '{ print $2 }' | bc ) || return 1
+ echo "Running config: osd.0 $opt = $res"
+ if (( $(echo "$res == $opt_val_new" | bc -l) || \
+ $(echo "$res != $opt_val_orig" | bc -l) )); then
+ errors=$(expr $errors + 1)
+ fi
+
+ # Check value in the in-memory 'values' map is unmodified
+ res=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path \
+ osd.0) config get $opt | jq .$opt | bc)
+ echo "Values map: osd.0 $opt = $res"
+ if (( $(echo "$res == $opt_val_new" | bc -l) || \
+ $(echo "$res != $opt_val_orig" | bc -l) )); then
+ errors=$(expr $errors + 1)
+ fi
+
+ # Check if we succeeded or exhausted retry count
+ if [ $errors -eq 0 ]
+ then
+ break
+ elif [ $count -eq $(expr $retries - 1) ]
+ then
+ return 1
+ fi
+ done
done
teardown $dir || return 1
- debug_random
with_legacy: true
- name: osd_mclock_scheduler_client_res
- type: uint
+ type: float
level: advanced
- desc: IO proportion reserved for each client (default)
+ desc: IO proportion reserved for each client (default). The default value
+ of 0 specifies the lowest possible reservation. Any value greater than
+ 0 and up to 1.0 specifies the minimum IO proportion to reserve for each
+ client in terms of a fraction of the OSD's maximum IOPS capacity.
long_desc: Only considered for osd_op_queue = mclock_scheduler
fmt_desc: IO proportion reserved for each client (default).
- default: 1
+ default: 0
+ min: 0
+ max: 1.0
see_also:
- osd_op_queue
- name: osd_mclock_scheduler_client_wgt
see_also:
- osd_op_queue
- name: osd_mclock_scheduler_client_lim
- type: uint
+ type: float
level: advanced
- desc: IO limit for each client (default) over reservation
+ desc: IO limit for each client (default) over reservation. The default
+ value of 0 specifies no limit enforcement, which means each client can
+ use the maximum possible IOPS capacity of the OSD. Any value greater
+ than 0 and up to 1.0 specifies the upper IO limit over reservation
+ that each client receives in terms of a fraction of the OSD's
+ maximum IOPS capacity.
long_desc: Only considered for osd_op_queue = mclock_scheduler
fmt_desc: IO limit for each client (default) over reservation.
- default: 999999
+ default: 0
+ min: 0
+ max: 1.0
see_also:
- osd_op_queue
- name: osd_mclock_scheduler_background_recovery_res
- type: uint
+ type: float
level: advanced
- desc: IO proportion reserved for background recovery (default)
+ desc: IO proportion reserved for background recovery (default). The
+ default value of 0 specifies the lowest possible reservation. Any value
+ greater than 0 and up to 1.0 specifies the minimum IO proportion to
+ reserve for background recovery operations in terms of a fraction of
+ the OSD's maximum IOPS capacity.
long_desc: Only considered for osd_op_queue = mclock_scheduler
fmt_desc: IO proportion reserved for background recovery (default).
- default: 1
+ default: 0
+ min: 0
+ max: 1.0
see_also:
- osd_op_queue
- name: osd_mclock_scheduler_background_recovery_wgt
see_also:
- osd_op_queue
- name: osd_mclock_scheduler_background_recovery_lim
- type: uint
+ type: float
level: advanced
- desc: IO limit for background recovery over reservation
+ desc: IO limit for background recovery over reservation. The default
+ value of 0 specifies no limit enforcement, which means background
+ recovery operations can use the maximum possible IOPS capacity of the
+ OSD. Any value greater than 0 and up to 1.0 specifies the upper IO
+ limit over reservation that background recovery operations receive in
+ terms of a fraction of the OSD's maximum IOPS capacity.
long_desc: Only considered for osd_op_queue = mclock_scheduler
fmt_desc: IO limit for background recovery over reservation.
- default: 999999
+ default: 0
+ min: 0
+ max: 1.0
see_also:
- osd_op_queue
- name: osd_mclock_scheduler_background_best_effort_res
- type: uint
+ type: float
level: advanced
- desc: IO proportion reserved for background best_effort (default)
+ desc: IO proportion reserved for background best_effort (default). The
+ default value of 0 specifies the lowest possible reservation. Any value
+ greater than 0 and up to 1.0 specifies the minimum IO proportion to
+ reserve for background best_effort operations in terms of a fraction
+ of the OSD's maximum IOPS capacity.
long_desc: Only considered for osd_op_queue = mclock_scheduler
fmt_desc: IO proportion reserved for background best_effort (default).
- default: 1
+ default: 0
+ min: 0
+ max: 1.0
see_also:
- osd_op_queue
- name: osd_mclock_scheduler_background_best_effort_wgt
see_also:
- osd_op_queue
- name: osd_mclock_scheduler_background_best_effort_lim
- type: uint
+ type: float
level: advanced
- desc: IO limit for background best_effort over reservation
+ desc: IO limit for background best_effort over reservation. The default
+ value of 0 specifies no limit enforcement, which means background
+ best_effort operations can use the maximum possible IOPS capacity of the
+ OSD. Any value greater than 0 and up to 1.0 specifies the upper IO
+ limit over reservation that background best_effort operations receive
+ in terms of a fraction of the OSD's maximum IOPS capacity.
long_desc: Only considered for osd_op_queue = mclock_scheduler
fmt_desc: IO limit for background best_effort over reservation.
- default: 999999
+ default: 0
+ min: 0
+ max: 1.0
see_also:
- osd_op_queue
- name: osd_mclock_scheduler_anticipation_timeout
desc: mclock anticipation timeout in seconds
long_desc: the amount of time that mclock waits until the unused resource is forfeited
default: 0
-- name: osd_mclock_cost_per_io_usec
- type: float
- level: dev
- desc: Cost per IO in microseconds to consider per OSD (overrides _ssd and _hdd if
- non-zero)
- long_desc: This option specifies the cost factor to consider in usec per OSD. This
- is considered by the mclock scheduler to set an additional cost factor in QoS
- calculations. Only considered for osd_op_queue = mclock_scheduler
- fmt_desc: Cost per IO in microseconds to consider per OSD (overrides _ssd
- and _hdd if non-zero)
- default: 0
- flags:
- - runtime
-- name: osd_mclock_cost_per_io_usec_hdd
- type: float
- level: dev
- desc: Cost per IO in microseconds to consider per OSD (for rotational media)
- long_desc: This option specifies the cost factor to consider in usec per OSD for
- rotational device type. This is considered by the mclock_scheduler to set an additional
- cost factor in QoS calculations. Only considered for osd_op_queue = mclock_scheduler
- fmt_desc: Cost per IO in microseconds to consider per OSD (for rotational
- media)
- default: 11400
- flags:
- - runtime
-- name: osd_mclock_cost_per_io_usec_ssd
- type: float
- level: dev
- desc: Cost per IO in microseconds to consider per OSD (for solid state media)
- long_desc: This option specifies the cost factor to consider in usec per OSD for
- solid state device type. This is considered by the mclock_scheduler to set an
- additional cost factor in QoS calculations. Only considered for osd_op_queue =
- mclock_scheduler
- fmt_desc: Cost per IO in microseconds to consider per OSD (for solid state
- media)
- default: 50
- flags:
- - runtime
-- name: osd_mclock_cost_per_byte_usec
- type: float
- level: dev
- desc: Cost per byte in microseconds to consider per OSD (overrides _ssd and _hdd
- if non-zero)
- long_desc: This option specifies the cost per byte to consider in microseconds per
- OSD. This is considered by the mclock scheduler to set an additional cost factor
- in QoS calculations. Only considered for osd_op_queue = mclock_scheduler
- fmt_desc: Cost per byte in microseconds to consider per OSD (overrides _ssd
- and _hdd if non-zero)
- default: 0
- flags:
- - runtime
-- name: osd_mclock_cost_per_byte_usec_hdd
- type: float
- level: dev
- desc: Cost per byte in microseconds to consider per OSD (for rotational media)
- long_desc: This option specifies the cost per byte to consider in microseconds per
- OSD for rotational device type. This is considered by the mclock_scheduler to
- set an additional cost factor in QoS calculations. Only considered for osd_op_queue
- = mclock_scheduler
- fmt_desc: Cost per byte in microseconds to consider per OSD (for rotational
- media)
- default: 2.6
+- name: osd_mclock_max_sequential_bandwidth_hdd
+ type: size
+ level: basic
+ desc: The maximum sequential bandwidth in bytes/second of the OSD (for
+ rotational media)
+ long_desc: This option specifies the maximum sequential bandwidth to consider
+ for an OSD whose underlying device type is rotational media. This is
+ considered by the mclock scheduler to derive the cost factor to be used in
+ QoS calculations. Only considered for osd_op_queue = mclock_scheduler
+ fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the
+ OSD (for rotational media)
+ default: 150_M
flags:
- runtime
-- name: osd_mclock_cost_per_byte_usec_ssd
- type: float
- level: dev
- desc: Cost per byte in microseconds to consider per OSD (for solid state media)
- long_desc: This option specifies the cost per byte to consider in microseconds per
- OSD for solid state device type. This is considered by the mclock_scheduler to
- set an additional cost factor in QoS calculations. Only considered for osd_op_queue
- = mclock_scheduler
- fmt_desc: Cost per byte in microseconds to consider per OSD (for solid state
- media)
- default: 0.011
+- name: osd_mclock_max_sequential_bandwidth_ssd
+ type: size
+ level: basic
+ desc: The maximum sequential bandwidth in bytes/second of the OSD (for
+ solid state media)
+ long_desc: This option specifies the maximum sequential bandwidth to consider
+ for an OSD whose underlying device type is solid state media. This is
+ considered by the mclock scheduler to derive the cost factor to be used in
+ QoS calculations. Only considered for osd_op_queue = mclock_scheduler
+ fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the
+ OSD (for solid state media)
+ default: 750_M
flags:
- runtime
- name: osd_mclock_max_capacity_iops_hdd
type: float
level: basic
- desc: Max IOPs capacity (at 4KiB block size) to consider per OSD (for rotational
- media)
- long_desc: This option specifies the max OSD capacity in iops per OSD. Helps in
- QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue
- = mclock_scheduler
- fmt_desc: Max IOPS capacity (at 4KiB block size) to consider per OSD (for
- rotational media)
+ desc: Max random write IOPS capacity (at 4 KiB block size) to consider per OSD
+ (for rotational media)
+ long_desc: This option specifies the maximum random write IOPS capacity per
+ OSD. Contributes to QoS calculations when enabling a dmclock profile. Only
+ considered for osd_op_queue = mclock_scheduler
+ fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per
+ OSD (for rotational media)
default: 315
flags:
- runtime
- name: osd_mclock_max_capacity_iops_ssd
type: float
level: basic
- desc: Max IOPs capacity (at 4KiB block size) to consider per OSD (for solid state
- media)
- long_desc: This option specifies the max OSD capacity in iops per OSD. Helps in
- QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue
- = mclock_scheduler
- fmt_desc: Max IOPS capacity (at 4KiB block size) to consider per OSD (for
- solid state media)
+ desc: Max random write IOPS capacity (at 4 KiB block size) to consider per OSD
+ (for solid state media)
+ long_desc: This option specifies the maximum random write IOPS capacity per
+ OSD. Contributes to QoS calculations when enabling a dmclock profile. Only
+ considered for osd_op_queue = mclock_scheduler
+ fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per
+ OSD (for solid state media)
default: 21500
flags:
- runtime
utime_t start_time;
uint64_t owner; ///< global id (e.g., client.XXX)
epoch_t map_epoch; ///< an epoch we expect the PG to exist in
- int qos_cost; ///< scaled cost calculated by the mclock scheduler
+ uint32_t qos_cost; ///< scaled cost calculated by the mclock scheduler
bool qos_item; ///< set to true if item is scheduled by mclock scheduler
public:
return qos_item;
}
- void set_qos_cost(int scaled_cost) {
+ void set_qos_cost(uint32_t scaled_cost) {
qos_cost = scaled_cost;
}
- int get_qos_cost() const {
+ uint32_t get_qos_cost() const {
return qos_cost;
}
{
cct->_conf.add_observer(this);
ceph_assert(num_shards > 0);
- set_max_osd_capacity();
- set_osd_mclock_cost_per_io();
- set_osd_mclock_cost_per_byte();
+ set_osd_capacity_params_from_config();
set_mclock_profile();
enable_mclock_profile_settings();
- client_registry.update_from_config(cct->_conf);
+ client_registry.update_from_config(
+ cct->_conf, osd_bandwidth_capacity_per_shard);
}
-void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf)
+/* ClientRegistry holds the dmclock::ClientInfo configuration parameters
+ * (reservation (bytes/second), weight (unitless), limit (bytes/second))
+ * for each IO class in the OSD (client, background_recovery,
+ * background_best_effort).
+ *
+ * mclock expects limit and reservation to have units of <cost>/second
+ * (bytes/second), but osd_mclock_scheduler_client_(lim|res) are provided
+ * as ratios of the OSD's capacity. We convert from the one to the other
+ * using the capacity_per_shard parameter.
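+ *
+ * For example (illustrative numbers): with a per-shard capacity of
+ * 100000000 bytes/second, an osd_mclock_scheduler_client_res of 0.4
+ * resolves to a client reservation of 40000000 bytes/second on that shard.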
+ *
+ * Note, mclock profile information will already have been set as a default
+ * for the osd_mclock_scheduler_client_* parameters prior to calling
+ * update_from_config -- see set_config_defaults_from_profile().
+ */
+void mClockScheduler::ClientRegistry::update_from_config(
+ const ConfigProxy &conf,
+ const double capacity_per_shard)
{
- default_external_client_info.update(
- conf.get_val<uint64_t>("osd_mclock_scheduler_client_res"),
- conf.get_val<uint64_t>("osd_mclock_scheduler_client_wgt"),
- conf.get_val<uint64_t>("osd_mclock_scheduler_client_lim"));
+ auto get_res = [&](double res) {
+ if (res) {
+ return res * capacity_per_shard;
+ } else {
+ return default_min; // min reservation
+ }
+ };
+
+ auto get_lim = [&](double lim) {
+ if (lim) {
+ return lim * capacity_per_shard;
+ } else {
+ return default_max; // high limit
+ }
+ };
+
+ // Set external client infos
+ double res = conf.get_val<double>(
+ "osd_mclock_scheduler_client_res");
+ double lim = conf.get_val<double>(
+ "osd_mclock_scheduler_client_lim");
+ uint64_t wgt = conf.get_val<uint64_t>(
+ "osd_mclock_scheduler_client_wgt");
+ default_external_client_info.update(
+ get_res(res),
+ wgt,
+ get_lim(lim));
+
+ // Set background recovery client infos
+ res = conf.get_val<double>(
+ "osd_mclock_scheduler_background_recovery_res");
+ lim = conf.get_val<double>(
+ "osd_mclock_scheduler_background_recovery_lim");
+ wgt = conf.get_val<uint64_t>(
+ "osd_mclock_scheduler_background_recovery_wgt");
internal_client_infos[
static_cast<size_t>(op_scheduler_class::background_recovery)].update(
- conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_res"),
- conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_wgt"),
- conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_lim"));
-
+ get_res(res),
+ wgt,
+ get_lim(lim));
+
+ // Set background best effort client infos
+ res = conf.get_val<double>(
+ "osd_mclock_scheduler_background_best_effort_res");
+ lim = conf.get_val<double>(
+ "osd_mclock_scheduler_background_best_effort_lim");
+ wgt = conf.get_val<uint64_t>(
+ "osd_mclock_scheduler_background_best_effort_wgt");
internal_client_infos[
static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
- conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_res"),
- conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_wgt"),
- conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_lim"));
+ get_res(res),
+ wgt,
+ get_lim(lim));
}
const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client(
}
}
-void mClockScheduler::set_max_osd_capacity()
+void mClockScheduler::set_osd_capacity_params_from_config()
{
- if (is_rotational) {
- max_osd_capacity =
- cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_hdd");
- cct->_conf.set_val("osd_mclock_max_capacity_iops_ssd", "0");
- } else {
- max_osd_capacity =
- cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd");
- cct->_conf.set_val("osd_mclock_max_capacity_iops_hdd", "0");
- }
- // Set per op-shard iops limit
- max_osd_capacity /= num_shards;
- dout(1) << __func__ << " #op shards: " << num_shards
- << std::fixed << std::setprecision(2)
- << " max osd capacity(iops) per shard: " << max_osd_capacity
- << dendl;
-}
+ uint64_t osd_bandwidth_capacity;
+ double osd_iop_capacity;
-void mClockScheduler::set_osd_mclock_cost_per_io()
-{
- std::chrono::seconds sec(1);
- if (cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec")) {
- osd_mclock_cost_per_io =
- cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec");
- } else {
+ std::tie(osd_bandwidth_capacity, osd_iop_capacity) = [&, this] {
if (is_rotational) {
- osd_mclock_cost_per_io =
- cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_hdd");
- // For HDDs, convert value to seconds
- osd_mclock_cost_per_io /= std::chrono::microseconds(sec).count();
+ return std::make_tuple(
+ cct->_conf.get_val<Option::size_t>(
+ "osd_mclock_max_sequential_bandwidth_hdd"),
+ cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_hdd"));
} else {
- // For SSDs, convert value to milliseconds
- osd_mclock_cost_per_io =
- cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_ssd");
- osd_mclock_cost_per_io /= std::chrono::milliseconds(sec).count();
+ return std::make_tuple(
+ cct->_conf.get_val<Option::size_t>(
+ "osd_mclock_max_sequential_bandwidth_ssd"),
+ cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd"));
}
- }
- dout(1) << __func__ << " osd_mclock_cost_per_io: "
- << std::fixed << std::setprecision(7) << osd_mclock_cost_per_io
- << dendl;
-}
+ }();
-void mClockScheduler::set_osd_mclock_cost_per_byte()
-{
- std::chrono::seconds sec(1);
- if (cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec")) {
- osd_mclock_cost_per_byte =
- cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec");
- } else {
- if (is_rotational) {
- osd_mclock_cost_per_byte =
- cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_hdd");
- // For HDDs, convert value to seconds
- osd_mclock_cost_per_byte /= std::chrono::microseconds(sec).count();
- } else {
- osd_mclock_cost_per_byte =
- cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_ssd");
- // For SSDs, convert value to milliseconds
- osd_mclock_cost_per_byte /= std::chrono::milliseconds(sec).count();
- }
- }
- dout(1) << __func__ << " osd_mclock_cost_per_byte: "
- << std::fixed << std::setprecision(7) << osd_mclock_cost_per_byte
+ osd_bandwidth_capacity = std::max<uint64_t>(1, osd_bandwidth_capacity);
+ osd_iop_capacity = std::max<double>(1.0, osd_iop_capacity);
+
+ osd_bandwidth_cost_per_io =
+ static_cast<double>(osd_bandwidth_capacity) / osd_iop_capacity;
+ osd_bandwidth_capacity_per_shard = static_cast<double>(osd_bandwidth_capacity)
+ / static_cast<double>(num_shards);
+
+ dout(1) << __func__ << ": osd_bandwidth_cost_per_io: "
+ << std::fixed << std::setprecision(2)
+ << osd_bandwidth_cost_per_io << " bytes/io"
+ << ", osd_bandwidth_capacity_per_shard "
+ << osd_bandwidth_capacity_per_shard << " bytes/second"
<< dendl;
}
return mclock_profile;
}
+// Sets allocations for 'balanced' mClock profile
+//
+// min and max specification:
+// 0 (min): specifies no minimum reservation
+// 0 (max): specifies no upper limit
+//
+// Client Allocation:
+// reservation: 40% | weight: 1 | limit: 100% |
+// Background Recovery Allocation:
+// reservation: 40% | weight: 1 | limit: 70% |
+// Background Best Effort Allocation:
+// reservation: 20% | weight: 1 | limit: 0 (max) |
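+//
+// These fractional allocations are stored in client_allocs; set_profile_config()
+// writes them into the osd_mclock_scheduler_* option defaults, and
+// ClientRegistry::update_from_config() later scales them by the per-shard
+// bandwidth capacity into absolute bytes/second values.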
void mClockScheduler::set_balanced_profile_allocations()
{
- // Client Allocation:
- // reservation: 40% | weight: 1 | limit: 100% |
- // Background Recovery Allocation:
- // reservation: 40% | weight: 1 | limit: 150% |
- // Background Best Effort Allocation:
- // reservation: 20% | weight: 2 | limit: max |
-
- // Client
- uint64_t client_res = static_cast<uint64_t>(
- std::round(0.40 * max_osd_capacity));
- uint64_t client_lim = static_cast<uint64_t>(
- std::round(max_osd_capacity));
- uint64_t client_wgt = default_min;
-
- // Background Recovery
- uint64_t rec_res = static_cast<uint64_t>(
- std::round(0.40 * max_osd_capacity));
- uint64_t rec_lim = static_cast<uint64_t>(
- std::round(1.5 * max_osd_capacity));
- uint64_t rec_wgt = default_min;
-
- // Background Best Effort
- uint64_t best_effort_res = static_cast<uint64_t>(
- std::round(0.20 * max_osd_capacity));
- uint64_t best_effort_lim = default_max;
- uint64_t best_effort_wgt = 2;
-
- // Set the allocations for the mclock clients
+ // Set [res, wgt, lim] in that order for each mClock client class.
client_allocs[
static_cast<size_t>(op_scheduler_class::client)].update(
- client_res,
- client_wgt,
- client_lim);
+ 0.4, 1.0, 1.0);
client_allocs[
static_cast<size_t>(op_scheduler_class::background_recovery)].update(
- rec_res,
- rec_wgt,
- rec_lim);
+ 0.4, 1.0, 0.7);
client_allocs[
static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
- best_effort_res,
- best_effort_wgt,
- best_effort_lim);
+ 0.2, 1.0, 0.0);
}
+// Sets allocations for 'high_recovery_ops' mClock profile
+//
+// min and max specification:
+// 0 (min): specifies no minimum reservation
+// 0 (max): specifies no upper limit
+//
+// Client Allocation:
+// reservation: 30% | weight: 1 | limit: 80% |
+// Background Recovery Allocation:
+// reservation: 60% | weight: 2 | limit: 0 (max) |
+// Background Best Effort Allocation:
+// reservation: 0 (min) | weight: 1 | limit: 0 (max) |
void mClockScheduler::set_high_recovery_ops_profile_allocations()
{
- // Client Allocation:
- // reservation: 30% | weight: 1 | limit: 80% |
- // Background Recovery Allocation:
- // reservation: 60% | weight: 2 | limit: 200% |
- // Background Best Effort Allocation:
- // reservation: 1 | weight: 2 | limit: max |
-
- // Client
- uint64_t client_res = static_cast<uint64_t>(
- std::round(0.30 * max_osd_capacity));
- uint64_t client_lim = static_cast<uint64_t>(
- std::round(0.80 * max_osd_capacity));
- uint64_t client_wgt = default_min;
-
- // Background Recovery
- uint64_t rec_res = static_cast<uint64_t>(
- std::round(0.60 * max_osd_capacity));
- uint64_t rec_lim = static_cast<uint64_t>(
- std::round(2.0 * max_osd_capacity));
- uint64_t rec_wgt = 2;
-
- // Background Best Effort
- uint64_t best_effort_res = default_min;
- uint64_t best_effort_lim = default_max;
- uint64_t best_effort_wgt = 2;
-
- // Set the allocations for the mclock clients
+ // Set [res, wgt, lim] in that order for each mClock client class.
client_allocs[
static_cast<size_t>(op_scheduler_class::client)].update(
- client_res,
- client_wgt,
- client_lim);
+ 0.3, 1.0, 0.8);
client_allocs[
static_cast<size_t>(op_scheduler_class::background_recovery)].update(
- rec_res,
- rec_wgt,
- rec_lim);
+ 0.6, 2.0, 0.0);
client_allocs[
static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
- best_effort_res,
- best_effort_wgt,
- best_effort_lim);
+ 0.0, 1.0, 0.0);
}
+// Sets allocations for 'high_client_ops' mClock profile
+//
+// min and max specification:
+// 0 (min): specifies no minimum reservation
+// 0 (max): specifies no upper limit
+//
+// Client Allocation:
+// reservation: 60% | weight: 5 | limit: 0 (max) |
+// Background Recovery Allocation:
+// reservation: 20% | weight: 1 | limit: 50% |
+// Background Best Effort Allocation:
+// reservation: 20% | weight: 1 | limit: 0 (max) |
void mClockScheduler::set_high_client_ops_profile_allocations()
{
- // Client Allocation:
- // reservation: 50% | weight: 2 | limit: max |
- // Background Recovery Allocation:
- // reservation: 25% | weight: 1 | limit: 100% |
- // Background Best Effort Allocation:
- // reservation: 25% | weight: 2 | limit: max |
-
- // Client
- uint64_t client_res = static_cast<uint64_t>(
- std::round(0.50 * max_osd_capacity));
- uint64_t client_wgt = 2;
- uint64_t client_lim = default_max;
-
- // Background Recovery
- uint64_t rec_res = static_cast<uint64_t>(
- std::round(0.25 * max_osd_capacity));
- uint64_t rec_lim = static_cast<uint64_t>(
- std::round(max_osd_capacity));
- uint64_t rec_wgt = default_min;
-
- // Background Best Effort
- uint64_t best_effort_res = static_cast<uint64_t>(
- std::round(0.25 * max_osd_capacity));
- uint64_t best_effort_lim = default_max;
- uint64_t best_effort_wgt = 2;
-
- // Set the allocations for the mclock clients
+ // Set [res, wgt, lim] in that order for each mClock client class.
client_allocs[
static_cast<size_t>(op_scheduler_class::client)].update(
- client_res,
- client_wgt,
- client_lim);
+ 0.6, 5.0, 0.0);
client_allocs[
static_cast<size_t>(op_scheduler_class::background_recovery)].update(
- rec_res,
- rec_wgt,
- rec_lim);
+ 0.2, 1.0, 0.5);
client_allocs[
static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
- best_effort_res,
- best_effort_wgt,
- best_effort_lim);
+ 0.2, 1.0, 0.0);
}
void mClockScheduler::enable_mclock_profile_settings()
cct->_conf.set_val_default("osd_mclock_scheduler_client_res",
std::to_string(client.res));
cct->_conf.set_val_default("osd_mclock_scheduler_client_wgt",
- std::to_string(client.wgt));
+ std::to_string(uint64_t(client.wgt)));
cct->_conf.set_val_default("osd_mclock_scheduler_client_lim",
std::to_string(client.lim));
dout(10) << __func__ << " client QoS params: " << "["
cct->_conf.set_val_default("osd_mclock_scheduler_background_recovery_res",
std::to_string(rec.res));
cct->_conf.set_val_default("osd_mclock_scheduler_background_recovery_wgt",
- std::to_string(rec.wgt));
+ std::to_string(uint64_t(rec.wgt)));
cct->_conf.set_val_default("osd_mclock_scheduler_background_recovery_lim",
std::to_string(rec.lim));
dout(10) << __func__ << " Recovery QoS params: " << "["
cct->_conf.set_val_default("osd_mclock_scheduler_background_best_effort_res",
std::to_string(best_effort.res));
cct->_conf.set_val_default("osd_mclock_scheduler_background_best_effort_wgt",
- std::to_string(best_effort.wgt));
+ std::to_string(uint64_t(best_effort.wgt)));
cct->_conf.set_val_default("osd_mclock_scheduler_background_best_effort_lim",
std::to_string(best_effort.lim));
dout(10) << __func__ << " Best effort QoS params: " << "["
update_configuration();
}
-int mClockScheduler::calc_scaled_cost(int item_cost)
+uint32_t mClockScheduler::calc_scaled_cost(int item_cost)
{
- // Calculate total scaled cost in secs
- int scaled_cost =
- std::round(osd_mclock_cost_per_io + (osd_mclock_cost_per_byte * item_cost));
- return std::max(scaled_cost, 1);
+ auto cost = static_cast<uint32_t>(
+ std::max<int>(
+ 1, // ensure cost is non-zero and positive
+ item_cost));
+ auto cost_per_io = static_cast<uint32_t>(osd_bandwidth_cost_per_io);
+
+ // Calculate total scaled cost in bytes
+ return cost_per_io + cost;
}
void mClockScheduler::update_configuration()
} else if (priority >= cutoff) {
enqueue_high(priority, std::move(item));
} else {
- int cost = calc_scaled_cost(item.get_cost());
+ auto cost = calc_scaled_cost(item.get_cost());
item.set_qos_cost(cost);
dout(20) << __func__ << " " << id
<< " item_cost: " << item.get_cost()
"osd_mclock_scheduler_background_best_effort_res",
"osd_mclock_scheduler_background_best_effort_wgt",
"osd_mclock_scheduler_background_best_effort_lim",
- "osd_mclock_cost_per_io_usec",
- "osd_mclock_cost_per_io_usec_hdd",
- "osd_mclock_cost_per_io_usec_ssd",
- "osd_mclock_cost_per_byte_usec",
- "osd_mclock_cost_per_byte_usec_hdd",
- "osd_mclock_cost_per_byte_usec_ssd",
"osd_mclock_max_capacity_iops_hdd",
"osd_mclock_max_capacity_iops_ssd",
+ "osd_mclock_max_sequential_bandwidth_hdd",
+ "osd_mclock_max_sequential_bandwidth_ssd",
"osd_mclock_profile",
NULL
};
const ConfigProxy& conf,
const std::set<std::string> &changed)
{
- if (changed.count("osd_mclock_cost_per_io_usec") ||
- changed.count("osd_mclock_cost_per_io_usec_hdd") ||
- changed.count("osd_mclock_cost_per_io_usec_ssd")) {
- set_osd_mclock_cost_per_io();
- }
- if (changed.count("osd_mclock_cost_per_byte_usec") ||
- changed.count("osd_mclock_cost_per_byte_usec_hdd") ||
- changed.count("osd_mclock_cost_per_byte_usec_ssd")) {
- set_osd_mclock_cost_per_byte();
- }
if (changed.count("osd_mclock_max_capacity_iops_hdd") ||
changed.count("osd_mclock_max_capacity_iops_ssd")) {
- set_max_osd_capacity();
+ set_osd_capacity_params_from_config();
if (mclock_profile != "custom") {
enable_mclock_profile_settings();
- client_registry.update_from_config(conf);
}
+ client_registry.update_from_config(
+ conf, osd_bandwidth_capacity_per_shard);
+ }
+ if (changed.count("osd_mclock_max_sequential_bandwidth_hdd") ||
+ changed.count("osd_mclock_max_sequential_bandwidth_ssd")) {
+ set_osd_capacity_params_from_config();
+ client_registry.update_from_config(
+ conf, osd_bandwidth_capacity_per_shard);
}
if (changed.count("osd_mclock_profile")) {
set_mclock_profile();
if (mclock_profile != "custom") {
enable_mclock_profile_settings();
- client_registry.update_from_config(conf);
+ client_registry.update_from_config(
+ conf, osd_bandwidth_capacity_per_shard);
}
}
if (auto key = get_changed_key(); key.has_value()) {
if (mclock_profile == "custom") {
- client_registry.update_from_config(conf);
+ client_registry.update_from_config(
+ conf, osd_bandwidth_capacity_per_shard);
} else {
// Attempt to change QoS parameter for a built-in profile. Restore the
// profile defaults by making one of the OSD shards remove the key from
namespace ceph::osd::scheduler {
-constexpr uint64_t default_min = 1;
-constexpr uint64_t default_max = 999999;
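+// default_min is substituted as the reservation handed to dmclock when a
+// configured reservation ratio is 0 (i.e. no reservation); default_max is
+// substituted as the limit when a configured limit ratio is 0 (i.e. no
+// enforced limit).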
+constexpr double default_min = 1.0;
+constexpr double default_max = std::numeric_limits<double>::is_iec559 ?
+ std::numeric_limits<double>::infinity() :
+ std::numeric_limits<double>::max();
using client_id_t = uint64_t;
using profile_id_t = uint64_t;
const int shard_id;
bool is_rotational;
MonClient *monc;
- double max_osd_capacity;
- double osd_mclock_cost_per_io;
- double osd_mclock_cost_per_byte;
+
std::string mclock_profile = "high_client_ops";
struct ClientAllocs {
- uint64_t res;
- uint64_t wgt;
- uint64_t lim;
+ double res;
+ double wgt;
+ double lim;
- ClientAllocs(uint64_t _res, uint64_t _wgt, uint64_t _lim) {
+ ClientAllocs(double _res, double _wgt, double _lim) {
update(_res, _wgt, _lim);
}
- inline void update(uint64_t _res, uint64_t _wgt, uint64_t _lim) {
+ inline void update(double _res, double _wgt, double _lim) {
res = _res;
wgt = _wgt;
lim = _lim;
static_cast<size_t>(op_scheduler_class::client) + 1
> client_allocs = {
// Placeholder, get replaced with configured values
- ClientAllocs(1, 1, 1), // background_recovery
- ClientAllocs(1, 1, 1), // background_best_effort
- ClientAllocs(1, 1, 1), // immediate (not used)
- ClientAllocs(1, 1, 1) // client
+ ClientAllocs(0, 1, 0), // background_recovery
+ ClientAllocs(0, 1, 0), // background_best_effort
+ ClientAllocs(0, 1, 0), // immediate (not used)
+ ClientAllocs(0, 1, 0) // client
};
+
+ /**
+ * osd_bandwidth_cost_per_io
+ *
+ * mClock expects all queued items to have a uniform expression of
+ * "cost". However, IO devices generally have quite different capacity
+ * for sequential IO vs small random IO. This implementation handles this
+ * by expressing all costs as a number of sequential bytes written, adding an
+ * additional cost for each random IO equal to osd_bandwidth_cost_per_io.
+ *
+ * Thus, an IO operation requiring a total of <size> bytes to be written
+ * across <iops> different locations will have a cost of
+ * <size> + (osd_bandwidth_cost_per_io * <iops>) bytes.
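+ *
+ * As an illustration (hypothetical numbers): with a sequential bandwidth of
+ * 300000000 bytes/second and a capacity of 300 IOPS,
+ * osd_bandwidth_cost_per_io is 1000000 bytes/io, so a single 4096-byte
+ * random IO is charged 1004096 bytes.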
+ *
+ * Set in set_osd_capacity_params_from_config in the constructor and upon
+ * config change.
+ *
+ * Has units bytes/io.
+ */
+ double osd_bandwidth_cost_per_io;
+
+ /**
+ * osd_bandwidth_capacity_per_shard
+ *
+ * mClock expects reservation and limit parameters to be expressed in units
+ * of cost/second -- which means bytes/second for this implementation.
+ *
+ * Rather than expecting users to compute appropriate limit and reservation
+ * values for each class of OSDs in their cluster, we instead express
+ * reservation and limit parameters as ratios of the OSD's maximum capacity.
+ * osd_bandwidth_capacity_per_shard is that capacity divided by the number
+ * of shards.
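+ *
+ * For example (illustrative numbers): a configured maximum bandwidth of
+ * 500000000 bytes/second split across 5 op shards gives an
+ * osd_bandwidth_capacity_per_shard of 100000000 bytes/second.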
+ *
+ * Set in set_osd_capacity_params_from_config in the constructor and upon
+ * config change.
+ *
+ * This value gets passed to ClientRegistry::update_from_config in order
+ * to resolve the full reservation and limit parameters for mclock from
+ * the configured ratios.
+ *
+ * Has units bytes/second.
+ */
+ double osd_bandwidth_capacity_per_shard;
+
class ClientRegistry {
std::array<
crimson::dmclock::ClientInfo,
const crimson::dmclock::ClientInfo *get_external_client(
const client_profile_id_t &client) const;
public:
- void update_from_config(const ConfigProxy &conf);
+ /**
+ * update_from_config
+ *
+ * Sets the mclock parameters (reservation, weight, and limit)
+ * for each class of IO (background_recovery, background_best_effort,
+ * and client).
+ */
+ void update_from_config(
+ const ConfigProxy &conf,
+ double capacity_per_shard);
const crimson::dmclock::ClientInfo *get_info(
const scheduler_id_t &id) const;
} client_registry;
}
}
+ /**
+ * set_osd_capacity_params_from_config
+ *
+ * mClockScheduler uses two parameters, osd_bandwidth_cost_per_io
+ * and osd_bandwidth_capacity_per_shard, internally. These two
+ * parameters are derived from config parameters
+ * osd_mclock_max_capacity_iops_(hdd|ssd) and
+ * osd_mclock_max_sequential_bandwidth_(hdd|ssd) as well as num_shards.
+ * Invoking set_osd_capacity_params_from_config() recomputes those derived
+ * params from the current config; it is called from the constructor and
+ * must be called again whenever any of those options is modified. See
+ * handle_conf_change().
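+ *
+ * Specifically:
+ *   osd_bandwidth_cost_per_io = max_sequential_bandwidth / max_capacity_iops
+ *   osd_bandwidth_capacity_per_shard = max_sequential_bandwidth / num_shards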
+ */
+ void set_osd_capacity_params_from_config();
+
public:
mClockScheduler(CephContext *cct, int whoami, uint32_t num_shards,
int shard_id, bool is_rotational, MonClient *monc);
~mClockScheduler() override;
- // Set the max osd capacity in iops
- void set_max_osd_capacity();
-
- // Set the cost per io for the osd
- void set_osd_mclock_cost_per_io();
-
- // Set the cost per byte for the osd
- void set_osd_mclock_cost_per_byte();
-
// Set the mclock profile type to enable
void set_mclock_profile();
// Set mclock config parameter based on allocations
void set_profile_config();
- // Calculate scale cost per item
- int calc_scaled_cost(int cost);
+ /// Calculate scaled cost per item
+ uint32_t calc_scaled_cost(int cost);
// Helper method to display mclock queues
std::string display_queues() const;