osd: Modify mClock scheduler's cost model to represent cost in bytes

author Sridhar Seshasayee <sseshasa@redhat.com>

Thu, 9 Feb 2023 15:17:44 +0000 (20:47 +0530)

committer Sridhar Seshasayee <sseshasa@redhat.com>

Thu, 27 Apr 2023 13:11:38 +0000 (18:41 +0530)
author Sridhar Seshasayee <sseshasa@redhat.com>
Thu, 9 Feb 2023 15:17:44 +0000 (20:47 +0530)
committer Sridhar Seshasayee <sseshasa@redhat.com>
Thu, 27 Apr 2023 13:11:38 +0000 (18:41 +0530)
diff --git a/qa/standalone/misc/mclock-config.sh b/qa/standalone/misc/mclock-config.sh

index 17260b100ae7fc209578654e40794eb79a1c983b..d16cd45f43fa6873a8afc97b8cdba2629d0a5b7d 100755 (executable)
--- a/qa/standalone/misc/mclock-config.sh
+++ b/qa/standalone/misc/mclock-config.sh
@@ -37,7 +37,7 @@ function run() {
  
  function TEST_profile_builtin_to_custom() {
      local dir=$1
-    local OSDS=3
+    local OSDS=1
  
      setup $dir || return 1
      run_mon $dir a --osd_pool_default_size=$OSDS || return 1
@@ -69,7 +69,7 @@ function TEST_profile_builtin_to_custom() {
        osd.$id) config get osd_mclock_scheduler_client_res | \
        jq .osd_mclock_scheduler_client_res | bc)
      echo "client_res = $client_res"
-    local client_res_new=$(expr $client_res + 10)
+    local client_res_new=$(echo "$client_res + 0.1" | bc -l)
      echo "client_res_new = $client_res_new"
      ceph config set osd osd_mclock_scheduler_client_res \
        $client_res_new || return 1
@@ -78,12 +78,16 @@ function TEST_profile_builtin_to_custom() {
        # Check value in config monitor db
        local res=$(ceph config get osd.$id \
          osd_mclock_scheduler_client_res) || return 1
-      test $res -eq $client_res_new || return 1
+      if (( $(echo "$res != $client_res_new" | bc -l) )); then
+        return 1
+      fi
        # Check value in the in-memory 'values' map
        res=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path \
          osd.$id) config get osd_mclock_scheduler_client_res | \
          jq .osd_mclock_scheduler_client_res | bc)
-      test $res -eq $client_res_new || return 1
+      if (( $(echo "$res != $client_res_new" | bc -l) )); then
+        return 1
+      fi
      done
  
      teardown $dir || return 1
@@ -91,7 +95,7 @@ function TEST_profile_builtin_to_custom() {
  
  function TEST_profile_custom_to_builtin() {
      local dir=$1
-    local OSDS=3
+    local OSDS=1
  
      setup $dir || return 1
      run_mon $dir a --osd_pool_default_size=$OSDS || return 1
@@ -129,7 +133,7 @@ function TEST_profile_custom_to_builtin() {
      done
  
      # Change a mclock config param and confirm the change
-    local client_res_new=$(expr ${client_res[0]} + 10)
+    local client_res_new=$(echo "${client_res[0]} + 0.1" | bc -l)
      echo "client_res_new = $client_res_new"
      ceph config set osd osd_mclock_scheduler_client_res \
        $client_res_new || return 1
@@ -138,12 +142,16 @@ function TEST_profile_custom_to_builtin() {
        # Check value in config monitor db
        local res=$(ceph config get osd.$id \
          osd_mclock_scheduler_client_res) || return 1
-      test $res -eq $client_res_new || return 1
+      if (( $(echo "$res != $client_res_new" | bc -l) )); then
+        return 1
+      fi
        # Check value in the in-memory 'values' map
        res=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path \
          osd.$id) config get osd_mclock_scheduler_client_res | \
          jq .osd_mclock_scheduler_client_res | bc)
-      test $res -eq $client_res_new || return 1
+      if (( $(echo "$res != $client_res_new" | bc -l) )); then
+        return 1
+      fi
      done
  
      # Switch the mclock profile back to the original built-in profile.
@@ -166,12 +174,16 @@ function TEST_profile_custom_to_builtin() {
        # Check value in config monitor db
        local res=$(ceph config get osd.$id \
          osd_mclock_scheduler_client_res) || return 1
-      test $res -eq $client_res_new || return 1
+      if (( $(echo "$res != $client_res_new" | bc -l) )); then
+        return 1
+      fi
        # Check value in the in-memory 'values' map
        res=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path \
          osd.$id) config get osd_mclock_scheduler_client_res | \
          jq .osd_mclock_scheduler_client_res | bc)
-      test $res -eq $client_res_new || return 1
+      if (( $(echo "$res != $client_res_new" | bc -l) )); then
+        return 1
+      fi
      done
  
      # Remove the changed QoS config option from monitor db
@@ -184,7 +196,9 @@ function TEST_profile_custom_to_builtin() {
        res=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path \
          osd.$id) config get osd_mclock_scheduler_client_res | \
          jq .osd_mclock_scheduler_client_res | bc)
-      test $res -eq ${client_res[$id]} || return 1
+      if (( $(echo "$res != ${client_res[$id]}" | bc -l) )); then
+        return 1
+      fi
      done
  
      teardown $dir || return 1
@@ -274,33 +288,57 @@ function TEST_profile_disallow_builtin_params_modify() {
      declare -a options=("osd_mclock_scheduler_background_recovery_res"
        "osd_mclock_scheduler_client_res")
  
+    local retries=10
+    local errors=0
      for opt in "${options[@]}"
      do
        # Try and change a mclock config param and confirm that no change occurred
        local opt_val_orig=$(CEPH_ARGS='' ceph --format=json daemon \
          $(get_asok_path osd.0) config get $opt | jq .$opt | bc)
-      local opt_val_new=$(expr $opt_val_orig + 10)
+      local opt_val_new=$(echo "$opt_val_orig + 0.1" | bc -l)
        ceph config set osd.0 $opt $opt_val_new || return 1
-      sleep 2 # Allow time for changes to take effect
  
-      # Check configuration value on Mon store (or the default) for the osd
-      local res=$(ceph config get osd.0 $opt) || return 1
-      echo "Mon db (or default): osd.0 $opt = $res"
-      test $res -ne $opt_val_new || return 1
-
-      # Check running configuration value using "config show" cmd
-      res=$(ceph config show osd.0 | grep $opt |\
-        awk '{ print $2 }' | bc ) || return 1
-      echo "Running config: osd.0 $opt = $res"
-      test $res -ne $opt_val_new || return 1
-      test $res -eq $opt_val_orig || return 1
-
-      # Check value in the in-memory 'values' map is unmodified
-      res=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path \
-        osd.0) config get $opt | jq .$opt | bc)
-      echo "Values map: osd.0 $opt = $res"
-      test $res -ne $opt_val_new || return 1
-      test $res -eq $opt_val_orig || return 1
+      # Check configuration values
+      for count in $(seq 0 $(expr $retries - 1))
+      do
+        errors=0
+        sleep 2 # Allow time for changes to take effect
+
+        echo "Check configuration values - Attempt#: $count"
+        # Check configuration value on Mon store (or the default) for the osd
+        local res=$(ceph config get osd.0 $opt) || return 1
+        echo "Mon db (or default): osd.0 $opt = $res"
+        if (( $(echo "$res == $opt_val_new" | bc -l) )); then
+          errors=$(expr $errors + 1)
+        fi
+
+        # Check running configuration value using "config show" cmd
+        res=$(ceph config show osd.0 | grep $opt |\
+          awk '{ print $2 }' | bc ) || return 1
+        echo "Running config: osd.0 $opt = $res"
+        if (( $(echo "$res == $opt_val_new" | bc -l) || \
+              $(echo "$res != $opt_val_orig" | bc -l)  )); then
+          errors=$(expr $errors + 1)
+        fi
+
+        # Check value in the in-memory 'values' map is unmodified
+        res=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path \
+          osd.0) config get $opt | jq .$opt | bc)
+        echo "Values map: osd.0 $opt = $res"
+        if (( $(echo "$res == $opt_val_new" | bc -l) || \
+              $(echo "$res != $opt_val_orig" | bc -l) )); then
+          errors=$(expr $errors + 1)
+        fi
+
+        # Check if we succeeded or exhausted retry count
+        if [ $errors -eq 0 ]
+        then
+          break
+        elif [ $count -eq $(expr $retries - 1) ]
+        then
+          return 1
+        fi
+      done
      done
  
      teardown $dir || return 1
diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in

index a1c01374a1ec6f4c1772a255b2315c0970a7b066..dbd461aa5780c08481896e589923792d8c30ff8f 100644 (file)
--- a/src/common/options/osd.yaml.in
+++ b/src/common/options/osd.yaml.in
@@ -951,12 +951,17 @@ options:
    - debug_random
    with_legacy: true
  - name: osd_mclock_scheduler_client_res
-  type: uint
+  type: float
    level: advanced
-  desc: IO proportion reserved for each client (default)
+  desc: IO proportion reserved for each client (default). The default value
+    of 0 specifies the lowest possible reservation. Any value greater than
+    0 and up to 1.0 specifies the minimum IO proportion to reserve for each
+    client in terms of a fraction of the OSD's maximum IOPS capacity.
    long_desc: Only considered for osd_op_queue = mclock_scheduler
    fmt_desc: IO proportion reserved for each client (default).
-  default: 1
+  default: 0
+  min: 0
+  max: 1.0
    see_also:
    - osd_op_queue
  - name: osd_mclock_scheduler_client_wgt
@@ -969,21 +974,34 @@ options:
    see_also:
    - osd_op_queue
  - name: osd_mclock_scheduler_client_lim
-  type: uint
+  type: float
    level: advanced
-  desc: IO limit for each client (default) over reservation
+  desc: IO limit for each client (default) over reservation. The default
+    value of 0 specifies no limit enforcement, which means each client can
+    use the maximum possible IOPS capacity of the OSD. Any value greater
+    than 0 and up to 1.0 specifies the upper IO limit over reservation
+    that each client receives in terms of a fraction of the OSD's
+    maximum IOPS capacity.
    long_desc: Only considered for osd_op_queue = mclock_scheduler
    fmt_desc: IO limit for each client (default) over reservation.
-  default: 999999
+  default: 0
+  min: 0
+  max: 1.0
    see_also:
    - osd_op_queue
  - name: osd_mclock_scheduler_background_recovery_res
-  type: uint
+  type: float
    level: advanced
-  desc: IO proportion reserved for background recovery (default)
+  desc: IO proportion reserved for background recovery (default). The
+    default value of 0 specifies the lowest possible reservation. Any value
+    greater than 0 and up to 1.0 specifies the minimum IO proportion to
+    reserve for background recovery operations in terms of a fraction of
+    the OSD's maximum IOPS capacity.
    long_desc: Only considered for osd_op_queue = mclock_scheduler
    fmt_desc: IO proportion reserved for background recovery (default).
-  default: 1
+  default: 0
+  min: 0
+  max: 1.0
    see_also:
    - osd_op_queue
  - name: osd_mclock_scheduler_background_recovery_wgt
@@ -996,21 +1014,34 @@ options:
    see_also:
    - osd_op_queue
  - name: osd_mclock_scheduler_background_recovery_lim
-  type: uint
+  type: float
    level: advanced
-  desc: IO limit for background recovery over reservation
+  desc: IO limit for background recovery over reservation. The default
+    value of 0 specifies no limit enforcement, which means background
+    recovery operations can use the maximum possible IOPS capacity of the
+    OSD. Any value greater than 0 and up to 1.0 specifies the upper IO
+    limit over reservation that background recovery operations receive in
+    terms of a fraction of the OSD's maximum IOPS capacity.
    long_desc: Only considered for osd_op_queue = mclock_scheduler
    fmt_desc: IO limit for background recovery over reservation.
-  default: 999999
+  default: 0
+  min: 0
+  max: 1.0
    see_also:
    - osd_op_queue
  - name: osd_mclock_scheduler_background_best_effort_res
-  type: uint
+  type: float
    level: advanced
-  desc: IO proportion reserved for background best_effort (default)
+  desc: IO proportion reserved for background best_effort (default). The
+    default value of 0 specifies the lowest possible reservation. Any value
+    greater than 0 and up to 1.0 specifies the minimum IO proportion to
+    reserve for background best_effort operations in terms of a fraction
+    of the OSD's maximum IOPS capacity.
    long_desc: Only considered for osd_op_queue = mclock_scheduler
    fmt_desc: IO proportion reserved for background best_effort (default).
-  default: 1
+  default: 0
+  min: 0
+  max: 1.0
    see_also:
    - osd_op_queue
  - name: osd_mclock_scheduler_background_best_effort_wgt
@@ -1023,12 +1054,19 @@ options:
    see_also:
    - osd_op_queue
  - name: osd_mclock_scheduler_background_best_effort_lim
-  type: uint
+  type: float
    level: advanced
-  desc: IO limit for background best_effort over reservation
+  desc: IO limit for background best_effort over reservation. The default
+    value of 0 specifies no limit enforcement, which means background
+    best_effort operations can use the maximum possible IOPS capacity of the
+    OSD. Any value greater than 0 and up to 1.0 specifies the upper IO
+    limit over reservation that background best_effort operations receive
+    in terms of a fraction of the OSD's maximum IOPS capacity.
    long_desc: Only considered for osd_op_queue = mclock_scheduler
    fmt_desc: IO limit for background best_effort over reservation.
-  default: 999999
+  default: 0
+  min: 0
+  max: 1.0
    see_also:
    - osd_op_queue
  - name: osd_mclock_scheduler_anticipation_timeout
@@ -1037,106 +1075,57 @@ options:
    desc: mclock anticipation timeout in seconds
    long_desc: the amount of time that mclock waits until the unused resource is forfeited
    default: 0
-- name: osd_mclock_cost_per_io_usec
-  type: float
-  level: dev
-  desc: Cost per IO in microseconds to consider per OSD (overrides _ssd and _hdd if
-    non-zero)
-  long_desc: This option specifies the cost factor to consider in usec per OSD. This
-    is considered by the mclock scheduler to set an additional cost factor in QoS
-    calculations. Only considered for osd_op_queue = mclock_scheduler
-  fmt_desc: Cost per IO in microseconds to consider per OSD (overrides _ssd
-    and _hdd if non-zero)
-  default: 0
-  flags:
-  - runtime
-- name: osd_mclock_cost_per_io_usec_hdd
-  type: float
-  level: dev
-  desc: Cost per IO in microseconds to consider per OSD (for rotational media)
-  long_desc: This option specifies the cost factor to consider in usec per OSD for
-    rotational device type. This is considered by the mclock_scheduler to set an additional
-    cost factor in QoS calculations. Only considered for osd_op_queue = mclock_scheduler
-  fmt_desc: Cost per IO in microseconds to consider per OSD (for rotational
-    media)
-  default: 11400
-  flags:
-  - runtime
-- name: osd_mclock_cost_per_io_usec_ssd
-  type: float
-  level: dev
-  desc: Cost per IO in microseconds to consider per OSD (for solid state media)
-  long_desc: This option specifies the cost factor to consider in usec per OSD for
-    solid state device type. This is considered by the mclock_scheduler to set an
-    additional cost factor in QoS calculations. Only considered for osd_op_queue =
-    mclock_scheduler
-  fmt_desc: Cost per IO in microseconds to consider per OSD (for solid state
-    media)
-  default: 50
-  flags:
-  - runtime
-- name: osd_mclock_cost_per_byte_usec
-  type: float
-  level: dev
-  desc: Cost per byte in microseconds to consider per OSD (overrides _ssd and _hdd
-    if non-zero)
-  long_desc: This option specifies the cost per byte to consider in microseconds per
-    OSD. This is considered by the mclock scheduler to set an additional cost factor
-    in QoS calculations. Only considered for osd_op_queue = mclock_scheduler
-  fmt_desc: Cost per byte in microseconds to consider per OSD (overrides _ssd
-    and _hdd if non-zero)
-  default: 0
-  flags:
-  - runtime
-- name: osd_mclock_cost_per_byte_usec_hdd
-  type: float
-  level: dev
-  desc: Cost per byte in microseconds to consider per OSD (for rotational media)
-  long_desc: This option specifies the cost per byte to consider in microseconds per
-    OSD for rotational device type. This is considered by the mclock_scheduler to
-    set an additional cost factor in QoS calculations. Only considered for osd_op_queue
-    = mclock_scheduler
-  fmt_desc: Cost per byte in microseconds to consider per OSD (for rotational
-    media)
-  default: 2.6
+- name: osd_mclock_max_sequential_bandwidth_hdd
+  type: size
+  level: basic
+  desc: The maximum sequential bandwidth in bytes/second of the OSD (for
+    rotational media)
+  long_desc: This option specifies the maximum sequential bandwidth to consider
+    for an OSD whose underlying device type is rotational media. This is
+    considered by the mclock scheduler to derive the cost factor to be used in
+    QoS calculations. Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the
+    OSD (for rotational media)
+  default: 150_M
    flags:
    - runtime
-- name: osd_mclock_cost_per_byte_usec_ssd
-  type: float
-  level: dev
-  desc: Cost per byte in microseconds to consider per OSD (for solid state media)
-  long_desc: This option specifies the cost per byte to consider in microseconds per
-    OSD for solid state device type. This is considered by the mclock_scheduler to
-    set an additional cost factor in QoS calculations. Only considered for osd_op_queue
-    = mclock_scheduler
-  fmt_desc: Cost per byte in microseconds to consider per OSD (for solid state
-    media)
-  default: 0.011
+- name: osd_mclock_max_sequential_bandwidth_ssd
+  type: size
+  level: basic
+  desc: The maximum sequential bandwidth in bytes/second of the OSD (for
+    solid state media)
+  long_desc: This option specifies the maximum sequential bandwidth to consider
+    for an OSD whose underlying device type is solid state media. This is
+    considered by the mclock scheduler to derive the cost factor to be used in
+    QoS calculations. Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the
+    OSD (for solid state media)
+  default: 750_M
    flags:
    - runtime
  - name: osd_mclock_max_capacity_iops_hdd
    type: float
    level: basic
-  desc: Max IOPs capacity (at 4KiB block size) to consider per OSD (for rotational
-    media)
-  long_desc: This option specifies the max OSD capacity in iops per OSD. Helps in
-    QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue
-    = mclock_scheduler
-  fmt_desc: Max IOPS capacity (at 4KiB block size) to consider per OSD (for
-    rotational media)
+  desc: Max random write IOPS capacity (at 4 KiB block size) to consider per OSD
+    (for rotational media)
+  long_desc: This option specifies the max OSD random write IOPS capacity per
+    OSD. Contributes to QoS calculations when enabling a dmclock profile. Only
+    considered for osd_op_queue = mclock_scheduler
+  fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per
+    OSD (for rotational media)
    default: 315
    flags:
    - runtime
  - name: osd_mclock_max_capacity_iops_ssd
    type: float
    level: basic
-  desc: Max IOPs capacity (at 4KiB block size) to consider per OSD (for solid state
-    media)
-  long_desc: This option specifies the max OSD capacity in iops per OSD. Helps in
-    QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue
-    = mclock_scheduler
-  fmt_desc: Max IOPS capacity (at 4KiB block size) to consider per OSD (for
-    solid state media)
+  desc: Max random write IOPS capacity (at 4 KiB block size) to consider per OSD
+    (for solid state media)
+  long_desc: This option specifies the max OSD random write IOPS capacity per
+    OSD. Contributes to QoS calculations when enabling a dmclock profile. Only
+    considered for osd_op_queue = mclock_scheduler
+  fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per
+    OSD (for solid state media)
    default: 21500
    flags:
    - runtime
diff --git a/src/osd/scheduler/OpSchedulerItem.h b/src/osd/scheduler/OpSchedulerItem.h

index 2a4c00d0dcd8f04d278eefad78cddd95e16fc527..dbe61b051206728c78b825ba2430c8c37d95f764 100644 (file)
--- a/src/osd/scheduler/OpSchedulerItem.h
+++ b/src/osd/scheduler/OpSchedulerItem.h
@@ -106,7 +106,7 @@ private:
    utime_t start_time;
    uint64_t owner;  ///< global id (e.g., client.XXX)
    epoch_t map_epoch;    ///< an epoch we expect the PG to exist in
-  int qos_cost;  ///< scaled cost calculated by the mclock scheduler
+  uint32_t qos_cost;  ///< scaled cost calculated by the mclock scheduler
    bool qos_item;  ///< set to true if item is scheduled by mclock scheduler
  
  public:
@@ -183,11 +183,11 @@ public:
      return qos_item;
    }
  
-  void set_qos_cost(int scaled_cost) {
+  void set_qos_cost(uint32_t scaled_cost) {
      qos_cost = scaled_cost;
    }
  
-  int get_qos_cost() const {
+  uint32_t get_qos_cost() const {
      return qos_cost;
    }
  
diff --git a/src/osd/scheduler/mClockScheduler.cc b/src/osd/scheduler/mClockScheduler.cc

index 9b07ca334212f54220d699d7505958df5495c03b..abb17571a4ea38cdbe9ce25ab107c4ec36bc8325 100644 (file)
--- a/src/osd/scheduler/mClockScheduler.cc
+++ b/src/osd/scheduler/mClockScheduler.cc
@@ -51,32 +51,85 @@ mClockScheduler::mClockScheduler(CephContext *cct,
  {
    cct->_conf.add_observer(this);
    ceph_assert(num_shards > 0);
-  set_max_osd_capacity();
-  set_osd_mclock_cost_per_io();
-  set_osd_mclock_cost_per_byte();
+  set_osd_capacity_params_from_config();
    set_mclock_profile();
    enable_mclock_profile_settings();
-  client_registry.update_from_config(cct->_conf);
+  client_registry.update_from_config(
+    cct->_conf, osd_bandwidth_capacity_per_shard);
  }
  
-void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf)
+/* ClientRegistry holds the dmclock::ClientInfo configuration parameters
+ * (reservation (bytes/second), weight (unitless), limit (bytes/second))
+ * for each IO class in the OSD (client, background_recovery,
+ * background_best_effort).
+ *
+ * mclock expects limit and reservation to have units of <cost>/second
+ * (bytes/second), but the osd_mclock_scheduler_*_(res|lim) options are
+ * provided as ratios of the OSD's capacity.  We convert each non-zero ratio
+ * to bytes/second by multiplying it by the capacity_per_shard parameter.
+ *
+ * Note, mclock profile information will already have been set as a default
+ * for the osd_mclock_scheduler_client_* parameters prior to calling
+ * update_from_config -- see set_config_defaults_from_profile().
+ */
+void mClockScheduler::ClientRegistry::update_from_config(
+  const ConfigProxy &conf,
+  const double capacity_per_shard)
  {
-  default_external_client_info.update(
-    conf.get_val<uint64_t>("osd_mclock_scheduler_client_res"),
-    conf.get_val<uint64_t>("osd_mclock_scheduler_client_wgt"),
-    conf.get_val<uint64_t>("osd_mclock_scheduler_client_lim"));
  
+  auto get_res = [&](double res) {
+    if (res) {
+      return res * capacity_per_shard;
+    } else {
+      return default_min; // min reservation
+    }
+  };
+
+  auto get_lim = [&](double lim) {
+    if (lim) {
+      return lim * capacity_per_shard;
+    } else {
+      return default_max; // high limit
+    }
+  };
+
+  // Set external client infos
+  double res = conf.get_val<double>(
+    "osd_mclock_scheduler_client_res");
+  double lim = conf.get_val<double>(
+    "osd_mclock_scheduler_client_lim");
+  uint64_t wgt = conf.get_val<uint64_t>(
+    "osd_mclock_scheduler_client_wgt");
+  default_external_client_info.update(
+    get_res(res),
+    wgt,
+    get_lim(lim));
+
+  // Set background recovery client infos
+  res = conf.get_val<double>(
+    "osd_mclock_scheduler_background_recovery_res");
+  lim = conf.get_val<double>(
+    "osd_mclock_scheduler_background_recovery_lim");
+  wgt = conf.get_val<uint64_t>(
+    "osd_mclock_scheduler_background_recovery_wgt");
    internal_client_infos[
      static_cast<size_t>(op_scheduler_class::background_recovery)].update(
-    conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_res"),
-    conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_wgt"),
-    conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_lim"));
-
+      get_res(res),
+      wgt,
+      get_lim(lim));
+
+  // Set background best effort client infos
+  res = conf.get_val<double>(
+    "osd_mclock_scheduler_background_best_effort_res");
+  lim = conf.get_val<double>(
+    "osd_mclock_scheduler_background_best_effort_lim");
+  wgt = conf.get_val<uint64_t>(
+    "osd_mclock_scheduler_background_best_effort_wgt");
    internal_client_infos[
      static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
-    conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_res"),
-    conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_wgt"),
-    conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_lim"));
+      get_res(res),
+      wgt,
+      get_lim(lim));
  }
  
  const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client(
@@ -103,70 +156,38 @@ const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info(
    }
  }
  
-void mClockScheduler::set_max_osd_capacity()
+void mClockScheduler::set_osd_capacity_params_from_config()
  {
-  if (is_rotational) {
-    max_osd_capacity =
-      cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_hdd");
-    cct->_conf.set_val("osd_mclock_max_capacity_iops_ssd", "0");
-  } else {
-    max_osd_capacity =
-      cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd");
-    cct->_conf.set_val("osd_mclock_max_capacity_iops_hdd", "0");
-  }
-  // Set per op-shard iops limit
-  max_osd_capacity /= num_shards;
-  dout(1) << __func__ << " #op shards: " << num_shards
-          << std::fixed << std::setprecision(2)
-          << " max osd capacity(iops) per shard: " << max_osd_capacity
-          << dendl;
-}
+  uint64_t osd_bandwidth_capacity;
+  double osd_iop_capacity;
  
-void mClockScheduler::set_osd_mclock_cost_per_io()
-{
-  std::chrono::seconds sec(1);
-  if (cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec")) {
-    osd_mclock_cost_per_io =
-      cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec");
-  } else {
+  std::tie(osd_bandwidth_capacity, osd_iop_capacity) = [&, this] {
      if (is_rotational) {
-      osd_mclock_cost_per_io =
-        cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_hdd");
-      // For HDDs, convert value to seconds
-      osd_mclock_cost_per_io /= std::chrono::microseconds(sec).count();
+      return std::make_tuple(
+        cct->_conf.get_val<Option::size_t>(
+          "osd_mclock_max_sequential_bandwidth_hdd"),
+        cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_hdd"));
      } else {
-      // For SSDs, convert value to milliseconds
-      osd_mclock_cost_per_io =
-        cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_ssd");
-      osd_mclock_cost_per_io /= std::chrono::milliseconds(sec).count();
+      return std::make_tuple(
+        cct->_conf.get_val<Option::size_t>(
+          "osd_mclock_max_sequential_bandwidth_ssd"),
+        cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd"));
      }
-  }
-  dout(1) << __func__ << " osd_mclock_cost_per_io: "
-          << std::fixed << std::setprecision(7) << osd_mclock_cost_per_io
-          << dendl;
-}
+  }();
  
-void mClockScheduler::set_osd_mclock_cost_per_byte()
-{
-  std::chrono::seconds sec(1);
-  if (cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec")) {
-    osd_mclock_cost_per_byte =
-      cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec");
-  } else {
-    if (is_rotational) {
-      osd_mclock_cost_per_byte =
-        cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_hdd");
-      // For HDDs, convert value to seconds
-      osd_mclock_cost_per_byte /= std::chrono::microseconds(sec).count();
-    } else {
-      osd_mclock_cost_per_byte =
-        cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_ssd");
-      // For SSDs, convert value to milliseconds
-      osd_mclock_cost_per_byte /= std::chrono::milliseconds(sec).count();
-    }
-  }
-  dout(1) << __func__ << " osd_mclock_cost_per_byte: "
-          << std::fixed << std::setprecision(7) << osd_mclock_cost_per_byte
+  osd_bandwidth_capacity = std::max<uint64_t>(1, osd_bandwidth_capacity);
+  osd_iop_capacity = std::max<double>(1.0, osd_iop_capacity);
+
+  osd_bandwidth_cost_per_io =
+    static_cast<double>(osd_bandwidth_capacity) / osd_iop_capacity;
+  osd_bandwidth_capacity_per_shard = static_cast<double>(osd_bandwidth_capacity)
+    / static_cast<double>(num_shards);
+
+  dout(1) << __func__ << ": osd_bandwidth_cost_per_io: "
+          << std::fixed << std::setprecision(2)
+          << osd_bandwidth_cost_per_io << " bytes/io"
+          << ", osd_bandwidth_capacity_per_shard "
+          << osd_bandwidth_capacity_per_shard << " bytes/second"
            << dendl;
  }
  
@@ -181,143 +202,82 @@ std::string mClockScheduler::get_mclock_profile()
    return mclock_profile;
  }
  
+// Sets allocations for 'balanced' mClock profile
+//
+// min and max specification:
+//   0 (min): specifies no minimum reservation
+//   0 (max): specifies no upper limit
+//
+//  Client Allocation:
+//    reservation: 40% | weight: 1 | limit: 100% |
+//  Background Recovery Allocation:
+//    reservation: 40% | weight: 1 | limit: 70% |
+//  Background Best Effort Allocation:
+//    reservation: 20% | weight: 1 | limit: 0 (max) |
  void mClockScheduler::set_balanced_profile_allocations()
  {
-  // Client Allocation:
-  //   reservation: 40% | weight: 1 | limit: 100% |
-  // Background Recovery Allocation:
-  //   reservation: 40% | weight: 1 | limit: 150% |
-  // Background Best Effort Allocation:
-  //   reservation: 20% | weight: 2 | limit: max |
-
-  // Client
-  uint64_t client_res = static_cast<uint64_t>(
-    std::round(0.40 * max_osd_capacity));
-  uint64_t client_lim = static_cast<uint64_t>(
-    std::round(max_osd_capacity));
-  uint64_t client_wgt = default_min;
-
-  // Background Recovery
-  uint64_t rec_res = static_cast<uint64_t>(
-    std::round(0.40 * max_osd_capacity));
-  uint64_t rec_lim = static_cast<uint64_t>(
-    std::round(1.5 * max_osd_capacity));
-  uint64_t rec_wgt = default_min;
-
-  // Background Best Effort
-  uint64_t best_effort_res = static_cast<uint64_t>(
-    std::round(0.20 * max_osd_capacity));
-  uint64_t best_effort_lim = default_max;
-  uint64_t best_effort_wgt = 2;
-
-  // Set the allocations for the mclock clients
+  // Set [res, wgt, lim] in that order for each mClock client class.
    client_allocs[
      static_cast<size_t>(op_scheduler_class::client)].update(
-      client_res,
-      client_wgt,
-      client_lim);
+      0.4, 1.0, 1.0);
    client_allocs[
      static_cast<size_t>(op_scheduler_class::background_recovery)].update(
-      rec_res,
-      rec_wgt,
-      rec_lim);
+      0.4, 1.0, 0.7);
    client_allocs[
      static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
-      best_effort_res,
-      best_effort_wgt,
-      best_effort_lim);
+      0.2, 1.0, 0.0);
  }
  
+// Sets allocations for 'high_recovery_ops' mClock profile
+//
+// min and max specification:
+//   0 (min): specifies no minimum reservation
+//   0 (max): specifies no upper limit
+//
+// Client Allocation:
+//   reservation: 30% | weight: 1 | limit: 80% |
+// Background Recovery Allocation:
+//   reservation: 60% | weight: 2 | limit: 0 (max) |
+// Background Best Effort Allocation:
+//   reservation: 0 (min) | weight: 1 | limit: 0 (max) |
  void mClockScheduler::set_high_recovery_ops_profile_allocations()
  {
-  // Client Allocation:
-  //   reservation: 30% | weight: 1 | limit: 80% |
-  // Background Recovery Allocation:
-  //   reservation: 60% | weight: 2 | limit: 200% |
-  // Background Best Effort Allocation:
-  //   reservation: 1 | weight: 2 | limit: max |
-
-  // Client
-  uint64_t client_res = static_cast<uint64_t>(
-    std::round(0.30 * max_osd_capacity));
-  uint64_t client_lim = static_cast<uint64_t>(
-    std::round(0.80 * max_osd_capacity));
-  uint64_t client_wgt = default_min;
-
-  // Background Recovery
-  uint64_t rec_res = static_cast<uint64_t>(
-    std::round(0.60 * max_osd_capacity));
-  uint64_t rec_lim = static_cast<uint64_t>(
-    std::round(2.0 * max_osd_capacity));
-  uint64_t rec_wgt = 2;
-
-  // Background Best Effort
-  uint64_t best_effort_res = default_min;
-  uint64_t best_effort_lim = default_max;
-  uint64_t best_effort_wgt = 2;
-
-  // Set the allocations for the mclock clients
+  // Set [res, wgt, lim] in that order for each mClock client class.
    client_allocs[
      static_cast<size_t>(op_scheduler_class::client)].update(
-      client_res,
-      client_wgt,
-      client_lim);
+      0.3, 1.0, 0.8);
    client_allocs[
      static_cast<size_t>(op_scheduler_class::background_recovery)].update(
-      rec_res,
-      rec_wgt,
-      rec_lim);
+      0.6, 2.0, 0.0);
    client_allocs[
      static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
-      best_effort_res,
-      best_effort_wgt,
-      best_effort_lim);
+      0.0, 1.0, 0.0);
  }
  
+// Sets allocations for 'high_client_ops' mClock profile
+//
+// min and max specification:
+//   0 (min): specifies no minimum reservation
+//   0 (max): specifies no upper limit
+//
+// Client Allocation:
+//   reservation: 60% | weight: 5 | limit: 0 (max) |
+// Background Recovery Allocation:
+//   reservation: 20% | weight: 1 | limit: 50% |
+// Background Best Effort Allocation:
+//   reservation: 20% | weight: 1 | limit: 0 (max) |
  void mClockScheduler::set_high_client_ops_profile_allocations()
  {
-  // Client Allocation:
-  //   reservation: 50% | weight: 2 | limit: max |
-  // Background Recovery Allocation:
-  //   reservation: 25% | weight: 1 | limit: 100% |
-  // Background Best Effort Allocation:
-  //   reservation: 25% | weight: 2 | limit: max |
-
-  // Client
-  uint64_t client_res = static_cast<uint64_t>(
-    std::round(0.50 * max_osd_capacity));
-  uint64_t client_wgt = 2;
-  uint64_t client_lim = default_max;
-
-  // Background Recovery
-  uint64_t rec_res = static_cast<uint64_t>(
-    std::round(0.25 * max_osd_capacity));
-  uint64_t rec_lim = static_cast<uint64_t>(
-    std::round(max_osd_capacity));
-  uint64_t rec_wgt = default_min;
-
-  // Background Best Effort
-  uint64_t best_effort_res = static_cast<uint64_t>(
-    std::round(0.25 * max_osd_capacity));
-  uint64_t best_effort_lim = default_max;
-  uint64_t best_effort_wgt = 2;
-
-  // Set the allocations for the mclock clients
+  // Set [res, wgt, lim] in that order for each mClock client class.
    client_allocs[
      static_cast<size_t>(op_scheduler_class::client)].update(
-      client_res,
-      client_wgt,
-      client_lim);
+      0.6, 5.0, 0.0);
    client_allocs[
      static_cast<size_t>(op_scheduler_class::background_recovery)].update(
-      rec_res,
-      rec_wgt,
-      rec_lim);
+      0.2, 1.0, 0.5);
    client_allocs[
      static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
-      best_effort_res,
-      best_effort_wgt,
-      best_effort_lim);
+      0.2, 1.0, 0.0);
  }
  
  void mClockScheduler::enable_mclock_profile_settings()
@@ -361,7 +321,7 @@ void mClockScheduler::set_profile_config()
    cct->_conf.set_val_default("osd_mclock_scheduler_client_res",
      std::to_string(client.res));
    cct->_conf.set_val_default("osd_mclock_scheduler_client_wgt",
-    std::to_string(client.wgt));
+    std::to_string(uint64_t(client.wgt)));
    cct->_conf.set_val_default("osd_mclock_scheduler_client_lim",
      std::to_string(client.lim));
    dout(10) << __func__ << " client QoS params: " << "["
@@ -372,7 +332,7 @@ void mClockScheduler::set_profile_config()
    cct->_conf.set_val_default("osd_mclock_scheduler_background_recovery_res",
      std::to_string(rec.res));
    cct->_conf.set_val_default("osd_mclock_scheduler_background_recovery_wgt",
-    std::to_string(rec.wgt));
+    std::to_string(uint64_t(rec.wgt)));
    cct->_conf.set_val_default("osd_mclock_scheduler_background_recovery_lim",
      std::to_string(rec.lim));
    dout(10) << __func__ << " Recovery QoS params: " << "["
@@ -383,7 +343,7 @@ void mClockScheduler::set_profile_config()
    cct->_conf.set_val_default("osd_mclock_scheduler_background_best_effort_res",
      std::to_string(best_effort.res));
    cct->_conf.set_val_default("osd_mclock_scheduler_background_best_effort_wgt",
-    std::to_string(best_effort.wgt));
+    std::to_string(uint64_t(best_effort.wgt)));
    cct->_conf.set_val_default("osd_mclock_scheduler_background_best_effort_lim",
      std::to_string(best_effort.lim));
    dout(10) << __func__ << " Best effort QoS params: " << "["
@@ -394,12 +354,16 @@ void mClockScheduler::set_profile_config()
    update_configuration();
  }
  
-int mClockScheduler::calc_scaled_cost(int item_cost)
+uint32_t mClockScheduler::calc_scaled_cost(int item_cost)
  {
-  // Calculate total scaled cost in secs
-  int scaled_cost =
-    std::round(osd_mclock_cost_per_io + (osd_mclock_cost_per_byte * item_cost));
-  return std::max(scaled_cost, 1);
+  auto cost = static_cast<uint32_t>(
+    std::max<int>(
+      1, // ensure cost is non-zero and positive
+      item_cost));
+  auto cost_per_io = static_cast<uint32_t>(osd_bandwidth_cost_per_io);
+
+  // Calculate total scaled cost in bytes
+  return cost_per_io + cost;
  }
  
  void mClockScheduler::update_configuration()
@@ -452,7 +416,7 @@ void mClockScheduler::enqueue(OpSchedulerItem&& item)
    } else if (priority >= cutoff) {
      enqueue_high(priority, std::move(item));
    } else {
-    int cost = calc_scaled_cost(item.get_cost());
+    auto cost = calc_scaled_cost(item.get_cost());
      item.set_qos_cost(cost);
      dout(20) << __func__ << " " << id
               << " item_cost: " << item.get_cost()
@@ -557,14 +521,10 @@ const char** mClockScheduler::get_tracked_conf_keys() const
      "osd_mclock_scheduler_background_best_effort_res",
      "osd_mclock_scheduler_background_best_effort_wgt",
      "osd_mclock_scheduler_background_best_effort_lim",
-    "osd_mclock_cost_per_io_usec",
-    "osd_mclock_cost_per_io_usec_hdd",
-    "osd_mclock_cost_per_io_usec_ssd",
-    "osd_mclock_cost_per_byte_usec",
-    "osd_mclock_cost_per_byte_usec_hdd",
-    "osd_mclock_cost_per_byte_usec_ssd",
      "osd_mclock_max_capacity_iops_hdd",
      "osd_mclock_max_capacity_iops_ssd",
+    "osd_mclock_max_sequential_bandwidth_hdd",
+    "osd_mclock_max_sequential_bandwidth_ssd",
      "osd_mclock_profile",
      NULL
    };
@@ -575,29 +535,27 @@ void mClockScheduler::handle_conf_change(
    const ConfigProxy& conf,
    const std::set<std::string> &changed)
  {
-  if (changed.count("osd_mclock_cost_per_io_usec") ||
-      changed.count("osd_mclock_cost_per_io_usec_hdd") ||
-      changed.count("osd_mclock_cost_per_io_usec_ssd")) {
-    set_osd_mclock_cost_per_io();
-  }
-  if (changed.count("osd_mclock_cost_per_byte_usec") ||
-      changed.count("osd_mclock_cost_per_byte_usec_hdd") ||
-      changed.count("osd_mclock_cost_per_byte_usec_ssd")) {
-    set_osd_mclock_cost_per_byte();
-  }
    if (changed.count("osd_mclock_max_capacity_iops_hdd") ||
        changed.count("osd_mclock_max_capacity_iops_ssd")) {
-    set_max_osd_capacity();
+    set_osd_capacity_params_from_config();
      if (mclock_profile != "custom") {
        enable_mclock_profile_settings();
-      client_registry.update_from_config(conf);
      }
+    client_registry.update_from_config(
+      conf, osd_bandwidth_capacity_per_shard);
+  }
+  if (changed.count("osd_mclock_max_sequential_bandwidth_hdd") ||
+      changed.count("osd_mclock_max_sequential_bandwidth_ssd")) {
+    set_osd_capacity_params_from_config();
+    client_registry.update_from_config(
+      conf, osd_bandwidth_capacity_per_shard);
    }
    if (changed.count("osd_mclock_profile")) {
      set_mclock_profile();
      if (mclock_profile != "custom") {
        enable_mclock_profile_settings();
-      client_registry.update_from_config(conf);
+      client_registry.update_from_config(
+        conf, osd_bandwidth_capacity_per_shard);
      }
    }
  
@@ -624,7 +582,8 @@ void mClockScheduler::handle_conf_change(
  
    if (auto key = get_changed_key(); key.has_value()) {
      if (mclock_profile == "custom") {
-      client_registry.update_from_config(conf);
+      client_registry.update_from_config(
+        conf, osd_bandwidth_capacity_per_shard);
      } else {
        // Attempt to change QoS parameter for a built-in profile. Restore the
        // profile defaults by making one of the OSD shards remove the key from
diff --git a/src/osd/scheduler/mClockScheduler.h b/src/osd/scheduler/mClockScheduler.h

index 053dd1e87fd7598d3a371fd976fcfe854a31cc3b..9af97830ca792d28bd82918db232f32c29ef0fa9 100644 (file)
--- a/src/osd/scheduler/mClockScheduler.h
+++ b/src/osd/scheduler/mClockScheduler.h
@@ -33,8 +33,10 @@
  
  namespace ceph::osd::scheduler {
  
-constexpr uint64_t default_min = 1;
-constexpr uint64_t default_max = 999999;
+constexpr double default_min = 1.0;
+constexpr double default_max = std::numeric_limits<double>::is_iec559 ?
+  std::numeric_limits<double>::infinity() :
+  std::numeric_limits<double>::max();
  
  using client_id_t = uint64_t;
  using profile_id_t = uint64_t;
@@ -78,20 +80,18 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
    const int shard_id;
    bool is_rotational;
    MonClient *monc;
-  double max_osd_capacity;
-  double osd_mclock_cost_per_io;
-  double osd_mclock_cost_per_byte;
+
    std::string mclock_profile = "high_client_ops";
    struct ClientAllocs {
-    uint64_t res;
-    uint64_t wgt;
-    uint64_t lim;
+    double res;
+    double wgt;
+    double lim;
  
-    ClientAllocs(uint64_t _res, uint64_t _wgt, uint64_t _lim) {
+    ClientAllocs(double _res, double _wgt, double _lim) {
        update(_res, _wgt, _lim);
      }
  
-    inline void update(uint64_t _res, uint64_t _wgt, uint64_t _lim) {
+    inline void update(double _res, double _wgt, double _lim) {
        res = _res;
        wgt = _wgt;
        lim = _lim;
@@ -102,11 +102,55 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
      static_cast<size_t>(op_scheduler_class::client) + 1
    > client_allocs = {
      // Placeholder, get replaced with configured values
-    ClientAllocs(1, 1, 1), // background_recovery
-    ClientAllocs(1, 1, 1), // background_best_effort
-    ClientAllocs(1, 1, 1), // immediate (not used)
-    ClientAllocs(1, 1, 1)  // client
+    ClientAllocs(0, 1, 0), // background_recovery
+    ClientAllocs(0, 1, 0), // background_best_effort
+    ClientAllocs(0, 1, 0), // immediate (not used)
+    ClientAllocs(0, 1, 0)  // client
    };
+
+  /**
+   * osd_bandwidth_cost_per_io
+   *
+   * mClock expects all queued items to have a uniform expression of
+   * "cost".  However, IO devices generally have quite different capacity
+   * for sequential IO vs small random IO.  This implementation handles this
+   * by expressing all costs as a number of sequential bytes written, adding
+   * additional cost for each random IO equal to osd_bandwidth_cost_per_io.
+   *
+   * Thus, an IO operation requiring a total of <size> bytes to be written
+   * across <iops> different locations will have a cost of
+   * <size> + (osd_bandwidth_cost_per_io * <iops>) bytes.
+   *
+   * Set in set_osd_capacity_params_from_config in the constructor and upon
+   * config change.
+   *
+   * Has units bytes/io.
+   */
+  double osd_bandwidth_cost_per_io;
+
+  /**
+   * osd_bandwidth_capacity_per_shard
+   *
+   * mClock expects reservation and limit parameters to be expressed in units
+   * of cost/second -- which means bytes/second for this implementation.
+   *
+   * Rather than expecting users to compute appropriate limit and reservation
+   * values for each class of OSDs in their cluster, we instead express
+   * reservation and limit parameters as ratios of the OSD's maximum capacity.
+   * osd_bandwidth_capacity_per_shard is that capacity divided by the number
+   * of shards.
+   *
+   * Set in set_osd_capacity_params_from_config in the constructor and upon
+   * config change.
+   *
+   * This value gets passed to ClientRegistry::update_from_config in order
+   * to resolve the full reservation and limit parameters for mclock from
+   * the configured ratios.
+   *
+   * Has units bytes/second.
+   */
+  double osd_bandwidth_capacity_per_shard;
+
    class ClientRegistry {
      std::array<
        crimson::dmclock::ClientInfo,
@@ -123,7 +167,16 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
      const crimson::dmclock::ClientInfo *get_external_client(
        const client_profile_id_t &client) const;
    public:
-    void update_from_config(const ConfigProxy &conf);
+    /**
+     * update_from_config
+     *
+     * Sets the mclock parameters (reservation, weight, and limit)
+     * for each class of IO (background_recovery, background_best_effort,
+     * and client).
+     */
+    void update_from_config(
+      const ConfigProxy &conf,
+      double capacity_per_shard);
      const crimson::dmclock::ClientInfo *get_info(
        const scheduler_id_t &id) const;
    } client_registry;
@@ -171,20 +224,25 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
      }
    }
  
+  /**
+   * set_osd_capacity_params_from_config
+   *
+   * mClockScheduler uses two parameters, osd_bandwidth_cost_per_io
+   * and osd_bandwidth_capacity_per_shard, internally.  These two
+   * parameters are derived from config parameters
+   * osd_mclock_max_capacity_iops_(hdd|ssd) and
+   * osd_mclock_max_sequential_bandwidth_(hdd|ssd) as well as num_shards.
+   * Invoking set_osd_capacity_params_from_config() resets those derived
+   * params based on the current config and should be invoked any time they
+   * are modified as well as in the constructor.  See handle_conf_change().
+   */
+  void set_osd_capacity_params_from_config();
+
  public:
    mClockScheduler(CephContext *cct, int whoami, uint32_t num_shards,
      int shard_id, bool is_rotational, MonClient *monc);
    ~mClockScheduler() override;
  
-  // Set the max osd capacity in iops
-  void set_max_osd_capacity();
-
-  // Set the cost per io for the osd
-  void set_osd_mclock_cost_per_io();
-
-  // Set the cost per byte for the osd
-  void set_osd_mclock_cost_per_byte();
-
    // Set the mclock profile type to enable
    void set_mclock_profile();
  
@@ -206,8 +264,8 @@ public:
    // Set mclock config parameter based on allocations
    void set_profile_config();
  
-  // Calculate scale cost per item
-  int calc_scaled_cost(int cost);
+  /// Calculate scaled cost per item
+  uint32_t calc_scaled_cost(int cost);
  
    // Helper method to display mclock queues
    std::string display_queues() const;
author	Sridhar Seshasayee <sseshasa@redhat.com>
	Thu, 9 Feb 2023 15:17:44 +0000 (20:47 +0530)
committer	Sridhar Seshasayee <sseshasa@redhat.com>
	Thu, 27 Apr 2023 13:11:38 +0000 (18:41 +0530)
qa/standalone/misc/mclock-config.sh		patch \| blob \| history
src/common/options/osd.yaml.in		patch \| blob \| history
src/osd/scheduler/OpSchedulerItem.h		patch \| blob \| history
src/osd/scheduler/mClockScheduler.cc		patch \| blob \| history
src/osd/scheduler/mClockScheduler.h		patch \| blob \| history