osd: Add mclock profile infrastructure and implement mclock profiles
author     Sridhar Seshasayee <sseshasa@redhat.com>
           Mon, 7 Dec 2020 11:09:57 +0000 (16:39 +0530)
committer  Sridhar Seshasayee <sseshasa@redhat.com>
           Sat, 16 Jan 2021 19:09:40 +0000 (00:39 +0530)
Define config options to specify the cost per IO for an OSD (HDD & SSD); a
sketch of how they are applied follows the list:
 - osd_mclock_cost_per_io_msec
 - osd_mclock_cost_per_io_msec_hdd
 - osd_mclock_cost_per_io_msec_ssd
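
A minimal standalone sketch (plain C++, not the Ceph config API) of the
override behaviour this patch implements in
mClockScheduler::set_osd_mclock_cost_per_io(): a non-zero generic value wins,
otherwise the device-specific variant is used.

  #include <cstdint>

  // Model of the cost-per-IO selection added in mClockScheduler.cc below.
  uint64_t effective_cost_per_io_msec(uint64_t generic,
                                      uint64_t hdd,
                                      uint64_t ssd,
                                      bool is_rotational) {
    if (generic) {
      return generic;  // osd_mclock_cost_per_io_msec overrides the _hdd/_ssd variants
    }
    return is_rotational ? hdd : ssd;
  }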

Define config options to set the max OSD capacity (HDD & SSD) to be allocated
among dmclock clients (sketched after the list), namely:
 - osd_mclock_max_capacity_iops
 - osd_mclock_max_capacity_iops_hdd
 - osd_mclock_max_capacity_iops_ssd
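
A standalone sketch of the selection and per-shard division performed by
mClockScheduler::set_max_osd_capacity() below; the function name and
parameters here are illustrative, not part of the patch.

  #include <cstdint>

  // Model of the per-shard capacity calculation added in mClockScheduler.cc below.
  double per_shard_capacity_iops(double generic_iops,
                                 double hdd_iops,
                                 double ssd_iops,
                                 bool is_rotational,
                                 uint32_t num_shards) {
    double max_osd_capacity =
      generic_iops > 0.0 ? generic_iops : (is_rotational ? hdd_iops : ssd_iops);
    return max_osd_capacity / num_shards;  // per op-shard IOPS limit
  }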

Define the config option "osd_mclock_profile" to specify the built-in profile
to enable.
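
For reference, a standalone model of the capacity split each built-in profile
applies (mirroring set_client_allocations() below). The "custom" profile
leaves all options to the administrator and is not modelled here.

  #include <cmath>
  #include <string>
  #include <utility>

  // {client, background_recovery} IOPS allocations per profile, as in the patch:
  // balanced = 50/50, high_recovery_ops = 25/75, high_client_ops = 75/25.
  std::pair<double, double> client_allocations(const std::string& profile,
                                               double max_osd_capacity) {
    if (profile == "high_recovery_ops") {
      return {std::round(0.25 * max_osd_capacity),
              std::round(0.75 * max_osd_capacity)};
    } else if (profile == "high_client_ops") {
      return {std::round(0.75 * max_osd_capacity),
              std::round(0.25 * max_osd_capacity)};
    }
    // "balanced" (the default; the patch asserts on unrecognized profiles).
    return {std::round(0.5 * max_osd_capacity),
            std::round(0.5 * max_osd_capacity)};
  }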

Also, pass the number of op shards used by the OSD to the mclock scheduler.
This is necessary to calculate the per-shard limits within the scheduler.
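
For illustration only (the shard count of 5 is a hypothetical value, not
something this patch configures), the default HDD capacity spread over five
op shards works out as follows:

  // Hypothetical per-shard limit: the default osd_mclock_max_capacity_iops_hdd
  // (10000.0) divided across 5 op shards.
  double per_shard_limit = 10000.0 / 5;  // 2000 IOPS per shard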

With the above information, enable the specified mclock profile by calling
the appropriate method to set the profile-specific mclock parameters and Ceph
options.
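
As an example of those profile-specific parameters, a standalone model of the
values the "balanced" profile pushes into the osd_mclock_scheduler_* options
(see set_balanced_profile_config() below): reservations stay at the default
minimum, the external client gets a weight of 10, and both classes are
limited to their capacity allocations. The struct and helper here are
illustrative, not part of the patch.

  #include <cstdint>
  #include <utility>

  struct QoSParams { uint64_t res, wgt, lim; };

  // {client, background_recovery} parameters applied by the "balanced" profile.
  std::pair<QoSParams, QoSParams> balanced_profile(double client_lim,
                                                   double rec_lim) {
    constexpr uint64_t default_min = 1;  // mirrors the constant added in mClockScheduler.h
    return {{default_min, 10, static_cast<uint64_t>(client_lim)},
            {default_min, default_min, static_cast<uint64_t>(rec_lim)}};
  }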

Prior to enqueuing an op, the scheduler scales the cost associated with the
OpSchedulerItem up or down. This calculation is based on the existing item
cost, the provided max OSD capacity, and an additional cost factor determined
by the underlying device type (HDD/SSD).
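
A standalone model of that calculation (mirroring calc_scaled_cost() in
mClockScheduler.cc below): the item cost is expressed as a fraction of the
OSD's bandwidth at a 4 KiB block size and scaled by the client's capacity
allocation, with the optional device-specific cost factor applied on top. The
caller later clamps the result to at least 1.

  #include <cmath>
  #include <cstdint>

  int scaled_cost(int item_cost,
                  double per_shard_iops,       // capacity already divided by num_shards
                  uint32_t num_shards,
                  double client_alloc,         // allocation for this op class
                  uint64_t cost_per_io_msec) { // 0 means "not configured"
    // Bandwidth derived from the max OSD capacity at a 4 KiB block size.
    double max_osd_bandwidth = per_shard_iops * num_shards * 4 * 1024;
    double cost = (item_cost / max_osd_bandwidth) * client_alloc;
    if (cost_per_io_msec > 0) {
      cost *= cost_per_io_msec / 1000.0;
    }
    return std::floor(cost);
  }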

Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
src/common/options.cc
src/osd/OSD.cc
src/osd/scheduler/OpScheduler.cc
src/osd/scheduler/OpScheduler.h
src/osd/scheduler/mClockScheduler.cc
src/osd/scheduler/mClockScheduler.h
src/test/osd/TestMClockScheduler.cc

index 0fa378de72a69d4274f31efef47d7aea8e40b9e4..4e26ae2481cd24d966c387e1bf1aeb726dcfb994 100644 (file)
@@ -2986,55 +2986,55 @@ std::vector<Option> get_global_options() {
     Option("osd_mclock_scheduler_client_res", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(1)
     .set_description("IO proportion reserved for each client (default)")
-    .set_long_description("Only considered for osd_op_queue = mClockScheduler")
+    .set_long_description("Only considered for osd_op_queue = mclock_scheduler")
     .add_see_also("osd_op_queue"),
 
     Option("osd_mclock_scheduler_client_wgt", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(1)
     .set_description("IO share for each client (default) over reservation")
-    .set_long_description("Only considered for osd_op_queue = mClockScheduler")
+    .set_long_description("Only considered for osd_op_queue = mclock_scheduler")
     .add_see_also("osd_op_queue"),
 
     Option("osd_mclock_scheduler_client_lim", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(999999)
     .set_description("IO limit for each client (default) over reservation")
-    .set_long_description("Only considered for osd_op_queue = mClockScheduler")
+    .set_long_description("Only considered for osd_op_queue = mclock_scheduler")
     .add_see_also("osd_op_queue"),
 
     Option("osd_mclock_scheduler_background_recovery_res", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(1)
     .set_description("IO proportion reserved for background recovery (default)")
-    .set_long_description("Only considered for osd_op_queue = mClockScheduler")
+    .set_long_description("Only considered for osd_op_queue = mclock_scheduler")
     .add_see_also("osd_op_queue"),
 
     Option("osd_mclock_scheduler_background_recovery_wgt", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(1)
     .set_description("IO share for each background recovery over reservation")
-    .set_long_description("Only considered for osd_op_queue = mClockScheduler")
+    .set_long_description("Only considered for osd_op_queue = mclock_scheduler")
     .add_see_also("osd_op_queue"),
 
     Option("osd_mclock_scheduler_background_recovery_lim", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(999999)
     .set_description("IO limit for background recovery over reservation")
-    .set_long_description("Only considered for osd_op_queue = mClockScheduler")
+    .set_long_description("Only considered for osd_op_queue = mclock_scheduler")
     .add_see_also("osd_op_queue"),
 
     Option("osd_mclock_scheduler_background_best_effort_res", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(1)
     .set_description("IO proportion reserved for background best_effort (default)")
-    .set_long_description("Only considered for osd_op_queue = mClockScheduler")
+    .set_long_description("Only considered for osd_op_queue = mclock_scheduler")
     .add_see_also("osd_op_queue"),
 
     Option("osd_mclock_scheduler_background_best_effort_wgt", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(1)
     .set_description("IO share for each background best_effort over reservation")
-    .set_long_description("Only considered for osd_op_queue = mClockScheduler")
+    .set_long_description("Only considered for osd_op_queue = mclock_scheduler")
     .add_see_also("osd_op_queue"),
 
     Option("osd_mclock_scheduler_background_best_effort_lim", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(999999)
     .set_description("IO limit for background best_effort over reservation")
-    .set_long_description("Only considered for osd_op_queue = mClockScheduler")
+    .set_long_description("Only considered for osd_op_queue = mclock_scheduler")
     .add_see_also("osd_op_queue"),
 
     Option("osd_mclock_scheduler_anticipation_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
@@ -3042,6 +3042,50 @@ std::vector<Option> get_global_options() {
     .set_description("mclock anticipation timeout in seconds")
     .set_long_description("the amount of time that mclock waits until the unused resource is forfeited"),
 
+    Option("osd_mclock_cost_per_io_msec", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description("Cost per IO in milliseconds to consider per OSD (overrides _ssd and _hdd if non-zero)")
+    .set_long_description("This option specifies the cost factor to consider in msec per OSD. This is considered by the mclock_scheduler to set an additional cost factor in QoS calculations. Only considered for osd_op_queue = mclock_scheduler")
+    .set_flag(Option::FLAG_RUNTIME),
+
+    Option("osd_mclock_cost_per_io_msec_hdd", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description("Cost per IO in milliseconds to consider per OSD (for rotational media)")
+    .set_long_description("This option specifies the cost factor to consider in msec per OSD for rotational device type. This is considered by the mclock_scheduler to set an additional cost factor in QoS calculations. Only considered for osd_op_queue = mclock_scheduler")
+    .set_flag(Option::FLAG_RUNTIME),
+
+    Option("osd_mclock_cost_per_io_msec_ssd", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description("Cost per IO in milliseconds to consider per OSD (for solid state media)")
+    .set_long_description("This option specifies the cost factor to consider in msec per OSD for solid state device type. This is considered by the mclock_scheduler to set an additional cost factor in QoS calculations. Only considered for osd_op_queue = mclock_scheduler")
+    .set_flag(Option::FLAG_RUNTIME),
+
+    Option("osd_mclock_max_capacity_iops", Option::TYPE_FLOAT, Option::LEVEL_BASIC)
+    .set_default(0.0)
+    .set_description("Max IOPs capacity (at 4KiB block size) to consider per OSD (overrides _ssd and _hdd if non-zero)")
+    .set_long_description("This option specifies the max osd capacity in iops per OSD. Helps in QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue = mclock_scheduler")
+    .set_flag(Option::FLAG_RUNTIME),
+
+    Option("osd_mclock_max_capacity_iops_hdd", Option::TYPE_FLOAT, Option::LEVEL_BASIC)
+    .set_default(10000.0)
+    .set_description("Max IOPs capacity (at 4KiB block size) to consider per OSD (for rotational media)")
+    .set_long_description("This option specifies the max OSD capacity in iops per OSD. Helps in QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue = mclock_scheduler")
+    .set_flag(Option::FLAG_RUNTIME),
+
+    Option("osd_mclock_max_capacity_iops_ssd", Option::TYPE_FLOAT, Option::LEVEL_BASIC)
+    .set_default(21500.0)
+    .set_description("Max IOPs capacity (at 4KiB block size) to consider per OSD (for solid state media)")
+    .set_long_description("This option specifies the max OSD capacity in iops per OSD. Helps in QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue = mclock_scheduler")
+    .set_flag(Option::FLAG_RUNTIME),
+
+    Option("osd_mclock_profile", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("balanced")
+    .set_enum_allowed( { "balanced", "high_recovery_ops", "high_client_ops", "custom" } )
+    .set_description("Which mclock profile to use")
+    .set_long_description("This option specifies the mclock profile to enable - one among the set of built-in profiles or a custom profile. Only considered for osd_op_queue = mclock_scheduler")
+    .set_flag(Option::FLAG_RUNTIME)
+    .add_see_also("osd_op_queue"),
+
     Option("osd_ignore_stale_divergent_priors", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
     .set_description(""),
index 61666c392376ae40f2c442742b737d4dcbe39f09..9060a5e39d5027238e3c4dbf440b4a9e1408ecb2 100644 (file)
@@ -10544,7 +10544,8 @@ OSDShard::OSDShard(
     osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
     shard_lock_name(shard_name + "::shard_lock"),
     shard_lock{make_mutex(shard_lock_name)},
-    scheduler(ceph::osd::scheduler::make_scheduler(cct)),
+    scheduler(ceph::osd::scheduler::make_scheduler(
+      cct, osd->num_shards, osd->store->is_rotational())),
     context_queue(sdata_wait_lock, sdata_cond)
 {
   dout(0) << "using op scheduler " << *scheduler << dendl;
index 2f656966bb4d1049296ae85690828e0e914f5367..3ce6fdb55d17323f78333905fda81b858c28d174 100644 (file)
@@ -21,7 +21,8 @@
 
 namespace ceph::osd::scheduler {
 
-OpSchedulerRef make_scheduler(CephContext *cct)
+OpSchedulerRef make_scheduler(
+  CephContext *cct, uint32_t num_shards, bool is_rotational)
 {
   const std::string *type = &cct->_conf->osd_op_queue;
   if (*type == "debug_random") {
@@ -41,7 +42,7 @@ OpSchedulerRef make_scheduler(CephContext *cct)
        cct->_conf->osd_op_pq_min_cost
     );
   } else if (*type == "mclock_scheduler") {
-    return std::make_unique<mClockScheduler>(cct);
+    return std::make_unique<mClockScheduler>(cct, num_shards, is_rotational);
   } else {
     ceph_assert("Invalid choice of wq" == 0);
   }
index 0a17118f02f4b0c97f6052b4cad0003289685303..0c647c95114bd56df8b44651ff892f947040b1d1 100644 (file)
@@ -57,7 +57,8 @@ public:
 std::ostream &operator<<(std::ostream &lhs, const OpScheduler &);
 using OpSchedulerRef = std::unique_ptr<OpScheduler>;
 
-OpSchedulerRef make_scheduler(CephContext *cct);
+OpSchedulerRef make_scheduler(
+  CephContext *cct, uint32_t num_shards, bool is_rotational);
 
 /**
  * Implements OpScheduler in terms of OpQueue
index 1b7407a57b90743710ac38338c7a4bf4596b0437..2b3619111bd3a8f85345c83276ac7e39e38ccb96 100644 (file)
@@ -16,6 +16,7 @@
 #include <memory>
 #include <functional>
 
+#include "include/stringify.h"
 #include "osd/scheduler/mClockScheduler.h"
 #include "common/dout.h"
 
@@ -25,20 +26,38 @@ using namespace std::placeholders;
 #define dout_context cct
 #define dout_subsys ceph_subsys_osd
 #undef dout_prefix
-#define dout_prefix *_dout
+#define dout_prefix *_dout << "mClockScheduler: "
 
 
 namespace ceph::osd::scheduler {
 
-mClockScheduler::mClockScheduler(CephContext *cct) :
-  scheduler(
-    std::bind(&mClockScheduler::ClientRegistry::get_info,
-             &client_registry,
-             _1),
-    dmc::AtLimit::Wait,
-    cct->_conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout"))
+mClockScheduler::mClockScheduler(CephContext *cct,
+  uint32_t num_shards,
+  bool is_rotational)
+  : cct(cct),
+    num_shards(num_shards),
+    is_rotational(is_rotational),
+    scheduler(
+      std::bind(&mClockScheduler::ClientRegistry::get_info,
+                &client_registry,
+                _1),
+      dmc::AtLimit::Wait,
+      cct->_conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout"))
 {
   cct->_conf.add_observer(this);
+  ceph_assert(num_shards > 0);
+  // Set default blocksize and cost for all op types.
+  for (op_type_t op_type = op_type_t::client_op;
+       op_type <= op_type_t::bg_pg_delete;
+       op_type = op_type_t(static_cast<size_t>(op_type) + 1)) {
+    client_cost_infos[op_type] = 4 * 1024;
+    client_scaled_cost_infos[op_type] = 1;
+  }
+  set_max_osd_capacity();
+  set_osd_mclock_cost_per_io();
+  mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile");
+  set_client_allocations();
+  enable_mclock_profile();
   client_registry.update_from_config(cct->_conf);
 }
 
@@ -86,6 +105,235 @@ const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info(
   }
 }
 
+void mClockScheduler::set_max_osd_capacity()
+{
+  if (cct->_conf.get_val<double>("osd_mclock_max_capacity_iops")) {
+    max_osd_capacity =
+      cct->_conf.get_val<double>("osd_mclock_max_capacity_iops");
+  } else {
+    if (is_rotational) {
+      max_osd_capacity =
+        cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_hdd");
+    } else {
+      max_osd_capacity =
+        cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd");
+    }
+  }
+  // Set per op-shard iops limit
+  max_osd_capacity /= num_shards;
+}
+
+void mClockScheduler::set_osd_mclock_cost_per_io()
+{
+  if (cct->_conf.get_val<uint64_t>("osd_mclock_cost_per_io_msec")) {
+    osd_mclock_cost_per_io_msec =
+      cct->_conf.get_val<uint64_t>("osd_mclock_cost_per_io_msec");
+  } else {
+    if (is_rotational) {
+      osd_mclock_cost_per_io_msec =
+        cct->_conf.get_val<uint64_t>("osd_mclock_cost_per_io_msec_hdd");
+    } else {
+      osd_mclock_cost_per_io_msec =
+        cct->_conf.get_val<uint64_t>("osd_mclock_cost_per_io_msec_ssd");
+    }
+  }
+}
+
+void mClockScheduler::set_client_allocations()
+{
+  // Set profile specific client capacity allocations
+  if (mclock_profile == "balanced") {
+    double capacity = std::round(0.5 * max_osd_capacity);
+    client_allocs[op_scheduler_class::client] = capacity;
+    client_allocs[op_scheduler_class::background_recovery] = capacity;
+  } else if (mclock_profile == "high_recovery_ops") {
+    client_allocs[op_scheduler_class::client] =
+      std::round(0.25 * max_osd_capacity);
+    client_allocs[op_scheduler_class::background_recovery] =
+      std::round(0.75 * max_osd_capacity);
+  } else if (mclock_profile == "high_client_ops") {
+    client_allocs[op_scheduler_class::client] =
+      std::round(0.75 * max_osd_capacity);
+    client_allocs[op_scheduler_class::background_recovery] =
+      std::round(0.25 * max_osd_capacity);
+  } else {
+    ceph_assert("Invalid mclock profile" == 0);
+    return;
+  }
+}
+
+double mClockScheduler::get_client_allocation(op_type_t op_type)
+{
+  double default_allocation = 1.0;
+
+  switch (op_type) {
+  case op_type_t::client_op:
+    return client_allocs[op_scheduler_class::client];
+  case op_type_t::bg_recovery:
+    return client_allocs[op_scheduler_class::background_recovery];
+  default:
+    // TODO for other op types.
+    return default_allocation;
+  }
+}
+
+void mClockScheduler::enable_mclock_profile()
+{
+  // Nothing to do for "custom" profile
+  if (mclock_profile == "custom") {
+    return;
+  }
+
+  // Set mclock and ceph config options for the chosen profile
+  if (mclock_profile == "balanced") {
+    set_balanced_profile_config();
+  } else if (mclock_profile == "high_recovery_ops") {
+    set_high_recovery_ops_profile_config();
+  } else if (mclock_profile == "high_client_ops") {
+    set_high_client_ops_profile_config();
+  } else {
+    ceph_assert("Invalid choice of mclock profile" == 0);
+    return;
+  }
+}
+
+std::string mClockScheduler::get_mclock_profile()
+{
+  return mclock_profile;
+}
+
+void mClockScheduler::set_balanced_profile_config()
+{
+  double client_lim = get_client_allocation(op_type_t::client_op);
+  double rec_lim = get_client_allocation(op_type_t::bg_recovery);
+  int client_wgt = 10;
+
+  // Set external client params
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_client_res", stringify(default_min));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_client_wgt", stringify(client_wgt));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_client_lim", stringify(client_lim));
+
+  // Set background recovery client params
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_background_recovery_res", stringify(default_min));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_background_recovery_wgt", stringify(default_min));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_background_recovery_lim", stringify(rec_lim));
+}
+
+void mClockScheduler::set_high_recovery_ops_profile_config()
+{
+  double client_lim = get_client_allocation(op_type_t::client_op);
+  double rec_lim = get_client_allocation(op_type_t::bg_recovery);
+  int rec_wgt = 10;
+
+  // Set external client params
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_client_res", stringify(default_min));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_client_wgt", stringify(default_min));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_client_lim", stringify(client_lim));
+
+  // Set background recovery client params
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_background_recovery_res", stringify(default_min));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_background_recovery_wgt", stringify(rec_wgt));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_background_recovery_lim", stringify(rec_lim));
+}
+
+void mClockScheduler::set_high_client_ops_profile_config()
+{
+  double client_lim = get_client_allocation(op_type_t::client_op);
+  double rec_lim = get_client_allocation(op_type_t::bg_recovery);
+  int client_wgt = 10;
+
+  // Set external client params
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_client_res", stringify(default_min));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_client_wgt", stringify(client_wgt));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_client_lim", stringify(client_lim));
+
+  // Set background recovery client params
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_background_recovery_res", stringify(default_min));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_background_recovery_wgt", stringify(default_min));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_background_recovery_lim", stringify(rec_lim));
+}
+
+int mClockScheduler::calc_scaled_cost(op_type_t op_type, int cost)
+{
+  double client_alloc = get_client_allocation(op_type);
+  if (client_alloc == 1.0) {
+    // Client not yet supported, return default cost.
+    return 1;
+  }
+
+  // Calculate bandwidth from max osd capacity (at 4KiB blocksize).
+  double max_osd_bandwidth = max_osd_capacity * num_shards * 4 * 1024;
+
+  // Calculate scaled cost based on item cost
+  double scaled_cost = (cost / max_osd_bandwidth) * client_alloc;
+
+  // Scale the cost down by an additional cost factor if specified
+  // to account for different device characteristics (hdd, ssd).
+  // This option can be used to tune the performance further if
+  // necessary (disabled by default).
+  if (osd_mclock_cost_per_io_msec > 0) {
+    scaled_cost *= osd_mclock_cost_per_io_msec / 1000.0;
+  }
+
+  return std::floor(scaled_cost);
+}
+
+bool mClockScheduler::maybe_update_client_cost_info(
+  op_type_t op_type, int new_cost)
+{
+  int capped_item_cost = 4 * 1024 * 1024;
+
+  if (new_cost == 0) {
+    return false;
+  }
+
+  // The mclock params, represented in terms of the per-OSD capacity, are
+  // scaled up or down according to the item cost and updated within the
+  // dmclock server.
+  int cur_cost = client_cost_infos[op_type];
+
+  // Note: Cap the scaling of item cost to ~4MiB as the tag increments
+  // beyond this point are too long causing performance issues. This may
+  // need to be in place until benchmark data is available or a better
+  // scaling model can be put in place. This is a TODO.
+  if (new_cost >= capped_item_cost) {
+    new_cost = capped_item_cost;
+  }
+
+  bool cost_changed =
+    ((new_cost >= (cur_cost << 1)) || (cur_cost >= (new_cost << 1)));
+
+  if (cost_changed) {
+    client_cost_infos[op_type] = new_cost;
+    // Update client scaled cost info
+    int scaled_cost = std::max(calc_scaled_cost(op_type, new_cost), 1);
+    if (scaled_cost != client_scaled_cost_infos[op_type]) {
+      client_scaled_cost_infos[op_type] = scaled_cost;
+      return true;
+    }
+  }
+
+  return false;
+}
+
 void mClockScheduler::dump(ceph::Formatter &f) const
 {
 }
@@ -93,8 +341,13 @@ void mClockScheduler::dump(ceph::Formatter &f) const
 void mClockScheduler::enqueue(OpSchedulerItem&& item)
 {
   auto id = get_scheduler_id(item);
-  // TODO: express cost, mclock params in terms of per-node capacity?
-  auto cost = 1; //std::max(item.get_cost(), 1);
+  auto op_type = item.get_op_type();
+  int cost = client_scaled_cost_infos[op_type];
+
+  // Re-calculate the scaled cost for the client if the item cost changed
+  if (maybe_update_client_cost_info(op_type, item.get_cost())) {
+    cost = client_scaled_cost_infos[op_type];
+  }
 
   // TODO: move this check into OpSchedulerItem, handle backwards compat
   if (op_scheduler_class::immediate == item.get_scheduler_class()) {
@@ -149,6 +402,13 @@ const char** mClockScheduler::get_tracked_conf_keys() const
     "osd_mclock_scheduler_background_best_effort_res",
     "osd_mclock_scheduler_background_best_effort_wgt",
     "osd_mclock_scheduler_background_best_effort_lim",
+    "osd_mclock_cost_per_io_msec",
+    "osd_mclock_cost_per_io_msec_hdd",
+    "osd_mclock_cost_per_io_msec_ssd",
+    "osd_mclock_max_capacity_iops",
+    "osd_mclock_max_capacity_iops_hdd",
+    "osd_mclock_max_capacity_iops_ssd",
+    "osd_mclock_profile",
     NULL
   };
   return KEYS;
index 2573afead3b362d6d18ff842b3bef8b59bdbf0d1..f6091a8734abd02a36f2890739c39923739a3f51 100644 (file)
 
 namespace ceph::osd::scheduler {
 
+constexpr uint64_t default_min = 1;
+constexpr uint64_t default_max = 999999;
+
 using client_id_t = uint64_t;
 using profile_id_t = uint64_t;
+using op_type_t = OpSchedulerItem::OpQueueable::op_type_t;
 
 struct client_profile_id_t {
   client_id_t client_id;
@@ -60,6 +64,15 @@ WRITE_CMP_OPERATORS_2(scheduler_id_t, class_id, client_profile_id)
  */
 class mClockScheduler : public OpScheduler, md_config_obs_t {
 
+  CephContext *cct;
+  const uint32_t num_shards;
+  bool is_rotational;
+  double max_osd_capacity;
+  uint64_t osd_mclock_cost_per_io_msec;
+  std::string mclock_profile = "balanced";
+  std::map<op_scheduler_class, double> client_allocs;
+  std::map<op_type_t, int> client_cost_infos;
+  std::map<op_type_t, int> client_scaled_cost_infos;
   class ClientRegistry {
     std::array<
       crimson::dmclock::ClientInfo,
@@ -101,7 +114,40 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
   }
 
 public:
-  mClockScheduler(CephContext *cct);
+  mClockScheduler(CephContext *cct, uint32_t num_shards, bool is_rotational);
+
+  // Set the max osd capacity in iops
+  void set_max_osd_capacity();
+
+  // Set the cost per io for the osd
+  void set_osd_mclock_cost_per_io();
+
+  // Set the mclock related config params based on the profile
+  void enable_mclock_profile();
+
+  // Get the active mclock profile
+  std::string get_mclock_profile();
+
+  // Set client capacity allocations based on profile
+  void set_client_allocations();
+
+  // Get client allocation
+  double get_client_allocation(op_type_t op_type);
+
+  // Set "balanced" profile parameters
+  void set_balanced_profile_config();
+
+  // Set "high_recovery_ops" profile parameters
+  void set_high_recovery_ops_profile_config();
+
+  // Set "high_client_ops" profile parameters
+  void set_high_client_ops_profile_config();
+
+  // Calculate scale cost per item
+  int calc_scaled_cost(op_type_t op_type, int cost);
+
+  // Update mclock client cost info
+  bool maybe_update_client_cost_info(op_type_t op_type, int new_cost);
 
   // Enqueue op in the back of the regular queue
   void enqueue(OpSchedulerItem &&item) final;
index 775dbb2f993433bacb57b255f4cd18a73395dbb9..a2e2f2a4edfeceaaf87c28f4d266628e70e763cc 100644 (file)
@@ -1,5 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 
+#include <chrono>
+
 #include "gtest/gtest.h"
 
 #include "global/global_context.h"
@@ -25,6 +27,8 @@ int main(int argc, char **argv) {
 
 class mClockSchedulerTest : public testing::Test {
 public:
+  uint32_t num_shards;
+  bool is_rotational;
   mClockScheduler q;
 
   uint64_t client1;
@@ -32,7 +36,9 @@ public:
   uint64_t client3;
 
   mClockSchedulerTest() :
-    q(g_ceph_context),
+    num_shards(1),
+    is_rotational(false),
+    q(g_ceph_context, num_shards, is_rotational),
     client1(1001),
     client2(9999),
     client3(100000001)
@@ -85,9 +91,10 @@ OpSchedulerItem get_item(WorkItem item)
 TEST_F(mClockSchedulerTest, TestEmpty) {
   ASSERT_TRUE(q.empty());
 
-  q.enqueue(create_item(100, client1, op_scheduler_class::client));
-  q.enqueue(create_item(102, client1, op_scheduler_class::client));
-  q.enqueue(create_item(104, client1, op_scheduler_class::client));
+  for (unsigned i = 100; i < 105; i+=2) {
+    q.enqueue(create_item(i, client1, op_scheduler_class::client));
+    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+  }
 
   ASSERT_FALSE(q.empty());
 
@@ -96,6 +103,7 @@ TEST_F(mClockSchedulerTest, TestEmpty) {
   reqs.push_back(get_item(q.dequeue()));
   reqs.push_back(get_item(q.dequeue()));
 
+  ASSERT_EQ(2u, reqs.size());
   ASSERT_FALSE(q.empty());
 
   for (auto &&i : reqs) {
@@ -114,11 +122,12 @@ TEST_F(mClockSchedulerTest, TestEmpty) {
 }
 
 TEST_F(mClockSchedulerTest, TestSingleClientOrderedEnqueueDequeue) {
-  q.enqueue(create_item(100, client1));
-  q.enqueue(create_item(101, client1));
-  q.enqueue(create_item(102, client1));
-  q.enqueue(create_item(103, client1));
-  q.enqueue(create_item(104, client1));
+  ASSERT_TRUE(q.empty());
+
+  for (unsigned i = 100; i < 105; ++i) {
+    q.enqueue(create_item(i, client1, op_scheduler_class::client));
+    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+  }
 
   auto r = get_item(q.dequeue());
   ASSERT_EQ(100u, r.get_map_epoch());