osd: Add mclock profile infrastructure and implement mclock profiles
author     Sridhar Seshasayee <sseshasa@redhat.com>
           Mon, 7 Dec 2020 11:09:57 +0000 (16:39 +0530)
committer  Sridhar Seshasayee <sseshasa@redhat.com>
           Sat, 16 Jan 2021 19:09:40 +0000 (00:39 +0530)
Define config options to specify the cost per IO for an OSD (HDD & SSD); a
sketch of how they are applied follows the list:
 - osd_mclock_cost_per_io_msec
 - osd_mclock_cost_per_io_msec_hdd
 - osd_mclock_cost_per_io_msec_ssd
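
A minimal standalone sketch (plain C++, not the Ceph config API) of the
override behaviour this patch implements in
mClockScheduler::set_osd_mclock_cost_per_io(): a non-zero generic value wins,
otherwise the device-specific variant is used.

  #include <cstdint>

  // Model of the cost-per-IO selection added in mClockScheduler.cc below.
  uint64_t effective_cost_per_io_msec(uint64_t generic,
                                      uint64_t hdd,
                                      uint64_t ssd,
                                      bool is_rotational) {
    if (generic) {
      return generic;  // osd_mclock_cost_per_io_msec overrides the _hdd/_ssd variants
    }
    return is_rotational ? hdd : ssd;
  }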

Define config options to set the max OSD capacity (HDD & SSD) to be allocated
among dmclock clients (sketched after the list), namely:
 - osd_mclock_max_capacity_iops
 - osd_mclock_max_capacity_iops_hdd
 - osd_mclock_max_capacity_iops_ssd
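
A standalone sketch of the selection and per-shard division performed by
mClockScheduler::set_max_osd_capacity() below; the function name and
parameters here are illustrative, not part of the patch.

  #include <cstdint>

  // Model of the per-shard capacity calculation added in mClockScheduler.cc below.
  double per_shard_capacity_iops(double generic_iops,
                                 double hdd_iops,
                                 double ssd_iops,
                                 bool is_rotational,
                                 uint32_t num_shards) {
    double max_osd_capacity =
      generic_iops > 0.0 ? generic_iops : (is_rotational ? hdd_iops : ssd_iops);
    return max_osd_capacity / num_shards;  // per op-shard IOPS limit
  }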

Define the config option "osd_mclock_profile" to specify the built-in profile
to enable.
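
For reference, a standalone model of the capacity split each built-in profile
applies (mirroring set_client_allocations() below). The "custom" profile
leaves all options to the administrator and is not modelled here.

  #include <cmath>
  #include <string>
  #include <utility>

  // {client, background_recovery} IOPS allocations per profile, as in the patch:
  // balanced = 50/50, high_recovery_ops = 25/75, high_client_ops = 75/25.
  std::pair<double, double> client_allocations(const std::string& profile,
                                               double max_osd_capacity) {
    if (profile == "high_recovery_ops") {
      return {std::round(0.25 * max_osd_capacity),
              std::round(0.75 * max_osd_capacity)};
    } else if (profile == "high_client_ops") {
      return {std::round(0.75 * max_osd_capacity),
              std::round(0.25 * max_osd_capacity)};
    }
    // "balanced" (the default; the patch asserts on unrecognized profiles).
    return {std::round(0.5 * max_osd_capacity),
            std::round(0.5 * max_osd_capacity)};
  }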

Also, pass the number of op shards used by the OSD to the mclock scheduler.
This is necessary to calculate the per-shard limits within the scheduler.
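
For illustration only (the shard count of 5 is a hypothetical value, not
something this patch configures), the default HDD capacity spread over five
op shards works out as follows:

  // Hypothetical per-shard limit: the default osd_mclock_max_capacity_iops_hdd
  // (10000.0) divided across 5 op shards.
  double per_shard_limit = 10000.0 / 5;  // 2000 IOPS per shard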

With the above information, enable the specified mclock profile by calling
the appropriate method to set the profile-specific mclock parameters and Ceph
options.
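
As an example of those profile-specific parameters, a standalone model of the
values the "balanced" profile pushes into the osd_mclock_scheduler_* options
(see set_balanced_profile_config() below): reservations stay at the default
minimum, the external client gets a weight of 10, and both classes are
limited to their capacity allocations. The struct and helper here are
illustrative, not part of the patch.

  #include <cstdint>
  #include <utility>

  struct QoSParams { uint64_t res, wgt, lim; };

  // {client, background_recovery} parameters applied by the "balanced" profile.
  std::pair<QoSParams, QoSParams> balanced_profile(double client_lim,
                                                   double rec_lim) {
    constexpr uint64_t default_min = 1;  // mirrors the constant added in mClockScheduler.h
    return {{default_min, 10, static_cast<uint64_t>(client_lim)},
            {default_min, default_min, static_cast<uint64_t>(rec_lim)}};
  }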

Prior to enqueuing an op, the scheduler scales the cost associated with the
OpSchedulerItem up or down. This calculation is based on the existing item
cost, the provided max OSD capacity, and an additional cost factor determined
by the underlying device type (HDD/SSD).
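
A standalone model of that calculation (mirroring calc_scaled_cost() in
mClockScheduler.cc below): the item cost is expressed as a fraction of the
OSD's bandwidth at a 4 KiB block size and scaled by the client's capacity
allocation, with the optional device-specific cost factor applied on top. The
caller later clamps the result to at least 1.

  #include <cmath>
  #include <cstdint>

  int scaled_cost(int item_cost,
                  double per_shard_iops,       // capacity already divided by num_shards
                  uint32_t num_shards,
                  double client_alloc,         // allocation for this op class
                  uint64_t cost_per_io_msec) { // 0 means "not configured"
    // Bandwidth derived from the max OSD capacity at a 4 KiB block size.
    double max_osd_bandwidth = per_shard_iops * num_shards * 4 * 1024;
    double cost = (item_cost / max_osd_bandwidth) * client_alloc;
    if (cost_per_io_msec > 0) {
      cost *= cost_per_io_msec / 1000.0;
    }
    return std::floor(cost);
  }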

Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
src/common/options.cc
src/osd/OSD.cc
src/osd/scheduler/OpScheduler.cc
src/osd/scheduler/OpScheduler.h
src/osd/scheduler/mClockScheduler.cc
src/osd/scheduler/mClockScheduler.h
src/test/osd/TestMClockScheduler.cc

index 0fa378de72a69d4274f31efef47d7aea8e40b9e4..4e26ae2481cd24d966c387e1bf1aeb726dcfb994 100644 (file)
@@ -2986,55 +2986,55 @@ std::vector<Option> get_global_options() {
     Option("osd_mclock_scheduler_client_res", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(1)
     .set_description("IO proportion reserved for each client (default)")
-    .set_long_description("Only considered for osd_op_queue = mClockScheduler")
+    .set_long_description("Only considered for osd_op_queue = mclock_scheduler")
     .add_see_also("osd_op_queue"),
 
     Option("osd_mclock_scheduler_client_wgt", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(1)
     .set_description("IO share for each client (default) over reservation")
-    .set_long_description("Only considered for osd_op_queue = mClockScheduler")
+    .set_long_description("Only considered for osd_op_queue = mclock_scheduler")
     .add_see_also("osd_op_queue"),
 
     Option("osd_mclock_scheduler_client_lim", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(999999)
     .set_description("IO limit for each client (default) over reservation")
-    .set_long_description("Only considered for osd_op_queue = mClockScheduler")
+    .set_long_description("Only considered for osd_op_queue = mclock_scheduler")
     .add_see_also("osd_op_queue"),
 
     Option("osd_mclock_scheduler_background_recovery_res", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(1)
     .set_description("IO proportion reserved for background recovery (default)")
-    .set_long_description("Only considered for osd_op_queue = mClockScheduler")
+    .set_long_description("Only considered for osd_op_queue = mclock_scheduler")
     .add_see_also("osd_op_queue"),
 
     Option("osd_mclock_scheduler_background_recovery_wgt", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(1)
     .set_description("IO share for each background recovery over reservation")
-    .set_long_description("Only considered for osd_op_queue = mClockScheduler")
+    .set_long_description("Only considered for osd_op_queue = mclock_scheduler")
     .add_see_also("osd_op_queue"),
 
     Option("osd_mclock_scheduler_background_recovery_lim", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(999999)
     .set_description("IO limit for background recovery over reservation")
-    .set_long_description("Only considered for osd_op_queue = mClockScheduler")
+    .set_long_description("Only considered for osd_op_queue = mclock_scheduler")
     .add_see_also("osd_op_queue"),
 
     Option("osd_mclock_scheduler_background_best_effort_res", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(1)
     .set_description("IO proportion reserved for background best_effort (default)")
-    .set_long_description("Only considered for osd_op_queue = mClockScheduler")
+    .set_long_description("Only considered for osd_op_queue = mclock_scheduler")
     .add_see_also("osd_op_queue"),
 
     Option("osd_mclock_scheduler_background_best_effort_wgt", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(1)
     .set_description("IO share for each background best_effort over reservation")
-    .set_long_description("Only considered for osd_op_queue = mClockScheduler")
+    .set_long_description("Only considered for osd_op_queue = mclock_scheduler")
     .add_see_also("osd_op_queue"),
 
     Option("osd_mclock_scheduler_background_best_effort_lim", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(999999)
     .set_description("IO limit for background best_effort over reservation")
-    .set_long_description("Only considered for osd_op_queue = mClockScheduler")
+    .set_long_description("Only considered for osd_op_queue = mclock_scheduler")
     .add_see_also("osd_op_queue"),
 
     Option("osd_mclock_scheduler_anticipation_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
@@ -3042,6 +3042,50 @@ std::vector<Option> get_global_options() {
     .set_description("mclock anticipation timeout in seconds")
     .set_long_description("the amount of time that mclock waits until the unused resource is forfeited"),
 
+    Option("osd_mclock_cost_per_io_msec", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description("Cost per IO in milliseconds to consider per OSD (overrides _ssd and _hdd if non-zero)")
+    .set_long_description("This option specifies the cost factor to consider in msec per OSD. This is considered by the mclock_scheduler to set an additional cost factor in QoS calculations. Only considered for osd_op_queue = mclock_scheduler")
+    .set_flag(Option::FLAG_RUNTIME),
+
+    Option("osd_mclock_cost_per_io_msec_hdd", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description("Cost per IO in milliseconds to consider per OSD (for rotational media)")
+    .set_long_description("This option specifies the cost factor to consider in msec per OSD for rotational device type. This is considered by the mclock_scheduler to set an additional cost factor in QoS calculations. Only considered for osd_op_queue = mclock_scheduler")
+    .set_flag(Option::FLAG_RUNTIME),
+
+    Option("osd_mclock_cost_per_io_msec_ssd", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description("Cost per IO in milliseconds to consider per OSD (for solid state media)")
+    .set_long_description("This option specifies the cost factor to consider in msec per OSD for solid state device type. This is considered by the mclock_scheduler to set an additional cost factor in QoS calculations. Only considered for osd_op_queue = mclock_scheduler")
+    .set_flag(Option::FLAG_RUNTIME),
+
+    Option("osd_mclock_max_capacity_iops", Option::TYPE_FLOAT, Option::LEVEL_BASIC)
+    .set_default(0.0)
+    .set_description("Max IOPs capacity (at 4KiB block size) to consider per OSD (overrides _ssd and _hdd if non-zero)")
+    .set_long_description("This option specifies the max osd capacity in iops per OSD. Helps in QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue = mclock_scheduler")
+    .set_flag(Option::FLAG_RUNTIME),
+
+    Option("osd_mclock_max_capacity_iops_hdd", Option::TYPE_FLOAT, Option::LEVEL_BASIC)
+    .set_default(10000.0)
+    .set_description("Max IOPs capacity (at 4KiB block size) to consider per OSD (for rotational media)")
+    .set_long_description("This option specifies the max OSD capacity in iops per OSD. Helps in QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue = mclock_scheduler")
+    .set_flag(Option::FLAG_RUNTIME),
+
+    Option("osd_mclock_max_capacity_iops_ssd", Option::TYPE_FLOAT, Option::LEVEL_BASIC)
+    .set_default(21500.0)
+    .set_description("Max IOPs capacity (at 4KiB block size) to consider per OSD (for solid state media)")
+    .set_long_description("This option specifies the max OSD capacity in iops per OSD. Helps in QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue = mclock_scheduler")
+    .set_flag(Option::FLAG_RUNTIME),
+
+    Option("osd_mclock_profile", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("balanced")
+    .set_enum_allowed( { "balanced", "high_recovery_ops", "high_client_ops", "custom" } )
+    .set_description("Which mclock profile to use")
+    .set_long_description("This option specifies the mclock profile to enable - one among the set of built-in profiles or a custom profile. Only considered for osd_op_queue = mclock_scheduler")
+    .set_flag(Option::FLAG_RUNTIME)
+    .add_see_also("osd_op_queue"),
+
     Option("osd_ignore_stale_divergent_priors", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
     .set_description(""),
index 61666c392376ae40f2c442742b737d4dcbe39f09..9060a5e39d5027238e3c4dbf440b4a9e1408ecb2 100644 (file)
@@ -10544,7 +10544,8 @@ OSDShard::OSDShard(
     osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
     shard_lock_name(shard_name + "::shard_lock"),
     shard_lock{make_mutex(shard_lock_name)},
-    scheduler(ceph::osd::scheduler::make_scheduler(cct)),
+    scheduler(ceph::osd::scheduler::make_scheduler(
+      cct, osd->num_shards, osd->store->is_rotational())),
     context_queue(sdata_wait_lock, sdata_cond)
 {
   dout(0) << "using op scheduler " << *scheduler << dendl;
index 2f656966bb4d1049296ae85690828e0e914f5367..3ce6fdb55d17323f78333905fda81b858c28d174 100644 (file)
@@ -21,7 +21,8 @@
 
 namespace ceph::osd::scheduler {
 
-OpSchedulerRef make_scheduler(CephContext *cct)
+OpSchedulerRef make_scheduler(
+  CephContext *cct, uint32_t num_shards, bool is_rotational)
 {
   const std::string *type = &cct->_conf->osd_op_queue;
   if (*type == "debug_random") {
@@ -41,7 +42,7 @@ OpSchedulerRef make_scheduler(CephContext *cct)
        cct->_conf->osd_op_pq_min_cost
     );
   } else if (*type == "mclock_scheduler") {
-    return std::make_unique<mClockScheduler>(cct);
+    return std::make_unique<mClockScheduler>(cct, num_shards, is_rotational);
   } else {
     ceph_assert("Invalid choice of wq" == 0);
   }
index 0a17118f02f4b0c97f6052b4cad0003289685303..0c647c95114bd56df8b44651ff892f947040b1d1 100644 (file)
@@ -57,7 +57,8 @@ public:
 std::ostream &operator<<(std::ostream &lhs, const OpScheduler &);
 using OpSchedulerRef = std::unique_ptr<OpScheduler>;
 
-OpSchedulerRef make_scheduler(CephContext *cct);
+OpSchedulerRef make_scheduler(
+  CephContext *cct, uint32_t num_shards, bool is_rotational);
 
 /**
  * Implements OpScheduler in terms of OpQueue
index 1b7407a57b90743710ac38338c7a4bf4596b0437..2b3619111bd3a8f85345c83276ac7e39e38ccb96 100644 (file)
@@ -16,6 +16,7 @@
 #include <memory>
 #include <functional>
 
+#include "include/stringify.h"
 #include "osd/scheduler/mClockScheduler.h"
 #include "common/dout.h"
 
@@ -25,20 +26,38 @@ using namespace std::placeholders;
 #define dout_context cct
 #define dout_subsys ceph_subsys_osd
 #undef dout_prefix
-#define dout_prefix *_dout
+#define dout_prefix *_dout << "mClockScheduler: "
 
 
 namespace ceph::osd::scheduler {
 
-mClockScheduler::mClockScheduler(CephContext *cct) :
-  scheduler(
-    std::bind(&mClockScheduler::ClientRegistry::get_info,
-             &client_registry,
-             _1),
-    dmc::AtLimit::Wait,
-    cct->_conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout"))
+mClockScheduler::mClockScheduler(CephContext *cct,
+  uint32_t num_shards,
+  bool is_rotational)
+  : cct(cct),
+    num_shards(num_shards),
+    is_rotational(is_rotational),
+    scheduler(
+      std::bind(&mClockScheduler::ClientRegistry::get_info,
+                &client_registry,
+                _1),
+      dmc::AtLimit::Wait,
+      cct->_conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout"))
 {
   cct->_conf.add_observer(this);
+  ceph_assert(num_shards > 0);
+  // Set default blocksize and cost for all op types.
+  for (op_type_t op_type = op_type_t::client_op;
+       op_type <= op_type_t::bg_pg_delete;
+       op_type = op_type_t(static_cast<size_t>(op_type) + 1)) {
+    client_cost_infos[op_type] = 4 * 1024;
+    client_scaled_cost_infos[op_type] = 1;
+  }
+  set_max_osd_capacity();
+  set_osd_mclock_cost_per_io();
+  mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile");
+  set_client_allocations();
+  enable_mclock_profile();
   client_registry.update_from_config(cct->_conf);
 }
 
@@ -86,6 +105,235 @@ const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info(
   }
 }
 
+void mClockScheduler::set_max_osd_capacity()
+{
+  if (cct->_conf.get_val<double>("osd_mclock_max_capacity_iops")) {
+    max_osd_capacity =
+      cct->_conf.get_val<double>("osd_mclock_max_capacity_iops");
+  } else {
+    if (is_rotational) {
+      max_osd_capacity =
+        cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_hdd");
+    } else {
+      max_osd_capacity =
+        cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd");
+    }
+  }
+  // Set per op-shard iops limit
+  max_osd_capacity /= num_shards;
+}
+
+void mClockScheduler::set_osd_mclock_cost_per_io()
+{
+  if (cct->_conf.get_val<uint64_t>("osd_mclock_cost_per_io_msec")) {
+    osd_mclock_cost_per_io_msec =
+      cct->_conf.get_val<uint64_t>("osd_mclock_cost_per_io_msec");
+  } else {
+    if (is_rotational) {
+      osd_mclock_cost_per_io_msec =
+        cct->_conf.get_val<uint64_t>("osd_mclock_cost_per_io_msec_hdd");
+    } else {
+      osd_mclock_cost_per_io_msec =
+        cct->_conf.get_val<uint64_t>("osd_mclock_cost_per_io_msec_ssd");
+    }
+  }
+}
+
+void mClockScheduler::set_client_allocations()
+{
+  // Set profile specific client capacity allocations
+  if (mclock_profile == "balanced") {
+    double capacity = std::round(0.5 * max_osd_capacity);
+    client_allocs[op_scheduler_class::client] = capacity;
+    client_allocs[op_scheduler_class::background_recovery] = capacity;
+  } else if (mclock_profile == "high_recovery_ops") {
+    client_allocs[op_scheduler_class::client] =
+      std::round(0.25 * max_osd_capacity);
+    client_allocs[op_scheduler_class::background_recovery] =
+      std::round(0.75 * max_osd_capacity);
+  } else if (mclock_profile == "high_client_ops") {
+    client_allocs[op_scheduler_class::client] =
+      std::round(0.75 * max_osd_capacity);
+    client_allocs[op_scheduler_class::background_recovery] =
+      std::round(0.25 * max_osd_capacity);
+  } else {
+    ceph_assert("Invalid mclock profile" == 0);
+    return;
+  }
+}
+
+double mClockScheduler::get_client_allocation(op_type_t op_type)
+{
+  double default_allocation = 1.0;
+
+  switch (op_type) {
+  case op_type_t::client_op:
+    return client_allocs[op_scheduler_class::client];
+  case op_type_t::bg_recovery:
+    return client_allocs[op_scheduler_class::background_recovery];
+  default:
+    // TODO for other op types.
+    return default_allocation;
+  }
+}
+
+void mClockScheduler::enable_mclock_profile()
+{
+  // Nothing to do for "custom" profile
+  if (mclock_profile == "custom") {
+    return;
+  }
+
+  // Set mclock and ceph config options for the chosen profile
+  if (mclock_profile == "balanced") {
+    set_balanced_profile_config();
+  } else if (mclock_profile == "high_recovery_ops") {
+    set_high_recovery_ops_profile_config();
+  } else if (mclock_profile == "high_client_ops") {
+    set_high_client_ops_profile_config();
+  } else {
+    ceph_assert("Invalid choice of mclock profile" == 0);
+    return;
+  }
+}
+
+std::string mClockScheduler::get_mclock_profile()
+{
+  return mclock_profile;
+}
+
+void mClockScheduler::set_balanced_profile_config()
+{
+  double client_lim = get_client_allocation(op_type_t::client_op);
+  double rec_lim = get_client_allocation(op_type_t::bg_recovery);
+  int client_wgt = 10;
+
+  // Set external client params
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_client_res", stringify(default_min));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_client_wgt", stringify(client_wgt));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_client_lim", stringify(client_lim));
+
+  // Set background recovery client params
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_background_recovery_res", stringify(default_min));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_background_recovery_wgt", stringify(default_min));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_background_recovery_lim", stringify(rec_lim));
+}
+
+void mClockScheduler::set_high_recovery_ops_profile_config()
+{
+  double client_lim = get_client_allocation(op_type_t::client_op);
+  double rec_lim = get_client_allocation(op_type_t::bg_recovery);
+  int rec_wgt = 10;
+
+  // Set external client params
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_client_res", stringify(default_min));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_client_wgt", stringify(default_min));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_client_lim", stringify(client_lim));
+
+  // Set background recovery client params
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_background_recovery_res", stringify(default_min));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_background_recovery_wgt", stringify(rec_wgt));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_background_recovery_lim", stringify(rec_lim));
+}
+
+void mClockScheduler::set_high_client_ops_profile_config()
+{
+  double client_lim = get_client_allocation(op_type_t::client_op);
+  double rec_lim = get_client_allocation(op_type_t::bg_recovery);
+  int client_wgt = 10;
+
+  // Set external client params
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_client_res", stringify(default_min));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_client_wgt", stringify(client_wgt));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_client_lim", stringify(client_lim));
+
+  // Set background recovery client params
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_background_recovery_res", stringify(default_min));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_background_recovery_wgt", stringify(default_min));
+  cct->_conf.set_val(
+    "osd_mclock_scheduler_background_recovery_lim", stringify(rec_lim));
+}
+
+int mClockScheduler::calc_scaled_cost(op_type_t op_type, int cost)
+{
+  double client_alloc = get_client_allocation(op_type);
+  if (client_alloc == 1.0) {
+    // Client not yet supported, return default cost.
+    return 1;
+  }
+
+  // Calculate bandwidth from max osd capacity (at 4KiB blocksize).
+  double max_osd_bandwidth = max_osd_capacity * num_shards * 4 * 1024;
+
+  // Calculate scaled cost based on item cost
+  double scaled_cost = (cost / max_osd_bandwidth) * client_alloc;
+
+  // Scale the cost down by an additional cost factor if specified
+  // to account for different device characteristics (hdd, ssd).
+  // This option can be used to tune the performance further if
+  // necessary (disabled by default).
+  if (osd_mclock_cost_per_io_msec > 0) {
+    scaled_cost *= osd_mclock_cost_per_io_msec / 1000.0;
+  }
+
+  return std::floor(scaled_cost);
+}
+
+bool mClockScheduler::maybe_update_client_cost_info(
+  op_type_t op_type, int new_cost)
+{
+  int capped_item_cost = 4 * 1024 * 1024;
+
+  if (new_cost == 0) {
+    return false;
+  }
+
+  // The mclock params, represented in terms of the per-OSD capacity, are
+  // scaled up or down according to the item cost and updated within the
+  // dmclock server.
+  int cur_cost = client_cost_infos[op_type];
+
+  // Note: Cap the scaling of item cost to ~4MiB as the tag increments
+  // beyond this point are too long causing performance issues. This may
+  // need to be in place until benchmark data is available or a better
+  // scaling model can be put in place. This is a TODO.
+  if (new_cost >= capped_item_cost) {
+    new_cost = capped_item_cost;
+  }
+
+  bool cost_changed =
+    ((new_cost >= (cur_cost << 1)) || (cur_cost >= (new_cost << 1)));
+
+  if (cost_changed) {
+    client_cost_infos[op_type] = new_cost;
+    // Update client scaled cost info
+    int scaled_cost = std::max(calc_scaled_cost(op_type, new_cost), 1);
+    if (scaled_cost != client_scaled_cost_infos[op_type]) {
+      client_scaled_cost_infos[op_type] = scaled_cost;
+      return true;
+    }
+  }
+
+  return false;
+}
+
 void mClockScheduler::dump(ceph::Formatter &f) const
 {
 }
@@ -93,8 +341,13 @@ void mClockScheduler::dump(ceph::Formatter &f) const
 void mClockScheduler::enqueue(OpSchedulerItem&& item)
 {
   auto id = get_scheduler_id(item);
-  // TODO: express cost, mclock params in terms of per-node capacity?
-  auto cost = 1; //std::max(item.get_cost(), 1);
+  auto op_type = item.get_op_type();
+  int cost = client_scaled_cost_infos[op_type];
+
+  // Re-calculate the scaled cost for the client if the item cost changed
+  if (maybe_update_client_cost_info(op_type, item.get_cost())) {
+    cost = client_scaled_cost_infos[op_type];
+  }
 
   // TODO: move this check into OpSchedulerItem, handle backwards compat
   if (op_scheduler_class::immediate == item.get_scheduler_class()) {
@@ -149,6 +402,13 @@ const char** mClockScheduler::get_tracked_conf_keys() const
     "osd_mclock_scheduler_background_best_effort_res",
     "osd_mclock_scheduler_background_best_effort_wgt",
     "osd_mclock_scheduler_background_best_effort_lim",
+    "osd_mclock_cost_per_io_msec",
+    "osd_mclock_cost_per_io_msec_hdd",
+    "osd_mclock_cost_per_io_msec_ssd",
+    "osd_mclock_max_capacity_iops",
+    "osd_mclock_max_capacity_iops_hdd",
+    "osd_mclock_max_capacity_iops_ssd",
+    "osd_mclock_profile",
     NULL
   };
   return KEYS;
index 2573afead3b362d6d18ff842b3bef8b59bdbf0d1..f6091a8734abd02a36f2890739c39923739a3f51 100644 (file)
 
 namespace ceph::osd::scheduler {
 
+constexpr uint64_t default_min = 1;
+constexpr uint64_t default_max = 999999;
+
 using client_id_t = uint64_t;
 using profile_id_t = uint64_t;
+using op_type_t = OpSchedulerItem::OpQueueable::op_type_t;
 
 struct client_profile_id_t {
   client_id_t client_id;
@@ -60,6 +64,15 @@ WRITE_CMP_OPERATORS_2(scheduler_id_t, class_id, client_profile_id)
  */
 class mClockScheduler : public OpScheduler, md_config_obs_t {
 
+  CephContext *cct;
+  const uint32_t num_shards;
+  bool is_rotational;
+  double max_osd_capacity;
+  uint64_t osd_mclock_cost_per_io_msec;
+  std::string mclock_profile = "balanced";
+  std::map<op_scheduler_class, double> client_allocs;
+  std::map<op_type_t, int> client_cost_infos;
+  std::map<op_type_t, int> client_scaled_cost_infos;
   class ClientRegistry {
     std::array<
       crimson::dmclock::ClientInfo,
@@ -101,7 +114,40 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
   }
 
 public:
-  mClockScheduler(CephContext *cct);
+  mClockScheduler(CephContext *cct, uint32_t num_shards, bool is_rotational);
+
+  // Set the max osd capacity in iops
+  void set_max_osd_capacity();
+
+  // Set the cost per io for the osd
+  void set_osd_mclock_cost_per_io();
+
+  // Set the mclock related config params based on the profile
+  void enable_mclock_profile();
+
+  // Get the active mclock profile
+  std::string get_mclock_profile();
+
+  // Set client capacity allocations based on profile
+  void set_client_allocations();
+
+  // Get client allocation
+  double get_client_allocation(op_type_t op_type);
+
+  // Set "balanced" profile parameters
+  void set_balanced_profile_config();
+
+  // Set "high_recovery_ops" profile parameters
+  void set_high_recovery_ops_profile_config();
+
+  // Set "high_client_ops" profile parameters
+  void set_high_client_ops_profile_config();
+
+  // Calculate scale cost per item
+  int calc_scaled_cost(op_type_t op_type, int cost);
+
+  // Update mclock client cost info
+  bool maybe_update_client_cost_info(op_type_t op_type, int new_cost);
 
   // Enqueue op in the back of the regular queue
   void enqueue(OpSchedulerItem &&item) final;
index 775dbb2f993433bacb57b255f4cd18a73395dbb9..a2e2f2a4edfeceaaf87c28f4d266628e70e763cc 100644 (file)
@@ -1,5 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 
+#include <chrono>
+
 #include "gtest/gtest.h"
 
 #include "global/global_context.h"
@@ -25,6 +27,8 @@ int main(int argc, char **argv) {
 
 class mClockSchedulerTest : public testing::Test {
 public:
+  uint32_t num_shards;
+  bool is_rotational;
   mClockScheduler q;
 
   uint64_t client1;
@@ -32,7 +36,9 @@ public:
   uint64_t client3;
 
   mClockSchedulerTest() :
-    q(g_ceph_context),
+    num_shards(1),
+    is_rotational(false),
+    q(g_ceph_context, num_shards, is_rotational),
     client1(1001),
     client2(9999),
     client3(100000001)
@@ -85,9 +91,10 @@ OpSchedulerItem get_item(WorkItem item)
 TEST_F(mClockSchedulerTest, TestEmpty) {
   ASSERT_TRUE(q.empty());
 
-  q.enqueue(create_item(100, client1, op_scheduler_class::client));
-  q.enqueue(create_item(102, client1, op_scheduler_class::client));
-  q.enqueue(create_item(104, client1, op_scheduler_class::client));
+  for (unsigned i = 100; i < 105; i+=2) {
+    q.enqueue(create_item(i, client1, op_scheduler_class::client));
+    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+  }
 
   ASSERT_FALSE(q.empty());
 
@@ -96,6 +103,7 @@ TEST_F(mClockSchedulerTest, TestEmpty) {
   reqs.push_back(get_item(q.dequeue()));
   reqs.push_back(get_item(q.dequeue()));
 
+  ASSERT_EQ(2u, reqs.size());
   ASSERT_FALSE(q.empty());
 
   for (auto &&i : reqs) {
@@ -114,11 +122,12 @@ TEST_F(mClockSchedulerTest, TestEmpty) {
 }
 
 TEST_F(mClockSchedulerTest, TestSingleClientOrderedEnqueueDequeue) {
-  q.enqueue(create_item(100, client1));
-  q.enqueue(create_item(101, client1));
-  q.enqueue(create_item(102, client1));
-  q.enqueue(create_item(103, client1));
-  q.enqueue(create_item(104, client1));
+  ASSERT_TRUE(q.empty());
+
+  for (unsigned i = 100; i < 105; ++i) {
+    q.enqueue(create_item(i, client1, op_scheduler_class::client));
+    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+  }
 
   auto r = get_item(q.dequeue());
   ASSERT_EQ(100u, r.get_map_epoch());