osd: Fix the OpSchedulerItem cost scaling calculation.

author Sridhar Seshasayee <sseshasa@redhat.com>

Wed, 27 Jan 2021 13:09:57 +0000 (18:39 +0530)

committer Sridhar Seshasayee <sseshasa@redhat.com>

Mon, 22 Feb 2021 05:56:04 +0000 (11:26 +0530)
author Sridhar Seshasayee <sseshasa@redhat.com>
Wed, 27 Jan 2021 13:09:57 +0000 (18:39 +0530)
committer Sridhar Seshasayee <sseshasa@redhat.com>
Mon, 22 Feb 2021 05:56:04 +0000 (11:26 +0530)
diff --git a/src/osd/scheduler/mClockScheduler.cc b/src/osd/scheduler/mClockScheduler.cc

index d00a8aafbaa8a517c9ad03d901723fe8de1a3513..979311d8c7b4f066d30ff78a29c4193b162382ec 100644 (file)
--- a/src/osd/scheduler/mClockScheduler.cc
+++ b/src/osd/scheduler/mClockScheduler.cc
@@ -45,13 +45,6 @@ mClockScheduler::mClockScheduler(CephContext *cct,
  {
    cct->_conf.add_observer(this);
    ceph_assert(num_shards > 0);
-  // Set default blocksize and cost for all op types.
-  for (op_type_t op_type = op_type_t::client_op;
-       op_type <= op_type_t::bg_pg_delete;
-       op_type = op_type_t(static_cast<size_t>(op_type) + 1)) {
-    client_cost_infos[op_type] = 4 * 1024;
-    client_scaled_cost_infos[op_type] = 1;
-  }
    set_max_osd_capacity();
    set_osd_mclock_cost_per_io();
    set_mclock_profile();
@@ -117,6 +110,8 @@ void mClockScheduler::set_max_osd_capacity()
          cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd");
      }
    }
+  // Set max osd bandwidth across all shards (at 4KiB blocksize)
+  max_osd_bandwidth = max_osd_capacity * 4 * 1024;
    // Set per op-shard iops limit
    max_osd_capacity /= num_shards;
  }
@@ -363,19 +358,10 @@ void mClockScheduler::set_global_recovery_options()
    cct->_conf.apply_changes(nullptr);
  }
  
-int mClockScheduler::calc_scaled_cost(op_type_t op_type, int cost)
+int mClockScheduler::calc_scaled_cost(int cost)
  {
-  double client_alloc = get_client_allocation(op_type);
-  if (client_alloc == 1.0) {
-    // Client not yet supported, return default cost.
-    return 1;
-  }
-
-  // Calculate bandwidth from max osd capacity (at 4KiB blocksize).
-  double max_osd_bandwidth = max_osd_capacity * num_shards * 4 * 1024;
-
-  // Calculate scaled cost based on item cost
-  double scaled_cost = (cost / max_osd_bandwidth) * client_alloc;
+  // Calculate scaled cost in msecs based on item cost
+  int scaled_cost = std::floor((cost / max_osd_bandwidth) * 1000);
  
    // Scale the cost down by an additional cost factor if specified
    // to account for different device characteristics (hdd, ssd).
@@ -385,45 +371,7 @@ int mClockScheduler::calc_scaled_cost(op_type_t op_type, int cost)
      scaled_cost *= osd_mclock_cost_per_io_msec / 1000.0;
    }
  
-  return std::floor(scaled_cost);
-}
-
-bool mClockScheduler::maybe_update_client_cost_info(
-  op_type_t op_type, int new_cost)
-{
-  int capped_item_cost = 4 * 1024 * 1024;
-
-  if (new_cost == 0) {
-    return false;
-  }
-
-  // The mclock params represented in terms of the per-osd capacity
-  // are scaled up or down according to the cost associated with
-  // item cost and updated within the dmclock server.
-  int cur_cost = client_cost_infos[op_type];
-
-  // Note: Cap the scaling of item cost to ~4MiB as the tag increments
-  // beyond this point are too long causing performance issues. This may
-  // need to be in place until benchmark data is available or a better
-  // scaling model can be put in place. This is a TODO.
-  if (new_cost >= capped_item_cost) {
-    new_cost = capped_item_cost;
-  }
-
-  bool cost_changed =
-    ((new_cost >= (cur_cost << 1)) || (cur_cost >= (new_cost << 1)));
-
-  if (cost_changed) {
-    client_cost_infos[op_type] = new_cost;
-    // Update client scaled cost info
-    int scaled_cost = std::max(calc_scaled_cost(op_type, new_cost), 1);
-    if (scaled_cost != client_scaled_cost_infos[op_type]) {
-      client_scaled_cost_infos[op_type] = scaled_cost;
-      return true;
-    }
-  }
-
-  return false;
+  return std::max(scaled_cost, 1);
  }
  
  void mClockScheduler::dump(ceph::Formatter &f) const
@@ -433,18 +381,13 @@ void mClockScheduler::dump(ceph::Formatter &f) const
  void mClockScheduler::enqueue(OpSchedulerItem&& item)
  {
    auto id = get_scheduler_id(item);
-  auto op_type = item.get_op_type();
-  int cost = client_scaled_cost_infos[op_type];
-
-  // Re-calculate the scaled cost for the client if the item cost changed
-  if (maybe_update_client_cost_info(op_type, item.get_cost())) {
-    cost = client_scaled_cost_infos[op_type];
-  }
  
    // TODO: move this check into OpSchedulerItem, handle backwards compat
-  if (op_scheduler_class::immediate == item.get_scheduler_class()) {
+  if (op_scheduler_class::immediate == id.class_id) {
      immediate.push_front(std::move(item));
    } else {
+    int cost = calc_scaled_cost(item.get_cost());
+    // Add item to scheduler queue
      scheduler.add_request(
        std::move(item),
        id,
diff --git a/src/osd/scheduler/mClockScheduler.h b/src/osd/scheduler/mClockScheduler.h

index aa228339845d3deb872daf3254f98b785e7f60e9..8727af6179833379e0004a2fd6c9964824e330d7 100644 (file)
--- a/src/osd/scheduler/mClockScheduler.h
+++ b/src/osd/scheduler/mClockScheduler.h
@@ -38,7 +38,6 @@ constexpr uint64_t default_max = 999999;
  
  using client_id_t = uint64_t;
  using profile_id_t = uint64_t;
-using op_type_t = OpSchedulerItem::OpQueueable::op_type_t;
  
  struct client_profile_id_t {
    client_id_t client_id;
@@ -68,6 +67,7 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
    const uint32_t num_shards;
    bool is_rotational;
    double max_osd_capacity;
+  double max_osd_bandwidth;
    uint64_t osd_mclock_cost_per_io_msec;
    std::string mclock_profile = "high_client_ops";
    struct ClientAllocs {
@@ -95,8 +95,6 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
      ClientAllocs(1, 1, 1), // immediate (not used)
      ClientAllocs(1, 1, 1)  // client
    };
-  std::map<op_type_t, int> client_cost_infos;
-  std::map<op_type_t, int> client_scaled_cost_infos;
    class ClientRegistry {
      std::array<
        crimson::dmclock::ClientInfo,
@@ -172,10 +170,7 @@ public:
    void set_global_recovery_options();
  
    // Calculate scale cost per item
-  int calc_scaled_cost(op_type_t op_type, int cost);
-
-  // Update mclock client cost info
-  bool maybe_update_client_cost_info(op_type_t op_type, int new_cost);
+  int calc_scaled_cost(int cost);
  
    // Enqueue op in the back of the regular queue
    void enqueue(OpSchedulerItem &&item) final;
diff --git a/src/test/osd/TestMClockScheduler.cc b/src/test/osd/TestMClockScheduler.cc

index a2e2f2a4edfeceaaf87c28f4d266628e70e763cc..0feb427ec1016fb6211ad9a6b5578b2e013e0d10 100644 (file)
--- a/src/test/osd/TestMClockScheduler.cc
+++ b/src/test/osd/TestMClockScheduler.cc
@@ -93,7 +93,7 @@ TEST_F(mClockSchedulerTest, TestEmpty) {
  
    for (unsigned i = 100; i < 105; i+=2) {
      q.enqueue(create_item(i, client1, op_scheduler_class::client));
-    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    std::this_thread::sleep_for(std::chrono::microseconds(1));
    }
  
    ASSERT_FALSE(q.empty());
@@ -126,7 +126,7 @@ TEST_F(mClockSchedulerTest, TestSingleClientOrderedEnqueueDequeue) {
  
    for (unsigned i = 100; i < 105; ++i) {
      q.enqueue(create_item(i, client1, op_scheduler_class::client));
-    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    std::this_thread::sleep_for(std::chrono::microseconds(1));
    }
  
    auto r = get_item(q.dequeue());
@@ -150,6 +150,7 @@ TEST_F(mClockSchedulerTest, TestMultiClientOrderedEnqueueDequeue) {
    for (unsigned i = 0; i < NUM; ++i) {
      for (auto &&c: {client1, client2, client3}) {
        q.enqueue(create_item(i, c));
+      std::this_thread::sleep_for(std::chrono::microseconds(1));
      }
    }
author	Sridhar Seshasayee <sseshasa@redhat.com>
	Wed, 27 Jan 2021 13:09:57 +0000 (18:39 +0530)
committer	Sridhar Seshasayee <sseshasa@redhat.com>
	Mon, 22 Feb 2021 05:56:04 +0000 (11:26 +0530)
src/osd/scheduler/mClockScheduler.cc		patch | blob | history
src/osd/scheduler/mClockScheduler.h		patch | blob | history
src/test/osd/TestMClockScheduler.cc		patch | blob | history