From aebd9b71f29031dfb68964046af994bd89fea36b Mon Sep 17 00:00:00 2001
From: Sridhar Seshasayee <sseshasa@redhat.com>
Date: Wed, 27 Jan 2021 18:39:57 +0530
Subject: [PATCH] osd: Fix the OpSchedulerItem cost scaling calculation.

Calculate the scaled cost of an OpSchedulerItem in msec based on the
overall OSD capacity (across all shards). Remove the logic that cached
the cost of the previous op for re-use by the next op, as it does not
appear to provide any performance benefit.

Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
---
 src/osd/scheduler/mClockScheduler.cc | 75 ++++------------------------
 src/osd/scheduler/mClockScheduler.h  |  9 +---
 src/test/osd/TestMClockScheduler.cc  |  5 +-
 3 files changed, 14 insertions(+), 75 deletions(-)
diff --git a/src/osd/scheduler/mClockScheduler.cc b/src/osd/scheduler/mClockScheduler.cc
index d00a8aafbaa..979311d8c7b 100644
--- a/src/osd/scheduler/mClockScheduler.cc
+++ b/src/osd/scheduler/mClockScheduler.cc
@@ -45,13 +45,6 @@ mClockScheduler::mClockScheduler(CephContext *cct,
 {
   cct->_conf.add_observer(this);
   ceph_assert(num_shards > 0);
-  // Set default blocksize and cost for all op types.
-  for (op_type_t op_type = op_type_t::client_op;
-       op_type <= op_type_t::bg_pg_delete;
-       op_type = op_type_t(static_cast<size_t>(op_type) + 1)) {
-    client_cost_infos[op_type] = 4 * 1024;
-    client_scaled_cost_infos[op_type] = 1;
-  }
   set_max_osd_capacity();
   set_osd_mclock_cost_per_io();
   set_mclock_profile();
@@ -117,6 +110,8 @@ void mClockScheduler::set_max_osd_capacity()
         cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd");
     }
   }
+  // Set max osd bandwidth across all shards (at 4KiB blocksize)
+  max_osd_bandwidth = max_osd_capacity * 4 * 1024;
   // Set per op-shard iops limit
   max_osd_capacity /= num_shards;
 }
@@ -363,19 +358,10 @@ void mClockScheduler::set_global_recovery_options()
   cct->_conf.apply_changes(nullptr);
 }
 
-int mClockScheduler::calc_scaled_cost(op_type_t op_type, int cost)
+int mClockScheduler::calc_scaled_cost(int cost)
 {
-  double client_alloc = get_client_allocation(op_type);
-  if (client_alloc == 1.0) {
-    // Client not yet supported, return default cost.
-    return 1;
-  }
-
-  // Calculate bandwidth from max osd capacity (at 4KiB blocksize).
-  double max_osd_bandwidth = max_osd_capacity * num_shards * 4 * 1024;
-
-  // Calculate scaled cost based on item cost
-  double scaled_cost = (cost / max_osd_bandwidth) * client_alloc;
+  // Calculate scaled cost in msecs based on item cost
+  int scaled_cost = std::floor((cost / max_osd_bandwidth) * 1000);
 
   // Scale the cost down by an additional cost factor if specified
   // to account for different device characteristics (hdd, ssd).
@@ -385,45 +371,7 @@ int mClockScheduler::calc_scaled_cost(op_type_t op_type, int cost)
     scaled_cost *= osd_mclock_cost_per_io_msec / 1000.0;
   }
 
-  return std::floor(scaled_cost);
-}
-
-bool mClockScheduler::maybe_update_client_cost_info(
-  op_type_t op_type, int new_cost)
-{
-  int capped_item_cost = 4 * 1024 * 1024;
-
-  if (new_cost == 0) {
-    return false;
-  }
-
-  // The mclock params represented in terms of the per-osd capacity
-  // are scaled up or down according to the cost associated with
-  // item cost and updated within the dmclock server.
-  int cur_cost = client_cost_infos[op_type];
-
-  // Note: Cap the scaling of item cost to ~4MiB as the tag increments
-  // beyond this point are too long causing performance issues. This may
-  // need to be in place until benchmark data is available or a better
-  // scaling model can be put in place. This is a TODO.
-  if (new_cost >= capped_item_cost) {
-    new_cost = capped_item_cost;
-  }
-
-  bool cost_changed =
-    ((new_cost >= (cur_cost << 1)) || (cur_cost >= (new_cost << 1)));
-
-  if (cost_changed) {
-    client_cost_infos[op_type] = new_cost;
-    // Update client scaled cost info
-    int scaled_cost = std::max(calc_scaled_cost(op_type, new_cost), 1);
-    if (scaled_cost != client_scaled_cost_infos[op_type]) {
-      client_scaled_cost_infos[op_type] = scaled_cost;
-      return true;
-    }
-  }
-
-  return false;
+  return std::max(scaled_cost, 1);
 }
 
 void mClockScheduler::dump(ceph::Formatter &f) const
@@ -433,18 +381,13 @@ void mClockScheduler::dump(ceph::Formatter &f) const
 void mClockScheduler::enqueue(OpSchedulerItem&& item)
 {
   auto id = get_scheduler_id(item);
-  auto op_type = item.get_op_type();
-  int cost = client_scaled_cost_infos[op_type];
-
-  // Re-calculate the scaled cost for the client if the item cost changed
-  if (maybe_update_client_cost_info(op_type, item.get_cost())) {
-    cost = client_scaled_cost_infos[op_type];
-  }
 
   // TODO: move this check into OpSchedulerItem, handle backwards compat
-  if (op_scheduler_class::immediate == item.get_scheduler_class()) {
+  if (op_scheduler_class::immediate == id.class_id) {
     immediate.push_front(std::move(item));
   } else {
+    int cost = calc_scaled_cost(item.get_cost());
+    // Add item to scheduler queue
     scheduler.add_request(
       std::move(item),
       id,
diff --git a/src/osd/scheduler/mClockScheduler.h b/src/osd/scheduler/mClockScheduler.h
index aa228339845..8727af61798 100644
--- a/src/osd/scheduler/mClockScheduler.h
+++ b/src/osd/scheduler/mClockScheduler.h
@@ -38,7 +38,6 @@ constexpr uint64_t default_max = 999999;
 
 using client_id_t = uint64_t;
 using profile_id_t = uint64_t;
-using op_type_t = OpSchedulerItem::OpQueueable::op_type_t;
 
 struct client_profile_id_t {
   client_id_t client_id;
@@ -68,6 +67,7 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
   const uint32_t num_shards;
   bool is_rotational;
   double max_osd_capacity;
+  double max_osd_bandwidth;
   uint64_t osd_mclock_cost_per_io_msec;
   std::string mclock_profile = "high_client_ops";
   struct ClientAllocs {
@@ -95,8 +95,6 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
     ClientAllocs(1, 1, 1), // immediate (not used)
     ClientAllocs(1, 1, 1)  // client
   };
-  std::map<op_type_t, int> client_cost_infos;
-  std::map<op_type_t, int> client_scaled_cost_infos;
   class ClientRegistry {
     std::array<
       crimson::dmclock::ClientInfo,
@@ -172,10 +170,7 @@ public:
   void set_global_recovery_options();
 
   // Calculate scale cost per item
-  int calc_scaled_cost(op_type_t op_type, int cost);
-
-  // Update mclock client cost info
-  bool maybe_update_client_cost_info(op_type_t op_type, int new_cost);
+  int calc_scaled_cost(int cost);
 
   // Enqueue op in the back of the regular queue
   void enqueue(OpSchedulerItem &&item) final;
diff --git a/src/test/osd/TestMClockScheduler.cc b/src/test/osd/TestMClockScheduler.cc
index a2e2f2a4edf..0feb427ec10 100644
--- a/src/test/osd/TestMClockScheduler.cc
+++ b/src/test/osd/TestMClockScheduler.cc
@@ -93,7 +93,7 @@ TEST_F(mClockSchedulerTest, TestEmpty) {
 
   for (unsigned i = 100; i < 105; i+=2) {
     q.enqueue(create_item(i, client1, op_scheduler_class::client));
-    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    std::this_thread::sleep_for(std::chrono::microseconds(1));
   }
 
   ASSERT_FALSE(q.empty());
@@ -126,7 +126,7 @@ TEST_F(mClockSchedulerTest, TestSingleClientOrderedEnqueueDequeue) {
 
   for (unsigned i = 100; i < 105; ++i) {
     q.enqueue(create_item(i, client1, op_scheduler_class::client));
-    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    std::this_thread::sleep_for(std::chrono::microseconds(1));
   }
 
   auto r = get_item(q.dequeue());
@@ -150,6 +150,7 @@ TEST_F(mClockSchedulerTest, TestMultiClientOrderedEnqueueDequeue) {
   for (unsigned i = 0; i < NUM; ++i) {
     for (auto &&c: {client1, client2, client3}) {
       q.enqueue(create_item(i, c));
+      std::this_thread::sleep_for(std::chrono::microseconds(1));
     }
   }
 
-- 
2.39.5