From e3ccb80bbccd460e226a7d89162ec43a71217ca6 Mon Sep 17 00:00:00 2001
From: Sridhar Seshasayee <sseshasa@redhat.com>
Date: Thu, 2 Feb 2023 15:30:26 +0530
Subject: [PATCH] osd: update PGRecovery queue item cost to reflect object size

Previously, we used a static value of osd_recovery_cost (20M
by default) for PGRecovery. For pools with relatively small
objects, this causes mclock to backfill very very slowly as
20M massively overestimates the amount of IO each recovery
queue operation requires. Instead, add a cost_per_object
parameter to OSDService::awaiting_throttle and set it to the
average object size in the PG being queued.

Fixes: https://tracker.ceph.com/issues/58606
Signed-off-by: Samuel Just <sjust@redhat.com>
Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
---
 src/osd/OSD.cc | 24 ++++++++++++++++++++----
 src/osd/OSD.h  | 27 +++++++++++++++++++--------
 src/osd/PG.cc  | 12 +++++++++++-
 3 files changed, 50 insertions(+), 13 deletions(-)
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index e8314521d10f7..c4c951b355cd9 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -2039,20 +2039,36 @@ void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
 // ---
 
 void OSDService::_queue_for_recovery(
-  std::pair<epoch_t, PGRef> p,
+  pg_awaiting_throttle_t p,
   uint64_t reserved_pushes)
 {
   ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
+
+  uint64_t cost_for_queue = [this, &reserved_pushes, &p] {
+    if (cct->_conf->osd_op_queue == "mclock_scheduler") {
+      return p.cost_per_object * reserved_pushes;
+    } else {
+      /* We retain this legacy behavior for WeightedPriorityQueue. It seems to
+       * require very large costs for several messages in order to do any
+       * meaningful amount of throttling.  This branch should be removed after
+       * Reef.
+       */
+      return cct->_conf->osd_recovery_cost;
+    }
+  }();
+
   enqueue_back(
     OpSchedulerItem(
       unique_ptr<OpSchedulerItem::OpQueueable>(
 	new PGRecovery(
-	  p.second->get_pgid(), p.first, reserved_pushes)),
-      cct->_conf->osd_recovery_cost,
+	  p.pg->get_pgid(),
+	  p.epoch_queued,
+          reserved_pushes)),
+      cost_for_queue,
       cct->_conf->osd_recovery_priority,
       ceph_clock_now(),
       0,
-      p.first));
+      p.epoch_queued));
 }
 
 // ====================================================================
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 88f75cc93eb98..ad000a0d1d8e2 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -584,7 +584,13 @@ public:
 private:
   // -- pg recovery and associated throttling --
   ceph::mutex recovery_lock = ceph::make_mutex("OSDService::recovery_lock");
-  std::list<std::pair<epoch_t, PGRef> > awaiting_throttle;
+
+  struct pg_awaiting_throttle_t {
+    const epoch_t epoch_queued;
+    PGRef pg;
+    const uint64_t cost_per_object;
+  };
+  std::list<pg_awaiting_throttle_t> awaiting_throttle;
 
   /// queue a scrub-related message for a PG
   template <class MSG_TYPE>
@@ -607,8 +613,7 @@ private:
 #endif
   bool _recover_now(uint64_t *available_pushes);
   void _maybe_queue_recovery();
-  void _queue_for_recovery(
-    std::pair<epoch_t, PGRef> p, uint64_t reserved_pushes);
+  void _queue_for_recovery(pg_awaiting_throttle_t p, uint64_t reserved_pushes);
 public:
   void start_recovery_op(PG *pg, const hobject_t& soid);
   void finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue);
@@ -639,26 +644,32 @@ public:
     std::lock_guard l(recovery_lock);
     awaiting_throttle.remove_if(
       [pg](decltype(awaiting_throttle)::const_reference awaiting ) {
-	return awaiting.second.get() == pg;
+        return awaiting.pg.get() == pg;
       });
   }
 
   unsigned get_target_pg_log_entries() const;
 
   // delayed pg activation
-  void queue_for_recovery(PG *pg) {
+  void queue_for_recovery(PG *pg, uint64_t cost_per_object) {
     std::lock_guard l(recovery_lock);
 
     if (pg->is_forced_recovery_or_backfill()) {
-      awaiting_throttle.push_front(std::make_pair(pg->get_osdmap()->get_epoch(), pg));
+      awaiting_throttle.emplace_front(
+        pg_awaiting_throttle_t{
+          pg->get_osdmap()->get_epoch(), pg, cost_per_object});
     } else {
-      awaiting_throttle.push_back(std::make_pair(pg->get_osdmap()->get_epoch(), pg));
+      awaiting_throttle.emplace_back(
+        pg_awaiting_throttle_t{
+          pg->get_osdmap()->get_epoch(), pg, cost_per_object});
     }
     _maybe_queue_recovery();
   }
   void queue_recovery_after_sleep(PG *pg, epoch_t queued, uint64_t reserved_pushes) {
     std::lock_guard l(recovery_lock);
-    _queue_for_recovery(std::make_pair(queued, pg), reserved_pushes);
+    // Send cost as 1 in pg_awaiting_throttle_t below. The cost is ignored
+    // as this path is only applicable for WeightedPriorityQueue scheduler.
+    _queue_for_recovery(pg_awaiting_throttle_t{queued, pg, 1}, reserved_pushes);
   }
 
   void queue_check_readable(spg_t spgid,
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index b57cafd4475f7..8a307462cf621 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -419,7 +419,17 @@ void PG::queue_recovery()
   } else {
     dout(10) << "queue_recovery -- queuing" << dendl;
     recovery_queued = true;
-    osd->queue_for_recovery(this);
+    // Let cost per object be the average object size
+    auto num_bytes = static_cast<uint64_t>(
+      std::max<int64_t>(
+	0, // ensure bytes is non-negative
+	info.stats.stats.sum.num_bytes));
+    auto num_objects = static_cast<uint64_t>(
+      std::max<int64_t>(
+	1, // ensure objects is non-negative and non-zero
+	info.stats.stats.sum.num_objects));
+    uint64_t cost_per_object = std::max<uint64_t>(num_bytes / num_objects, 1);
+    osd->queue_for_recovery(this, cost_per_object);
   }
 }
 
-- 
2.39.5