From: Sridhar Seshasayee <sseshasa@redhat.com>
Date: Mon, 20 Nov 2023 13:17:14 +0000 (+0530)
Subject: osd: Update PGSnapTrim op queue item cost to reflect average object size
X-Git-Tag: testing/wip-pdonnell-testing-20240430.123648-reef-debug~300^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=890bb1613d37d3b2edea077e5be83f13de5fa3f9;p=ceph-ci.git

osd: Update PGSnapTrim op queue item cost to reflect average object size

Previously, a static snap_trim_cost value (1 MiB by default) was used for
each PGSnapTrim item (see config option osd_snap_trim_cost). For pools
with significantly different sizes of objects, the static cost doesn't
accurately estimate the amount of IO each snap trim operation requires.
Instead, add a cost_per_object parameter to OSDService::queue_for_snap_trim
and set it to the average object size in the PG being queued by using
PG::get_average_object_size().

In addition, for the mClock scheduler, the cost_per_object is multiplied
by the actual number of objects trimmed per iteration. This multiplier is
represented by the osd_pg_max_concurrent_snap_trims config option, which is
used when the actual work starts (see DoSnapWork).

Note: The above cost calculation is valid for most snap trim
iterations except for:

1. The penultimate iteration which may return only 1 object to be trimmed,
in which case the cost will be off by a factor equivalent to the average
object size.

2. The final iteration (returns -ENOENT), involving clean-ups.

Fixes: https://tracker.ceph.com/issues/63604
Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
(cherry picked from commit fbd5c40edccccbb44b66e9d82fc71b14ac1d04ae)
---

diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index c61e7d33218..3d2e32ce0ec 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1726,14 +1726,32 @@ void OSDService::queue_recovery_context(
       e));
 }
 
-void OSDService::queue_for_snap_trim(PG *pg)
+void OSDService::queue_for_snap_trim(PG *pg, uint64_t cost_per_object)
 {
   dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
+  uint64_t cost_for_queue = [this, cost_per_object] {
+    if (cct->_conf->osd_op_queue == "mclock_scheduler") {
+      /* The cost calculation is valid for most snap trim iterations except
+       * for the following cases:
+       * 1) The penultimate iteration which may return 1 object to trim, in
+       *    which case the cost will be off by a factor equivalent to the
+       *    average object size, and,
+       * 2) The final iteration which returns -ENOENT and performs clean-ups.
+       */
+      return cost_per_object * cct->_conf->osd_pg_max_concurrent_snap_trims;
+    } else {
+      /* We retain this legacy behavior for WeightedPriorityQueue.
+       * This branch should be removed after Squid.
+       */
+      return cct->_conf->osd_snap_trim_cost;
+    }
+  }();
+
   enqueue_back(
     OpSchedulerItem(
       unique_ptr<OpSchedulerItem::OpQueueable>(
 	new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
-      cct->_conf->osd_snap_trim_cost,
+      cost_for_queue,
       cct->_conf->osd_snap_trim_priority,
       ceph_clock_now(),
       0,
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 00fab7ec83e..2db3bc87f3d 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -511,7 +511,7 @@ public:
                               GenContext<ThreadPool::TPHandle&> *c,
                               uint64_t cost,
 			      int priority);
-  void queue_for_snap_trim(PG *pg);
+  void queue_for_snap_trim(PG *pg, uint64_t cost);
   void queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority);
 
   void queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority);
diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc
index 075b2f3ef18..d37f07745ed 100644
--- a/src/osd/PrimaryLogPG.cc
+++ b/src/osd/PrimaryLogPG.cc
@@ -15627,8 +15627,10 @@ PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
     NamedState(nullptr, "Trimming/AwaitAsyncWork")
 {
   auto *pg = context< SnapTrimmer >().pg;
+  // Determine cost in terms of the average object size
+  uint64_t cost_per_object = pg->get_average_object_size();
   context< SnapTrimmer >().log_enter(state_name);
-  context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
+  context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg, cost_per_object);
   pg->state_set(PG_STATE_SNAPTRIM);
   pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
   pg->publish_stats_to_osd();