git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Update PGSnapTrim op queue item cost to reflect average object size 55040/head
author Sridhar Seshasayee <sseshasa@redhat.com>
Mon, 20 Nov 2023 13:17:14 +0000 (18:47 +0530)
committer Sridhar Seshasayee <sseshasa@redhat.com>
Tue, 2 Jan 2024 07:44:21 +0000 (13:14 +0530)
Previously, a static value of snap_trim_cost (1 MiB by default) for
PGSnapTrim item was used (see config option osd_snap_trim_cost). For pools
with significantly different sizes of objects, the static cost doesn't
accurately estimate the amount of IO each snap trim operation requires.
Instead, add a cost_per_object parameter to OSDService::queue_for_snap_trim
and set it to the average object size in the PG being queued by using
PG::get_average_object_size().

In addition, for the mClock scheduler, the cost_per_object is multiplied
by the actual number of objects trimmed per iteration. This multiplier is
represented by the osd_pg_max_concurrent_snap_trims config option, which is
used when the actual work starts (see DoSnapWork).

Note: The above cost calculation is valid for most snap trim
iterations except for:

1. The penultimate iteration which may return only 1 object to be trimmed,
in which case the cost will be off by a factor equivalent to the average
object size.

2. The final iteration (returns -ENOENT), involving clean-ups.

Fixes: https://tracker.ceph.com/issues/63604
Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
(cherry picked from commit fbd5c40edccccbb44b66e9d82fc71b14ac1d04ae)

src/osd/OSD.cc
src/osd/OSD.h
src/osd/PrimaryLogPG.cc

index c61e7d33218abaa74b5ea02d736d03ca0217c4b7..3d2e32ce0ec4ff9fa73b700c12ea0cf085d3ec6b 100644 (file)
@@ -1726,14 +1726,32 @@ void OSDService::queue_recovery_context(
       e));
 }
 
-void OSDService::queue_for_snap_trim(PG *pg)
+void OSDService::queue_for_snap_trim(PG *pg, uint64_t cost_per_object)
 {
   dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
+  uint64_t cost_for_queue = [this, cost_per_object] {
+    if (cct->_conf->osd_op_queue == "mclock_scheduler") {
+      /* The cost calculation is valid for most snap trim iterations except
+       * for the following cases:
+       * 1) The penultimate iteration which may return 1 object to trim, in
+       *    which case the cost will be off by a factor equivalent to the
+       *    average object size, and,
+       * 2) The final iteration which returns -ENOENT and performs clean-ups.
+       */
+      return cost_per_object * cct->_conf->osd_pg_max_concurrent_snap_trims;
+    } else {
+      /* We retain this legacy behavior for WeightedPriorityQueue.
+       * This branch should be removed after Squid.
+       */
+      return cct->_conf->osd_snap_trim_cost;
+    }
+  }();
+
   enqueue_back(
     OpSchedulerItem(
       unique_ptr<OpSchedulerItem::OpQueueable>(
        new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
-      cct->_conf->osd_snap_trim_cost,
+      cost_for_queue,
       cct->_conf->osd_snap_trim_priority,
       ceph_clock_now(),
       0,
index 00fab7ec83ed6947eb9a56ff54471fcfb3b6d368..2db3bc87f3deb22de08a13500b85db37a6d08601 100644 (file)
@@ -511,7 +511,7 @@ public:
                               GenContext<ThreadPool::TPHandle&> *c,
                               uint64_t cost,
                              int priority);
-  void queue_for_snap_trim(PG *pg);
+  void queue_for_snap_trim(PG *pg, uint64_t cost);
   void queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority);
 
   void queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority);
index 075b2f3ef18d88270e51b431427d463dbdc3e2ad..d37f07745ed90c7de80e24cf439ac578d41adf33 100644 (file)
@@ -15627,8 +15627,10 @@ PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
     NamedState(nullptr, "Trimming/AwaitAsyncWork")
 {
   auto *pg = context< SnapTrimmer >().pg;
+  // Determine cost in terms of the average object size
+  uint64_t cost_per_object = pg->get_average_object_size();
   context< SnapTrimmer >().log_enter(state_name);
-  context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
+  context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg, cost_per_object);
   pg->state_set(PG_STATE_SNAPTRIM);
   pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
   pg->publish_stats_to_osd();