]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Update PGSnapTrim op queue item cost to reflect average object size 54597/head
authorSridhar Seshasayee <sseshasa@redhat.com>
Mon, 20 Nov 2023 13:17:14 +0000 (18:47 +0530)
committerSridhar Seshasayee <sseshasa@redhat.com>
Mon, 27 Nov 2023 16:00:33 +0000 (21:30 +0530)
Previously, a static value of snap_trim_cost (1 MiB by default) for
PGSnapTrim item was used (see config option osd_snap_trim_cost). For pools
with significantly different sizes of objects, the static cost doesn't
accurately estimate the amount of IO each snap trim operation requires.
Instead, add a cost_per_object parameter to OSDService::queue_for_snap_trim
and set it to the average object size in the PG being queued by using
PG::get_average_object_size().

In addition, for the mClock scheduler, the cost_per_object is multiplied
by the actual number of object trimmed per iteration. This multiplier is
represented by osd_pg_max_concurrent_snap_trims config option which is
used when the actual work starts (See DoSnapWork).

Note: The above cost calculation is valid for most snap trim
iterations except for:

1. The penultimate iteration which may return only 1 object to be trimmed,
in which case the cost will be off by a factor equivalent to the average
object size.

2. The final iteration (returns -ENOENT), involving clean-ups.

Fixes: https://tracker.ceph.com/issues/63604
Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
src/osd/OSD.cc
src/osd/OSD.h
src/osd/PrimaryLogPG.cc

index a79b388deca4ac8c2cc78b96587d82eb8ef3dfb3..ba1250d5ccc6707a7f8d92e373b5f1f0685ec1fa 100644 (file)
@@ -1719,14 +1719,32 @@ void OSDService::queue_recovery_context(
       e));
 }
 
-void OSDService::queue_for_snap_trim(PG *pg)
+void OSDService::queue_for_snap_trim(PG *pg, uint64_t cost_per_object)
 {
   dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
+  uint64_t cost_for_queue = [this, cost_per_object] {
+    if (cct->_conf->osd_op_queue == "mclock_scheduler") {
+      /* The cost calculation is valid for most snap trim iterations except
+       * for the following cases:
+       * 1) The penultimate iteration which may return 1 object to trim, in
+       *    which case the cost will be off by a factor equivalent to the
+       *    average object size, and,
+       * 2) The final iteration which returns -ENOENT and performs clean-ups.
+       */
+      return cost_per_object * cct->_conf->osd_pg_max_concurrent_snap_trims;
+    } else {
+      /* We retain this legacy behavior for WeightedPriorityQueue.
+       * This branch should be removed after Squid.
+       */
+      return cct->_conf->osd_snap_trim_cost;
+    }
+  }();
+
   enqueue_back(
     OpSchedulerItem(
       unique_ptr<OpSchedulerItem::OpQueueable>(
        new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
-      cct->_conf->osd_snap_trim_cost,
+      cost_for_queue,
       cct->_conf->osd_snap_trim_priority,
       ceph_clock_now(),
       0,
index a6b7a3c31ed3fad546605405f4fd8474afa7b6bd..bf611acba4ba1907a4ff7c05ed57a4607e4daf23 100644 (file)
@@ -499,7 +499,7 @@ public:
                               GenContext<ThreadPool::TPHandle&> *c,
                               uint64_t cost,
                              int priority);
-  void queue_for_snap_trim(PG *pg);
+  void queue_for_snap_trim(PG *pg, uint64_t cost);
   void queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority);
 
   void queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority);
index 5cf9bbba68d38adf59717f8c8b90a7405f0f5b18..7fd8dee072bc223a9a1a23829d97be5bd077a6f4 100644 (file)
@@ -15605,8 +15605,10 @@ PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
     NamedState(nullptr, "Trimming/AwaitAsyncWork")
 {
   auto *pg = context< SnapTrimmer >().pg;
+  // Determine cost in terms of the average object size
+  uint64_t cost_per_object = pg->get_average_object_size();
   context< SnapTrimmer >().log_enter(state_name);
-  context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
+  context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg, cost_per_object);
   pg->state_set(PG_STATE_SNAPTRIM);
   pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
   pg->publish_stats_to_osd();