From: Sridhar Seshasayee Date: Mon, 20 Nov 2023 13:17:14 +0000 (+0530) Subject: osd: Update PGSnapTrim op queue item cost to reflect average object size X-Git-Tag: testing/wip-pdonnell-testing-20240430.123648-reef-debug~300^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=890bb1613d37d3b2edea077e5be83f13de5fa3f9;p=ceph-ci.git osd: Update PGSnapTrim op queue item cost to reflect average object size Previously, a static value of snap_trim_cost (1 MiB by default) for the PGSnapTrim item was used (see config option osd_snap_trim_cost). For pools with significantly different sizes of objects, the static cost doesn't accurately estimate the amount of IO each snap trim operation requires. Instead, add a cost_per_object parameter to OSDService::queue_for_snap_trim and set it to the average object size in the PG being queued by using PG::get_average_object_size(). In addition, for the mClock scheduler, the cost_per_object is multiplied by the actual number of objects trimmed per iteration. This multiplier is represented by the osd_pg_max_concurrent_snap_trims config option, which is used when the actual work starts (see DoSnapWork). Note: The above cost calculation is valid for most snap trim iterations except for: 1. The penultimate iteration which may return only 1 object to be trimmed, in which case the cost will be off by a factor equivalent to the average object size. 2. The final iteration (returns -ENOENT), involving clean-ups. 
Fixes: https://tracker.ceph.com/issues/63604 Signed-off-by: Sridhar Seshasayee (cherry picked from commit fbd5c40edccccbb44b66e9d82fc71b14ac1d04ae) --- diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index c61e7d33218..3d2e32ce0ec 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1726,14 +1726,32 @@ void OSDService::queue_recovery_context( e)); } -void OSDService::queue_for_snap_trim(PG *pg) +void OSDService::queue_for_snap_trim(PG *pg, uint64_t cost_per_object) { dout(10) << "queueing " << *pg << " for snaptrim" << dendl; + uint64_t cost_for_queue = [this, cost_per_object] { + if (cct->_conf->osd_op_queue == "mclock_scheduler") { + /* The cost calculation is valid for most snap trim iterations except + * for the following cases: + * 1) The penultimate iteration which may return 1 object to trim, in + * which case the cost will be off by a factor equivalent to the + * average object size, and, + * 2) The final iteration which returns -ENOENT and performs clean-ups. + */ + return cost_per_object * cct->_conf->osd_pg_max_concurrent_snap_trims; + } else { + /* We retain this legacy behavior for WeightedPriorityQueue. + * This branch should be removed after Squid. 
+ */ + return cct->_conf->osd_snap_trim_cost; + } + }(); + enqueue_back( OpSchedulerItem( unique_ptr( new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())), - cct->_conf->osd_snap_trim_cost, + cost_for_queue, cct->_conf->osd_snap_trim_priority, ceph_clock_now(), 0, diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 00fab7ec83e..2db3bc87f3d 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -511,7 +511,7 @@ public: GenContext *c, uint64_t cost, int priority); - void queue_for_snap_trim(PG *pg); + void queue_for_snap_trim(PG *pg, uint64_t cost); void queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority); void queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority); diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 075b2f3ef18..d37f07745ed 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -15627,8 +15627,10 @@ PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx) NamedState(nullptr, "Trimming/AwaitAsyncWork") { auto *pg = context< SnapTrimmer >().pg; + // Determine cost in terms of the average object size + uint64_t cost_per_object = pg->get_average_object_size(); context< SnapTrimmer >().log_enter(state_name); - context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg); + context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg, cost_per_object); pg->state_set(PG_STATE_SNAPTRIM); pg->state_clear(PG_STATE_SNAPTRIM_ERROR); pg->publish_stats_to_osd();