From 953e854cd2cdea7bbfc9d82fe7c41143aa590ff6 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Wed, 5 Apr 2023 22:57:42 -0700 Subject: [PATCH] osd/: differentiate scheduler class for undersized/degraded vs data movement Recovery operations on pgs/objects that have fewer than the configured number of copies should be treated more urgently than operations on pgs/objects that simply need to be moved to a new location. Signed-off-by: Samuel Just --- src/osd/PeeringState.h | 38 ++++++++++++++++++++++++++--- src/osd/scheduler/OpSchedulerItem.h | 17 +++++++++---- 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h index e4149f9fa0506..6901ab5062b06 100644 --- a/src/osd/PeeringState.h +++ b/src/osd/PeeringState.h @@ -1578,11 +1578,43 @@ public: unsigned get_delete_priority(); public: + /** + * recovery_msg_priority_t + * + * Defines priority values for use with recovery messages. The values are + * chosen to be reasonable for wpq during upgrade scenarios, but are + * actually translated into a class in PGRecoveryMsg::get_scheduler_class() + */ + enum recovery_msg_priority_t : int { + FORCED = 20, + UNDERSIZED = 15, + DEGRADED = 10, + BEST_EFFORT = 5 + }; + /// get message priority for recovery messages int get_recovery_op_priority() const { - int64_t pri = 0; - pool.info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri); - return pri > 0 ? 
pri : cct->_conf->osd_recovery_op_priority; + if (cct->_conf->osd_op_queue == "mclock_scheduler") { + /* For mclock, we use special priority values which will be + * translated into op classes within PGRecoveryMsg::get_scheduler_class + */ + if (is_forced_recovery_or_backfill()) { + return recovery_msg_priority_t::FORCED; + } else if (is_undersized()) { + return recovery_msg_priority_t::UNDERSIZED; + } else if (is_degraded()) { + return recovery_msg_priority_t::DEGRADED; + } else { + return recovery_msg_priority_t::BEST_EFFORT; + } + } else { + /* For WeightedPriorityQueue, we use pool or osd config settings to + * statically set the priority for recovery messages. This special + * handling should probably be removed after Reef */ + int64_t pri = 0; + pool.info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri); + return pri > 0 ? pri : cct->_conf->osd_recovery_op_priority; + } } private: diff --git a/src/osd/scheduler/OpSchedulerItem.h b/src/osd/scheduler/OpSchedulerItem.h index f5e739a318722..d8339353bfb90 100644 --- a/src/osd/scheduler/OpSchedulerItem.h +++ b/src/osd/scheduler/OpSchedulerItem.h @@ -194,6 +194,17 @@ protected: const spg_t& get_pgid() const { return pgid; } + + static op_scheduler_class priority_to_scheduler_class(int priority) { + if (priority >= CEPH_MSG_PRIO_HIGH) { + return op_scheduler_class::immediate; + } else if (priority >= PeeringState::recovery_msg_priority_t::DEGRADED) { + return op_scheduler_class::background_recovery; + } else { + return op_scheduler_class::background_best_effort; + } + } + public: explicit PGOpQueueable(spg_t pg) : pgid(pg) {} uint32_t get_queue_token() const final { @@ -581,11 +592,7 @@ public: } op_scheduler_class get_scheduler_class() const final { - auto priority = op->get_req()->get_priority(); - if (priority >= CEPH_MSG_PRIO_HIGH) { - return op_scheduler_class::immediate; - } - return op_scheduler_class::background_recovery; + return priority_to_scheduler_class(op->get_req()->get_priority()); } void run(OSD 
*osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final; -- 2.39.5