From 953e854cd2cdea7bbfc9d82fe7c41143aa590ff6 Mon Sep 17 00:00:00 2001
From: Samuel Just <sjust@redhat.com>
Date: Wed, 5 Apr 2023 22:57:42 -0700
Subject: [PATCH] osd/: differentiate scheduler class for undersized/degraded
 vs data movement

Recovery operations on pgs/objects that have fewer than the configured
number of copies should be treated more urgently than operations on
pgs/objects that simply need to be moved to a new location.

Signed-off-by: Samuel Just <sjust@redhat.com>
---
 src/osd/PeeringState.h              | 38 ++++++++++++++++++++++++++---
 src/osd/scheduler/OpSchedulerItem.h | 17 +++++++++----
 2 files changed, 47 insertions(+), 8 deletions(-)

diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h
index e4149f9fa0506..6901ab5062b06 100644
--- a/src/osd/PeeringState.h
+++ b/src/osd/PeeringState.h
@@ -1578,11 +1578,43 @@ public:
   unsigned get_delete_priority();
 
 public:
+  /**
+   * recovery_msg_priority_t
+   *
+   * Defines priority values for use with recovery messages.  The values are
+   * chosen to be reasonable for wpq during upgrade scenarios, but are
+   * actually translated into a class in PGRecoveryMsg::get_scheduler_class()
+   */
+  enum recovery_msg_priority_t : int {
+    FORCED = 20,     ///< used when is_forced_recovery_or_backfill()
+    UNDERSIZED = 15, ///< used when is_undersized()
+    DEGRADED = 10,   ///< used when is_degraded()
+    BEST_EFFORT = 5  ///< used when none of the above holds
+  };
+
   /// get message priority for recovery messages
   int get_recovery_op_priority() const {
-    int64_t pri = 0;
-    pool.info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri);
-    return  pri > 0 ? pri : cct->_conf->osd_recovery_op_priority;
+    if (cct->_conf->osd_op_queue == "mclock_scheduler") {
+      /* For mclock, we use special priority values which will be
+       * translated into op classes within PGRecoveryMsg::get_scheduler_class
+       * Checked in descending priority order; the first match wins. */
+      if (is_forced_recovery_or_backfill()) {
+	return recovery_msg_priority_t::FORCED;
+      } else if (is_undersized()) {
+	return recovery_msg_priority_t::UNDERSIZED;
+      } else if (is_degraded()) {
+	return recovery_msg_priority_t::DEGRADED;
+      } else {
+	return recovery_msg_priority_t::BEST_EFFORT;
+      }
+    } else {
+      /* For WeightedPriorityQueue, the priority is set statically from config:
+       * a positive pool RECOVERY_OP_PRIORITY wins, otherwise we fall back to
+       * osd_recovery_op_priority.  This should probably go away after Reef */
+      int64_t pri = 0;
+      pool.info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri);
+      return  pri > 0 ? pri : cct->_conf->osd_recovery_op_priority;
+    }
   }
 
 private:
diff --git a/src/osd/scheduler/OpSchedulerItem.h b/src/osd/scheduler/OpSchedulerItem.h
index f5e739a318722..d8339353bfb90 100644
--- a/src/osd/scheduler/OpSchedulerItem.h
+++ b/src/osd/scheduler/OpSchedulerItem.h
@@ -194,6 +194,17 @@ protected:
   const spg_t& get_pgid() const {
     return pgid;
   }
+
+  static op_scheduler_class priority_to_scheduler_class(int priority) {
+    if (priority >= CEPH_MSG_PRIO_HIGH) {
+      return op_scheduler_class::immediate;  // tested first, so it wins
+    } else if (priority >= PeeringState::recovery_msg_priority_t::DEGRADED) {
+      return op_scheduler_class::background_recovery;  // DEGRADED or higher
+    } else {
+      return op_scheduler_class::background_best_effort;  // e.g. BEST_EFFORT
+    }
+  }
+
 public:
   explicit PGOpQueueable(spg_t pg) : pgid(pg) {}
   uint32_t get_queue_token() const final {
@@ -581,11 +592,7 @@ public:
   }
 
   op_scheduler_class get_scheduler_class() const final {
-    auto priority = op->get_req()->get_priority();
-    if (priority >= CEPH_MSG_PRIO_HIGH) {
-      return op_scheduler_class::immediate;
-    }
-    return op_scheduler_class::background_recovery;
+    return priority_to_scheduler_class(op->get_req()->get_priority()); // can now return background_best_effort
   }
 
   void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
-- 
2.39.5