osd/: differentiate scheduler class for undersized/degraded vs data movement
author    Samuel Just <sjust@redhat.com>
          Thu, 6 Apr 2023 05:57:42 +0000 (22:57 -0700)
committer Sridhar Seshasayee <sseshasa@redhat.com>
          Mon, 8 May 2023 10:52:00 +0000 (16:22 +0530)
Recovery operations on pgs/objects that have fewer than the configured
number of copies should be treated more urgently than operations on
pgs/objects that simply need to be moved to a new location.

Signed-off-by: Samuel Just <sjust@redhat.com>
src/osd/PeeringState.h
src/osd/scheduler/OpSchedulerItem.h

diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h
index f4c9cd9badb7bb9d71bc0ed663eb4845bd908ae4..583499af899de8a13f51ea87adea6ee71f18a065 100644
--- a/src/osd/PeeringState.h
+++ b/src/osd/PeeringState.h
@@ -1569,11 +1569,43 @@ public:
   unsigned get_delete_priority();
 
 public:
+  /**
+   * recovery_msg_priority_t
+   *
+   * Defines priority values for use with recovery messages.  The values are
+   * chosen to be reasonable for wpq during upgrade scenarios, but are
+   * actually translated into a class in PGRecoveryMsg::get_scheduler_class()
+   */
+  enum recovery_msg_priority_t : int {
+    FORCED = 20,
+    UNDERSIZED = 15,
+    DEGRADED = 10,
+    BEST_EFFORT = 5
+  };
+
   /// get message priority for recovery messages
   int get_recovery_op_priority() const {
-    int64_t pri = 0;
-    pool.info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri);
-    return  pri > 0 ? pri : cct->_conf->osd_recovery_op_priority;
+    if (cct->_conf->osd_op_queue == "mclock_scheduler") {
+      /* For mclock, we use special priority values which will be
+       * translated into op classes within PGRecoveryMsg::get_scheduler_class
+       */
+      if (is_forced_recovery_or_backfill()) {
+       return recovery_msg_priority_t::FORCED;
+      } else if (is_undersized()) {
+       return recovery_msg_priority_t::UNDERSIZED;
+      } else if (is_degraded()) {
+       return recovery_msg_priority_t::DEGRADED;
+      } else {
+       return recovery_msg_priority_t::BEST_EFFORT;
+      }
+    } else {
+      /* For WeightedPriorityQueue, we use pool or osd config settings to
+       * statically set the priority for recovery messages.  This special
+       * handling should probably be removed after Reef */
+      int64_t pri = 0;
+      pool.info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri);
+      return pri > 0 ? pri : cct->_conf->osd_recovery_op_priority;
+    }
   }
 
 private:
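
To make the new ordering concrete, the following is a minimal, self-contained sketch of the mclock branch above. The pg_state_sketch struct and recovery_priority() are hypothetical stand-ins for the PG state predicates the real method calls (is_forced_recovery_or_backfill(), is_undersized(), is_degraded()); nothing below is code from the tree.

#include <cassert>

// Hypothetical stand-ins for the PG state predicates the real method calls.
struct pg_state_sketch {
  bool forced = false;      // is_forced_recovery_or_backfill()
  bool undersized = false;  // is_undersized()
  bool degraded = false;    // is_degraded()
};

// Values copied from the recovery_msg_priority_t enum added above.
enum recovery_msg_priority_t : int {
  FORCED = 20,
  UNDERSIZED = 15,
  DEGRADED = 10,
  BEST_EFFORT = 5
};

// Same cascade as the mclock branch of get_recovery_op_priority(): urgency
// falls from forced recovery down to plain data movement.
int recovery_priority(const pg_state_sketch &s) {
  if (s.forced)     return FORCED;
  if (s.undersized) return UNDERSIZED;
  if (s.degraded)   return DEGRADED;
  return BEST_EFFORT;
}

int main() {
  pg_state_sketch moving;   // PG only needs to be relocated
  assert(recovery_priority(moving) == BEST_EFFORT);

  pg_state_sketch missing_copies;   // undersized PGs are typically also degraded
  missing_copies.undersized = true;
  missing_copies.degraded = true;
  assert(recovery_priority(missing_copies) == UNDERSIZED);
  return 0;
}

The order of the checks is the point: an undersized PG (missing replicas outright) outranks one that is merely degraded, and both outrank plain data movement.
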
diff --git a/src/osd/scheduler/OpSchedulerItem.h b/src/osd/scheduler/OpSchedulerItem.h
index f5e739a318722b038e53bef634949262741ec9ca..d8339353bfb906d69d3db04c1ea35b1ce74b6935 100644
--- a/src/osd/scheduler/OpSchedulerItem.h
+++ b/src/osd/scheduler/OpSchedulerItem.h
@@ -194,6 +194,17 @@ protected:
   const spg_t& get_pgid() const {
     return pgid;
   }
+
+  static op_scheduler_class priority_to_scheduler_class(int priority) {
+    if (priority >= CEPH_MSG_PRIO_HIGH) {
+      return op_scheduler_class::immediate;
+    } else if (priority >= PeeringState::recovery_msg_priority_t::DEGRADED) {
+      return op_scheduler_class::background_recovery;
+    } else {
+      return op_scheduler_class::background_best_effort;
+    }
+  }
+
 public:
   explicit PGOpQueueable(spg_t pg) : pgid(pg) {}
   uint32_t get_queue_token() const final {
@@ -581,11 +592,7 @@ public:
   }
 
   op_scheduler_class get_scheduler_class() const final {
-    auto priority = op->get_req()->get_priority();
-    if (priority >= CEPH_MSG_PRIO_HIGH) {
-      return op_scheduler_class::immediate;
-    }
-    return op_scheduler_class::background_recovery;
+    return priority_to_scheduler_class(op->get_req()->get_priority());
   }
 
   void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
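
Likewise, here is a sketch of the thresholding done by priority_to_scheduler_class() above. The op_scheduler_class enum is reduced for illustration (the real one carries additional members, e.g. a client class), and CEPH_MSG_PRIO_HIGH is assumed to be 196 as defined in src/include/msgr.h; this is illustrative, not tree code.

#include <cassert>

// Reduced stand-in for Ceph's op_scheduler_class.
enum class op_scheduler_class {
  immediate,
  background_recovery,
  background_best_effort,
};

constexpr int CEPH_MSG_PRIO_HIGH = 196;  // assumed value, per src/include/msgr.h
constexpr int DEGRADED = 10;             // recovery_msg_priority_t::DEGRADED

// Mirrors the helper added above: anything at or above DEGRADED (i.e.
// FORCED, UNDERSIZED, DEGRADED) stays in the recovery class, while
// BEST_EFFORT (5) drops to background_best_effort.
op_scheduler_class priority_to_scheduler_class(int priority) {
  if (priority >= CEPH_MSG_PRIO_HIGH) {
    return op_scheduler_class::immediate;
  } else if (priority >= DEGRADED) {
    return op_scheduler_class::background_recovery;
  } else {
    return op_scheduler_class::background_best_effort;
  }
}

int main() {
  assert(priority_to_scheduler_class(20)  == op_scheduler_class::background_recovery);    // FORCED
  assert(priority_to_scheduler_class(15)  == op_scheduler_class::background_recovery);    // UNDERSIZED
  assert(priority_to_scheduler_class(5)   == op_scheduler_class::background_best_effort); // BEST_EFFORT
  assert(priority_to_scheduler_class(255) == op_scheduler_class::immediate);              // high-prio msg
  return 0;
}

Only BEST_EFFORT falls below the DEGRADED cutoff, which is the mechanical core of the commit: under mclock, plain data movement lands in background_best_effort while recovery of under-replicated objects keeps the background_recovery class.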