git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Apply randomly determined IO priority cutoff across all OSD shards 54981/head
authorSridhar Seshasayee <sseshasa@redhat.com>
Mon, 13 Nov 2023 12:13:40 +0000 (17:43 +0530)
committerSridhar Seshasayee <sseshasa@redhat.com>
Thu, 21 Dec 2023 07:38:41 +0000 (13:08 +0530)
Determine the op priority cutoff for an OSD and apply it on all the OSD
shards, which is a more realistic scenario. Previously, the cutoff value
was randomized between OSD shards, leading to issues in testing. The IO
priority cutoff is now determined once, before initializing the OSD shards.
The cutoff value is then passed to the OpScheduler implementations, which
are modified accordingly to apply the value during initialization.

Fixes: https://tracker.ceph.com/issues/62171
Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
(cherry picked from commit bfbc6b65c672d6dc4326d4e29b4a1ee106c9c091)

src/osd/OSD.cc
src/osd/OSD.h
src/osd/scheduler/OpScheduler.cc
src/osd/scheduler/OpScheduler.h
src/osd/scheduler/mClockScheduler.cc
src/osd/scheduler/mClockScheduler.h
src/test/osd/TestMClockScheduler.cc

index 3875c209da23743285a149da91c5b2e6a5fbfd10..8334b74915b2245af645b362aa55b6fffc79935a 100644 (file)
@@ -2410,6 +2410,21 @@ OSD::OSD(CephContext *cct_,
   };
   op_queue_type_t op_queue = get_op_queue_type();
 
+  // Determine op queue cutoff
+  auto get_op_queue_cut_off = [&conf = cct->_conf]() {
+    if (conf.get_val<std::string>("osd_op_queue_cut_off") == "debug_random") {
+      std::random_device rd;
+      std::mt19937 random_gen(rd());
+      return (random_gen() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
+    } else if (conf.get_val<std::string>("osd_op_queue_cut_off") == "high") {
+      return CEPH_MSG_PRIO_HIGH;
+    } else {
+      // default / catch-all is 'low'
+      return CEPH_MSG_PRIO_LOW;
+    }
+  };
+  unsigned op_queue_cut_off = get_op_queue_cut_off();
+
   // initialize shards
   num_shards = get_num_op_shards();
   for (uint32_t i = 0; i < num_shards; i++) {
@@ -2417,7 +2432,8 @@ OSD::OSD(CephContext *cct_,
       i,
       cct,
       this,
-      op_queue);
+      op_queue,
+      op_queue_cut_off);
     shards.push_back(one_shard);
   }
 }
@@ -10739,7 +10755,8 @@ OSDShard::OSDShard(
   int id,
   CephContext *cct,
   OSD *osd,
-  op_queue_type_t osd_op_queue)
+  op_queue_type_t osd_op_queue,
+  unsigned osd_op_queue_cut_off)
   : shard_id(id),
     cct(cct),
     osd(osd),
@@ -10751,7 +10768,7 @@ OSDShard::OSDShard(
     shard_lock{make_mutex(shard_lock_name)},
     scheduler(ceph::osd::scheduler::make_scheduler(
       cct, osd->whoami, osd->num_shards, id, osd->store->is_rotational(),
-      osd->store->get_type(), osd_op_queue, osd->monc)),
+      osd->store->get_type(), osd_op_queue, osd_op_queue_cut_off, osd->monc)),
     context_queue(sdata_wait_lock, sdata_cond)
 {
   dout(0) << "using op scheduler " << *scheduler << dendl;
index 19c5472e6540210417a6ffcd7af7f13c7f8f3b9e..fdaa2ab137d767f501549843f78c496bb435fdd5 100644 (file)
@@ -1056,7 +1056,8 @@ struct OSDShard {
     int id,
     CephContext *cct,
     OSD *osd,
-    op_queue_type_t osd_op_queue);
+    op_queue_type_t osd_op_queue,
+    unsigned osd_op_queue_cut_off);
 };
 
 class OSD : public Dispatcher,
index 7b89f4be0221d616e0c4a5f5ef3bb0ea4eab034c..12e5bdb6c45fbfb611d301c4449f4295d292931a 100644 (file)
@@ -24,7 +24,7 @@ namespace ceph::osd::scheduler {
 OpSchedulerRef make_scheduler(
   CephContext *cct, int whoami, uint32_t num_shards, int shard_id,
   bool is_rotational, std::string_view osd_objectstore,
-  op_queue_type_t osd_scheduler, MonClient *monc)
+  op_queue_type_t osd_scheduler, unsigned op_queue_cut_off, MonClient *monc)
 {
   // Force the use of 'wpq' scheduler for filestore OSDs.
   // The 'mclock_scheduler' is not supported for filestore OSDs.
@@ -33,13 +33,15 @@ OpSchedulerRef make_scheduler(
     return std::make_unique<
       ClassedOpQueueScheduler<WeightedPriorityQueue<OpSchedulerItem, client>>>(
        cct,
+        op_queue_cut_off,
        cct->_conf->osd_op_pq_max_tokens_per_priority,
        cct->_conf->osd_op_pq_min_cost
     );
   } else if (op_queue_type_t::mClockScheduler == osd_scheduler) {
     // default is 'mclock_scheduler'
     return std::make_unique<
-      mClockScheduler>(cct, whoami, num_shards, shard_id, is_rotational, monc);
+      mClockScheduler>(cct, whoami, num_shards, shard_id, is_rotational,
+        op_queue_cut_off, monc);
   } else {
     ceph_assert("Invalid choice of wq" == 0);
   }
index 382f48dd40c126b878a3e120c59a5d2bc2729605..570a2a162900a9ee85d0dbc8695801936ec190d2 100644 (file)
@@ -68,7 +68,7 @@ using OpSchedulerRef = std::unique_ptr<OpScheduler>;
 OpSchedulerRef make_scheduler(
   CephContext *cct, int whoami, uint32_t num_shards, int shard_id,
   bool is_rotational, std::string_view osd_objectstore,
-  op_queue_type_t osd_scheduler, MonClient *monc);
+  op_queue_type_t osd_scheduler, unsigned op_queue_cut_off, MonClient *monc);
 
 /**
  * Implements OpScheduler in terms of OpQueue
@@ -83,21 +83,10 @@ class ClassedOpQueueScheduler final : public OpScheduler {
   unsigned cutoff;
   T queue;
 
-  static unsigned int get_io_prio_cut(CephContext *cct) {
-    if (cct->_conf->osd_op_queue_cut_off == "debug_random") {
-      srand(time(NULL));
-      return (rand() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
-    } else if (cct->_conf->osd_op_queue_cut_off == "high") {
-      return CEPH_MSG_PRIO_HIGH;
-    } else {
-      // default / catch-all is 'low'
-      return CEPH_MSG_PRIO_LOW;
-    }
-  }
 public:
   template <typename... Args>
-  ClassedOpQueueScheduler(CephContext *cct, Args&&... args) :
-    cutoff(get_io_prio_cut(cct)),
+  ClassedOpQueueScheduler(CephContext *cct, unsigned prio_cut, Args&&... args) :
+    cutoff(prio_cut),
     queue(std::forward<Args>(args)...)
   {}
 
index 0ea519655d85b5a4e02fc3010658213336367e9a..f72683d527923b9e73c6d513606b121ad34476aa 100644 (file)
@@ -35,12 +35,14 @@ mClockScheduler::mClockScheduler(CephContext *cct,
   uint32_t num_shards,
   int shard_id,
   bool is_rotational,
+  unsigned cutoff_priority,
   MonClient *monc)
   : cct(cct),
     whoami(whoami),
     num_shards(num_shards),
     shard_id(shard_id),
     is_rotational(is_rotational),
+    cutoff_priority(cutoff_priority),
     monc(monc),
     scheduler(
       std::bind(&mClockScheduler::ClientRegistry::get_info,
index 9f32918827980be06e6651ab3d81080ec9c785c0..16e7f911ff95426cbba3503ec90e364157fba5d8 100644 (file)
@@ -96,6 +96,7 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
   const uint32_t num_shards;
   const int shard_id;
   const bool is_rotational;
+  const unsigned cutoff_priority;
   MonClient *monc;
 
   /**
@@ -198,21 +199,6 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
     };
   }
 
-  static unsigned int get_io_prio_cut(CephContext *cct) {
-    if (cct->_conf->osd_op_queue_cut_off == "debug_random") {
-      std::random_device rd;
-      std::mt19937 random_gen(rd());
-      return (random_gen() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
-    } else if (cct->_conf->osd_op_queue_cut_off == "high") {
-      return CEPH_MSG_PRIO_HIGH;
-    } else {
-      // default / catch-all is 'low'
-      return CEPH_MSG_PRIO_LOW;
-    }
-  }
-
-  unsigned cutoff_priority = get_io_prio_cut(cct);
-
   /**
    * set_osd_capacity_params_from_config
    *
@@ -232,7 +218,8 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
 
 public: 
   mClockScheduler(CephContext *cct, int whoami, uint32_t num_shards,
-    int shard_id, bool is_rotational, MonClient *monc);
+    int shard_id, bool is_rotational, unsigned cutoff_priority,
+    MonClient *monc);
   ~mClockScheduler() override;
 
   /// Calculate scaled cost per item
@@ -260,6 +247,7 @@ public:
 
   void print(std::ostream &ostream) const final {
     ostream << get_op_queue_type_name(get_type());
+    ostream << ", cutoff=" << cutoff_priority;
   }
 
   // Update data associated with the modified mclock config key(s)
index 8291da2684a9682ab239bbb67188a0a1d963b3d1..205ef2f98f6bce33978e61a6dc14576b6e07cc83 100644 (file)
@@ -31,6 +31,7 @@ public:
   uint32_t num_shards;
   int shard_id;
   bool is_rotational;
+  unsigned cutoff_priority;
   MonClient *monc;
   mClockScheduler q;
 
@@ -43,8 +44,10 @@ public:
     num_shards(1),
     shard_id(0),
     is_rotational(false),
+    cutoff_priority(12),
     monc(nullptr),
-    q(g_ceph_context, whoami, num_shards, shard_id, is_rotational, monc),
+    q(g_ceph_context, whoami, num_shards, shard_id, is_rotational,
+      cutoff_priority, monc),
     client1(1001),
     client2(9999),
     client3(100000001)