osd: Apply randomly determined IO priority cutoff across all OSD shards
author    Sridhar Seshasayee <sseshasa@redhat.com>
Mon, 13 Nov 2023 12:13:40 +0000 (17:43 +0530)
committer Sridhar Seshasayee <sseshasa@redhat.com>
Tue, 14 Nov 2023 05:59:32 +0000 (11:29 +0530)
Determine the op priority cutoff for an OSD once and apply it across all of
its shards, which is the more realistic scenario. Previously, the cutoff was
randomized independently on each OSD shard, which led to issues in testing.
The IO priority cutoff is now determined before the OSD shards are
initialized, and the value is then passed to the OpScheduler implementations,
which are modified to apply it during initialization.

Fixes: https://tracker.ceph.com/issues/62171
Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
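
Below is a minimal, standalone C++ sketch of the approach the message
describes: compute the cutoff once, then hand the same value to every shard.
The PRIO_LOW/PRIO_HIGH constants, the Shard struct, and the
determine_cut_off() helper are illustrative stand-ins, not Ceph code; in the
actual change the value flows from the OSD constructor through OSDShard into
make_scheduler(), as shown in the diff that follows.

    // Standalone sketch (assumed names, not Ceph code). PRIO_LOW / PRIO_HIGH
    // stand in for CEPH_MSG_PRIO_LOW / CEPH_MSG_PRIO_HIGH; the numeric values
    // here are illustrative only.
    #include <iostream>
    #include <random>
    #include <string>
    #include <vector>

    constexpr unsigned PRIO_LOW  = 64;
    constexpr unsigned PRIO_HIGH = 196;

    // Decide the cutoff exactly once, before any shard is created,
    // mirroring the lambda added to the OSD constructor.
    unsigned determine_cut_off(const std::string& setting) {
      if (setting == "debug_random") {
        std::random_device rd;
        std::mt19937 gen(rd());
        return (gen() % 2 < 1) ? PRIO_HIGH : PRIO_LOW;
      } else if (setting == "high") {
        return PRIO_HIGH;
      }
      // default / catch-all is 'low'
      return PRIO_LOW;
    }

    // Minimal stand-in for an OSD shard: it only records the cutoff it is given.
    struct Shard {
      unsigned cutoff;
      explicit Shard(unsigned c) : cutoff(c) {}
    };

    int main() {
      const unsigned cut_off = determine_cut_off("debug_random");

      // Every shard receives the same, already-resolved value instead of
      // rolling its own random cutoff at construction time.
      std::vector<Shard> shards;
      for (int i = 0; i < 5; ++i) {
        shards.emplace_back(cut_off);
      }

      for (const auto& s : shards) {
        std::cout << "shard cutoff=" << s.cutoff << "\n";
      }
      return 0;
    }

With "debug_random" the roll now happens once per OSD rather than once per
shard, so all shards classify incoming ops against the same cutoff.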
src/osd/OSD.cc
src/osd/OSD.h
src/osd/scheduler/OpScheduler.cc
src/osd/scheduler/OpScheduler.h
src/osd/scheduler/mClockScheduler.cc
src/osd/scheduler/mClockScheduler.h
src/test/osd/TestMClockScheduler.cc

index 1bf3f7a5c906bc31221a23131740042ee01775a6..cc51305a17a31d4d8a8d3957d0cf5813572c221f 100644 (file)
@@ -2403,6 +2403,21 @@ OSD::OSD(CephContext *cct_,
   };
   op_queue_type_t op_queue = get_op_queue_type();
 
+  // Determine op queue cutoff
+  auto get_op_queue_cut_off = [&conf = cct->_conf]() {
+    if (conf.get_val<std::string>("osd_op_queue_cut_off") == "debug_random") {
+      std::random_device rd;
+      std::mt19937 random_gen(rd());
+      return (random_gen() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
+    } else if (conf.get_val<std::string>("osd_op_queue_cut_off") == "high") {
+      return CEPH_MSG_PRIO_HIGH;
+    } else {
+      // default / catch-all is 'low'
+      return CEPH_MSG_PRIO_LOW;
+    }
+  };
+  unsigned op_queue_cut_off = get_op_queue_cut_off();
+
   // initialize shards
   num_shards = get_num_op_shards();
   for (uint32_t i = 0; i < num_shards; i++) {
@@ -2410,7 +2425,8 @@ OSD::OSD(CephContext *cct_,
       i,
       cct,
       this,
-      op_queue);
+      op_queue,
+      op_queue_cut_off);
     shards.push_back(one_shard);
   }
 }
@@ -10706,7 +10722,8 @@ OSDShard::OSDShard(
   int id,
   CephContext *cct,
   OSD *osd,
-  op_queue_type_t osd_op_queue)
+  op_queue_type_t osd_op_queue,
+  unsigned osd_op_queue_cut_off)
   : shard_id(id),
     cct(cct),
     osd(osd),
@@ -10718,7 +10735,7 @@ OSDShard::OSDShard(
     shard_lock{make_mutex(shard_lock_name)},
     scheduler(ceph::osd::scheduler::make_scheduler(
       cct, osd->whoami, osd->num_shards, id, osd->store->is_rotational(),
-      osd->store->get_type(), osd_op_queue, osd->monc)),
+      osd->store->get_type(), osd_op_queue, osd_op_queue_cut_off, osd->monc)),
     context_queue(sdata_wait_lock, sdata_cond)
 {
   dout(0) << "using op scheduler " << *scheduler << dendl;
index 859fdbbbe234855299e03e199476c0ab3f6e853e..0d08466bf1f127dac835f755af97631791fafa94 100644 (file)
@@ -1038,7 +1038,8 @@ struct OSDShard {
     int id,
     CephContext *cct,
     OSD *osd,
-    op_queue_type_t osd_op_queue);
+    op_queue_type_t osd_op_queue,
+    unsigned osd_op_queue_cut_off);
 };
 
 class OSD : public Dispatcher,
index 7b89f4be0221d616e0c4a5f5ef3bb0ea4eab034c..12e5bdb6c45fbfb611d301c4449f4295d292931a 100644 (file)
@@ -24,7 +24,7 @@ namespace ceph::osd::scheduler {
 OpSchedulerRef make_scheduler(
   CephContext *cct, int whoami, uint32_t num_shards, int shard_id,
   bool is_rotational, std::string_view osd_objectstore,
-  op_queue_type_t osd_scheduler, MonClient *monc)
+  op_queue_type_t osd_scheduler, unsigned op_queue_cut_off, MonClient *monc)
 {
   // Force the use of 'wpq' scheduler for filestore OSDs.
   // The 'mclock_scheduler' is not supported for filestore OSDs.
@@ -33,13 +33,15 @@ OpSchedulerRef make_scheduler(
     return std::make_unique<
       ClassedOpQueueScheduler<WeightedPriorityQueue<OpSchedulerItem, client>>>(
        cct,
+        op_queue_cut_off,
        cct->_conf->osd_op_pq_max_tokens_per_priority,
        cct->_conf->osd_op_pq_min_cost
     );
   } else if (op_queue_type_t::mClockScheduler == osd_scheduler) {
     // default is 'mclock_scheduler'
     return std::make_unique<
-      mClockScheduler>(cct, whoami, num_shards, shard_id, is_rotational, monc);
+      mClockScheduler>(cct, whoami, num_shards, shard_id, is_rotational,
+        op_queue_cut_off, monc);
   } else {
     ceph_assert("Invalid choice of wq" == 0);
   }
index 382f48dd40c126b878a3e120c59a5d2bc2729605..570a2a162900a9ee85d0dbc8695801936ec190d2 100644 (file)
@@ -68,7 +68,7 @@ using OpSchedulerRef = std::unique_ptr<OpScheduler>;
 OpSchedulerRef make_scheduler(
   CephContext *cct, int whoami, uint32_t num_shards, int shard_id,
   bool is_rotational, std::string_view osd_objectstore,
-  op_queue_type_t osd_scheduler, MonClient *monc);
+  op_queue_type_t osd_scheduler, unsigned op_queue_cut_off, MonClient *monc);
 
 /**
  * Implements OpScheduler in terms of OpQueue
@@ -83,21 +83,10 @@ class ClassedOpQueueScheduler final : public OpScheduler {
   unsigned cutoff;
   T queue;
 
-  static unsigned int get_io_prio_cut(CephContext *cct) {
-    if (cct->_conf->osd_op_queue_cut_off == "debug_random") {
-      srand(time(NULL));
-      return (rand() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
-    } else if (cct->_conf->osd_op_queue_cut_off == "high") {
-      return CEPH_MSG_PRIO_HIGH;
-    } else {
-      // default / catch-all is 'low'
-      return CEPH_MSG_PRIO_LOW;
-    }
-  }
 public:
   template <typename... Args>
-  ClassedOpQueueScheduler(CephContext *cct, Args&&... args) :
-    cutoff(get_io_prio_cut(cct)),
+  ClassedOpQueueScheduler(CephContext *cct, unsigned prio_cut, Args&&... args) :
+    cutoff(prio_cut),
     queue(std::forward<Args>(args)...)
   {}
 
index 0ea519655d85b5a4e02fc3010658213336367e9a..f72683d527923b9e73c6d513606b121ad34476aa 100644 (file)
@@ -35,12 +35,14 @@ mClockScheduler::mClockScheduler(CephContext *cct,
   uint32_t num_shards,
   int shard_id,
   bool is_rotational,
+  unsigned cutoff_priority,
   MonClient *monc)
   : cct(cct),
     whoami(whoami),
     num_shards(num_shards),
     shard_id(shard_id),
     is_rotational(is_rotational),
+    cutoff_priority(cutoff_priority),
     monc(monc),
     scheduler(
       std::bind(&mClockScheduler::ClientRegistry::get_info,
index 9f32918827980be06e6651ab3d81080ec9c785c0..16e7f911ff95426cbba3503ec90e364157fba5d8 100644 (file)
@@ -96,6 +96,7 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
   const uint32_t num_shards;
   const int shard_id;
   const bool is_rotational;
+  const unsigned cutoff_priority;
   MonClient *monc;
 
   /**
@@ -198,21 +199,6 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
     };
   }
 
-  static unsigned int get_io_prio_cut(CephContext *cct) {
-    if (cct->_conf->osd_op_queue_cut_off == "debug_random") {
-      std::random_device rd;
-      std::mt19937 random_gen(rd());
-      return (random_gen() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
-    } else if (cct->_conf->osd_op_queue_cut_off == "high") {
-      return CEPH_MSG_PRIO_HIGH;
-    } else {
-      // default / catch-all is 'low'
-      return CEPH_MSG_PRIO_LOW;
-    }
-  }
-
-  unsigned cutoff_priority = get_io_prio_cut(cct);
-
   /**
    * set_osd_capacity_params_from_config
    *
@@ -232,7 +218,8 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
 
 public: 
   mClockScheduler(CephContext *cct, int whoami, uint32_t num_shards,
-    int shard_id, bool is_rotational, MonClient *monc);
+    int shard_id, bool is_rotational, unsigned cutoff_priority,
+    MonClient *monc);
   ~mClockScheduler() override;
 
   /// Calculate scaled cost per item
@@ -260,6 +247,7 @@ public:
 
   void print(std::ostream &ostream) const final {
     ostream << get_op_queue_type_name(get_type());
+    ostream << ", cutoff=" << cutoff_priority;
   }
 
   // Update data associated with the modified mclock config key(s)
index e7bac03d2abd56ad9839f151b7b0c71b335066c3..325ebe77e802f60e7bdbbb5ab04dbd1fe624953e 100644 (file)
@@ -31,6 +31,7 @@ public:
   uint32_t num_shards;
   int shard_id;
   bool is_rotational;
+  unsigned cutoff_priority;
   MonClient *monc;
   mClockScheduler q;
 
@@ -43,8 +44,10 @@ public:
     num_shards(1),
     shard_id(0),
     is_rotational(false),
+    cutoff_priority(12),
     monc(nullptr),
-    q(g_ceph_context, whoami, num_shards, shard_id, is_rotational, monc),
+    q(g_ceph_context, whoami, num_shards, shard_id, is_rotational,
+      cutoff_priority, monc),
     client1(1001),
     client2(9999),
     client3(100000001)