git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Apply randomly determined IO priority cutoff across all OSD shards 54981/head
authorSridhar Seshasayee <sseshasa@redhat.com>
Mon, 13 Nov 2023 12:13:40 +0000 (17:43 +0530)
committerSridhar Seshasayee <sseshasa@redhat.com>
Thu, 21 Dec 2023 07:38:41 +0000 (13:08 +0530)
Determine the op priority cutoff for an OSD and apply it on all the OSD
shards, which is a more realistic scenario. Previously, the cutoff value
was randomized between OSD shards, leading to issues in testing. The IO
priority cutoff is now determined once, before initializing the OSD shards.
The cutoff value is then passed to the OpScheduler implementations, which
are modified accordingly to apply the value during initialization.

Fixes: https://tracker.ceph.com/issues/62171
Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
(cherry picked from commit bfbc6b65c672d6dc4326d4e29b4a1ee106c9c091)

src/osd/OSD.cc
src/osd/OSD.h
src/osd/scheduler/OpScheduler.cc
src/osd/scheduler/OpScheduler.h
src/osd/scheduler/mClockScheduler.cc
src/osd/scheduler/mClockScheduler.h
src/test/osd/TestMClockScheduler.cc

index 3875c209da23743285a149da91c5b2e6a5fbfd10..8334b74915b2245af645b362aa55b6fffc79935a 100644 (file)
@@ -2410,6 +2410,21 @@ OSD::OSD(CephContext *cct_,
   };
   op_queue_type_t op_queue = get_op_queue_type();
 
+  // Determine op queue cutoff
+  auto get_op_queue_cut_off = [&conf = cct->_conf]() {
+    if (conf.get_val<std::string>("osd_op_queue_cut_off") == "debug_random") {
+      std::random_device rd;
+      std::mt19937 random_gen(rd());
+      return (random_gen() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
+    } else if (conf.get_val<std::string>("osd_op_queue_cut_off") == "high") {
+      return CEPH_MSG_PRIO_HIGH;
+    } else {
+      // default / catch-all is 'low'
+      return CEPH_MSG_PRIO_LOW;
+    }
+  };
+  unsigned op_queue_cut_off = get_op_queue_cut_off();
+
   // initialize shards
   num_shards = get_num_op_shards();
   for (uint32_t i = 0; i < num_shards; i++) {
@@ -2417,7 +2432,8 @@ OSD::OSD(CephContext *cct_,
       i,
       cct,
       this,
-      op_queue);
+      op_queue,
+      op_queue_cut_off);
     shards.push_back(one_shard);
   }
 }
@@ -10739,7 +10755,8 @@ OSDShard::OSDShard(
   int id,
   CephContext *cct,
   OSD *osd,
-  op_queue_type_t osd_op_queue)
+  op_queue_type_t osd_op_queue,
+  unsigned osd_op_queue_cut_off)
   : shard_id(id),
     cct(cct),
     osd(osd),
@@ -10751,7 +10768,7 @@ OSDShard::OSDShard(
     shard_lock{make_mutex(shard_lock_name)},
     scheduler(ceph::osd::scheduler::make_scheduler(
       cct, osd->whoami, osd->num_shards, id, osd->store->is_rotational(),
-      osd->store->get_type(), osd_op_queue, osd->monc)),
+      osd->store->get_type(), osd_op_queue, osd_op_queue_cut_off, osd->monc)),
     context_queue(sdata_wait_lock, sdata_cond)
 {
   dout(0) << "using op scheduler " << *scheduler << dendl;
index 19c5472e6540210417a6ffcd7af7f13c7f8f3b9e..fdaa2ab137d767f501549843f78c496bb435fdd5 100644 (file)
@@ -1056,7 +1056,8 @@ struct OSDShard {
     int id,
     CephContext *cct,
     OSD *osd,
-    op_queue_type_t osd_op_queue);
+    op_queue_type_t osd_op_queue,
+    unsigned osd_op_queue_cut_off);
 };
 
 class OSD : public Dispatcher,
index 7b89f4be0221d616e0c4a5f5ef3bb0ea4eab034c..12e5bdb6c45fbfb611d301c4449f4295d292931a 100644 (file)
@@ -24,7 +24,7 @@ namespace ceph::osd::scheduler {
 OpSchedulerRef make_scheduler(
   CephContext *cct, int whoami, uint32_t num_shards, int shard_id,
   bool is_rotational, std::string_view osd_objectstore,
-  op_queue_type_t osd_scheduler, MonClient *monc)
+  op_queue_type_t osd_scheduler, unsigned op_queue_cut_off, MonClient *monc)
 {
   // Force the use of 'wpq' scheduler for filestore OSDs.
   // The 'mclock_scheduler' is not supported for filestore OSDs.
@@ -33,13 +33,15 @@ OpSchedulerRef make_scheduler(
     return std::make_unique<
       ClassedOpQueueScheduler<WeightedPriorityQueue<OpSchedulerItem, client>>>(
        cct,
+        op_queue_cut_off,
        cct->_conf->osd_op_pq_max_tokens_per_priority,
        cct->_conf->osd_op_pq_min_cost
     );
   } else if (op_queue_type_t::mClockScheduler == osd_scheduler) {
     // default is 'mclock_scheduler'
     return std::make_unique<
-      mClockScheduler>(cct, whoami, num_shards, shard_id, is_rotational, monc);
+      mClockScheduler>(cct, whoami, num_shards, shard_id, is_rotational,
+        op_queue_cut_off, monc);
   } else {
     ceph_assert("Invalid choice of wq" == 0);
   }
index 382f48dd40c126b878a3e120c59a5d2bc2729605..570a2a162900a9ee85d0dbc8695801936ec190d2 100644 (file)
@@ -68,7 +68,7 @@ using OpSchedulerRef = std::unique_ptr<OpScheduler>;
 OpSchedulerRef make_scheduler(
   CephContext *cct, int whoami, uint32_t num_shards, int shard_id,
   bool is_rotational, std::string_view osd_objectstore,
-  op_queue_type_t osd_scheduler, MonClient *monc);
+  op_queue_type_t osd_scheduler, unsigned op_queue_cut_off, MonClient *monc);
 
 /**
  * Implements OpScheduler in terms of OpQueue
@@ -83,21 +83,10 @@ class ClassedOpQueueScheduler final : public OpScheduler {
   unsigned cutoff;
   T queue;
 
-  static unsigned int get_io_prio_cut(CephContext *cct) {
-    if (cct->_conf->osd_op_queue_cut_off == "debug_random") {
-      srand(time(NULL));
-      return (rand() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
-    } else if (cct->_conf->osd_op_queue_cut_off == "high") {
-      return CEPH_MSG_PRIO_HIGH;
-    } else {
-      // default / catch-all is 'low'
-      return CEPH_MSG_PRIO_LOW;
-    }
-  }
 public:
   template <typename... Args>
-  ClassedOpQueueScheduler(CephContext *cct, Args&&... args) :
-    cutoff(get_io_prio_cut(cct)),
+  ClassedOpQueueScheduler(CephContext *cct, unsigned prio_cut, Args&&... args) :
+    cutoff(prio_cut),
     queue(std::forward<Args>(args)...)
   {}
 
index 0ea519655d85b5a4e02fc3010658213336367e9a..f72683d527923b9e73c6d513606b121ad34476aa 100644 (file)
@@ -35,12 +35,14 @@ mClockScheduler::mClockScheduler(CephContext *cct,
   uint32_t num_shards,
   int shard_id,
   bool is_rotational,
+  unsigned cutoff_priority,
   MonClient *monc)
   : cct(cct),
     whoami(whoami),
     num_shards(num_shards),
     shard_id(shard_id),
     is_rotational(is_rotational),
+    cutoff_priority(cutoff_priority),
     monc(monc),
     scheduler(
       std::bind(&mClockScheduler::ClientRegistry::get_info,
index 9f32918827980be06e6651ab3d81080ec9c785c0..16e7f911ff95426cbba3503ec90e364157fba5d8 100644 (file)
@@ -96,6 +96,7 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
   const uint32_t num_shards;
   const int shard_id;
   const bool is_rotational;
+  const unsigned cutoff_priority;
   MonClient *monc;
 
   /**
@@ -198,21 +199,6 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
     };
   }
 
-  static unsigned int get_io_prio_cut(CephContext *cct) {
-    if (cct->_conf->osd_op_queue_cut_off == "debug_random") {
-      std::random_device rd;
-      std::mt19937 random_gen(rd());
-      return (random_gen() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
-    } else if (cct->_conf->osd_op_queue_cut_off == "high") {
-      return CEPH_MSG_PRIO_HIGH;
-    } else {
-      // default / catch-all is 'low'
-      return CEPH_MSG_PRIO_LOW;
-    }
-  }
-
-  unsigned cutoff_priority = get_io_prio_cut(cct);
-
   /**
    * set_osd_capacity_params_from_config
    *
@@ -232,7 +218,8 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
 
 public: 
   mClockScheduler(CephContext *cct, int whoami, uint32_t num_shards,
-    int shard_id, bool is_rotational, MonClient *monc);
+    int shard_id, bool is_rotational, unsigned cutoff_priority,
+    MonClient *monc);
   ~mClockScheduler() override;
 
   /// Calculate scaled cost per item
@@ -260,6 +247,7 @@ public:
 
   void print(std::ostream &ostream) const final {
     ostream << get_op_queue_type_name(get_type());
+    ostream << ", cutoff=" << cutoff_priority;
   }
 
   // Update data associated with the modified mclock config key(s)
index 8291da2684a9682ab239bbb67188a0a1d963b3d1..205ef2f98f6bce33978e61a6dc14576b6e07cc83 100644 (file)
@@ -31,6 +31,7 @@ public:
   uint32_t num_shards;
   int shard_id;
   bool is_rotational;
+  unsigned cutoff_priority;
   MonClient *monc;
   mClockScheduler q;
 
@@ -43,8 +44,10 @@ public:
     num_shards(1),
     shard_id(0),
     is_rotational(false),
+    cutoff_priority(12),
     monc(nullptr),
-    q(g_ceph_context, whoami, num_shards, shard_id, is_rotational, monc),
+    q(g_ceph_context, whoami, num_shards, shard_id, is_rotational,
+      cutoff_priority, monc),
     client1(1001),
     client2(9999),
     client3(100000001)