From bfbc6b65c672d6dc4326d4e29b4a1ee106c9c091 Mon Sep 17 00:00:00 2001
From: Sridhar Seshasayee
Date: Mon, 13 Nov 2023 17:43:40 +0530
Subject: [PATCH] osd: Apply randomly determined IO priority cutoff across all
 OSD shards

Determine the op priority cutoff once for an OSD and apply it across
all of the OSD's shards, which is the more realistic scenario.
Previously, the cutoff value was randomized independently on each OSD
shard, which led to inconsistent behavior in testing.

The IO priority cutoff is now determined before the OSD shards are
initialized. The value is then passed down to the OpScheduler
implementations, which are modified to accept it and apply it during
initialization.

Fixes: https://tracker.ceph.com/issues/62171
Signed-off-by: Sridhar Seshasayee
---
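Note: the sketch below is illustrative only and is not part of the patch.
It distills the pattern this change adopts: resolve osd_op_queue_cut_off
exactly once, before any shard exists, and hand the same value to every
shard, so even the 'debug_random' setting yields one consistent cutoff
per OSD. The names resolve_cut_off, Shard, PRIO_LOW and PRIO_HIGH are
hypothetical stand-ins; the real constants are CEPH_MSG_PRIO_LOW/HIGH
from include/msgr.h.

#include <random>
#include <string>
#include <vector>

// Hypothetical stand-ins for CEPH_MSG_PRIO_LOW/HIGH.
constexpr unsigned PRIO_LOW  = 64;
constexpr unsigned PRIO_HIGH = 196;

// Resolve the cutoff once, mirroring the lambda added to OSD::OSD().
unsigned resolve_cut_off(const std::string &conf_val) {
  if (conf_val == "debug_random") {
    std::random_device rd;
    std::mt19937 random_gen(rd());
    return (random_gen() % 2 < 1) ? PRIO_HIGH : PRIO_LOW;
  } else if (conf_val == "high") {
    return PRIO_HIGH;
  }
  return PRIO_LOW;  // default / catch-all is 'low'
}

struct Shard {
  unsigned cutoff;  // previously each shard rolled its own random value
};

int main() {
  const unsigned cut_off = resolve_cut_off("debug_random");
  std::vector<Shard> shards;
  for (int i = 0; i < 5; ++i) {
    shards.push_back(Shard{cut_off});  // every shard sees the same cutoff
  }
}
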
 src/osd/OSD.cc                       | 23 ++++++++++++++++++++---
 src/osd/OSD.h                        |  3 ++-
 src/osd/scheduler/OpScheduler.cc     |  6 ++++--
 src/osd/scheduler/OpScheduler.h      | 17 +++--------------
 src/osd/scheduler/mClockScheduler.cc |  2 ++
 src/osd/scheduler/mClockScheduler.h  | 20 ++++----------------
 src/test/osd/TestMClockScheduler.cc  |  5 ++++-
 7 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 1bf3f7a5c906b..cc51305a17a31 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -2403,6 +2403,21 @@ OSD::OSD(CephContext *cct_,
   };
   op_queue_type_t op_queue = get_op_queue_type();
 
+  // Determine op queue cutoff
+  auto get_op_queue_cut_off = [&conf = cct->_conf]() {
+    if (conf.get_val<std::string>("osd_op_queue_cut_off") == "debug_random") {
+      std::random_device rd;
+      std::mt19937 random_gen(rd());
+      return (random_gen() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
+    } else if (conf.get_val<std::string>("osd_op_queue_cut_off") == "high") {
+      return CEPH_MSG_PRIO_HIGH;
+    } else {
+      // default / catch-all is 'low'
+      return CEPH_MSG_PRIO_LOW;
+    }
+  };
+  unsigned op_queue_cut_off = get_op_queue_cut_off();
+
   // initialize shards
   num_shards = get_num_op_shards();
   for (uint32_t i = 0; i < num_shards; i++) {
@@ -2410,7 +2425,8 @@ OSD::OSD(CephContext *cct_,
       i,
       cct,
       this,
-      op_queue);
+      op_queue,
+      op_queue_cut_off);
     shards.push_back(one_shard);
   }
 }
@@ -10706,7 +10722,8 @@ OSDShard::OSDShard(
   int id,
   CephContext *cct,
   OSD *osd,
-  op_queue_type_t osd_op_queue)
+  op_queue_type_t osd_op_queue,
+  unsigned osd_op_queue_cut_off)
   : shard_id(id),
     cct(cct),
     osd(osd),
@@ -10718,7 +10735,7 @@ OSDShard::OSDShard(
     shard_lock{make_mutex(shard_lock_name)},
     scheduler(ceph::osd::scheduler::make_scheduler(
       cct, osd->whoami, osd->num_shards, id, osd->store->is_rotational(),
-      osd->store->get_type(), osd_op_queue, osd->monc)),
+      osd->store->get_type(), osd_op_queue, osd_op_queue_cut_off, osd->monc)),
     context_queue(sdata_wait_lock, sdata_cond)
 {
   dout(0) << "using op scheduler " << *scheduler << dendl;
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 859fdbbbe2348..0d08466bf1f12 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -1038,7 +1038,8 @@ struct OSDShard {
     int id,
     CephContext *cct,
     OSD *osd,
-    op_queue_type_t osd_op_queue);
+    op_queue_type_t osd_op_queue,
+    unsigned osd_op_queue_cut_off);
 };
 
 class OSD : public Dispatcher,
diff --git a/src/osd/scheduler/OpScheduler.cc b/src/osd/scheduler/OpScheduler.cc
index 7b89f4be0221d..12e5bdb6c45fb 100644
--- a/src/osd/scheduler/OpScheduler.cc
+++ b/src/osd/scheduler/OpScheduler.cc
@@ -24,7 +24,7 @@ namespace ceph::osd::scheduler {
 OpSchedulerRef make_scheduler(
   CephContext *cct, int whoami, uint32_t num_shards, int shard_id,
   bool is_rotational, std::string_view osd_objectstore,
-  op_queue_type_t osd_scheduler, MonClient *monc)
+  op_queue_type_t osd_scheduler, unsigned op_queue_cut_off, MonClient *monc)
 {
   // Force the use of 'wpq' scheduler for filestore OSDs.
   // The 'mclock_scheduler' is not supported for filestore OSDs.
@@ -33,13 +33,15 @@ OpSchedulerRef make_scheduler(
     return std::make_unique<
       ClassedOpQueueScheduler<WeightedPriorityQueue<OpSchedulerItem, client>>>(
         cct,
+        op_queue_cut_off,
         cct->_conf->osd_op_pq_max_tokens_per_priority,
         cct->_conf->osd_op_pq_min_cost
       );
   } else if (op_queue_type_t::mClockScheduler == osd_scheduler) {
     // default is 'mclock_scheduler'
     return std::make_unique<
-      mClockScheduler>(cct, whoami, num_shards, shard_id, is_rotational, monc);
+      mClockScheduler>(cct, whoami, num_shards, shard_id, is_rotational,
+                       op_queue_cut_off, monc);
   } else {
     ceph_assert("Invalid choice of wq" == 0);
   }
diff --git a/src/osd/scheduler/OpScheduler.h b/src/osd/scheduler/OpScheduler.h
index 382f48dd40c12..570a2a162900a 100644
--- a/src/osd/scheduler/OpScheduler.h
+++ b/src/osd/scheduler/OpScheduler.h
@@ -68,7 +68,7 @@ using OpSchedulerRef = std::unique_ptr<OpScheduler>;
 OpSchedulerRef make_scheduler(
   CephContext *cct, int whoami, uint32_t num_shards, int shard_id,
   bool is_rotational, std::string_view osd_objectstore,
-  op_queue_type_t osd_scheduler, MonClient *monc);
+  op_queue_type_t osd_scheduler, unsigned op_queue_cut_off, MonClient *monc);
 
 /**
  * Implements OpScheduler in terms of OpQueue
@@ -83,21 +83,10 @@ class ClassedOpQueueScheduler final : public OpScheduler {
   unsigned cutoff;
   T queue;
 
-  static unsigned int get_io_prio_cut(CephContext *cct) {
-    if (cct->_conf->osd_op_queue_cut_off == "debug_random") {
-      srand(time(NULL));
-      return (rand() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
-    } else if (cct->_conf->osd_op_queue_cut_off == "high") {
-      return CEPH_MSG_PRIO_HIGH;
-    } else {
-      // default / catch-all is 'low'
-      return CEPH_MSG_PRIO_LOW;
-    }
-  }
 public:
   template <typename... Args>
-  ClassedOpQueueScheduler(CephContext *cct, Args&&... args) :
-    cutoff(get_io_prio_cut(cct)),
+  ClassedOpQueueScheduler(CephContext *cct, unsigned prio_cut, Args&&... args) :
+    cutoff(prio_cut),
     queue(std::forward<Args>(args)...)
   {}
 
diff --git a/src/osd/scheduler/mClockScheduler.cc b/src/osd/scheduler/mClockScheduler.cc
index 0ea519655d85b..f72683d527923 100644
--- a/src/osd/scheduler/mClockScheduler.cc
+++ b/src/osd/scheduler/mClockScheduler.cc
@@ -35,12 +35,14 @@ mClockScheduler::mClockScheduler(CephContext *cct,
   uint32_t num_shards,
   int shard_id,
   bool is_rotational,
+  unsigned cutoff_priority,
   MonClient *monc)
   : cct(cct),
     whoami(whoami),
     num_shards(num_shards),
     shard_id(shard_id),
     is_rotational(is_rotational),
+    cutoff_priority(cutoff_priority),
     monc(monc),
     scheduler(
       std::bind(&mClockScheduler::ClientRegistry::get_info,
diff --git a/src/osd/scheduler/mClockScheduler.h b/src/osd/scheduler/mClockScheduler.h
index 9f32918827980..16e7f911ff954 100644
--- a/src/osd/scheduler/mClockScheduler.h
+++ b/src/osd/scheduler/mClockScheduler.h
@@ -96,6 +96,7 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
   const uint32_t num_shards;
   const int shard_id;
   const bool is_rotational;
+  const unsigned cutoff_priority;
   MonClient *monc;
 
   /**
@@ -198,21 +199,6 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
     };
   }
 
-  static unsigned int get_io_prio_cut(CephContext *cct) {
-    if (cct->_conf->osd_op_queue_cut_off == "debug_random") {
-      std::random_device rd;
-      std::mt19937 random_gen(rd());
-      return (random_gen() % 2 < 1) ?
-        CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
-    } else if (cct->_conf->osd_op_queue_cut_off == "high") {
-      return CEPH_MSG_PRIO_HIGH;
-    } else {
-      // default / catch-all is 'low'
-      return CEPH_MSG_PRIO_LOW;
-    }
-  }
-
-  unsigned cutoff_priority = get_io_prio_cut(cct);
-
   /**
    * set_osd_capacity_params_from_config
    *
@@ -232,7 +218,8 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
 
 public:
   mClockScheduler(CephContext *cct, int whoami, uint32_t num_shards,
-    int shard_id, bool is_rotational, MonClient *monc);
+    int shard_id, bool is_rotational, unsigned cutoff_priority,
+    MonClient *monc);
   ~mClockScheduler() override;
 
   /// Calculate scaled cost per item
@@ -260,6 +247,7 @@ public:
 
   void print(std::ostream &ostream) const final {
     ostream << get_op_queue_type_name(get_type());
+    ostream << ", cutoff=" << cutoff_priority;
   }
 
   // Update data associated with the modified mclock config key(s)
diff --git a/src/test/osd/TestMClockScheduler.cc b/src/test/osd/TestMClockScheduler.cc
index e7bac03d2abd5..325ebe77e802f 100644
--- a/src/test/osd/TestMClockScheduler.cc
+++ b/src/test/osd/TestMClockScheduler.cc
@@ -31,6 +31,7 @@ public:
   uint32_t num_shards;
   int shard_id;
   bool is_rotational;
+  unsigned cutoff_priority;
   MonClient *monc;
   mClockScheduler q;
 
@@ -43,8 +44,10 @@ public:
     num_shards(1),
     shard_id(0),
     is_rotational(false),
+    cutoff_priority(12),
     monc(nullptr),
-    q(g_ceph_context, whoami, num_shards, shard_id, is_rotational, monc),
+    q(g_ceph_context, whoami, num_shards, shard_id, is_rotational,
+      cutoff_priority, monc),
     client1(1001),
     client2(9999),
     client3(100000001)
-- 
2.39.5
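Illustration (not part of the patch): why a per-shard random cutoff was
a problem. The cutoff decides which band an op's priority falls into;
under the assumption, consistent with how ClassedOpQueueScheduler uses
its cutoff member above, that ops at or above the cutoff are queued
strictly while the rest are queued fairly, two shards that drew
different random cutoffs would classify the same op differently. A
minimal standalone sketch (Band and classify are hypothetical names):

#include <iostream>

// Sketch of the classification the cutoff drives; the real logic lives
// in ClassedOpQueueScheduler's enqueue path.
enum class Band { Strict, Normal };

Band classify(unsigned op_priority, unsigned cutoff) {
  return op_priority >= cutoff ? Band::Strict : Band::Normal;
}

int main() {
  // Before this patch, one shard could draw a 'low' cutoff (64) and
  // another a 'high' cutoff (196); a priority-127 op would then be
  // strict on the first shard but not on the second.
  std::cout << (classify(127, 64)  == Band::Strict) << "\n";  // prints 1
  std::cout << (classify(127, 196) == Band::Strict) << "\n";  // prints 0
}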