From: Sridhar Seshasayee Date: Fri, 15 Sep 2023 13:54:08 +0000 (+0530) Subject: common, osd: Apply randomly selected scheduler type across all OSD shards X-Git-Tag: v18.2.4~123^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=294af47cbbd99e5a0fa71c0d5c6f0cad6dbf98bc;p=ceph.git common, osd: Apply randomly selected scheduler type across all OSD shards Originally, the choice of 'debug_random' for osd_op_queue resulted in the selection of a random scheduler type for each OSD shard. A more realistic scenario for testing would be the selection of the random scheduler type applied globally for all shards of an OSD. In other words, all OSD shards would employ the same scheduler type. For example, this scenario would be possible during upgrades when the scheduler type has changed between releases. The following changes are made as part of the commit: 1. Introduce enum class op_queue_type_t within osd_types.h that holds the various op queue types supported. This header is included by OpQueue.h. Add helper functions in osd_types.cc to return the op_queue_type_t as enum or a string representing the enum member. 2. Determine the scheduler type before initializing the OSD shards in the OSD class constructor. 3. Pass the determined op_queue_type_t to the OSDShard's make_scheduler() method for each shard. This ensures all shards of the OSD are initialized with the same scheduler type. 4. Rename & modify the unused OSDShard::get_scheduler_type() method to return op_queue_type_t set for the queue. 5. Introduce OpScheduler::get_type() and OpQueue::get_type() pure virtual functions and define them within the respective queue implementation. This returns a value pertaining to the op queue type. This is called by OSDShard::get_op_queue_type(). 6. Add OSD::osd_op_queue_type() method for determining the scheduler type set on the OSD shards. 
Since all OSD shards are set to use the same scheduler type, the shard with the lowest id is used to get the scheduler type using OSDShard::get_op_queue_type(). 7. Improve comment description related to 'osd_op_queue' option in common/options/osd.yaml.in. Call Flow -------- OSD OSDShard OpScheduler/OpQueue --- -------- ------------------- osd_op_queue_type() -> get_op_queue_type() -> get_type() Fixes: https://tracker.ceph.com/issues/62171 Signed-off-by: Sridhar Seshasayee (cherry picked from commit 96df279132473f459c692787609702542a32231f) --- diff --git a/src/common/OpQueue.h b/src/common/OpQueue.h index 0204f4b4403..07104b21f53 100644 --- a/src/common/OpQueue.h +++ b/src/common/OpQueue.h @@ -16,6 +16,7 @@ #define OP_QUEUE_H #include "include/msgr.h" +#include "osd/osd_types.h" #include #include @@ -66,6 +67,9 @@ public: // Human readable brief description of queue and relevant parameters virtual void print(std::ostream &f) const = 0; + // Get the type of OpQueue implementation + virtual op_queue_type_t get_type() const = 0; + // Don't leak resources on destruction virtual ~OpQueue() {}; }; diff --git a/src/common/PrioritizedQueue.h b/src/common/PrioritizedQueue.h index 9adf21aafe1..0c006795eb8 100644 --- a/src/common/PrioritizedQueue.h +++ b/src/common/PrioritizedQueue.h @@ -345,7 +345,11 @@ public: } void print(std::ostream &ostream) const final { - ostream << "PrioritizedQueue"; + ostream << get_op_queue_type_name(get_type()); + } + + op_queue_type_t get_type() const final { + return op_queue_type_t::PrioritizedQueue; } }; diff --git a/src/common/WeightedPriorityQueue.h b/src/common/WeightedPriorityQueue.h index cf34709b979..c8d92b5e05f 100644 --- a/src/common/WeightedPriorityQueue.h +++ b/src/common/WeightedPriorityQueue.h @@ -346,7 +346,11 @@ class WeightedPriorityQueue : public OpQueue } void print(std::ostream &ostream) const final { - ostream << "WeightedPriorityQueue"; + ostream << get_op_queue_type_name(get_type()); + } + + op_queue_type_t get_type() 
const final { + return op_queue_type_t::WeightedPriorityQueue; } }; diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in index 7291ce11dc1..0a908092331 100644 --- a/src/common/options/osd.yaml.in +++ b/src/common/options/osd.yaml.in @@ -895,13 +895,13 @@ options: desc: Do not store full-object checksums if the backend (bluestore) does its own checksums. Only usable with all BlueStore OSDs. default: false -# PrioritzedQueue (prio), Weighted Priority Queue (wpq ; default), -# mclock_opclass, mclock_client, or debug_random. "mclock_opclass" -# and "mclock_client" are based on the mClock/dmClock algorithm -# (Gulati, et al. 2010). "mclock_opclass" prioritizes based on the -# class the operation belongs to. "mclock_client" does the same but -# also works to ienforce fairness between clients. "debug_random" -# chooses among all four with equal probability. +# Weighted Priority Queue (wpq), mClock Scheduler (mclock_scheduler: default) +# or debug_random. "mclock_scheduler" is based on the mClock/dmClock +# algorithm (Gulati, et al. 2010). "mclock_scheduler" prioritizes based on +# the class the operation belongs to. "wpq" dequeues ops based on their +# priorities. "debug_random" chooses among the two with equal probability. +# Note: PrioritzedQueue (prio) implementation is not used for scheduling ops +# within OSDs and is therefore not listed. 
- name: osd_op_queue type: str level: advanced diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index c61e7d33218..c53c07dfc69 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2385,13 +2385,39 @@ OSD::OSD(CephContext *cct_, trace_endpoint.copy_name(ss.str()); #endif + // Determine scheduler type for this OSD + auto get_op_queue_type = [this, &conf = cct->_conf]() { + op_queue_type_t queue_type; + if (auto type = conf.get_val("osd_op_queue"); + type != "debug_random") { + if (auto qt = get_op_queue_type_by_name(type); qt.has_value()) { + queue_type = *qt; + } else { + // This should never happen + dout(0) << "Invalid value passed for 'osd_op_queue': " << type << dendl; + ceph_assert(0 == "Unsupported op queue type"); + } + } else { + static const std::vector index_lookup = { + op_queue_type_t::mClockScheduler, + op_queue_type_t::WeightedPriorityQueue + }; + std::mt19937 random_gen(std::random_device{}()); + auto which = random_gen() % index_lookup.size(); + queue_type = index_lookup[which]; + } + return queue_type; + }; + op_queue_type_t op_queue = get_op_queue_type(); + // initialize shards num_shards = get_num_op_shards(); for (uint32_t i = 0; i < num_shards; i++) { OSDShard *one_shard = new OSDShard( i, cct, - this); + this, + op_queue); shards.push_back(one_shard); } } @@ -10195,6 +10221,16 @@ bool OSD::unsupported_objstore_for_qos() store->get_type()) != unsupported_objstores.end(); } +op_queue_type_t OSD::osd_op_queue_type() const +{ + /** + * All OSD shards employ the same scheduler type. Therefore, return + * the scheduler type set on the OSD shard with lowest id(0). 
+ */ + ceph_assert(shards.size()); + return shards[0]->get_op_queue_type(); +} + void OSD::update_log_config() { auto parsed_options = clog->parse_client_options(cct); @@ -10695,17 +10731,16 @@ void OSDShard::update_scheduler_config() scheduler->update_configuration(); } -std::string OSDShard::get_scheduler_type() +op_queue_type_t OSDShard::get_op_queue_type() const { - std::ostringstream scheduler_type; - scheduler_type << *scheduler; - return scheduler_type.str(); + return scheduler->get_type(); } OSDShard::OSDShard( int id, CephContext *cct, - OSD *osd) + OSD *osd, + op_queue_type_t osd_op_queue) : shard_id(id), cct(cct), osd(osd), @@ -10717,7 +10752,7 @@ OSDShard::OSDShard( shard_lock{make_mutex(shard_lock_name)}, scheduler(ceph::osd::scheduler::make_scheduler( cct, osd->whoami, osd->num_shards, id, osd->store->is_rotational(), - osd->store->get_type(), osd->monc)), + osd->store->get_type(), osd_op_queue, osd->monc)), context_queue(sdata_wait_lock, sdata_cond) { dout(0) << "using op scheduler " << *scheduler << dendl; diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 00fab7ec83e..19c5472e654 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1050,12 +1050,13 @@ struct OSDShard { void register_and_wake_split_child(PG *pg); void unprime_split_children(spg_t parent, unsigned old_pg_num); void update_scheduler_config(); - std::string get_scheduler_type(); + op_queue_type_t get_op_queue_type() const; OSDShard( int id, CephContext *cct, - OSD *osd); + OSD *osd, + op_queue_type_t osd_op_queue); }; class OSD : public Dispatcher, @@ -2055,6 +2056,9 @@ public: OSDService service; friend class OSDService; + /// op queue type set for the OSD + op_queue_type_t osd_op_queue_type() const; + private: void set_perf_queries(const ConfigPayload &config_payload); MetricPayload get_perf_reports(); diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 14694de195b..841a44b32f5 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -7371,3 +7371,31 @@ bool 
PGLSPlainFilter::filter(const hobject_t& obj, { return xattr_data.contents_equal(val.c_str(), val.size()); } + +std::string_view get_op_queue_type_name(const op_queue_type_t &q) +{ + switch (q) { + case op_queue_type_t::WeightedPriorityQueue: + return "wpq"; + case op_queue_type_t::mClockScheduler: + return "mclock_scheduler"; + case op_queue_type_t::PrioritizedQueue: + return "PrioritizedQueue"; + default: + return "unknown"; + } +} + +std::optional get_op_queue_type_by_name( + const std::string_view &s) +{ + if (s == "wpq") { + return op_queue_type_t::WeightedPriorityQueue; + } else if (s == "mclock_scheduler") { + return op_queue_type_t::mClockScheduler; + } else if (s == "PrioritizedQueue") { + return op_queue_type_t::PrioritizedQueue; + } else { + return std::nullopt; + } +} diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index afed5fa8351..3ef6c8232bc 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -6636,4 +6636,18 @@ using missing_map_t = std::map, std::optional>>; +/** + * op_queue_type_t + * + * Supported op queue types + */ +enum class op_queue_type_t : uint8_t { + WeightedPriorityQueue = 0, + mClockScheduler, + PrioritizedQueue +}; +std::string_view get_op_queue_type_name(const op_queue_type_t &q); +std::optional get_op_queue_type_by_name( + const std::string_view &s); + #endif diff --git a/src/osd/scheduler/OpScheduler.cc b/src/osd/scheduler/OpScheduler.cc index cb5ef13b6f6..7b89f4be022 100644 --- a/src/osd/scheduler/OpScheduler.cc +++ b/src/osd/scheduler/OpScheduler.cc @@ -23,27 +23,20 @@ namespace ceph::osd::scheduler { OpSchedulerRef make_scheduler( CephContext *cct, int whoami, uint32_t num_shards, int shard_id, - bool is_rotational, std::string_view osd_objectstore, MonClient *monc) + bool is_rotational, std::string_view osd_objectstore, + op_queue_type_t osd_scheduler, MonClient *monc) { - const std::string *type = &cct->_conf->osd_op_queue; - if (*type == "debug_random") { - static const std::string index_lookup[] = { 
"mclock_scheduler", - "wpq" }; - srand(time(NULL)); - unsigned which = rand() % (sizeof(index_lookup) / sizeof(index_lookup[0])); - type = &index_lookup[which]; - } - // Force the use of 'wpq' scheduler for filestore OSDs. // The 'mclock_scheduler' is not supported for filestore OSDs. - if (*type == "wpq" || osd_objectstore == "filestore") { + if (op_queue_type_t::WeightedPriorityQueue == osd_scheduler || + osd_objectstore == "filestore") { return std::make_unique< ClassedOpQueueScheduler>>( cct, cct->_conf->osd_op_pq_max_tokens_per_priority, cct->_conf->osd_op_pq_min_cost ); - } else if (*type == "mclock_scheduler") { + } else if (op_queue_type_t::mClockScheduler == osd_scheduler) { // default is 'mclock_scheduler' return std::make_unique< mClockScheduler>(cct, whoami, num_shards, shard_id, is_rotational, monc); diff --git a/src/osd/scheduler/OpScheduler.h b/src/osd/scheduler/OpScheduler.h index 1575bcae4f6..382f48dd40c 100644 --- a/src/osd/scheduler/OpScheduler.h +++ b/src/osd/scheduler/OpScheduler.h @@ -18,6 +18,7 @@ #include #include "common/ceph_context.h" +#include "common/OpQueue.h" #include "mon/MonClient.h" #include "osd/scheduler/OpSchedulerItem.h" @@ -54,6 +55,9 @@ public: // Apply config changes to the scheduler (if any) virtual void update_configuration() = 0; + // Get the scheduler type set for the queue + virtual op_queue_type_t get_type() const = 0; + // Destructor virtual ~OpScheduler() {}; }; @@ -63,7 +67,8 @@ using OpSchedulerRef = std::unique_ptr; OpSchedulerRef make_scheduler( CephContext *cct, int whoami, uint32_t num_shards, int shard_id, - bool is_rotational, std::string_view osd_objectstore, MonClient *monc); + bool is_rotational, std::string_view osd_objectstore, + op_queue_type_t osd_scheduler, MonClient *monc); /** * Implements OpScheduler in terms of OpQueue @@ -143,6 +148,10 @@ public: // no-op } + op_queue_type_t get_type() const final { + return queue.get_type(); + } + ~ClassedOpQueueScheduler() final {}; }; diff --git 
a/src/osd/scheduler/mClockScheduler.h b/src/osd/scheduler/mClockScheduler.h index c1aca5da5d1..9f329188279 100644 --- a/src/osd/scheduler/mClockScheduler.h +++ b/src/osd/scheduler/mClockScheduler.h @@ -259,12 +259,17 @@ public: void dump(ceph::Formatter &f) const final; void print(std::ostream &ostream) const final { - ostream << "mClockScheduler"; + ostream << get_op_queue_type_name(get_type()); } // Update data associated with the modified mclock config key(s) void update_configuration() final; + // Return the scheduler type + op_queue_type_t get_type() const final { + return op_queue_type_t::mClockScheduler; + } + const char** get_tracked_conf_keys() const final; void handle_conf_change(const ConfigProxy& conf, const std::set &changed) final;