Originally, the choice of 'debug_random' for osd_op_queue resulted in the
selection of a random scheduler type for each OSD shard. A more realistic
scenario for testing would be the selection of the random scheduler type
applied globally for all shards of an OSD. In other words, all OSD shards
would employ the same scheduler type. For e.g., this scenario would be
possible during upgrades when the scheduler type has changed between
releases.
The following changes are made as part of the commit:
1. Introduce enum class op_queue_type_t within osd_types.h that holds the
various op queue types supported. This header in included by OpQueue.h.
Add helper functions osd_types.cc to return the op_queue_type_t as
enum or a string representing the enum member.
2. Determine the scheduler type before initializing the OSD shards in
OSD class constructor.
3. Pass the determined op_queue_type_t to the OSDShard's make_scheduler()
method for each shard. This ensures all shards of the OSD are
initialized with the same scheduler type.
4. Rename & modify the unused OSDShard::get_scheduler_type() method to
return op_queue_type_t set for the queue.
5. Introduce OpScheduler::get_type() and OpQueue::get_type() pure
virtual functions and define them within the respective queue
implementation. This returns a value pertaining to the op queue type.
This is called by OSDShard::get_op_queue_type().
6. Add OSD::osd_op_queue_type() method for determining the scheduler
type set on the OSD shards. Since all OSD shards are set to use
the same scheduler type, the shard with the lowest id is used to
get the scheduler type using OSDShard::get_op_queue_type().
7. Improve comment description related to 'osd_op_queue' option in
common/options/osd.yaml.in.
Call Flow
--------
OSD OSDShard OpScheduler/OpQueue
--- -------- -------------------
osd_op_queue_type() ->
get_op_queue_type() ->
get_type()
Fixes: https://tracker.ceph.com/issues/62171
Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
(cherry picked from commit
96df279132473f459c692787609702542a32231f)
#define OP_QUEUE_H
#include "include/msgr.h"
+#include "osd/osd_types.h"
#include <list>
#include <functional>
// Human readable brief description of queue and relevant parameters
virtual void print(std::ostream &f) const = 0;
+ // Get the type of OpQueue implementation
+ virtual op_queue_type_t get_type() const = 0;
+
// Don't leak resources on destruction
virtual ~OpQueue() {};
};
}
void print(std::ostream &ostream) const final {
- ostream << "PrioritizedQueue";
+ ostream << get_op_queue_type_name(get_type());
+ }
+
+ op_queue_type_t get_type() const final {
+ return op_queue_type_t::PrioritizedQueue;
}
};
}
void print(std::ostream &ostream) const final {
- ostream << "WeightedPriorityQueue";
+ ostream << get_op_queue_type_name(get_type());
+ }
+
+ op_queue_type_t get_type() const final {
+ return op_queue_type_t::WeightedPriorityQueue;
}
};
desc: Do not store full-object checksums if the backend (bluestore) does its own
checksums. Only usable with all BlueStore OSDs.
default: false
-# PrioritzedQueue (prio), Weighted Priority Queue (wpq ; default),
-# mclock_opclass, mclock_client, or debug_random. "mclock_opclass"
-# and "mclock_client" are based on the mClock/dmClock algorithm
-# (Gulati, et al. 2010). "mclock_opclass" prioritizes based on the
-# class the operation belongs to. "mclock_client" does the same but
-# also works to ienforce fairness between clients. "debug_random"
-# chooses among all four with equal probability.
+# Weighted Priority Queue (wpq), mClock Scheduler (mclock_scheduler: default)
+# or debug_random. "mclock_scheduler" is based on the mClock/dmClock
+# algorithm (Gulati, et al. 2010). "mclock_scheduler" prioritizes based on
+# the class the operation belongs to. "wpq" dequeues ops based on their
+# priorities. "debug_random" chooses among the two with equal probability.
+# Note: PrioritzedQueue (prio) implementation is not used for scheduling ops
+# within OSDs and is therefore not listed.
- name: osd_op_queue
type: str
level: advanced
trace_endpoint.copy_name(ss.str());
#endif
+ // Determine scheduler type for this OSD
+ auto get_op_queue_type = [this, &conf = cct->_conf]() {
+ op_queue_type_t queue_type;
+ if (auto type = conf.get_val<std::string>("osd_op_queue");
+ type != "debug_random") {
+ if (auto qt = get_op_queue_type_by_name(type); qt.has_value()) {
+ queue_type = *qt;
+ } else {
+ // This should never happen
+ dout(0) << "Invalid value passed for 'osd_op_queue': " << type << dendl;
+ ceph_assert(0 == "Unsupported op queue type");
+ }
+ } else {
+ static const std::vector<op_queue_type_t> index_lookup = {
+ op_queue_type_t::mClockScheduler,
+ op_queue_type_t::WeightedPriorityQueue
+ };
+ std::mt19937 random_gen(std::random_device{}());
+ auto which = random_gen() % index_lookup.size();
+ queue_type = index_lookup[which];
+ }
+ return queue_type;
+ };
+ op_queue_type_t op_queue = get_op_queue_type();
+
// initialize shards
num_shards = get_num_op_shards();
for (uint32_t i = 0; i < num_shards; i++) {
OSDShard *one_shard = new OSDShard(
i,
cct,
- this);
+ this,
+ op_queue);
shards.push_back(one_shard);
}
}
store->get_type()) != unsupported_objstores.end();
}
+op_queue_type_t OSD::osd_op_queue_type() const
+{
+ /**
+ * All OSD shards employ the same scheduler type. Therefore, return
+ * the scheduler type set on the OSD shard with lowest id(0).
+ */
+ ceph_assert(shards.size());
+ return shards[0]->get_op_queue_type();
+}
+
void OSD::update_log_config()
{
auto parsed_options = clog->parse_client_options(cct);
scheduler->update_configuration();
}
-std::string OSDShard::get_scheduler_type()
+op_queue_type_t OSDShard::get_op_queue_type() const
{
- std::ostringstream scheduler_type;
- scheduler_type << *scheduler;
- return scheduler_type.str();
+ return scheduler->get_type();
}
OSDShard::OSDShard(
int id,
CephContext *cct,
- OSD *osd)
+ OSD *osd,
+ op_queue_type_t osd_op_queue)
: shard_id(id),
cct(cct),
osd(osd),
shard_lock{make_mutex(shard_lock_name)},
scheduler(ceph::osd::scheduler::make_scheduler(
cct, osd->whoami, osd->num_shards, id, osd->store->is_rotational(),
- osd->store->get_type(), osd->monc)),
+ osd->store->get_type(), osd_op_queue, osd->monc)),
context_queue(sdata_wait_lock, sdata_cond)
{
dout(0) << "using op scheduler " << *scheduler << dendl;
void register_and_wake_split_child(PG *pg);
void unprime_split_children(spg_t parent, unsigned old_pg_num);
void update_scheduler_config();
- std::string get_scheduler_type();
+ op_queue_type_t get_op_queue_type() const;
OSDShard(
int id,
CephContext *cct,
- OSD *osd);
+ OSD *osd,
+ op_queue_type_t osd_op_queue);
};
class OSD : public Dispatcher,
OSDService service;
friend class OSDService;
+ /// op queue type set for the OSD
+ op_queue_type_t osd_op_queue_type() const;
+
private:
void set_perf_queries(const ConfigPayload &config_payload);
MetricPayload get_perf_reports();
{
return xattr_data.contents_equal(val.c_str(), val.size());
}
+
+std::string_view get_op_queue_type_name(const op_queue_type_t &q)
+{
+ switch (q) {
+ case op_queue_type_t::WeightedPriorityQueue:
+ return "wpq";
+ case op_queue_type_t::mClockScheduler:
+ return "mclock_scheduler";
+ case op_queue_type_t::PrioritizedQueue:
+ return "PrioritizedQueue";
+ default:
+ return "unknown";
+ }
+}
+
+std::optional<op_queue_type_t> get_op_queue_type_by_name(
+ const std::string_view &s)
+{
+ if (s == "wpq") {
+ return op_queue_type_t::WeightedPriorityQueue;
+ } else if (s == "mclock_scheduler") {
+ return op_queue_type_t::mClockScheduler;
+ } else if (s == "PrioritizedQueue") {
+ return op_queue_type_t::PrioritizedQueue;
+ } else {
+ return std::nullopt;
+ }
+}
std::pair<std::optional<uint32_t>,
std::optional<uint32_t>>>;
+/**
+ * op_queue_type_t
+ *
+ * Supported op queue types
+ */
+enum class op_queue_type_t : uint8_t {
+ WeightedPriorityQueue = 0,
+ mClockScheduler,
+ PrioritizedQueue
+};
+std::string_view get_op_queue_type_name(const op_queue_type_t &q);
+std::optional<op_queue_type_t> get_op_queue_type_by_name(
+ const std::string_view &s);
+
#endif
OpSchedulerRef make_scheduler(
CephContext *cct, int whoami, uint32_t num_shards, int shard_id,
- bool is_rotational, std::string_view osd_objectstore, MonClient *monc)
+ bool is_rotational, std::string_view osd_objectstore,
+ op_queue_type_t osd_scheduler, MonClient *monc)
{
- const std::string *type = &cct->_conf->osd_op_queue;
- if (*type == "debug_random") {
- static const std::string index_lookup[] = { "mclock_scheduler",
- "wpq" };
- srand(time(NULL));
- unsigned which = rand() % (sizeof(index_lookup) / sizeof(index_lookup[0]));
- type = &index_lookup[which];
- }
-
// Force the use of 'wpq' scheduler for filestore OSDs.
// The 'mclock_scheduler' is not supported for filestore OSDs.
- if (*type == "wpq" || osd_objectstore == "filestore") {
+ if (op_queue_type_t::WeightedPriorityQueue == osd_scheduler ||
+ osd_objectstore == "filestore") {
return std::make_unique<
ClassedOpQueueScheduler<WeightedPriorityQueue<OpSchedulerItem, client>>>(
cct,
cct->_conf->osd_op_pq_max_tokens_per_priority,
cct->_conf->osd_op_pq_min_cost
);
- } else if (*type == "mclock_scheduler") {
+ } else if (op_queue_type_t::mClockScheduler == osd_scheduler) {
// default is 'mclock_scheduler'
return std::make_unique<
mClockScheduler>(cct, whoami, num_shards, shard_id, is_rotational, monc);
#include <variant>
#include "common/ceph_context.h"
+#include "common/OpQueue.h"
#include "mon/MonClient.h"
#include "osd/scheduler/OpSchedulerItem.h"
// Apply config changes to the scheduler (if any)
virtual void update_configuration() = 0;
+ // Get the scheduler type set for the queue
+ virtual op_queue_type_t get_type() const = 0;
+
// Destructor
virtual ~OpScheduler() {};
};
OpSchedulerRef make_scheduler(
CephContext *cct, int whoami, uint32_t num_shards, int shard_id,
- bool is_rotational, std::string_view osd_objectstore, MonClient *monc);
+ bool is_rotational, std::string_view osd_objectstore,
+ op_queue_type_t osd_scheduler, MonClient *monc);
/**
* Implements OpScheduler in terms of OpQueue
// no-op
}
+ op_queue_type_t get_type() const final {
+ return queue.get_type();
+ }
+
~ClassedOpQueueScheduler() final {};
};
void dump(ceph::Formatter &f) const final;
void print(std::ostream &ostream) const final {
- ostream << "mClockScheduler";
+ ostream << get_op_queue_type_name(get_type());
}
// Update data associated with the modified mclock config key(s)
void update_configuration() final;
+ // Return the scheduler type
+ op_queue_type_t get_type() const final {
+ return op_queue_type_t::mClockScheduler;
+ }
+
const char** get_tracked_conf_keys() const final;
void handle_conf_change(const ConfigProxy& conf,
const std::set<std::string> &changed) final;