PerfCountersBuilder m(cct, "mclock-shard-queue-" + std::to_string(shard_id),
l_mclock_first, l_mclock_last);
+ // scheduler class queue lengths
m.add_u64_counter(l_mclock_immediate_queue_len, "mclock_immediate_queue_len",
"high_priority op count in mclock queue");
m.add_u64_counter(l_mclock_client_queue_len, "mclock_client_queue_len",
"background_best_effort type op count in mclock queue");
m.add_u64_counter(l_mclock_all_type_queue_len, "mclock_all_type_queue_len",
"all type op count in mclock queue");
+ // op specific counters
+ // peering
+ m.add_time_avg(l_mclock_peering_lat, "mclock_peering_lat",
+ "peering op average latency in mclock high queue");
+ m.add_u64_counter(l_mclock_peering_len, "mclock_peering_len",
+ "peering op count in mclock high queue");
+ // client op
+ m.add_time_avg(l_mclock_client_lat, "mclock_client_lat",
+ "client op average latency in mclock queue");
+ // ec reads
+ m.add_u64_counter(l_mclock_ec_r_ops, "mclock_ec_r_ops",
+ "ec read operations in mclock high queue");
+ m.add_time_avg(l_mclock_ec_r_lat, "mclock_ec_r_lat",
+ "ec read latency in mclock high queue");
+ m.add_u64(l_mclock_ec_r_len, "mclock_ec_r_len",
+ "ec reads outstanding in mclock high queue");
+ // ec writes
+ m.add_u64_counter(l_mclock_ec_w_ops, "mclock_ec_w_ops",
+ "ec write operations in mclock high queue");
+ m.add_time_avg(l_mclock_ec_w_lat, "mclock_ec_w_lat",
+ "ec write latency in mclock high queue");
+ m.add_u64(l_mclock_ec_w_len, "mclock_ec_w_len",
+ "ec writes outstanding in mclock high queue");
+ // ec recovery reads
+ m.add_u64_counter(l_mclock_ec_rec_r_ops, "mclock_ec_rec_r_ops",
+ "ec recovery read operations in mclock queue");
+ m.add_u64_counter(l_mclock_ec_rec_r_outb, "mclock_ec_rec_r_outb",
+ "ec recovery read bytes", NULL, 0, unit_t(UNIT_BYTES));
+ m.add_time_avg(l_mclock_ec_rec_r_lat, "mclock_ec_rec_r_lat",
+ "ec recovery read latency in mclock queue");
+ m.add_u64(l_mclock_ec_rec_r_len, "mclock_ec_rec_r_len",
+ "ec recovery reads outstanding in mclock queue");
logger = m.create_perf_counters();
cct->get_perfcounters_collection()->add(logger);
logger->set(l_mclock_recovery_queue_len, 0);
logger->set(l_mclock_best_effort_queue_len, 0);
logger->set(l_mclock_all_type_queue_len, 0);
+ logger->set(l_mclock_peering_lat, 0);
+ logger->set(l_mclock_peering_len, 0);
+ logger->set(l_mclock_client_lat, 0);
+ logger->set(l_mclock_ec_r_ops, 0);
+ logger->set(l_mclock_ec_r_lat, 0);
+ logger->set(l_mclock_ec_r_len, 0);
+ logger->set(l_mclock_ec_w_ops, 0);
+ logger->set(l_mclock_ec_w_lat, 0);
+ logger->set(l_mclock_ec_w_len, 0);
+ logger->set(l_mclock_ec_rec_r_ops, 0);
+ logger->set(l_mclock_ec_rec_r_outb, 0);
+ logger->set(l_mclock_ec_rec_r_lat, 0);
+ logger->set(l_mclock_ec_rec_r_len, 0);
}
-void MclockConfig::get_mclock_counter(scheduler_id_t id)
+void MclockConfig::get_mclock_counter(scheduler_id_t id,
+ scheduler_op_type_t op_type, uint64_t cost)
{
if (!logger) {
return;
logger->inc(l_mclock_all_type_queue_len);
switch (id.class_id) {
- case SchedulerClass::immediate:
+ case SchedulerClass::immediate: {
logger->inc(l_mclock_immediate_queue_len);
+ if (op_type == scheduler_op_type_t::ec_read_op) {
+ logger->inc(l_mclock_ec_r_ops);
+ logger->inc(l_mclock_ec_r_len);
+ } else if (op_type == scheduler_op_type_t::ec_write_op) {
+ logger->inc(l_mclock_ec_w_ops);
+ logger->inc(l_mclock_ec_w_len);
+ } else if (op_type == scheduler_op_type_t::peering_op) {
+ logger->inc(l_mclock_peering_len);
+ }
break;
+ }
case SchedulerClass::client:
logger->inc(l_mclock_client_queue_len);
break;
case SchedulerClass::background_recovery:
logger->inc(l_mclock_recovery_queue_len);
break;
- case SchedulerClass::background_best_effort:
+ case SchedulerClass::background_best_effort: {
logger->inc(l_mclock_best_effort_queue_len);
+ if (op_type == scheduler_op_type_t::ec_rec_read_op) {
+ logger->inc(l_mclock_ec_rec_r_ops);
+ logger->inc(l_mclock_ec_rec_r_outb, cost);
+ logger->inc(l_mclock_ec_rec_r_len);
+ }
break;
- default:
+ }
+ default:
derr << __func__ << " unknown class_id=" << id.class_id
<< " unknown id=" << id << dendl;
break;
}
}
-void MclockConfig::put_mclock_counter(scheduler_id_t id)
+void MclockConfig::put_mclock_counter(scheduler_id_t id,
+ scheduler_op_type_t op_type, utime_t time_queued)
{
if (!logger) {
return;
}
+ auto latency = ceph_clock_now() - time_queued;
+
/* op leave mclock queue will -1 */
logger->dec(l_mclock_all_type_queue_len);
switch (id.class_id) {
- case SchedulerClass::immediate:
+ case SchedulerClass::immediate: {
logger->dec(l_mclock_immediate_queue_len);
+ if (op_type == scheduler_op_type_t::ec_read_op) {
+ logger->dec(l_mclock_ec_r_len);
+ logger->tinc(l_mclock_ec_r_lat, latency);
+ } else if (op_type == scheduler_op_type_t::ec_write_op) {
+ logger->dec(l_mclock_ec_w_len);
+ logger->tinc(l_mclock_ec_w_lat, latency);
+ } else if (op_type == scheduler_op_type_t::peering_op) {
+ logger->dec(l_mclock_peering_len);
+ logger->tinc(l_mclock_peering_lat, latency);
+ }
break;
- case SchedulerClass::client:
+ }
+ case SchedulerClass::client: {
logger->dec(l_mclock_client_queue_len);
+ logger->tinc(l_mclock_client_lat, latency);
break;
+ }
case SchedulerClass::background_recovery:
logger->dec(l_mclock_recovery_queue_len);
break;
- case SchedulerClass::background_best_effort:
+ case SchedulerClass::background_best_effort: {
logger->dec(l_mclock_best_effort_queue_len);
+ if (op_type == scheduler_op_type_t::ec_rec_read_op) {
+ logger->dec(l_mclock_ec_rec_r_len);
+ logger->tinc(l_mclock_ec_rec_r_lat, latency);
+ }
break;
+ }
default:
derr << __func__ << " unknown class_id=" << id.class_id
<< " unknown id=" << id << dendl;
immediate,
};
+// scheduler op types - used for perf counters
+enum class scheduler_op_type_t : uint8_t {
+ peering_op = 0,
+ ec_read_op,
+ ec_write_op,
+ ec_rec_read_op,
+ unknown,
+};
+
#ifdef WITH_CRIMSON
using SchedulerClass = scheduler_class_t;
using MonClient = crimson::mon::Client;
enum {
l_mclock_first = 15000,
+ // scheduler class queue lengths
l_mclock_immediate_queue_len,
l_mclock_client_queue_len,
l_mclock_recovery_queue_len,
l_mclock_best_effort_queue_len,
l_mclock_all_type_queue_len,
+ // op specific counters
+ // peering
+ l_mclock_peering_lat,
+ l_mclock_peering_len,
+ // client ops
+ l_mclock_client_lat,
+ // ec reads
+ l_mclock_ec_r_ops,
+ l_mclock_ec_r_lat,
+ l_mclock_ec_r_len,
+ // ec writes
+ l_mclock_ec_w_ops,
+ l_mclock_ec_w_lat,
+ l_mclock_ec_w_len,
+ // ec recovery reads
+ l_mclock_ec_rec_r_ops,
+ l_mclock_ec_rec_r_outb,
+ l_mclock_ec_rec_r_lat,
+ l_mclock_ec_rec_r_len,
l_mclock_last,
};
void set_from_config();
void init_logger();
- void get_mclock_counter(scheduler_id_t id);
- void put_mclock_counter(scheduler_id_t id);
+ void get_mclock_counter(scheduler_id_t id,
+ scheduler_op_type_t op_type,
+ uint64_t cost);
+ void put_mclock_counter(scheduler_id_t id,
+ scheduler_op_type_t op_type,
+ utime_t time_queued);
double get_cost_per_io() const;
double get_capacity_per_shard() const;
void handle_conf_change(const ConfigProxy& conf,
virtual void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) = 0;
virtual SchedulerClass get_scheduler_class() const = 0;
+ virtual utime_t get_time_queued() const {
+ return utime_t();
+ }
virtual ~OpQueueable() {}
friend std::ostream& operator<<(std::ostream& out, const OpQueueable& q) {
qos_cost = scaled_cost;
}
+ utime_t get_time_queued() const {
+ return qitem ? qitem->get_time_queued() : utime_t();
+ }
+
friend std::ostream& operator<<(std::ostream& out, const OpSchedulerItem& item) {
out << "OpSchedulerItem("
<< item.get_ordering_token() << " " << *item.qitem;
};
class PGOpItem : public PGOpQueueable {
+ utime_t time_queued;
OpRequestRef op;
public:
- PGOpItem(spg_t pg, OpRequestRef op) : PGOpQueueable(pg), op(std::move(op)) {}
+ PGOpItem(spg_t pg, OpRequestRef op)
+ : PGOpQueueable(pg), time_queued(ceph_clock_now()), op(std::move(op)) {}
std::ostream &print(std::ostream &rhs) const final {
return rhs << "PGOpItem(op=" << *(op->get_req()) << ")";
}
}
+ utime_t get_time_queued() const final{
+ return time_queued;
+ }
+
void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
};
class PGPeeringItem : public PGOpQueueable {
+ utime_t time_queued;
PGPeeringEventRef evt;
public:
- PGPeeringItem(spg_t pg, PGPeeringEventRef e) : PGOpQueueable(pg), evt(e) {}
+ PGPeeringItem(spg_t pg, PGPeeringEventRef e)
+ : PGOpQueueable(pg), time_queued(ceph_clock_now()), evt(e) {}
std::ostream &print(std::ostream &rhs) const final {
return rhs << "PGPeeringEvent(" << evt->get_desc() << ")";
}
SchedulerClass get_scheduler_class() const final {
return SchedulerClass::immediate;
}
+ utime_t get_time_queued() const final{
+ return time_queued;
+ }
};
class PGSnapTrim : public PGOpQueueable {
uint64_t get_reserved_pushes() const final {
return reserved_pushes;
}
+ utime_t get_time_queued() const final {
+ return time_queued;
+ }
void run(
OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
SchedulerClass get_scheduler_class() const final {
return fmt::format(
"PGRecoveryContext(pgid={} c={} epoch={})", get_pgid(), (void*)c.get(), epoch);
}
+ utime_t get_time_queued() const final {
+ return time_queued;
+ }
void run(
OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
SchedulerClass get_scheduler_class() const final {
return priority_to_scheduler_class(op->get_req()->get_priority());
}
+ utime_t get_time_queued() const final {
+ return time_queued;
+ }
+
void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle)
final;
};
f.close_section();
}
+scheduler_op_type_t
+mClockScheduler::get_scheduler_op_type(const OpSchedulerItem &item)
+{
+ if (item.is_peering()) {
+ return scheduler_op_type_t::peering_op;
+ }
+
+ const auto op = item.maybe_get_op();
+ if (!op.has_value() || !(*op) || !(*op)->get_req()) {
+ return scheduler_op_type_t::unknown;
+ }
+
+ scheduler_op_type_t sch_op_type = scheduler_op_type_t::unknown;
+ const auto op_type = (*op)->get_req()->get_type();
+ const auto id = get_scheduler_id(item);
+ switch (id.class_id) {
+ case SchedulerClass::immediate: {
+ if (op_type == MSG_OSD_EC_READ) {
+ sch_op_type = scheduler_op_type_t::ec_read_op;
+ } else if (op_type == MSG_OSD_EC_WRITE) {
+ sch_op_type = scheduler_op_type_t::ec_write_op;
+ }
+ break;
+ }
+ case SchedulerClass::background_best_effort: {
+ if (op_type == MSG_OSD_EC_READ) {
+ sch_op_type = scheduler_op_type_t::ec_rec_read_op;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ return sch_op_type;
+}
+
void mClockScheduler::enqueue(OpSchedulerItem&& item)
{
auto id = get_scheduler_id(item);
unsigned priority = item.get_priority();
-
+ scheduler_op_type_t sch_op_type = get_scheduler_op_type(item);
+ auto item_cost = item.get_cost();
+
// TODO: move this check into OpSchedulerItem, handle backwards compat
if (SchedulerClass::immediate == id.class_id) {
enqueue_high(immediate_class_priority, std::move(item));
} else if (priority >= cutoff_priority) {
enqueue_high(priority, std::move(item));
} else {
- auto cost = calc_scaled_cost(item.get_cost());
- item.set_qos_cost(cost);
+ auto qos_cost = calc_scaled_cost(item_cost);
+ item.set_qos_cost(qos_cost);
dout(20) << __func__ << " " << id
- << " item_cost: " << item.get_cost()
- << " scaled_cost: " << cost
+ << " item_cost: " << item_cost
+ << " scaled_cost: " << qos_cost
<< dendl;
+ // trigger perf counter calculations first
+ mclock_conf.get_mclock_counter(id, sch_op_type, item_cost);
+
// Add item to scheduler queue
scheduler.add_request(
std::move(item),
id,
- cost);
- mclock_conf.get_mclock_counter(id);
+ qos_cost);
}
dout(20) << __func__ << ": sched client_count: " << scheduler.client_count()
OpSchedulerItem&& item,
bool front)
{
+ scheduler_op_type_t sch_op_type = get_scheduler_op_type(item);
+ scheduler_id_t id = scheduler_id_t {
+ SchedulerClass::immediate,
+ client_profile_id_t()
+ };
+
+ // trigger perf counter calculations first
+ mclock_conf.get_mclock_counter(id, sch_op_type, item.get_cost());
+
if (front) {
high_priority[priority].push_back(std::move(item));
} else {
high_priority[priority].push_front(std::move(item));
}
-
- scheduler_id_t id = scheduler_id_t {
- SchedulerClass::immediate,
- client_profile_id_t()
- };
- mclock_conf.get_mclock_counter(id);
}
WorkItem mClockScheduler::dequeue()
// maintain invariant, high priority entries are never empty
high_priority.erase(iter);
}
- ceph_assert(std::get_if<OpSchedulerItem>(&ret));
+ auto *item_ptr = std::get_if<OpSchedulerItem>(&ret);
+ ceph_assert(item_ptr);
scheduler_id_t id = scheduler_id_t {
SchedulerClass::immediate,
client_profile_id_t()
};
- mclock_conf.put_mclock_counter(id);
+ scheduler_op_type_t op_type = get_scheduler_op_type(*item_ptr);
+ mclock_conf.put_mclock_counter(id, op_type, item_ptr->get_time_queued());
return ret;
} else {
mclock_queue_t::PullReq result = scheduler.pull_request();
ceph_assert(result.is_retn());
auto &retn = result.get_retn();
- mclock_conf.put_mclock_counter(retn.client);
+
+ // update perf counters
+ scheduler_op_type_t op_type = scheduler_op_type_t::unknown;
+ utime_t time_queued = utime_t();
+ if (retn.request) {
+ op_type = get_scheduler_op_type(*retn.request);
+ time_queued = retn.request->get_time_queued();
+ }
+ mclock_conf.put_mclock_counter(retn.client, op_type, time_queued);
+
return std::move(*retn.request);
}
}
private:
// Enqueue the op to the high priority queue
void enqueue_high(unsigned prio, OpSchedulerItem &&item, bool front = false);
+ // Return the scheduler op type - used to update perf counters
+ scheduler_op_type_t get_scheduler_op_type(const OpSchedulerItem &item);
};
}