]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mclock_common, mClockScheduler: Add perf counters for scheduler ops
authorSridhar Seshasayee <sridhar.seshasayee@ibm.com>
Tue, 21 Apr 2026 12:30:50 +0000 (18:00 +0530)
committerSridhar Seshasayee <sridhar.seshasayee@ibm.com>
Fri, 12 Jun 2026 06:58:48 +0000 (12:28 +0530)
Add perf counters to show the status pertaining to the number of ops,
dynamic queue lengths, queue latency and bytes read for the following
ops handled in the high queues and in the scheduler queues:
 - peering
 - client
 - ec reads/writes
 - ec recovery reads

Additional counters can be added in the future based on the requirement.

Signed-off-by: Sridhar Seshasayee <sridhar.seshasayee@ibm.com>
src/common/mclock_common.cc
src/common/mclock_common.h
src/osd/scheduler/OpSchedulerItem.h
src/osd/scheduler/mClockScheduler.cc
src/osd/scheduler/mClockScheduler.h

index 616681018f7821562a5d14eb1b539404917fb493..ae6eca8d9a1850a743b322ac3966be331232e764 100644 (file)
@@ -248,6 +248,7 @@ void MclockConfig::init_logger()
   PerfCountersBuilder m(cct, "mclock-shard-queue-" + std::to_string(shard_id),
                         l_mclock_first, l_mclock_last);
 
+  // scheduler class queue lengths
   m.add_u64_counter(l_mclock_immediate_queue_len, "mclock_immediate_queue_len",
                     "high_priority op count in mclock queue");
   m.add_u64_counter(l_mclock_client_queue_len, "mclock_client_queue_len",
@@ -258,6 +259,38 @@ void MclockConfig::init_logger()
                     "background_best_effort type op count in mclock queue");
   m.add_u64_counter(l_mclock_all_type_queue_len, "mclock_all_type_queue_len",
                     "all type op count in mclock queue");
+  // op specific counters
+  // peering
+  m.add_time_avg(l_mclock_peering_lat, "mclock_peering_lat",
+                 "peering op average latency in mclock high queue");
+  m.add_u64_counter(l_mclock_peering_len, "mclock_peering_len",
+                    "peering op count in mclock high queue");
+  // client op
+  m.add_time_avg(l_mclock_client_lat, "mclock_client_lat",
+                 "client op average latency in mclock queue");
+  // ec reads
+  m.add_u64_counter(l_mclock_ec_r_ops, "mclock_ec_r_ops",
+                    "ec read operations in mclock high queue");
+  m.add_time_avg(l_mclock_ec_r_lat, "mclock_ec_r_lat",
+                 "ec read latency in mclock high queue");
+  m.add_u64(l_mclock_ec_r_len, "mclock_ec_r_len",
+            "ec reads outstanding in mclock high queue");
+  // ec writes
+  m.add_u64_counter(l_mclock_ec_w_ops, "mclock_ec_w_ops",
+                    "ec write operations in mclock high queue");
+  m.add_time_avg(l_mclock_ec_w_lat, "mclock_ec_w_lat",
+                 "ec write latency in mclock high queue");
+  m.add_u64(l_mclock_ec_w_len, "mclock_ec_w_len",
+            "ec writes outstanding in mclock high queue");
+  // ec recovery reads
+  m.add_u64_counter(l_mclock_ec_rec_r_ops, "mclock_ec_rec_r_ops",
+                    "ec recovery read operations in mclock queue");
+  m.add_u64_counter(l_mclock_ec_rec_r_outb, "mclock_ec_rec_r_outb",
+                    "ec recovery read bytes", NULL, 0, unit_t(UNIT_BYTES));
+  m.add_time_avg(l_mclock_ec_rec_r_lat, "mclock_ec_rec_r_lat",
+                 "ec recovery read latency in mclock queue");
+  m.add_u64(l_mclock_ec_rec_r_len, "mclock_ec_rec_r_len",
+            "ec recovery reads outstanding in mclock queue");
 
   logger = m.create_perf_counters();
   cct->get_perfcounters_collection()->add(logger);
@@ -267,9 +300,23 @@ void MclockConfig::init_logger()
   logger->set(l_mclock_recovery_queue_len, 0);
   logger->set(l_mclock_best_effort_queue_len, 0);
   logger->set(l_mclock_all_type_queue_len, 0);
+  logger->set(l_mclock_peering_lat, 0);
+  logger->set(l_mclock_peering_len, 0);
+  logger->set(l_mclock_client_lat, 0);
+  logger->set(l_mclock_ec_r_ops, 0);
+  logger->set(l_mclock_ec_r_lat, 0);
+  logger->set(l_mclock_ec_r_len, 0);
+  logger->set(l_mclock_ec_w_ops, 0);
+  logger->set(l_mclock_ec_w_lat, 0);
+  logger->set(l_mclock_ec_w_len, 0);
+  logger->set(l_mclock_ec_rec_r_ops, 0);
+  logger->set(l_mclock_ec_rec_r_outb, 0);
+  logger->set(l_mclock_ec_rec_r_lat, 0);
+  logger->set(l_mclock_ec_rec_r_len, 0);
 }
 
-void MclockConfig::get_mclock_counter(scheduler_id_t id)
+void MclockConfig::get_mclock_counter(scheduler_id_t id,
+  scheduler_op_type_t op_type, uint64_t cost)
 {
   if (!logger) {
     return;
@@ -279,47 +326,84 @@ void MclockConfig::get_mclock_counter(scheduler_id_t id)
   logger->inc(l_mclock_all_type_queue_len);
 
   switch (id.class_id) {
-  case SchedulerClass::immediate:
+  case SchedulerClass::immediate: {
     logger->inc(l_mclock_immediate_queue_len);
+    if (op_type == scheduler_op_type_t::ec_read_op) {
+      logger->inc(l_mclock_ec_r_ops);
+      logger->inc(l_mclock_ec_r_len);
+    } else if (op_type == scheduler_op_type_t::ec_write_op) {
+      logger->inc(l_mclock_ec_w_ops);
+      logger->inc(l_mclock_ec_w_len);
+    } else if (op_type == scheduler_op_type_t::peering_op) {
+      logger->inc(l_mclock_peering_len);
+    }
     break;
+  }
   case SchedulerClass::client:
     logger->inc(l_mclock_client_queue_len);
     break;
   case SchedulerClass::background_recovery:
     logger->inc(l_mclock_recovery_queue_len);
     break;
-  case SchedulerClass::background_best_effort:
+  case SchedulerClass::background_best_effort: {
     logger->inc(l_mclock_best_effort_queue_len);
+    if (op_type == scheduler_op_type_t::ec_rec_read_op) {
+      logger->inc(l_mclock_ec_rec_r_ops);
+      logger->inc(l_mclock_ec_rec_r_outb, cost);
+      logger->inc(l_mclock_ec_rec_r_len);
+    }
     break;
-   default:
+  }
+  default:
     derr << __func__ << " unknown class_id=" << id.class_id
          << " unknown id=" << id << dendl;
     break;
   }
 }
 
-void MclockConfig::put_mclock_counter(scheduler_id_t id)
+void MclockConfig::put_mclock_counter(scheduler_id_t id,
+  scheduler_op_type_t op_type, utime_t time_queued)
 {
   if (!logger) {
     return;
   }
 
+  auto latency = ceph_clock_now() - time_queued;
+
   /* op leave mclock queue will -1 */
   logger->dec(l_mclock_all_type_queue_len);
 
   switch (id.class_id) {
-  case SchedulerClass::immediate:
+  case SchedulerClass::immediate: {
     logger->dec(l_mclock_immediate_queue_len);
+    if (op_type == scheduler_op_type_t::ec_read_op) {
+      logger->dec(l_mclock_ec_r_len);
+      logger->tinc(l_mclock_ec_r_lat, latency);
+    } else if (op_type == scheduler_op_type_t::ec_write_op) {
+      logger->dec(l_mclock_ec_w_len);
+      logger->tinc(l_mclock_ec_w_lat, latency);
+    } else if (op_type == scheduler_op_type_t::peering_op) {
+      logger->dec(l_mclock_peering_len);
+      logger->tinc(l_mclock_peering_lat, latency);
+    }
     break;
-  case SchedulerClass::client:
+  }
+  case SchedulerClass::client: {
     logger->dec(l_mclock_client_queue_len);
+    logger->tinc(l_mclock_client_lat, latency);
     break;
+  }
   case SchedulerClass::background_recovery:
     logger->dec(l_mclock_recovery_queue_len);
     break;
-  case SchedulerClass::background_best_effort:
+  case SchedulerClass::background_best_effort: {
     logger->dec(l_mclock_best_effort_queue_len);
+    if (op_type == scheduler_op_type_t::ec_rec_read_op) {
+      logger->dec(l_mclock_ec_rec_r_len);
+      logger->tinc(l_mclock_ec_rec_r_lat, latency);
+    }
     break;
+   }
    default:
     derr << __func__ << " unknown class_id=" << id.class_id
          << " unknown id=" << id << dendl;
index 98beb317db366c90fe373868f477307ae7f01a1f..b736308159de6160acc34c99376ec50de888a073 100644 (file)
@@ -41,6 +41,15 @@ enum class scheduler_class_t : uint8_t {
   immediate,
 };
 
+// scheduler op types - used for perf counters
+enum class scheduler_op_type_t : uint8_t {
+  peering_op = 0,
+  ec_read_op,
+  ec_write_op,
+  ec_rec_read_op,
+  unknown,
+};
+
 #ifdef WITH_CRIMSON
 using SchedulerClass = scheduler_class_t;
 using MonClient = crimson::mon::Client;
@@ -50,11 +59,31 @@ using SchedulerClass = op_scheduler_class;
 
 enum {
   l_mclock_first = 15000,
+  // scheduler class queue lengths
   l_mclock_immediate_queue_len,
   l_mclock_client_queue_len,
   l_mclock_recovery_queue_len,
   l_mclock_best_effort_queue_len,
   l_mclock_all_type_queue_len,
+  // op specific counters
+  // peering
+  l_mclock_peering_lat,
+  l_mclock_peering_len,
+  // client ops
+  l_mclock_client_lat,
+  // ec reads
+  l_mclock_ec_r_ops,
+  l_mclock_ec_r_lat,
+  l_mclock_ec_r_len,
+  // ec writes
+  l_mclock_ec_w_ops,
+  l_mclock_ec_w_lat,
+  l_mclock_ec_w_len,
+  // ec recovery reads
+  l_mclock_ec_rec_r_ops,
+  l_mclock_ec_rec_r_outb,
+  l_mclock_ec_rec_r_lat,
+  l_mclock_ec_rec_r_len,
   l_mclock_last,
 };
 
@@ -222,8 +251,12 @@ public:
 
   void set_from_config();
   void init_logger();
-  void get_mclock_counter(scheduler_id_t id);
-  void put_mclock_counter(scheduler_id_t id);
+  void get_mclock_counter(scheduler_id_t id,
+                          scheduler_op_type_t op_type,
+                          uint64_t cost);
+  void put_mclock_counter(scheduler_id_t id,
+                          scheduler_op_type_t op_type,
+                          utime_t time_queued);
   double get_cost_per_io() const;
   double get_capacity_per_shard() const;
   void handle_conf_change(const ConfigProxy& conf,
index 21fbe1abd6bbdefdf4a9bba7e486d46fb39feedd..e6643c627bbe5476115bc4ac7735f7895353a914 100644 (file)
@@ -74,6 +74,9 @@ public:
 
     virtual void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) = 0;
     virtual SchedulerClass get_scheduler_class() const = 0;
+    virtual utime_t get_time_queued() const {
+      return utime_t();
+    }
 
     virtual ~OpQueueable() {}
     friend std::ostream& operator<<(std::ostream& out, const OpQueueable& q) {
@@ -166,6 +169,10 @@ public:
     qos_cost = scaled_cost;
   }
 
+  utime_t get_time_queued() const {
+    return qitem ? qitem->get_time_queued() : utime_t();
+  }
+
   friend std::ostream& operator<<(std::ostream& out, const OpSchedulerItem& item) {
     out << "OpSchedulerItem("
         << item.get_ordering_token() << " " << *item.qitem;
@@ -220,10 +227,12 @@ public:
 };
 
 class PGOpItem : public PGOpQueueable {
+  utime_t time_queued;
   OpRequestRef op;
 
 public:
-  PGOpItem(spg_t pg, OpRequestRef op) : PGOpQueueable(pg), op(std::move(op)) {}
+  PGOpItem(spg_t pg, OpRequestRef op)
+    : PGOpQueueable(pg), time_queued(ceph_clock_now()), op(std::move(op)) {}
 
   std::ostream &print(std::ostream &rhs) const final {
     return rhs << "PGOpItem(op=" << *(op->get_req()) << ")";
@@ -282,13 +291,19 @@ public:
     }
   }
 
+  utime_t get_time_queued() const final{
+    return time_queued;
+  }
+
   void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
 };
 
 class PGPeeringItem : public PGOpQueueable {
+  utime_t time_queued;
   PGPeeringEventRef evt;
 public:
-  PGPeeringItem(spg_t pg, PGPeeringEventRef e) : PGOpQueueable(pg), evt(e) {}
+  PGPeeringItem(spg_t pg, PGPeeringEventRef e)
+    : PGOpQueueable(pg), time_queued(ceph_clock_now()), evt(e) {}
   std::ostream &print(std::ostream &rhs) const final {
     return rhs << "PGPeeringEvent(" << evt->get_desc() << ")";
   }
@@ -308,6 +323,9 @@ public:
   SchedulerClass get_scheduler_class() const final {
     return SchedulerClass::immediate;
   }
+  utime_t get_time_queued() const final{
+    return time_queued;
+  }
 };
 
 class PGSnapTrim : public PGOpQueueable {
@@ -537,6 +555,9 @@ public:
   uint64_t get_reserved_pushes() const final {
     return reserved_pushes;
   }
+  utime_t get_time_queued() const final {
+    return time_queued;
+  }
   void run(
     OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
   SchedulerClass get_scheduler_class() const final {
@@ -565,6 +586,9 @@ public:
     return fmt::format(
        "PGRecoveryContext(pgid={} c={} epoch={})", get_pgid(), (void*)c.get(), epoch);
   }
+  utime_t get_time_queued() const final {
+    return time_queued;
+  }
   void run(
     OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
   SchedulerClass get_scheduler_class() const final {
@@ -634,6 +658,10 @@ public:
     return priority_to_scheduler_class(op->get_req()->get_priority());
   }
 
+  utime_t get_time_queued() const final {
+    return time_queued;
+  }
+
   void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle)
       final;
 };
index 3cef5431ce9f35f06b04367b87903cd22971f7f9..a66a06fd769a1718749f88f30e3acf518db1717c 100644 (file)
@@ -73,30 +73,71 @@ void mClockScheduler::dump(ceph::Formatter &f) const
   f.close_section();
 }
 
+scheduler_op_type_t
+mClockScheduler::get_scheduler_op_type(const OpSchedulerItem &item)
+{
+  if (item.is_peering()) {
+    return scheduler_op_type_t::peering_op;
+  }
+
+  const auto op = item.maybe_get_op();
+  if (!op.has_value() || !(*op) || !(*op)->get_req()) {
+    return scheduler_op_type_t::unknown;
+  }
+
+  scheduler_op_type_t sch_op_type = scheduler_op_type_t::unknown;
+  const auto op_type = (*op)->get_req()->get_type();
+  const auto id = get_scheduler_id(item);
+  switch (id.class_id) {
+  case SchedulerClass::immediate: {
+    if (op_type == MSG_OSD_EC_READ) {
+      sch_op_type = scheduler_op_type_t::ec_read_op;
+    } else if (op_type == MSG_OSD_EC_WRITE) {
+      sch_op_type = scheduler_op_type_t::ec_write_op;
+    }
+    break;
+  }
+  case SchedulerClass::background_best_effort: {
+    if (op_type == MSG_OSD_EC_READ) {
+      sch_op_type = scheduler_op_type_t::ec_rec_read_op;
+    }
+    break;
+  }
+  default:
+    break;
+  }
+
+  return sch_op_type;
+}
+
 void mClockScheduler::enqueue(OpSchedulerItem&& item)
 {
   auto id = get_scheduler_id(item);
   unsigned priority = item.get_priority();
-  
+  scheduler_op_type_t sch_op_type = get_scheduler_op_type(item);
+  auto item_cost = item.get_cost();
+
   // TODO: move this check into OpSchedulerItem, handle backwards compat
   if (SchedulerClass::immediate == id.class_id) {
     enqueue_high(immediate_class_priority, std::move(item));
   } else if (priority >= cutoff_priority) {
     enqueue_high(priority, std::move(item));
   } else {
-    auto cost = calc_scaled_cost(item.get_cost());
-    item.set_qos_cost(cost);
+    auto qos_cost = calc_scaled_cost(item_cost);
+    item.set_qos_cost(qos_cost);
     dout(20) << __func__ << " " << id
-             << " item_cost: " << item.get_cost()
-             << " scaled_cost: " << cost
+             << " item_cost: " << item_cost
+             << " scaled_cost: " << qos_cost
              << dendl;
 
+    // trigger perf counter calculations first
+    mclock_conf.get_mclock_counter(id, sch_op_type, item_cost);
+
     // Add item to scheduler queue
     scheduler.add_request(
       std::move(item),
       id,
-      cost);
-    mclock_conf.get_mclock_counter(id);
+      qos_cost);
   }
 
   dout(20) << __func__ << ": sched client_count: " << scheduler.client_count()
@@ -141,17 +182,20 @@ void mClockScheduler::enqueue_high(unsigned priority,
                                    OpSchedulerItem&& item,
                                   bool front)
 {
+  scheduler_op_type_t sch_op_type = get_scheduler_op_type(item);
+  scheduler_id_t id = scheduler_id_t {
+    SchedulerClass::immediate,
+    client_profile_id_t()
+  };
+
+  // trigger perf counter calculations first
+  mclock_conf.get_mclock_counter(id, sch_op_type, item.get_cost());
+
   if (front) {
     high_priority[priority].push_back(std::move(item));
   } else {
     high_priority[priority].push_front(std::move(item));
   }
-
-  scheduler_id_t id = scheduler_id_t {
-    SchedulerClass::immediate,
-    client_profile_id_t()
-  };
-  mclock_conf.get_mclock_counter(id);
 }
 
 WorkItem mClockScheduler::dequeue()
@@ -166,13 +210,15 @@ WorkItem mClockScheduler::dequeue()
       // maintain invariant, high priority entries are never empty
       high_priority.erase(iter);
     }
-    ceph_assert(std::get_if<OpSchedulerItem>(&ret));
+    auto *item_ptr = std::get_if<OpSchedulerItem>(&ret);
+    ceph_assert(item_ptr);
 
     scheduler_id_t id = scheduler_id_t {
       SchedulerClass::immediate,
       client_profile_id_t()
     };
-    mclock_conf.put_mclock_counter(id);
+    scheduler_op_type_t op_type = get_scheduler_op_type(*item_ptr);
+    mclock_conf.put_mclock_counter(id, op_type, item_ptr->get_time_queued());
     return ret;
   } else {
     mclock_queue_t::PullReq result = scheduler.pull_request();
@@ -186,7 +232,16 @@ WorkItem mClockScheduler::dequeue()
       ceph_assert(result.is_retn());
 
       auto &retn = result.get_retn();
-      mclock_conf.put_mclock_counter(retn.client);
+
+      // update perf counters
+      scheduler_op_type_t op_type = scheduler_op_type_t::unknown;
+      utime_t time_queued = utime_t();
+      if (retn.request) {
+        op_type = get_scheduler_op_type(*retn.request);
+        time_queued = retn.request->get_time_queued();
+      }
+      mclock_conf.put_mclock_counter(retn.client, op_type, time_queued);
+
       return std::move(*retn.request);
     }
   }
index 503e90580252fa0574e4409f9503920a4ed523ee..109532a4273a9c5838e4adaec299692fa8e19d89 100644 (file)
@@ -149,6 +149,8 @@ public:
 private:
   // Enqueue the op to the high priority queue
   void enqueue_high(unsigned prio, OpSchedulerItem &&item, bool front = false);
+  // Return the scheduler op type - used to update perf counters
+  scheduler_op_type_t get_scheduler_op_type(const OpSchedulerItem &item);
 };
 
 }