git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/scrub: fixing scrub reservation process counters 62818/head
author: Ronen Friedman <rfriedma@redhat.com>
Mon, 14 Apr 2025 16:22:27 +0000 (11:22 -0500)
committer: Ronen Friedman <rfriedma@redhat.com>
Tue, 15 Apr 2025 11:01:30 +0000 (06:01 -0500)
Using regular (unlabeled) OSD performance counters for
tracking the scrub reservation performance.

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
src/osd/osd_perf_counters.cc
src/osd/osd_perf_counters.h
src/osd/scrubber/pg_scrubber.h
src/osd/scrubber/scrub_machine.cc
src/osd/scrubber/scrub_reservations.cc
src/osd/scrubber/scrub_reservations.h
src/osd/scrubber_common.h

index d79cfbee7c40cd901d8ddb33d8a00240978a1ccd..6efc9d5c5e957af5f7044805c009dbf7aeea96a9 100644 (file)
@@ -369,27 +369,109 @@ PerfCounters *build_osd_logger(CephContext *cct) {
   osd_plb.add_u64_counter(l_osd_scrub_rppool_read_cnt, "scrub_replicated_read_cnt", "scrub replicated pool read calls count");
   osd_plb.add_u64_counter(l_osd_scrub_rppool_read_bytes, "scrub_replicated_read_bytes", "scrub replicated pool read bytes read");
   // scrub I/O performed for EC pools
-  osd_plb.add_u64_counter(l_osd_scrub_ec_getattr_cnt, "scrub_ec_getattr_cnt", "scrub ec getattr calls count");
-  osd_plb.add_u64_counter(l_osd_scrub_ec_stats_cnt, "scrub_ec_stats_cnt", "scrub ec stats calls count");
-  osd_plb.add_u64_counter(l_osd_scrub_ec_read_cnt, "scrub_ec_read_cnt", "scrub ec read calls count");
-  osd_plb.add_u64_counter(l_osd_scrub_ec_read_bytes, "scrub_ec_read_bytes", "scrub ec read bytes read");
+  osd_plb.add_u64_counter(l_osd_scrub_ec_getattr_cnt, "scrub_ec_getattr_cnt", "scrub EC getattr calls count");
+  osd_plb.add_u64_counter(l_osd_scrub_ec_stats_cnt, "scrub_ec_stats_cnt", "scrub EC stats calls count");
+  osd_plb.add_u64_counter(l_osd_scrub_ec_read_cnt, "scrub_ec_read_cnt", "scrub EC read calls count");
+  osd_plb.add_u64_counter(l_osd_scrub_ec_read_bytes, "scrub_ec_read_bytes", "scrub EC read bytes read");
 
-  // scrub (no EC vs. replicated differentiation)
   // scrub - replicated pools
-  osd_plb.add_u64_counter(l_osd_scrub_rppool_started, "num_scrubs_started_replicated", "replicated scrubs attempted count");
-  osd_plb.add_u64_counter(l_osd_scrub_rppool_active_started, "num_scrubs_past_reservation_replicated", "replicated scrubs count");
-  osd_plb.add_u64_counter(l_osd_scrub_rppool_successful, "successful_scrubs_replicated", "successful replicated scrubs count");
-  osd_plb.add_time_avg(l_osd_scrub_rppool_successful_elapsed, "successful_scrubs_replicated_elapsed", "time to complete a successful replicated scrub");
-  osd_plb.add_u64_counter(l_osd_scrub_rppool_failed, "failed_scrubs_replicated", "failed replicated scrubs count");
-  osd_plb.add_time_avg(l_osd_scrub_rppool_failed_elapsed, "failed_scrubs_replicated_elapsed", "time to scrub failure replicated");
+  osd_plb.add_u64_counter(
+      l_osd_scrub_rppool_started,
+      "num_scrubs_started_replicated",
+      "replicated scrubs attempted count");
+  osd_plb.add_u64_counter(
+      l_osd_scrub_rppool_active_started,
+      "num_scrubs_past_reservation_replicated",
+      "replicated scrubs count");
+  osd_plb.add_u64_counter(
+      l_osd_scrub_rppool_successful,
+      "successful_scrubs_replicated",
+      "successful replicated scrubs count");
+  osd_plb.add_time_avg(
+      l_osd_scrub_rppool_successful_elapsed,
+      "successful_scrubs_replicated_elapsed",
+      "time to complete a successful replicated scrub");
+  osd_plb.add_u64_counter(
+      l_osd_scrub_rppool_failed, "failed_scrubs_replicated",
+      "failed replicated scrubs count");
+  osd_plb.add_time_avg(
+      l_osd_scrub_rppool_failed_elapsed,
+      "failed_scrubs_replicated_elapsed",
+      "time to scrub failure replicated");
+
+  // the replica reservation process - replicated pool
+  osd_plb.add_u64_counter(
+      l_osd_scrub_rppool_reserv_success,
+      "scrub_replicated_scrub_reservations_completed",
+      "successfully completed reservation processes");
+  osd_plb.add_time_avg(
+      l_osd_scrub_rppool_reserv_successful_elapsed,
+      "scrub_replicated_successful_reservations_elapsed",
+      "time to scrub reservation completion");
+  osd_plb.add_u64_counter(
+      l_osd_scrub_rppool_reserv_aborted,
+      "scrub_replicated_reservation_process_aborted",
+      "scrub replicated pool reservation was aborted");
+  osd_plb.add_u64_counter(
+      l_osd_scrub_rppool_reserv_rejected,
+      "scrub_replicated_reservation_process_failure",
+      "scrub replicated pool reservation failed due to replica denial");
+  osd_plb.add_u64_counter(
+      l_osd_scrub_rppool_reserv_skipped,
+      "scrub_replicated_reservation_process_skipped",
+      "scrub replicated pool reservation skipped for high priority scrub");
+  osd_plb.add_time_avg(
+      l_osd_scrub_rppool_reserv_failed_elapsed,
+      "scrub_replicated_failed_reservations_elapsed",
+      "scrub replicated pool time for scrub reservation to fail");
+  osd_plb.add_u64(
+      l_osd_scrub_rppool_reserv_secondaries_num,
+      "scrub_replicated_replicas_in_reservation",
+      "scrub replicated pool number of replicas to reserve");
 
   // scrub - EC
-  osd_plb.add_u64_counter(l_osd_scrub_ec_started, "num_scrubs_started_ec", "scrubs attempted count ec");
-  osd_plb.add_u64_counter(l_osd_scrub_ec_active_started, "num_scrubs_past_reservation_ec", "scrubs count ec");
-  osd_plb.add_u64_counter(l_osd_scrub_ec_successful, "successful_scrubs_ec", "successful scrubs count ec");
-  osd_plb.add_time_avg(l_osd_scrub_ec_successful_elapsed, "successful_scrubs_ec_elapsed", "time to complete a successful ec scrub");
-  osd_plb.add_u64_counter(l_osd_scrub_ec_failed, "failed_scrubs_ec", "failed scrubs count ec");
-  osd_plb.add_time_avg(l_osd_scrub_ec_failed_elapsed, "failed_scrubs_ec_elapsed", "time to scrub failure ec");
+  osd_plb.add_u64_counter(
+      l_osd_scrub_ec_started, "num_scrubs_started_ec",
+      "EC scrubs attempted count");
+  osd_plb.add_u64_counter(
+      l_osd_scrub_ec_active_started, "num_scrubs_past_reservation_ec",
+      "EC scrubs count");
+  osd_plb.add_u64_counter(
+      l_osd_scrub_ec_successful, "successful_scrubs_ec",
+      "successful EC scrubs count");
+  osd_plb.add_time_avg(
+      l_osd_scrub_ec_successful_elapsed, "successful_scrubs_ec_elapsed",
+      "time to complete a successful EC scrub");
+  osd_plb.add_u64_counter(
+      l_osd_scrub_ec_failed, "failed_scrubs_ec", "failed scrubs count EC");
+  osd_plb.add_time_avg(
+      l_osd_scrub_ec_failed_elapsed, "failed_scrubs_ec_elapsed",
+      "time to scrub failure ec");
+
+  // the replica reservation process - EC
+  osd_plb.add_u64_counter(
+      l_osd_scrub_ec_reserv_success, "scrub_ec_reservations_completed",
+      "successfully completed reservation processes EC");
+  osd_plb.add_time_avg(
+      l_osd_scrub_ec_reserv_successful_elapsed,
+      "scrub_ec_successful_reservations_elapsed",
+      "time to EC scrub reservation completion");
+  osd_plb.add_u64_counter(
+      l_osd_scrub_ec_reserv_aborted, "scrub_ec_reservation_process_aborted",
+      "scrub reservation was aborted EC");
+  osd_plb.add_u64_counter(
+      l_osd_scrub_ec_reserv_rejected, "scrub_ec_reservation_process_failure",
+      "scrub reservation failed due to replica denial EC");
+  osd_plb.add_u64_counter(
+      l_osd_scrub_ec_reserv_skipped, "scrub_ec_reservation_process_skipped",
+      "scrub reservation skipped for high priority scrub EC");
+  osd_plb.add_time_avg(
+      l_osd_scrub_ec_reserv_failed_elapsed,
+      "scrub_ec_failed_reservations_elapsed",
+      "time for scrub reservation to fail EC");
+  osd_plb.add_u64(
+      l_osd_scrub_ec_reserv_secondaries_num, "scrub_ec_replicas_in_reservation",
+      "number of replicas to reserve EC");
 
   return osd_plb.create_perf_counters();
 }
@@ -448,14 +530,6 @@ PerfCounters *build_scrub_labeled_perf(CephContext *cct, std::string label)
   scrub_perf.add_u64_counter(scrbcnt_blocked, "locked_object", "waiting on locked object events");
   scrub_perf.add_u64_counter(scrbcnt_write_blocked, "write_blocked_by_scrub", "write blocked by scrub");
 
-  // the replica reservation process
-  scrub_perf.add_u64_counter(scrbcnt_resrv_success, "scrub_reservations_completed", "successfully completed reservation processes");
-  scrub_perf.add_time_avg(scrbcnt_resrv_successful_elapsed, "successful_reservations_elapsed", "time to scrub reservation completion");
-  scrub_perf.add_u64_counter(scrbcnt_resrv_aborted, "reservation_process_aborted", "scrub reservation was aborted");
-  scrub_perf.add_u64_counter(scrbcnt_resrv_rejected, "reservation_process_failure", "scrub reservation failed due to replica denial");
-  scrub_perf.add_u64_counter(scrbcnt_resrv_skipped, "reservation_process_skipped", "scrub reservation skipped for high priority scrub");
-  scrub_perf.add_time_avg(scrbcnt_resrv_failed_elapsed, "failed_reservations_elapsed", "time for scrub reservation to fail");
-  scrub_perf.add_u64(scrbcnt_resrv_replicas_num, "replicas_in_reservation", "number of replicas in reservation");
 
   return scrub_perf.create_perf_counters();
 }
index dc551c16d480de2dc02cb64d5eb4bc6091f50181..2f6ac21276ffb52f7553035102e61256e49aa799 100644 (file)
@@ -169,7 +169,25 @@ enum osd_counter_idx_t {
   l_osd_scrub_rppool_failed, ///< failed scrubs count
   l_osd_scrub_rppool_failed_elapsed, ///< time from start to failure
 
-  // scrub - EC
+  // ----   scrub reservation process - replicated pools
+
+  /// successful replicas reservation count
+  l_osd_scrub_rppool_reserv_success,
+  /// time to complete a successful replicas reservation
+  l_osd_scrub_rppool_reserv_successful_elapsed,
+  /// failed attempt to reserve replicas due to an abort
+  l_osd_scrub_rppool_reserv_aborted,
+  /// reservation failed due to a 'rejected' response
+  l_osd_scrub_rppool_reserv_rejected,
+  /// reservation skipped for high-priority scrubs
+  l_osd_scrub_rppool_reserv_skipped,
+  /// time for a replicas reservation process to fail
+  l_osd_scrub_rppool_reserv_failed_elapsed,
+  /// number of replicas
+  l_osd_scrub_rppool_reserv_secondaries_num,
+
+
+  // ----   scrub - EC
   l_osd_scrub_ec_started, ///< scrubs that got started
   l_osd_scrub_ec_active_started, /// scrubs that got past secondaries reservation
   l_osd_scrub_ec_successful, ///< successful scrubs count
@@ -177,6 +195,23 @@ enum osd_counter_idx_t {
   l_osd_scrub_ec_failed, ///< failed scrubs count
   l_osd_scrub_ec_failed_elapsed, ///< time from start to failure
 
+  // ----   scrub reservation process - EC
+
+  /// successful replicas reservation count
+  l_osd_scrub_ec_reserv_success,
+  /// time to complete a successful replicas reservation
+  l_osd_scrub_ec_reserv_successful_elapsed,
+  /// failed attempt to reserve replicas due to an abort
+  l_osd_scrub_ec_reserv_aborted,
+  /// reservation failed due to a 'rejected' response
+  l_osd_scrub_ec_reserv_rejected,
+  /// reservation skipped for high-priority scrubs
+  l_osd_scrub_ec_reserv_skipped,
+  /// time for a replicas reservation process to fail
+  l_osd_scrub_ec_reserv_failed_elapsed,
+  /// number of replicas
+  l_osd_scrub_ec_reserv_secondaries_num,
+
   l_osd_last,
 };
 
@@ -238,22 +273,6 @@ enum {
   /// # write blocked by the scrub
   scrbcnt_write_blocked,
 
-  // -- replicas reservation
-  /// # successfully completed reservation steps
-  scrbcnt_resrv_success,
-  /// time to complete a successful replicas reservation
-  scrbcnt_resrv_successful_elapsed,
-  /// # failed attempt to reserve replicas due to an abort
-  scrbcnt_resrv_aborted,
-  /// # reservation failed due to a 'rejected' response
-  scrbcnt_resrv_rejected,
-  /// # reservation skipped for high-priority scrubs
-  scrbcnt_resrv_skipped,
-  /// time for a replicas reservation process to fail
-  scrbcnt_resrv_failed_elapsed,
-  /// # number of replicas
-  scrbcnt_resrv_replicas_num,
-
   scrbcnt_last,
 };
 
index 6607f0fa179a106b195133ea80bba1bcb662f0f2..7de2ea9662892b315f01fcb721d56a6d445ee5b3 100644 (file)
@@ -149,7 +149,15 @@ static inline constexpr ScrubCounterSet io_counters_replicated{
   .successful_cnt = l_osd_scrub_rppool_successful,
   .successful_elapsed = l_osd_scrub_rppool_successful_elapsed,
   .failed_cnt = l_osd_scrub_rppool_failed,
-  .failed_elapsed = l_osd_scrub_rppool_failed_elapsed
+  .failed_elapsed = l_osd_scrub_rppool_failed_elapsed,
+  // replica-reservation-related:
+  .rsv_successful_cnt = l_osd_scrub_rppool_reserv_success,
+  .rsv_successful_elapsed = l_osd_scrub_rppool_reserv_successful_elapsed,
+  .rsv_aborted_cnt = l_osd_scrub_rppool_reserv_aborted,
+  .rsv_rejected_cnt = l_osd_scrub_rppool_reserv_rejected,
+  .rsv_skipped_cnt = l_osd_scrub_rppool_reserv_skipped,
+  .rsv_failed_elapsed = l_osd_scrub_rppool_reserv_failed_elapsed,
+  .rsv_secondaries_num = l_osd_scrub_rppool_reserv_secondaries_num
 };
 
 static inline constexpr ScrubCounterSet io_counters_ec{
@@ -166,7 +174,15 @@ static inline constexpr ScrubCounterSet io_counters_ec{
   .successful_cnt = l_osd_scrub_ec_successful,
   .successful_elapsed = l_osd_scrub_ec_successful_elapsed,
   .failed_cnt = l_osd_scrub_ec_failed,
-  .failed_elapsed = l_osd_scrub_ec_failed_elapsed
+  .failed_elapsed = l_osd_scrub_ec_failed_elapsed,
+  // replica-reservation-related:
+  .rsv_successful_cnt = l_osd_scrub_ec_reserv_success,
+  .rsv_successful_elapsed = l_osd_scrub_ec_reserv_successful_elapsed,
+  .rsv_aborted_cnt = l_osd_scrub_ec_reserv_aborted,
+  .rsv_rejected_cnt = l_osd_scrub_ec_reserv_rejected,
+  .rsv_skipped_cnt = l_osd_scrub_ec_reserv_skipped,
+  .rsv_failed_elapsed = l_osd_scrub_ec_reserv_failed_elapsed,
+  .rsv_secondaries_num = l_osd_scrub_ec_reserv_secondaries_num
 };
 }  // namespace Scrub
 
index 72ab17295f2b1e4d1cbef434cc4939a1199fd903..07ce5c54cec82eec27444562f04ca0460ea0b906 100644 (file)
@@ -257,7 +257,7 @@ ReservingReplicas::ReservingReplicas(my_context ctx)
   // initiate the reservation process
   session.m_reservations.emplace(
       *scrbr, context<PrimaryActive>().last_request_sent_nonce,
-      *session.m_perf_set);
+      *session.m_counters_idx);
 
   if (!session.m_reservations->get_last_sent()) {
     // no replicas to reserve
index 0b3265dde4d10305949df866c6b7b01417ee953a..d1784e375d0ff019218e78616523aadc9ecbc22e 100644 (file)
@@ -32,13 +32,13 @@ namespace Scrub {
 ReplicaReservations::ReplicaReservations(
     ScrubMachineListener& scrbr,
     reservation_nonce_t& nonce,
-    PerfCounters& pc)
+    const ScrubCounterSet& pc)
     : m_scrubber{scrbr}
     , m_pg{m_scrubber.get_pg()}
     , m_pgid{m_scrubber.get_spgid().pgid}
     , m_osds{m_pg->get_pg_osd(ScrubberPasskey())}
     , m_last_request_sent_nonce{nonce}
-    , m_perf_set{pc}
+    , m_perf_indices{pc}
 {
   // the acting set is sorted by pg_shard_t. The reservations are to be issued
   // in this order, so that the OSDs will receive the requests in a consistent
@@ -52,7 +52,8 @@ ReplicaReservations::ReplicaReservations(
       [whoami = m_pg->pg_whoami](const pg_shard_t& shard) {
        return shard != whoami;
       });
-  m_perf_set.set(scrbcnt_resrv_replicas_num, m_sorted_secondaries.size());
+  m_osds->logger->set(
+      m_perf_indices.rsv_secondaries_num, m_sorted_secondaries.size());
 
   m_next_to_request = m_sorted_secondaries.cbegin();
   if (m_scrubber.is_reservation_required()) {
@@ -63,7 +64,7 @@ ReplicaReservations::ReplicaReservations(
     // for high-priority scrubs (i.e. - user-initiated), no reservations are
     // needed. Note: not perf-counted as either success or failure.
     dout(10) << "high-priority scrub - no reservations needed" << dendl;
-    m_perf_set.inc(scrbcnt_resrv_skipped);
+    m_osds->logger->inc(m_perf_indices.rsv_skipped_cnt);
   }
 }
 
@@ -97,8 +98,8 @@ void ReplicaReservations::log_success_and_duration()
 {
   ceph_assert(m_process_started_at.has_value());
   auto logged_duration = ScrubClock::now() - m_process_started_at.value();
-  m_perf_set.tinc(scrbcnt_resrv_successful_elapsed, logged_duration);
-  m_perf_set.inc(scrbcnt_resrv_success);
+  m_osds->logger->tinc(m_perf_indices.rsv_successful_elapsed, logged_duration);
+  m_osds->logger->inc(m_perf_indices.rsv_successful_cnt);
   m_osds->logger->hinc(
       l_osd_scrub_reservation_dur_hist, std::ssize(m_sorted_secondaries),
       logged_duration.count());
@@ -112,16 +113,16 @@ void ReplicaReservations::log_failure_and_duration(int failure_cause_counter)
     return;
   }
   auto logged_duration = ScrubClock::now() - m_process_started_at.value();
-  m_perf_set.tinc(scrbcnt_resrv_failed_elapsed, logged_duration);
+  m_osds->logger->tinc(m_perf_indices.rsv_failed_elapsed, logged_duration);
   m_process_started_at.reset();
   // note: not counted into l_osd_scrub_reservation_dur_hist
-  m_perf_set.inc(failure_cause_counter);
+  m_osds->logger->inc(failure_cause_counter);
 }
 
 ReplicaReservations::~ReplicaReservations()
 {
   release_all();
-  log_failure_and_duration(scrbcnt_resrv_aborted);
+  log_failure_and_duration(m_perf_indices.rsv_aborted_cnt);
 }
 
 bool ReplicaReservations::is_reservation_response_relevant(
@@ -231,7 +232,7 @@ bool ReplicaReservations::handle_reserve_rejection(
     return false;
   }
 
-  log_failure_and_duration(scrbcnt_resrv_rejected);
+  log_failure_and_duration(m_perf_indices.rsv_rejected_cnt);
 
   // we should never see a rejection carrying a valid
   // reservation nonce - arriving while we have no pending requests
index f5eca48b8887fcf6c70cbb51d4bc64454ef4c575..93f04a872b61759d0b070d3ee22f6396a419c964 100644 (file)
@@ -90,9 +90,8 @@ class ReplicaReservations {
    */
   reservation_nonce_t& m_last_request_sent_nonce;
 
-  /// access to the performance counters container relevant to this scrub
-  /// parameters
-  PerfCounters& m_perf_set;
+  /// the performance counters relevant to this scrub
+  const ScrubCounterSet& m_perf_indices;
 
   /// used only for the 'duration of the reservation process' perf counter.
   /// discarded once the success or failure are recorded
@@ -102,7 +101,7 @@ class ReplicaReservations {
   ReplicaReservations(
       ScrubMachineListener& scrubber,
       reservation_nonce_t& nonce,
-      PerfCounters& pc);
+      const ScrubCounterSet& pc);
 
   ~ReplicaReservations();
 
index b46151c3b02a08435c948c8df674496a19cf8814..73c022bafc0308e86bd371e840dc2fc1a93e2267 100644 (file)
@@ -307,6 +307,14 @@ struct ScrubCounterSet {
   osd_counter_idx_t successful_elapsed; ///< time to complete a successful scrub
   osd_counter_idx_t failed_cnt; ///< failed scrubs count
   osd_counter_idx_t failed_elapsed; ///< time from start to failure
+  // reservation process related:
+  osd_counter_idx_t rsv_successful_cnt; ///< completed reservation processes
+  osd_counter_idx_t rsv_successful_elapsed; ///< time to all-reserved
+  osd_counter_idx_t rsv_aborted_cnt; ///< failed due to an abort
+  osd_counter_idx_t rsv_rejected_cnt; ///< 'rejected' response
+  osd_counter_idx_t rsv_skipped_cnt; ///< high-priority. No reservation
+  osd_counter_idx_t rsv_failed_elapsed; ///< time for reservation to fail
+  osd_counter_idx_t rsv_secondaries_num; ///< number of replicas (EC or rep)
 };
 
 }  // namespace Scrub