Increase metric priorities to CRITICAL for metrics used in dashboards
author    Anmol Babu <Anmol.Babu@ibm.com>
          Mon, 8 Sep 2025 03:40:27 +0000 (09:10 +0530)
committer Anmol Babu <Anmol.Babu@ibm.com>
          Mon, 13 Oct 2025 05:57:29 +0000 (11:27 +0530)
As part of scale testing, we observed that the volume of metrics was
very large on big clusters. We compared the metrics actually used in
the dashboards against the complete list of collected metrics and made
the following observations:
1. Only 17 metrics were used in the dashboards (Grafana and management UI)
2. The total number of metrics collected in the Prometheus stack was around 245

A large volume of metrics incurs:
1. Greater CPU and memory demand for marshaling and unmarshaling
2. Greater storage volume
3. Increased per-scrape network consumption

We intend to bump all the metrics leveraged in the Ceph monitoring
dashboards to prio_level CRITICAL and also raise the default
ceph-exporter prio_level to CRITICAL, so that Prometheus ends up
storing only the required metrics. This is Part 1 of an effort to have
the metric implementation teams revisit the metric priorities.
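
For context, perf counter priorities are plain integer levels on
PerfCountersBuilder (PRIO_CRITICAL = 10, PRIO_INTERESTING = 8,
PRIO_USEFUL = 5 in common/perf_counters.h). Below is a minimal sketch
of registering a counter at CRITICAL priority, following the
_init_logger() pattern touched in the diffs; the subsystem name and
enum indices are made up for illustration:

  #include "common/perf_counters.h"

  // Hypothetical counter indices; real daemons define these per subsystem.
  enum { l_example_first = 1000, l_example_reads, l_example_last };

  void init_example_logger(CephContext *cct) {
    PerfCountersBuilder b(cct, "example", l_example_first, l_example_last);
    // PRIO_CRITICAL counters stay visible under the raised default
    // ceph-exporter priority cutoff; PRIO_USEFUL ones no longer do.
    b.add_u64_counter(l_example_reads, "reads", "Example read operations",
                      nullptr, PerfCountersBuilder::PRIO_CRITICAL);
    PerfCounters *logger = b.create_perf_counters();
    cct->get_perfcounters_collection()->add(logger);
  }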

If customers need other metrics, they can lower the ceph-exporter prio
level and restart the ceph-exporter after carefully evaluating the
storage, CPU, and networking costs.
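
The effect of the exporter-side cutoff can be pictured with the
hypothetical filter below (illustrative names and types only, not the
actual ceph-exporter code): a counter is exported only if its priority
meets the configured limit, so lowering the limit and restarting the
exporter re-admits lower-priority metrics.

  #include <string>
  #include <vector>

  struct CounterInfo {
    std::string name;
    int prio;  // e.g. PRIO_USEFUL = 5, PRIO_CRITICAL = 10
  };

  // Keep only counters at or above the configured priority limit.
  std::vector<CounterInfo> filter_by_prio(const std::vector<CounterInfo>& all,
                                          int prio_limit) {
    std::vector<CounterInfo> kept;
    for (const auto& c : all) {
      if (c.prio >= prio_limit) {
        kept.push_back(c);
      }
    }
    return kept;
  }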

Signed-off-by: Anmol Babu <Anmol.Babu@ibm.com>
src/os/bluestore/BlueFS.cc
src/os/bluestore/BlueStore.cc
src/osd/osd_perf_counters.cc
src/rgw/rgw_perf_counters.cc

diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index de08397128739103f13e5273ee1ee480e5922b4d..1708dff0d328f3627b7a902267c1d7d28969d7a0 100644
@@ -246,7 +246,7 @@ void BlueFS::_init_logger()
            "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
   b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
            "Total bytes (wal device)",
-           "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+           "walb", PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
   b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
            "Used bytes (wal device)",
            "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 8533d6761190454e9c353bed030211938eb2dd72..c49482237e74543b047b5d0a3a4aa60694b257ba 100644
@@ -6401,10 +6401,10 @@ void BlueStore::_init_logger()
             "Number of pinned onodes in cache");
   b.add_u64_counter(l_bluestore_onode_hits, "onode_hits",
                    "Count of onode cache lookup hits",
-                   "o_ht", PerfCountersBuilder::PRIO_USEFUL);
+                   "o_ht", PerfCountersBuilder::PRIO_CRITICAL);
   b.add_u64_counter(l_bluestore_onode_misses, "onode_misses",
                    "Count of onode cache lookup misses",
-                   "o_ms", PerfCountersBuilder::PRIO_USEFUL);
+                   "o_ms", PerfCountersBuilder::PRIO_CRITICAL);
   b.add_u64_counter(l_bluestore_onode_shard_hits, "onode_shard_hits",
                    "Count of onode shard cache lookups hits");
   b.add_u64_counter(l_bluestore_onode_shard_misses,
diff --git a/src/osd/osd_perf_counters.cc b/src/osd/osd_perf_counters.cc
index 26a3a74fcab7c1626a05cdc2cdfc80bae4589128..deb55f30f465ff58888d8a895a064167d4dc07d8 100644
@@ -64,12 +64,13 @@ PerfCounters *build_osd_logger(CephContext *cct) {
     "Count of ops delayed due to target object being degraded");
 
   osd_plb.add_u64_counter(
-    l_osd_op_r, "op_r", "Client read operations");
+    l_osd_op_r, "op_r", "Client read operations", nullptr, PerfCountersBuilder::PRIO_CRITICAL);
   osd_plb.add_u64_counter(
-    l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+    l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
   osd_plb.add_time_avg(
     l_osd_op_r_lat, "op_r_latency",
-    "Latency of read operation (including queue time)");
+    "Latency of read operation (including queue time)",
+    nullptr, PerfCountersBuilder::PRIO_CRITICAL);
   osd_plb.add_u64_counter_histogram(
     l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
     op_hist_x_axis_config, op_hist_y_axis_config,
@@ -81,12 +82,15 @@ PerfCounters *build_osd_logger(CephContext *cct) {
     l_osd_op_r_prepare_lat, "op_r_prepare_latency",
     "Latency of read operations (excluding queue time and wait for finished)");
   osd_plb.add_u64_counter(
-    l_osd_op_w, "op_w", "Client write operations");
+    l_osd_op_w, "op_w", "Client write operations",
+    nullptr, PerfCountersBuilder::PRIO_CRITICAL);
   osd_plb.add_u64_counter(
-    l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
+    l_osd_op_w_inb, "op_w_in_bytes", "Client data written",
+    nullptr, PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
   osd_plb.add_time_avg(
     l_osd_op_w_lat,  "op_w_latency",
-    "Latency of write operation (including queue time)");
+    "Latency of write operation (including queue time)",
+    nullptr, PerfCountersBuilder::PRIO_CRITICAL);
   osd_plb.add_u64_counter_histogram(
     l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
     op_hist_x_axis_config, op_hist_y_axis_config,
@@ -178,7 +182,7 @@ PerfCounters *build_osd_logger(CephContext *cct) {
   osd_plb.add_u64_counter(
     l_osd_rop, "recovery_ops",
     "Started recovery operations",
-    "rop", PerfCountersBuilder::PRIO_INTERESTING);
+    "rop", PerfCountersBuilder::PRIO_CRITICAL);
 
   osd_plb.add_u64_counter(
    l_osd_rbytes, "recovery_bytes",
@@ -229,7 +233,7 @@ PerfCounters *build_osd_logger(CephContext *cct) {
     "Total number of crc cache misses");
 
   osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
-                 "pgs", PerfCountersBuilder::PRIO_USEFUL);
+          "pgs", PerfCountersBuilder::PRIO_CRITICAL);
   osd_plb.add_u64(
     l_osd_pg_primary, "numpg_primary",
     "Placement groups for which this osd is primary");
@@ -278,10 +282,10 @@ PerfCounters *build_osd_logger(CephContext *cct) {
 
   osd_plb.add_u64(
     l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
-    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+    PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
   osd_plb.add_u64(
     l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
-    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+    PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
   osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES));
 
   osd_plb.add_u64_counter(
diff --git a/src/rgw/rgw_perf_counters.cc b/src/rgw/rgw_perf_counters.cc
index 8616293c351886eecaa7063716bb7b7d5576a52b..5b5be011e4eafbe3b86703edc1ed96872c91d10b 100644
@@ -19,7 +19,7 @@ void add_rgw_frontend_counters(PerfCountersBuilder *pcb) {
   pcb->set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
 
   pcb->add_u64_counter(l_rgw_req, "req", "Requests");
-  pcb->add_u64_counter(l_rgw_failed_req, "failed_req", "Aborted requests");
+  pcb->add_u64_counter(l_rgw_failed_req, "failed_req", "Aborted requests", "", PerfCountersBuilder::PRIO_CRITICAL);
 
   pcb->add_u64(l_rgw_qlen, "qlen", "Queue length");
   pcb->add_u64(l_rgw_qactive, "qactive", "Active requests queue");
@@ -69,37 +69,37 @@ void add_rgw_op_counters(PerfCountersBuilder *lpcb) {
   // description must match general rgw counters description above
   lpcb->set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
 
-  lpcb->add_u64_counter(l_rgw_op_put_obj, "put_obj_ops", "Puts");
-  lpcb->add_u64_counter(l_rgw_op_put_obj_b, "put_obj_bytes", "Size of puts");
-  lpcb->add_time_avg(l_rgw_op_put_obj_lat, "put_obj_lat", "Put latency");
+  lpcb->add_u64_counter(l_rgw_op_put_obj, "put_obj_ops", "Puts", "", PerfCountersBuilder::PRIO_CRITICAL);
+  lpcb->add_u64_counter(l_rgw_op_put_obj_b, "put_obj_bytes", "Size of puts", "", PerfCountersBuilder::PRIO_CRITICAL, UNIT_BYTES);
+  lpcb->add_time_avg(l_rgw_op_put_obj_lat, "put_obj_lat", "Put latency", "", PerfCountersBuilder::PRIO_CRITICAL);
 
-  lpcb->add_u64_counter(l_rgw_op_get_obj, "get_obj_ops", "Gets");
-  lpcb->add_u64_counter(l_rgw_op_get_obj_b, "get_obj_bytes", "Size of gets");
-  lpcb->add_time_avg(l_rgw_op_get_obj_lat, "get_obj_lat", "Get latency");
+  lpcb->add_u64_counter(l_rgw_op_get_obj, "get_obj_ops", "Gets", "", PerfCountersBuilder::PRIO_CRITICAL);
+  lpcb->add_u64_counter(l_rgw_op_get_obj_b, "get_obj_bytes", "Size of gets", "", PerfCountersBuilder::PRIO_CRITICAL, UNIT_BYTES);
+  lpcb->add_time_avg(l_rgw_op_get_obj_lat, "get_obj_lat", "Get latency", "", PerfCountersBuilder::PRIO_CRITICAL);
 
-  lpcb->add_u64_counter(l_rgw_op_del_obj, "del_obj_ops", "Delete objects");
-  lpcb->add_u64_counter(l_rgw_op_del_obj_b, "del_obj_bytes", "Size of delete objects");
-  lpcb->add_time_avg(l_rgw_op_del_obj_lat, "del_obj_lat", "Delete object latency");
+  lpcb->add_u64_counter(l_rgw_op_del_obj, "del_obj_ops", "Delete objects", "", PerfCountersBuilder::PRIO_CRITICAL);
+  lpcb->add_u64_counter(l_rgw_op_del_obj_b, "del_obj_bytes", "Size of delete objects", "", PerfCountersBuilder::PRIO_CRITICAL, UNIT_BYTES);
+  lpcb->add_time_avg(l_rgw_op_del_obj_lat, "del_obj_lat", "Delete object latency", "", PerfCountersBuilder::PRIO_CRITICAL);
 
   lpcb->add_u64_counter(l_rgw_op_del_bucket, "del_bucket_ops", "Delete Buckets");
   lpcb->add_time_avg(l_rgw_op_del_bucket_lat, "del_bucket_lat", "Delete bucket latency");
 
-  lpcb->add_u64_counter(l_rgw_op_copy_obj, "copy_obj_ops", "Copy objects");
-  lpcb->add_u64_counter(l_rgw_op_copy_obj_b, "copy_obj_bytes", "Size of copy objects");
-  lpcb->add_time_avg(l_rgw_op_copy_obj_lat, "copy_obj_lat", "Copy object latency");
+  lpcb->add_u64_counter(l_rgw_op_copy_obj, "copy_obj_ops", "Copy objects", "", PerfCountersBuilder::PRIO_CRITICAL);
+  lpcb->add_u64_counter(l_rgw_op_copy_obj_b, "copy_obj_bytes", "Size of copy objects", "", PerfCountersBuilder::PRIO_CRITICAL, UNIT_BYTES);
+  lpcb->add_time_avg(l_rgw_op_copy_obj_lat, "copy_obj_lat", "Copy object latency", "", PerfCountersBuilder::PRIO_CRITICAL);
 
-  lpcb->add_u64_counter(l_rgw_op_list_obj, "list_obj_ops", "List objects");
-  lpcb->add_time_avg(l_rgw_op_list_obj_lat, "list_obj_lat", "List objects latency");
+  lpcb->add_u64_counter(l_rgw_op_list_obj, "list_obj_ops", "List objects", "", PerfCountersBuilder::PRIO_CRITICAL);
+  lpcb->add_time_avg(l_rgw_op_list_obj_lat, "list_obj_lat", "List objects latency", "", PerfCountersBuilder::PRIO_CRITICAL);
 
-  lpcb->add_u64_counter(l_rgw_op_list_buckets, "list_buckets_ops", "List buckets");
-  lpcb->add_time_avg(l_rgw_op_list_buckets_lat, "list_buckets_lat", "List buckets latency");
+  lpcb->add_u64_counter(l_rgw_op_list_buckets, "list_buckets_ops", "List buckets", "", PerfCountersBuilder::PRIO_CRITICAL);
+  lpcb->add_time_avg(l_rgw_op_list_buckets_lat, "list_buckets_lat", "List buckets latency", "", PerfCountersBuilder::PRIO_CRITICAL);
 }
 
 void add_rgw_topic_counters(PerfCountersBuilder *lpcb) {
   lpcb->set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
 
-  lpcb->add_u64(l_rgw_persistent_topic_len, "persistent_topic_len", "Persistent topic queue length");
-  lpcb->add_u64(l_rgw_persistent_topic_size, "persistent_topic_size", "Persistent topic queue size");
+  lpcb->add_u64(l_rgw_persistent_topic_len, "persistent_topic_len", "Persistent topic queue length", "", PerfCountersBuilder::PRIO_CRITICAL);
+  lpcb->add_u64(l_rgw_persistent_topic_size, "persistent_topic_size", "Persistent topic queue size", "", PerfCountersBuilder::PRIO_CRITICAL);
 
 }