From d4b9aa6113eb63a859a7adfa27fba94d428b9ad0 Mon Sep 17 00:00:00 2001 From: Anmol Babu Date: Mon, 8 Sep 2025 09:10:27 +0530 Subject: [PATCH] Increase metric priorities to CRITICAL for metrics used in dashboards As part of scale testing, we observed that the volume of metrics was very high on large clusters. We analyzed the metrics actually used against the complete list of metrics collected, and made the following observations: 1. Only 17 metrics were used in the dashboards (Grafana and the management UI) 2. The total number of metrics collected in the Prometheus stack was around 245 A large number of metrics incurs: 1. Greater CPU and memory demand for all marshaling and un-marshaling requirements 2. Greater storage volume 3. Increased per-scrape network consumption We intend to bump all the metrics leveraged in the Ceph monitoring dashboards to prio_level CRITICAL and also raise the default ceph-exporter prio_level to CRITICAL, so that Prometheus ends up having only the required metrics. This is Part 1 of the effort to request that the metric implementation teams revisit the metric priorities. If customers need other metrics, they can lower the ceph-exporter prio_level and restart the ceph-exporter after a careful evaluation of the storage, CPU, and networking costs. 
Signed-off-by: Anmol Babu --- src/os/bluestore/BlueFS.cc | 2 +- src/os/bluestore/BlueStore.cc | 4 ++-- src/osd/osd_perf_counters.cc | 24 +++++++++++++--------- src/rgw/rgw_perf_counters.cc | 38 +++++++++++++++++------------------ 4 files changed, 36 insertions(+), 32 deletions(-) diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index de08397128739..1708dff0d328f 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -246,7 +246,7 @@ void BlueFS::_init_logger() "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes", "Total bytes (wal device)", - "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + "walb", PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES)); b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes", "Used bytes (wal device)", "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 8533d67611904..c49482237e745 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -6401,10 +6401,10 @@ void BlueStore::_init_logger() "Number of pinned onodes in cache"); b.add_u64_counter(l_bluestore_onode_hits, "onode_hits", "Count of onode cache lookup hits", - "o_ht", PerfCountersBuilder::PRIO_USEFUL); + "o_ht", PerfCountersBuilder::PRIO_CRITICAL); b.add_u64_counter(l_bluestore_onode_misses, "onode_misses", "Count of onode cache lookup misses", - "o_ms", PerfCountersBuilder::PRIO_USEFUL); + "o_ms", PerfCountersBuilder::PRIO_CRITICAL); b.add_u64_counter(l_bluestore_onode_shard_hits, "onode_shard_hits", "Count of onode shard cache lookups hits"); b.add_u64_counter(l_bluestore_onode_shard_misses, diff --git a/src/osd/osd_perf_counters.cc b/src/osd/osd_perf_counters.cc index 26a3a74fcab7c..deb55f30f465f 100644 --- a/src/osd/osd_perf_counters.cc +++ b/src/osd/osd_perf_counters.cc @@ -64,12 +64,13 @@ PerfCounters *build_osd_logger(CephContext *cct) { 
"Count of ops delayed due to target object being degraded"); osd_plb.add_u64_counter( - l_osd_op_r, "op_r", "Client read operations"); + l_osd_op_r, "op_r", "Client read operations", nullptr, PerfCountersBuilder::PRIO_CRITICAL); osd_plb.add_u64_counter( - l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES)); osd_plb.add_time_avg( l_osd_op_r_lat, "op_r_latency", - "Latency of read operation (including queue time)"); + "Latency of read operation (including queue time)", + nullptr, PerfCountersBuilder::PRIO_CRITICAL); osd_plb.add_u64_counter_histogram( l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram", op_hist_x_axis_config, op_hist_y_axis_config, @@ -81,12 +82,15 @@ PerfCounters *build_osd_logger(CephContext *cct) { l_osd_op_r_prepare_lat, "op_r_prepare_latency", "Latency of read operations (excluding queue time and wait for finished)"); osd_plb.add_u64_counter( - l_osd_op_w, "op_w", "Client write operations"); + l_osd_op_w, "op_w", "Client write operations", + nullptr, PerfCountersBuilder::PRIO_CRITICAL); osd_plb.add_u64_counter( - l_osd_op_w_inb, "op_w_in_bytes", "Client data written"); + l_osd_op_w_inb, "op_w_in_bytes", "Client data written", + nullptr, PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES)); osd_plb.add_time_avg( l_osd_op_w_lat, "op_w_latency", - "Latency of write operation (including queue time)"); + "Latency of write operation (including queue time)", + nullptr, PerfCountersBuilder::PRIO_CRITICAL); osd_plb.add_u64_counter_histogram( l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram", op_hist_x_axis_config, op_hist_y_axis_config, @@ -178,7 +182,7 @@ PerfCounters *build_osd_logger(CephContext *cct) { osd_plb.add_u64_counter( l_osd_rop, "recovery_ops", "Started recovery operations", - "rop", PerfCountersBuilder::PRIO_INTERESTING); + "rop", 
PerfCountersBuilder::PRIO_CRITICAL); osd_plb.add_u64_counter( l_osd_rbytes, "recovery_bytes", @@ -229,7 +233,7 @@ PerfCounters *build_osd_logger(CephContext *cct) { "Total number of crc cache misses"); osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups", - "pgs", PerfCountersBuilder::PRIO_USEFUL); + "pgs", PerfCountersBuilder::PRIO_CRITICAL); osd_plb.add_u64( l_osd_pg_primary, "numpg_primary", "Placement groups for which this osd is primary"); @@ -278,10 +282,10 @@ PerfCounters *build_osd_logger(CephContext *cct) { osd_plb.add_u64( l_osd_stat_bytes, "stat_bytes", "OSD size", "size", - PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES)); osd_plb.add_u64( l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used", - PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES)); osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES)); osd_plb.add_u64_counter( diff --git a/src/rgw/rgw_perf_counters.cc b/src/rgw/rgw_perf_counters.cc index 8616293c35188..bbbf6ffed8c4a 100644 --- a/src/rgw/rgw_perf_counters.cc +++ b/src/rgw/rgw_perf_counters.cc @@ -19,7 +19,7 @@ void add_rgw_frontend_counters(PerfCountersBuilder *pcb) { pcb->set_prio_default(PerfCountersBuilder::PRIO_USEFUL); pcb->add_u64_counter(l_rgw_req, "req", "Requests"); - pcb->add_u64_counter(l_rgw_failed_req, "failed_req", "Aborted requests"); + pcb->add_u64_counter(l_rgw_failed_req, "failed_req", "Aborted requests", "", PerfCountersBuilder::PRIO_CRITICAL); pcb->add_u64(l_rgw_qlen, "qlen", "Queue length"); pcb->add_u64(l_rgw_qactive, "qactive", "Active requests queue"); @@ -69,37 +69,37 @@ void add_rgw_op_counters(PerfCountersBuilder *lpcb) { // description must match general rgw counters description above lpcb->set_prio_default(PerfCountersBuilder::PRIO_USEFUL); - lpcb->add_u64_counter(l_rgw_op_put_obj, "put_obj_ops", "Puts"); - 
lpcb->add_u64_counter(l_rgw_op_put_obj_b, "put_obj_bytes", "Size of puts"); - lpcb->add_time_avg(l_rgw_op_put_obj_lat, "put_obj_lat", "Put latency"); + lpcb->add_u64_counter(l_rgw_op_put_obj, "put_obj_ops", "Puts", "", PerfCountersBuilder::PRIO_CRITICAL); + lpcb->add_u64_counter(l_rgw_op_put_obj_b, "put_obj_bytes", "Size of puts", "", PerfCountersBuilder::PRIO_CRITICAL, PerfCounters::UNIT_BYTES); + lpcb->add_time_avg(l_rgw_op_put_obj_lat, "put_obj_lat", "Put latency", "", PerfCountersBuilder::PRIO_CRITICAL); - lpcb->add_u64_counter(l_rgw_op_get_obj, "get_obj_ops", "Gets"); - lpcb->add_u64_counter(l_rgw_op_get_obj_b, "get_obj_bytes", "Size of gets"); - lpcb->add_time_avg(l_rgw_op_get_obj_lat, "get_obj_lat", "Get latency"); + lpcb->add_u64_counter(l_rgw_op_get_obj, "get_obj_ops", "Gets", "", PerfCountersBuilder::PRIO_CRITICAL); + lpcb->add_u64_counter(l_rgw_op_get_obj_b, "get_obj_bytes", "Size of gets", "", PerfCountersBuilder::PRIO_CRITICAL, PerfCounters::UNIT_BYTES); + lpcb->add_time_avg(l_rgw_op_get_obj_lat, "get_obj_lat", "Get latency", "", PerfCountersBuilder::PRIO_CRITICAL); - lpcb->add_u64_counter(l_rgw_op_del_obj, "del_obj_ops", "Delete objects"); - lpcb->add_u64_counter(l_rgw_op_del_obj_b, "del_obj_bytes", "Size of delete objects"); - lpcb->add_time_avg(l_rgw_op_del_obj_lat, "del_obj_lat", "Delete object latency"); + lpcb->add_u64_counter(l_rgw_op_del_obj, "del_obj_ops", "Delete objects", "", PerfCountersBuilder::PRIO_CRITICAL); + lpcb->add_u64_counter(l_rgw_op_del_obj_b, "del_obj_bytes", "Size of delete objects", "", PerfCountersBuilder::PRIO_CRITICAL, PerfCounters::UNIT_BYTES); + lpcb->add_time_avg(l_rgw_op_del_obj_lat, "del_obj_lat", "Delete object latency", "", PerfCountersBuilder::PRIO_CRITICAL); lpcb->add_u64_counter(l_rgw_op_del_bucket, "del_bucket_ops", "Delete Buckets"); lpcb->add_time_avg(l_rgw_op_del_bucket_lat, "del_bucket_lat", "Delete bucket latency"); - lpcb->add_u64_counter(l_rgw_op_copy_obj, "copy_obj_ops", "Copy objects"); - 
lpcb->add_u64_counter(l_rgw_op_copy_obj_b, "copy_obj_bytes", "Size of copy objects"); - lpcb->add_time_avg(l_rgw_op_copy_obj_lat, "copy_obj_lat", "Copy object latency"); + lpcb->add_u64_counter(l_rgw_op_copy_obj, "copy_obj_ops", "Copy objects", "", PerfCountersBuilder::PRIO_CRITICAL); + lpcb->add_u64_counter(l_rgw_op_copy_obj_b, "copy_obj_bytes", "Size of copy objects", "", PerfCountersBuilder::PRIO_CRITICAL, PerfCounters::UNIT_BYTES); + lpcb->add_time_avg(l_rgw_op_copy_obj_lat, "copy_obj_lat", "Copy object latency", "", PerfCountersBuilder::PRIO_CRITICAL); - lpcb->add_u64_counter(l_rgw_op_list_obj, "list_obj_ops", "List objects"); - lpcb->add_time_avg(l_rgw_op_list_obj_lat, "list_obj_lat", "List objects latency"); + lpcb->add_u64_counter(l_rgw_op_list_obj, "list_obj_ops", "List objects", "", PerfCountersBuilder::PRIO_CRITICAL); + lpcb->add_time_avg(l_rgw_op_list_obj_lat, "list_obj_lat", "List objects latency", "", PerfCountersBuilder::PRIO_CRITICAL); - lpcb->add_u64_counter(l_rgw_op_list_buckets, "list_buckets_ops", "List buckets"); - lpcb->add_time_avg(l_rgw_op_list_buckets_lat, "list_buckets_lat", "List buckets latency"); + lpcb->add_u64_counter(l_rgw_op_list_buckets, "list_buckets_ops", "List buckets", "", PerfCountersBuilder::PRIO_CRITICAL); + lpcb->add_time_avg(l_rgw_op_list_buckets_lat, "list_buckets_lat", "List buckets latency", "", PerfCountersBuilder::PRIO_CRITICAL); } void add_rgw_topic_counters(PerfCountersBuilder *lpcb) { lpcb->set_prio_default(PerfCountersBuilder::PRIO_USEFUL); - lpcb->add_u64(l_rgw_persistent_topic_len, "persistent_topic_len", "Persistent topic queue length"); - lpcb->add_u64(l_rgw_persistent_topic_size, "persistent_topic_size", "Persistent topic queue size"); + lpcb->add_u64(l_rgw_persistent_topic_len, "persistent_topic_len", "Persistent topic queue length", "", PerfCountersBuilder::PRIO_CRITICAL); + lpcb->add_u64(l_rgw_persistent_topic_size, "persistent_topic_size", "Persistent topic queue size", "", 
PerfCountersBuilder::PRIO_CRITICAL); } -- 2.39.5