+// SPDX-License-Identifier: GPL-2.0
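+/*
+ * Per-subvolume I/O metrics for CephFS: accumulate read/write counters
+ * keyed by subvolume id in an rb-tree, snapshot them for reporting, and
+ * expose them through debugfs.
+ */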
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/math64.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+
+#include "subvolume_metrics.h"
+#include "mds_client.h"
+#include "super.h"
+
+/**
+ * struct ceph_subvol_metric_rb_entry - Per-subvolume I/O metrics node
+ * @node: Red-black tree linkage for tracker->tree
+ * @subvolume_id: Subvolume identifier (key for rb-tree lookup)
+ * @read_ops: Accumulated read operation count since last snapshot
+ * @write_ops: Accumulated write operation count since last snapshot
+ * @read_bytes: Accumulated bytes read since last snapshot
+ * @write_bytes: Accumulated bytes written since last snapshot
+ * @read_latency_us: Accumulated read latency in microseconds since last snapshot
+ * @write_latency_us: Accumulated write latency in microseconds since last snapshot
+ */
+struct ceph_subvol_metric_rb_entry {
+ struct rb_node node;
+ u64 subvolume_id;
+ u64 read_ops;
+ u64 write_ops;
+ u64 read_bytes;
+ u64 write_bytes;
+ u64 read_latency_us;
+ u64 write_latency_us;
+};
+
+static struct kmem_cache *ceph_subvol_metric_entry_cachep;
+
+void ceph_subvolume_metrics_init(struct ceph_subvolume_metrics_tracker *tracker)
+{
+ spin_lock_init(&tracker->lock);
+ tracker->tree = RB_ROOT_CACHED;
+ tracker->nr_entries = 0;
+ tracker->enabled = false;
+ atomic64_set(&tracker->snapshot_attempts, 0);
+ atomic64_set(&tracker->snapshot_empty, 0);
+ atomic64_set(&tracker->snapshot_failures, 0);
+ atomic64_set(&tracker->record_calls, 0);
+ atomic64_set(&tracker->record_disabled, 0);
+ atomic64_set(&tracker->record_no_subvol, 0);
+ atomic64_set(&tracker->total_read_ops, 0);
+ atomic64_set(&tracker->total_read_bytes, 0);
+ atomic64_set(&tracker->total_write_ops, 0);
+ atomic64_set(&tracker->total_write_bytes, 0);
+}
+
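+/*
+ * Find the metrics entry for @subvol_id, or NULL if none exists yet.
+ * The caller must hold tracker->lock.
+ */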
+static struct ceph_subvol_metric_rb_entry *
+__lookup_entry(struct ceph_subvolume_metrics_tracker *tracker, u64 subvol_id)
+{
+ struct rb_node *node;
+
+ node = tracker->tree.rb_root.rb_node;
+ while (node) {
+ struct ceph_subvol_metric_rb_entry *entry =
+ rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
+
+ if (subvol_id < entry->subvolume_id)
+ node = node->rb_left;
+ else if (subvol_id > entry->subvolume_id)
+ node = node->rb_right;
+ else
+ return entry;
+ }
+
+ return NULL;
+}
+
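+/*
+ * Insert @entry keyed by its subvolume_id.  If an entry with the same key
+ * already exists, that existing entry is returned instead and @entry is
+ * left unlinked (the caller is responsible for freeing it).
+ * The caller must hold tracker->lock.
+ */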
+static struct ceph_subvol_metric_rb_entry *
+__insert_entry(struct ceph_subvolume_metrics_tracker *tracker,
+ struct ceph_subvol_metric_rb_entry *entry)
+{
+ struct rb_node **link = &tracker->tree.rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ bool leftmost = true;
+
+ while (*link) {
+ struct ceph_subvol_metric_rb_entry *cur =
+ rb_entry(*link, struct ceph_subvol_metric_rb_entry, node);
+
+ parent = *link;
+ if (entry->subvolume_id < cur->subvolume_id)
+ link = &(*link)->rb_left;
+ else if (entry->subvolume_id > cur->subvolume_id) {
+ link = &(*link)->rb_right;
+ leftmost = false;
+ } else
+ return cur;
+ }
+
+ rb_link_node(&entry->node, parent, link);
+ rb_insert_color_cached(&entry->node, &tracker->tree, leftmost);
+ tracker->nr_entries++;
+ return entry;
+}
+
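+/* Free every tracked entry.  The caller must hold tracker->lock. */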
+static void
+ceph_subvolume_metrics_clear_locked(struct ceph_subvolume_metrics_tracker *tracker)
+{
+ struct rb_node *node = rb_first_cached(&tracker->tree);
+
+ while (node) {
+ struct ceph_subvol_metric_rb_entry *entry =
+ rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
+ struct rb_node *next = rb_next(node);
+
+ rb_erase_cached(&entry->node, &tracker->tree);
+ tracker->nr_entries--;
+ kmem_cache_free(ceph_subvol_metric_entry_cachep, entry);
+ node = next;
+ }
+
+ tracker->tree = RB_ROOT_CACHED;
+}
+
+void ceph_subvolume_metrics_destroy(struct ceph_subvolume_metrics_tracker *tracker)
+{
+ spin_lock(&tracker->lock);
+ ceph_subvolume_metrics_clear_locked(tracker);
+ WRITE_ONCE(tracker->enabled, false);
+ spin_unlock(&tracker->lock);
+}
+
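+/*
+ * Enable or disable per-subvolume metrics collection.  Disabling also drops
+ * every accumulated entry, so a later re-enable starts from a clean state.
+ */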
+void ceph_subvolume_metrics_enable(struct ceph_subvolume_metrics_tracker *tracker,
+ bool enable)
+{
+ spin_lock(&tracker->lock);
+ if (enable) {
+ WRITE_ONCE(tracker->enabled, true);
+ } else {
+ /* pair with the lockless READ_ONCE() readers on the I/O paths */
+ WRITE_ONCE(tracker->enabled, false);
+ ceph_subvolume_metrics_clear_locked(tracker);
+ }
+ spin_unlock(&tracker->lock);
+}
+
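+/**
+ * ceph_subvolume_metrics_record - account one completed I/O for a subvolume
+ * @tracker: metrics tracker to account into
+ * @subvol_id: subvolume the I/O belongs to; CEPH_SUBVOLUME_ID_NONE is ignored
+ * @is_write: true for writes, false for reads
+ * @size: number of bytes transferred
+ * @latency_us: I/O latency in microseconds
+ *
+ * Zero-sized or zero-latency records are dropped, as is everything while
+ * the tracker is disabled.
+ */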
+void ceph_subvolume_metrics_record(struct ceph_subvolume_metrics_tracker *tracker,
+ u64 subvol_id, bool is_write,
+ size_t size, u64 latency_us)
+{
+ struct ceph_subvol_metric_rb_entry *entry, *new_entry = NULL;
+ bool retry = false;
+
+ /* CEPH_SUBVOLUME_ID_NONE (0) means unknown/unset subvolume */
+ if (!READ_ONCE(tracker->enabled) ||
+ subvol_id == CEPH_SUBVOLUME_ID_NONE || !size || !latency_us)
+ return;
+
+ /*
+ * The entry is allocated outside the spinlock, so a retry loop is needed:
+ * 1. First pass: look up under the lock; on a miss, drop the lock,
+ *    allocate a new entry and retry.
+ * 2. Second pass: look up again (another task may have inserted the
+ *    entry meanwhile); insert our allocation if the key is still missing.
+ * 3. If the insert loses a race with a concurrent insert of the same
+ *    key, free our allocation and retry once more.
+ * All successful paths leave the loop via return.
+ */
+ do {
+ spin_lock(&tracker->lock);
+ if (!tracker->enabled) {
+ spin_unlock(&tracker->lock);
+ if (new_entry)
+ kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry);
+ return;
+ }
+
+ entry = __lookup_entry(tracker, subvol_id);
+ if (!entry) {
+ if (!new_entry) {
+ spin_unlock(&tracker->lock);
+ new_entry = kmem_cache_zalloc(ceph_subvol_metric_entry_cachep,
+ GFP_NOFS);
+ if (!new_entry)
+ return;
+ new_entry->subvolume_id = subvol_id;
+ retry = true;
+ continue;
+ }
+ entry = __insert_entry(tracker, new_entry);
+ if (entry != new_entry) {
+ /* raced with another insert */
+ spin_unlock(&tracker->lock);
+ kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry);
+ new_entry = NULL;
+ retry = true;
+ continue;
+ }
+ new_entry = NULL;
+ }
+
+ if (is_write) {
+ entry->write_ops++;
+ entry->write_bytes += size;
+ entry->write_latency_us += latency_us;
+ atomic64_inc(&tracker->total_write_ops);
+ atomic64_add(size, &tracker->total_write_bytes);
+ } else {
+ entry->read_ops++;
+ entry->read_bytes += size;
+ entry->read_latency_us += latency_us;
+ atomic64_inc(&tracker->total_read_ops);
+ atomic64_add(size, &tracker->total_read_bytes);
+ }
+ spin_unlock(&tracker->lock);
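+ /*
+ * new_entry is still set if another task inserted the same key while
+ * we were allocating; drop our unused allocation.
+ */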
+ if (new_entry)
+ kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry);
+ return;
+ } while (retry);
+}
+
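+/**
+ * ceph_subvolume_metrics_snapshot - copy the current per-subvolume counters
+ * @tracker: metrics tracker to snapshot
+ * @out: set to a kcalloc'ed array of snapshot entries, or NULL if empty
+ * @nr: number of entries in the returned array
+ * @consume: if true, entries are removed from the tree after being copied
+ *
+ * Only entries with at least one read or write operation are reported;
+ * entries with no activity are reclaimed while scanning.  Returns 0 with
+ * *out == NULL and *nr == 0 when there is nothing to report, or -ENOMEM
+ * if the snapshot array cannot be allocated.  The caller releases the
+ * array with ceph_subvolume_metrics_free_snapshot().
+ */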
+int ceph_subvolume_metrics_snapshot(struct ceph_subvolume_metrics_tracker *tracker,
+ struct ceph_subvol_metric_snapshot **out,
+ u32 *nr, bool consume)
+{
+ struct ceph_subvol_metric_snapshot *snap = NULL;
+ struct rb_node *node;
+ u32 count = 0, idx = 0;
+ int ret = 0;
+
+ *out = NULL;
+ *nr = 0;
+
+ if (!READ_ONCE(tracker->enabled))
+ return 0;
+
+ atomic64_inc(&tracker->snapshot_attempts);
+
+ spin_lock(&tracker->lock);
+ for (node = rb_first_cached(&tracker->tree); node; node = rb_next(node)) {
+ struct ceph_subvol_metric_rb_entry *entry =
+ rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
+
+ /* Include entries with ANY I/O activity (read OR write) */
+ if (entry->read_ops || entry->write_ops)
+ count++;
+ }
+ spin_unlock(&tracker->lock);
+
+ if (!count) {
+ atomic64_inc(&tracker->snapshot_empty);
+ return 0;
+ }
+
+ snap = kcalloc(count, sizeof(*snap), GFP_NOFS);
+ if (!snap) {
+ atomic64_inc(&tracker->snapshot_failures);
+ return -ENOMEM;
+ }
+
+ spin_lock(&tracker->lock);
+ node = rb_first_cached(&tracker->tree);
+ while (node) {
+ struct ceph_subvol_metric_rb_entry *entry =
+ rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
+ struct rb_node *next = rb_next(node);
+
+ /* Skip entries with NO I/O activity at all */
+ if (!entry->read_ops && !entry->write_ops) {
+ rb_erase_cached(&entry->node, &tracker->tree);
+ tracker->nr_entries--;
+ kmem_cache_free(ceph_subvol_metric_entry_cachep, entry);
+ node = next;
+ continue;
+ }
+
+ if (idx >= count) {
+ pr_warn("ceph: subvol metrics snapshot race (idx=%u count=%u)\n",
+ idx, count);
+ break;
+ }
+
+ snap[idx].subvolume_id = entry->subvolume_id;
+ snap[idx].read_ops = entry->read_ops;
+ snap[idx].write_ops = entry->write_ops;
+ snap[idx].read_bytes = entry->read_bytes;
+ snap[idx].write_bytes = entry->write_bytes;
+ snap[idx].read_latency_us = entry->read_latency_us;
+ snap[idx].write_latency_us = entry->write_latency_us;
+ idx++;
+
+ if (consume) {
+ /* counters are already copied out; drop the entry entirely */
+ rb_erase_cached(&entry->node, &tracker->tree);
+ tracker->nr_entries--;
+ kmem_cache_free(ceph_subvol_metric_entry_cachep, entry);
+ }
+ node = next;
+ }
+ spin_unlock(&tracker->lock);
+
+ if (!idx) {
+ kfree(snap);
+ } else {
+ *nr = idx;
+ *out = snap;
+ }
+
+ return ret;
+}
+
+void ceph_subvolume_metrics_free_snapshot(struct ceph_subvol_metric_snapshot *snapshot)
+{
+ kfree(snapshot);
+}
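+
+/*
+ * Illustrative caller sketch: how a periodic worker might consume a
+ * snapshot and ship it off.  The sending helper named here is hypothetical
+ * and not part of this file:
+ *
+ * struct ceph_subvol_metric_snapshot *snap;
+ * u32 nr;
+ *
+ * if (!ceph_subvolume_metrics_snapshot(&mdsc->subvol_metrics,
+ * &snap, &nr, true) && snap) {
+ * send_subvol_metrics_to_mds(mdsc, snap, nr);
+ * ceph_subvolume_metrics_free_snapshot(snap);
+ * }
+ */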
+
+/*
+ * Dump subvolume metrics to a seq_file for debugfs.
+ *
+ * Iterates the rb-tree directly under the spinlock to avoid allocating a
+ * snapshot.  seq_printf() only formats into the preallocated seq_file
+ * buffer and does not sleep, so holding the spinlock across the loop is
+ * safe; the hold time grows with the number of tracked subvolumes.
+ */
+void ceph_subvolume_metrics_dump(struct ceph_subvolume_metrics_tracker *tracker,
+ struct seq_file *s)
+{
+ struct rb_node *node;
+ bool found = false;
+
+ spin_lock(&tracker->lock);
+ if (!tracker->enabled) {
+ spin_unlock(&tracker->lock);
+ seq_puts(s, "subvolume metrics disabled\n");
+ return;
+ }
+
+ for (node = rb_first_cached(&tracker->tree); node; node = rb_next(node)) {
+ struct ceph_subvol_metric_rb_entry *entry =
+ rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
+ u64 avg_rd_lat, avg_wr_lat;
+
+ if (!entry->read_ops && !entry->write_ops)
+ continue;
+
+ if (!found) {
+ seq_puts(s, "subvol_id rd_ops rd_bytes rd_avg_lat_us wr_ops wr_bytes wr_avg_lat_us\n");
+ seq_puts(s, "------------------------------------------------------------------------------------------------\n");
+ found = true;
+ }
+
+ avg_rd_lat = entry->read_ops ?
+ div64_u64(entry->read_latency_us, entry->read_ops) : 0;
+ avg_wr_lat = entry->write_ops ?
+ div64_u64(entry->write_latency_us, entry->write_ops) : 0;
+
+ seq_printf(s, "%-15llu%-10llu%-12llu%-16llu%-10llu%-12llu%-16llu\n",
+ entry->subvolume_id,
+ entry->read_ops,
+ entry->read_bytes,
+ avg_rd_lat,
+ entry->write_ops,
+ entry->write_bytes,
+ avg_wr_lat);
+ }
+ spin_unlock(&tracker->lock);
+
+ if (!found)
+ seq_puts(s, "(no subvolume metrics collected)\n");
+}
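+
+/*
+ * Example of the debugfs output produced above (values are illustrative):
+ *
+ * subvol_id      rd_ops    rd_bytes    rd_avg_lat_us   wr_ops    wr_bytes    wr_avg_lat_us
+ * ----------------------------------------------------------------------------------------
+ * 42             128       524288      950             64        262144      1800
+ */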
+
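+/**
+ * ceph_subvolume_metrics_record_io - record an I/O against an inode's subvolume
+ * @mdsc: MDS client owning the tracker
+ * @ci: ceph inode the I/O was issued against
+ * @is_write: true for writes, false for reads
+ * @bytes: number of bytes transferred
+ * @start: timestamp taken before the I/O was issued
+ * @end: timestamp taken after the I/O completed
+ *
+ * Latency is computed from the @start/@end window and clamped to at least
+ * 1us so the sample is not discarded by the zero-latency check.
+ */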
+void ceph_subvolume_metrics_record_io(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci,
+ bool is_write, size_t bytes,
+ ktime_t start, ktime_t end)
+{
+ struct ceph_subvolume_metrics_tracker *tracker;
+ u64 subvol_id;
+ s64 delta_us;
+
+ if (!mdsc || !ci || !bytes)
+ return;
+
+ tracker = &mdsc->subvol_metrics;
+ atomic64_inc(&tracker->record_calls);
+
+ if (!ceph_subvolume_metrics_enabled(tracker)) {
+ atomic64_inc(&tracker->record_disabled);
+ return;
+ }
+
+ subvol_id = READ_ONCE(ci->i_subvolume_id);
+ if (subvol_id == CEPH_SUBVOLUME_ID_NONE) {
+ atomic64_inc(&tracker->record_no_subvol);
+ return;
+ }
+
+ delta_us = ktime_to_us(ktime_sub(end, start));
+ if (delta_us <= 0)
+ delta_us = 1;
+
+ ceph_subvolume_metrics_record(tracker, subvol_id, is_write,
+ bytes, (u64)delta_us);
+}
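+
+/*
+ * Illustrative call-site sketch.  The I/O helper and surrounding code are
+ * hypothetical; only ceph_subvolume_metrics_record_io() is real:
+ *
+ * ktime_t start = ktime_get();
+ * ssize_t ret = issue_read(...);            (hypothetical helper)
+ * ceph_subvolume_metrics_record_io(mdsc, ci, false,
+ * ret > 0 ? ret : 0, start, ktime_get());
+ */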
+
+int __init ceph_subvolume_metrics_cache_init(void)
+{
+ ceph_subvol_metric_entry_cachep = KMEM_CACHE(ceph_subvol_metric_rb_entry,
+ SLAB_RECLAIM_ACCOUNT);
+ if (!ceph_subvol_metric_entry_cachep)
+ return -ENOMEM;
+ return 0;
+}
+
+void ceph_subvolume_metrics_cache_destroy(void)
+{
+ kmem_cache_destroy(ceph_subvol_metric_entry_cachep);
+}