From: Jaya Prakash
Date: Wed, 21 May 2025 14:48:29 +0000 (+0000)
Subject: common: Added cputrace tool for scope-based CPU performance tracing
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=efc8b483664dc5b0d3b4ba61e5aa6ceaaa566a2d;p=ceph.git

common: Added cputrace tool for scope-based CPU performance tracing

Introduce `cputrace`: a scope-based, developer-focused CPU profiler.

`cputrace` lets developers annotate specific code regions (functions or
blocks) to collect precise CPU metrics:
 - CPU cycles
 - Cache misses
 - Branch mispredictions
 - Instructions retired
 - Context switches

To enable it, build Ceph with:

  ./do_cmake.sh -DWITH_CPUTRACE=1

To profile a function or block, use:

  HWProfileFunctionF(profile, __FUNCTION__,
                     HW_PROFILE_CYC | HW_PROFILE_CMISS | HW_PROFILE_INS |
                     HW_PROFILE_BMISS | HW_PROFILE_SWI);

Control via admin socket:
 - `ceph tell osd.X cputrace start` → Start profiling
 - `ceph tell osd.X cputrace stop`  → Stop profiling
 - `ceph tell osd.X cputrace dump`  → Dump collected metrics
 - `ceph tell osd.X cputrace reset` → Reset all captured data

Signed-off-by: Jaya Prakash
---

diff --git a/src/common/cputrace.cc b/src/common/cputrace.cc
new file mode 100644
index 000000000000..7112dd2bae67
--- /dev/null
+++ b/src/common/cputrace.cc
@@ -0,0 +1,412 @@
+#include "cputrace.h"
+#include "common/Formatter.h"
+
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <pthread.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <linux/perf_event.h>
+
+#define PROFILE_ASSERT(x) if (!(x)) { fprintf(stderr, "Assert failed %s:%d\n", __FILE__, __LINE__); exit(1); }
+
+static thread_local uint64_t thread_id_hash;
+static thread_local bool thread_id_initialized;
+static cputrace_profiler g_profiler;
+static HW_ctx* active_contexts[CPUTRACE_MAX_ANCHORS][CPUTRACE_MAX_THREADS] = {{nullptr}};
+
+struct read_format {
+  uint64_t nr;
+  struct {
+    uint64_t value;
+    uint64_t id;
+  } values[];
+};
+
+static long perf_event_open(struct perf_event_attr* hw_event, pid_t pid,
+                            int cpu, int group_fd, unsigned long flags) {
+  return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
+}
+
+static Arena* arena_create(size_t size) {
+  void* start = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  PROFILE_ASSERT(start != MAP_FAILED);
+  ArenaRegion* region = (ArenaRegion*)malloc(sizeof(ArenaRegion));
+  region->start = start;
+  region->end = (char*)start + size;
+  region->current = start;
+  region->next = nullptr;
+  Arena* a = (Arena*)malloc(sizeof(Arena));
+  a->region = region;
+  return a;
+}
+
+static void* arena_alloc(Arena* arena, size_t size) {
+  if ((char*)arena->region->current + size > (char*)arena->region->end) {
+    fprintf(stderr, "Arena allocation failed: insufficient space for %zu bytes\n", size);
+    return nullptr;
+  }
+  void* ptr = arena->region->current;
+  arena->region->current = (char*)arena->region->current + size;
+  return ptr;
+}
+
+static void arena_reset(Arena* arena) {
+  arena->region->current = arena->region->start;
+}
+
+static void arena_destroy(Arena* arena) {
+  munmap(arena->region->start, (char*)arena->region->end - (char*)arena->region->start);
+  free(arena->region);
+  free(arena);
+}
+
+static uint64_t get_thread_id() {
+  if (!thread_id_initialized) {
+    uint64_t tid = pthread_self();
+    for (int i = 0; i < 8; i++)
+      tid = (tid << 7) ^ (tid >> 3);
+    thread_id_hash = tid % CPUTRACE_MAX_THREADS;
+    thread_id_initialized = true;
+  }
+  return thread_id_hash;
+}
+
+static void setup_perf_event(struct perf_event_attr* pe, uint32_t type, uint64_t config) {
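+  // Describe one counter of the event group. Every attr requests
+  // PERF_FORMAT_GROUP | PERF_FORMAT_ID so that a single read() on the group
+  // leader returns all member values tagged with the ids obtained via
+  // PERF_EVENT_IOC_ID. Counters start disabled and are enabled together in
+  // HW_start(); hypervisor time is always excluded, and kernel time is
+  // excluded for everything except the software context-switch counter.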
+ memset(pe, 0, sizeof(*pe)); + pe->size = sizeof(*pe); + pe->type = type; + pe->config = config; + pe->disabled = 1; + pe->exclude_kernel = 0; + pe->exclude_hv = 1; + pe->read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; + if (type != PERF_TYPE_SOFTWARE) { + pe->exclude_kernel = 1; + } +} + +static void open_perf_fd(int& fd, uint64_t& id, struct perf_event_attr* pe, const char* name, int group_fd) { + fd = perf_event_open(pe, gettid(), -1, group_fd, 0); + if (fd != -1) { + ioctl(fd, PERF_EVENT_IOC_ID, &id); + ioctl(fd, PERF_EVENT_IOC_RESET, 0); + } else { + fprintf(stderr, "Failed to open perf event for %s: %s\n", name, strerror(errno)); + id = 0; + } +} + +static void close_perf_fd(int& fd) { + if (fd != -1) { + ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); + close(fd); + fd = -1; + } +} + +static void HW_init(HW_ctx* ctx, HW_conf* conf) { + *ctx = { -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, *conf }; +} + +static void HW_start(HW_ctx* ctx) { + struct perf_event_attr pe; + int parent_fd = -1; + + if (ctx->conf.capture_swi) { + setup_perf_event(&pe, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CONTEXT_SWITCHES); + open_perf_fd(ctx->fd_swi, ctx->id_swi, &pe, "SWI", -1); + parent_fd = ctx->fd_swi; + } + else if (ctx->conf.capture_cyc) { + setup_perf_event(&pe, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES); + open_perf_fd(ctx->fd_cyc, ctx->id_cyc, &pe, "CYC", -1); + parent_fd = ctx->fd_cyc; + } else if (ctx->conf.capture_cmiss) { + setup_perf_event(&pe, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES); + open_perf_fd(ctx->fd_cmiss, ctx->id_cmiss, &pe, "CMISS", -1); + parent_fd = ctx->fd_cmiss; + } else if (ctx->conf.capture_bmiss) { + setup_perf_event(&pe, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES); + open_perf_fd(ctx->fd_bmiss, ctx->id_bmiss, &pe, "BMISS", -1); + parent_fd = ctx->fd_bmiss; + } else if (ctx->conf.capture_ins) { + setup_perf_event(&pe, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS); + open_perf_fd(ctx->fd_ins, ctx->id_ins, &pe, "INS", -1); + parent_fd = ctx->fd_ins; + } + + ctx->parent_fd = parent_fd; + + if (ctx->conf.capture_swi && ctx->fd_swi == -1 && parent_fd != -1) { + setup_perf_event(&pe, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CONTEXT_SWITCHES); + open_perf_fd(ctx->fd_swi, ctx->id_swi, &pe, "SWI", parent_fd); + } + if (ctx->conf.capture_cyc && ctx->fd_cyc == -1 && parent_fd != -1) { + setup_perf_event(&pe, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES); + open_perf_fd(ctx->fd_cyc, ctx->id_cyc, &pe, "CYC", parent_fd); + } + if (ctx->conf.capture_cmiss && ctx->fd_cmiss == -1 && parent_fd != -1) { + setup_perf_event(&pe, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES); + open_perf_fd(ctx->fd_cmiss, ctx->id_cmiss, &pe, "CMISS", parent_fd); + } + if (ctx->conf.capture_bmiss && ctx->fd_bmiss == -1 && parent_fd != -1) { + setup_perf_event(&pe, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES); + open_perf_fd(ctx->fd_bmiss, ctx->id_bmiss, &pe, "BMISS", parent_fd); + } + if (ctx->conf.capture_ins && ctx->fd_ins == -1 && parent_fd != -1) { + setup_perf_event(&pe, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS); + open_perf_fd(ctx->fd_ins, ctx->id_ins, &pe, "INS", parent_fd); + } + + if (parent_fd != -1) { + ioctl(parent_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); + ioctl(parent_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP); + } +} + +static void HW_clean(HW_ctx* ctx) { + close_perf_fd(ctx->fd_swi); + close_perf_fd(ctx->fd_cyc); + close_perf_fd(ctx->fd_cmiss); + close_perf_fd(ctx->fd_bmiss); + close_perf_fd(ctx->fd_ins); + close_perf_fd(ctx->parent_fd); +} + +static void read_perf_event(HW_ctx* 
ctx, cputrace_anchor* anchor, uint64_t tid) { + pthread_mutex_lock(&anchor->mutex[tid]); + Arena* arena = anchor->thread_arena[tid]; + if (ctx->parent_fd != -1) { + char buf[256]; + struct read_format* rf = (struct read_format*)buf; + + if (read(ctx->parent_fd, buf, sizeof(buf)) > 0) { + for (uint64_t i = 0; i < rf->nr; i++) { + cputrace_result_type type; + if (rf->values[i].id == ctx->id_swi) type = CPUTRACE_RESULT_SWI; + else if (rf->values[i].id == ctx->id_cyc) type = CPUTRACE_RESULT_CYC; + else if (rf->values[i].id == ctx->id_cmiss) type = CPUTRACE_RESULT_CMISS; + else if (rf->values[i].id == ctx->id_bmiss) type = CPUTRACE_RESULT_BMISS; + else if (rf->values[i].id == ctx->id_ins) type = CPUTRACE_RESULT_INS; + else continue; + auto* r = (cputrace_anchor_result*)arena_alloc(arena, sizeof(cputrace_anchor_result)); + r->type = type; + r->value = rf->values[i].value; + } + ioctl(ctx->parent_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); + } else { + fprintf(stderr, "Failed to read perf events: %s\n", strerror(errno)); + } + } + pthread_mutex_unlock(&anchor->mutex[tid]); +} + +static void aggregate_thread_results(cputrace_anchor* anchor, uint64_t tid) { + pthread_mutex_lock(&anchor->mutex[tid]); + auto* arena = anchor->thread_arena[tid]; + auto* arr = (cputrace_anchor_result*)arena->region->start; + size_t count = ((char*)arena->region->current - (char*)arena->region->start) / sizeof(arr[0]); + for (size_t i = 0; i < count; ++i) { + if (arr[i].type == CPUTRACE_RESULT_CALL_COUNT && arr[i].value == 1) { + anchor->call_count += 1; + } else { + anchor->global_sum[arr[i].type] += arr[i].value; + } + } + arena_reset(arena); + pthread_mutex_unlock(&anchor->mutex[tid]); +} + +HW_profile::HW_profile(const char* function, uint64_t index, uint64_t flags) + : function(function), index(index), flags(flags) { + if (index >= CPUTRACE_MAX_ANCHORS || !g_profiler.profiling) + return; + + cputrace_anchor& anchor = g_profiler.anchors[index]; + anchor.name = function; + anchor.flags = flags; + uint64_t tid = get_thread_id(); + + pthread_mutex_lock(&anchor.mutex[tid]); + auto* a = anchor.thread_arena[tid]; + auto* r = (cputrace_anchor_result*)arena_alloc(a, sizeof(cputrace_anchor_result)); + if (r) { + r->type = CPUTRACE_RESULT_CALL_COUNT; + r->value = 1; + } + pthread_mutex_unlock(&anchor.mutex[tid]); + + HW_conf conf = {0}; + if (flags & HW_PROFILE_SWI) conf.capture_swi = true; + if (flags & HW_PROFILE_CYC) conf.capture_cyc = true; + if (flags & HW_PROFILE_CMISS) conf.capture_cmiss = true; + if (flags & HW_PROFILE_BMISS) conf.capture_bmiss = true; + if (flags & HW_PROFILE_INS) conf.capture_ins = true; + + HW_init(&ctx, &conf); + HW_start(&ctx); + + pthread_mutex_lock(&g_profiler.global_lock); + active_contexts[index][tid] = &ctx; + pthread_mutex_unlock(&g_profiler.global_lock); +} + +HW_profile::~HW_profile() { + if (!g_profiler.profiling || index >= CPUTRACE_MAX_ANCHORS) + return; + + uint64_t tid = get_thread_id(); + pthread_mutex_lock(&g_profiler.global_lock); + read_perf_event(&ctx, &g_profiler.anchors[index], tid); + aggregate_thread_results(&g_profiler.anchors[index], tid); + HW_clean(&ctx); + active_contexts[index][tid] = nullptr; + pthread_mutex_unlock(&g_profiler.global_lock); +} + +void cputrace_start(ceph::Formatter* f) { + pthread_mutex_lock(&g_profiler.global_lock); + if (g_profiler.profiling) { + f->open_object_section("cputrace_start"); + f->dump_format("status", "Profiling already active"); + f->close_section(); + pthread_mutex_unlock(&g_profiler.global_lock); + return; + } + 
g_profiler.profiling = true; + f->open_object_section("cputrace_start"); + f->dump_format("status", "Profiling started"); + f->close_section(); + pthread_mutex_unlock(&g_profiler.global_lock); +} + +void cputrace_stop(ceph::Formatter* f) { + pthread_mutex_lock(&g_profiler.global_lock); + if (!g_profiler.profiling) { + f->open_object_section("cputrace_stop"); + f->dump_format("status", "Profiling not active"); + f->close_section(); + pthread_mutex_unlock(&g_profiler.global_lock); + return; + } + g_profiler.profiling = false; + pthread_mutex_unlock(&g_profiler.global_lock); + f->open_object_section("cputrace_stop"); + f->dump_format("status", "Profiling stopped"); + f->close_section(); +} + +void cputrace_reset(ceph::Formatter* f) { + pthread_mutex_lock(&g_profiler.global_lock); + for (int i = 0; i < CPUTRACE_MAX_ANCHORS; ++i) { + if (!g_profiler.anchors[i].name) continue; + for (int j = 0; j < CPUTRACE_MAX_THREADS; ++j) { + pthread_mutex_lock(&g_profiler.anchors[i].mutex[j]); + arena_reset(g_profiler.anchors[i].thread_arena[j]); + active_contexts[i][j] = nullptr; + pthread_mutex_unlock(&g_profiler.anchors[i].mutex[j]); + } + g_profiler.anchors[i].call_count = 0; + for (int t = 0; t < CPUTRACE_RESULT_COUNT; ++t) { + g_profiler.anchors[i].global_sum[t] = 0; + } + } + f->open_object_section("cputrace_reset"); + f->dump_format("status", "Counters reset"); + f->close_section(); + pthread_mutex_unlock(&g_profiler.global_lock); +} + +void cputrace_dump(ceph::Formatter* f, const std::string& logger, const std::string& counter) { + pthread_mutex_lock(&g_profiler.global_lock); + f->open_object_section("cputrace"); + bool dumped = false; + + for (int i = 0; i < CPUTRACE_MAX_ANCHORS; ++i) { + const auto& anchor = g_profiler.anchors[i]; + if (!anchor.name || (!logger.empty() && anchor.name != logger)) { + continue; + } + + for (int j = 0; j < CPUTRACE_MAX_THREADS; ++j) { + if (active_contexts[i][j]) { + read_perf_event(active_contexts[i][j], &g_profiler.anchors[i], j); + } + aggregate_thread_results(&g_profiler.anchors[i], j); + } + + f->open_object_section(anchor.name); + f->dump_unsigned("call_count", anchor.call_count); + + if (anchor.flags & HW_PROFILE_SWI && (counter.empty() || counter == "context_switches")) { + f->dump_unsigned("context_switches", anchor.global_sum[CPUTRACE_RESULT_SWI]); + if (anchor.call_count) + f->dump_float("avg_context_switches", (double)anchor.global_sum[CPUTRACE_RESULT_SWI] / anchor.call_count); + } + if (anchor.flags & HW_PROFILE_CYC && (counter.empty() || counter == "cpu_cycles")) { + f->dump_unsigned("cpu_cycles", anchor.global_sum[CPUTRACE_RESULT_CYC]); + if (anchor.call_count) + f->dump_float("avg_cpu_cycles", (double)anchor.global_sum[CPUTRACE_RESULT_CYC] / anchor.call_count); + } + if (anchor.flags & HW_PROFILE_CMISS && (counter.empty() || counter == "cache_misses")) { + f->dump_unsigned("cache_misses", anchor.global_sum[CPUTRACE_RESULT_CMISS]); + if (anchor.call_count) + f->dump_float("avg_cache_misses", (double)anchor.global_sum[CPUTRACE_RESULT_CMISS] / anchor.call_count); + } + if (anchor.flags & HW_PROFILE_BMISS && (counter.empty() || counter == "branch_misses")) { + f->dump_unsigned("branch_misses", anchor.global_sum[CPUTRACE_RESULT_BMISS]); + if (anchor.call_count) + f->dump_float("avg_branch_misses", (double)anchor.global_sum[CPUTRACE_RESULT_BMISS] / anchor.call_count); + } + if (anchor.flags & HW_PROFILE_INS && (counter.empty() || counter == "instructions")) { + f->dump_unsigned("instructions", anchor.global_sum[CPUTRACE_RESULT_INS]); + if 
(anchor.call_count)
+        f->dump_float("avg_instructions", (double)anchor.global_sum[CPUTRACE_RESULT_INS] / anchor.call_count);
+    }
+    f->close_section();
+    dumped = true;
+  }
+
+  f->dump_format("status", dumped ? "Profiling data dumped" : "No profiling data available");
+  f->close_section();
+  pthread_mutex_unlock(&g_profiler.global_lock);
+}
+
+__attribute__((constructor)) static void cputrace_init() {
+  g_profiler.anchors = (cputrace_anchor*)calloc(CPUTRACE_MAX_ANCHORS, sizeof(cputrace_anchor));
+  if (!g_profiler.anchors) {
+    fprintf(stderr, "Failed to allocate memory for profiler anchors: %s\n", strerror(errno));
+    exit(1);
+  }
+  for (int i = 0; i < CPUTRACE_MAX_ANCHORS; ++i) {
+    for (int j = 0; j < CPUTRACE_MAX_THREADS; ++j) {
+      if (pthread_mutex_init(&g_profiler.anchors[i].mutex[j], nullptr) != 0) {
+        fprintf(stderr, "Failed to initialize mutex for anchor %d, thread %d: %s\n", i, j, strerror(errno));
+      }
+      g_profiler.anchors[i].thread_arena[j] = arena_create(4 * 1024 * 1024);
+    }
+  }
+  if (pthread_mutex_init(&g_profiler.global_lock, nullptr) != 0) {
+    fprintf(stderr, "Failed to initialize global mutex: %s\n", strerror(errno));
+  }
+}
+
+__attribute__((destructor)) static void cputrace_fini() {
+  for (int i = 0; i < CPUTRACE_MAX_ANCHORS; ++i) {
+    for (int j = 0; j < CPUTRACE_MAX_THREADS; ++j) {
+      if (pthread_mutex_destroy(&g_profiler.anchors[i].mutex[j]) != 0) {
+        fprintf(stderr, "Failed to destroy mutex for anchor %d, thread %d: %s\n", i, j, strerror(errno));
+      }
+      arena_destroy(g_profiler.anchors[i].thread_arena[j]);
+    }
+  }
+  if (pthread_mutex_destroy(&g_profiler.global_lock) != 0) {
+    fprintf(stderr, "Failed to destroy global mutex: %s\n", strerror(errno));
+  }
+  free(g_profiler.anchors);
+}
\ No newline at end of file
diff --git a/src/common/cputrace.h b/src/common/cputrace.h
new file mode 100644
index 000000000000..84e069875ffd
--- /dev/null
+++ b/src/common/cputrace.h
@@ -0,0 +1,101 @@
+#pragma once
+#include <cstdint>
+#include <string>
+#include <pthread.h>
+#include "common/Formatter.h"
+
+#define CPUTRACE_MAX_ANCHORS 128
+#define CPUTRACE_MAX_THREADS 64
+
+enum cputrace_result_type {
+  CPUTRACE_RESULT_SWI = 0,
+  CPUTRACE_RESULT_CYC,
+  CPUTRACE_RESULT_CMISS,
+  CPUTRACE_RESULT_BMISS,
+  CPUTRACE_RESULT_INS,
+  CPUTRACE_RESULT_CALL_COUNT,
+  CPUTRACE_RESULT_COUNT
+};
+
+enum cputrace_flags {
+  HW_PROFILE_SWI = (1ULL << CPUTRACE_RESULT_SWI),
+  HW_PROFILE_CYC = (1ULL << CPUTRACE_RESULT_CYC),
+  HW_PROFILE_CMISS = (1ULL << CPUTRACE_RESULT_CMISS),
+  HW_PROFILE_BMISS = (1ULL << CPUTRACE_RESULT_BMISS),
+  HW_PROFILE_INS = (1ULL << CPUTRACE_RESULT_INS),
+};
+
+#define HWProfileFunctionF(var, name, flags) HW_profile var(name, __COUNTER__ + 1, flags)
+
+struct cputrace_anchor_result {
+  cputrace_result_type type;
+  uint64_t value;
+};
+
+struct ArenaRegion {
+  void* start;
+  void* end;
+  void* current;
+  ArenaRegion* next;
+};
+
+struct Arena {
+  ArenaRegion* region;
+};
+
+struct cputrace_anchor {
+  const char* name;
+  pthread_mutex_t mutex[CPUTRACE_MAX_THREADS];
+  Arena* thread_arena[CPUTRACE_MAX_THREADS];
+  uint64_t global_sum[CPUTRACE_RESULT_COUNT];
+  uint64_t call_count;
+  uint64_t flags;
+};
+
+struct cputrace_profiler {
+  cputrace_anchor* anchors;
+  bool profiling;
+  pthread_mutex_t global_lock;
+};
+
+struct HW_conf {
+  bool capture_swi;
+  bool capture_cyc;
+  bool capture_cmiss;
+  bool capture_bmiss;
+  bool capture_ins;
+};
+
+struct HW_ctx {
+  int parent_fd;
+  int fd_swi;
+  int fd_cyc;
+  int fd_cmiss;
+  int fd_bmiss;
+  int fd_ins;
+  uint64_t id_swi;
+  uint64_t id_cyc;
+  uint64_t id_cmiss;
+  uint64_t id_bmiss;
+ 
uint64_t id_ins; + struct HW_conf conf; +}; + +class HW_profile { +public: + HW_profile(const char* function, uint64_t index, uint64_t flags); + ~HW_profile(); + +private: + const char* function; + uint64_t index; + uint64_t flags; + struct HW_ctx ctx; +}; + +void cputrace_start(ceph::Formatter* f); +void cputrace_stop(ceph::Formatter* f); +void cputrace_reset(ceph::Formatter* f); +void cputrace_dump(ceph::Formatter* f, const std::string& logger = "", const std::string& counter = ""); +void cputrace_flush_thread_start(); +void cputrace_flush_thread_stop(); \ No newline at end of file
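
Usage sketch (illustrative, not part of the patch): assuming a build
configured with -DWITH_CPUTRACE=1 and profiling switched on with
`ceph tell osd.X cputrace start`, a hot path could be annotated as below.
The include path, function name and flag selection are examples only.

  #include "common/cputrace.h"

  void example_op_handler() {
    // Counters are opened when `profile` is constructed; when it goes out of
    // scope they are read, summed into this anchor's totals, and closed.
    HWProfileFunctionF(profile, __FUNCTION__,
                       HW_PROFILE_CYC | HW_PROFILE_CMISS | HW_PROFILE_INS);
    // ... code being measured ...
  }

The accumulated totals and per-call averages then appear under the
"example_op_handler" section in the output of `ceph tell osd.X cputrace dump`.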