From: Jaya Prakash
Date: Wed, 21 May 2025 14:48:29 +0000 (+0000)
Subject: common: Added cputrace tool for scope-based CPU performance tracing
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=efc8b483664dc5b0d3b4ba61e5aa6ceaaa566a2d;p=ceph.git

common: Added cputrace tool for scope-based CPU performance tracing

Introduce `cputrace`: a scope-based, developer-focused CPU profiler.

`cputrace` lets developers annotate specific code regions (functions or
blocks) to collect precise CPU metrics:
 - CPU cycles
 - Cache misses
 - Branch mispredictions
 - Instructions retired
 - Context switches

To enable it, build Ceph with:

  ./do_cmake.sh -DWITH_CPUTRACE=1

To profile a function or block, use:

  HWProfileFunctionF(profile, __FUNCTION__,
                     HW_PROFILE_CYC | HW_PROFILE_CMISS | HW_PROFILE_INS |
                     HW_PROFILE_BMISS | HW_PROFILE_SWI);

Control via admin socket:
 - `ceph tell osd.X cputrace start` → Start profiling
 - `ceph tell osd.X cputrace stop`  → Stop profiling
 - `ceph tell osd.X cputrace dump`  → Dump collected metrics
 - `ceph tell osd.X cputrace reset` → Reset all captured data

Signed-off-by: Jaya Prakash
---

diff --git a/src/common/cputrace.cc b/src/common/cputrace.cc
new file mode 100644
index 000000000000..7112dd2bae67
--- /dev/null
+++ b/src/common/cputrace.cc
@@ -0,0 +1,412 @@
+#include "cputrace.h"
+#include "common/Formatter.h"
+
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <pthread.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <linux/perf_event.h>
+
+#define PROFILE_ASSERT(x) if (!(x)) { fprintf(stderr, "Assert failed %s:%d\n", __FILE__, __LINE__); exit(1); }
+
+static thread_local uint64_t thread_id_hash;
+static thread_local bool thread_id_initialized;
+static cputrace_profiler g_profiler;
+static HW_ctx* active_contexts[CPUTRACE_MAX_ANCHORS][CPUTRACE_MAX_THREADS] = {{nullptr}};
+
+struct read_format {
+  uint64_t nr;
+  struct {
+    uint64_t value;
+    uint64_t id;
+  } values[];
+};
+
+static long perf_event_open(struct perf_event_attr* hw_event, pid_t pid,
+                            int cpu, int group_fd, unsigned long flags) {
+  return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
+}
+
+static Arena* arena_create(size_t size) {
+  void* start = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  PROFILE_ASSERT(start != MAP_FAILED);
+  ArenaRegion* region = (ArenaRegion*)malloc(sizeof(ArenaRegion));
+  region->start = start;
+  region->end = (char*)start + size;
+  region->current = start;
+  region->next = nullptr;
+  Arena* a = (Arena*)malloc(sizeof(Arena));
+  a->region = region;
+  return a;
+}
+
+static void* arena_alloc(Arena* arena, size_t size) {
+  if ((char*)arena->region->current + size > (char*)arena->region->end) {
+    fprintf(stderr, "Arena allocation failed: insufficient space for %zu bytes\n", size);
+    return nullptr;
+  }
+  void* ptr = arena->region->current;
+  arena->region->current = (char*)arena->region->current + size;
+  return ptr;
+}
+
+static void arena_reset(Arena* arena) {
+  arena->region->current = arena->region->start;
+}
+
+static void arena_destroy(Arena* arena) {
+  munmap(arena->region->start, (char*)arena->region->end - (char*)arena->region->start);
+  free(arena->region);
+  free(arena);
+}
+
+static uint64_t get_thread_id() {
+  if (!thread_id_initialized) {
+    uint64_t tid = pthread_self();
+    for (int i = 0; i < 8; i++)
+      tid = (tid << 7) ^ (tid >> 3);
+    thread_id_hash = tid % CPUTRACE_MAX_THREADS;
+    thread_id_initialized = true;
+  }
+  return thread_id_hash;
+}
+
+static void setup_perf_event(struct perf_event_attr* pe, uint32_t type, uint64_t config) {
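+  // Describe one counter of the event group. Every attr requests
+  // PERF_FORMAT_GROUP | PERF_FORMAT_ID so that a single read() on the group
+  // leader returns all member values tagged with the ids obtained via
+  // PERF_EVENT_IOC_ID. Counters start disabled and are enabled together in
+  // HW_start(); hypervisor time is always excluded, and kernel time is
+  // excluded for everything except the software context-switch counter.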
+ memset(pe, 0, sizeof(*pe)); + pe->size = sizeof(*pe); + pe->type = type; + pe->config = config; + pe->disabled = 1; + pe->exclude_kernel = 0; + pe->exclude_hv = 1; + pe->read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; + if (type != PERF_TYPE_SOFTWARE) { + pe->exclude_kernel = 1; + } +} + +static void open_perf_fd(int& fd, uint64_t& id, struct perf_event_attr* pe, const char* name, int group_fd) { + fd = perf_event_open(pe, gettid(), -1, group_fd, 0); + if (fd != -1) { + ioctl(fd, PERF_EVENT_IOC_ID, &id); + ioctl(fd, PERF_EVENT_IOC_RESET, 0); + } else { + fprintf(stderr, "Failed to open perf event for %s: %s\n", name, strerror(errno)); + id = 0; + } +} + +static void close_perf_fd(int& fd) { + if (fd != -1) { + ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); + close(fd); + fd = -1; + } +} + +static void HW_init(HW_ctx* ctx, HW_conf* conf) { + *ctx = { -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, *conf }; +} + +static void HW_start(HW_ctx* ctx) { + struct perf_event_attr pe; + int parent_fd = -1; + + if (ctx->conf.capture_swi) { + setup_perf_event(&pe, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CONTEXT_SWITCHES); + open_perf_fd(ctx->fd_swi, ctx->id_swi, &pe, "SWI", -1); + parent_fd = ctx->fd_swi; + } + else if (ctx->conf.capture_cyc) { + setup_perf_event(&pe, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES); + open_perf_fd(ctx->fd_cyc, ctx->id_cyc, &pe, "CYC", -1); + parent_fd = ctx->fd_cyc; + } else if (ctx->conf.capture_cmiss) { + setup_perf_event(&pe, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES); + open_perf_fd(ctx->fd_cmiss, ctx->id_cmiss, &pe, "CMISS", -1); + parent_fd = ctx->fd_cmiss; + } else if (ctx->conf.capture_bmiss) { + setup_perf_event(&pe, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES); + open_perf_fd(ctx->fd_bmiss, ctx->id_bmiss, &pe, "BMISS", -1); + parent_fd = ctx->fd_bmiss; + } else if (ctx->conf.capture_ins) { + setup_perf_event(&pe, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS); + open_perf_fd(ctx->fd_ins, ctx->id_ins, &pe, "INS", -1); + parent_fd = ctx->fd_ins; + } + + ctx->parent_fd = parent_fd; + + if (ctx->conf.capture_swi && ctx->fd_swi == -1 && parent_fd != -1) { + setup_perf_event(&pe, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CONTEXT_SWITCHES); + open_perf_fd(ctx->fd_swi, ctx->id_swi, &pe, "SWI", parent_fd); + } + if (ctx->conf.capture_cyc && ctx->fd_cyc == -1 && parent_fd != -1) { + setup_perf_event(&pe, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES); + open_perf_fd(ctx->fd_cyc, ctx->id_cyc, &pe, "CYC", parent_fd); + } + if (ctx->conf.capture_cmiss && ctx->fd_cmiss == -1 && parent_fd != -1) { + setup_perf_event(&pe, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES); + open_perf_fd(ctx->fd_cmiss, ctx->id_cmiss, &pe, "CMISS", parent_fd); + } + if (ctx->conf.capture_bmiss && ctx->fd_bmiss == -1 && parent_fd != -1) { + setup_perf_event(&pe, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES); + open_perf_fd(ctx->fd_bmiss, ctx->id_bmiss, &pe, "BMISS", parent_fd); + } + if (ctx->conf.capture_ins && ctx->fd_ins == -1 && parent_fd != -1) { + setup_perf_event(&pe, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS); + open_perf_fd(ctx->fd_ins, ctx->id_ins, &pe, "INS", parent_fd); + } + + if (parent_fd != -1) { + ioctl(parent_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); + ioctl(parent_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP); + } +} + +static void HW_clean(HW_ctx* ctx) { + close_perf_fd(ctx->fd_swi); + close_perf_fd(ctx->fd_cyc); + close_perf_fd(ctx->fd_cmiss); + close_perf_fd(ctx->fd_bmiss); + close_perf_fd(ctx->fd_ins); + close_perf_fd(ctx->parent_fd); +} + +static void read_perf_event(HW_ctx* 
ctx, cputrace_anchor* anchor, uint64_t tid) { + pthread_mutex_lock(&anchor->mutex[tid]); + Arena* arena = anchor->thread_arena[tid]; + if (ctx->parent_fd != -1) { + char buf[256]; + struct read_format* rf = (struct read_format*)buf; + + if (read(ctx->parent_fd, buf, sizeof(buf)) > 0) { + for (uint64_t i = 0; i < rf->nr; i++) { + cputrace_result_type type; + if (rf->values[i].id == ctx->id_swi) type = CPUTRACE_RESULT_SWI; + else if (rf->values[i].id == ctx->id_cyc) type = CPUTRACE_RESULT_CYC; + else if (rf->values[i].id == ctx->id_cmiss) type = CPUTRACE_RESULT_CMISS; + else if (rf->values[i].id == ctx->id_bmiss) type = CPUTRACE_RESULT_BMISS; + else if (rf->values[i].id == ctx->id_ins) type = CPUTRACE_RESULT_INS; + else continue; + auto* r = (cputrace_anchor_result*)arena_alloc(arena, sizeof(cputrace_anchor_result)); + r->type = type; + r->value = rf->values[i].value; + } + ioctl(ctx->parent_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); + } else { + fprintf(stderr, "Failed to read perf events: %s\n", strerror(errno)); + } + } + pthread_mutex_unlock(&anchor->mutex[tid]); +} + +static void aggregate_thread_results(cputrace_anchor* anchor, uint64_t tid) { + pthread_mutex_lock(&anchor->mutex[tid]); + auto* arena = anchor->thread_arena[tid]; + auto* arr = (cputrace_anchor_result*)arena->region->start; + size_t count = ((char*)arena->region->current - (char*)arena->region->start) / sizeof(arr[0]); + for (size_t i = 0; i < count; ++i) { + if (arr[i].type == CPUTRACE_RESULT_CALL_COUNT && arr[i].value == 1) { + anchor->call_count += 1; + } else { + anchor->global_sum[arr[i].type] += arr[i].value; + } + } + arena_reset(arena); + pthread_mutex_unlock(&anchor->mutex[tid]); +} + +HW_profile::HW_profile(const char* function, uint64_t index, uint64_t flags) + : function(function), index(index), flags(flags) { + if (index >= CPUTRACE_MAX_ANCHORS || !g_profiler.profiling) + return; + + cputrace_anchor& anchor = g_profiler.anchors[index]; + anchor.name = function; + anchor.flags = flags; + uint64_t tid = get_thread_id(); + + pthread_mutex_lock(&anchor.mutex[tid]); + auto* a = anchor.thread_arena[tid]; + auto* r = (cputrace_anchor_result*)arena_alloc(a, sizeof(cputrace_anchor_result)); + if (r) { + r->type = CPUTRACE_RESULT_CALL_COUNT; + r->value = 1; + } + pthread_mutex_unlock(&anchor.mutex[tid]); + + HW_conf conf = {0}; + if (flags & HW_PROFILE_SWI) conf.capture_swi = true; + if (flags & HW_PROFILE_CYC) conf.capture_cyc = true; + if (flags & HW_PROFILE_CMISS) conf.capture_cmiss = true; + if (flags & HW_PROFILE_BMISS) conf.capture_bmiss = true; + if (flags & HW_PROFILE_INS) conf.capture_ins = true; + + HW_init(&ctx, &conf); + HW_start(&ctx); + + pthread_mutex_lock(&g_profiler.global_lock); + active_contexts[index][tid] = &ctx; + pthread_mutex_unlock(&g_profiler.global_lock); +} + +HW_profile::~HW_profile() { + if (!g_profiler.profiling || index >= CPUTRACE_MAX_ANCHORS) + return; + + uint64_t tid = get_thread_id(); + pthread_mutex_lock(&g_profiler.global_lock); + read_perf_event(&ctx, &g_profiler.anchors[index], tid); + aggregate_thread_results(&g_profiler.anchors[index], tid); + HW_clean(&ctx); + active_contexts[index][tid] = nullptr; + pthread_mutex_unlock(&g_profiler.global_lock); +} + +void cputrace_start(ceph::Formatter* f) { + pthread_mutex_lock(&g_profiler.global_lock); + if (g_profiler.profiling) { + f->open_object_section("cputrace_start"); + f->dump_format("status", "Profiling already active"); + f->close_section(); + pthread_mutex_unlock(&g_profiler.global_lock); + return; + } + 
g_profiler.profiling = true; + f->open_object_section("cputrace_start"); + f->dump_format("status", "Profiling started"); + f->close_section(); + pthread_mutex_unlock(&g_profiler.global_lock); +} + +void cputrace_stop(ceph::Formatter* f) { + pthread_mutex_lock(&g_profiler.global_lock); + if (!g_profiler.profiling) { + f->open_object_section("cputrace_stop"); + f->dump_format("status", "Profiling not active"); + f->close_section(); + pthread_mutex_unlock(&g_profiler.global_lock); + return; + } + g_profiler.profiling = false; + pthread_mutex_unlock(&g_profiler.global_lock); + f->open_object_section("cputrace_stop"); + f->dump_format("status", "Profiling stopped"); + f->close_section(); +} + +void cputrace_reset(ceph::Formatter* f) { + pthread_mutex_lock(&g_profiler.global_lock); + for (int i = 0; i < CPUTRACE_MAX_ANCHORS; ++i) { + if (!g_profiler.anchors[i].name) continue; + for (int j = 0; j < CPUTRACE_MAX_THREADS; ++j) { + pthread_mutex_lock(&g_profiler.anchors[i].mutex[j]); + arena_reset(g_profiler.anchors[i].thread_arena[j]); + active_contexts[i][j] = nullptr; + pthread_mutex_unlock(&g_profiler.anchors[i].mutex[j]); + } + g_profiler.anchors[i].call_count = 0; + for (int t = 0; t < CPUTRACE_RESULT_COUNT; ++t) { + g_profiler.anchors[i].global_sum[t] = 0; + } + } + f->open_object_section("cputrace_reset"); + f->dump_format("status", "Counters reset"); + f->close_section(); + pthread_mutex_unlock(&g_profiler.global_lock); +} + +void cputrace_dump(ceph::Formatter* f, const std::string& logger, const std::string& counter) { + pthread_mutex_lock(&g_profiler.global_lock); + f->open_object_section("cputrace"); + bool dumped = false; + + for (int i = 0; i < CPUTRACE_MAX_ANCHORS; ++i) { + const auto& anchor = g_profiler.anchors[i]; + if (!anchor.name || (!logger.empty() && anchor.name != logger)) { + continue; + } + + for (int j = 0; j < CPUTRACE_MAX_THREADS; ++j) { + if (active_contexts[i][j]) { + read_perf_event(active_contexts[i][j], &g_profiler.anchors[i], j); + } + aggregate_thread_results(&g_profiler.anchors[i], j); + } + + f->open_object_section(anchor.name); + f->dump_unsigned("call_count", anchor.call_count); + + if (anchor.flags & HW_PROFILE_SWI && (counter.empty() || counter == "context_switches")) { + f->dump_unsigned("context_switches", anchor.global_sum[CPUTRACE_RESULT_SWI]); + if (anchor.call_count) + f->dump_float("avg_context_switches", (double)anchor.global_sum[CPUTRACE_RESULT_SWI] / anchor.call_count); + } + if (anchor.flags & HW_PROFILE_CYC && (counter.empty() || counter == "cpu_cycles")) { + f->dump_unsigned("cpu_cycles", anchor.global_sum[CPUTRACE_RESULT_CYC]); + if (anchor.call_count) + f->dump_float("avg_cpu_cycles", (double)anchor.global_sum[CPUTRACE_RESULT_CYC] / anchor.call_count); + } + if (anchor.flags & HW_PROFILE_CMISS && (counter.empty() || counter == "cache_misses")) { + f->dump_unsigned("cache_misses", anchor.global_sum[CPUTRACE_RESULT_CMISS]); + if (anchor.call_count) + f->dump_float("avg_cache_misses", (double)anchor.global_sum[CPUTRACE_RESULT_CMISS] / anchor.call_count); + } + if (anchor.flags & HW_PROFILE_BMISS && (counter.empty() || counter == "branch_misses")) { + f->dump_unsigned("branch_misses", anchor.global_sum[CPUTRACE_RESULT_BMISS]); + if (anchor.call_count) + f->dump_float("avg_branch_misses", (double)anchor.global_sum[CPUTRACE_RESULT_BMISS] / anchor.call_count); + } + if (anchor.flags & HW_PROFILE_INS && (counter.empty() || counter == "instructions")) { + f->dump_unsigned("instructions", anchor.global_sum[CPUTRACE_RESULT_INS]); + if 
(anchor.call_count)
+        f->dump_float("avg_instructions", (double)anchor.global_sum[CPUTRACE_RESULT_INS] / anchor.call_count);
+    }
+    f->close_section();
+    dumped = true;
+  }
+
+  f->dump_format("status", dumped ? "Profiling data dumped" : "No profiling data available");
+  f->close_section();
+  pthread_mutex_unlock(&g_profiler.global_lock);
+}
+
+__attribute__((constructor)) static void cputrace_init() {
+  g_profiler.anchors = (cputrace_anchor*)calloc(CPUTRACE_MAX_ANCHORS, sizeof(cputrace_anchor));
+  if (!g_profiler.anchors) {
+    fprintf(stderr, "Failed to allocate memory for profiler anchors: %s\n", strerror(errno));
+    exit(1);
+  }
+  for (int i = 0; i < CPUTRACE_MAX_ANCHORS; ++i) {
+    for (int j = 0; j < CPUTRACE_MAX_THREADS; ++j) {
+      if (pthread_mutex_init(&g_profiler.anchors[i].mutex[j], nullptr) != 0) {
+        fprintf(stderr, "Failed to initialize mutex for anchor %d, thread %d: %s\n", i, j, strerror(errno));
+      }
+      g_profiler.anchors[i].thread_arena[j] = arena_create(4 * 1024 * 1024);
+    }
+  }
+  if (pthread_mutex_init(&g_profiler.global_lock, nullptr) != 0) {
+    fprintf(stderr, "Failed to initialize global mutex: %s\n", strerror(errno));
+  }
+}
+
+__attribute__((destructor)) static void cputrace_fini() {
+  for (int i = 0; i < CPUTRACE_MAX_ANCHORS; ++i) {
+    for (int j = 0; j < CPUTRACE_MAX_THREADS; ++j) {
+      if (pthread_mutex_destroy(&g_profiler.anchors[i].mutex[j]) != 0) {
+        fprintf(stderr, "Failed to destroy mutex for anchor %d, thread %d: %s\n", i, j, strerror(errno));
+      }
+      arena_destroy(g_profiler.anchors[i].thread_arena[j]);
+    }
+  }
+  if (pthread_mutex_destroy(&g_profiler.global_lock) != 0) {
+    fprintf(stderr, "Failed to destroy global mutex: %s\n", strerror(errno));
+  }
+  free(g_profiler.anchors);
+}
\ No newline at end of file
diff --git a/src/common/cputrace.h b/src/common/cputrace.h
new file mode 100644
index 000000000000..84e069875ffd
--- /dev/null
+++ b/src/common/cputrace.h
@@ -0,0 +1,101 @@
+#pragma once
+#include <cstdint>
+#include <string>
+#include <pthread.h>
+#include "common/Formatter.h"
+
+#define CPUTRACE_MAX_ANCHORS 128
+#define CPUTRACE_MAX_THREADS 64
+
+enum cputrace_result_type {
+  CPUTRACE_RESULT_SWI = 0,
+  CPUTRACE_RESULT_CYC,
+  CPUTRACE_RESULT_CMISS,
+  CPUTRACE_RESULT_BMISS,
+  CPUTRACE_RESULT_INS,
+  CPUTRACE_RESULT_CALL_COUNT,
+  CPUTRACE_RESULT_COUNT
+};
+
+enum cputrace_flags {
+  HW_PROFILE_SWI = (1ULL << CPUTRACE_RESULT_SWI),
+  HW_PROFILE_CYC = (1ULL << CPUTRACE_RESULT_CYC),
+  HW_PROFILE_CMISS = (1ULL << CPUTRACE_RESULT_CMISS),
+  HW_PROFILE_BMISS = (1ULL << CPUTRACE_RESULT_BMISS),
+  HW_PROFILE_INS = (1ULL << CPUTRACE_RESULT_INS),
+};
+
+#define HWProfileFunctionF(var, name, flags) HW_profile var(name, __COUNTER__ + 1, flags)
+
+struct cputrace_anchor_result {
+  cputrace_result_type type;
+  uint64_t value;
+};
+
+struct ArenaRegion {
+  void* start;
+  void* end;
+  void* current;
+  ArenaRegion* next;
+};
+
+struct Arena {
+  ArenaRegion* region;
+};
+
+struct cputrace_anchor {
+  const char* name;
+  pthread_mutex_t mutex[CPUTRACE_MAX_THREADS];
+  Arena* thread_arena[CPUTRACE_MAX_THREADS];
+  uint64_t global_sum[CPUTRACE_RESULT_COUNT];
+  uint64_t call_count;
+  uint64_t flags;
+};
+
+struct cputrace_profiler {
+  cputrace_anchor* anchors;
+  bool profiling;
+  pthread_mutex_t global_lock;
+};
+
+struct HW_conf {
+  bool capture_swi;
+  bool capture_cyc;
+  bool capture_cmiss;
+  bool capture_bmiss;
+  bool capture_ins;
+};
+
+struct HW_ctx {
+  int parent_fd;
+  int fd_swi;
+  int fd_cyc;
+  int fd_cmiss;
+  int fd_bmiss;
+  int fd_ins;
+  uint64_t id_swi;
+  uint64_t id_cyc;
+  uint64_t id_cmiss;
+  uint64_t id_bmiss;
+ 
uint64_t id_ins; + struct HW_conf conf; +}; + +class HW_profile { +public: + HW_profile(const char* function, uint64_t index, uint64_t flags); + ~HW_profile(); + +private: + const char* function; + uint64_t index; + uint64_t flags; + struct HW_ctx ctx; +}; + +void cputrace_start(ceph::Formatter* f); +void cputrace_stop(ceph::Formatter* f); +void cputrace_reset(ceph::Formatter* f); +void cputrace_dump(ceph::Formatter* f, const std::string& logger = "", const std::string& counter = ""); +void cputrace_flush_thread_start(); +void cputrace_flush_thread_stop(); \ No newline at end of file
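
Usage sketch (illustrative, not part of the patch): assuming a build
configured with -DWITH_CPUTRACE=1 and profiling switched on with
`ceph tell osd.X cputrace start`, a hot path could be annotated as below.
The include path, function name and flag selection are examples only.

  #include "common/cputrace.h"

  void example_op_handler() {
    // Counters are opened when `profile` is constructed; when it goes out of
    // scope they are read, summed into this anchor's totals, and closed.
    HWProfileFunctionF(profile, __FUNCTION__,
                       HW_PROFILE_CYC | HW_PROFILE_CMISS | HW_PROFILE_INS);
    // ... code being measured ...
  }

The accumulated totals and per-call averages then appear under the
"example_op_handler" section in the output of `ceph tell osd.X cputrace dump`.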