From: Igor Golikov Date: Tue, 30 Dec 2025 14:02:56 +0000 (+0000) Subject: mds: implement tracing infra X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=9da350deaeb89fa7075e09c99789d7757db1d5d1;p=ceph-ci.git mds: implement tracing infra implement OTEL ready tracing infra Fixes: https://tracker.ceph.com/issues/73701 Signed-off-by: Igor Golikov --- diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in index b7ec143ab6f..fe54d5b7345 100644 --- a/src/common/options/mds.yaml.in +++ b/src/common/options/mds.yaml.in @@ -1811,4 +1811,16 @@ options: default: 30 min: 30 services: - - mds \ No newline at end of file + - mds +- name: mds_trace_sliding_window_sec + type: secs + level: advanced + desc: MDS trace sliding window duration in seconds + long_desc: Duration in seconds to keep completed traces in memory for admin socket dump + default: 30 + min: 5 + max: 300 + services: + - mds + flags: + - runtime diff --git a/src/mds/CMakeLists.txt b/src/mds/CMakeLists.txt index f3980c7e04b..04d9d57ee37 100644 --- a/src/mds/CMakeLists.txt +++ b/src/mds/CMakeLists.txt @@ -45,6 +45,7 @@ set(mds_srcs QuiesceDbManager.cc QuiesceAgent.cc MDSRankQuiesce.cc + MDSTracer.cc ${CMAKE_SOURCE_DIR}/src/common/TrackedOp.cc ${CMAKE_SOURCE_DIR}/src/common/MemoryModel.cc ${CMAKE_SOURCE_DIR}/src/osdc/Journaler.cc diff --git a/src/mds/MDSTracer.cc b/src/mds/MDSTracer.cc new file mode 100644 index 00000000000..b9532232b1e --- /dev/null +++ b/src/mds/MDSTracer.cc @@ -0,0 +1,212 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 sts=2 expandtab + +#include "MDSTracer.h" +#include "common/ceph_context.h" +#include "common/config.h" +#include "common/debug.h" +#include "common/Formatter.h" + +#include +#include + +#define dout_context cct +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "mds.tracer " + +// TraceData implementation + +void TraceData::dump(ceph::Formatter* f) const +{ + f->open_object_section("trace"); + f->dump_string("trace_id", trace_id); + f->dump_string("name", name); + f->dump_stream("start_time") << start_time; + f->dump_stream("end_time") << end_time; + f->dump_float("duration_ms", duration_ms); + f->dump_int("result", result); + + f->open_object_section("attributes"); + for (const auto& [key, value] : attributes) { + f->dump_string(key.c_str(), value); + } + f->close_section(); // attributes + + // Child spans with hierarchy (span_id=0 means direct child of root) + f->open_array_section("spans"); + for (const auto& cs : child_spans) { + f->open_object_section("span"); + f->dump_unsigned("span_id", cs.span_id); + f->dump_unsigned("parent_span_id", cs.parent_span_id); + f->dump_string("name", cs.name); + f->dump_stream("start_time") << cs.start_time; + f->dump_stream("end_time") << cs.end_time; + f->dump_float("duration_ms", cs.duration_ms); + if (cs.async) { + f->dump_bool("async", true); + } + f->close_section(); + } + f->close_section(); // spans + + f->close_section(); // trace +} + +std::string TraceData::to_json() const +{ + std::ostringstream ss; + ss << "{\"trace_id\":\"" << trace_id << "\"" + << ",\"name\":\"" << name << "\"" + << ",\"start_time\":" << start_time.to_real_time() + << ",\"end_time\":" << end_time.to_real_time() + << ",\"duration_ms\":" << duration_ms + << ",\"result\":" << result + << ",\"attributes\":{"; + + bool first = true; + for (const auto& [key, value] : attributes) { + if (!first) ss << ","; + ss << "\"" << key << "\":\"" << value << "\""; + first = false; + } + ss << "},\"spans\":["; + + first = true; + for (const auto& cs : child_spans) { + if (!first) ss << ","; + ss << "{\"span_id\":" << cs.span_id + << ",\"parent_span_id\":" << cs.parent_span_id + << ",\"name\":\"" << cs.name << "\"" + << ",\"start_time\":" << cs.start_time.to_real_time() + << ",\"end_time\":" << cs.end_time.to_real_time() + << ",\"duration_ms\":" << cs.duration_ms; + if (cs.async) { + ss << ",\"async\":true"; + } + ss << "}"; + first = false; + } + ss << "]}"; + + return ss.str(); +} + +// MDSTracer implementation + +MDSTracer::MDSTracer(CephContext* cct_) + : cct(cct_), + completed_traces(30) // Default 30 seconds, will be updated in init() +{ +} + +void MDSTracer::init() +{ + ldout(cct, 5) << "initializing MDS tracer" << dendl; + tracer.init(cct, "ceph-mds"); + + // Set sliding window duration from config + auto window_duration = cct->_conf.get_val("mds_trace_sliding_window_sec"); + completed_traces.set_window_duration(window_duration.count()); + ldout(cct, 5) << "trace sliding window set to " << window_duration.count() << " seconds" << dendl; +} + +void MDSTracer::shutdown() +{ + ldout(cct, 5) << "shutting down MDS tracer" << dendl; + // SlidingWindowTracker will be cleaned up by destructor +} + +bool MDSTracer::is_enabled() const +{ + return tracer.is_enabled(); +} + +jspan_ptr MDSTracer::start_trace(const char* name) +{ + return tracer.start_trace(name); +} + +jspan_ptr MDSTracer::add_span(const char* name, const jspan_ptr& parent) +{ + return tracer.add_span(name, parent); +} + +jspan_ptr MDSTracer::add_span(const char* name, const jspan_context& parent_ctx) +{ + return tracer.add_span(name, parent_ctx); +} + +void MDSTracer::store_trace(const TraceData& trace) +{ + // Update window duration from config in case it changed + auto window_duration = cct->_conf.get_val("mds_trace_sliding_window_sec"); + completed_traces.set_window_duration(window_duration.count()); + + // Add trace to sliding window + completed_traces.add_value(trace); + + ldout(cct, 10) << "stored trace " << trace.trace_id + << " name=" << trace.name + << " duration=" << trace.duration_ms << "ms" + << " window_size=" << completed_traces.size() << dendl; +} + +void MDSTracer::dump_traces(ceph::Formatter* f) +{ + // Update window duration from config and prune old traces + auto window_duration = cct->_conf.get_val("mds_trace_sliding_window_sec"); + completed_traces.set_window_duration(window_duration.count()); + + f->open_object_section("trace_dump"); + f->dump_unsigned("count", completed_traces.size()); + f->dump_unsigned("window_sec", completed_traces.get_window_duration_sec()); + + f->open_array_section("traces"); + completed_traces.for_each_value([f](const TraceData& trace) { + trace.dump(f); + }); + f->close_section(); // traces + + f->close_section(); // trace_dump +} + +size_t MDSTracer::get_trace_count() const +{ + return completed_traces.size(); +} + +void MDSTracer::prune_old_traces() +{ + completed_traces.update(); +} + +std::string MDSTracer::get_trace_id(const jspan_ptr& span) +{ +#ifdef HAVE_JAEGER + if (span && span->IsRecording()) { + auto ctx = span->GetContext(); + char trace_id[32]; + ctx.trace_id().ToLowerBase16(trace_id); + return std::string(trace_id, 32); + } +#endif + return ""; +} + +void MDSTracer::trace_event(const char* name, double duration_ms, int result) +{ + if (!is_enabled()) { + return; + } + + TraceData trace; + trace.name = name; + trace.end_time = ceph_clock_now(); + trace.start_time = trace.end_time; + trace.start_time -= (duration_ms / 1000.0); // Subtract duration to get start time + trace.duration_ms = duration_ms; + trace.result = result; + + store_trace(trace); +} diff --git a/src/mds/MDSTracer.h b/src/mds/MDSTracer.h new file mode 100644 index 00000000000..de25586a7eb --- /dev/null +++ b/src/mds/MDSTracer.h @@ -0,0 +1,329 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 sts=2 expandtab + +#ifndef CEPH_MDS_TRACER_H +#define CEPH_MDS_TRACER_H + +#include "common/tracer.h" +#include "common/Clock.h" +#include "include/common_fwd.h" +#include "include/utime.h" +#include "mgr/MDSPerfMetricTypes.h" + +#include +#include +#include +#include + +namespace ceph { class Formatter; } + +/** + * TraceData - Completed trace information for sliding window storage + * + * Uses span_id/parent_span_id to track call hierarchy. + * span_id=0 is the root span. + */ +struct TraceData { + std::string trace_id; + std::string name; + utime_t start_time; + utime_t end_time; + double duration_ms = 0; + int result = 0; + std::map attributes; + + // Spans with hierarchy (parent_span_id=0 means direct child of root) + struct Span { + uint32_t span_id; // Unique ID within this trace + uint32_t parent_span_id; // 0 = direct child of root + std::string name; + utime_t start_time; + utime_t end_time; + double duration_ms; + bool async = false; // true for async spans (may outlive parent) + }; + std::vector child_spans; + + // Counter for generating span IDs (root span is 0) + // Note: Not thread-safe, but each MDRequest has its own TraceData + uint32_t next_span_id = 1; + + // Current span context for hierarchy tracking (0 = root) + uint32_t current_span_id = 0; + + // Pending spans for async operations (started but not yet completed) + struct PendingSpan { + uint32_t span_id; + uint32_t parent_span_id; + std::string name; + utime_t start_time; + }; + std::map pending_spans; + + // Get next span ID + uint32_t alloc_span_id() { return next_span_id++; } + + // Start an async/pending span - returns span_id to pass to end_pending_span + // Note: Async spans may outlive their parent span - this is valid and shows + // the causal relationship (who initiated the async work). + uint32_t start_pending_span(const char* name) { + PendingSpan ps; + ps.span_id = alloc_span_id(); + ps.parent_span_id = current_span_id; // Actual parent that started this + ps.name = name; + ps.start_time = ceph_clock_now(); + pending_spans[ps.span_id] = ps; + return ps.span_id; + } + + // End a pending span and add it to child_spans + void end_pending_span(uint32_t span_id) { + auto it = pending_spans.find(span_id); + if (it != pending_spans.end()) { + Span s; + s.span_id = it->second.span_id; + s.parent_span_id = it->second.parent_span_id; + s.name = it->second.name; + s.start_time = it->second.start_time; + s.end_time = ceph_clock_now(); + s.duration_ms = (s.end_time - s.start_time) * 1000.0; + s.async = true; // Mark as async span + child_spans.push_back(s); + pending_spans.erase(it); + } + } + + void dump(ceph::Formatter* f) const; + std::string to_json() const; +}; + +/** + * MDSTracer - Distributed tracing for MDS operations + * + * Follows the same ownership pattern as MetricsHandler: + * - Owned by MDSRank as a member + * - init()/shutdown() lifecycle + * + * Features: + * - Creates OTEL spans for request tracing + * - Stores completed traces in sliding window buffer + * - Supports admin socket dump via "trace dump" command + * - Trace ID correlation with log records + */ +class MDSTracer { +public: + // MDS-specific span attribute names + static constexpr const char* ATTR_OP_TYPE = "mds.op_type"; + static constexpr const char* ATTR_OP_NAME = "mds.op_name"; + static constexpr const char* ATTR_CLIENT_ID = "mds.client_id"; + static constexpr const char* ATTR_INODE = "mds.inode"; + static constexpr const char* ATTR_PATH = "mds.path"; + static constexpr const char* ATTR_PATH2 = "mds.path2"; + static constexpr const char* ATTR_REQID = "mds.reqid"; + static constexpr const char* ATTR_MDS_RANK = "mds.rank"; + static constexpr const char* ATTR_SESSION_ID = "mds.session_id"; + static constexpr const char* ATTR_RESULT = "mds.result"; + static constexpr const char* ATTR_INTERNAL_OP = "mds.internal_op"; + + MDSTracer(CephContext* cct); + ~MDSTracer() = default; + + // Lifecycle (called by MDSRank) + void init(); + void shutdown(); + + // Check if tracing is enabled + bool is_enabled() const; + + // Span creation + jspan_ptr start_trace(const char* name); + jspan_ptr add_span(const char* name, const jspan_ptr& parent); + jspan_ptr add_span(const char* name, const jspan_context& parent_ctx); + + // Store completed trace in sliding window + void store_trace(const TraceData& trace); + + // Dump all traces in sliding window + void dump_traces(ceph::Formatter* f); + + // Get trace count in window + size_t get_trace_count() const; + + // Prune old traces + void prune_old_traces(); + + // Get trace ID string from span (for log correlation) + static std::string get_trace_id(const jspan_ptr& span); + + // Create and store a simple trace for housekeeping operations + // (operations not tied to an MDRequest) + void trace_event(const char* name, double duration_ms, int result = 0); + +private: + CephContext* cct; + tracing::Tracer tracer; + + // Sliding window for completed traces (reuses generic SlidingWindowTracker) + SlidingWindowTracker completed_traces; +}; + +/** + * ScopedSpan - RAII helper for child spans with automatic duration tracking + * + * Creates a child span on construction, ends it on destruction. + * Duration is automatically captured by OTEL and recorded to TraceData. + * Supports hierarchical spans via span_id/parent_span_id tracking. + * + * Usage: + * void some_function(MDRequestRef& mdr) { + * ScopedSpan span(mds->tracer, "function_name", mdr->trace_span, &mdr->trace_data); + * // ... function body ... + * // Nested spans automatically get this span as parent: + * { + * ScopedSpan inner(mds->tracer, "inner_op", mdr->trace_span, &mdr->trace_data); + * } + * } // span ends automatically, duration recorded to both OTEL and trace_data + */ +class ScopedSpan { +public: + ScopedSpan(MDSTracer& tracer, const char* name, const jspan_ptr& parent, + TraceData* trace_data = nullptr) + : span_name(name), start_time(ceph_clock_now()), trace_data_ptr(trace_data) { + if (parent && parent->IsRecording()) { + span = tracer.add_span(name, parent); + } + // Track hierarchy in trace_data + if (trace_data_ptr) { + my_span_id = trace_data_ptr->alloc_span_id(); + parent_span_id = trace_data_ptr->current_span_id; + trace_data_ptr->current_span_id = my_span_id; + } + } + + ~ScopedSpan() { + if (span) { + span->End(); + } + // Auto-record to trace_data if provided + if (trace_data_ptr && !span_name.empty()) { + TraceData::Span s; + s.span_id = my_span_id; + s.parent_span_id = parent_span_id; + s.name = span_name; + s.start_time = start_time; + s.end_time = ceph_clock_now(); + s.duration_ms = (s.end_time - s.start_time) * 1000.0; + trace_data_ptr->child_spans.push_back(s); + // Restore parent context + trace_data_ptr->current_span_id = parent_span_id; + } + } + + // Add an event to this span + void AddEvent(const char* name) { + if (span && span->IsRecording()) { + span->AddEvent(name); + } + } + + // Set an attribute on this span + template + void SetAttribute(const char* key, const T& value) { + if (span && span->IsRecording()) { + span->SetAttribute(key, value); + } + } + + // Check if span is valid/recording + bool IsRecording() const { + return span && span->IsRecording(); + } + + // Get this span's ID (for manual hierarchy tracking) + uint32_t get_span_id() const { return my_span_id; } + + // Non-copyable + ScopedSpan(const ScopedSpan&) = delete; + ScopedSpan& operator=(const ScopedSpan&) = delete; + +private: + jspan_ptr span; + std::string span_name; + utime_t start_time; + TraceData* trace_data_ptr; + uint32_t my_span_id = 0; + uint32_t parent_span_id = 0; +}; + +/** + * ScopedTrace - RAII helper for standalone traces (housekeeping operations) + * + * For operations not tied to an MDRequest. Creates a complete trace + * and stores it in the sliding window on destruction. + * + * Usage: + * void housekeeping_function() { + * ScopedTrace trace(mds->tracer, "mds:housekeeping:trim_cache"); + * // ... do work ... + * } // trace stored automatically with duration + */ +class ScopedTrace { +public: + ScopedTrace(MDSTracer& tracer, const char* name) + : tracer_ref(tracer), trace_name(name), start_time(ceph_clock_now()) { + if (tracer.is_enabled()) { + span = tracer.start_trace(name); + } + } + + ~ScopedTrace() { + if (span) { + span->End(); + } + if (tracer_ref.is_enabled()) { + utime_t end_time = ceph_clock_now(); + double duration_ms = (end_time - start_time) * 1000.0; + + TraceData trace; + trace.trace_id = MDSTracer::get_trace_id(span); + trace.name = trace_name; + trace.start_time = start_time; + trace.end_time = end_time; + trace.duration_ms = duration_ms; + trace.result = result_code; + trace.attributes = attributes; + // No child spans for standalone traces (they are the root) + + tracer_ref.store_trace(trace); + } + } + + void set_result(int r) { result_code = r; } + + void set_attribute(const std::string& key, const std::string& value) { + attributes[key] = value; + if (span && span->IsRecording()) { + span->SetAttribute(key.c_str(), value); + } + } + + void add_event(const char* name) { + if (span && span->IsRecording()) { + span->AddEvent(name); + } + } + + // Non-copyable + ScopedTrace(const ScopedTrace&) = delete; + ScopedTrace& operator=(const ScopedTrace&) = delete; + +private: + MDSTracer& tracer_ref; + jspan_ptr span; + std::string trace_name; + utime_t start_time; + int result_code = 0; + std::map attributes; +}; + +#endif // CEPH_MDS_TRACER_H diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index e34cd264148..ca8bb706437 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -37,6 +37,7 @@ #include "common/StackStringStream.h" #include "common/TrackedOp.h" +#include "MDSTracer.h" class LogSegment; class BatchOp; @@ -492,6 +493,14 @@ struct MDRequestImpl : public MutationImpl { // referent straydn bool referent_straydn = false; + // -- tracing support -- + // Span for tracing this request through the MDS + jspan_ptr trace_span; + // Collected trace data for sliding window storage + TraceData trace_data; + // Pending span ID for async journal wait tracking + uint32_t journal_span_id = 0; + protected: void _dump(ceph::Formatter *f) const override { _dump(f, false); diff --git a/src/mgr/MDSPerfMetricTypes.h b/src/mgr/MDSPerfMetricTypes.h index aab533938e6..0cbda62cb31 100644 --- a/src/mgr/MDSPerfMetricTypes.h +++ b/src/mgr/MDSPerfMetricTypes.h @@ -527,6 +527,24 @@ public: auto duration = std::chrono::steady_clock::now() - data_points.back().timestamp; return std::chrono::duration_cast(duration).count(); } + + // Update the window duration (for dynamic config changes) + void set_window_duration(uint64_t window_duration_seconds) { + std::unique_lock lock(data_lock); + window_duration = std::chrono::seconds(window_duration_seconds); + prune_old_data(std::chrono::steady_clock::now()); + } + + uint64_t get_window_duration_sec() const { + std::shared_lock lock(data_lock); + return std::chrono::duration_cast(window_duration).count(); + } + + size_t size() const { + std::shared_lock lock(data_lock); + return data_points.size(); + } + private: void prune_old_data(TimePoint now) { TimePoint window_start = now - window_duration;