]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
docs: add tracing guidelines wip-igolikov-trace-infra-73701-new 66768/head
author Igor Golikov <igolikov@redhat.com>
Tue, 30 Dec 2025 17:43:11 +0000 (17:43 +0000)
committer Igor Golikov <igolikov@redhat.com>
Tue, 6 Jan 2026 16:38:20 +0000 (16:38 +0000)
add tracing usage guide to the docs

Fixes: https://tracker.ceph.com/issues/73701
Signed-off-by: Igor Golikov <igolikov@redhat.com>
doc/cephfs/mds-config-ref.rst
doc/cephfs/troubleshooting.rst
qa/tasks/cephfs/test_misc.py

index 8e759f7e53f594afb36872aeaa2cb1a27bc4bb66..5f58cd4cc724a8712fe51918599692b353efdec0 100644 (file)
@@ -67,4 +67,5 @@
 .. confval:: mds_min_caps_per_client
 .. confval:: mds_symlink_recovery
 .. confval:: mds_extraordinary_events_dump_interval
+.. confval:: mds_trace_sliding_window_sec
 .. confval:: subv_metrics_window_interval
index 846a3cdd5d64d852e21d37581182fbd66af21585..608d19fcc1998924180b01144cc69632580e79f9 100644 (file)
@@ -62,6 +62,102 @@ If there are no slow requests reported on the MDS, and there is no indication
 that clients are misbehaving, then either there is a problem with the client
 or the client's requests are not reaching the MDS.
 
+.. _mds_request_tracing:
+
+MDS Request Tracing
+-------------------
+
+When OpenTelemetry tracing is enabled (via ``jaeger_tracing_enable``), the MDS
+captures detailed hierarchical traces of client request processing. These
+traces help identify performance bottlenecks by showing time spent in each
+phase of request handling.
+
+Dumping Traces
+^^^^^^^^^^^^^^
+
+Use the ``trace dump`` admin socket command to retrieve recent traces:
+
+.. prompt:: bash #
+
+   ceph daemon mds.<name> trace dump
+
+The MDS maintains a sliding window of completed traces in memory. Dumping the
+traces clears the window. The window duration, in seconds, is controlled by
+:confval:`mds_trace_sliding_window_sec`.
+
+Example output::
+
+    {
+        "trace_id": "89a470995cd418dad345a74a78357005",
+        "name": "mds:client_request",
+        "start_time": "2025-12-10T18:07:08.496362+0000",
+        "end_time": "2025-12-10T18:07:09.932454+0000",
+        "duration_ms": 1436.09,
+        "result": 0,
+        "attributes": {
+            "mds.client_id": "4158",
+            "mds.op_name": "unlink",
+            "mds.reqid": "client.4158:37"
+        },
+        "spans": [
+            {
+                "span_id": "a1b2c3d4e5f60718",
+                "parent_span_id": "",
+                "name": "handle_unlink",
+                "duration_ms": 4.27
+            },
+            {
+                "span_id": "0f1e2d3c4b5a6978",
+                "parent_span_id": "a1b2c3d4e5f60718",
+                "name": "path_traverse",
+                "duration_ms": 0.58
+            }
+        ]
+    }
+
+Interpreting Traces
+^^^^^^^^^^^^^^^^^^^
+
+Each trace represents a single client request and contains:
+
+* **trace_id**: Unique identifier for the trace (correlates with Jaeger if enabled)
+* **name**: Always ``mds:client_request`` for MDS request traces
+* **duration_ms**: Total request processing time in milliseconds
+* **result**: Return code (0 = success)
+* **attributes**: Request metadata (client ID, operation name, request ID, path)
+* **spans**: Hierarchical breakdown of processing phases
+
+Spans show the call hierarchy via ``parent_span_id``. Root spans have an empty
+``parent_span_id``. Spans marked with ``"async": true`` represent asynchronous
+operations (such as journal commits) that may outlive their parent span.
+
+Common spans include:
+
+* ``handle_*``: Top-level request handlers (handle_unlink, handle_open, etc.)
+* ``path_traverse``: Directory path resolution
+* ``acquire_locks``: Metadata lock acquisition
+* ``journal_wait``: Waiting for journal commit (async)
+
+If span durations do not add up to the total request duration, the gap
+represents uninstrumented code or time spent waiting for async operations.
+
+Enabling OpenTelemetry Integration
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To enable full distributed tracing with Jaeger:
+
+.. prompt:: bash #
+
+   ceph config set global jaeger_tracing_enable true
+
+When enabled, trace IDs and span IDs will match those exported to Jaeger,
+allowing correlation between MDS traces and the distributed tracing backend.
+
+.. note::
+
+   The ``trace dump`` command only returns traces when ``jaeger_tracing_enable``
+   is set to ``true``. When it is disabled, tracing is a no-op.
+
 
 .. _cephfs_dr_stuck_during_recovery:
 
index 8f82200b93cc7580b0eda339f7bec29d6b480cfc..bf77fc52da9b6a00616550a389fb5000c8d828c4 100644 (file)
@@ -1167,7 +1167,7 @@ class TestMDSTrace(CephFSTestCase):
                 # If has parent, verify parent_span_id format
                 if 'parent_span_id' in span and span['parent_span_id']:
                     self.assertEqual(len(span['parent_span_id']), 16,
-                                     f"parent_span_id should be 16 hex chars")
+                                     "parent_span_id should be 16 hex chars")
 
             # All span_ids should be unique within a trace
             self.assertEqual(len(span_ids), len(spans),