# From: Igor Golikov
# Date: Tue, 30 Dec 2025 14:03:13 +0000 (+0000)
# Subject: test: add tests for MDS tracing infra
# X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=b213eb0d6ebb9e7ffbb599ef816d8aca9d56221a;p=ceph-ci.git
#
# Fixes: https://tracker.ceph.com/issues/73701
# Signed-off-by: Igor Golikov
#
# Added to qa/tasks/cephfs/test_misc.py, after class TestNewFSCreation.


class TestMDSTrace(CephFSTestCase):
    """
    Tests for MDS tracing functionality via admin socket 'trace dump' command.

    Note: Most tests require OTEL/Jaeger tracing to be enabled at build time.
    When tracing is disabled, all trace operations are no-ops.
    """
    MDSS_REQUIRED = 1
    CLIENTS_REQUIRED = 1

    # Value the tests put mds_trace_sliding_window_sec back to when done.
    DEFAULT_TRACE_WINDOW_SEC = 10

    # Characters accepted in trace/span identifiers (lowercase hex only).
    HEX_DIGITS = '0123456789abcdef'

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------

    def _is_tracing_enabled(self):
        """
        Check if OTEL tracing is enabled by generating I/O and dumping traces.

        Returns True only when the dump contains at least one trace.
        """
        # Generate some I/O
        self.mount_a.run_shell(["mkdir", "-p", "trace_check_dir"])
        self.mount_a.run_shell(["rm", "-rf", "trace_check_dir"])
        # Check if traces were generated
        result = self.fs.mds_asok(['trace', 'dump'])
        # Always hand back a real bool rather than whatever object the
        # short-circuit evaluation happened to stop on.
        return bool(result) and bool(result.get('traces'))

    def _skip_if_tracing_disabled(self):
        """Skip the calling test if OTEL tracing is not enabled."""
        if not self._is_tracing_enabled():
            self.skipTest("MDS tracing is not enabled (requires OTEL/Jaeger)")

    def _get_traces(self):
        """
        Dump traces from the MDS via admin socket and return the 'traces' list.

        Fails the test if the dump is missing or has no 'traces' key.
        """
        result = self.fs.mds_asok(['trace', 'dump'])
        self.assertIsNotNone(result)
        self.assertIn('traces', result)
        return result['traces']

    def _set_trace_window(self, seconds):
        """Set the trace sliding window duration (in seconds) on the MDS."""
        self.fs.mds_asok(['config', 'set', 'mds_trace_sliding_window_sec',
                          str(seconds)])

    def _set_tracing_enabled(self, enabled):
        """Set jaeger_tracing_enable on the MDS to 'true' or 'false'."""
        self.fs.mds_asok(['config', 'set', 'jaeger_tracing_enable',
                          'true' if enabled else 'false'])

    def _generate_io(self):
        """Generate some I/O to produce traces."""
        self.mount_a.run_shell(["mkdir", "-p", "trace_test_dir"])
        self.mount_a.run_shell(["touch", "trace_test_dir/file1"])
        self.mount_a.run_shell(["ls", "trace_test_dir"])
        self.mount_a.run_shell(["cat", "trace_test_dir/file1"])
        self.mount_a.run_shell(["rm", "-rf", "trace_test_dir"])

    def _assert_hex_id(self, value, length, what):
        """
        Assert that `value` is exactly `length` lowercase hex characters.

        `what` names the field (e.g. 'span_id') for the failure message.
        """
        self.assertEqual(len(value), length,
                         f"{what} should be {length} hex chars: {value}")
        self.assertTrue(all(c in self.HEX_DIGITS for c in value),
                        f"{what} should contain only hex digits: {value}")

    # ------------------------------------------------------------------
    # tests
    # ------------------------------------------------------------------

    def test_trace_dump_command(self):
        """
        Test that trace dump command returns valid JSON structure.
        This test runs regardless of whether OTEL is enabled.
        """
        result = self.fs.mds_asok(['trace', 'dump'])
        self.assertIsNotNone(result)
        for key in ('traces', 'count', 'window_sec'):
            self.assertIn(key, result)
        self.assertIsInstance(result['traces'], list)

    def test_trace_toggle_on_off(self):
        """
        Test tracing behavior when toggling jaeger_tracing_enable on and off.

        This test verifies:
        1. When tracing is OFF: no traces are generated
        2. When tracing is ON: traces are generated for the window duration
        3. When tracing is OFF again: no new traces, old ones expire

        Requires Jaeger support to be compiled in (HAVE_JAEGER). Without it
        the test only checks that every dump stays empty.
        """
        window_sec = 3
        self._set_trace_window(window_sec)

        try:
            # Step 1: Disable tracing, verify no traces
            self._set_tracing_enabled(False)
            # Clear any existing traces
            self._get_traces()

            self.mount_a.run_shell(["mkdir", "-p", "toggle_test_1"])
            self.mount_a.run_shell(["rm", "-rf", "toggle_test_1"])

            traces_off_count = len(self._get_traces())
            log.info(f"Tracing OFF: {traces_off_count} traces")

            # Step 2: Enable tracing, verify traces appear
            self._set_tracing_enabled(True)

            self.mount_a.run_shell(["mkdir", "-p", "toggle_test_2"])
            self.mount_a.run_shell(["touch", "toggle_test_2/file"])
            self.mount_a.run_shell(["rm", "-rf", "toggle_test_2"])

            traces_on_count = len(self._get_traces())
            log.info(f"Tracing ON: {traces_on_count} traces")

            # Step 3: Disable tracing again, verify no new traces
            self._set_tracing_enabled(False)

            # Wait for any in-flight requests to complete and clear
            # leftover traces
            time.sleep(1)
            self._get_traces()  # Clear any traces from in-flight requests

            self.mount_a.run_shell(["mkdir", "-p", "toggle_test_3"])
            self.mount_a.run_shell(["rm", "-rf", "toggle_test_3"])

            traces_off_again_count = len(self._get_traces())
            log.info(f"Tracing OFF again: {traces_off_again_count} traces")

            # Verify behavior based on whether Jaeger is compiled in
            if traces_on_count == 0:
                # Jaeger not compiled in - all trace counts should be 0
                self.assertEqual(traces_off_count, 0,
                                 "Without Jaeger, no traces when OFF")
                self.assertEqual(traces_off_again_count, 0,
                                 "Without Jaeger, no traces when OFF again")
                log.info("Jaeger not compiled in - tracing is always no-op")
            else:
                # Jaeger compiled in - verify toggle behavior.
                # traces_on_count > 0 is already guaranteed by this branch.
                self.assertEqual(traces_off_count, 0,
                                 "With tracing OFF, no traces should be generated")
                self.assertEqual(traces_off_again_count, 0,
                                 "With tracing OFF again, no new traces should be generated")
                log.info("Jaeger compiled in - toggle behavior verified")
        finally:
            # Leave the MDS in the same state as a successful run would
            # (tracing off, window reset) even when an assertion failed
            # part-way through.
            # NOTE(review): the pre-test value of jaeger_tracing_enable is
            # not restored; confirm no later test relies on it being on.
            self._set_tracing_enabled(False)
            self._set_trace_window(self.DEFAULT_TRACE_WINDOW_SEC)

    def test_trace_dump_basic(self):
        """
        Test that trace dump returns valid JSON with expected structure.
        Requires OTEL to be enabled.
        """
        self._skip_if_tracing_disabled()
        self._generate_io()

        traces = self._get_traces()
        self.assertIsInstance(traces, list)
        self.assertGreater(len(traces), 0, "Expected at least one trace after I/O")

        # Verify trace structure
        trace = traces[0]
        for key in ('trace_id', 'name', 'start_time', 'end_time',
                    'duration_ms', 'attributes'):
            self.assertIn(key, trace)

        # Verify trace_id format (32 hex chars)
        self._assert_hex_id(trace['trace_id'], 32, 'trace_id')

    def test_trace_dump_clears_window(self):
        """
        Test that traces are cleared after dump (clear-on-dump semantics).
        Requires OTEL to be enabled.
        """
        self._skip_if_tracing_disabled()
        self._generate_io()

        # First dump should have traces
        traces_first = self._get_traces()
        self.assertGreater(len(traces_first), 0, "Expected traces after I/O")

        # Second dump immediately after should be empty (cleared on first dump)
        traces_second = self._get_traces()
        self.assertEqual(len(traces_second), 0,
                         "Expected empty traces after dump (clear-on-dump)")

        # Generate more I/O
        self._generate_io()

        # Should have new traces
        traces_third = self._get_traces()
        self.assertGreater(len(traces_third), 0,
                           "Expected new traces after more I/O")

    def test_trace_sliding_window_expiry(self):
        """
        Test that traces expire after the sliding window duration.
        Requires OTEL to be enabled.
        """
        self._skip_if_tracing_disabled()
        # Set a short window for testing
        window_sec = 2
        self._set_trace_window(window_sec)

        try:
            # Traces should exist immediately after I/O. The dump done by
            # _skip_if_tracing_disabled() already emptied the buffer, so
            # whatever we see here comes from this I/O.
            self._generate_io()
            traces = self._get_traces()
            self.assertGreater(len(traces), 0,
                               "Expected traces immediately after I/O")

            # The dump above cleared the buffer; generate fresh traces and
            # let them age past the window.
            self._generate_io()
            time.sleep(window_sec + 1)

            # No further I/O is generated here: the dump itself is expected
            # to return nothing older than the sliding window.
            traces_after = self._get_traces()
            self.assertEqual(len(traces_after), 0,
                             "Expected traces to expire after sliding window")
        finally:
            # Reset to default even if an assertion above failed
            self._set_trace_window(self.DEFAULT_TRACE_WINDOW_SEC)

    def test_trace_span_hierarchy(self):
        """
        Test that spans inside a trace are well-formed.

        For every trace that carries spans this checks the required keys,
        the span_id / parent_span_id format (16 hex chars) and that span
        ids are unique within the trace.
        Requires OTEL to be enabled.
        """
        self._skip_if_tracing_disabled()
        # Generate I/O that produces nested spans
        self.mount_a.run_shell(["mkdir", "-p", "hierarchy_test/subdir"])
        self.mount_a.run_shell(["touch", "hierarchy_test/subdir/file"])
        self.mount_a.run_shell(["rm", "-rf", "hierarchy_test"])

        traces = self._get_traces()
        self.assertGreater(len(traces), 0)

        traces_with_spans = 0
        for trace in traces:
            spans = trace.get('spans')
            if not spans:
                continue
            traces_with_spans += 1

            span_ids = set()
            for span in spans:
                for key in ('span_id', 'name', 'start_time', 'end_time',
                            'duration_ms'):
                    self.assertIn(key, span)

                # Verify span_id format (16 hex chars for OTEL span ID)
                self._assert_hex_id(span['span_id'], 16, 'span_id')
                span_ids.add(span['span_id'])

                # If has parent, verify parent_span_id format.
                # NOTE(review): membership of the parent in span_ids is not
                # asserted because the parent may be a span that is not
                # listed under 'spans' - confirm against the dump format.
                parent_id = span.get('parent_span_id')
                if parent_id:
                    self._assert_hex_id(parent_id, 16, 'parent_span_id')

            # All span_ids should be unique within a trace
            self.assertEqual(len(span_ids), len(spans),
                             "All span_ids should be unique within a trace")

        if traces_with_spans == 0:
            # Make a vacuous pass visible in the log.
            log.info("No trace carried spans - span checks were not exercised")

    def test_trace_attributes(self):
        """
        Test that trace attributes contain expected MDS operation metadata.
        Requires OTEL to be enabled.
        """
        self._skip_if_tracing_disabled()
        # Create a file to generate a traced operation
        test_path = "attr_test_file"
        self.mount_a.run_shell(["touch", test_path])
        self.mount_a.run_shell(["rm", test_path])

        traces = self._get_traces()
        self.assertGreater(len(traces), 0)

        # Only the first mds:client_request trace is inspected
        client_request = next(
            (t for t in traces if t.get('name') == 'mds:client_request'),
            None)
        self.assertTrue(client_request is not None,
                        "Expected at least one mds:client_request trace")

        attrs = client_request.get('attributes', {})

        # Verify expected attribute keys
        for key in ('mds.op_name', 'mds.rank', 'mds.reqid'):
            self.assertIn(key, attrs)

        # Verify attribute values are non-empty
        self.assertTrue(attrs['mds.op_name'])
        self.assertTrue(attrs['mds.reqid'])

    def test_trace_async_spans(self):
        """
        Test that async spans (like journal_wait) are marked with async flag.
        Requires OTEL to be enabled.
        """
        self._skip_if_tracing_disabled()
        # Operations that trigger journaling will have async spans
        self.mount_a.run_shell(["mkdir", "async_test_dir"])
        self.mount_a.run_shell(["touch", "async_test_dir/file"])
        self.mount_a.run_shell(["rm", "-rf", "async_test_dir"])

        traces = self._get_traces()
        self.assertGreater(len(traces), 0)

        # Look for async spans
        journal_wait_seen = 0
        for trace in traces:
            for span in trace.get('spans') or []:
                if span.get('name') != 'journal_wait':
                    continue
                journal_wait_seen += 1
                # journal_wait should have async flag
                self.assertTrue(span.get('async', False),
                                "journal_wait span should have async=true")

        if journal_wait_seen == 0:
            # Make a vacuous pass visible in the log.
            log.info("No journal_wait span found - async flag was not checked")