From 7b2a9141053fd639138ab5bc1db09c32e354cf12 Mon Sep 17 00:00:00 2001
From: Redouane Kachach <rkachach@ibm.com>
Date: Wed, 15 Apr 2026 18:05:36 +0200
Subject: [PATCH] qa: fix misleading "in cluster log" failures during cluster
 log scan
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Summary:

Fix misleading failure reasons reported as `"â¦ in cluster log"` when
no such log entry actually exists.

The cephadm task currently treats `grep` errors from the cluster log
scan as if they were actual log matches. This can produce bogus
failure summaries when `ceph.log` is missing, especially after early
failures such as image pull or bootstrap problems.

Problem:

first_in_ceph_log() currently:

- returns stdout if a match is found
- otherwise returns stderr

The caller then treats any non-None value as a real cluster log hit and formats it as:

"<value>" in cluster log

That means an error like:

  grep: /var/log/ceph/<fsid>/ceph.log: No such file or directory

can be misreported as if it came from the cluster log.

This change makes cluster log scanning robust and accurate by:

- checking whether /var/log/ceph/<fsid>/ceph.log exists before scanning
- using check_status=False for the grep pipeline
- treating only stdout as a real log match
- treating stderr as a scan error instead of log content
- avoiding overwrite of a more accurate pre-existing failure_reason
- reporting scan failures separately as cluster log scan failed

Fixes: https://tracker.ceph.com/issues/76051
Signed-off-by: Redouane Kachach <rkachach@ibm.com>
---
 qa/tasks/cephadm.py | 103 +++++++++++++++++++++++++++++++-------------
 1 file changed, 74 insertions(+), 29 deletions(-)
diff --git a/qa/tasks/cephadm.py b/qa/tasks/cephadm.py
index 445651ca9651..b868c37d4117 100644
--- a/qa/tasks/cephadm.py
+++ b/qa/tasks/cephadm.py
@@ -371,7 +371,6 @@ def ceph_log(ctx, config):
 
     update_archive_setting(ctx, 'log', '/var/log/ceph')
 
-
     try:
         yield
 
@@ -382,20 +381,37 @@ def ceph_log(ctx, config):
 
     finally:
         log.info('Checking cluster log for badness...')
+
+        log_path = '/var/log/ceph/{fsid}/ceph.log'.format(fsid=fsid)
+
+        def _ceph_log_exists():
+            try:
+                ctx.ceph[cluster_name].bootstrap_remote.run(
+                    args=['sudo', 'test', '-f', log_path],
+                )
+                return True
+            except CommandFailedError:
+                return False
+
         def first_in_ceph_log(pattern, excludes, only_match):
             """
-            Find the first occurrence of the pattern specified in the Ceph log,
-            Returns None if none found.
+            Find the first occurrence of the pattern specified in the Ceph log.
 
-            :param pattern: Pattern scanned for.
-            :param excludes: Patterns to ignore.
-            :return: First line of text (or None if not found)
+            Returns:
+                - matching line as a string if found
+                - None if the file does not exist or no match is found
+
+            Raises:
+                - CommandFailedError if the scan command itself fails unexpectedly
             """
+            if not _ceph_log_exists():
+                log.warning('Skipping cluster log scan: %s does not exist', log_path)
+                return None
+
             args = [
                 'sudo',
                 'grep', '-E', pattern,
-                '/var/log/ceph/{fsid}/ceph.log'.format(
-                    fsid=fsid),
+                log_path,
             ]
             if only_match:
                 args.extend([run.Raw('|'), 'grep', '-E', '|'.join(only_match)])
@@ -405,18 +421,33 @@ def ceph_log(ctx, config):
             args.extend([
                 run.Raw('|'), 'head', '-n', '1',
             ])
+
             r = ctx.ceph[cluster_name].bootstrap_remote.run(
                 stdout=BytesIO(),
                 args=args,
                 stderr=StringIO(),
+                check_status=False,
             )
-            stdout = r.stdout.getvalue().decode()
+
+            stdout = r.stdout.getvalue().decode(errors='replace')
+            stderr = r.stderr.getvalue().strip()
+            exitstatus = getattr(r, 'exitstatus', None)
+
             if stdout:
                 return stdout
-            stderr = r.stderr.getvalue()
-            if stderr:
-                return stderr
-            return None
+
+            # No stdout and no stderr means no match.
+            if not stderr:
+                return None
+
+            # stderr is a grep/pipeline execution problem, not a log match.
+            raise CommandFailedError(
+                'error scanning cluster log {}: {}{}'.format(
+                    log_path,
+                    stderr,
+                    '' if exitstatus is None else ' (exitstatus={})'.format(exitstatus),
+                )
+            )
 
         # NOTE: technically the first and third arg to first_in_ceph_log
         # are serving a similar purpose here of being something we
@@ -425,21 +456,32 @@ def ceph_log(ctx, config):
         # we match even if the test yaml specifies nothing else, and then the
         # log-only-match options are for when a test only wants to fail on
         # a specific subset of log lines that '\[ERR\]|\[WRN\]|\[SEC\]' matches
-        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
-                             config.get('log-ignorelist'),
-                             config.get('log-only-match')) is not None:
-            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
-            ctx.summary['success'] = False
-            # use the most severe problem as the failure reason
+        try:
+            if first_in_ceph_log(
+                r'\[ERR\]|\[WRN\]|\[SEC\]',
+                config.get('log-ignorelist'),
+                config.get('log-only-match'),
+            ) is not None:
+                log.warning('Found errors (ERR|WRN|SEC) in cluster log')
+                ctx.summary['success'] = False
+                # use the most severe problem as the failure reason
+                if 'failure_reason' not in ctx.summary:
+                    for pattern in [r'\[SEC\]', r'\[ERR\]', r'\[WRN\]']:
+                        match = first_in_ceph_log(
+                            pattern,
+                            config.get('log-ignorelist'),
+                            config.get('log-only-match'),
+                        )
+                        if match is not None:
+                            ctx.summary['failure_reason'] = \
+                                '"{match}" in cluster log'.format(
+                                    match=match.rstrip('\n'),
+                                )
+                            break
+        except CommandFailedError as e:
+            log.warning('Unable to scan cluster log safely: %s', e)
             if 'failure_reason' not in ctx.summary:
-                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
-                    match = first_in_ceph_log(pattern, config['log-ignorelist'], config.get('log-only-match'))
-                    if match is not None:
-                        ctx.summary['failure_reason'] = \
-                            '"{match}" in cluster log'.format(
-                                match=match.rstrip('\n'),
-                            )
-                        break
+                ctx.summary['failure_reason'] = 'cluster log scan failed'
 
         if ctx.archive is not None and \
                 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
@@ -487,8 +529,11 @@ def ceph_log(ctx, config):
                 except OSError:
                     pass
                 try:
-                    teuthology.pull_directory(remote, '/var/log/ceph',  # everything
-                                              os.path.join(sub, 'log'))
+                    teuthology.pull_directory(
+                        remote,
+                        '/var/log/ceph',
+                        os.path.join(sub, 'log'),
+                    )
                 except ReadError:
                     pass
 
-- 
2.47.3