qa: fix misleading "in cluster log" failures during cluster log scan

author Redouane Kachach <rkachach@ibm.com>

Wed, 15 Apr 2026 16:05:36 +0000 (18:05 +0200)

committer Redouane Kachach <rkachach@ibm.com>

Thu, 16 Apr 2026 08:05:11 +0000 (10:05 +0200)
author Redouane Kachach <rkachach@ibm.com>
Wed, 15 Apr 2026 16:05:36 +0000 (18:05 +0200)
committer Redouane Kachach <rkachach@ibm.com>
Thu, 16 Apr 2026 08:05:11 +0000 (10:05 +0200)
diff --git a/qa/tasks/cephadm.py b/qa/tasks/cephadm.py

index 445651ca9651faab0177e1b679a4dc620da5e1f0..b868c37d411708e33f4ec9b9edc69fbef29d5f0c 100644 (file)
--- a/qa/tasks/cephadm.py
+++ b/qa/tasks/cephadm.py
@@ -371,7 +371,6 @@ def ceph_log(ctx, config):
  
      update_archive_setting(ctx, 'log', '/var/log/ceph')
  
-
      try:
          yield
  
@@ -382,20 +381,37 @@ def ceph_log(ctx, config):
  
      finally:
          log.info('Checking cluster log for badness...')
+
+        log_path = '/var/log/ceph/{fsid}/ceph.log'.format(fsid=fsid)
+
+        def _ceph_log_exists():
+            try:
+                ctx.ceph[cluster_name].bootstrap_remote.run(
+                    args=['sudo', 'test', '-f', log_path],
+                )
+                return True
+            except CommandFailedError:
+                return False
+
          def first_in_ceph_log(pattern, excludes, only_match):
              """
-            Find the first occurrence of the pattern specified in the Ceph log,
-            Returns None if none found.
+            Find the first occurrence of the pattern specified in the Ceph log.
  
-            :param pattern: Pattern scanned for.
-            :param excludes: Patterns to ignore.
-            :return: First line of text (or None if not found)
+            Returns:
+                - matching line as a string if found
+                - None if the file does not exist or no match is found
+
+            Raises:
+                - CommandFailedError if the scan command itself fails unexpectedly
              """
+            if not _ceph_log_exists():
+                log.warning('Skipping cluster log scan: %s does not exist', log_path)
+                return None
+
              args = [
                  'sudo',
                  'grep', '-E', pattern,
-                '/var/log/ceph/{fsid}/ceph.log'.format(
-                    fsid=fsid),
+                log_path,
              ]
              if only_match:
                  args.extend([run.Raw('|'), 'grep', '-E', '|'.join(only_match)])
@@ -405,18 +421,33 @@ def ceph_log(ctx, config):
              args.extend([
                  run.Raw('|'), 'head', '-n', '1',
              ])
+
              r = ctx.ceph[cluster_name].bootstrap_remote.run(
                  stdout=BytesIO(),
                  args=args,
                  stderr=StringIO(),
+                check_status=False,
              )
-            stdout = r.stdout.getvalue().decode()
+
+            stdout = r.stdout.getvalue().decode(errors='replace')
+            stderr = r.stderr.getvalue().strip()
+            exitstatus = getattr(r, 'exitstatus', None)
+
              if stdout:
                  return stdout
-            stderr = r.stderr.getvalue()
-            if stderr:
-                return stderr
-            return None
+
+            # No stdout and no stderr means no match.
+            if not stderr:
+                return None
+
+            # stderr is a grep/pipeline execution problem, not a log match.
+            raise CommandFailedError(
+                'error scanning cluster log {}: {}{}'.format(
+                    log_path,
+                    stderr,
+                    '' if exitstatus is None else ' (exitstatus={})'.format(exitstatus),
+                )
+            )
  
          # NOTE: technically the first and third arg to first_in_ceph_log
          # are serving a similar purpose here of being something we
@@ -425,21 +456,32 @@ def ceph_log(ctx, config):
          # we match even if the test yaml specifies nothing else, and then the
          # log-only-match options are for when a test only wants to fail on
          # a specific subset of log lines that '\[ERR\]|\[WRN\]|\[SEC\]' matches
-        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
-                             config.get('log-ignorelist'),
-                             config.get('log-only-match')) is not None:
-            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
-            ctx.summary['success'] = False
-            # use the most severe problem as the failure reason
+        try:
+            if first_in_ceph_log(
+                r'\[ERR\]|\[WRN\]|\[SEC\]',
+                config.get('log-ignorelist'),
+                config.get('log-only-match'),
+            ) is not None:
+                log.warning('Found errors (ERR|WRN|SEC) in cluster log')
+                ctx.summary['success'] = False
+                # use the most severe problem as the failure reason
+                if 'failure_reason' not in ctx.summary:
+                    for pattern in [r'\[SEC\]', r'\[ERR\]', r'\[WRN\]']:
+                        match = first_in_ceph_log(
+                            pattern,
+                            config.get('log-ignorelist'),
+                            config.get('log-only-match'),
+                        )
+                        if match is not None:
+                            ctx.summary['failure_reason'] = \
+                                '"{match}" in cluster log'.format(
+                                    match=match.rstrip('\n'),
+                                )
+                            break
+        except CommandFailedError as e:
+            log.warning('Unable to scan cluster log safely: %s', e)
              if 'failure_reason' not in ctx.summary:
-                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
-                    match = first_in_ceph_log(pattern, config['log-ignorelist'], config.get('log-only-match'))
-                    if match is not None:
-                        ctx.summary['failure_reason'] = \
-                            '"{match}" in cluster log'.format(
-                                match=match.rstrip('\n'),
-                            )
-                        break
+                ctx.summary['failure_reason'] = 'cluster log scan failed'
  
          if ctx.archive is not None and \
                  not (ctx.config.get('archive-on-error') and ctx.summary['success']):
@@ -487,8 +529,11 @@ def ceph_log(ctx, config):
                  except OSError:
                      pass
                  try:
-                    teuthology.pull_directory(remote, '/var/log/ceph',  # everything
-                                              os.path.join(sub, 'log'))
+                    teuthology.pull_directory(
+                        remote,
+                        '/var/log/ceph',
+                        os.path.join(sub, 'log'),
+                    )
                  except ReadError:
                      pass
author	Redouane Kachach <rkachach@ibm.com>
	Wed, 15 Apr 2026 16:05:36 +0000 (18:05 +0200)
committer	Redouane Kachach <rkachach@ibm.com>
	Thu, 16 Apr 2026 08:05:11 +0000 (10:05 +0200)