]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
qa/tasks/cephadm: add option to limit what matches in log error scraping
authorAdam King <adking@redhat.com>
Mon, 8 Apr 2024 18:27:26 +0000 (14:27 -0400)
committerAdam King <adking@redhat.com>
Mon, 15 Apr 2024 14:31:10 +0000 (10:31 -0400)
This is specifically being added with the orch/cephadm suite
in mind, where coming up with a viable ignorelist has proved
difficult. The orch testing does a lot of actions that can
cause things like an OSD or MON daemon to be down very
briefly, and I've found the vast majority of the time we
really don't want to fail the test when these pop up as cephadm
testing really only benefits from catching the CEPHADM_ errors/
warnings rather than every possible one. Rather than continuing to
play whack-a-mole with the errors in the logs, this
patch should allow us to limit what we fail on to at
least get the suite in a good spot again. We can always
phase out the uses of this new "log-only-match" option
later in a more controlled way, and adding it shouldn't
affect log scraping for any of the tests that aren't
facing a similar issue.

Signed-off-by: Adam King <adking@redhat.com>
(cherry picked from commit 443c5913c5247349845a860dba8f6b69def5de75)

qa/tasks/cephadm.py

index f60aabc84359d3bd24071c775af9fb1820de3dd4..913920520ecc46982970a82d49dfc91816001976 100644 (file)
@@ -345,7 +345,7 @@ def ceph_log(ctx, config):
 
     finally:
         log.info('Checking cluster log for badness...')
-        def first_in_ceph_log(pattern, excludes):
+        def first_in_ceph_log(pattern, excludes, only_match):
             """
             Find the first occurrence of the pattern specified in the Ceph log,
             Returns None if none found.
@@ -360,6 +360,8 @@ def ceph_log(ctx, config):
                 '/var/log/ceph/{fsid}/ceph.log'.format(
                     fsid=fsid),
             ]
+            if only_match:
+                args.extend([run.Raw('|'), 'egrep', '|'.join(only_match)])
             if excludes:
                 for exclude in excludes:
                     args.extend([run.Raw('|'), 'egrep', '-v', exclude])
@@ -375,14 +377,22 @@ def ceph_log(ctx, config):
                 return stdout
             return None
 
+        # NOTE: technically the first and third arg to first_in_ceph_log
+        # are serving a similar purpose here of being something we
+        # look for in the logs. The reason they are separate args is that
+        # we want '\[ERR\]|\[WRN\]|\[SEC\]' to always have to be in the thing
+        # we match even if the test yaml specifies nothing else, and then the
+        # log-only-match options are for when a test only wants to fail on
+        # a specific subset of log lines that '\[ERR\]|\[WRN\]|\[SEC\]' matches
         if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
-                             config.get('log-ignorelist')) is not None:
+                             config.get('log-ignorelist'),
+                             config.get('log-only-match')) is not None:
             log.warning('Found errors (ERR|WRN|SEC) in cluster log')
             ctx.summary['success'] = False
             # use the most severe problem as the failure reason
             if 'failure_reason' not in ctx.summary:
                 for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
-                    match = first_in_ceph_log(pattern, config['log-ignorelist'])
+                    match = first_in_ceph_log(pattern, config['log-ignorelist'], config.get('log-only-match'))
                     if match is not None:
                         ctx.summary['failure_reason'] = \
                             '"{match}" in cluster log'.format(