From 1b027751976b45e4b066387bcfba79769a340b42 Mon Sep 17 00:00:00 2001 From: Adam King Date: Mon, 8 Apr 2024 14:27:26 -0400 Subject: [PATCH] qa/tasks/cephadm: add option to limit what matches in log error scraping This is specifically being added with the orch/cephadm suite in mind, where coming up with a viable ignorelist has proved difficult. The orch testing does a lot of actions that can cause things like an OSD or MON daemon to be down very briefly, and I've found the vast majority of the time we really don't want to fail the test when these pop up as cephadm testing really only benefits from catching the CEPHADM_ errors/warnings rather than every possible one. Rather than continuing to play whack-a-mole with the errors in the logs, this patch should allow us to limit what we fail on to at least get the suite in a good spot again. We can always phase out the uses of this new "log-only-match" option later in a more controlled way, and adding it shouldn't affect log scraping for any of the tests that aren't facing a similar issue. Signed-off-by: Adam King (cherry picked from commit 443c5913c5247349845a860dba8f6b69def5de75) --- qa/tasks/cephadm.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/qa/tasks/cephadm.py b/qa/tasks/cephadm.py index f60aabc84359d..913920520ecc4 100644 --- a/qa/tasks/cephadm.py +++ b/qa/tasks/cephadm.py @@ -345,7 +345,7 @@ def ceph_log(ctx, config): finally: log.info('Checking cluster log for badness...') - def first_in_ceph_log(pattern, excludes): + def first_in_ceph_log(pattern, excludes, only_match): """ Find the first occurrence of the pattern specified in the Ceph log, Returns None if none found. 
@@ -360,6 +360,8 @@ def ceph_log(ctx, config): '/var/log/ceph/{fsid}/ceph.log'.format( fsid=fsid), ] + if only_match: + args.extend([run.Raw('|'), 'egrep', '|'.join(only_match)]) if excludes: for exclude in excludes: args.extend([run.Raw('|'), 'egrep', '-v', exclude]) @@ -375,14 +377,22 @@ def ceph_log(ctx, config): return stdout return None + # NOTE: technically the first and third arg to first_in_ceph_log + # are serving a similar purpose here of being something we + # look for in the logs. The reason they are separate args is that + # we want '\[ERR\]|\[WRN\]|\[SEC\]' to always have to be in the thing + # we match even if the test yaml specifies nothing else, and then the + # log-only-match options are for when a test only wants to fail on + # a specific subset of log lines that '\[ERR\]|\[WRN\]|\[SEC\]' matches if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]', - config.get('log-ignorelist')) is not None: + config.get('log-ignorelist'), + config.get('log-only-match')) is not None: log.warning('Found errors (ERR|WRN|SEC) in cluster log') ctx.summary['success'] = False # use the most severe problem as the failure reason if 'failure_reason' not in ctx.summary: for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']: - match = first_in_ceph_log(pattern, config['log-ignorelist']) + match = first_in_ceph_log(pattern, config['log-ignorelist'], config.get('log-only-match')) if match is not None: ctx.summary['failure_reason'] = \ '"{match}" in cluster log'.format( -- 2.39.5