ansible: Try to summarize failure logs

author Zack Cerza <zack@redhat.com>

Wed, 5 Jul 2023 21:12:05 +0000 (15:12 -0600)

committer Zack Cerza <zack@redhat.com>

Wed, 5 Jul 2023 23:45:23 +0000 (17:45 -0600)
author Zack Cerza <zack@redhat.com>
Wed, 5 Jul 2023 21:12:05 +0000 (15:12 -0600)
committer Zack Cerza <zack@redhat.com>
Wed, 5 Jul 2023 23:45:23 +0000 (17:45 -0600)
diff --git a/teuthology/task/ansible.py b/teuthology/task/ansible.py

index 23453c1238033f6a2343ab9a478f2ac398031d50..1d487ab502ce1ead4a4d8ed7cfb6c570abf2c2dd 100644 (file)
--- a/teuthology/task/ansible.py
+++ b/teuthology/task/ansible.py
@@ -1,5 +1,6 @@
  import json
  import logging
+import re
  import requests
  import os
  import pexpect
@@ -36,6 +37,60 @@ class LoggerFile(object):
          pass
  
  
+class FailureAnalyzer:
+    """Condense an ansible failure log into a set of short summary lines."""
+
+    def analyze(self, failure_log):
+        """Return a set of summary lines for a YAML failure log string."""
+        failure_obj = yaml.safe_load(failure_log)
+        lines = set()
+        # Only a mapping can be summarized; else let the caller use the raw log
+        if not isinstance(failure_obj, dict):
+            return lines
+        for host_obj in failure_obj.values():
+            lines.update(self.analyze_host_record(host_obj))
+        return lines
+
+    def analyze_host_record(self, record):
+        """Return a list of unique summary lines for one host's record."""
+        lines = set()
+        for result in record.get("results", [record]):
+            cmd = result.get("cmd", "")
+            # CPAN failures emit far too much stderr to reduce meaningfully,
+            # so just report the command that failed.
+            if "cpan" in cmd:
+                lines.add(f"CPAN command failed: {cmd}")
+                continue
+            # Look "msg" up lazily: records with stderr_lines may lack it.
+            lines_to_analyze = result.get("stderr_lines")
+            if lines_to_analyze is None:
+                lines_to_analyze = result.get("msg", "").split("\n")
+            lines.update(filter(None, map(self.analyze_line, lines_to_analyze)))
+        return list(lines)
+
+    def analyze_line(self, line):
+        """Return a normalized form of line, or "" if it is just noise."""
+        # apt warnings ("W: ") and suggestions (ending in "?") are not helpful.
+        if line.startswith("W: ") or line.endswith("?"):
+            return ""
+
+        # Next, we can normalize some common phrases.
+        phrases = [
+            "connection timed out",
+            r"(unable to|could not) connect to [^ ]+",
+            r"temporary failure resolving [^ ]+",
+        ]
+        for phrase in phrases:
+            match = re.search(rf"({phrase})", line, flags=re.IGNORECASE)
+            if match:
+                line = match.groups()[0]
+                break
+
+        # Strip out URLs for specific packages, then "[IP: a.b.c.d port]" tags
+        line = re.sub(r"https?://.*\.(deb|rpm)", "<package>", line)
+        return re.sub(r"\[IP: \d+\.\d+\.\d+\.\d+( \d+)?\]", "", line)
+
+
  class Ansible(Task):
      """
      A task to run ansible playbooks
@@ -303,17 +358,20 @@ class Ansible(Task):
      def _handle_failure(self, command, status):
          self._set_status('dead')
          failures = None
-        with open(self.failure_log.name, 'r') as fail_log:
+        with open(self.failure_log.name, 'r') as fail_log_file:
+            fail_log = fail_log_file.read()
              try:
-                failures = yaml.safe_load(fail_log)
+                analyzer = FailureAnalyzer()
+                failures = analyzer.analyze(fail_log)
              except yaml.YAMLError as e:
                  log.error(
                      "Failed to parse ansible failure log: {0} ({1})".format(
                          self.failure_log.name, e
                      )
                  )
-                fail_log.seek(0)
-                failures = fail_log.read().replace('\n', '')
+            # If we hit an exception, or if analyze() returned nothing, use the log as-is
+            if not failures:
+                failures = fail_log.replace('\n', '')
  
          if failures:
              self._archive_failures()
diff --git a/teuthology/test/task/test_ansible.py b/teuthology/test/task/test_ansible.py

index 5daf55cdc0fc77a5ffcc1b2c2ce555277922676e..939ec3f938d63b0ba05a1b0f98e6cd4ca5e84568 100644 (file)
--- a/teuthology/test/task/test_ansible.py
+++ b/teuthology/test/task/test_ansible.py
@@ -15,11 +15,32 @@ from teuthology.exceptions import CommandFailedError
  from teuthology.orchestra.cluster import Cluster
  from teuthology.orchestra.remote import Remote
  from teuthology.task import ansible
-from teuthology.task.ansible import Ansible, CephLab
+from teuthology.task.ansible import Ansible, CephLab, FailureAnalyzer
  
  from teuthology.test.task import TestTask
  
  
+class TestFailureAnalyzer:
+    """Check that analyze_line() condenses apt fetch errors as expected."""
+    klass = FailureAnalyzer
+
+    @mark.parametrize(
+        'line,result',
+        [
+            (
+                "E: Failed to fetch http://security.ubuntu.com/ubuntu/pool/main/a/apache2/apache2-bin_2.4.41-4ubuntu3.14_amd64.deb  Unable to connect to archive.ubuntu.com:http:",
+                "Unable to connect to archive.ubuntu.com:http:",
+            ),
+            (
+                "E: Failed to fetch http://archive.ubuntu.com/ubuntu/pool/main/libb/libb-hooks-op-check-perl/libb-hooks-op-check-perl_0.22-1build2_amd64.deb  Temporary failure resolving 'archive.ubuntu.com'",
+                "Temporary failure resolving 'archive.ubuntu.com'",
+            ),
+        ]
+    )
+    def test_lines(self, line, result):
+        assert self.klass().analyze_line(line) == result
+
+
  class TestAnsibleTask(TestTask):
      klass = Ansible
      task_name = 'ansible'
author	Zack Cerza <zack@redhat.com>
	Wed, 5 Jul 2023 21:12:05 +0000 (15:12 -0600)
committer	Zack Cerza <zack@redhat.com>
	Wed, 5 Jul 2023 23:45:23 +0000 (17:45 -0600)
teuthology/task/ansible.py		patch \| blob \| history
teuthology/test/task/test_ansible.py		patch \| blob \| history