]> git.apps.os.sepia.ceph.com Git - teuthology.git/commitdiff
Added Scraper Script from github.com/jcsp/scrape with tests 1432/head
author: grapheo12 <donofsuri@gmail.com>
Sun, 29 Mar 2020 16:56:33 +0000 (22:26 +0530)
committer: Shubham Mishra <smishra99.iitkgp@gmail.com>
Fri, 26 Jun 2020 14:25:28 +0000 (19:55 +0530)
Signed-off-by: Shubham Mishra <smishra99.iitkgp@gmail.com>
teuthology/results.py
teuthology/scrape.py [new file with mode: 0644]
teuthology/test/test_scrape.py [new file with mode: 0644]

index 2e16d764cb70c185966fa65da1f938d374b4d974..73953e02af93582c536fa62bf5bad33965657e31 100644 (file)
@@ -9,6 +9,7 @@ import teuthology
 from teuthology.config import config
 from teuthology import misc
 from teuthology.report import ResultsReporter
+from teuthology.scrape import Scraper
 
 log = logging.getLogger(__name__)
 
@@ -65,6 +66,7 @@ def results(archive_dir, name, email, timeout, dry_run):
 
     (subject, body) = build_email_body(name)
 
+    Scraper(archive_dir).analyze()
     if email and dry_run:
         print("From: %s" % (config.results_sending_email or 'teuthology'))
         print("To: %s" % email)
diff --git a/teuthology/scrape.py b/teuthology/scrape.py
new file mode 100644 (file)
index 0000000..f3b907f
--- /dev/null
@@ -0,0 +1,515 @@
+# Origin: https://github.com/jcsp/scrape/blob/master/scrape.py
+# Author: John Spray (github.com/jcsp)
+
+import difflib
+from errno import ENOENT
+from gzip import GzipFile
+import sys
+import os
+import yaml
+from collections import defaultdict
+import re
+import logging
+import subprocess
+
+import six
+
+log = logging.getLogger('scrape')
+log.addHandler(logging.StreamHandler())
+log.setLevel(logging.INFO)
+
+
+class Reason(object):
+    """Base class for inferred failure reasons.
+
+    Subclasses provide a ``description`` attribute or override
+    get_description(), and implement match(job).
+    """
+    def get_description(self):
+        # One-line human-readable summary of this failure reason.
+        return self.description
+
+    def get_detail(self):
+        # Optional multi-line detail (e.g. a backtrace); None by default.
+        return None
+
+
+def grep(path, expr):
+    """
+    Call out to native grep rather than feeding massive log files through python line by line
+    """
+    p = subprocess.Popen(["grep", expr, path], stdout=subprocess.PIPE)
+    p.wait()
+    out, err = p.communicate()
+    if p.returncode == 0:
+        return six.ensure_str(out).split("\n")
+    else:
+        return []
+
+
+class GenericReason(Reason):
+    """
+    A reason inferred from a Job: matches Jobs with an apparently-similar failure
+    """
+
+    def __init__(self, job, description=None):
+        self.failure_reason = job.get_failure_reason()
+        self.description = description
+
+        self.backtrace = job.get_backtrace()
+        if self.backtrace:
+            log.debug("Found a backtrace!\n{0}".format(self.backtrace))
+
+    def get_detail(self):
+        return self.backtrace
+
+    def get_description(self):
+        if self.description:
+            return self.description
+        else:
+            if self.backtrace:
+                return "Crash: {0}".format(self.failure_reason)
+            else:
+                return "Failure: {0}".format(self.failure_reason)
+
+    def match(self, job):
+        # I never match dead jobs
+        if job.get_failure_reason() is None:
+            return False
+
+        # If one has a backtrace but the other doesn't, we're a different thing even if the official
+        # failure_reason is the same
+        if (self.backtrace is None) != (job.get_backtrace() is None):
+            return False
+
+        # If we have the same backtrace, we're a match even if the teuthology failure_reason
+        # doesn't match (a crash is a crash, it can have different symptoms)
+        if self.backtrace:
+            ratio = difflib.SequenceMatcher(None, self.backtrace, job.get_backtrace()).ratio()
+            return ratio > 0.5
+        else:
+            if "Test failure:" in self.failure_reason:
+                return self.failure_reason == job.get_failure_reason()
+            elif re.search("workunit test (.*)\) on ", self.failure_reason):
+                workunit_name = re.search("workunit test (.*)\) on ", self.failure_reason).group(1)
+                other_match = re.search("workunit test (.*)\) on ", job.get_failure_reason())
+                return other_match is not None and workunit_name == other_match.group(1)
+            else:
+                reason_ratio = difflib.SequenceMatcher(None, self.failure_reason, job.get_failure_reason()).ratio()
+                return reason_ratio > 0.5
+
+
+class RegexReason(Reason):
+    """
+    A known reason matching a particular regex to failure reason
+    """
+
+    def __init__(self, regexes, description):
+        # `regexes` may be a single pattern string or a list of patterns
+        self.description = description
+        if isinstance(regexes, list):
+            self.regexes = regexes
+        else:
+            self.regexes = [regexes]
+
+    def match(self, job):
+        # I never match dead jobs
+        if job.get_failure_reason() is None:
+            return False
+
+        # Patterns are anchored at the start (re.match, not re.search)
+        for regex in self.regexes:
+            if re.match(regex, job.get_failure_reason()):
+                return True
+
+        return False
+
+
+class AssertionReason(Reason):
+    """Groups failures by the 'FAILED assert' line found in the job's logs."""
+
+    def __init__(self, job):
+        self.assertion = job.get_assertion()
+        self.backtrace = job.get_backtrace()
+
+    def get_description(self):
+        return "Assertion: {0}".format(self.assertion)
+
+    def get_detail(self):
+        return self.backtrace
+
+    @classmethod
+    def could_be(cls, job):
+        # Candidate whenever an assertion failure was found in the logs
+        return job.get_assertion() is not None
+
+    def match(self, job):
+        # Exact equality of the assertion text; backtraces not compared here
+        return self.assertion == job.get_assertion()
+
+
+class LockdepReason(AssertionReason):
+    """
+    Different to a normal assertion, because matches should not only
+    have the same assertion but the same backtrace (don't want to glob
+    all lockdep failures together if they are really being tripped in
+    different places)
+    """
+    @classmethod
+    def could_be(cls, job):
+        # Must first look like an assertion failure at all
+        if not super(LockdepReason, cls).could_be(job):
+            return False
+
+        return "common/lockdep" in job.get_assertion()
+
+    def get_description(self):
+        return "Lockdep: {0}".format(self.assertion)
+
+    def match(self, job):
+        # Assertion text must match (base class check) ...
+        if not super(LockdepReason, self).match(job):
+            return False
+
+        # ... and the backtraces must also be >50% similar (difflib ratio)
+        if self.backtrace:
+            if job.get_backtrace():
+                ratio = difflib.SequenceMatcher(None, self.backtrace, job.get_backtrace()).ratio()
+                return ratio > 0.5
+            else:
+                return False
+        else:
+            # No backtrace to compare about, allow matches based purely on assertion
+            return True
+
+
+class DeadReason(Reason):
+    """
+    A reason for picking up jobs with no summary.yaml
+    """
+    def __init__(self, job):
+        self.description = "Dead"
+        # Last line of teuthology.log (bytes from `tail`), or None
+        self.last_tlog_line = job.get_last_tlog_line()
+        self.backtrace = job.get_backtrace()
+
+    def get_description(self):
+        return "Dead: {0}".format(self.last_tlog_line)
+
+    def get_detail(self):
+        return self.backtrace
+
+    @classmethod
+    def could_be(cls, job):
+        # A job is "dead" when it produced no summary.yaml at all
+        return job.summary_data is None
+
+    def match(self, job):
+        if job.summary_data:
+            return False
+
+        if self.backtrace:
+            if job.get_backtrace():
+                # We both have backtrace: use that to decide if we're the same
+                ratio = difflib.SequenceMatcher(None, self.backtrace, job.get_backtrace()).ratio()
+                return ratio > 0.5
+            else:
+                # I have BT but he doesn't, so we're different
+                return False
+
+        # NOTE(review): if exactly one of the two last-lines is None,
+        # SequenceMatcher will raise a TypeError below — confirm both sides
+        # are always str/bytes or both None.
+        if self.last_tlog_line or job.get_last_tlog_line():
+            ratio = difflib.SequenceMatcher(None, self.last_tlog_line,
+                                            job.get_last_tlog_line()).ratio()
+            return ratio > 0.5
+        return True
+
+
+class TimeoutReason(Reason):
+    """Matches jobs whose workunit was killed by `timeout` (exit status 124)."""
+
+    def __init__(self, job):
+        self.timeout, self.command = self.get_timeout(job)
+
+    def get_description(self):
+        return "Timeout {0} running {1}".format(
+            self.timeout, self.command
+        )
+
+    @classmethod
+    def could_be(cls, job):
+        return cls.get_timeout(job) is not None
+
+    @classmethod
+    def get_timeout(cls, job):
+        """Return (timeout, command) parsed from the failure reason, or None."""
+        if job.get_failure_reason() is None:
+            return None
+
+        match = re.search("status 124:.* timeout ([^ ]+) ([^']+)'", job.get_failure_reason())
+        if not match:
+            return
+
+        timeout, bin_path = match.groups()
+
+        # Given a path like /home/ubuntu/cephtest/workunit.client.0/cephtool/test.sh
+        # ... strip it down to cephtool/test.sh
+        parts = bin_path.split(os.path.sep)
+        parts.reverse()
+        rparts = []
+        for p in parts:
+            if 'workunit.' in p or 'cephtest' in p:
+                break
+            else:
+                rparts.append(p)
+        rparts.reverse()
+        command = os.path.sep.join(rparts)
+
+        return timeout, command
+
+    def match(self, job):
+        return self.get_timeout(job) == (self.timeout, self.command)
+
+# Skip parsing teuthology.log files larger than this many bytes
+MAX_TEUTHOLOGY_LOG = 1024 * 1024 * 100
+# Skip parsing per-service (e.g. OSD) log files larger than this many bytes
+MAX_SVC_LOG = 100 * 1024 * 1024
+# Backtraces longer than this are assumed to be mis-parses and dropped
+MAX_BT_LINES = 100
+
+
+class Job(object):
+    def __init__(self, path, job_id):
+        self.path = path
+        self.job_id = job_id
+
+        try:
+            self.config = yaml.safe_load(open(os.path.join(self.path, "config.yaml"), 'r'))
+            self.description = self.config['description']
+            assert self.description
+        except IOError:
+            self.config = None
+            self.description = None
+
+        summary_path = os.path.join(self.path, "summary.yaml")
+        try:
+            self.summary_data = yaml.safe_load(open(summary_path, 'r'))
+        except IOError:
+            self.summary_data = None
+
+        self.backtrace = None
+        self.assertion = None
+        self.populated = False
+
+    def get_success(self):
+        if self.summary_data:
+            return self.summary_data['success']
+        else:
+            return False
+
+    def get_failure_reason(self):
+        if self.summary_data:
+            return self.summary_data['failure_reason']
+        else:
+            return None
+
+    def get_last_tlog_line(self):
+        t_path = os.path.join(self.path, "teuthology.log")
+        if not os.path.exists(t_path):
+            return None
+        else:
+            out, err = subprocess.Popen(["tail", "-n", "1", t_path], stdout=subprocess.PIPE).communicate()
+            return out.strip()
+
+    def _search_backtrace(self, file_obj):
+        bt_lines = []
+        assertion = None
+        for line in file_obj:
+            # Log prefix from teuthology.log
+            if ".stderr:" in line:
+                line = line.split(".stderr:")[1]
+
+            if "FAILED assert" in line:
+                assertion = line.strip()
+
+            if line.startswith(" ceph version"):
+                # The start of a backtrace!
+                bt_lines = [line]
+            elif line.startswith(" NOTE: a copy of the executable"):
+                # The backtrace terminated, if we have a buffer return it
+                if len(bt_lines):
+                    return ("".join(bt_lines)).strip(), assertion
+                else:
+                    log.warning("Saw end of BT but not start")
+            elif bt_lines:
+                # We're in a backtrace, push the line onto the list
+                if len(bt_lines) > MAX_BT_LINES:
+                    # Something wrong with our parsing, drop it
+                    log.warning("Ignoring malparsed backtrace: {0}".format(
+                        ", ".join(bt_lines[0:3])
+                    ))
+                    bt_lines = []
+                bt_lines.append(line)
+
+        return None, assertion
+
+    def get_assertion(self):
+        if not self.populated:
+            self._populate_backtrace()
+        return self.assertion
+
+    def get_backtrace(self):
+        if not self.populated:
+            self._populate_backtrace()
+        return self.backtrace
+
+    def _populate_backtrace(self):
+        tlog_path = os.path.join(self.path, "teuthology.log")
+        try:
+            s = os.stat(tlog_path)
+        except OSError:
+            log.warning("Missing teuthology log {0}".format(tlog_path))
+            return None
+        size = s.st_size
+        if size > MAX_TEUTHOLOGY_LOG:
+            log.debug("Ignoring teuthology log for job {0}, it is {1} bytes".format(self.job_id, size))
+            return None
+
+        self.backtrace, self.assertion = self._search_backtrace(open(tlog_path))
+        if self.backtrace:
+            return
+
+        for line in grep(tlog_path, "command crashed with signal"):
+            log.debug("Found a crash indication: {0}".format(line))
+            # tasks.ceph.osd.1.plana82.stderr
+            match = re.search("tasks.ceph.([^\.]+).([^\.]+).([^\.]+).stderr", line)
+            if not match:
+                log.warning("Not-understood crash indication {0}".format(line))
+                continue
+            svc, svc_id, hostname = match.groups()
+            gzipped_log_path = os.path.join(
+                self.path, "remote", hostname, "log", "ceph-{0}.{1}.log.gz".format(svc, svc_id))
+
+            try:
+                s = os.stat(gzipped_log_path)
+            except OSError as e:
+                if e.errno == ENOENT:
+                    log.warning("Missing log {0}".format(gzipped_log_path))
+                    continue
+                else:
+                    raise
+
+            size = s.st_size
+            if size > MAX_SVC_LOG:
+                log.warning("Not checking for backtrace from {0}:{1}.{2} log, too large ({3})".format(
+                    hostname, svc, svc_id, size
+                ))
+                continue
+
+            bt, ass = self._search_backtrace(GzipFile(gzipped_log_path))
+            if ass and not self.assertion:
+                self.assertion = ass
+            if bt:
+                self.backtrace = bt
+                return
+
+        return None
+
+
+class ValgrindReason(Reason):
+    """Groups failures by the set of valgrind violations seen per service type."""
+
+    def __init__(self, job):
+        assert self.could_be(job)
+        self.service_types = self._get_service_types(job)
+
+    def _get_service_types(self, job):
+        """
+        Get dict mapping service type 'osd' etc to sorted list of violation types 'Leak_PossiblyLost' etc
+        """
+
+        result = defaultdict(list)
+        # Lines like:
+        # 2014-08-22T20:07:18.668 ERROR:tasks.ceph:saw valgrind issue   <kind>Leak_DefinitelyLost</kind> in /var/log/ceph/valgrind/osd.3.log.gz
+        for line in grep(os.path.join(job.path, "teuthology.log"), "</kind> in "):
+            match = re.search("<kind>(.+)</kind> in .+/(.+)", line)
+            if not match:
+                log.warning("Misunderstood line: {0}".format(line))
+                continue
+            err_typ, log_basename = match.groups()
+            # Service type is the log basename prefix, e.g. 'osd' in 'osd.3.log.gz'
+            svc_typ = six.ensure_str(log_basename).split(".")[0]
+            if err_typ not in result[svc_typ]:
+                result[svc_typ].append(err_typ)
+                result[svc_typ] = sorted(result[svc_typ])
+
+        return dict(result)
+
+    def get_description(self):
+        desc_bits = []
+        for service, types in list(self.service_types.items()):
+            desc_bits.append("{0} ({1})".format(service, ", ".join(types)))
+        return "Valgrind: " + ", ".join(desc_bits)
+
+    @classmethod
+    def could_be(cls, job):
+        return job.get_failure_reason() is not None and "saw valgrind issues" in job.get_failure_reason()
+
+    def match(self, job):
+        # Same per-service violation sets => same reason
+        return self._get_service_types(job) == self.service_types
+
+
+# Hand-curated reasons checked before any inferred reason; extend this list
+# as new well-understood failure modes are identified.
+known_reasons = [
+    # If the failure reason indicates no packages found...
+    RegexReason(["Failed to fetch package version from http://",
+                 "Command failed on .* with status 100: 'sudo apt-get update"]
+                , "Missing packages"),
+]
+
+
+def give_me_a_reason(job):
+    """
+    If no existing reasons match the job, generate the most specific reason we can
+
+    Always returns a Reason instance; GenericReason is the fallback.
+    """
+
+    # Note: because we match known reasons, including GenericReasons, before any of
+    # the Timeout/Valgrind whatever, even if a run is a timeout or a valgrind failure,
+    # it will get matched up with a backtrace or assertion if one is there, hiding
+    # the valgrind/timeout aspect.
+
+    for r in known_reasons:
+        if r.match(job):
+            return r
+
+    # NB ordering matters, LockdepReason must come before AssertionReason
+    for klass in [DeadReason, LockdepReason, AssertionReason, TimeoutReason, ValgrindReason]:
+        if klass.could_be(job):
+            return klass(job)
+
+    return GenericReason(job)
+
+
+class Scraper(object):
+    def __init__(self, target_dir):
+        self.target_dir = target_dir
+        log.addHandler(logging.FileHandler(os.path.join(target_dir,
+                                                     "scrape.log")))
+
+    def analyze(self):
+        entries = os.listdir(self.target_dir)
+        jobs = []
+        for entry in entries:
+            job_dir = os.path.join(self.target_dir, entry)
+            if os.path.isdir(job_dir):
+                jobs.append(Job(job_dir, entry))
+
+        log.info("Found {0} jobs".format(len(jobs)))
+
+        passes = []
+        reasons = defaultdict(list)
+
+        for job in jobs:
+            if job.get_success():
+                passes.append(job)
+                continue
+
+            matched = False
+            for reason, reason_jobs in reasons.items():
+                if reason.match(job):
+                    reason_jobs.append(job)
+                    matched = True
+                    break
+
+            if not matched:
+                reasons[give_me_a_reason(job)].append(job)
+
+        log.info("Found {0} distinct failure reasons".format(len(reasons)))
+        for reason, jobs in list(reasons.items()):
+            job_spec = "{0} jobs: {1}".format(len(jobs), [j.job_id for j in jobs]) if len(jobs) < 30 else "{0} jobs".format(len(jobs))
+            log.info(reason.get_description())
+            detail = reason.get_detail()
+            if detail:
+                log.info(detail)
+            log.info(job_spec)
+            suites = [set(j.description.split()) for j in jobs if j.description != None]
+            if len(suites) > 1:
+                log.info("suites intersection: {0}".format(sorted(set.intersection(*suites))))
+                log.info("suites union: {0}".format(sorted(set.union(*suites))))
+            elif len(suites) == 1:
+                log.info("suites: {0}".format(sorted(suites[0])))
+            log.info("")
+
+if __name__ == '__main__':
+    # CLI usage: scrape.py <archive_dir>
+    Scraper(sys.argv[1]).analyze()
diff --git a/teuthology/test/test_scrape.py b/teuthology/test/test_scrape.py
new file mode 100644 (file)
index 0000000..f8a0352
--- /dev/null
@@ -0,0 +1,167 @@
+from __future__ import with_statement
+
+import os
+import shutil
+import tempfile
+import yaml
+from teuthology import scrape
+
+class FakeResultDir(object):
+    """Mocks a Result Directory"""
+
+    def __init__(self,
+                failure_reason="Dummy reason",
+                assertion="FAILED assert 1 == 2\n",
+                blank_backtrace=False
+    ):
+        # failure_reason: written to summary.yaml's failure_reason field
+        # assertion: line embedded in the fake teuthology.log backtrace
+        # blank_backtrace: if True, omit the backtrace start marker entirely
+        self.failure_reason = failure_reason
+        self.assertion = assertion
+        self.blank_backtrace = blank_backtrace
+        self.path = tempfile.mkdtemp()
+
+        with open(os.path.join(self.path, "config.yaml"), "w") as f:
+            yaml.dump({"description": "Dummy test"}, f)
+
+        with open(os.path.join(self.path, "summary.yaml"), "w") as f:
+            yaml.dump({
+                "success": "false",
+                "failure_reason": self.failure_reason
+            }, f)
+
+        with open(os.path.join(self.path, "teuthology.log"), "w") as f:
+            if not self.blank_backtrace:
+                f.write(" ceph version 1000\n")
+                f.write(".stderr: Dummy error\n")
+                f.write(self.assertion)
+            f.write(" NOTE: a copy of the executable dummy text\n")
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_typ, exc_val, exc_tb):
+        # Remove the temporary directory on context exit
+        shutil.rmtree(self.path)
+
+class TestScrape(object):
+    """Tests for teuthology.scrape"""
+
+    def test_grep(self):
+        """scrape.grep returns matching lines, or [] when nothing matches."""
+        with FakeResultDir() as d:
+            filepath = os.path.join(d.path, "scrapetest.txt")
+            with open(filepath, 'w') as f:
+                f.write("Ceph is an open-source software storage platform\n\
+                        Teuthology is used for testing.")
+
+            #System level grep is called
+            value1 = scrape.grep(filepath, "software")
+            value2 = scrape.grep(filepath, "device")
+
+            assert value1 ==\
+                ['Ceph is an open-source software storage platform', '']
+            assert value2 == []
+
+    def test_job(self):
+        """Job parses summary/config/teuthology.log from the fake dir."""
+        with FakeResultDir() as d:
+            job = scrape.Job(d.path, 1)
+            assert job.get_success() == "false"
+            assert job.get_assertion() == "FAILED assert 1 == 2"
+            assert job.get_last_tlog_line() ==\
+                b"NOTE: a copy of the executable dummy text"
+            assert job.get_failure_reason() == "Dummy reason"
+
+    def test_timeoutreason(self):
+        """TimeoutReason recognises and self-matches a 'status 124' failure."""
+        with FakeResultDir(failure_reason=\
+            "status 124:  timeout '123 /home/ubuntu/cephtest/workunit.client.0/cephtool/test.sh'") as d:
+            job = scrape.Job(d.path, 1)
+            assert scrape.TimeoutReason.could_be(job)
+            assert scrape.TimeoutReason(job).match(job)
+
+    def test_deadreason(self):
+        """A job with a summary.yaml cannot be classified as Dead."""
+        with FakeResultDir() as d:
+            job = scrape.Job(d.path, 1)
+            #Summary is present
+            #So this cannot be a DeadReason
+            assert not scrape.DeadReason.could_be(job)
+
+    def test_lockdepreason(self):
+        """LockdepReason matches identical backtraces, rejects blank ones."""
+        lkReason = None
+        with FakeResultDir(assertion=\
+            "FAILED assert common/lockdep reason\n") as d:
+            job = scrape.Job(d.path, 1)
+            assert scrape.LockdepReason.could_be(job)
+
+            lkReason = scrape.LockdepReason(job)
+            #Backtraces of same jobs must match 100%
+            assert lkReason.match(job)
+        with FakeResultDir(blank_backtrace=True) as d:
+            #Corresponding to 0% match
+            assert not lkReason.match(scrape.Job(d.path, 2))
+
+    def test_assertionreason(self):
+        """A 'FAILED assert' line makes the job an AssertionReason candidate."""
+        with FakeResultDir() as d:
+            job = scrape.Job(d.path, 1)
+            assert scrape.AssertionReason.could_be(job)
+
+    def test_genericreason(self):
+        """GenericReason fuzzy-matches similar failure reasons only."""
+        d1 = FakeResultDir(blank_backtrace=True)
+        d2 = FakeResultDir(failure_reason="Dummy dummy")
+        d3 = FakeResultDir()
+
+        job1 = scrape.Job(d1.path, 1)
+        job2 = scrape.Job(d2.path, 2)
+        job3 = scrape.Job(d3.path, 3)
+
+        reason = scrape.GenericReason(job3)
+
+        assert reason.match(job2)
+        assert not reason.match(job1)
+
+        shutil.rmtree(d1.path)
+        shutil.rmtree(d2.path)
+        shutil.rmtree(d3.path)
+
+    def test_valgrindreason(self):
+        """ValgrindReason parses <kind>...</kind> violations from the log."""
+        vreason = None
+        with FakeResultDir(
+            failure_reason="saw valgrind issues",
+            assertion="2014-08-22T20:07:18.668 ERROR:tasks.ceph:saw valgrind issue   <kind>Leak_DefinitelyLost</kind> in /var/log/ceph/valgrind/osd.3.log.gz\n"
+        ) as d:
+            job = scrape.Job(d.path, 1)
+            assert scrape.ValgrindReason.could_be(job)
+
+            vreason = scrape.ValgrindReason(job)
+            assert vreason.match(job)
+
+    def test_give_me_a_reason(self):
+        """give_me_a_reason picks the most specific reason (lockdep first)."""
+        with FakeResultDir() as d:
+            job = scrape.Job(d.path, 1)
+
+            assert type(scrape.give_me_a_reason(job)) == scrape.AssertionReason
+
+        #Test the lockdep ordering
+        with FakeResultDir(assertion=\
+        "FAILED assert common/lockdep reason\n") as d:
+            job = scrape.Job(d.path, 1)
+            assert type(scrape.give_me_a_reason(job)) == scrape.LockdepReason
+
+    def test_scraper(self):
+        """Scraper.analyze runs end-to-end and writes scrape.log."""
+        d = FakeResultDir()
+        os.mkdir(os.path.join(d.path, "test"))
+        shutil.move(
+            os.path.join(d.path, "config.yaml"),
+            os.path.join(d.path, "test", "config.yaml")
+        )
+        shutil.move(
+            os.path.join(d.path, "summary.yaml"),
+            os.path.join(d.path, "test", "summary.yaml")
+        )
+        shutil.move(
+            os.path.join(d.path, "teuthology.log"),
+            os.path.join(d.path, "test", "teuthology.log")
+        )
+
+        scrape.Scraper(d.path).analyze()
+
+        #scrape.log should be created
+        assert os.path.exists(os.path.join(d.path, "scrape.log"))
+
+        shutil.rmtree(d.path)