From c366d686b9466c0bf265833a16af5035fea32143 Mon Sep 17 00:00:00 2001 From: grapheo12 Date: Sun, 29 Mar 2020 22:26:33 +0530 Subject: [PATCH] Added Scraper Script from github.com/jcsp/scrape with tests Signed-off-by: Shubham Mishra --- teuthology/results.py | 2 + teuthology/scrape.py | 515 +++++++++++++++++++++++++++++++++ teuthology/test/test_scrape.py | 167 +++++++++++ 3 files changed, 684 insertions(+) create mode 100644 teuthology/scrape.py create mode 100644 teuthology/test/test_scrape.py diff --git a/teuthology/results.py b/teuthology/results.py index 2e16d764cb..73953e02af 100644 --- a/teuthology/results.py +++ b/teuthology/results.py @@ -9,6 +9,7 @@ import teuthology from teuthology.config import config from teuthology import misc from teuthology.report import ResultsReporter +from teuthology.scrape import Scraper log = logging.getLogger(__name__) @@ -65,6 +66,7 @@ def results(archive_dir, name, email, timeout, dry_run): (subject, body) = build_email_body(name) + Scraper(archive_dir).analyze() if email and dry_run: print("From: %s" % (config.results_sending_email or 'teuthology')) print("To: %s" % email) diff --git a/teuthology/scrape.py b/teuthology/scrape.py new file mode 100644 index 0000000000..f3b907fba6 --- /dev/null +++ b/teuthology/scrape.py @@ -0,0 +1,515 @@ +# Origin: https://github.com/jcsp/scrape/blob/master/scrape.py +# Author: John Spray (github.com/jcsp) + +import difflib +from errno import ENOENT +from gzip import GzipFile +import sys +import os +import yaml +from collections import defaultdict +import re +import logging +import subprocess + +import six + +log = logging.getLogger('scrape') +log.addHandler(logging.StreamHandler()) +log.setLevel(logging.INFO) + + +class Reason(object): + def get_description(self): + return self.description + + def get_detail(self): + return None + + +def grep(path, expr): + """ + Call out to native grep rather than feeding massive log files through python line by line + """ + p = subprocess.Popen(["grep", expr, path], stdout=subprocess.PIPE) + p.wait() + out, err = p.communicate() + if p.returncode == 0: + return six.ensure_str(out).split("\n") + else: + return [] + + +class GenericReason(Reason): + """ + A reason inferred from a Job: matches Jobs with an apparently-similar failure + """ + + def __init__(self, job, description=None): + self.failure_reason = job.get_failure_reason() + self.description = description + + self.backtrace = job.get_backtrace() + if self.backtrace: + log.debug("Found a backtrace!\n{0}".format(self.backtrace)) + + def get_detail(self): + return self.backtrace + + def get_description(self): + if self.description: + return self.description + else: + if self.backtrace: + return "Crash: {0}".format(self.failure_reason) + else: + return "Failure: {0}".format(self.failure_reason) + + def match(self, job): + # I never match dead jobs + if job.get_failure_reason() is None: + return False + + # If one has a backtrace but the other doesn't, we're a different thing even if the official + # failure_reason is the same + if (self.backtrace is None) != (job.get_backtrace() is None): + return False + + # If we have the same backtrace, we're a match even if the teuthology failure_reason + # doesn't match (a crash is a crash, it can have different symptoms) + if self.backtrace: + ratio = difflib.SequenceMatcher(None, self.backtrace, job.get_backtrace()).ratio() + return ratio > 0.5 + else: + if "Test failure:" in self.failure_reason: + return self.failure_reason == job.get_failure_reason() + elif re.search("workunit test (.*)\) on ", self.failure_reason): + workunit_name = re.search("workunit test (.*)\) on ", self.failure_reason).group(1) + other_match = re.search("workunit test (.*)\) on ", job.get_failure_reason()) + return other_match is not None and workunit_name == other_match.group(1) + else: + reason_ratio = difflib.SequenceMatcher(None, self.failure_reason, job.get_failure_reason()).ratio() + return reason_ratio > 0.5 + + +class RegexReason(Reason): + """ + A known reason matching a particular regex to failure reason + """ + + def __init__(self, regexes, description): + self.description = description + if isinstance(regexes, list): + self.regexes = regexes + else: + self.regexes = [regexes] + + def match(self, job): + # I never match dead jobs + if job.get_failure_reason() is None: + return False + + for regex in self.regexes: + if re.match(regex, job.get_failure_reason()): + return True + + return False + + +class AssertionReason(Reason): + def __init__(self, job): + self.assertion = job.get_assertion() + self.backtrace = job.get_backtrace() + + def get_description(self): + return "Assertion: {0}".format(self.assertion) + + def get_detail(self): + return self.backtrace + + @classmethod + def could_be(cls, job): + return job.get_assertion() is not None + + def match(self, job): + return self.assertion == job.get_assertion() + + +class LockdepReason(AssertionReason): + """ + Different to a normal assertion, because matches should not only + have the same assertion but the same backtrace (don't want to glob + all lockdep failures together if they are really being tripped in + different places) + """ + @classmethod + def could_be(cls, job): + if not super(LockdepReason, cls).could_be(job): + return False + + return "common/lockdep" in job.get_assertion() + + def get_description(self): + return "Lockdep: {0}".format(self.assertion) + + def match(self, job): + if not super(LockdepReason, self).match(job): + return False + + if self.backtrace: + if job.get_backtrace(): + ratio = difflib.SequenceMatcher(None, self.backtrace, job.get_backtrace()).ratio() + return ratio > 0.5 + else: + return False + else: + # No backtrace to compare about, allow matches based purely on assertion + return True + + +class DeadReason(Reason): + """ + A reason for picking up jobs with no summary.yaml + """ + def __init__(self, job): + self.description = "Dead" + self.last_tlog_line = job.get_last_tlog_line() + self.backtrace = job.get_backtrace() + + def get_description(self): + return "Dead: {0}".format(self.last_tlog_line) + + def get_detail(self): + return self.backtrace + + @classmethod + def could_be(cls, job): + return job.summary_data is None + + def match(self, job): + if job.summary_data: + return False + + if self.backtrace: + if job.get_backtrace(): + # We both have backtrace: use that to decide if we're the same + ratio = difflib.SequenceMatcher(None, self.backtrace, job.get_backtrace()).ratio() + return ratio > 0.5 + else: + # I have BT but he doesn't, so we're different + return False + + if self.last_tlog_line or job.get_last_tlog_line(): + ratio = difflib.SequenceMatcher(None, self.last_tlog_line, + job.get_last_tlog_line()).ratio() + return ratio > 0.5 + return True + + +class TimeoutReason(Reason): + def __init__(self, job): + self.timeout, self.command = self.get_timeout(job) + + def get_description(self): + return "Timeout {0} running {1}".format( + self.timeout, self.command + ) + + @classmethod + def could_be(cls, job): + return cls.get_timeout(job) is not None + + @classmethod + def get_timeout(cls, job): + if job.get_failure_reason() is None: + return None + + match = re.search("status 124:.* timeout ([^ ]+) ([^']+)'", job.get_failure_reason()) + if not match: + return + + timeout, bin_path = match.groups() + + # Given a path like /home/ubuntu/cephtest/workunit.client.0/cephtool/test.sh + # ... strip it down to cephtool/test.sh + parts = bin_path.split(os.path.sep) + parts.reverse() + rparts = [] + for p in parts: + if 'workunit.' in p or 'cephtest' in p: + break + else: + rparts.append(p) + rparts.reverse() + command = os.path.sep.join(rparts) + + return timeout, command + + def match(self, job): + return self.get_timeout(job) == (self.timeout, self.command) + +MAX_TEUTHOLOGY_LOG = 1024 * 1024 * 100 +MAX_SVC_LOG = 100 * 1024 * 1024 +MAX_BT_LINES = 100 + + +class Job(object): + def __init__(self, path, job_id): + self.path = path + self.job_id = job_id + + try: + self.config = yaml.safe_load(open(os.path.join(self.path, "config.yaml"), 'r')) + self.description = self.config['description'] + assert self.description + except IOError: + self.config = None + self.description = None + + summary_path = os.path.join(self.path, "summary.yaml") + try: + self.summary_data = yaml.safe_load(open(summary_path, 'r')) + except IOError: + self.summary_data = None + + self.backtrace = None + self.assertion = None + self.populated = False + + def get_success(self): + if self.summary_data: + return self.summary_data['success'] + else: + return False + + def get_failure_reason(self): + if self.summary_data: + return self.summary_data['failure_reason'] + else: + return None + + def get_last_tlog_line(self): + t_path = os.path.join(self.path, "teuthology.log") + if not os.path.exists(t_path): + return None + else: + out, err = subprocess.Popen(["tail", "-n", "1", t_path], stdout=subprocess.PIPE).communicate() + return out.strip() + + def _search_backtrace(self, file_obj): + bt_lines = [] + assertion = None + for line in file_obj: + # Log prefix from teuthology.log + if ".stderr:" in line: + line = line.split(".stderr:")[1] + + if "FAILED assert" in line: + assertion = line.strip() + + if line.startswith(" ceph version"): + # The start of a backtrace! + bt_lines = [line] + elif line.startswith(" NOTE: a copy of the executable"): + # The backtrace terminated, if we have a buffer return it + if len(bt_lines): + return ("".join(bt_lines)).strip(), assertion + else: + log.warning("Saw end of BT but not start") + elif bt_lines: + # We're in a backtrace, push the line onto the list + if len(bt_lines) > MAX_BT_LINES: + # Something wrong with our parsing, drop it + log.warning("Ignoring malparsed backtrace: {0}".format( + ", ".join(bt_lines[0:3]) + )) + bt_lines = [] + bt_lines.append(line) + + return None, assertion + + def get_assertion(self): + if not self.populated: + self._populate_backtrace() + return self.assertion + + def get_backtrace(self): + if not self.populated: + self._populate_backtrace() + return self.backtrace + + def _populate_backtrace(self): + tlog_path = os.path.join(self.path, "teuthology.log") + try: + s = os.stat(tlog_path) + except OSError: + log.warning("Missing teuthology log {0}".format(tlog_path)) + return None + size = s.st_size + if size > MAX_TEUTHOLOGY_LOG: + log.debug("Ignoring teuthology log for job {0}, it is {1} bytes".format(self.job_id, size)) + return None + + self.backtrace, self.assertion = self._search_backtrace(open(tlog_path)) + if self.backtrace: + return + + for line in grep(tlog_path, "command crashed with signal"): + log.debug("Found a crash indication: {0}".format(line)) + # tasks.ceph.osd.1.plana82.stderr + match = re.search("tasks.ceph.([^\.]+).([^\.]+).([^\.]+).stderr", line) + if not match: + log.warning("Not-understood crash indication {0}".format(line)) + continue + svc, svc_id, hostname = match.groups() + gzipped_log_path = os.path.join( + self.path, "remote", hostname, "log", "ceph-{0}.{1}.log.gz".format(svc, svc_id)) + + try: + s = os.stat(gzipped_log_path) + except OSError as e: + if e.errno == ENOENT: + log.warning("Missing log {0}".format(gzipped_log_path)) + continue + else: + raise + + size = s.st_size + if size > MAX_SVC_LOG: + log.warning("Not checking for backtrace from {0}:{1}.{2} log, too large ({3})".format( + hostname, svc, svc_id, size + )) + continue + + bt, ass = self._search_backtrace(GzipFile(gzipped_log_path)) + if ass and not self.assertion: + self.assertion = ass + if bt: + self.backtrace = bt + return + + return None + + +class ValgrindReason(Reason): + def __init__(self, job): + assert self.could_be(job) + self.service_types = self._get_service_types(job) + + def _get_service_types(self, job): + """ + Get dict mapping service type 'osd' etc to sorted list of violation types 'Leak_PossiblyLost' etc + """ + + result = defaultdict(list) + # Lines like: + # 2014-08-22T20:07:18.668 ERROR:tasks.ceph:saw valgrind issue Leak_DefinitelyLost in /var/log/ceph/valgrind/osd.3.log.gz + for line in grep(os.path.join(job.path, "teuthology.log"), " in "): + match = re.search("(.+) in .+/(.+)", line) + if not match: + log.warning("Misunderstood line: {0}".format(line)) + continue + err_typ, log_basename = match.groups() + svc_typ = six.ensure_str(log_basename).split(".")[0] + if err_typ not in result[svc_typ]: + result[svc_typ].append(err_typ) + result[svc_typ] = sorted(result[svc_typ]) + + return dict(result) + + def get_description(self): + desc_bits = [] + for service, types in list(self.service_types.items()): + desc_bits.append("{0} ({1})".format(service, ", ".join(types))) + return "Valgrind: " + ", ".join(desc_bits) + + @classmethod + def could_be(cls, job): + return job.get_failure_reason() is not None and "saw valgrind issues" in job.get_failure_reason() + + def match(self, job): + return self._get_service_types(job) == self.service_types + + +known_reasons = [ + # If the failure reason indicates no packages found... + RegexReason(["Failed to fetch package version from http://", + "Command failed on .* with status 100: 'sudo apt-get update"] + , "Missing packages"), +] + + +def give_me_a_reason(job): + """ + If no existing reasons match the job, generate the most specific reason we can + """ + + # Note: because we match known reasons, including GenericReasons, before any of + # the Timeout/Valgrind whatever, even if a run is a timeout or a valgrind failure, + # it will get matched up with a backtrace or assertion if one is there, hiding + # the valgrind/timeout aspect. + + for r in known_reasons: + if r.match(job): + return r + + # NB ordering matters, LockdepReason must come before AssertionReason + for klass in [DeadReason, LockdepReason, AssertionReason, TimeoutReason, ValgrindReason]: + if klass.could_be(job): + return klass(job) + + return GenericReason(job) + + +class Scraper(object): + def __init__(self, target_dir): + self.target_dir = target_dir + log.addHandler(logging.FileHandler(os.path.join(target_dir, + "scrape.log"))) + + def analyze(self): + entries = os.listdir(self.target_dir) + jobs = [] + for entry in entries: + job_dir = os.path.join(self.target_dir, entry) + if os.path.isdir(job_dir): + jobs.append(Job(job_dir, entry)) + + log.info("Found {0} jobs".format(len(jobs))) + + passes = [] + reasons = defaultdict(list) + + for job in jobs: + if job.get_success(): + passes.append(job) + continue + + matched = False + for reason, reason_jobs in reasons.items(): + if reason.match(job): + reason_jobs.append(job) + matched = True + break + + if not matched: + reasons[give_me_a_reason(job)].append(job) + + log.info("Found {0} distinct failure reasons".format(len(reasons))) + for reason, jobs in list(reasons.items()): + job_spec = "{0} jobs: {1}".format(len(jobs), [j.job_id for j in jobs]) if len(jobs) < 30 else "{0} jobs".format(len(jobs)) + log.info(reason.get_description()) + detail = reason.get_detail() + if detail: + log.info(detail) + log.info(job_spec) + suites = [set(j.description.split()) for j in jobs if j.description != None] + if len(suites) > 1: + log.info("suites intersection: {0}".format(sorted(set.intersection(*suites)))) + log.info("suites union: {0}".format(sorted(set.union(*suites)))) + elif len(suites) == 1: + log.info("suites: {0}".format(sorted(suites[0]))) + log.info("") + +if __name__ == '__main__': + Scraper(sys.argv[1]).analyze() diff --git a/teuthology/test/test_scrape.py b/teuthology/test/test_scrape.py new file mode 100644 index 0000000000..f8a03520ec --- /dev/null +++ b/teuthology/test/test_scrape.py @@ -0,0 +1,167 @@ +from __future__ import with_statement + +import os +import shutil +import tempfile +import yaml +from teuthology import scrape + +class FakeResultDir(object): + """Mocks a Result Directory""" + + def __init__(self, + failure_reason="Dummy reason", + assertion="FAILED assert 1 == 2\n", + blank_backtrace=False + ): + self.failure_reason = failure_reason + self.assertion = assertion + self.blank_backtrace = blank_backtrace + self.path = tempfile.mkdtemp() + + with open(os.path.join(self.path, "config.yaml"), "w") as f: + yaml.dump({"description": "Dummy test"}, f) + + with open(os.path.join(self.path, "summary.yaml"), "w") as f: + yaml.dump({ + "success": "false", + "failure_reason": self.failure_reason + }, f) + + with open(os.path.join(self.path, "teuthology.log"), "w") as f: + if not self.blank_backtrace: + f.write(" ceph version 1000\n") + f.write(".stderr: Dummy error\n") + f.write(self.assertion) + f.write(" NOTE: a copy of the executable dummy text\n") + + def __enter__(self): + return self + + def __exit__(self, exc_typ, exc_val, exc_tb): + shutil.rmtree(self.path) + +class TestScrape(object): + """Tests for teuthology.scrape""" + + def test_grep(self): + with FakeResultDir() as d: + filepath = os.path.join(d.path, "scrapetest.txt") + with open(filepath, 'w') as f: + f.write("Ceph is an open-source software storage platform\n\ + Teuthology is used for testing.") + + #System level grep is called + value1 = scrape.grep(filepath, "software") + value2 = scrape.grep(filepath, "device") + + assert value1 ==\ + ['Ceph is an open-source software storage platform', ''] + assert value2 == [] + + def test_job(self): + with FakeResultDir() as d: + job = scrape.Job(d.path, 1) + assert job.get_success() == "false" + assert job.get_assertion() == "FAILED assert 1 == 2" + assert job.get_last_tlog_line() ==\ + b"NOTE: a copy of the executable dummy text" + assert job.get_failure_reason() == "Dummy reason" + + def test_timeoutreason(self): + with FakeResultDir(failure_reason=\ + "status 124: timeout '123 /home/ubuntu/cephtest/workunit.client.0/cephtool/test.sh'") as d: + job = scrape.Job(d.path, 1) + assert scrape.TimeoutReason.could_be(job) + assert scrape.TimeoutReason(job).match(job) + + def test_deadreason(self): + with FakeResultDir() as d: + job = scrape.Job(d.path, 1) + #Summary is present + #So this cannot be a DeadReason + assert not scrape.DeadReason.could_be(job) + + def test_lockdepreason(self): + lkReason = None + with FakeResultDir(assertion=\ + "FAILED assert common/lockdep reason\n") as d: + job = scrape.Job(d.path, 1) + assert scrape.LockdepReason.could_be(job) + + lkReason = scrape.LockdepReason(job) + #Backtraces of same jobs must match 100% + assert lkReason.match(job) + with FakeResultDir(blank_backtrace=True) as d: + #Corresponding to 0% match + assert not lkReason.match(scrape.Job(d.path, 2)) + + def test_assertionreason(self): + with FakeResultDir() as d: + job = scrape.Job(d.path, 1) + assert scrape.AssertionReason.could_be(job) + + def test_genericreason(self): + d1 = FakeResultDir(blank_backtrace=True) + d2 = FakeResultDir(failure_reason="Dummy dummy") + d3 = FakeResultDir() + + job1 = scrape.Job(d1.path, 1) + job2 = scrape.Job(d2.path, 2) + job3 = scrape.Job(d3.path, 3) + + reason = scrape.GenericReason(job3) + + assert reason.match(job2) + assert not reason.match(job1) + + shutil.rmtree(d1.path) + shutil.rmtree(d2.path) + shutil.rmtree(d3.path) + + def test_valgrindreason(self): + vreason = None + with FakeResultDir( + failure_reason="saw valgrind issues", + assertion="2014-08-22T20:07:18.668 ERROR:tasks.ceph:saw valgrind issue Leak_DefinitelyLost in /var/log/ceph/valgrind/osd.3.log.gz\n" + ) as d: + job = scrape.Job(d.path, 1) + assert scrape.ValgrindReason.could_be(job) + + vreason = scrape.ValgrindReason(job) + assert vreason.match(job) + + def test_give_me_a_reason(self): + with FakeResultDir() as d: + job = scrape.Job(d.path, 1) + + assert type(scrape.give_me_a_reason(job)) == scrape.AssertionReason + + #Test the lockdep ordering + with FakeResultDir(assertion=\ + "FAILED assert common/lockdep reason\n") as d: + job = scrape.Job(d.path, 1) + assert type(scrape.give_me_a_reason(job)) == scrape.LockdepReason + + def test_scraper(self): + d = FakeResultDir() + os.mkdir(os.path.join(d.path, "test")) + shutil.move( + os.path.join(d.path, "config.yaml"), + os.path.join(d.path, "test", "config.yaml") + ) + shutil.move( + os.path.join(d.path, "summary.yaml"), + os.path.join(d.path, "test", "summary.yaml") + ) + shutil.move( + os.path.join(d.path, "teuthology.log"), + os.path.join(d.path, "test", "teuthology.log") + ) + + scrape.Scraper(d.path).analyze() + + #scrape.log should be created + assert os.path.exists(os.path.join(d.path, "scrape.log")) + + shutil.rmtree(d.path) -- 2.39.5