From 84776988659a07f5942544a21b9c5942e1b541d7 Mon Sep 17 00:00:00 2001
From: Kamoltat
Date: Fri, 15 Oct 2021 14:36:14 +0000
Subject: [PATCH] dispatcher/supervisor: Mark machine down if reimage error >= 10x

Introduce a new feature where we mark the machine down if we find
10 or more consecutive reimaging failures when performing teuthology
tests.

The check runs while a reimage error is being handled, so it is
best-effort: a problem talking to paddles or updating the lock is
logged and does not mask the original reimage error.

Signed-off-by: Kamoltat
---
 teuthology/dispatcher/supervisor.py           |  65 ++++++++++
 .../test_reimage_error_mark_machine_down.py   | 123 ++++++++++++++++++
 2 files changed, 188 insertions(+)
 create mode 100644 teuthology/dispatcher/test/test_reimage_error_mark_machine_down.py

diff --git a/teuthology/dispatcher/supervisor.py b/teuthology/dispatcher/supervisor.py
index ad31098bc7..3003d11abb 100644
--- a/teuthology/dispatcher/supervisor.py
+++ b/teuthology/dispatcher/supervisor.py
@@ -3,7 +3,9 @@ import os
 import subprocess
 import time
 import yaml
+import requests
 
+from urllib.parse import urljoin
 from datetime import datetime
 
 import teuthology
@@ -151,6 +153,67 @@ def run_job(job_config, teuth_bin_path, archive_dir, verbose):
 
     if 'targets' in job_config:
         unlock_targets(job_config)
+
+
+def failure_is_reimage(failure_reason):
+    """
+    Return True if failure_reason describes a reimaging error.
+
+    :param failure_reason: A job's failure_reason; may be None or empty
+                           for jobs that did not fail.
+    """
+    if not failure_reason:
+        return False
+    return "Error reimaging machines:" in failure_reason
+
+
+def check_for_reimage_failures_and_mark_down(targets, count=10, timeout=30):
+    """
+    Mark down each target that has at least ``count`` reimage failures among
+    the ``count`` most recent jobs that paddles reports for it.
+
+    This is called while a reimage error is being handled, so it is
+    best-effort: a problem querying paddles or updating the lock is logged
+    instead of raised, so that it cannot mask the original reimage error.
+
+    :param targets: Dict whose keys look like ``user@hostname``
+    :param count:   How many recent jobs to fetch, and how many of them
+                    must be reimage failures to mark the machine down
+    :param timeout: Seconds to wait for paddles to respond
+    """
+    base_url = teuth_config.results_server
+    for target in targets:
+        machine = target.split('@')[-1]
+        url = urljoin(
+            base_url,
+            '/nodes/{0}/jobs/?count={1}'.format(machine, count)
+        )
+        try:
+            resp = requests.get(url, timeout=timeout)
+            resp.raise_for_status()
+            jobs = resp.json()
+            if len(jobs) < count:
+                continue
+            reimage_failures = [
+                job for job in jobs
+                if failure_is_reimage(job.get('failure_reason'))
+            ]
+            if len(reimage_failures) < count:
+                continue
+            machine_name = shortname(target)
+            teuthology.lock.ops.update_lock(
+                machine_name,
+                description='reimage failed {0} times'.format(count),
+                status='down',
+            )
+        except Exception:
+            log.exception(
+                'Could not check reimage failures for %s', machine)
+            continue
+        log.error(
+            'Reimage failed %s times on %s ... marking machine down',
+            count, machine_name)
+
 
 def reimage(job_config):
     # Reimage the targets specified in job config
@@ -166,6 +229,8 @@ def reimage(job_config):
         # Reimage failures should map to the 'dead' status instead of 'fail'
         report.try_push_job_info(ctx.config, dict(status='dead', failure_reason='Error reimaging machines: ' + str(e)))
         nuke(ctx, True)
+        # Machine that fails to reimage after 10 times will be marked down
+        check_for_reimage_failures_and_mark_down(targets)
         raise
     ctx.config['targets'] = reimaged
     # change the status to running after the reimaging process
diff --git a/teuthology/dispatcher/test/test_reimage_error_mark_machine_down.py b/teuthology/dispatcher/test/test_reimage_error_mark_machine_down.py
new file mode 100644
index 0000000000..f2365174e7
--- /dev/null
+++ b/teuthology/dispatcher/test/test_reimage_error_mark_machine_down.py
@@ -0,0 +1,123 @@
+from teuthology.dispatcher import supervisor
+from unittest.mock import patch
+
+class TestCheckReImageFailureMarkDown(object):
+    def setup_method(self):
+        self.the_function = supervisor.check_for_reimage_failures_and_mark_down
+
+    def create_n_out_of_10_reimage_failed_jobs(self, n):
+        ret_list = []
+        for i in range(n):
+            obj1 = {
+                "failure_reason":"Error reimaging machines: Manually raised error"
+            }
+            ret_list.append(obj1)
+        for j in range(10-n):
+            obj2 = {"failure_reason":"Error something else: dummy"}
+            ret_list.append(obj2)
+        return ret_list
+
+    @patch('teuthology.dispatcher.supervisor.shortname')
+    @patch('teuthology.lock.ops.update_lock')
+    @patch('teuthology.dispatcher.supervisor.requests')
+    @patch('teuthology.dispatcher.supervisor.urljoin')
+    @patch('teuthology.dispatcher.supervisor.teuth_config')
+    def test_one_machine_ten_reimage_failed_jobs(
+        self,
+        m_t_config,
+        m_urljoin,
+        m_requests,
+        mark_down,
+        shortname
+    ):
+        targets = {'fakeos@rmachine061.front.sepia.ceph.com': 'ssh-ed25519'}
+        m_requests.get.return_value.json.return_value = \
+            self.create_n_out_of_10_reimage_failed_jobs(10)
+        shortname.return_value = 'rmachine061'
+        self.the_function(targets)
+        assert mark_down.called
+
+    @patch('teuthology.dispatcher.supervisor.shortname')
+    @patch('teuthology.lock.ops.update_lock')
+    @patch('teuthology.dispatcher.supervisor.requests')
+    @patch('teuthology.dispatcher.supervisor.urljoin')
+    @patch('teuthology.dispatcher.supervisor.teuth_config')
+    def test_one_machine_seven_reimage_failed_jobs(
+        self,
+        m_t_config,
+        m_urljoin,
+        m_requests,
+        mark_down,
+        shortname,
+    ):
+        targets = {'fakeos@rmachine061.front.sepia.ceph.com': 'ssh-ed25519'}
+        m_requests.get.return_value.json.return_value = \
+            self.create_n_out_of_10_reimage_failed_jobs(7)
+        shortname.return_value = 'rmachine061'
+        self.the_function(targets)
+        assert mark_down.called is False
+
+    @patch('teuthology.dispatcher.supervisor.shortname')
+    @patch('teuthology.lock.ops.update_lock')
+    @patch('teuthology.dispatcher.supervisor.requests')
+    @patch('teuthology.dispatcher.supervisor.urljoin')
+    @patch('teuthology.dispatcher.supervisor.teuth_config')
+    def test_two_machine_all_reimage_failed_jobs(
+        self,
+        m_t_config,
+        m_urljoin,
+        m_requests,
+        mark_down,
+        shortname,
+    ):
+        targets = {'fakeos@rmachine061.front.sepia.ceph.com': 'ssh-ed25519',
+                   'fakeos@rmachine179.back.sepia.ceph.com': 'ssh-ed45333'}
+        m_requests.get.return_value.json.side_effect = \
+            [self.create_n_out_of_10_reimage_failed_jobs(10),
+             self.create_n_out_of_10_reimage_failed_jobs(10)]
+        shortname.side_effect = ['rmachine061', 'rmachine179']
+        self.the_function(targets)
+        assert mark_down.call_count == 2
+
+    @patch('teuthology.dispatcher.supervisor.shortname')
+    @patch('teuthology.lock.ops.update_lock')
+    @patch('teuthology.dispatcher.supervisor.requests')
+    @patch('teuthology.dispatcher.supervisor.urljoin')
+    @patch('teuthology.dispatcher.supervisor.teuth_config')
+    def test_two_machine_one_healthy_one_reimage_failure(
+        self,
+        m_t_config,
+        m_urljoin,
+        m_requests,
+        mark_down,
+        shortname,
+    ):
+        targets = {'fakeos@rmachine061.front.sepia.ceph.com': 'ssh-ed25519',
+                   'fakeos@rmachine179.back.sepia.ceph.com': 'ssh-ed45333'}
+        m_requests.get.return_value.json.side_effect = \
+            [self.create_n_out_of_10_reimage_failed_jobs(0),
+             self.create_n_out_of_10_reimage_failed_jobs(10)]
+        # The healthy machine is skipped before shortname() is reached, so
+        # only the machine that gets marked down consumes a side_effect.
+        shortname.side_effect = ['rmachine179']
+        self.the_function(targets)
+        assert mark_down.call_count == 1
+        assert mark_down.call_args_list[0][0][0] == 'rmachine179'
+
+    @patch('teuthology.dispatcher.supervisor.shortname')
+    @patch('teuthology.lock.ops.update_lock')
+    @patch('teuthology.dispatcher.supervisor.requests')
+    @patch('teuthology.dispatcher.supervisor.urljoin')
+    @patch('teuthology.dispatcher.supervisor.teuth_config')
+    def test_paddles_error_does_not_raise(
+        self,
+        m_t_config,
+        m_urljoin,
+        m_requests,
+        mark_down,
+        shortname,
+    ):
+        targets = {'fakeos@rmachine061.front.sepia.ceph.com': 'ssh-ed25519'}
+        m_requests.get.side_effect = RuntimeError('paddles is unreachable')
+        self.the_function(targets)
+        assert mark_down.called is False
-- 
2.39.5