import subprocess
import time
import yaml
+import requests
+from urllib.parse import urljoin
from datetime import datetime
import teuthology
if 'targets' in job_config:
unlock_targets(job_config)
+def failure_is_reimage(failure_reason):
+    """Tell whether a job's failure reason denotes a reimaging error.
+
+    :param failure_reason: failure text of a job; may be empty or ``None``
+    :returns: ``True`` when the text contains the reimage error marker,
+              ``False`` otherwise (including for an empty/``None`` reason)
+    """
+    marker = "Error reimaging machines:"
+    # bool() keeps the result a strict boolean for empty/None input.
+    return bool(failure_reason) and marker in failure_reason
+
+def check_for_reimage_failures_and_mark_down(targets, count=10):
+    """Mark down every target whose recent jobs all failed to reimage.
+
+    For each target the last ``count`` jobs recorded for that machine are
+    fetched from the results server. When ``count`` jobs come back and
+    every one of them carries a reimage failure reason, the machine's
+    lock is updated to status ``down``.
+
+    :param targets: mapping keyed by target name; the part after the last
+                    ``@`` of each key is used as the node name in the
+                    request URL (values are not used)
+    :param count:   number of recent jobs to inspect, and the number of
+                    reimage failures required to mark the machine down
+    """
+    base_url = teuth_config.results_server
+    for target in targets:
+        machine = target.split('@')[-1]
+        url = urljoin(
+            base_url,
+            '/nodes/{0}/jobs/?count={1}'.format(machine, count)
+        )
+        # NOTE(review): no timeout is passed, so this call can block for
+        # as long as the server keeps the connection open -- consider one.
+        resp = requests.get(url)
+        if not resp.ok:
+            # An error response carries no usable job history; skip this
+            # machine instead of failing while parsing the error body.
+            log.error(
+                'Could not fetch job history for {0}: HTTP {1}'.format(
+                    machine, resp.status_code)
+            )
+            continue
+        jobs = resp.json()
+        # Not enough history yet to judge this machine.
+        if len(jobs) < count:
+            continue
+        reimage_failures = [
+            job for job in jobs
+            # .get(): a job record without a failure reason is simply
+            # not a reimage failure.
+            if failure_is_reimage(job.get('failure_reason'))
+        ]
+        if len(reimage_failures) < count:
+            continue
+        # Mark machine down
+        machine_name = shortname(target)
+        teuthology.lock.ops.update_lock(
+            machine_name,
+            description='reimage failed {0} times'.format(count),
+            status='down',
+        )
+        log.error(
+            'Reimage failed {0} times ... marking machine {1} down'.format(
+                count, machine_name)
+        )
def reimage(job_config):
# Reimage the targets specified in job config
# Reimage failures should map to the 'dead' status instead of 'fail'
report.try_push_job_info(ctx.config, dict(status='dead', failure_reason='Error reimaging machines: ' + str(e)))
nuke(ctx, True)
+ # A machine whose last 10 jobs all failed to reimage is marked down
+ check_for_reimage_failures_and_mark_down(targets)
raise
ctx.config['targets'] = reimaged
# change the status to running after the reimaging process
--- /dev/null
+from teuthology.dispatcher import supervisor
+from unittest.mock import patch
+
+class TestCheckReImageFailureMarkDown(object):
+    """Tests for ``supervisor.check_for_reimage_failures_and_mark_down``.
+
+    Each test replaces the HTTP layer, the lock update and ``shortname``
+    with mocks, feeds a canned job history per machine, and checks how
+    often the lock update (``mark_down``) was invoked.
+    """
+
+    def setup_method(self):
+        # pytest runs setup_method() before every test; the nose-style
+        # setup() hook is no longer invoked by pytest >= 8.
+        self.the_function = supervisor.check_for_reimage_failures_and_mark_down
+
+    @staticmethod
+    def _fake_shortname(target):
+        """Derive ``host`` from ``user@host.domain`` for the shortname mock.
+
+        Computing the name from the argument keeps the mock correct no
+        matter how many times, or in which order, it is called.
+        """
+        return target.split('@')[-1].split('.')[0]
+
+    def create_n_out_of_10_reimage_failed_jobs(self, n):
+        """Return 10 fake jobs, the first ``n`` of them reimage failures."""
+        reimage_failed = [
+            {
+                "failure_reason":
+                    "Error reimaging machines: Manually raised error"
+            }
+            for _ in range(n)
+        ]
+        other_failed = [
+            {"failure_reason": "Error something else: dummy"}
+            for _ in range(10 - n)
+        ]
+        return reimage_failed + other_failed
+
+    @patch('teuthology.dispatcher.supervisor.shortname')
+    @patch('teuthology.lock.ops.update_lock')
+    @patch('teuthology.dispatcher.supervisor.requests')
+    @patch('teuthology.dispatcher.supervisor.urljoin')
+    @patch('teuthology.dispatcher.supervisor.teuth_config')
+    def test_one_machine_ten_reimage_failed_jobs(
+        self,
+        m_t_config,
+        m_urljoin,
+        m_requests,
+        mark_down,
+        shortname
+    ):
+        """Ten reimage failures out of ten jobs mark the machine down."""
+        targets = {'fakeos@rmachine061.front.sepia.ceph.com': 'ssh-ed25519'}
+        m_requests.get.return_value.json.return_value = \
+            self.create_n_out_of_10_reimage_failed_jobs(10)
+        shortname.return_value = 'rmachine061'
+        self.the_function(targets)
+        assert mark_down.called
+
+    @patch('teuthology.dispatcher.supervisor.shortname')
+    @patch('teuthology.lock.ops.update_lock')
+    @patch('teuthology.dispatcher.supervisor.requests')
+    @patch('teuthology.dispatcher.supervisor.urljoin')
+    @patch('teuthology.dispatcher.supervisor.teuth_config')
+    def test_one_machine_seven_reimage_failed_jobs(
+        self,
+        m_t_config,
+        m_urljoin,
+        m_requests,
+        mark_down,
+        shortname,
+    ):
+        """Seven reimage failures out of ten jobs leave the machine up."""
+        targets = {'fakeos@rmachine061.front.sepia.ceph.com': 'ssh-ed25519'}
+        m_requests.get.return_value.json.return_value = \
+            self.create_n_out_of_10_reimage_failed_jobs(7)
+        shortname.return_value = 'rmachine061'
+        self.the_function(targets)
+        assert mark_down.called is False
+
+    @patch('teuthology.dispatcher.supervisor.shortname')
+    @patch('teuthology.lock.ops.update_lock')
+    @patch('teuthology.dispatcher.supervisor.requests')
+    @patch('teuthology.dispatcher.supervisor.urljoin')
+    @patch('teuthology.dispatcher.supervisor.teuth_config')
+    def test_two_machine_all_reimage_failed_jobs(
+        self,
+        m_t_config,
+        m_urljoin,
+        m_requests,
+        mark_down,
+        shortname,
+    ):
+        """Two machines with only reimage failures are both marked down."""
+        targets = {'fakeos@rmachine061.front.sepia.ceph.com': 'ssh-ed25519',
+                   'fakeos@rmachine179.back.sepia.ceph.com': 'ssh-ed45333'}
+        m_requests.get.return_value.json.side_effect = \
+            [self.create_n_out_of_10_reimage_failed_jobs(10),
+             self.create_n_out_of_10_reimage_failed_jobs(10)]
+        # side_effect belongs on the mock itself; setting it on
+        # return_value would leave shortname() returning a MagicMock.
+        shortname.side_effect = self._fake_shortname
+        self.the_function(targets)
+        assert mark_down.call_count == 2
+
+    @patch('teuthology.dispatcher.supervisor.shortname')
+    @patch('teuthology.lock.ops.update_lock')
+    @patch('teuthology.dispatcher.supervisor.requests')
+    @patch('teuthology.dispatcher.supervisor.urljoin')
+    @patch('teuthology.dispatcher.supervisor.teuth_config')
+    def test_two_machine_one_healthy_one_reimage_failure(
+        self,
+        m_t_config,
+        m_urljoin,
+        m_requests,
+        mark_down,
+        shortname,
+    ):
+        """Only the machine with ten reimage failures is marked down."""
+        targets = {'fakeos@rmachine061.front.sepia.ceph.com': 'ssh-ed25519',
+                   'fakeos@rmachine179.back.sepia.ceph.com': 'ssh-ed45333'}
+        m_requests.get.return_value.json.side_effect = \
+            [self.create_n_out_of_10_reimage_failed_jobs(0),
+             self.create_n_out_of_10_reimage_failed_jobs(10)]
+        # shortname() is only reached for machines being marked down, so
+        # the mock must map its argument rather than replay a fixed list.
+        shortname.side_effect = self._fake_shortname
+        self.the_function(targets)
+        assert mark_down.call_count == 1
+        # Compare real strings: on a MagicMock, .startswith() would return
+        # a truthy mock and the assertion could never fail.
+        assert mark_down.call_args_list[0][0][0] == 'rmachine179'
+