dispatcher/supervisor: Mark machine down if reimage error >= 10x

author Kamoltat <ksirivad@redhat.com>

Fri, 15 Oct 2021 14:36:14 +0000 (14:36 +0000)

committer Kamoltat <ksirivad@redhat.com>

Mon, 29 Nov 2021 21:43:07 +0000 (21:43 +0000)
author Kamoltat <ksirivad@redhat.com>
Fri, 15 Oct 2021 14:36:14 +0000 (14:36 +0000)
committer Kamoltat <ksirivad@redhat.com>
Mon, 29 Nov 2021 21:43:07 +0000 (21:43 +0000)
diff --git a/teuthology/dispatcher/supervisor.py b/teuthology/dispatcher/supervisor.py

index ad31098bc708a9841778b5e929d2daee51037e43..3003d11abb70abc54208010aa2d055b5fee58648 100644 (file)
--- a/teuthology/dispatcher/supervisor.py
+++ b/teuthology/dispatcher/supervisor.py
@@ -3,7 +3,9 @@ import os
  import subprocess
  import time
  import yaml
+import requests
  
+from urllib.parse import urljoin
  from datetime import datetime
  
  import teuthology
@@ -151,6 +153,47 @@ def run_job(job_config, teuth_bin_path, archive_dir, verbose):
      if 'targets' in job_config:
          unlock_targets(job_config)
  
+def failure_is_reimage(failure_reason):
+    """
+    Tell whether a job's failure reason describes a reimaging error.
+
+    :param failure_reason: the failure reason of a job; may be None or empty
+    :returns: True if the reason contains the reimage error marker,
+              False otherwise (including when there is no reason at all)
+    """
+    # bool() keeps the result a strict True/False for empty/None input.
+    return bool(failure_reason) and "Error reimaging machines:" in failure_reason
+
+def check_for_reimage_failures_and_mark_down(targets, count=10, timeout=30):
+    """
+    Mark down every target whose fetched job history is all reimage failures.
+
+    For each target, ask the results server for ``count`` jobs of that node.
+    If ``count`` jobs come back and every one of them has a reimage failure
+    reason, the machine's lock is updated to status 'down'.
+
+    This is called while a reimage error is already being handled, so the
+    history lookup is best-effort: a failed, timed-out or malformed response
+    is logged and that machine is skipped, instead of raising a second
+    exception that would replace the original one.
+
+    :param targets: dict whose keys are 'user@hostname' strings
+    :param count:   number of jobs to fetch; all must be reimage failures
+    :param timeout: seconds to wait for the results server, per request
+    """
+    base_url = teuth_config.results_server
+    for target in targets:
+        machine = target.split('@')[-1]
+        url = urljoin(
+            base_url,
+            '/nodes/{0}/jobs/?count={1}'.format(machine, count)
+        )
+        try:
+            # Without a timeout a stuck results server would block forever.
+            resp = requests.get(url, timeout=timeout)
+            resp.raise_for_status()
+            jobs = resp.json()
+        except (requests.exceptions.RequestException, ValueError):
+            log.exception(
+                'Could not fetch job history for %s; '
+                'skipping reimage failure check', machine
+            )
+            continue
+        # Not enough history (or an unexpected payload): nothing to decide.
+        if not isinstance(jobs, list) or len(jobs) < count:
+            continue
+        reimage_failures = sum(
+            1 for job in jobs
+            if failure_is_reimage(job.get('failure_reason'))
+        )
+        if reimage_failures < count:
+            continue
+        # Mark machine down
+        machine_name = shortname(target)
+        teuthology.lock.ops.update_lock(
+            machine_name,
+            description='reimage failed {0} times'.format(count),
+            status='down',
+        )
+        log.error(
+            'Reimage failed %s times ... marking machine %s down',
+            count, machine_name
+        )
  
  def reimage(job_config):
      # Reimage the targets specified in job config
@@ -166,6 +209,8 @@ def reimage(job_config):
          # Reimage failures should map to the 'dead' status instead of 'fail'
          report.try_push_job_info(ctx.config, dict(status='dead', failure_reason='Error reimaging machines: ' + str(e)))
          nuke(ctx, True)
+        # Mark down any target whose 10 jobs returned by paddles all failed to reimage
+        check_for_reimage_failures_and_mark_down(targets)
          raise
      ctx.config['targets'] = reimaged
      # change the status to running after the reimaging process
diff --git a/teuthology/dispatcher/test/test_reimage_error_mark_machine_down.py b/teuthology/dispatcher/test/test_reimage_error_mark_machine_down.py

new file mode 100644 (file)

index 0000000..f236517
--- /dev/null
+++ b/teuthology/dispatcher/test/test_reimage_error_mark_machine_down.py
@@ -0,0 +1,104 @@
+from teuthology.dispatcher import supervisor
+from unittest.mock import patch
+
+class TestCheckReImageFailureMarkDown(object):
+    """
+    Tests for supervisor.check_for_reimage_failures_and_mark_down.
+
+    The results server, the lock update and shortname are all mocked; each
+    test only checks which machines update_lock is called for.
+    """
+
+    def setup_method(self):
+        # pytest-native hook; the nose-style ``setup`` name is no longer
+        # run by current pytest releases.
+        self.the_function = supervisor.check_for_reimage_failures_and_mark_down
+
+    @staticmethod
+    def fake_shortname(name):
+        # Stand-in for the mocked shortname: 'user@host.domain' -> 'host'.
+        # A function (not a list of values) is needed because shortname is
+        # only called for machines that actually get marked down.
+        return name.split('@')[-1].split('.')[0]
+
+    def create_n_out_of_10_reimage_failed_jobs(self, n):
+        """Return 10 fake jobs, the first ``n`` of them reimage failures."""
+        ret_list = [
+            {"failure_reason": "Error reimaging machines: Manually raised error"}
+            for _ in range(n)
+        ]
+        ret_list.extend(
+            {"failure_reason": "Error something else: dummy"}
+            for _ in range(10 - n)
+        )
+        return ret_list
+
+    @patch('teuthology.dispatcher.supervisor.shortname')
+    @patch('teuthology.lock.ops.update_lock')
+    @patch('teuthology.dispatcher.supervisor.requests')
+    @patch('teuthology.dispatcher.supervisor.urljoin')
+    @patch('teuthology.dispatcher.supervisor.teuth_config')
+    def test_one_machine_ten_reimage_failed_jobs(
+        self,
+        m_t_config,
+        m_urljoin,
+        m_requests,
+        mark_down,
+        shortname
+        ):
+        targets = {'fakeos@rmachine061.front.sepia.ceph.com': 'ssh-ed25519'}
+        m_requests.get.return_value.json.return_value = \
+            self.create_n_out_of_10_reimage_failed_jobs(10)
+        shortname.return_value = 'rmachine061'
+        self.the_function(targets)
+        assert mark_down.called
+
+    @patch('teuthology.dispatcher.supervisor.shortname')
+    @patch('teuthology.lock.ops.update_lock')
+    @patch('teuthology.dispatcher.supervisor.requests')
+    @patch('teuthology.dispatcher.supervisor.urljoin')
+    @patch('teuthology.dispatcher.supervisor.teuth_config')
+    def test_one_machine_seven_reimage_failed_jobs(
+        self,
+        m_t_config,
+        m_urljoin,
+        m_requests,
+        mark_down,
+        shortname,
+        ):
+        targets = {'fakeos@rmachine061.front.sepia.ceph.com': 'ssh-ed25519'}
+        m_requests.get.return_value.json.return_value = \
+            self.create_n_out_of_10_reimage_failed_jobs(7)
+        shortname.return_value = 'rmachine061'
+        self.the_function(targets)
+        assert mark_down.called is False
+
+    @patch('teuthology.dispatcher.supervisor.shortname')
+    @patch('teuthology.lock.ops.update_lock')
+    @patch('teuthology.dispatcher.supervisor.requests')
+    @patch('teuthology.dispatcher.supervisor.urljoin')
+    @patch('teuthology.dispatcher.supervisor.teuth_config')
+    def test_two_machine_all_reimage_failed_jobs(
+        self,
+        m_t_config,
+        m_urljoin,
+        m_requests,
+        mark_down,
+        shortname,
+        ):
+        targets = {'fakeos@rmachine061.front.sepia.ceph.com': 'ssh-ed25519',
+                   'fakeos@rmachine179.back.sepia.ceph.com': 'ssh-ed45333'}
+        m_requests.get.return_value.json.side_effect = \
+            [self.create_n_out_of_10_reimage_failed_jobs(10),
+            self.create_n_out_of_10_reimage_failed_jobs(10)]
+        # side_effect belongs on the mock itself, not on its return_value.
+        shortname.side_effect = self.fake_shortname
+        self.the_function(targets)
+        assert mark_down.call_count == 2
+        marked = [call[0][0] for call in mark_down.call_args_list]
+        assert marked == ['rmachine061', 'rmachine179']
+
+    @patch('teuthology.dispatcher.supervisor.shortname')
+    @patch('teuthology.lock.ops.update_lock')
+    @patch('teuthology.dispatcher.supervisor.requests')
+    @patch('teuthology.dispatcher.supervisor.urljoin')
+    @patch('teuthology.dispatcher.supervisor.teuth_config')
+    def test_two_machine_one_healthy_one_reimage_failure(
+        self,
+        m_t_config,
+        m_urljoin,
+        m_requests,
+        mark_down,
+        shortname,
+        ):
+        targets = {'fakeos@rmachine061.front.sepia.ceph.com': 'ssh-ed25519',
+                   'fakeos@rmachine179.back.sepia.ceph.com': 'ssh-ed45333'}
+        m_requests.get.return_value.json.side_effect = \
+            [self.create_n_out_of_10_reimage_failed_jobs(0),
+            self.create_n_out_of_10_reimage_failed_jobs(10)]
+        # Derive the name from the argument so the assertion below checks a
+        # real string instead of passing vacuously on a MagicMock.
+        shortname.side_effect = self.fake_shortname
+        self.the_function(targets)
+        assert mark_down.call_count == 1
+        assert mark_down.call_args_list[0][0][0] == 'rmachine179'
+
author	Kamoltat <ksirivad@redhat.com>
	Fri, 15 Oct 2021 14:36:14 +0000 (14:36 +0000)
committer	Kamoltat <ksirivad@redhat.com>
	Mon, 29 Nov 2021 21:43:07 +0000 (21:43 +0000)
teuthology/dispatcher/supervisor.py		patch \| blob \| history
teuthology/dispatcher/test/test_reimage_error_mark_machine_down.py	[new file with mode: 0644]	patch \| blob