git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-cm-ansible.git/commitdiff
testnode: Check for and mark down systems with missing NVMe devices 370/head
author David Galloway <dgallowa@redhat.com>
Wed, 17 Jan 2018 18:06:07 +0000 (13:06 -0500)
committer David Galloway <dgallowa@redhat.com>
Wed, 17 Jan 2018 19:20:07 +0000 (14:20 -0500)
Signed-off-by: David Galloway <dgallowa@redhat.com>
roles/testnode/README.rst
roles/testnode/defaults/main.yml
roles/testnode/tasks/check-for-nvme.yml [new file with mode: 0644]
roles/testnode/tasks/main.yml

index d62a57051faa951753bc9209df5ea3e876e05843..0d1cc3272d33c84ebcd042cf12a79f4baaf15ecc 100644 (file)
@@ -225,6 +225,10 @@ Setting ``quick_lvs_to_create`` will:
         # Example would create 4 logical volumes each using 25% of a volume group created using all non-root physical volumes
         quick_lvs_to_create: 4
 
+Define ``check_for_nvme: true`` in Ansible inventory group_vars (by machine type) if the testnode should have an NVMe device.  This will include a few tasks to verify an NVMe device is present.  If the drive is missing, the tasks will mark the testnode down in the paddles_ lock database so the node doesn't repeatedly fail jobs.  Defaults to false::
+
+    check_for_nvme: false
+
 Tags
 ++++
 
@@ -308,3 +312,4 @@ To Do
 .. _teuthology: https://github.com/ceph/teuthology
 .. _ceph-qa-suite: https://github.com/ceph/ceph-qa-suite
 .. _docs: https://docs.ansible.com/ansible/latest/lvol_module.html
+.. _paddles: https://github.com/ceph/paddles
index 518442f6a44cb18ea33c52e7cbcf757f09bffd4b..8714ec21647041d1d21fd85837b5e4d6660aff8d 100644 (file)
@@ -61,3 +61,6 @@ ntp_servers:
   - 1.us.pool.ntp.org
   - 2.us.pool.ntp.org
   - 3.us.pool.ntp.org
+
+# Set to true in group_vars (per machine type) to run check-for-nvme.yml on testnodes that should have an NVMe device
+check_for_nvme: false
diff --git a/roles/testnode/tasks/check-for-nvme.yml b/roles/testnode/tasks/check-for-nvme.yml
new file mode 100644 (file)
index 0000000..4023802
--- /dev/null
@@ -0,0 +1,48 @@
+---
+# NVMe cards have started failing frequently.  These tasks mark a system
+# down in the paddles DB when its NVMe device is missing, so the node
+# doesn't repeatedly fail jobs.
+# https://wiki.sepia.ceph.com/doku.php?id=hardware:smithi&#nvme_failure_tracking
+# These tasks can also be used by a few machine types in Octo
+
+- name: Assume no NVMe drive is present until one is found
+  set_fact:
+    nvme_card_present: false
+
+# Flip the fact to true as soon as any entry in ansible_devices contains
+# the substring 'nvme'.
+- name: Check for NVMe drive
+  set_fact:
+    nvme_card_present: true
+  with_items: "{{ ansible_devices }}"
+  when: "'nvme' in item"
+
+# Runs locally, not on the testnode.  A missing teuthology-lock is tolerated
+# rather than reported as a failure: the registered rc gates the lock DB
+# updates below.  The lookup itself never changes anything.
+- name: Check for teuthology-lock command
+  local_action: shell which teuthology-lock
+  register: teuthology_lock
+  changed_when: false
+  failed_when: false
+  become: false
+
+- name: Mark system down if NVMe card missing
+  local_action: "shell {{ teuthology_lock.stdout }} --update --status down {{ inventory_hostname }}"
+  become: false
+  when:
+    - teuthology_lock.rc == 0
+    - not nvme_card_present
+
+- name: Update description in paddles lock DB if NVMe card missing
+  local_action: "shell {{ teuthology_lock.stdout }} --update --desc 'Marked down by ceph-cm-ansible due to missing NVMe card {{ ansible_date_time.iso8601 }}' {{ inventory_hostname }}"
+  become: false
+  when:
+    - teuthology_lock.rc == 0
+    - not nvme_card_present
+
+# Runs whether or not the lock DB could be updated.
+- name: Fail rest of playbook due to missing NVMe card
+  fail:
+    msg: "Failing rest of playbook due to missing NVMe card"
+  when: not nvme_card_present
index 8cfc5c1566622295ddf493ada998bb59f7fb100c..1caf54a6eb8c3d9b7992c05c7bdaeb38f36ea716 100644 (file)
@@ -67,6 +67,9 @@
   import_tasks: setup-debian.yml
   when: ansible_distribution == "Debian"
 
+- import_tasks: check-for-nvme.yml
+  when: check_for_nvme | bool
+
 - import_tasks: zap_disks.yml
   tags:
     - zap