From eb7c301eebb7bf6afd19c291ad93bec1fecf2e09 Mon Sep 17 00:00:00 2001
From: David Galloway
Date: Wed, 17 Jan 2018 13:06:07 -0500
Subject: [PATCH] testnode: Check for and mark down systems with missing NVMe devices

Signed-off-by: David Galloway
---
Notes (below the cut line; not part of the commit message):
  * Adds the role default check_for_nvme (false). roles/testnode/tasks/main.yml
    imports check-for-nvme.yml only when that variable is true.
  * check-for-nvme.yml starts with nvme_card_present = false and flips it to
    true when an item of ansible_devices contains the substring 'nvme'.
  * If no such device is found and `which teuthology-lock` succeeded
    (run via local_action), the node is marked down and its description is
    updated with teuthology-lock; the play is then failed for that host.

 roles/testnode/README.rst               |  5 +++
 roles/testnode/defaults/main.yml        |  3 ++
 roles/testnode/tasks/check-for-nvme.yml | 41 +++++++++++++++++++++++++
 roles/testnode/tasks/main.yml           |  3 ++
 4 files changed, 52 insertions(+)
 create mode 100644 roles/testnode/tasks/check-for-nvme.yml

diff --git a/roles/testnode/README.rst b/roles/testnode/README.rst
index d62a5705..0d1cc327 100644
--- a/roles/testnode/README.rst
+++ b/roles/testnode/README.rst
@@ -225,6 +225,10 @@ Setting ``quick_lvs_to_create`` will:
     # Example would create 4 logical volumes each using 25% of a volume group created using all non-root physical volumes
     quick_lvs_to_create: 4
 
+Define ``check_for_nvme: true`` in Ansible inventory group_vars (by machine type) if the testnode should have an NVMe device. This will include a few tasks to verify an NVMe device is present. If the drive is missing, the tasks will mark the testnode down in the paddles_ lock database so the node doesn't repeatedly fail jobs. Defaults to false::
+
+    check_for_nvme: false
+
 Tags
 ++++
 
@@ -308,3 +312,4 @@ To Do
 .. _teuthology: https://github.com/ceph/teuthology
 .. _ceph-qa-suite: https://github.com/ceph/ceph-qa-suite
 .. _docs: https://docs.ansible.com/ansible/latest/lvol_module.html
+.. _paddles: https://github.com/ceph/paddles
diff --git a/roles/testnode/defaults/main.yml b/roles/testnode/defaults/main.yml
index 518442f6..8714ec21 100644
--- a/roles/testnode/defaults/main.yml
+++ b/roles/testnode/defaults/main.yml
@@ -61,3 +61,6 @@ ntp_servers:
   - 1.us.pool.ntp.org
   - 2.us.pool.ntp.org
   - 3.us.pool.ntp.org
+
+# Set to true in group_vars if the testnode/machine type should have an NVMe device
+check_for_nvme: false
diff --git a/roles/testnode/tasks/check-for-nvme.yml b/roles/testnode/tasks/check-for-nvme.yml
new file mode 100644
index 00000000..40238027
--- /dev/null
+++ b/roles/testnode/tasks/check-for-nvme.yml
@@ -0,0 +1,41 @@
+---
+# NVMe cards have started failing frequently. These tasks will mark a
+# system down in the paddles DB so it doesn't repeatedly fail jobs if the device is missing.
+# https://wiki.sepia.ceph.com/doku.php?id=hardware:smithi&#nvme_failure_tracking
+# These tasks can also be used by a few machine types in Octo
+
+# Default to false
+- set_fact:
+    nvme_card_present: false
+
+- name: Check for NVMe drive
+  set_fact:
+    nvme_card_present: true
+  with_items: "{{ ansible_devices }}"
+  when: "'nvme' in item"
+
+- name: Check for teuthology-lock command
+  local_action: shell which teuthology-lock
+  register: teuthology_lock
+  ignore_errors: true
+  become: false
+
+- name: Mark system down if NVMe card missing
+  local_action: "shell {{ teuthology_lock.stdout }} --update --status down {{ inventory_hostname }}"
+  become: false
+  when:
+    - teuthology_lock.rc == 0
+    - nvme_card_present == false
+
+- name: Update description in paddles lock DB if NVMe card missing
+  local_action: "shell {{ teuthology_lock.stdout }} --update --desc 'Marked down by ceph-cm-ansible due to missing NVMe card {{ ansible_date_time.iso8601 }}' {{ inventory_hostname }}"
+  become: false
+  when:
+    - teuthology_lock.rc == 0
+    - nvme_card_present == false
+
+- name: Fail rest of playbook due to missing NVMe card
+  fail:
+    msg: "Failing rest of playbook due to missing NVMe card"
+  when:
+    - nvme_card_present == false
diff --git a/roles/testnode/tasks/main.yml b/roles/testnode/tasks/main.yml
index 8cfc5c15..1caf54a6 100644
--- a/roles/testnode/tasks/main.yml
+++ b/roles/testnode/tasks/main.yml
@@ -67,6 +67,9 @@
   import_tasks: setup-debian.yml
   when: ansible_distribution == "Debian"
 
+- import_tasks: check-for-nvme.yml
+  when: check_for_nvme == true
+
 - import_tasks: zap_disks.yml
   tags:
     - zap
-- 
2.47.3