--- /dev/null
+---
+# This playbook is used to recover Ceph OSDs after an SSD journal failure.
+# You will also realise that it's really simple to bring your
+# OSDs back to life after replacing your faulty SSD with a new one.
+#
+# You must define the `dev_ssds` variable for each host whose SSDs were
+# replaced after the failure.
+#
+# For example, in host_vars/hostname1.yml:
+#
+# dev_ssds:
+#   - device_name: sdd
+#     partitions:
+#       - index: 1
+#         size: 10G
+#         osd_id: 0
+#       - index: 2
+#         size: 10G
+#         osd_id: 1
+#   - device_name: sdf
+#     partitions:
+#       - index: 1
+#         size: 10G
+#         osd_id: 2
+#
+# @param device_name: The device name of the new SSD
+# @param partitions:  The custom partition layout for the new SSD
+# @param index:       The index of this partition
+# @param size:        The size of this partition
+# @param osd_id:      The OSD whose journal this partition is for
+#
+# ansible-playbook recover-osds-after-ssd-journal-failure.yml
+#     Prompts for the host to recover. The default is null, which selects
+#     no host. Enter the hostname of the node whose OSDs need to be
+#     recovered after the SSD journal failure.
+#
+# ansible-playbook -e target_host=hostname \
+#     recover-osds-after-ssd-journal-failure.yml
+#     Overrides the prompt with the -e option. Useful in automation
+#     scripts to avoid the interactive prompt.
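+#
+# After the playbook finishes, you can check (for example with `ceph osd tree`
+# or `ceph -s` on a monitor node) that the recovered OSDs are up again.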
+
+- hosts: localhost
+  gather_facts: no
+  vars_prompt:
+    - name: target_host
+      prompt: please enter the target hostname on which to recover OSDs after the SSD journal failure
+      private: no
+  tasks:
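+    # Add the user-selected host to a transient inventory group so that the
+    # second play below can target it.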
+    - add_host:
+        name: "{{ target_host }}"
+        groups: dynamically_created_hosts
+
+- hosts: dynamically_created_hosts
+  vars:
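+    # Partition type GUID that identifies a partition as a Ceph journal
+    # (the same typecode ceph-disk uses when it creates journal partitions).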
+    journal_typecode: 45b0969e-9b03-4f30-b4c6-b4b80ceff106
+    dev_ssds: []
+
+  tasks:
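+    # Load the per-host dev_ssds definition from host_vars if the file exists;
+    # otherwise keep the empty default, which makes every task below a no-op.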
+    - name: load a variable file for dev_ssds
+      include_vars: "{{ item }}"
+      with_first_found:
+        - files:
+            - "host_vars/{{ ansible_hostname }}.yml"
+          skip: true
+
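+    # Each OSD stores the UUID of its journal partition in the journal_uuid
+    # file of its data directory; make sure that file exists for every OSD
+    # listed in dev_ssds.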
+    - name: stat osd(s) journal_uuid file
+      stat:
+        path: "/var/lib/ceph/osd/ceph-{{ item.1.osd_id }}/journal_uuid"
+      register: osds_dir_stat
+      with_subelements:
+        - "{{ dev_ssds }}"
+        - partitions
+      when: dev_ssds is defined
+
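+    # Abort early if any declared OSD's journal_uuid file is missing,
+    # i.e. the OSD does not live on this host.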
+    - name: exit playbook if osd(s) are not on this host
+      fail:
+        msg: exit playbook, osd(s) are not on this host
+      with_items:
+        - "{{ osds_dir_stat.results }}"
+      when:
+        - osds_dir_stat is defined
+        - not item.stat.exists
+
+    - name: install sgdisk (gdisk)
+      package:
+        name: gdisk
+        state: present
+      when: dev_ssds is defined
+
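+    # Read back each OSD's original journal partition UUID so the new
+    # partitions can be created with exactly the same GUIDs.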
+    - name: get osd(s) journal uuid
+      shell: cat "/var/lib/ceph/osd/ceph-{{ item.1.osd_id }}/journal_uuid"
+      register: osds_uuid
+      with_subelements:
+        - "{{ dev_ssds }}"
+        - partitions
+      when: dev_ssds is defined
+
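+    # Recreate the journal partitions on the replacement SSD, reusing the
+    # saved partition GUIDs so the existing journal symlinks in
+    # /var/lib/ceph/osd/ceph-*/ still point at the right partitions.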
+    - name: make partitions on new ssd
+      shell: >
+        sgdisk --new={{ item.item[1].index }}:0:+{{ item.item[1].size }}
+        "--change-name={{ item.item[1].index }}:ceph journal"
+        --typecode={{ item.item[1].index }}:{{ journal_typecode }}
+        --partition-guid={{ item.item[1].index }}:{{ item.stdout }}
+        --mbrtogpt -- /dev/{{ item.item[0].device_name }}
+      with_items:
+        - "{{ osds_uuid.results }}"
+      when: dev_ssds is defined
+
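+    # Stop the affected OSD daemons before rewriting their journals.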
+    - name: stop osd(s) service
+      service:
+        name: "ceph-osd@{{ item.item[1].osd_id }}"
+        state: stopped
+        enabled: no
+      with_items:
+        - "{{ osds_uuid.results }}"
+      when: dev_ssds is defined
+
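+    # ceph-osd --mkjournal writes a fresh, empty journal on the new partition
+    # for each affected OSD.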
+    - name: reinitialize osd(s) journal in new ssd
+      shell: ceph-osd -i {{ item.item[1].osd_id }} --mkjournal
+      with_items:
+        - "{{ osds_uuid.results }}"
+      when: dev_ssds is defined
+
+    - name: start osd(s) service
+      service:
+        name: "ceph-osd@{{ item.item[1].osd_id }}"
+        state: started
+        enabled: yes
+      with_items:
+        - "{{ osds_uuid.results }}"
+      when: dev_ssds is defined