Until now, only the first notified handler task was executed.
The idea here is to use the `listen` statement so that a single
notification can trigger multiple handlers, and to regroup all of them
in `./handlers/main.yml`, since notifying an included handler task is
not possible.
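
For example (a minimal sketch; the notifying task below is hypothetical,
the handler names come from this patch), one notification now fans out
to every handler that listens on the same topic:

    # tasks/main.yml
    - name: update ceph.conf
      template:
        src: ceph.conf.j2
        dest: /etc/ceph/{{ cluster }}.conf
      notify: "restart ceph mons"

    # handlers/main.yml -- both tasks run on that single notification
    - name: copy mon restart script
      template:
        src: restart_mon_daemon.sh.j2
        dest: /tmp/restart_mon_daemon.sh
      listen: "restart ceph mons"

    - name: restart ceph mon daemon(s)
      command: /tmp/restart_mon_daemon.sh
      listen: "restart ceph mons"
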
Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com>
     update-cache: yes
   when: ansible_os_family == 'Debian'
-- name: restart ceph mons
-  include: "./restart-mon.yml"
+- block:
+  - name: copy mon restart script
+    template:
+      src: restart_mon_daemon.sh.j2
+      dest: /tmp/restart_mon_daemon.sh
+      owner: root
+      group: root
+      mode: 0750
+    listen: "restart ceph mons"
-- name: restart ceph osds
-  include: "./restart-osd.yml"
+  - name: restart ceph mon daemon(s)
+    command: /tmp/restart_mon_daemon.sh
+    listen: "restart ceph mons"
+
+  when:
+    - mon_group_name in group_names
+
+# This does not just restart OSDs but everything else too. Unfortunately
+# at this time the ansible role does not have an OSD id list to use
+# for restarting them specifically.
+- block:
+  - name: copy osd restart script
+    template:
+      src: restart_osd_daemon.sh.j2
+      dest: /tmp/restart_osd_daemon.sh
+      owner: root
+      group: root
+      mode: 0750
+    listen: "restart ceph osds"
+
+  - name: restart ceph osd daemon(s)
+    command: /tmp/restart_osd_daemon.sh
+    listen: "restart ceph osds"
+    when:
+      - handler_health_osd_check
+
+  when:
+    - osd_group_name in group_names
 - name: restart ceph mdss
-  include: "./restart-mds.yml"
+  service:
+    name: ceph-mds@{{ mds_name }}
+    state: restarted
+  # serial: 1 would be the proper solution here, but that can only be set on play level
+  # upstream issue: https://github.com/ansible/ansible/issues/12170
+  run_once: true
+  with_items: "{{ groups.get(mds_group_name, []) }}"
+  delegate_to: "{{ item }}"
+  when:
+    - mds_group_name in group_names
 - name: restart ceph rgws
-  include: "./restart-rgw.yml"
+  service:
+    name: ceph-radosgw@rgw.{{ ansible_hostname }}
+    state: restarted
+  # serial: 1 would be the proper solution here, but that can only be set on play level
+  # upstream issue: https://github.com/ansible/ansible/issues/12170
+  run_once: true
+  with_items: "{{ groups.get(rgw_group_name, []) }}"
+  delegate_to: "{{ item }}"
+  when:
+    - rgw_group_name in group_names
 - name: restart ceph nfss
   service:
+++ /dev/null
----
-- name: restart ceph mdss
-  service:
-    name: ceph-mds@{{ mds_name }}
-    state: restarted
-  # serial: 1 would be the proper solution here, but that can only be set on play level
-  # upstream issue: https://github.com/ansible/ansible/issues/12170
-  run_once: true
-  with_items: "{{ groups.get(mds_group_name, []) }}"
-  delegate_to: "{{ item }}"
-  when:
-    - socket.rc == 0
-    - mds_group_name in group_names
+++ /dev/null
----
-- name: restart ceph mons
-  service:
-    name: ceph-mon@{{ monitor_name }}
-    state: restarted
-  # serial: 1 would be the proper solution here, but that can only be set on play level
-  # upstream issue: https://github.com/ansible/ansible/issues/12170
-  run_once: true
-  with_items: "{{ groups.get(mon_group_name, []) }}"
-  delegate_to: "{{ item }}"
-  when:
-    - socket.rc == 0
-    - mon_group_name in group_names
-
-- name: validate monitors
-  include: validate-mon.yml
-  when: mon_group_name in group_names
+++ /dev/null
----
-# This does not just restart OSDs but everything else too. Unfortunately
-# at this time the ansible role does not have an OSD id list to use
-# for restarting them specifically.
-- name: restart ceph osds
-  shell: |
-    for id in $(ls /var/lib/ceph/osd/ | sed 's/.*-//'); do
-      systemctl restart ceph-osd@$id
-      sleep 5
-    done
-  # serial: 1 would be the proper solution here, but that can only be set on play level
-  # upstream issue: https://github.com/ansible/ansible/issues/12170
-  run_once: true
-  with_items: "{{ groups.get(osd_group_name, []) }}"
-  delegate_to: "{{ item }}"
-  when:
-    - socket.rc == 0
-    - osd_group_name in group_names
-
-- name: validate osds
-  include: validate-osd.yml
-  when: osd_group_name in group_names
+++ /dev/null
----
-- name: restart ceph rgws
-  service:
-    name: ceph-rgw@{{ ansible_hostname }}
-    state: restarted
-  # serial: 1 would be the proper solution here, but that can only be set on play level
-  # upstream issue: https://github.com/ansible/ansible/issues/12170
-  run_once: true
-  with_items: "{{ groups.get(rgw_group_name, []) }}"
-  delegate_to: "{{ item }}"
-  when:
-    - socketrgw.rc == 0
-    - rgw_group_name in group_names
+++ /dev/null
----
-- name: wait for ceph monitor socket
-  wait_for:
-    path: "/var/run/ceph/{{ cluster }}-mon.{{ monitor_name }}.asok"
-
-- name: set mon_host_count
-  set_fact: mon_host_count={{ groups[mon_group_name] | length }}
-
-- name: select a running monitor
-  set_fact: mon_host={{ item }}
-  with_items: "{{ groups[mon_group_name] }}"
-  when:
-    - item != inventory_hostname
-    - mon_host_count | int > 1
-
-- name: select first monitor if only one monitor
-  set_fact: mon_host={{ item }}
-  with_items: "{{ groups[mon_group_name][0] }}"
-  when: mon_host_count | int == 1
-
-- name: waiting for the monitor to join the quorum...
-  shell: |
-    ceph -s --cluster {{ cluster }} | grep monmap | sed 's/.*quorum//' | egrep -sq {{ ansible_hostname }}
-  register: result
-  until: result.rc == 0
-  retries: "{{ handler_health_mon_check_retries }}"
-  delay: "{{ handler_health_mon_check_delay }}"
-  delegate_to: "{{ mon_host }}"
+++ /dev/null
----
-- name: collect osds
-  shell: |
-    ls /var/lib/ceph/osd/ | sed 's/.*-//'
-  register: osd_ids
-
-- name: wait for ceph osd socket(s)
-  wait_for:
-    path: "/var/run/ceph/{{ cluster }}-osd.{{ item }}.asok"
-  with_items: "{{ osd_ids.stdout_lines }}"
-
-- name: waiting for clean pgs...
-  shell: |
-    test "$(ceph --cluster {{ cluster }} pg stat | sed 's/^.*pgs://;s/active+clean.*//;s/ //')" -eq "$(ceph --cluster {{ cluster }} pg stat | sed 's/pgs.*//;s/^.*://;s/ //')" && ceph --cluster {{ cluster }} health | egrep -sq "HEALTH_OK|HEALTH_WARN"
-  register: result
-  until: result.rc == 0
-  retries: "{{ handler_health_osd_check_retries }}"
-  delay: "{{ handler_health_osd_check_delay }}"
-  delegate_to: "{{ groups[mon_group_name][0] }}"
-  when: handler_health_osd_check
+++ /dev/null
----
-- name: check for a ceph socket
-  shell: "stat /var/run/ceph/*.asok > /dev/null 2>&1"
-  changed_when: false
-  failed_when: false
-  always_run: true
-  register: socket
-
-- name: check for a rados gateway socket
-  shell: "stat {{ rbd_client_admin_socket_path }}*.asok > /dev/null 2>&1"
-  changed_when: false
-  failed_when: false
-  always_run: true
-  register: socketrgw
   static: False
 - include: facts.yml
-- include: ./checks/check_socket.yml
 - include: create_ceph_initial_dirs.yml
 - include: generate_cluster_fsid.yml
 - include: generate_ceph_conf.yml
--- /dev/null
+#!/bin/bash
+
+RETRIES="{{ handler_health_mon_check_retries }}"
+DELAY="{{ handler_health_mon_check_delay }}"
+MONITOR_NAME="{{ monitor_name }}"
+CLUSTER="{{ cluster }}"
+SOCKET=/var/run/ceph/${CLUSTER}-mon.${MONITOR_NAME}.asok
+
+
+check_quorum() {
+while [ $RETRIES -ne 0 ]; do
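+  # Pull the "quorum_names" array out of the JSON status output; if removing
+  # $MONITOR_NAME from it changes the string, this monitor is in the quorum.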
+  MEMBERS=$(ceph --cluster ${CLUSTER} -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
+  test "${MEMBERS/$MONITOR_NAME}" != "$MEMBERS" && exit 0
+  sleep $DELAY
+  let RETRIES=RETRIES-1
+done
+# If we reach this point, it means there is a problem with the quorum
+exit 1
+}
+
+# First, restart the daemon
+systemctl restart ceph-mon@${MONITOR_NAME}
+
+COUNT=10
+# Wait and ensure the socket exists after restarting the daemon
+while [ $COUNT -ne 0 ]; do
+  test -S $SOCKET && check_quorum
+  sleep 1
+  let COUNT=COUNT-1
+done
+# If we reach this point, it means the socket is not present.
+echo "Error while restarting mon daemon"
+exit 1
--- /dev/null
+#!/bin/bash
+
+RETRIES="{{ handler_health_osd_check_retries }}"
+DELAY="{{ handler_health_osd_check_delay }}"
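+# Query the cluster with the bootstrap-osd keyring, which is present on OSD nodes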
+CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}"
+
+check_pgs() {
+  while [ $RETRIES -ne 0 ]; do
+    ceph $CEPH_CLI -s | grep -sq 'active+clean'
+    RET=$?
+    # return instead of exit so the caller can move on to the next OSD
+    test $RET -eq 0 && return 0
+    sleep $DELAY
+    let RETRIES=RETRIES-1
+  done
+  # PGs not clean, exiting with return code 1
+  echo "Error with PGs, check config"
+  exit 1
+}
+
+
+for id in $(ls /var/lib/ceph/osd/ | sed 's/.*-//'); do
+  # First, restart daemon(s)
+  systemctl restart ceph-osd@${id}
+  # We need to wait because it may take some time for the socket to actually exist
+  COUNT=10
+  # Wait and ensure the socket exists after restarting the daemon,
+  # then check the PGs before moving on to the next OSD
+  SOCKET=/var/run/ceph/{{ cluster }}-osd.${id}.asok
+  while [ $COUNT -ne 0 ]; do
+    test -S $SOCKET && check_pgs && continue 2
+    sleep 1
+    let COUNT=COUNT-1
+  done
+  # If we reach this point, it means the socket is not present.
+  echo "Error while restarting osd daemon ${id}"
+  exit 1
+done