git.apps.os.sepia.ceph.com Git - ceph-ansible.git/commitdiff
Common: Fix handlers that are not properly triggered. 1412/head
author    Guillaume Abrioux <gabrioux@redhat.com>
          Mon, 3 Apr 2017 17:55:11 +0000 (19:55 +0200)
committer Guillaume Abrioux <gabrioux@redhat.com>
          Thu, 6 Apr 2017 14:19:58 +0000 (16:19 +0200)
Until now, only the first task was executed.
The idea here is to use the `listen` statement to be able to notify multiple
handlers, and to regroup all of them in `./handlers/main.yml`, since notifying an
included handler task is not possible.

Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com>
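
For reference, a minimal sketch of the `listen` mechanism this change relies on: several handlers subscribe to the same notification topic, so a single `notify` fires all of them in order. The task names, topic and paths below are illustrative only, not taken from the role itself.

# handlers/main.yml -- every handler listening on the topic runs when it is notified
- name: copy restart script
  template:
    src: restart_daemon.sh.j2
    dest: /tmp/restart_daemon.sh
    mode: 0750
  listen: "restart ceph daemons"

- name: run restart script
  command: /tmp/restart_daemon.sh
  listen: "restart ceph daemons"

# tasks/main.yml -- one notify triggers both handlers above
- name: push ceph configuration
  template:
    src: ceph.conf.j2
    dest: /etc/ceph/ceph.conf
  notify: "restart ceph daemons"
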
roles/ceph-common/handlers/main.yml
roles/ceph-common/handlers/restart-mds.yml [deleted file]
roles/ceph-common/handlers/restart-mon.yml [deleted file]
roles/ceph-common/handlers/restart-osd.yml [deleted file]
roles/ceph-common/handlers/restart-rgw.yml [deleted file]
roles/ceph-common/handlers/validate-mon.yml [deleted file]
roles/ceph-common/handlers/validate-osd.yml [deleted file]
roles/ceph-common/tasks/checks/check_socket.yml [deleted file]
roles/ceph-common/tasks/main.yml
roles/ceph-common/templates/restart_mon_daemon.sh.j2 [new file with mode: 0644]
roles/ceph-common/templates/restart_osd_daemon.sh.j2 [new file with mode: 0644]

diff --git a/roles/ceph-common/handlers/main.yml b/roles/ceph-common/handlers/main.yml
index 8cb6bd301680bd060b61e3493e70cb3f7f754612..9602a0ce54036c13d634192a7e02b90df3e49c54 100644 (file)
@@ -4,17 +4,67 @@
     update-cache: yes
   when: ansible_os_family == 'Debian'
 
-- name: restart ceph mons
-  include: "./restart-mon.yml"
+- block:
+  - name: copy mon restart script
+    template:
+      src: restart_mon_daemon.sh.j2
+      dest: /tmp/restart_mon_daemon.sh
+      owner: root
+      group: root
+      mode: 0750
+    listen: "restart ceph mons"
 
-- name: restart ceph osds
-  include: "./restart-osd.yml"
+  - name: restart ceph mon daemon(s)
+    command: /tmp/restart_mon_daemon.sh
+    listen: "restart ceph mons"
+
+  when:
+    - mon_group_name in group_names
+
+# This does not just restart OSDs but everything else too. Unfortunately
+# at this time the ansible role does not have an OSD id list to use
+# for restarting them specifically.
+- block:
+  - name: copy osd restart script
+    template:
+      src: restart_osd_daemon.sh.j2
+      dest: /tmp/restart_osd_daemon.sh
+      owner: root
+      group: root
+      mode: 0750
+    listen: "restart ceph osds"
+
+  - name: restart ceph osds daemon(s)
+    command: /tmp/restart_osd_daemon.sh
+    listen: "restart ceph osds"
+    when:
+      - handler_health_osd_check
+  when:
+    - osd_group_name in group_names
 
 - name: restart ceph mdss
-  include: "./restart-mds.yml"
+  service:
+    name: ceph-mds@{{ mds_name }}
+    state: restarted
+  # serial: 1 would be the proper solution here, but that can only be set on play level
+  # upstream issue: https://github.com/ansible/ansible/issues/12170
+  run_once: true
+  with_items: "{{ groups.get(mds_group_name, []) }}"
+  delegate_to: "{{ item }}"
+  when:
+    - mds_group_name in group_names
 
 - name: restart ceph rgws
-  include: "./restart-rgw.yml"
+  service:
+    name: ceph-radosgw@rgw.{{ ansible_hostname }}
+    state: restarted
+  # serial: 1 would be the proper solution here, but that can only be set on play level
+  # upstream issue: https://github.com/ansible/ansible/issues/12170
+  run_once: true
+  with_items: "{{ groups.get(rgw_group_name, []) }}"
+  delegate_to: "{{ item }}"
+  when:
+    - rgw_group_name in group_names
 
 - name: restart ceph nfss
   service:
diff --git a/roles/ceph-common/handlers/restart-mds.yml b/roles/ceph-common/handlers/restart-mds.yml
deleted file mode 100644 (file)
index 142043f..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
----
-- name: restart ceph mdss
-  service:
-    name: ceph-mds@{{ mds_name }}
-    state: restarted
-  # serial: 1 would be the proper solution here, but that can only be set on play level
-  # upstream issue: https://github.com/ansible/ansible/issues/12170
-  run_once: true
-  with_items: "{{ groups.get(mds_group_name, []) }}"
-  delegate_to: "{{ item }}"
-  when:
-    - socket.rc == 0
-    - mds_group_name in group_names
diff --git a/roles/ceph-common/handlers/restart-mon.yml b/roles/ceph-common/handlers/restart-mon.yml
deleted file mode 100644 (file)
index 6776bd4..0000000
+++ /dev/null
@@ -1,17 +0,0 @@
----
-- name: restart ceph mons
-  service:
-    name: ceph-mon@{{ monitor_name }}
-    state: restarted
-  # serial: 1 would be the proper solution here, but that can only be set on play level
-  # upstream issue: https://github.com/ansible/ansible/issues/12170
-  run_once: true
-  with_items: "{{ groups.get(mon_group_name, []) }}"
-  delegate_to: "{{ item }}"
-  when:
-    - socket.rc == 0
-    - mon_group_name in group_names
-
-- name: validate monitors
-  include: validate-mon.yml
-  when: mon_group_name in group_names
diff --git a/roles/ceph-common/handlers/restart-osd.yml b/roles/ceph-common/handlers/restart-osd.yml
deleted file mode 100644 (file)
index 93641f9..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
----
-# This does not just restart OSDs but everything else too. Unfortunately
-# at this time the ansible role does not have an OSD id list to use
-# for restarting them specifically.
-- name: restart ceph osds
-  shell: |
-    for id in $(ls /var/lib/ceph/osd/ | sed 's/.*-//'); do
-      systemctl restart ceph-osd@$id
-      sleep 5
-    done
-  # serial: 1 would be the proper solution here, but that can only be set on play level
-  # upstream issue: https://github.com/ansible/ansible/issues/12170
-  run_once: true
-  with_items: "{{ groups.get(osd_group_name, []) }}"
-  delegate_to: "{{ item }}"
-  when:
-    - socket.rc == 0
-    - osd_group_name in group_names
-
-- name: validate osds
-  include: validate-osd.yml
-  when: osd_group_name in group_names
diff --git a/roles/ceph-common/handlers/restart-rgw.yml b/roles/ceph-common/handlers/restart-rgw.yml
deleted file mode 100644 (file)
index 479ac31..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
----
-- name: restart ceph rgws
-  service:
-    name: ceph-rgw@{{ ansible_hostname }}
-    state: restarted
-  # serial: 1 would be the proper solution here, but that can only be set on play level
-  # upstream issue: https://github.com/ansible/ansible/issues/12170
-  run_once: true
-  with_items: "{{ groups.get(rgw_group_name, []) }}"
-  delegate_to: "{{ item }}"
-  when:
-    - socketrgw.rc == 0
-    - rgw_group_name in group_names
diff --git a/roles/ceph-common/handlers/validate-mon.yml b/roles/ceph-common/handlers/validate-mon.yml
deleted file mode 100644 (file)
index 4c5e15a..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
----
-- name: wait for ceph monitor socket
-  wait_for:
-    path: "/var/run/ceph/{{ cluster }}-mon.{{ monitor_name }}.asok"
-
-- name: set mon_host_count
-  set_fact: mon_host_count={{ groups[mon_group_name] | length }}
-
-- name: select a running monitor
-  set_fact: mon_host={{ item }}
-  with_items: "{{ groups[mon_group_name] }}"
-  when:
-    - item != inventory_hostname
-    - mon_host_count | int > 1
-
-- name: select first monitor if only one monitor
-  set_fact: mon_host={{ item }}
-  with_items: "{{ groups[mon_group_name][0] }}"
-  when: mon_host_count | int == 1
-
-- name: waiting for the monitor to join the quorum...
-  shell: |
-    ceph -s  --cluster {{ cluster }} | grep monmap | sed 's/.*quorum//' | egrep -sq {{ ansible_hostname }}
-  register: result
-  until: result.rc == 0
-  retries: "{{ handler_health_mon_check_retries }}"
-  delay: "{{ handler_health_mon_check_delay }}"
-  delegate_to: "{{ mon_host }}"
diff --git a/roles/ceph-common/handlers/validate-osd.yml b/roles/ceph-common/handlers/validate-osd.yml
deleted file mode 100644 (file)
index aefe1b9..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
----
-- name: collect osds
-  shell: |
-    ls /var/lib/ceph/osd/ | sed 's/.*-//'
-  register: osd_ids
-
-- name: wait for ceph osd socket(s)
-  wait_for:
-    path: "/var/run/ceph/{{ cluster }}-osd.{{ item }}.asok"
-  with_items: "{{ osd_ids.stdout_lines }}"
-
-- name: waiting for clean pgs...
-  shell: |
-    test "$(ceph --cluster {{ cluster }} pg stat | sed 's/^.*pgs://;s/active+clean.*//;s/ //')" -eq "$(ceph --cluster {{ cluster }} pg stat | sed 's/pgs.*//;s/^.*://;s/ //')" && ceph --cluster {{ cluster }} health | egrep -sq "HEALTH_OK|HEALTH_WARN"
-  register: result
-  until: result.rc == 0
-  retries: "{{ handler_health_osd_check_retries }}"
-  delay: "{{ handler_health_osd_check_delay }}"
-  delegate_to: "{{ groups[mon_group_name][0] }}"
-  when: handler_health_osd_check
diff --git a/roles/ceph-common/tasks/checks/check_socket.yml b/roles/ceph-common/tasks/checks/check_socket.yml
deleted file mode 100644 (file)
index 674d34c..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
----
-- name: check for a ceph socket
-  shell: "stat /var/run/ceph/*.asok > /dev/null 2>&1"
-  changed_when: false
-  failed_when: false
-  always_run: true
-  register: socket
-
-- name: check for a rados gateway socket
-  shell: "stat {{ rbd_client_admin_socket_path }}*.asok > /dev/null 2>&1"
-  changed_when: false
-  failed_when: false
-  always_run: true
-  register: socketrgw
diff --git a/roles/ceph-common/tasks/main.yml b/roles/ceph-common/tasks/main.yml
index 217d48757ef09fa41d49bcad7d8d9c40066e9d5f..360c21232c86ba535dffb100999dbc724091fe8f 100644 (file)
@@ -87,7 +87,6 @@
   static: False
 
 - include: facts.yml
-- include: ./checks/check_socket.yml
 - include: create_ceph_initial_dirs.yml
 - include: generate_cluster_fsid.yml
 - include: generate_ceph_conf.yml
diff --git a/roles/ceph-common/templates/restart_mon_daemon.sh.j2 b/roles/ceph-common/templates/restart_mon_daemon.sh.j2
new file mode 100644 (file)
index 0000000..d918b01
--- /dev/null
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+RETRIES="{{ handler_health_mon_check_retries }}"
+DELAY="{{ handler_health_mon_check_delay }}"
+MONITOR_NAME="{{ monitor_name }}"
+CLUSTER="{{ cluster }}"
+SOCKET=/var/run/ceph/${CLUSTER}-mon.${MONITOR_NAME}.asok
+
+
+check_quorum() {
+while [ $RETRIES -ne 0 ]; do
+  MEMBERS=$(ceph --cluster ${CLUSTER} -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
+  test "${MEMBERS/$MONITOR_NAME}" != "$MEMBERS" && exit 0
+  sleep $DELAY
+  let RETRIES=RETRIES-1
+done
+# If we reach this point, it means there is a problem with the quorum
+exit 1
+}
+
+# First, restart the daemon
+systemctl restart ceph-mon@${MONITOR_NAME}
+
+COUNT=10
+# Wait and ensure the socket exists after restarting the daemon
+while [ $COUNT -ne 0 ]; do
+  test -S $SOCKET && check_quorum
+  sleep 1
+  let COUNT=COUNT-1
+done
+# If we reach this point, it means the socket is not present.
+echo "Error while restarting mon daemon"
+exit 1
diff --git a/roles/ceph-common/templates/restart_osd_daemon.sh.j2 b/roles/ceph-common/templates/restart_osd_daemon.sh.j2
new file mode 100644 (file)
index 0000000..8b0b7d1
--- /dev/null
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+RETRIES="{{ handler_health_osd_check_retries }}"
+DELAY="{{ handler_health_osd_check_delay }}"
+CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}"
+
+check_pgs() {
+  while [ $RETRIES -ne 0 ]; do
+    ceph $CEPH_CLI -s | grep -sq 'active+clean'
+    RET=$?
+    test $RET -eq 0 && exit 0
+    sleep $DELAY
+    let RETRIES=RETRIES-1
+  done
+  # PGs not clean, exiting with return code 1
+  echo "Error with PGs, check config"
+  exit 1
+}
+
+
+for id in $(ls /var/lib/ceph/osd/ | sed 's/.*-//'); do
+  # First, restart daemon(s)
+  systemctl restart ceph-osd@${id}
+  # We need to wait because it may take some time for the socket to actually exist
+  COUNT=10
+  # Wait and ensure the socket exists after restarting the daemon
+  SOCKET=/var/run/ceph/{{ cluster }}-osd.${id}.asok
+  while [ $COUNT -ne 0 ]; do
+    test -S $SOCKET && check_pgs
+    sleep 1
+    let COUNT=COUNT-1
+  done
+  # If we reach this point, it means the socket is not present.
+  echo "Error while restarting mon daemon"
+  exit 1
+done