]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ansible.git/commitdiff
common: serialise host restart
authorSébastien Han <seb@redhat.com>
Thu, 22 Sep 2016 15:03:14 +0000 (17:03 +0200)
committerAndrew Schoen <aschoen@redhat.com>
Tue, 31 Jan 2017 18:05:09 +0000 (12:05 -0600)
This commit allows us to restart Ceph daemons machine by machine instead
of restarting all the daemons in a single shot.

Rework the structure of the handler for clarity as well.

Signed-off-by: Sébastien Han <seb@redhat.com>
(cherry picked from commit 40a2df5bbf13685c83ddc0b784b97e8a76414006)

Conflicts:
roles/ceph-common/handlers/main.yml
roles/ceph-common/tasks/generate_ceph_conf.yml

group_vars/all.yml.sample
roles/ceph-common/defaults/main.yml
roles/ceph-common/handlers/main.yml
roles/ceph-common/handlers/restart-mds.yml [new file with mode: 0644]
roles/ceph-common/handlers/restart-mon.yml [new file with mode: 0644]
roles/ceph-common/handlers/restart-osd.yml [new file with mode: 0644]
roles/ceph-common/handlers/restart-rgw.yml [new file with mode: 0644]
roles/ceph-common/handlers/validate-mon.yml [new file with mode: 0644]
roles/ceph-common/handlers/validate-osd.yml [new file with mode: 0644]
roles/ceph-common/tasks/generate_ceph_conf.yml

index 82eeeb6555311319a5c5735b827a2ad35b6d721c..9eb23f1cf7b19153f421f203e3e0993c1c0bd923 100644 (file)
@@ -316,6 +316,20 @@ dummy:
 # if you don't want it keep the option commented
 #common_single_host_mode: true
 
+## Handlers - restarting daemons after a config change
+# if for whatever reason the content of your ceph configuration changes
+# ceph daemons will be restarted as well. At the moment, we can not detect
+# which config option changed so all the daemons will be restarted. Although
+# this restart will be serialized for each node, in between a health check
+# will be performed so we make sure we don't move to the next node until
+# ceph is healthy.
+# Obviously between the checks (for monitors to be in quorum and for osd's pgs
+# to be clean) we have to wait. These retries and delays are configurable
+# for both monitors and osds.
+#handler_health_mon_check_retries: 5
+#handler_health_mon_check_delay: 10
+#handler_health_osd_check_retries: 40
+#handler_health_osd_check_delay: 30
 
 ###################
 # CONFIG OVERRIDE #
index 3000dfdcdbd51101f82c7f371427d41ae43ae430..50bd4b3ddd04057ae0395183612bbddcafc52efd 100644 (file)
@@ -308,6 +308,20 @@ restapi_port: 5000
 # if you don't want it keep the option commented
 #common_single_host_mode: true
 
+## Handlers - restarting daemons after a config change
+# if for whatever reason the content of your ceph configuration changes
+# ceph daemons will be restarted as well. At the moment, we can not detect
+# which config option changed so all the daemons will be restarted. Although
+# this restart will be serialized for each node, in between a health check
+# will be performed so we make sure we don't move to the next node until
+# ceph is healthy.
+# Obviously between the checks (for monitors to be in quorum and for osd's pgs
+# to be clean) we have to wait. These retries and delays are configurable
+# for both monitors and osds.
+handler_health_mon_check_retries: 5
+handler_health_mon_check_delay: 10
+handler_health_osd_check_retries: 40
+handler_health_osd_check_delay: 30
 
 ###################
 # CONFIG OVERRIDE #
index c7dfb7647b64bfb6ff1233f2324b907b1a4be754..dd2a0d726086d7ad6109cbb65569a2aa0d22615d 100644 (file)
 - name: update apt cache
   apt:
     update-cache: yes
+  when: ansible_os_family == 'Debian'
 
 - name: restart ceph mons
-  command: service ceph restart mon
-  when:
-    - socket.rc == 0
-    - ansible_distribution != 'Ubuntu'
-    - mon_group_name in group_names
-    - ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis
-
-- name: restart ceph mons with systemd
-  service:
-      name: ceph-mon@{{ monitor_name }}
-      state: restarted
-  when:
-    - socket.rc == 0
-    - use_systemd
-    - mon_group_name in group_names
-    - ceph_release_num.{{ ceph_release }} > ceph_release_num.hammer
-
-- name: restart ceph mons on ubuntu
-  command: initctl restart ceph-mon cluster={{ cluster }} id={{ monitor_name }}
-  when:
-    - socket.rc == 0
-    - ansible_distribution == 'Ubuntu'
-    - not use_systemd
-    - mon_group_name in group_names
+  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-mon.yml"
 
 - name: restart ceph osds
-  command: service ceph restart osd
-  when:
-    - socket.rc == 0
-    - ansible_distribution != 'Ubuntu'
-    - osd_group_name in group_names
-    - ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis
-
-# This does not just restart OSDs but everything else too. Unfortunately
-# at this time the ansible role does not have an OSD id list to use
-# for restarting them specifically.
-- name: restart ceph osds with systemd
-  service:
-    name: ceph.target
-    state: restarted
-  when:
-    - socket.rc == 0
-    - use_systemd
-    - osd_group_name in group_names
-    - ceph_release_num.{{ ceph_release }} > ceph_release_num.hammer
-
-- name: restart ceph osds on ubuntu
-  shell: |
-    for id in $(ls /var/lib/ceph/osd/ |grep -oh '[0-9]*'); do
-      initctl restart ceph-osd cluster={{ cluster }} id=$id
-    done
-  when:
-    - socket.rc == 0
-    - ansible_distribution == 'Ubuntu'
-    - not use_systemd
-    - osd_group_name in group_names
-
-- name: restart ceph mdss on ubuntu
-  command: initctl restart ceph-mds cluster={{ cluster }} id={{ ansible_hostname }}
-  when:
-    - socket.rc == 0
-    - ansible_distribution == 'Ubuntu'
-    - not use_systemd
-    - mds_group_name in group_names
+  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-osd.yml"
 
 - name: restart ceph mdss
-  command: service ceph restart mds
-  when:
-    - socket.rc == 0
-    - ansible_distribution != 'Ubuntu'
-    - use_systemd
-    - mds_group_name in group_names
-    - ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis
-
-- name: restart ceph mdss with systemd
-  service:
-      name: ceph-mds@{{ mds_name }}
-      state: restarted
-  when:
-    - socket.rc == 0
-    - use_systemd
-    - mds_group_name in group_names
-    - ceph_release_num.{{ ceph_release }} > ceph_release_num.hammer
-
-- name: restart ceph rgws on ubuntu
-  command: initctl restart radosgw cluster={{ cluster }} id=rgw.{{ ansible_hostname }}
-  when:
-    - socketrgw.rc == 0
-    - ansible_distribution == 'Ubuntu'
-    - not use_systemd
-    - rgw_group_name in group_names
+  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-mds.yml"
 
 - name: restart ceph rgws
-  command: /etc/init.d/radosgw restart
-  when:
-    - socketrgw.rc == 0
-    - ansible_distribution != 'Ubuntu'
-    - rgw_group_name in group_names
-    - ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis
-
-- name: restart ceph rgws on red hat
-  command: /etc/init.d/ceph-radosgw restart
-  when:
-    - socketrgw.rc == 0
-    - ansible_os_family == 'RedHat'
-    - rgw_group_name in group_names
-    - ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis
-
-- name: restart ceph rgws with systemd
-  service:
-    name: ceph-rgw@{{ ansible_hostname }}
-    state: restarted
-  when:
-    - socketrgw.rc == 0
-    - use_systemd
-    - rgw_group_name in group_names
-    - ceph_release_num.{{ ceph_release }} > ceph_release_num.hammer
+  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-rgw.yml"
 
 - name: restart ceph nfss
   service:
diff --git a/roles/ceph-common/handlers/restart-mds.yml b/roles/ceph-common/handlers/restart-mds.yml
new file mode 100644 (file)
index 0000000..e6ff5ef
--- /dev/null
@@ -0,0 +1,13 @@
+---
+- name: restart ceph mdss
+  service:
+    name: ceph-mds@{{ mds_name }}
+    state: restarted
+  # serial: 1 would be the proper solution here, but that can only be set on play level
+  # upstream issue: https://github.com/ansible/ansible/issues/12170
+  run_once: true
+  with_items: "{{ groups[mds_group_name] }}"
+  delegate_to: "{{ item }}"
+  when:
+    - socket.rc == 0
+    - mds_group_name in group_names
diff --git a/roles/ceph-common/handlers/restart-mon.yml b/roles/ceph-common/handlers/restart-mon.yml
new file mode 100644 (file)
index 0000000..440b7f2
--- /dev/null
@@ -0,0 +1,17 @@
+---
+- name: restart ceph mons
+  service:
+    name: ceph-mon@{{ monitor_name }}
+    state: restarted
+  # serial: 1 would be the proper solution here, but that can only be set on play level
+  # upstream issue: https://github.com/ansible/ansible/issues/12170
+  run_once: true
+  with_items: "{{ groups[mon_group_name] }}"
+  delegate_to: "{{ item }}"
+  when:
+    - socket.rc == 0
+    - mon_group_name in group_names
+
+- name: validate monitors
+  include: validate-mon.yml
+  when: mon_group_name in group_names
diff --git a/roles/ceph-common/handlers/restart-osd.yml b/roles/ceph-common/handlers/restart-osd.yml
new file mode 100644 (file)
index 0000000..dc6fbee
--- /dev/null
@@ -0,0 +1,22 @@
+---
+# This does not just restart OSDs but everything else too. Unfortunately
+# at this time the ansible role does not have an OSD id list to use
+# for restarting them specifically.
+- name: restart ceph osds
+  shell: |
+    for id in $(ls /var/lib/ceph/osd/ |grep -oh '[0-9]*'); do
+      systemctl restart ceph-osd@$id
+      sleep 5
+    done
+  # serial: 1 would be the proper solution here, but that can only be set on play level
+  # upstream issue: https://github.com/ansible/ansible/issues/12170
+  run_once: true
+  with_items: "{{ groups[osd_group_name] }}"
+  delegate_to: "{{ item }}"
+  when:
+    - socket.rc == 0
+    - osd_group_name in group_names
+
+- name: validate osds
+  include: validate-osd.yml
+  when: osd_group_name in group_names
diff --git a/roles/ceph-common/handlers/restart-rgw.yml b/roles/ceph-common/handlers/restart-rgw.yml
new file mode 100644 (file)
index 0000000..5e52e9c
--- /dev/null
@@ -0,0 +1,13 @@
+---
+- name: restart ceph rgws
+  service:
+    name: ceph-rgw@{{ ansible_hostname }}
+    state: restarted
+  # serial: 1 would be the proper solution here, but that can only be set on play level
+  # upstream issue: https://github.com/ansible/ansible/issues/12170
+  run_once: true
+  with_items: "{{ groups[rgw_group_name] }}"
+  delegate_to: "{{ item }}"
+  when:
+    - socketrgw.rc == 0
+    - rgw_group_name in group_names
diff --git a/roles/ceph-common/handlers/validate-mon.yml b/roles/ceph-common/handlers/validate-mon.yml
new file mode 100644 (file)
index 0000000..4c5e15a
--- /dev/null
@@ -0,0 +1,28 @@
+---
+- name: wait for ceph monitor socket
+  wait_for:
+    path: "/var/run/ceph/{{ cluster }}-mon.{{ monitor_name }}.asok"
+
+- name: set mon_host_count
+  set_fact: mon_host_count={{ groups[mon_group_name] | length }}
+
+- name: select a running monitor
+  set_fact: mon_host={{ item }}
+  with_items: "{{ groups[mon_group_name] }}"
+  when:
+    - item != inventory_hostname
+    - mon_host_count | int > 1
+
+- name: select first monitor if only one monitor
+  set_fact: mon_host={{ item }}
+  with_items: "{{ groups[mon_group_name][0] }}"
+  when: mon_host_count | int == 1
+
+- name: waiting for the monitor to join the quorum...
+  shell: |
+    ceph -s  --cluster {{ cluster }} | grep monmap | sed 's/.*quorum//' | egrep -sq {{ ansible_hostname }}
+  register: result
+  until: result.rc == 0
+  retries: "{{ handler_health_mon_check_retries }}"
+  delay: "{{ handler_health_mon_check_delay }}"
+  delegate_to: "{{ mon_host }}"
diff --git a/roles/ceph-common/handlers/validate-osd.yml b/roles/ceph-common/handlers/validate-osd.yml
new file mode 100644 (file)
index 0000000..b83d095
--- /dev/null
@@ -0,0 +1,19 @@
+---
+- name: collect osds
+  shell: |
+    ls /var/lib/ceph/osd/ |grep -oh '[0-9]*'
+  register: osd_ids
+
+- name: wait for ceph osd socket(s)
+  wait_for:
+    path: "/var/run/ceph/{{ cluster }}-osd.{{ item }}.asok"
+  with_items: "{{ osd_ids.stdout_lines }}"
+
+- name: waiting for clean pgs...
+  shell: |
+    test "$(ceph --cluster {{ cluster }} pg stat | sed 's/^.*pgs://;s/active+clean.*//;s/ //')" -eq "$(ceph --cluster {{ cluster }} pg stat | sed 's/pgs.*//;s/^.*://;s/ //')" && ceph --cluster {{ cluster }} health | egrep -sq "HEALTH_OK|HEALTH_WARN"
+  register: result
+  until: result.rc == 0
+  retries: "{{ handler_health_osd_check_retries }}"
+  delay: "{{ handler_health_osd_check_delay }}"
+  delegate_to: "{{ groups[mon_group_name][0] }}"
index 59fa03c17992cf1c39432093eb53a16e72ab93fc..dc929c7367e9048f81b55135c4521177b7dfb63a 100644 (file)
     config_type: ini
   notify:
     - restart ceph mons
-    - restart ceph mons on ubuntu
-    - restart ceph mons with systemd
     - restart ceph osds
-    - restart ceph osds on ubuntu
-    - restart ceph osds with systemd
     - restart ceph mdss
-    - restart ceph mdss on ubuntu
-    - restart ceph mdss with systemd
     - restart ceph rgws
-    - restart ceph rgws on ubuntu
-    - restart ceph rgws on red hat
-    - restart ceph rgws with systemd
     - restart ceph nfss