From: Sébastien Han Date: Thu, 22 Sep 2016 15:03:14 +0000 (+0200) Subject: common: serialise host restart X-Git-Tag: v2.2.10~44 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=fca383cd88efe162e39d95353eb5fc99b8895f2c;p=ceph-ansible.git common: serialise host restart This commit allows us to restart Ceph daemons machine by machine instead of restarting all the daemons in a single shot. Rework the structure of the handler for clarity as well. Signed-off-by: Sébastien Han --- diff --git a/defaults/main.yml b/defaults/main.yml index 6190c49cc..43515bd13 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -331,6 +331,20 @@ restapi_port: 5000 # if you don't want it keep the option commented #common_single_host_mode: true +## Handlers - restarting daemons after a config change +# if for whatever reason the content of your ceph configuration changes +# ceph daemons will be restarted as well. At the moment, we cannot detect +# which config option changed so all the daemons will be restarted. Although +# this restart will be serialized for each node, in between a health check +# will be performed so we make sure we don't move to the next node until +# ceph is healthy +# Obviously between the checks (for monitors to be in quorum and for osd's pgs +# to be clean) we have to wait. These retries and delays can be configurable +# for both monitors and osds. 
+handler_health_mon_check_retries: 5 +handler_health_mon_check_delay: 10 +handler_health_osd_check_retries: 40 +handler_health_osd_check_delay: 30 ################### # CONFIG OVERRIDE # diff --git a/handlers/main.yml b/handlers/main.yml index 6a0a54c33..dd2a0d726 100644 --- a/handlers/main.yml +++ b/handlers/main.yml @@ -2,41 +2,19 @@ - name: update apt cache apt: update-cache: yes + when: ansible_os_family == 'Debian' - name: restart ceph mons - service: - name: ceph-mon@{{ monitor_name }} - state: restarted - when: - - socket.rc == 0 - - mon_group_name in group_names + include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-mon.yml" -# This does not just restart OSDs but everything else too. Unfortunately -# at this time the ansible role does not have an OSD id list to use -# for restarting them specifically. - name: restart ceph osds - service: - name: ceph.target - state: restarted - when: - - socket.rc == 0 - - osd_group_name in group_names + include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-osd.yml" - name: restart ceph mdss - service: - name: ceph-mds@{{ mds_name }} - state: restarted - when: - - socket.rc == 0 - - mds_group_name in group_names + include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-mds.yml" - name: restart ceph rgws - service: - name: ceph-rgw@{{ ansible_hostname }} - state: restarted - when: - - socketrgw.rc == 0 - - rgw_group_name in group_names + include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-rgw.yml" - name: restart ceph nfss service: diff --git a/handlers/restart-mds.yml b/handlers/restart-mds.yml new file mode 100644 index 000000000..e6ff5ef4e --- /dev/null +++ b/handlers/restart-mds.yml @@ -0,0 +1,13 @@ +--- +- name: restart ceph mdss + service: + name: ceph-mds@{{ mds_name }} + state: restarted + # serial: 1 would be the proper solution here, but that can only be set on play level + # upstream issue: https://github.com/ansible/ansible/issues/12170 + run_once: true + with_items: "{{ 
groups[mds_group_name] }}" + delegate_to: "{{ item }}" + when: + - socket.rc == 0 + - mds_group_name in group_names diff --git a/handlers/restart-mon.yml b/handlers/restart-mon.yml new file mode 100644 index 000000000..440b7f219 --- /dev/null +++ b/handlers/restart-mon.yml @@ -0,0 +1,17 @@ +--- +- name: restart ceph mons + service: + name: ceph-mon@{{ monitor_name }} + state: restarted + # serial: 1 would be the proper solution here, but that can only be set on play level + # upstream issue: https://github.com/ansible/ansible/issues/12170 + run_once: true + with_items: "{{ groups[mon_group_name] }}" + delegate_to: "{{ item }}" + when: + - socket.rc == 0 + - mon_group_name in group_names + +- name: validate monitors + include: validate-mon.yml + when: mon_group_name in group_names diff --git a/handlers/restart-osd.yml b/handlers/restart-osd.yml new file mode 100644 index 000000000..dc6fbeebb --- /dev/null +++ b/handlers/restart-osd.yml @@ -0,0 +1,22 @@ +--- +# This does not just restart OSDs but everything else too. Unfortunately +# at this time the ansible role does not have an OSD id list to use +# for restarting them specifically. 
+- name: restart ceph osds + shell: | + for id in $(ls /var/lib/ceph/osd/ |grep -oh '[0-9]*'); do + systemctl restart ceph-osd@$id + sleep 5 + done + # serial: 1 would be the proper solution here, but that can only be set on play level + # upstream issue: https://github.com/ansible/ansible/issues/12170 + run_once: true + with_items: "{{ groups[osd_group_name] }}" + delegate_to: "{{ item }}" + when: + - socket.rc == 0 + - osd_group_name in group_names + +- name: validate osds + include: validate-osd.yml + when: osd_group_name in group_names diff --git a/handlers/restart-rgw.yml b/handlers/restart-rgw.yml new file mode 100644 index 000000000..5e52e9cc0 --- /dev/null +++ b/handlers/restart-rgw.yml @@ -0,0 +1,13 @@ +--- +- name: restart ceph rgws + service: + name: ceph-rgw@{{ ansible_hostname }} + state: restarted + # serial: 1 would be the proper solution here, but that can only be set on play level + # upstream issue: https://github.com/ansible/ansible/issues/12170 + run_once: true + with_items: "{{ groups[rgw_group_name] }}" + delegate_to: "{{ item }}" + when: + - socketrgw.rc == 0 + - rgw_group_name in group_names diff --git a/handlers/validate-mon.yml b/handlers/validate-mon.yml new file mode 100644 index 000000000..4c5e15acb --- /dev/null +++ b/handlers/validate-mon.yml @@ -0,0 +1,28 @@ +--- +- name: wait for ceph monitor socket + wait_for: + path: "/var/run/ceph/{{ cluster }}-mon.{{ monitor_name }}.asok" + +- name: set mon_host_count + set_fact: mon_host_count={{ groups[mon_group_name] | length }} + +- name: select a running monitor + set_fact: mon_host={{ item }} + with_items: "{{ groups[mon_group_name] }}" + when: + - item != inventory_hostname + - mon_host_count | int > 1 + +- name: select first monitor if only one monitor + set_fact: mon_host={{ item }} + with_items: "{{ groups[mon_group_name][0] }}" + when: mon_host_count | int == 1 + +- name: waiting for the monitor to join the quorum... 
+ shell: | + ceph -s --cluster {{ cluster }} | grep monmap | sed 's/.*quorum//' | egrep -sq {{ ansible_hostname }} + register: result + until: result.rc == 0 + retries: "{{ handler_health_mon_check_retries }}" + delay: "{{ handler_health_mon_check_delay }}" + delegate_to: "{{ mon_host }}" diff --git a/handlers/validate-osd.yml b/handlers/validate-osd.yml new file mode 100644 index 000000000..b83d0952d --- /dev/null +++ b/handlers/validate-osd.yml @@ -0,0 +1,19 @@ +--- +- name: collect osds + shell: | + ls /var/lib/ceph/osd/ |grep -oh '[0-9]*' + register: osd_ids + +- name: wait for ceph osd socket(s) + wait_for: + path: "/var/run/ceph/{{ cluster }}-osd.{{ item }}.asok" + with_items: "{{ osd_ids.stdout_lines }}" + +- name: waiting for clean pgs... + shell: | + test "$(ceph --cluster {{ cluster }} pg stat | sed 's/^.*pgs://;s/active+clean.*//;s/ //')" -eq "$(ceph --cluster {{ cluster }} pg stat | sed 's/pgs.*//;s/^.*://;s/ //')" && ceph --cluster {{ cluster }} health | egrep -sq "HEALTH_OK|HEALTH_WARN" + register: result + until: result.rc == 0 + retries: "{{ handler_health_osd_check_retries }}" + delay: "{{ handler_health_osd_check_delay }}" + delegate_to: "{{ groups[mon_group_name][0] }}" diff --git a/tasks/generate_ceph_conf.yml b/tasks/generate_ceph_conf.yml index 9f54fb40c..1766a5383 100644 --- a/tasks/generate_ceph_conf.yml +++ b/tasks/generate_ceph_conf.yml @@ -22,3 +22,4 @@ - restart ceph osds - restart ceph mdss - restart ceph rgws + - restart ceph nfss