From 40a2df5bbf13685c83ddc0b784b97e8a76414006 Mon Sep 17 00:00:00 2001 From: =?utf8?q?S=C3=A9bastien=20Han?= Date: Thu, 22 Sep 2016 17:03:14 +0200 Subject: [PATCH] common: serialise host restart MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This commits allows us to restart Ceph daemon machine by machine instead of restarting all the daemons in a single shot. Rework the structure of the handler for clarity as well. Signed-off-by: Sébastien Han --- group_vars/all.yml.sample | 14 ++++++++ roles/ceph-common/defaults/main.yml | 14 ++++++++ roles/ceph-common/handlers/main.yml | 32 +++---------------- roles/ceph-common/handlers/restart-mds.yml | 13 ++++++++ roles/ceph-common/handlers/restart-mon.yml | 17 ++++++++++ roles/ceph-common/handlers/restart-osd.yml | 22 +++++++++++++ roles/ceph-common/handlers/restart-rgw.yml | 13 ++++++++ roles/ceph-common/handlers/validate-mon.yml | 28 ++++++++++++++++ roles/ceph-common/handlers/validate-osd.yml | 19 +++++++++++ .../ceph-common/tasks/generate_ceph_conf.yml | 1 + 10 files changed, 146 insertions(+), 27 deletions(-) create mode 100644 roles/ceph-common/handlers/restart-mds.yml create mode 100644 roles/ceph-common/handlers/restart-mon.yml create mode 100644 roles/ceph-common/handlers/restart-osd.yml create mode 100644 roles/ceph-common/handlers/restart-rgw.yml create mode 100644 roles/ceph-common/handlers/validate-mon.yml create mode 100644 roles/ceph-common/handlers/validate-osd.yml diff --git a/group_vars/all.yml.sample b/group_vars/all.yml.sample index 90ee940b7..c625d0639 100644 --- a/group_vars/all.yml.sample +++ b/group_vars/all.yml.sample @@ -339,6 +339,20 @@ dummy: # if you don't want it keep the option commented #common_single_host_mode: true +## Handlers - restarting daemons after a config change +# if for whatever reasons the content of your ceph configuration changes +# ceph daemons will be restarted as well. 
At the moment, we cannot detect +# which config option changed so all the daemons will be restarted. Although +# this restart will be serialized for each node, in between a health check +# will be performed so we make sure we don't move to the next node until +# ceph is healthy again +# Obviously between the checks (for monitors to be in quorum and for osd's pgs +# to be clean) we have to wait. These retries and delays are configurable +# for both monitors and osds. +#handler_health_mon_check_retries: 5 +#handler_health_mon_check_delay: 10 +#handler_health_osd_check_retries: 40 +#handler_health_osd_check_delay: 30 ################### # CONFIG OVERRIDE # diff --git a/roles/ceph-common/defaults/main.yml b/roles/ceph-common/defaults/main.yml index 6190c49cc..43515bd13 100644 --- a/roles/ceph-common/defaults/main.yml +++ b/roles/ceph-common/defaults/main.yml @@ -331,6 +331,20 @@ restapi_port: 5000 # if you don't want it keep the option commented #common_single_host_mode: true +## Handlers - restarting daemons after a config change +# if for whatever reasons the content of your ceph configuration changes +# ceph daemons will be restarted as well. At the moment, we cannot detect +# which config option changed so all the daemons will be restarted. Although +# this restart will be serialized for each node, in between a health check +# will be performed so we make sure we don't move to the next node until +# ceph is healthy again +# Obviously between the checks (for monitors to be in quorum and for osd's pgs +# to be clean) we have to wait. These retries and delays are configurable +# for both monitors and osds. 
+handler_health_mon_check_retries: 5 +handler_health_mon_check_delay: 10 +handler_health_osd_check_retries: 40 +handler_health_osd_check_delay: 30 ################### # CONFIG OVERRIDE # diff --git a/roles/ceph-common/handlers/main.yml b/roles/ceph-common/handlers/main.yml index 6a0a54c33..dd2a0d726 100644 --- a/roles/ceph-common/handlers/main.yml +++ b/roles/ceph-common/handlers/main.yml @@ -2,41 +2,19 @@ - name: update apt cache apt: update-cache: yes + when: ansible_os_family == 'Debian' - name: restart ceph mons - service: - name: ceph-mon@{{ monitor_name }} - state: restarted - when: - - socket.rc == 0 - - mon_group_name in group_names + include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-mon.yml" -# This does not just restart OSDs but everything else too. Unfortunately -# at this time the ansible role does not have an OSD id list to use -# for restarting them specifically. - name: restart ceph osds - service: - name: ceph.target - state: restarted - when: - - socket.rc == 0 - - osd_group_name in group_names + include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-osd.yml" - name: restart ceph mdss - service: - name: ceph-mds@{{ mds_name }} - state: restarted - when: - - socket.rc == 0 - - mds_group_name in group_names + include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-mds.yml" - name: restart ceph rgws - service: - name: ceph-rgw@{{ ansible_hostname }} - state: restarted - when: - - socketrgw.rc == 0 - - rgw_group_name in group_names + include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-rgw.yml" - name: restart ceph nfss service: diff --git a/roles/ceph-common/handlers/restart-mds.yml b/roles/ceph-common/handlers/restart-mds.yml new file mode 100644 index 000000000..e6ff5ef4e --- /dev/null +++ b/roles/ceph-common/handlers/restart-mds.yml @@ -0,0 +1,13 @@ +--- +- name: restart ceph mdss + service: + name: ceph-mds@{{ mds_name }} + state: restarted + # serial: 1 would be the proper solution here, but that can only 
be set on play level + # upstream issue: https://github.com/ansible/ansible/issues/12170 + run_once: true + with_items: "{{ groups[mds_group_name] }}" + delegate_to: "{{ item }}" + when: + - socket.rc == 0 + - mds_group_name in group_names diff --git a/roles/ceph-common/handlers/restart-mon.yml b/roles/ceph-common/handlers/restart-mon.yml new file mode 100644 index 000000000..440b7f219 --- /dev/null +++ b/roles/ceph-common/handlers/restart-mon.yml @@ -0,0 +1,17 @@ +--- +- name: restart ceph mons + service: + name: ceph-mon@{{ monitor_name }} + state: restarted + # serial: 1 would be the proper solution here, but that can only be set on play level + # upstream issue: https://github.com/ansible/ansible/issues/12170 + run_once: true + with_items: "{{ groups[mon_group_name] }}" + delegate_to: "{{ item }}" + when: + - socket.rc == 0 + - mon_group_name in group_names + +- name: validate monitors + include: validate-mon.yml + when: mon_group_name in group_names diff --git a/roles/ceph-common/handlers/restart-osd.yml b/roles/ceph-common/handlers/restart-osd.yml new file mode 100644 index 000000000..dc6fbeebb --- /dev/null +++ b/roles/ceph-common/handlers/restart-osd.yml @@ -0,0 +1,22 @@ +--- +# This does not just restart OSDs but everything else too. Unfortunately +# at this time the ansible role does not have an OSD id list to use +# for restarting them specifically. 
+- name: restart ceph osds + shell: | + for id in $(ls /var/lib/ceph/osd/ |grep -oh '[0-9]*'); do + systemctl restart ceph-osd@$id + sleep 5 + done + # serial: 1 would be the proper solution here, but that can only be set on play level + # upstream issue: https://github.com/ansible/ansible/issues/12170 + run_once: true + with_items: "{{ groups[osd_group_name] }}" + delegate_to: "{{ item }}" + when: + - socket.rc == 0 + - osd_group_name in group_names + +- name: validate osds + include: validate-osd.yml + when: osd_group_name in group_names diff --git a/roles/ceph-common/handlers/restart-rgw.yml b/roles/ceph-common/handlers/restart-rgw.yml new file mode 100644 index 000000000..5e52e9cc0 --- /dev/null +++ b/roles/ceph-common/handlers/restart-rgw.yml @@ -0,0 +1,13 @@ +--- +- name: restart ceph rgws + service: + name: ceph-rgw@{{ ansible_hostname }} + state: restarted + # serial: 1 would be the proper solution here, but that can only be set on play level + # upstream issue: https://github.com/ansible/ansible/issues/12170 + run_once: true + with_items: "{{ groups[rgw_group_name] }}" + delegate_to: "{{ item }}" + when: + - socketrgw.rc == 0 + - rgw_group_name in group_names diff --git a/roles/ceph-common/handlers/validate-mon.yml b/roles/ceph-common/handlers/validate-mon.yml new file mode 100644 index 000000000..4c5e15acb --- /dev/null +++ b/roles/ceph-common/handlers/validate-mon.yml @@ -0,0 +1,28 @@ +--- +- name: wait for ceph monitor socket + wait_for: + path: "/var/run/ceph/{{ cluster }}-mon.{{ monitor_name }}.asok" + +- name: set mon_host_count + set_fact: mon_host_count={{ groups[mon_group_name] | length }} + +- name: select a running monitor + set_fact: mon_host={{ item }} + with_items: "{{ groups[mon_group_name] }}" + when: + - item != inventory_hostname + - mon_host_count | int > 1 + +- name: select first monitor if only one monitor + set_fact: mon_host={{ item }} + with_items: "{{ groups[mon_group_name][0] }}" + when: mon_host_count | int == 1 + +- name: 
waiting for the monitor to join the quorum... + shell: | + ceph -s --cluster {{ cluster }} | grep monmap | sed 's/.*quorum//' | egrep -sq {{ ansible_hostname }} + register: result + until: result.rc == 0 + retries: "{{ handler_health_mon_check_retries }}" + delay: "{{ handler_health_mon_check_delay }}" + delegate_to: "{{ mon_host }}" diff --git a/roles/ceph-common/handlers/validate-osd.yml b/roles/ceph-common/handlers/validate-osd.yml new file mode 100644 index 000000000..b83d0952d --- /dev/null +++ b/roles/ceph-common/handlers/validate-osd.yml @@ -0,0 +1,19 @@ +--- +- name: collect osds + shell: | + ls /var/lib/ceph/osd/ |grep -oh '[0-9]*' + register: osd_ids + +- name: wait for ceph osd socket(s) + wait_for: + path: "/var/run/ceph/{{ cluster }}-osd.{{ item }}.asok" + with_items: "{{ osd_ids.stdout_lines }}" + +- name: waiting for clean pgs... + shell: | + test "$(ceph --cluster {{ cluster }} pg stat | sed 's/^.*pgs://;s/active+clean.*//;s/ //')" -eq "$(ceph --cluster {{ cluster }} pg stat | sed 's/pgs.*//;s/^.*://;s/ //')" && ceph --cluster {{ cluster }} health | egrep -sq "HEALTH_OK|HEALTH_WARN" + register: result + until: result.rc == 0 + retries: "{{ handler_health_osd_check_retries }}" + delay: "{{ handler_health_osd_check_delay }}" + delegate_to: "{{ groups[mon_group_name][0] }}" diff --git a/roles/ceph-common/tasks/generate_ceph_conf.yml b/roles/ceph-common/tasks/generate_ceph_conf.yml index 9f54fb40c..1766a5383 100644 --- a/roles/ceph-common/tasks/generate_ceph_conf.yml +++ b/roles/ceph-common/tasks/generate_ceph_conf.yml @@ -22,3 +22,4 @@ - restart ceph osds - restart ceph mdss - restart ceph rgws + - restart ceph nfss -- 2.39.5