#radosgw_dns_name: your.subdomain.tld # subdomains used by radosgw. See http://ceph.com/docs/master/radosgw/config/#enabling-subdomain-s3-calls
#radosgw_resolve_cname: false # enable for radosgw to resolve DNS CNAME based bucket names
#radosgw_civetweb_port: 8080
-#radosgw_civetweb_bind_ip: "{{ ansible_default_ipv4.address }}" # when using ipv6 enclose with brackets: "[{{ ansible_default_ipv6.address }}]"
#radosgw_civetweb_num_threads: 100
# For additional civetweb configuration options such as SSL, logging,
# keepalive, and timeout settings, please see the civetweb docs at
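+# As a rough sketch (assuming the defaults above end up on the usual
+# "rgw frontends" line in ceph.conf and a civetweb build with SSL support;
+# adjust option names to your radosgw/civetweb version), an SSL-enabled
+# frontend could look like:
+#   rgw frontends = civetweb port=443s num_threads=100 ssl_certificate=/etc/ceph/private/keyandcert.pem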
# Between the checks (e.g. that monitors are in quorum or that OSD PGs are
# clean) we have to wait before retrying. These retries and delays are
# configurable per daemon type.
+#
+# Monitor handler checks
#handler_health_mon_check_retries: 5
#handler_health_mon_check_delay: 10
+#
+# OSD handler checks
#handler_health_osd_check_retries: 40
#handler_health_osd_check_delay: 30
#handler_health_osd_check: true
+#
+# MDS handler checks
+#handler_health_mds_check_retries: 5
+#handler_health_mds_check_delay: 10
+#
+# RGW handler checks
+#handler_health_rgw_check_retries: 5
+#handler_health_rgw_check_delay: 10
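+#
+# For example, on a large cluster you might uncomment and raise the OSD values
+# to give placement groups more time to settle between restarts (illustrative
+# values only):
+#   handler_health_osd_check_retries: 80
+#   handler_health_osd_check_delay: 60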
# Configure the type of NFS gateway access. At least one must be enabled for an
# NFS role to be useful
#radosgw_dns_name: your.subdomain.tld # subdomains used by radosgw. See http://ceph.com/docs/master/radosgw/config/#enabling-subdomain-s3-calls
#radosgw_resolve_cname: false # enable for radosgw to resolve DNS CNAME based bucket names
#radosgw_civetweb_port: 8080
-#radosgw_civetweb_bind_ip: "{{ ansible_default_ipv4.address }}" # when using ipv6 enclose with brackets: "[{{ ansible_default_ipv6.address }}]"
#radosgw_civetweb_num_threads: 100
# For additional civetweb configuration options such as SSL, logging,
# keepalive, and timeout settings, please see the civetweb docs at
# Between the checks (e.g. that monitors are in quorum or that OSD PGs are
# clean) we have to wait before retrying. These retries and delays are
# configurable per daemon type.
+#
+# Monitor handler checks
#handler_health_mon_check_retries: 5
#handler_health_mon_check_delay: 10
+#
+# OSD handler checks
#handler_health_osd_check_retries: 40
#handler_health_osd_check_delay: 30
#handler_health_osd_check: true
+#
+# MDS handler checks
+#handler_health_mds_check_retries: 5
+#handler_health_mds_check_delay: 10
+#
+# RGW handler checks
+#handler_health_rgw_check_retries: 5
+#handler_health_rgw_check_delay: 10
# Configure the type of NFS gateway access. At least one must be enabled for an
# NFS role to be useful
#radosgw_dns_name: your.subdomain.tld # subdomains used by radosgw. See http://ceph.com/docs/master/radosgw/config/#enabling-subdomain-s3-calls
radosgw_resolve_cname: false # enable for radosgw to resolve DNS CNAME based bucket names
radosgw_civetweb_port: 8080
-radosgw_civetweb_bind_ip: "{{ ansible_default_ipv4.address }}" # when using ipv6 enclose with brackets: "[{{ ansible_default_ipv6.address }}]"
radosgw_civetweb_num_threads: 100
# For additional civetweb configuration options such as SSL, logging,
# keepalive, and timeout settings, please see the civetweb docs at
# Between the checks (e.g. that monitors are in quorum or that OSD PGs are
# clean) we have to wait before retrying. These retries and delays are
# configurable per daemon type.
+#
+# Monitor handler checks
handler_health_mon_check_retries: 5
handler_health_mon_check_delay: 10
+#
+# OSD handler checks
handler_health_osd_check_retries: 40
handler_health_osd_check_delay: 30
handler_health_osd_check: true
+#
+# MDS handler checks
+handler_health_mds_check_retries: 5
+handler_health_mds_check_delay: 10
+#
+# RGW handler checks
+handler_health_rgw_check_retries: 5
+handler_health_rgw_check_delay: 10
# Configure the type of NFS gateway access. At least one must be enabled for an
# NFS role to be useful
command: /tmp/restart_mon_daemon.sh
listen: "restart ceph mons"
when:
-# We do not want to run these checks on initial deployment (`socket.rc == 0`)
+ # We do not want to run these checks on initial deployment (`socket.rc == 0`)
- socket.rc == 0
- mon_group_name in group_names
listen: "restart ceph osds"
with_items: "{{ socket_osd_container.results | default([]) }}"
when:
- # We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
- # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
+ # We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
+ # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
- containerized_deployment
- ((crush_location is defined and crush_location) or item.get('rc') == 0)
- handler_health_osd_check
command: /tmp/restart_osd_daemon.sh
listen: "restart ceph osds"
when:
- # We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
- # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
+ # We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
+ # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
- ((crush_location is defined and crush_location) or socket.rc == 0)
- ceph_current_fsid.rc == 0
- handler_health_osd_check
- inventory_hostname in play_hosts
- osd_group_name in group_names
-- name: restart ceph mdss
- service:
- name: ceph-mds@{{ mds_name }}
- state: restarted
- # serial: 1 would be the proper solution here, but that can only be set on play level
- # upstream issue: https://github.com/ansible/ansible/issues/12170
- run_once: true
- with_items: "{{ groups.get(mds_group_name, []) }}"
- delegate_to: "{{ item }}"
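+# The restart is split into two handler tasks that listen on the same topic:
+# the first renders the restart script from its template, the second executes
+# it, so a single "restart ceph mdss" notification runs both in order. The rgw
+# handlers below follow the same pattern.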
+- name: copy mds restart script
+ template:
+ src: restart_mds_daemon.sh.j2
+ dest: /tmp/restart_mds_daemon.sh
+ owner: root
+ group: root
+ mode: 0750
+ listen: "restart ceph mdss"
when:
+ - inventory_hostname in play_hosts
- mds_group_name in group_names
-- name: restart ceph rgws
- service:
- name: ceph-radosgw@rgw.{{ ansible_hostname }}
- state: restarted
- # serial: 1 would be the proper solution here, but that can only be set on play level
- # upstream issue: https://github.com/ansible/ansible/issues/12170
- run_once: true
- with_items: "{{ groups.get(rgw_group_name, []) }}"
- delegate_to: "{{ item }}"
+- name: restart ceph mds daemon(s)
+ command: /tmp/restart_mds_daemon.sh
+ listen: "restart ceph mdss"
+ when:
+ # We do not want to run these checks on initial deployment (`socket.rc == 0`)
+ - socket.rc == 0
+ - mds_group_name in group_names
+
+- name: copy rgw restart script
+ template:
+ src: restart_rgw_daemon.sh.j2
+ dest: /tmp/restart_rgw_daemon.sh
+ owner: root
+ group: root
+ mode: 0750
+ listen: "restart ceph rgws"
when:
+ - inventory_hostname in play_hosts
+ - rgw_group_name in group_names
+
+- name: restart ceph rgw daemon(s)
+ command: /tmp/restart_rgw_daemon.sh
+ listen: "restart ceph rgws"
+ when:
+ # We do not want to run these checks on initial deployment (`socket.rc == 0`)
+ - socket.rc == 0
- rgw_group_name in group_names
- name: restart ceph nfss
--- /dev/null
+#!/bin/bash
+
+RETRIES="{{ handler_health_mds_check_retries }}"
+DELAY="{{ handler_health_mds_check_delay }}"
+MDS_NAME="{{ ansible_hostname }}"
+SOCKET=/var/run/ceph/{{ cluster }}-mds.${MDS_NAME}.asok
+
+# First, restart the daemon
+systemctl restart ceph-mds@${MDS_NAME}
+
+# Wait and ensure the socket exists after restarting the daemon
+while [ $RETRIES -ne 0 ]; do
+ {{ docker_exec_cmd }} test -S $SOCKET && exit 0
+ sleep $DELAY
+ let RETRIES=RETRIES-1
+done
+# If we reach this point, it means the socket is not present.
+echo "Socket file ${SOCKET} could not be found, which means the Metadata Server is not running."
+exit 1
check_quorum() {
while [ $RETRIES -ne 0 ]; do
- MEMBERS=$({{ docker_exec_cmd }} ceph --cluster {{ cluster }} -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
+ MEMBERS=$({{ docker_exec_cmd }} ceph --cluster {{ cluster }} -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
test "${MEMBERS/$MONITOR_NAME}" != "$MEMBERS" && exit 0
sleep $DELAY
let RETRIES=RETRIES-1
--- /dev/null
+#!/bin/bash
+
+RETRIES="{{ handler_health_rgw_check_retries }}"
+DELAY="{{ handler_health_rgw_check_delay }}"
+RGW_NAME="{{ ansible_hostname }}"
+RGW_PORT="{{ radosgw_civetweb_port }}"
+SOCKET=/var/run/ceph/{{ cluster }}-client.rgw.${RGW_NAME}.asok
+
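+{# Pick the address civetweb is reachable on: prefer radosgw_address_block,  #}
+{# then an explicit radosgw_address, otherwise fall back to the address of   #}
+{# radosgw_interface. IPv6 addresses are bracketed for the URL check below.  #}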
+{% if radosgw_address_block | length > 0 %}
+ {% if ip_version == 'ipv4' -%}
+RGW_IP={{ hostvars[inventory_hostname]['ansible_all_' + ip_version + '_addresses'] | ipaddr(radosgw_address_block) | first }}
+ {%- elif ip_version == 'ipv6' -%}
+RGW_IP=[{{ hostvars[inventory_hostname]['ansible_all_' + ip_version + '_addresses'] | ipaddr(radosgw_address_block) | first }}]
+ {%- endif %}
+{% elif hostvars[inventory_hostname]['radosgw_address'] is defined and hostvars[inventory_hostname]['radosgw_address'] != '0.0.0.0' -%}
+ {% if ip_version == 'ipv4' -%}
+RGW_IP={{ hostvars[inventory_hostname]['radosgw_address'] }}
+ {%- elif ip_version == 'ipv6' -%}
+RGW_IP=[{{ hostvars[inventory_hostname]['radosgw_address'] }}]
+ {% endif %}
+{%- else -%}
+ {% set interface = ["ansible_",radosgw_interface]|join %}
+ {% if ip_version == 'ipv6' -%}
+RGW_IP=[{{ hostvars[inventory_hostname][interface][ip_version][0]['address'] }}]
+ {%- elif ip_version == 'ipv4' -%}
+RGW_IP={{ hostvars[inventory_hostname][interface][ip_version]['address'] }}
+ {% endif %}
+{%- endif %}
+
+check_for_curl_or_wget() {
+ if {{ docker_exec_cmd }} command -v wget &>/dev/null; then
+ rgw_test_command="wget --quiet"
+ elif {{ docker_exec_cmd }} command -v curl &>/dev/null; then
+ rgw_test_command="curl --fail --silent --output /dev/null"
+ else
+ echo "It seems that neither curl or wget are available on your system."
+ echo "Cannot test rgw connection."
+ exit 0
+ fi
+}
+
+check_rest() {
+ check_for_curl_or_wget
+ while [ $RETRIES -ne 0 ]; do
+ test "$rgw_test_command http://$RGW_IP:$RGW_PORT" && exit 0
+ sleep $DELAY
+ let RETRIES=RETRIES-1
+ done
+ # If we reach this point, it means there is a problem with the connection to rgw
+ echo "Error connecting locally to Rados Gateway service: http://$rgw_listen"
+ exit 1
+}
+
+# First, restart the daemon
+systemctl restart ceph-radosgw@rgw.${RGW_NAME}
+
+COUNT=10
+# Wait and ensure the socket exists after restarting the daemon
+while [ $COUNT -ne 0 ]; do
+ {{ docker_exec_cmd }} test -S $SOCKET && check_rest
+ sleep 1
+ let COUNT=COUNT-1
+done
+echo "Socket file ${SOCKET} could not be found, which means Rados Gateway is not running."
+exit 1