ceph-defaults: fix handlers for mds and rgw

author Sébastien Han <seb@redhat.com>

Thu, 31 Aug 2017 09:22:33 +0000 (11:22 +0200)

committer Sébastien Han <seb@redhat.com>

Thu, 31 Aug 2017 17:02:21 +0000 (19:02 +0200)
author Sébastien Han <seb@redhat.com>
Thu, 31 Aug 2017 09:22:33 +0000 (11:22 +0200)
committer Sébastien Han <seb@redhat.com>
Thu, 31 Aug 2017 17:02:21 +0000 (19:02 +0200)
diff --git a/group_vars/all.yml.sample b/group_vars/all.yml.sample

index b8a538dca290c2ca2ce047c90d137945eb5e2790..17bcda687519df2e9929d8c29aea9f36c1599d26 100644 (file)
--- a/group_vars/all.yml.sample
+++ b/group_vars/all.yml.sample
@@ -327,7 +327,6 @@ dummy:
  #radosgw_dns_name: your.subdomain.tld # subdomains used by radosgw. See http://ceph.com/docs/master/radosgw/config/#enabling-subdomain-s3-calls
  #radosgw_resolve_cname: false # enable for radosgw to resolve DNS CNAME based bucket names
  #radosgw_civetweb_port: 8080
-#radosgw_civetweb_bind_ip: "{{ ansible_default_ipv4.address }}" # when using ipv6 enclose with brackets: "[{{ ansible_default_ipv6.address }}]"
  #radosgw_civetweb_num_threads: 100
  # For additional civetweb configuration options available such as SSL, logging,
  # keepalive, and timeout settings, please see the civetweb docs at
@@ -366,11 +365,23 @@ dummy:
  # Obviously between the checks (for monitors to be in quorum and for osd's pgs
  # to be clean) we have to wait. These retries and delays can be configurable
  # for both monitors and osds.
+#
+# Monitor handler checks
  #handler_health_mon_check_retries: 5
  #handler_health_mon_check_delay: 10
+#
+# OSD handler checks
  #handler_health_osd_check_retries: 40
  #handler_health_osd_check_delay: 30
  #handler_health_osd_check: true
+#
+# MDS handler checks
+#handler_health_mds_check_retries: 5
+#handler_health_mds_check_delay: 10
+#
+# RGW handler checks
+#handler_health_rgw_check_retries: 5
+#handler_health_rgw_check_delay: 10
  
  # Confiure the type of NFS gatway access.  At least one must be enabled for an
  # NFS role to be useful
diff --git a/group_vars/rhcs.yml.sample b/group_vars/rhcs.yml.sample

index 9b8aa0a1c19e1fd17e3e9ff8272fc8f3c38051d4..24093b9c07cbd43ae0ed4c3a0e8210f091c51928 100644 (file)
--- a/group_vars/rhcs.yml.sample
+++ b/group_vars/rhcs.yml.sample
@@ -327,7 +327,6 @@ ceph_repository: rhcs
  #radosgw_dns_name: your.subdomain.tld # subdomains used by radosgw. See http://ceph.com/docs/master/radosgw/config/#enabling-subdomain-s3-calls
  #radosgw_resolve_cname: false # enable for radosgw to resolve DNS CNAME based bucket names
  #radosgw_civetweb_port: 8080
-#radosgw_civetweb_bind_ip: "{{ ansible_default_ipv4.address }}" # when using ipv6 enclose with brackets: "[{{ ansible_default_ipv6.address }}]"
  #radosgw_civetweb_num_threads: 100
  # For additional civetweb configuration options available such as SSL, logging,
  # keepalive, and timeout settings, please see the civetweb docs at
@@ -366,11 +365,23 @@ ceph_repository: rhcs
  # Obviously between the checks (for monitors to be in quorum and for osd's pgs
  # to be clean) we have to wait. These retries and delays can be configurable
  # for both monitors and osds.
+#
+# Monitor handler checks
  #handler_health_mon_check_retries: 5
  #handler_health_mon_check_delay: 10
+#
+# OSD handler checks
  #handler_health_osd_check_retries: 40
  #handler_health_osd_check_delay: 30
  #handler_health_osd_check: true
+#
+# MDS handler checks
+#handler_health_mds_check_retries: 5
+#handler_health_mds_check_delay: 10
+#
+# RGW handler checks
+#handler_health_rgw_check_retries: 5
+#handler_health_rgw_check_delay: 10
  
  # Confiure the type of NFS gatway access.  At least one must be enabled for an
  # NFS role to be useful
diff --git a/roles/ceph-defaults/defaults/main.yml b/roles/ceph-defaults/defaults/main.yml

index 473d9cdfb1df899738813308684cd63b7f93751b..e342633809dec63490255cdc81abcfc864a3d5f4 100644 (file)
--- a/roles/ceph-defaults/defaults/main.yml
+++ b/roles/ceph-defaults/defaults/main.yml
@@ -319,7 +319,6 @@ mds_max_mds: 3
  #radosgw_dns_name: your.subdomain.tld # subdomains used by radosgw. See http://ceph.com/docs/master/radosgw/config/#enabling-subdomain-s3-calls
  radosgw_resolve_cname: false # enable for radosgw to resolve DNS CNAME based bucket names
  radosgw_civetweb_port: 8080
-radosgw_civetweb_bind_ip: "{{ ansible_default_ipv4.address }}" # when using ipv6 enclose with brackets: "[{{ ansible_default_ipv6.address }}]"
  radosgw_civetweb_num_threads: 100
  # For additional civetweb configuration options available such as SSL, logging,
  # keepalive, and timeout settings, please see the civetweb docs at
@@ -358,11 +357,23 @@ restapi_port: 5000
  # Obviously between the checks (for monitors to be in quorum and for osd's pgs
  # to be clean) we have to wait. These retries and delays can be configurable
  # for both monitors and osds.
+#
+# Monitor handler checks
  handler_health_mon_check_retries: 5
  handler_health_mon_check_delay: 10
+#
+# OSD handler checks
  handler_health_osd_check_retries: 40
  handler_health_osd_check_delay: 30
  handler_health_osd_check: true
+#
+# MDS handler checks
+handler_health_mds_check_retries: 5
+handler_health_mds_check_delay: 10
+#
+# RGW handler checks
+handler_health_rgw_check_retries: 5
+handler_health_rgw_check_delay: 10
  
  # Confiure the type of NFS gatway access.  At least one must be enabled for an
  # NFS role to be useful
diff --git a/roles/ceph-defaults/handlers/main.yml b/roles/ceph-defaults/handlers/main.yml

index 3a053d435782e2fcd8f1305a39eed8e3af3ccc1a..8f21db762aae6fd39c743a118e0d3328f57161dd 100644 (file)
--- a/roles/ceph-defaults/handlers/main.yml
+++ b/roles/ceph-defaults/handlers/main.yml
@@ -18,7 +18,7 @@
      command: /tmp/restart_mon_daemon.sh
      listen: "restart ceph mons"
    when:
-# We do not want to run these checks on initial deployment (`socket.rc == 0`)
+    # We do not want to run these checks on initial deployment (`socket.rc == 0`)
      - socket.rc == 0
      - mon_group_name in group_names
  
@@ -42,8 +42,8 @@
    listen: "restart ceph osds"
    with_items: "{{ socket_osd_container.results | default([]) }}"
    when:
-  # We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
-  # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
+    # We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
+    # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
      - containerized_deployment
      - ((crush_location is defined and crush_location) or item.get('rc') == 0)
      - handler_health_osd_check
@@ -55,8 +55,8 @@
    command: /tmp/restart_osd_daemon.sh
    listen: "restart ceph osds"
    when:
-  # We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
-  # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
+    # We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
+    # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
      - ((crush_location is defined and crush_location) or socket.rc == 0)
      - ceph_current_fsid.rc == 0
      - handler_health_osd_check
@@ -64,28 +64,44 @@
      - inventory_hostname in play_hosts
      - osd_group_name in group_names
  
-- name: restart ceph mdss
-  service:
-    name: ceph-mds@{{ mds_name }}
-    state: restarted
-  # serial: 1 would be the proper solution here, but that can only be set on play level
-  # upstream issue: https://github.com/ansible/ansible/issues/12170
-  run_once: true
-  with_items: "{{ groups.get(mds_group_name, []) }}"
-  delegate_to: "{{ item }}"
+- name: copy mds restart script
+  template:
+    src: restart_mds_daemon.sh.j2
+    dest: /tmp/restart_mds_daemon.sh
+    owner: root
+    group: root
+    mode: 0750
+  listen: "restart ceph mdss"
    when:
+    - inventory_hostname in play_hosts
      - mds_group_name in group_names
  
-- name: restart ceph rgws
-  service:
-    name: ceph-radosgw@rgw.{{ ansible_hostname }}
-    state: restarted
-  # serial: 1 would be the proper solution here, but that can only be set on play level
-  # upstream issue: https://github.com/ansible/ansible/issues/12170
-  run_once: true
-  with_items: "{{ groups.get(rgw_group_name, []) }}"
-  delegate_to: "{{ item }}"
+- name: restart ceph mds daemon(s)
+  command: /tmp/restart_mds_daemon.sh
+  listen: "restart ceph mdss"
+  when:
+    # We do not want to run these checks on initial deployment (`socket.rc == 0`)
+    - socket.rc == 0
+    - mds_group_name in group_names
+
+- name: copy rgw restart script
+  template:
+    src: restart_rgw_daemon.sh.j2
+    dest: /tmp/restart_rgw_daemon.sh
+    owner: root
+    group: root
+    mode: 0750
+  listen: "restart ceph rgws"
    when:
+    - inventory_hostname in play_hosts
+    - rgw_group_name in group_names
+
+- name: restart ceph rgw daemon(s)
+  command: /tmp/restart_rgw_daemon.sh
+  listen: "restart ceph rgws"
+  when:
+    # We do not want to run these checks on initial deployment (`socket.rc == 0`)
+    - socket.rc == 0
      - rgw_group_name in group_names
  
  - name: restart ceph nfss
diff --git a/roles/ceph-defaults/templates/restart_mds_daemon.sh.j2 b/roles/ceph-defaults/templates/restart_mds_daemon.sh.j2

new file mode 100644 (file)

index 0000000..557ac7f
--- /dev/null
+++ b/roles/ceph-defaults/templates/restart_mds_daemon.sh.j2
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+RETRIES="{{ handler_health_mds_check_retries }}"
+DELAY="{{ handler_health_mds_check_delay }}"
+MDS_NAME="{{ ansible_hostname }}"
+SOCKET=/var/run/ceph/{{ cluster }}-mds.${MDS_NAME}.asok
+
+# First, restart the daemon
+systemctl restart ceph-mds@${MDS_NAME}
+
+COUNT=10
+# Wait and ensure the socket exists after restarting the daemds
+while [ $RETRIES -ne 0 ]; do
+  {{ docker_exec_cmd }} test -S $SOCKET && exit 0
+  sleep $DELAY
+  let RETRIES=RETRIES-1
+done
+# If we reach this point, it means the socket is not present.
+echo "Socket file ${SOCKET} could not be found, which means the Metadata Server is not running."
+exit 1
diff --git a/roles/ceph-defaults/templates/restart_mon_daemon.sh.j2 b/roles/ceph-defaults/templates/restart_mon_daemon.sh.j2

index 745f6915fdaf51037aec1bc39c2a176fefd665fb..9c86ffccb2db349ea538da9c176dffe8acd5481a 100644 (file)
--- a/roles/ceph-defaults/templates/restart_mon_daemon.sh.j2
+++ b/roles/ceph-defaults/templates/restart_mon_daemon.sh.j2
@@ -8,7 +8,7 @@ SOCKET=/var/run/ceph/{{ cluster }}-mon.${MONITOR_NAME}.asok
  
  check_quorum() {
  while [ $RETRIES -ne 0 ]; do
-  MEMBERS=$({{ docker_exec_cmd }} ceph --cluster {{ cluster }}   -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
+  MEMBERS=$({{ docker_exec_cmd }} ceph --cluster {{ cluster }} -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
    test "${MEMBERS/$MONITOR_NAME}" != "$MEMBERS" && exit 0
    sleep $DELAY
    let RETRIES=RETRIES-1
diff --git a/roles/ceph-defaults/templates/restart_rgw_daemon.sh.j2 b/roles/ceph-defaults/templates/restart_rgw_daemon.sh.j2

new file mode 100644 (file)

index 0000000..612559f
--- /dev/null
+++ b/roles/ceph-defaults/templates/restart_rgw_daemon.sh.j2
@@ -0,0 +1,65 @@
+#!/bin/bash
+# Restart the local radosgw, wait for its admin socket, then hand over to check_rest for the HTTP endpoint check.
+RETRIES="{{ handler_health_rgw_check_retries }}"
+DELAY="{{ handler_health_rgw_check_delay }}"
+RGW_NAME="{{ ansible_hostname }}"
+RGW_PORT="{{ radosgw_civetweb_port }}"
+SOCKET=/var/run/ceph/{{ cluster }}-client.rgw.${RGW_NAME}.asok
+# RGW_IP: first host address inside radosgw_address_block, else radosgw_address (unless 0.0.0.0), else the radosgw_interface address; IPv6 is bracketed.
+{% if radosgw_address_block | length > 0 %}
+  {% if ip_version == 'ipv4' -%}
+RGW_IP={{ hostvars[inventory_hostname]['ansible_all_' + ip_version + '_addresses'] | ipaddr(radosgw_address_block) | first }}
+  {%- elif ip_version == 'ipv6' -%}
+RGW_IP=[{{ hostvars[inventory_hostname]['ansible_all_' + ip_version + '_addresses'] | ipaddr(radosgw_address_block) | first }}]
+  {%- endif %}
+{% elif hostvars[inventory_hostname]['radosgw_address'] is defined and hostvars[inventory_hostname]['radosgw_address'] != '0.0.0.0' -%}
+  {% if ip_version == 'ipv4' -%}
+RGW_IP={{ hostvars[inventory_hostname]['radosgw_address'] }}
+  {%- elif ip_version == 'ipv6' -%}
+RGW_IP=[{{ hostvars[inventory_hostname]['radosgw_address'] }}]
+  {% endif %}
+{%- else -%}
+  {% set interface = ["ansible_",radosgw_interface]|join %}
+  {% if ip_version == 'ipv6' -%}
+RGW_IP=[{{ hostvars[inventory_hostname][interface][ip_version][0]['address'] }}]
+  {%- elif ip_version == 'ipv4' -%}
+RGW_IP={{ hostvars[inventory_hostname][interface][ip_version]['address'] }}
+  {% endif %}
+{%- endif %}
+
+check_for_curl_or_wget() {
+  if {{ docker_exec_cmd }} command -v wget &>/dev/null; then
+    rgw_test_command="wget --quiet"
+  elif {{ docker_exec_cmd }} command -v curl &>/dev/null; then
+    rgw_test_command="curl --fail --silent --output /dev/null"
+  else
+    echo "It seems that neither curl or wget are available on your system."
+    echo "Cannot test rgw connection."
+    exit 0
+  fi
+}
+
+check_rest() {
+  check_for_curl_or_wget
+  while [ $RETRIES -ne 0 ]; do
+    test "$rgw_test_command http://$RGW_IP:$RGW_PORT" && exit 0
+    sleep $DELAY
+    let RETRIES=RETRIES-1
+  done
+  # If we reach this point, it means there is a problem with the connection to rgw
+  echo "Error connecting locally to Rados Gateway service: http://$rgw_listen"
+  exit 1
+}
+
+# First, restart the daemon
+systemctl restart ceph-radosgw@rgw.${RGW_NAME}
+
+COUNT=10
+# Wait and ensure the socket exists after restarting the daemon
+while [ $COUNT -ne 0 ]; do
+  {{ docker_exec_cmd }} test -S $SOCKET && check_rest
+  sleep 1
+  let COUNT=COUNT-1
+done
+echo "Socket file ${SOCKET} could not be found, which means Rados Gateway is not running."
+exit 1
author	Sébastien Han <seb@redhat.com>
	Thu, 31 Aug 2017 09:22:33 +0000 (11:22 +0200)
committer	Sébastien Han <seb@redhat.com>
	Thu, 31 Aug 2017 17:02:21 +0000 (19:02 +0200)
group_vars/all.yml.sample		patch \| blob \| history
group_vars/rhcs.yml.sample		patch \| blob \| history
roles/ceph-defaults/defaults/main.yml		patch \| blob \| history
roles/ceph-defaults/handlers/main.yml		patch \| blob \| history
roles/ceph-defaults/templates/restart_mds_daemon.sh.j2	[new file with mode: 0644]	patch \| blob
roles/ceph-defaults/templates/restart_mon_daemon.sh.j2		patch \| blob \| history
roles/ceph-defaults/templates/restart_rgw_daemon.sh.j2	[new file with mode: 0644]	patch \| blob