Until now, there were no handlers for containerized deployments.
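
The handlers and socket checks below prefix their commands with `docker_exec_cmd`, which defaults to an empty string so that bare-metal deployments keep executing directly on the host. A containerized deployment is expected to point it at the monitor container before the handlers fire; a minimal sketch of such a fact (the task and the ceph-mon-<hostname> container name are illustrative assumptions, not part of this change):

    - name: set docker_exec_cmd for containerized deployments
      set_fact:
        # hypothetical container name, for illustration only
        docker_exec_cmd: "docker exec ceph-mon-{{ ansible_hostname }}"
      when: containerized_deployment
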
Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com>
tasks:
- - name: disable ceph rgw service
+# For backward compatibility
+ - name: disable ceph rgw service (old unit name, for backward compatibility)
service:
name: "ceph-rgw@{{ ansible_hostname }}"
state: stopped
enabled: no
ignore_errors: true
+ - name: disable ceph rgw service (new unit name)
+ service:
+ name: "ceph-radosgw@{{ ansible_hostname }}"
+ state: stopped
+ enabled: no
+ ignore_errors: true
+
- name: remove ceph rgw container
docker:
image: "{{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }}"
- name: remove ceph rgw service
file:
- path: /etc/systemd/system/ceph-rgw@.service
+ path: "{{ item }}"
state: absent
+ with_items:
+# For backward compatibility
+ - /etc/systemd/system/ceph-rgw@.service
+ - /etc/systemd/system/ceph-radosgw@.service
- name: remove ceph rgw image
docker_image:
- name: restart containerized ceph rgws with systemd
service:
- name: ceph-rgw@{{ ansible_hostname }}
+ name: ceph-radosgw@{{ ansible_hostname }}
state: restarted
enabled: yes
when:
+++ /dev/null
----
-- name: update apt cache
- apt:
- update-cache: yes
- when: ansible_os_family == 'Debian'
-
-- block:
- - name: copy mon restart script
- template:
- src: restart_mon_daemon.sh.j2
- dest: /tmp/restart_mon_daemon.sh
- owner: root
- group: root
- mode: 0750
- listen: "restart ceph mons"
-
- - name: restart ceph mon daemon(s)
- command: /tmp/restart_mon_daemon.sh
- listen: "restart ceph mons"
-
- when:
-# We do not want to run these checks on initial deployment (`socket.rc == 0`)
- - socket.rc == 0
- - ceph_current_fsid.rc == 0
- - mon_group_name in group_names
-
-# This does not just restart OSDs but everything else too. Unfortunately
-# at this time the ansible role does not have an OSD id list to use
-# for restarting them specifically.
-- block:
- - name: copy osd restart script
- template:
- src: restart_osd_daemon.sh.j2
- dest: /tmp/restart_osd_daemon.sh
- owner: root
- group: root
- mode: 0750
- listen: "restart ceph osds"
-
- - name: restart ceph osds daemon(s)
- command: /tmp/restart_osd_daemon.sh
- listen: "restart ceph osds"
- when: handler_health_osd_check
-
- when:
-# We do not want to run these checks on initial deployment (`socket.rc == 0`)
-# except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
- - ((crush_location is defined and crush_location) or socket.rc == 0)
- - ceph_current_fsid.rc == 0
- - osd_group_name in group_names
-# See https://github.com/ceph/ceph-ansible/issues/1457 for the condition below
- - inventory_hostname in play_hosts
-
-- name: restart ceph mdss
- service:
- name: ceph-mds@{{ mds_name }}
- state: restarted
- # serial: 1 would be the proper solution here, but that can only be set on play level
- # upstream issue: https://github.com/ansible/ansible/issues/12170
- run_once: true
- with_items: "{{ groups.get(mds_group_name, []) }}"
- delegate_to: "{{ item }}"
- when:
- - mds_group_name in group_names
-
-- name: restart ceph rgws
- service:
- name: ceph-radosgw@rgw.{{ ansible_hostname }}
- state: restarted
- # serial: 1 would be the proper solution here, but that can only be set on play level
- # upstream issue: https://github.com/ansible/ansible/issues/12170
- run_once: true
- with_items: "{{ groups.get(rgw_group_name, []) }}"
- delegate_to: "{{ item }}"
- when:
- - rgw_group_name in group_names
-
-- name: restart ceph nfss
- service:
- name: nfs-ganesha
- state: restarted
- when:
- - nfs_group_name in group_names
+++ /dev/null
----
-# These checks are used to avoid running handlers at initial deployment.
-- name: check for a ceph socket
- shell: "stat /var/run/ceph/*.asok > /dev/null 2>&1"
- changed_when: false
- failed_when: false
- always_run: true
- register: socket
-
-- name: check for a rados gateway socket
- shell: "stat {{ rbd_client_admin_socket_path }}*.asok > /dev/null 2>&1"
- changed_when: false
- failed_when: false
- always_run: true
- register: socketrgw
- ceph_current_fsid.rc == 0
- mon_group_name in group_names
-- include: ./checks/check_socket.yml
- include: create_ceph_initial_dirs.yml
- include: generate_ceph_conf.yml
- include: create_rbd_client_dir.yml
+++ /dev/null
-#!/bin/bash
-
-RETRIES="{{ handler_health_mon_check_retries }}"
-DELAY="{{ handler_health_mon_check_delay }}"
-MONITOR_NAME="{{ monitor_name }}"
-CLUSTER="{{ cluster }}"
-SOCKET=/var/run/ceph/${CLUSTER}-mon.${MONITOR_NAME}.asok
-
-
-check_quorum() {
-while [ $RETRIES -ne 0 ]; do
- MEMBERS=$(ceph --cluster ${CLUSTER} -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
- test "${MEMBERS/$MONITOR_NAME}" != "$MEMBERS" && exit 0
- sleep $DELAY
- let RETRIES=RETRIES-1
-done
-# If we reach this point, it means there is a problem with the quorum
-echo "Error with quorum."
-echo "cluster status:"
-ceph --cluster ${CLUSTER} -s
-exit 1
-}
-
-# First, restart the daemon
-systemctl restart ceph-mon@${MONITOR_NAME}
-
-COUNT=10
-# Wait and ensure the socket exists after restarting the daemon
-while [ $COUNT -ne 0 ]; do
- test -S $SOCKET && check_quorum
- sleep 1
- let COUNT=COUNT-1
-done
-# If we reach this point, it means the socket is not present.
-echo "Socket file ${SOCKET} could not be found, which means the monitor is not running."
-exit 1
+++ /dev/null
-#!/bin/bash
-
-RETRIES="{{ handler_health_osd_check_retries }}"
-DELAY="{{ handler_health_osd_check_delay }}"
-CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}"
-
-check_pgs() {
- while [ $RETRIES -ne 0 ]; do
- test "[""$(ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')""]" = "$(ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print [ i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if i["state_name"] == "active+clean"]')"
- RET=$?
- test $RET -eq 0 && return 0
- sleep $DELAY
- let RETRIES=RETRIES-1
- done
- # PGs not clean, exiting with return code 1
- echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean"
- echo "It is possible that the cluster has less OSDs than the replica configuration"
- echo "Will refuse to continue"
- ceph $CEPH_CLI -s
- exit 1
-}
-
-for id in $(ls /var/lib/ceph/osd/ | sed 's/.*-//'); do
- # First, restart daemon(s)
- systemctl restart ceph-osd@${id}
- # We need to wait because it may take some time for the socket to actually exists
- COUNT=10
- # Wait and ensure the socket exists after restarting the daemon
- SOCKET=/var/run/ceph/{{ cluster }}-osd.${id}.asok
- while [ $COUNT -ne 0 ]; do
- test -S $SOCKET && check_pgs && continue 2
- sleep 1
- let COUNT=COUNT-1
- done
- # If we reach this point, it means the socket is not present.
- echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running."
- exit 1
-done
##########
# DOCKER #
##########
-
+docker_exec_cmd:
docker: false
ceph_docker_image: "ceph/daemon"
ceph_docker_image_tag: latest
--- /dev/null
+---
+- name: update apt cache
+ apt:
+ update-cache: yes
+ when: ansible_os_family == 'Debian'
+
+- block:
+ - name: copy mon restart script
+ template:
+ src: restart_mon_daemon.sh.j2
+ dest: /tmp/restart_mon_daemon.sh
+ owner: root
+ group: root
+ mode: 0750
+ listen: "restart ceph mons"
+
+ - name: restart ceph mon daemon(s)
+ command: /tmp/restart_mon_daemon.sh
+ listen: "restart ceph mons"
+ when:
+# We do not want to run these checks on initial deployment (`socket.rc == 0`)
+ - socket.rc == 0
+ - mon_group_name in group_names
+
+# This does not just restart OSDs but everything else too. Unfortunately
+# at this time the ansible role does not have an OSD id list to use
+# for restarting them specifically.
+- name: copy osd restart script
+ template:
+ src: restart_osd_daemon.sh.j2
+ dest: /tmp/restart_osd_daemon.sh
+ owner: root
+ group: root
+ mode: 0750
+ listen: "restart ceph osds"
+ when:
+ - inventory_hostname in play_hosts
+ - osd_group_name in group_names
+
+- name: restart containerized ceph osds daemon(s)
+ command: /tmp/restart_osd_daemon.sh
+ listen: "restart ceph osds"
+ with_items: "{{ socket_osd_container.results }}"
+ when:
+ # We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
+ # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
+ - ((crush_location is defined and crush_location) or item.get('rc') == 0)
+ - handler_health_osd_check
+ # See https://github.com/ceph/ceph-ansible/issues/1457 for the condition below
+ - inventory_hostname in play_hosts
+ - osd_group_name in group_names
+
+- name: restart non-containerized ceph osds daemon(s)
+ command: /tmp/restart_osd_daemon.sh
+ listen: "restart ceph osds"
+ when:
+    # We do not want to run these checks on initial deployment (`socket.rc == 0`)
+ # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
+ - ((crush_location is defined and crush_location) or socket.rc == 0)
+ - ceph_current_fsid.rc == 0
+ - handler_health_osd_check
+ # See https://github.com/ceph/ceph-ansible/issues/1457 for the condition below
+ - inventory_hostname in play_hosts
+ - osd_group_name in group_names
+
+- name: restart ceph mdss
+ service:
+ name: ceph-mds@{{ mds_name }}
+ state: restarted
+ # serial: 1 would be the proper solution here, but that can only be set on play level
+ # upstream issue: https://github.com/ansible/ansible/issues/12170
+ run_once: true
+ with_items: "{{ groups.get(mds_group_name, []) }}"
+ delegate_to: "{{ item }}"
+ when:
+ - mds_group_name in group_names
+
+- name: restart ceph rgws
+ service:
+ name: ceph-radosgw@rgw.{{ ansible_hostname }}
+ state: restarted
+ # serial: 1 would be the proper solution here, but that can only be set on play level
+ # upstream issue: https://github.com/ansible/ansible/issues/12170
+ run_once: true
+ with_items: "{{ groups.get(rgw_group_name, []) }}"
+ delegate_to: "{{ item }}"
+ when:
+ - rgw_group_name in group_names
+
+- name: restart ceph nfss
+ service:
+ name: nfs-ganesha
+ state: restarted
+ when:
+ - nfs_group_name in group_names
--- /dev/null
+---
+# These checks are used to avoid running handlers at initial deployment.
+- name: check for a ceph socket
+ shell: |
+ {{ docker_exec_cmd }} bash -c 'stat {{ rbd_client_admin_socket_path }}/*.asok > /dev/null 2>&1'
+ changed_when: false
+ failed_when: false
+ always_run: true
+ register: socket
+
+- name: check for a ceph socket in containerized deployment (osds)
+ shell: |
+ docker exec ceph-osd-"{{ ansible_hostname }}"-"{{ item | replace('/', '') }}" bash -c 'stat /var/run/ceph/*.asok > /dev/null 2>&1'
+ changed_when: false
+ failed_when: false
+ always_run: true
+ register: socket_osd_container
+ with_items: "{{ devices }}"
+ when:
+ - containerized_deployment
+ - inventory_hostname in groups.get(osd_group_name)
---
- include: facts.yml
+- include: check_socket.yml
--- /dev/null
+#!/bin/bash
+
+RETRIES="{{ handler_health_mon_check_retries }}"
+DELAY="{{ handler_health_mon_check_delay }}"
+MONITOR_NAME="{{ monitor_name }}"
+SOCKET=/var/run/ceph/{{ cluster }}-mon.${MONITOR_NAME}.asok
+
+
+check_quorum() {
+while [ $RETRIES -ne 0 ]; do
+ MEMBERS=$({{ docker_exec_cmd }} ceph --cluster {{ cluster }} -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
+ test "${MEMBERS/$MONITOR_NAME}" != "$MEMBERS" && exit 0
+ sleep $DELAY
+ let RETRIES=RETRIES-1
+done
+# If we reach this point, it means there is a problem with the quorum
+echo "Error with quorum."
+echo "cluster status:"
+{{ docker_exec_cmd }} ceph --cluster {{ cluster }} -s
+exit 1
+}
+
+# First, restart the daemon
+systemctl restart ceph-mon@${MONITOR_NAME}
+
+COUNT=10
+# Wait and ensure the socket exists after restarting the daemon
+while [ $COUNT -ne 0 ]; do
+ {{ docker_exec_cmd }} test -S $SOCKET && check_quorum
+ sleep 1
+ let COUNT=COUNT-1
+done
+# If we reach this point, it means the socket is not present.
+echo "Socket file ${SOCKET} could not be found, which means the monitor is not running."
+exit 1
--- /dev/null
+#!/bin/bash
+
+RETRIES="{{ handler_health_osd_check_retries }}"
+DELAY="{{ handler_health_osd_check_delay }}"
+CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}"
+
+check_pgs() {
+ while [ $RETRIES -ne 0 ]; do
+ test "[""$($docker_exec ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')""]" = "$($docker_exec ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print [ i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if i["state_name"] == "active+clean"]')"
+ RET=$?
+ test $RET -eq 0 && return 0
+ sleep $DELAY
+ let RETRIES=RETRIES-1
+ done
+ # PGs not clean, exiting with return code 1
+ echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean"
+ echo "It is possible that the cluster has less OSDs than the replica configuration"
+ echo "Will refuse to continue"
+  $docker_exec ceph $CEPH_CLI -s
+ exit 1
+}
+
+wait_for_socket_in_docker() {
+ if ! docker exec "$1" timeout 10 bash -c "while [ ! -e /var/run/ceph/*.asok ]; do sleep 1 ; done"; then
+    echo "Timed out while trying to look for a Ceph OSD socket."
+    echo "Abort mission!"
+ exit 1
+ fi
+}
+
+get_dev_name() {
+ echo $1 | sed -r 's/ceph-osd@([a-z]{1,4})\.service/\1/'
+}
+
+get_docker_id_from_dev_name() {
+ local id
+ local count
+ count=10
+ while [ $count -ne 0 ]; do
+ id=$(docker ps -q -f "name=$1")
+ test "$id" != "" && break
+ sleep 1
+ let count=count-1
+ done
+ echo "$id"
+}
+
+get_docker_osd_id() {
+ wait_for_socket_in_docker $1
+ docker exec "$1" ls /var/run/ceph | cut -d'.' -f2
+}
+
+# For containerized deployments, the unit file looks like: ceph-osd@sda.service
+# For non-containerized deployments, the unit file looks like: ceph-osd@0.service
+for unit in $(systemctl list-units | grep -oE "ceph-osd@([0-9]{1,2}|[a-z]+).service"); do
+ # First, restart daemon(s)
+ systemctl restart "${unit}"
+ # We need to wait because it may take some time for the socket to actually exists
+ COUNT=10
+ # Wait and ensure the socket exists after restarting the daemon
+ {% if containerized_deployment -%}
+ id=$(get_dev_name "$unit")
+ container_id=$(get_docker_id_from_dev_name "$id")
+ osd_id=$(get_docker_osd_id "$container_id")
+ docker_exec="docker exec $container_id"
+ {% else %}
+ osd_id=$(echo ${unit#ceph-osd@} | grep -oE '[0-9]{1,2}')
+ {% endif %}
+  SOCKET=/var/run/ceph/{{ cluster }}-osd.${osd_id}.asok
+ while [ $COUNT -ne 0 ]; do
+ $docker_exec test -S "$SOCKET" && check_pgs && continue 2
+ sleep 1
+ let COUNT=COUNT-1
+ done
+ # If we reach this point, it means the socket is not present.
+ echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running."
+ exit 1
+done
config_type: ini
when:
- (not mon_containerized_default_ceph_conf_with_kv and
- (inventory_hostname in groups.get(mon_group_name, []))) or
+ (inventory_hostname in groups.get(mon_group_name, []) or inventory_hostname in groups.get(osd_group_name, []))) or
(not mon_containerized_default_ceph_conf_with_kv and
((groups.get(nfs_group_name, []) | length > 0)
and (inventory_hostname == groups.get(nfs_group_name, [])[0])))
+ notify:
+ - restart ceph mons
+ - restart ceph osds
+ - restart ceph mdss
+ - restart ceph rgws
- name: set fsid fact when generate_fsid = true
set_fact:
##########
# DOCKER #
##########
-docker_exec_cmd:
ceph_mon_docker_subnet: "{{ public_network }}" # subnet of the monitor_interface
# ceph_mon_docker_extra_env:
- name: generate systemd unit file
become: true
template:
- src: "{{ role_path }}/templates/ceph-rgw.service.j2"
- dest: /etc/systemd/system/ceph-rgw@.service
+ src: "{{ role_path }}/templates/ceph-radosgw.service.j2"
+ dest: /etc/systemd/system/ceph-radosgw@.service
owner: "root"
group: "root"
mode: "0644"
+# For backward compatibility
+- name: disable old systemd unit ('ceph-rgw@') if present
+ service:
+ name: ceph-rgw@{{ ansible_hostname }}
+    enabled: no
+  ignore_errors: true
+
- name: enable systemd unit file for rgw instance
- shell: systemctl enable ceph-rgw@{{ ansible_hostname }}.service
+ shell: systemctl enable ceph-radosgw@{{ ansible_hostname }}.service
failed_when: false
changed_when: false
- name: systemd start rgw container
service:
- name: ceph-rgw@{{ ansible_hostname }}
+ name: ceph-radosgw@{{ ansible_hostname }}
state: started
enabled: yes
changed_when: false
--- /dev/null
+[Unit]
+Description=Ceph RGW
+After=docker.service
+
+[Service]
+EnvironmentFile=-/etc/environment
+ExecStartPre=-/usr/bin/docker stop ceph-rgw-{{ ansible_hostname }}
+ExecStartPre=-/usr/bin/docker rm ceph-rgw-{{ ansible_hostname }}
+ExecStart=/usr/bin/docker run --rm --net=host \
+ {% if not containerized_deployment_with_kv -%}
+ -v /var/lib/ceph:/var/lib/ceph \
+ -v /etc/ceph:/etc/ceph \
+ {% else -%}
+ -e KV_TYPE={{kv_type}} \
+ -e KV_IP={{kv_endpoint}} \
+ -e KV_PORT={{kv_port}} \
+ {% endif -%}
+ -v /etc/localtime:/etc/localtime:ro \
+ --privileged \
+ -e CEPH_DAEMON=RGW \
+ {{ ceph_rgw_docker_extra_env }} \
+ --name=ceph-rgw-{{ ansible_hostname }} \
+ {{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }}
+ExecStopPost=-/usr/bin/docker stop ceph-rgw-{{ ansible_hostname }}
+Restart=always
+RestartSec=10s
+TimeoutStartSec=120
+TimeoutStopSec=15
+
+[Install]
+WantedBy=multi-user.target
+++ /dev/null
-[Unit]
-Description=Ceph RGW
-After=docker.service
-
-[Service]
-EnvironmentFile=-/etc/environment
-ExecStartPre=-/usr/bin/docker stop ceph-rgw-{{ ansible_hostname }}
-ExecStartPre=-/usr/bin/docker rm ceph-rgw-{{ ansible_hostname }}
-ExecStart=/usr/bin/docker run --rm --net=host \
- {% if not containerized_deployment_with_kv -%}
- -v /var/lib/ceph:/var/lib/ceph \
- -v /etc/ceph:/etc/ceph \
- {% else -%}
- -e KV_TYPE={{kv_type}} \
- -e KV_IP={{kv_endpoint}} \
- -e KV_PORT={{kv_port}} \
- {% endif -%}
- -v /etc/localtime:/etc/localtime:ro \
- --privileged \
- -e CEPH_DAEMON=RGW \
- {{ ceph_rgw_docker_extra_env }} \
- --name=ceph-rgw-{{ ansible_hostname }} \
- {{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }}
-ExecStopPost=-/usr/bin/docker stop ceph-rgw-{{ ansible_hostname }}
-Restart=always
-RestartSec=10s
-TimeoutStartSec=120
-TimeoutStopSec=15
-
-[Install]
-WantedBy=multi-user.target