From 142eccc6fd3391670c22841e2c3c4751a2254dfe Mon Sep 17 00:00:00 2001 From: Matthew Vernon Date: Fri, 21 Sep 2018 17:55:01 +0100 Subject: [PATCH] restart_osd_daemon.sh.j2 - Reset RETRIES between calls of check_pgs Previously RETRIES was set (by default to 40) once at the start of the script; this meant that it would only ever wait for up to 40 lots of 30s across *all* the OSDs on a host before bombing out. In fact, we want to be prepared to wait for the same amount of time after each OSD restart for the cluster's pgs to be happy again before continuing. Closes: #3154 Signed-off-by: Matthew Vernon (cherry picked from commit aa97ecf0480c1075187b38038463f2f52144c754) --- roles/ceph-defaults/templates/restart_osd_daemon.sh.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/ceph-defaults/templates/restart_osd_daemon.sh.j2 b/roles/ceph-defaults/templates/restart_osd_daemon.sh.j2 index 0ca0b1958..9d20870b1 100644 --- a/roles/ceph-defaults/templates/restart_osd_daemon.sh.j2 +++ b/roles/ceph-defaults/templates/restart_osd_daemon.sh.j2 @@ -1,6 +1,5 @@ #!/bin/bash -RETRIES="{{ handler_health_osd_check_retries }}" DELAY="{{ handler_health_osd_check_delay }}" CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}" @@ -72,6 +71,7 @@ for unit in $(systemctl list-units | grep -E "loaded * active" | grep -oE "ceph- {% endif %} SOCKET=/var/run/ceph/{{ cluster }}-osd.${osd_id}.asok while [ $COUNT -ne 0 ]; do + RETRIES="{{ handler_health_osd_check_retries }}" $docker_exec test -S "$SOCKET" && check_pgs && continue 2 sleep $DELAY let COUNT=COUNT-1 -- 2.47.3