From c7e269fcf5620a49909b880f57f5cbb988c27b07 Mon Sep 17 00:00:00 2001
From: Subhachandra Chandra
Date: Fri, 16 Mar 2018 10:10:14 -0700
Subject: [PATCH] Fix restarting OSDs twice during a rolling update.

During a rolling update, OSDs are currently restarted twice: once by
the handler in roles/ceph-defaults/handlers/main.yml, and a second time
by tasks in the rolling_update playbook. This change turns off restarts
by the handler. The restart initiated by the rolling_update playbook is
also more efficient, as it restarts all the OSDs on a host as one
operation and waits for them to rejoin the cluster, whereas the
handler's restart task restarts one OSD at a time and waits for it to
join the cluster.
---
 roles/ceph-defaults/handlers/main.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/roles/ceph-defaults/handlers/main.yml b/roles/ceph-defaults/handlers/main.yml
index 65b227a37..11e1a16de 100644
--- a/roles/ceph-defaults/handlers/main.yml
+++ b/roles/ceph-defaults/handlers/main.yml
@@ -64,6 +64,9 @@
 # This does not just restart OSDs but everything else too. Unfortunately
 # at this time the ansible role does not have an OSD id list to use
 # for restarting them specifically.
+# This does not need to run during a rolling update as the playbook will
+# restart all OSDs using the tasks "start ceph osd" or
+# "restart containerized ceph osd"
 - name: copy osd restart script
   template:
     src: restart_osd_daemon.sh.j2
@@ -74,6 +77,7 @@
   listen: "restart ceph osds"
   when:
     - osd_group_name in group_names
+    - not rolling_update
 
 - name: restart ceph osds daemon(s) - non container
   command: /usr/bin/env bash /tmp/restart_osd_daemon.sh
@@ -81,6 +85,7 @@
   when:
     - osd_group_name in group_names
     - not containerized_deployment
+    - not rolling_update
 # We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
 # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
     - osd_socket_stat.rc == 0
@@ -99,6 +104,7 @@
 # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
     - osd_group_name in group_names
     - containerized_deployment
+    - not rolling_update
     - ceph_osd_container_stat.get('rc') == 0
     - inventory_hostname == groups.get(osd_group_name) | last
     - ceph_osd_container_stat.get('stdout_lines', [])|length != 0
-- 
2.39.5
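
Note: for readers unfamiliar with the guard pattern this patch adds, here is a
minimal, hypothetical sketch of the resulting handler. It assumes
rolling_update defaults to false somewhere in the role's defaults and that the
rolling_update playbook overrides it to true (for example via an extra var);
the task body mirrors the non-container handler touched by the diff above, and
everything outside the diff is an assumption, not the actual file contents:

  # Assumed default, e.g. in roles/ceph-defaults/defaults/main.yml:
  #   rolling_update: false
  #
  # Assumed override when running the update playbook:
  #   ansible-playbook rolling_update.yml -e rolling_update=true

  - name: restart ceph osds daemon(s) - non container
    command: /usr/bin/env bash /tmp/restart_osd_daemon.sh
    listen: "restart ceph osds"
    when:
      - osd_group_name in group_names
      - not containerized_deployment
      # Skipped during a rolling update; the playbook's own "start ceph osd"
      # task restarts all OSDs on the host in one operation instead.
      - not rolling_update

With this guard, a normal config-change run still restarts OSDs through the
handler, while a rolling update leaves the restart entirely to the playbook,
avoiding the double restart described in the commit message.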