From: Sébastien Han <sebastien.han@enovance.com>
Date: Wed, 25 Mar 2015 10:28:37 +0000 (+0100)
Subject: Improve rolling upgrades
X-Git-Tag: v1.0.0~233^2~1
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=6f806cc3be4b03bcbfefba6f54501d45d05c1dc6;p=ceph-ansible.git

Improve rolling upgrades

Re-arrange the files.
Add new checks.

Signed-off-by: Sébastien Han <sebastien.han@enovance.com>
---

diff --git a/maintenance.yml b/maintenance.yml
deleted file mode 100644
index 3b7d2a9c6..000000000
--- a/maintenance.yml
+++ /dev/null
@@ -1,37 +0,0 @@
----
-# This playbook was made to automate Ceph servers maintenance
-# Typical use case: hardware change
-# By running this playbook you will set the 'noout' flag on your
-# cluster, which means that OSD **can't** be marked as out
-# of the CRUSH map, but they will be marked as down.
-# Basically we tell the cluster to don't move any data since
-# the operation won't last for too long.
-
-- hosts: <your_host>
-  gather_facts: False
-
-  tasks:
-
-  - name: Set the noout flag
-    command: ceph osd set noout
-    delegate_to: <your_monitor>
-
-  - name: Turn off the server
-    command: poweroff
-
-  - name: Wait for the server to go down
-    local_action: >
-      wait_for host=<your_host>
-      port=22
-      state=stopped
-
-  - name: Wait for the server to come up
-    local_action: >
-      wait_for host=<your_host
-      port=22
-      delay=10
-      timeout=3600
-
-  - name: Unset the noout flag
-    command: ceph osd unset noout
-    delegate_to: <your_monitor>
diff --git a/operations/cluster-maintenance.yml b/operations/cluster-maintenance.yml
new file mode 100644
index 000000000..3b7d2a9c6
--- /dev/null
+++ b/operations/cluster-maintenance.yml
@@ -0,0 +1,37 @@
+---
+# This playbook was made to automate Ceph servers maintenance
+# Typical use case: hardware change
+# By running this playbook you will set the 'noout' flag on your
+# cluster, which means that OSDs **can't** be marked as out
+# of the CRUSH map, but they will be marked as down.
+# Basically we tell the cluster not to move any data since
+# the operation won't last for too long.
+
+- hosts: <your_host>
+  gather_facts: False
+
+  tasks:
+
+  - name: Set the noout flag
+    command: ceph osd set noout
+    delegate_to: <your_monitor>
+
+  - name: Turn off the server
+    command: poweroff
+
+  - name: Wait for the server to go down
+    local_action: >
+      wait_for host=<your_host>
+      port=22
+      state=stopped
+
+  - name: Wait for the server to come up
+    local_action: >  # runs on the control machine, since the target was just powered off
+      wait_for host=<your_host>
+      port=22
+      delay=10
+      timeout=3600
+
+  - name: Unset the noout flag
+    command: ceph osd unset noout
+    delegate_to: <your_monitor>
diff --git a/operations/cluster-operating-system-migration.yml b/operations/cluster-operating-system-migration.yml
new file mode 100644
index 000000000..b09798b96
--- /dev/null
+++ b/operations/cluster-operating-system-migration.yml
@@ -0,0 +1,249 @@
+---
+# This playbook was meant to upgrade a node from Ubuntu to RHEL.
+# We perform a set of actions prior to rebooting the node.
+# The node reboots via PXE and gets its new operating system.
+# This playbook only works for monitors and OSDs.
+
+- hosts: mons
+  serial: 1
+  sudo: True
+
+  vars:
+    backup_dir: /tmp/
+
+  pre_tasks:
+    - name: Compress the store as much as possible
+      command: ceph tell mon.{{ ansible_hostname }} compact
+
+    - name: Check if sysvinit
+      stat: >
+        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/sysvinit
+      register: sysvinit
+
+    - name: Check if upstart
+      stat: >
+        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/upstart
+      register: upstart
+
+    - name: Restart the Monitor after compaction (Upstart)
+      service: name=ceph-mon-all state=restarted
+      when: upstart.stat.exists == True
+
+    - name: Restart the Monitor after compaction (Sysvinit)
+      service: name=ceph state=restarted args=mon
+      when: sysvinit.stat.exists == True
+
+    - name: Wait for the monitor to be up again
+      local_action: >
+        wait_for
+        host={{ ansible_ssh_host | default(inventory_hostname) }}
+        port=6789
+        timeout=10
+
+    - name: Stop the monitor (Upstart)
+      service: name=ceph-mon-all state=stopped  # must be down before its store is archived
+      when: upstart.stat.exists == True
+
+    - name: Stop the monitor (Sysvinit)
+      service: name=ceph state=stopped args=mon  # must be down before its store is archived
+      when: sysvinit.stat.exists == True
+
+    - name: Wait for the monitor to be down
+      local_action: >
+        wait_for
+        host={{ ansible_ssh_host | default(inventory_hostname) }}
+        port=6789
+        timeout=10
+        state=stopped
+
+    - name: Create a backup directory
+      file: >  # a directory needs the execute bit to be traversable, hence 0755 rather than 0644
+        path={{ backup_dir }}/monitors-backups
+        state=directory
+        owner=root
+        group=root
+        mode=0755
+      delegate_to: "{{ item }}"  # created on the first host of the 'backup' group
+      with_items: groups.backup[0]
+
+    - name: Archive monitor stores
+      shell: >
+        tar -cpvzf - --one-file-system . /etc/ceph/ceph.conf | cat > {{ ansible_hostname }}.tar
+        chdir=/var/lib/ceph/
+        creates={{ ansible_hostname }}.tar
+
+    - name: Scp the Monitor store
+      fetch: >
+        src=/var/lib/ceph/{{ ansible_hostname }}.tar
+        dest={{ backup_dir }}/monitors-backups/{{ ansible_hostname }}.tar
+        flat=yes
+
+  tasks:
+    - name: Reboot the server
+      command: reboot
+
+    - name: Wait for the server to come up
+      local_action: >  # wait_for defaults to host=127.0.0.1, so the rebooted node is named explicitly
+        wait_for host={{ ansible_ssh_host | default(inventory_hostname) }}
+        port=22
+        delay=10
+        timeout=3600
+
+    - name: Wait a bit more to be sure that the server is ready
+      pause: seconds=20
+
+    - name: Check if sysvinit
+      stat: >
+        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/sysvinit
+      register: sysvinit
+
+    - name: Check if upstart
+      stat: >
+        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/upstart
+      register: upstart
+
+    - name: Make sure the monitor is stopped (Upstart)
+      service: name=ceph-mon-all state=stopped  # keep it down until the store has been restored
+      when: upstart.stat.exists == True
+
+    - name: Make sure the monitor is stopped (Sysvinit)
+      service: name=ceph state=stopped args=mon  # keep it down until the store has been restored
+      when: sysvinit.stat.exists == True
+
+    - name: Scp back monitor store
+      copy: >
+        src={{ backup_dir }}/monitors-backups/{{ ansible_hostname }}.tar
+        dest=/var/lib/ceph/{{ ansible_hostname }}.tar
+
+    - name: Untar the monitor store
+      shell: >
+        tar -xzvf {{ ansible_hostname }}.tar --overwrite --overwrite-dir
+        chdir=/var/lib/ceph/
+        creates=etc/ceph/ceph.conf
+
+    - name: Configure RHEL7 for sysvinit
+      shell: find -L /var/lib/ceph/mon/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[A-Za-z0-9]+-[A-Za-z0-9._-]+' -exec touch {}/sysvinit \; -exec rm {}/upstart \;
+
+    - name: Start the monitor
+      service: >
+        name=ceph
+        state=started
+        pattern=/usr/bin/ceph-mon
+        args=mon
+
+    - name: Wait for the Monitor to be up again
+      local_action: >
+        wait_for
+        host={{ ansible_ssh_host | default(inventory_hostname) }}
+        port=6789
+        timeout=10
+
+    - name: Waiting for a quorum...
+      shell: >
+        ceph -s | grep monmap | sed 's/.*quorum//' | egrep -q {{ ansible_hostname }}
+      register: result
+      until: result.rc == 0
+      retries: 5
+      delay: 10
+      delegate_to: "{{ item }}"
+      with_items: groups.backup[0]
+
+- hosts: osds
+  serial: 1
+  sudo: True
+
+  vars:
+    backup_dir: /tmp/
+
+  pre_tasks:
+    - name: Set the noout flag
+      command: ceph osd set noout
+      delegate_to: "{{ item }}"
+      with_items: groups.mons[0]
+
+  tasks:
+    - name: Archive ceph configs
+      shell: >
+        tar -cpvzf - --one-file-system . /etc/ceph/ceph.conf | cat > {{ ansible_hostname }}.tar
+        chdir=/var/lib/ceph/
+        creates={{ ansible_hostname }}.tar
+
+    - name: Create backup directory
+      file: >  # a directory needs the execute bit to be traversable, hence 0755 rather than 0644
+        path={{ backup_dir }}/osds-backups
+        state=directory
+        owner=root
+        group=root
+        mode=0755
+      delegate_to: "{{ item }}"  # created on the first host of the 'backup' group
+      with_items: groups.backup[0]
+
+    - name: Scp OSDs dirs and configs
+      fetch: >
+        src=/var/lib/ceph/{{ ansible_hostname }}.tar
+        dest={{ backup_dir }}/osds-backups/
+        flat=yes
+
+    - name: Reboot the server
+      command: reboot
+
+    - name: Wait for the server to come up
+      local_action: >  # wait_for defaults to host=127.0.0.1, so the rebooted node is named explicitly
+        wait_for host={{ ansible_ssh_host | default(inventory_hostname) }}
+        port=22
+        delay=10
+        timeout=3600
+
+    - name: Wait a bit to be sure that the server is ready for scp
+      pause: seconds=20
+
+    - name: Scp back OSDs dirs and configs
+      copy: >
+        src={{ backup_dir }}/osds-backups/{{ ansible_hostname }}.tar
+        dest=/var/lib/ceph/{{ ansible_hostname }}.tar
+
+    - name: Untar the OSD config
+      shell: >
+        tar -xzvf {{ ansible_hostname }}.tar --overwrite --overwrite-dir
+        chdir=/var/lib/ceph/
+        creates=etc/ceph/ceph.conf
+
+    - name: Configure RHEL with sysvinit
+      shell: find -L /var/lib/ceph/osd/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[A-Za-z0-9]+-[A-Za-z0-9._-]+' -exec touch {}/sysvinit \; -exec rm {}/upstart \;
+
+    - name: Copy ceph.conf
+      command: >
+        cp etc/ceph/ceph.conf /etc/ceph/ceph.conf
+        chdir=/var/lib/ceph/
+
+    - name: Start all the OSDs
+      service: >
+        name=ceph
+        state=started
+        pattern=/usr/bin/ceph-osd
+        args=osd
+
+    - name: Wait for the OSDs to be up again
+      local_action: >
+        wait_for
+        host={{ ansible_ssh_host | default(inventory_hostname) }}
+        port={{ item }}
+        timeout=10
+      with_items:
+        - 6800
+
+    - name: Waiting for clean PGs...
+      shell: >  # passes when the PG count before "active+clean" in `ceph pg stat` equals the total PG count and `ceph -s` reports HEALTH_OK or HEALTH_WARN
+        test "$(ceph pg stat | sed 's/^.*pgs://' | sed 's/active+clean.*//' |sed 's/ //')" -eq "$(ceph pg stat | sed 's/pgs.*//' | sed 's/^.*://' | sed 's/ //')" && ceph -s | egrep -q "HEALTH_OK|HEALTH_WARN"
+      register: result
+      until: result.rc == 0  # retried up to 10 times, 10 seconds apart, until the check succeeds
+      retries: 10
+      delay: 10
+      delegate_to: "{{ item }}"  # the check runs on the first host of the 'backup' group
+      with_items: groups.backup[0]
+
+#  post_tasks:
+    - name: Unset the noout flag
+      command: ceph osd unset noout
+      delegate_to: "{{ item }}"
+      with_items: groups.mons[0]
diff --git a/operations/purge-cluster.yml b/operations/purge-cluster.yml
new file mode 100644
index 000000000..5848d4a9f
--- /dev/null
+++ b/operations/purge-cluster.yml
@@ -0,0 +1,32 @@
+---
+# This playbook purges Ceph
+# It removes: packages, configuration files and ALL THE DATA
+
+- hosts:
+  - mons
+  - osds
+
+  vars:
+    devices: [ '/dev/sdb', '/dev/sdc', '/dev/sdd', '/dev/sde', '/dev/sdf' ]  # disks handed to parted in "Purge partitions"
+    partitions: [ '1', '2', '3' ]  # partition numbers removed from each device
+
+  tasks:
+
+  - name: Purge Ceph
+    command: ceph-deploy purge {{ ansible_fqdn }}
+    delegate_to: 127.0.0.1  # ceph-deploy is invoked on the control machine, once per target host
+
+  - name: Remove OSD data
+    shell: rm -rf /var/lib/ceph/osd/*/*
+    ignore_errors: true  # best effort: a failure here does not stop the purge
+
+  - name: Purge remaining data
+    command: ceph-deploy purgedata {{ ansible_fqdn }}
+    delegate_to: 127.0.0.1  # invoked on the control machine against the target's FQDN
+
+  - name: Purge partitions
+    shell: parted -s {{ item[0] }} rm {{ item[1] }}
+    with_nested:  # every (device, partition) pair: item[0] is the device, item[1] the partition number
+      - devices
+      - partitions
+    ignore_errors: true  # best effort: parted errors are ignored
diff --git a/operations/rolling_update.yml b/operations/rolling_update.yml
new file mode 100644
index 000000000..3c606271c
--- /dev/null
+++ b/operations/rolling_update.yml
@@ -0,0 +1,56 @@
+---
+# This playbook does a rolling update for all the Ceph services
+# Change the value of serial: to adjust the number of servers to be updated.
+#
+# The four roles that apply to the ceph hosts will be applied: ceph-common,
+# ceph-mon, ceph-osd and ceph-mds. So any changes to configuration, package updates, etc,
+# will be applied as part of the rolling update process.
+#
+
+# /!\ DO NOT FORGET TO CHANGE THE RELEASE VERSION FIRST! /!\
+
+- hosts:
+  - mons
+  - osds
+  - mdss
+  - rgws
+  sudo: True
+  roles:
+  - ceph-common  # applied to all four groups before the per-daemon plays below
+
+- hosts: mons
+  serial: 1  # one monitor host at a time
+  sudo: True
+  roles:
+  - ceph-mon
+  post_tasks:  # run after the ceph-mon role has been applied
+  - name: restart monitor(s)
+    service: >
+      name=ceph
+      state=restarted
+      args=mon
+
+- hosts: osds
+  serial: 1  # one OSD host at a time
+  sudo: True
+  roles:
+  - ceph-osd
+  post_tasks:  # NOTE(review): only Ubuntu and Debian have a restart task; other distributions are not restarted here
+  - name: restart object storage daemon(s)
+    command: service ceph-osd-all restart
+    when: ansible_distribution == "Ubuntu"
+  - name: restart object storage daemon(s)
+    service: name=ceph state=restarted args=osd
+    when: ansible_distribution == "Debian"
+
+- hosts: mdss
+  serial: 1  # one metadata server host at a time
+  sudo: True
+  roles:
+  - ceph-mds
+  post_tasks:  # run after the ceph-mds role has been applied
+  - name: restart metadata server(s)
+    service: >
+      name=ceph
+      state=restarted
+      args=mds
diff --git a/purge.yml b/purge.yml
deleted file mode 100644
index 5848d4a9f..000000000
--- a/purge.yml
+++ /dev/null
@@ -1,32 +0,0 @@
----
-# This playbook purges Ceph
-# It removes: packages, configuration files and ALL THE DATA
-
-- hosts:
-  - mons
-  - osds
-
-  vars:
-    devices: [ '/dev/sdb', '/dev/sdc', '/dev/sdd', '/dev/sde', '/dev/sdf' ]
-    partitions: [ '1', '2', '3' ]
-
-  tasks:
-
-  - name: Purge Ceph
-    command: ceph-deploy purge {{ ansible_fqdn }}
-    delegate_to: 127.0.0.1
-
-  - name: Remove OSD data
-    shell: rm -rf /var/lib/ceph/osd/*/*
-    ignore_errors: true
-
-  - name: Purge remaining data
-    command: ceph-deploy purgedata {{ ansible_fqdn }}
-    delegate_to: 127.0.0.1
-
-  - name: Purge partitions
-    shell: parted -s {{ item[0] }} rm {{ item[1] }}
-    with_nested:
-      - devices
-      - partitions
-    ignore_errors: true
diff --git a/rolling_update.yml b/rolling_update.yml
deleted file mode 100644
index 3c606271c..000000000
--- a/rolling_update.yml
+++ /dev/null
@@ -1,56 +0,0 @@
----
-# This playbook does a rolling update for all the Ceph services
-# Change the value of serial: to adjust the number of server to be updated.
-#
-# The four roles that apply to the ceph hosts will be applied: ceph-common,
-# ceph-mon, ceph-osd and ceph-mds. So any changes to configuration, package updates, etc,
-# will be applied as part of the rolling update process.
-#
-
-# /!\ DO NOT FORGET TO CHANGE THE RELEASE VERSION FIRST! /!\
-
-- hosts:
-  - mons
-  - osds
-  - mdss
-  - rgws
-  sudo: True
-  roles:
-  - ceph-common
-
-- hosts: mons
-  serial: 1
-  sudo: True
-  roles:
-  - ceph-mon
-  post_tasks:
-  - name: restart monitor(s)
-    service: >
-      name=ceph
-      state=restarted
-      args=mon
-
-- hosts: osds
-  serial: 1
-  sudo: True
-  roles:
-  - ceph-osd
-  post_tasks:
-  - name: restart object storage daemon(s)
-    command: service ceph-osd-all restart
-    when: ansible_distribution == "Ubuntu"
-  - name: restart object storage daemon(s)
-    service: name=ceph state=restarted args=osd
-    when: ansible_distribution == "Debian"
-
-- hosts: mdss
-  serial: 1
-  sudo: True
-  roles:
-  - ceph-mds
-  post_tasks:
-  - name: restart metadata server(s)
-    service: >
-      name=ceph
-      state=restarted
-      args=mds