From: Sébastien Han
Date: Wed, 25 Mar 2015 10:28:37 +0000 (+0100)
Subject: Improve rolling upgrades
X-Git-Tag: v1.0.0~233^2~1
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=6f806cc3be4b03bcbfefba6f54501d45d05c1dc6;p=ceph-ansible.git

Improve rolling upgrades

Re-arrange the files.
Add new checks.

Signed-off-by: Sébastien Han
---

Review notes (not part of the original commit message):

* Overview: this patch moves maintenance.yml, purge.yml and
  rolling_update.yml under operations/ (each is deleted at the top level and
  re-added with the same blob hash) and adds one new playbook,
  operations/cluster-operating-system-migration.yml.

* This copy was re-flowed from a capture whose line breaks and indentation
  had been collapsed. Line boundaries are recovered from the +/- markers
  (they match the 249-, 32- and 56-line hunk headers); the indentation
  inside each playbook is reconstructed and should be checked against the
  repository before the patch is applied.

* maintenance.yml and operations/cluster-maintenance.yml: in the capture the
  values after `hosts:`, `delegate_to:` and `wait_for host=` are empty and
  each hunk stops after 30 of the 37 lines announced in its header. The
  missing text cannot be recovered from the capture and is deliberately not
  guessed here. NOTE(review): presumably placeholder values were stripped
  during extraction; confirm against blob 3b7d2a9c6.

* operations/cluster-operating-system-migration.yml intentionally differs
  from blob b09798b96 (the index line still names the original blob):
    - "Stop the monitor (Upstart|Sysvinit)" and "Make sure the monitor is
      stopped (Upstart|Sysvinit)" used state=started, i.e. they started the
      daemon they claim to stop. The pre-reboot pair is followed by a
      wait_for on port 6789 with state=stopped, which could not succeed
      while the monitor was still running. All four now use state=stopped.
    - Both post-reboot "Wait for the server to come up" tasks run wait_for
      as a local_action without host=, so they polled port 22 on the
      control machine instead of the rebooted node. They now pass
      host={{ ansible_ssh_host | default(inventory_hostname) }} like the
      other wait_for tasks in the same file. This adds two lines, so the
      hunk header changes from +1,249 to +1,251.

* NOTE(review): the backup directories in that playbook are created with
  mode=0644, which leaves a directory without its search bit. Left
  unchanged here; consider 0700.

diff --git a/maintenance.yml b/maintenance.yml
deleted file mode 100644
index 3b7d2a9c6..000000000
--- a/maintenance.yml
+++ /dev/null
@@ -1,37 +0,0 @@
----
-# This playbook was made to automate Ceph servers maintenance
-# Typical use case: hardware change
-# By running this playbook you will set the 'noout' flag on your
-# cluster, which means that OSD **can't** be marked as out
-# of the CRUSH map, but they will be marked as down.
-# Basically we tell the cluster to don't move any data since
-# the operation won't last for too long.
-
-- hosts:
-  gather_facts: False
-
-  tasks:
-
-  - name: Set the noout flag
-    command: ceph osd set noout
-    delegate_to:
-
-  - name: Turn off the server
-    command: poweroff
-
-  - name: Wait for the server to go down
-    local_action: >
-      wait_for host=
-      port=22
-      state=stopped
-
-  - name: Wait for the server to come up
-    local_action: >
-      wait_for host=
diff --git a/operations/cluster-maintenance.yml b/operations/cluster-maintenance.yml
new file mode 100644
index 000000000..3b7d2a9c6
--- /dev/null
+++ b/operations/cluster-maintenance.yml
@@ -0,0 +1,37 @@
+---
+# This playbook was made to automate Ceph servers maintenance
+# Typical use case: hardware change
+# By running this playbook you will set the 'noout' flag on your
+# cluster, which means that OSD **can't** be marked as out
+# of the CRUSH map, but they will be marked as down.
+# Basically we tell the cluster to don't move any data since
+# the operation won't last for too long.
+
+- hosts:
+  gather_facts: False
+
+  tasks:
+
+  - name: Set the noout flag
+    command: ceph osd set noout
+    delegate_to:
+
+  - name: Turn off the server
+    command: poweroff
+
+  - name: Wait for the server to go down
+    local_action: >
+      wait_for host=
+      port=22
+      state=stopped
+
+  - name: Wait for the server to come up
+    local_action: >
+      wait_for host=
diff --git a/operations/cluster-operating-system-migration.yml b/operations/cluster-operating-system-migration.yml
new file mode 100644
index 000000000..b09798b96
--- /dev/null
+++ b/operations/cluster-operating-system-migration.yml
@@ -0,0 +1,251 @@
+---
+# This playbook was meant to upgrade a node from Ubuntu to RHEL.
+# We are performing a set of actions prior to reboot the node.
+# The node reboots via PXE and gets its new operating system.
+# This playbook only works for monitors and OSDs.
+
+- hosts: mons
+  serial: 1
+  sudo: True
+
+  vars:
+    backup_dir: /tmp/
+
+  pre_tasks:
+    - name: Compress the store as much as possible
+      command: ceph tell mon.{{ ansible_hostname }} compact
+
+    - name: Check if sysvinit
+      stat: >
+        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/sysvinit
+      register: sysvinit
+
+    - name: Check if upstart
+      stat: >
+        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/upstart
+      register: upstart
+
+    - name: Restart the Monitor after compaction (Upstart)
+      service: name=ceph-mon-all state=restarted
+      when: upstart.stat.exists == True
+
+    - name: Restart the Monitor after compaction (Sysvinit)
+      service: name=ceph state=restarted args=mon
+      when: sysvinit.stat.exists == True
+
+    - name: Wait for the monitor to be up again
+      local_action: >
+        wait_for
+        host={{ ansible_ssh_host | default(inventory_hostname) }}
+        port=6789
+        timeout=10
+
+    - name: Stop the monitor (Upstart)
+      service: name=ceph-mon-all state=stopped
+      when: upstart.stat.exists == True
+
+    - name: Stop the monitor (Sysvinit)
+      service: name=ceph state=stopped args=mon
+      when: sysvinit.stat.exists == True
+
+    - name: Wait for the monitor to be down
+      local_action: >
+        wait_for
+        host={{ ansible_ssh_host | default(inventory_hostname) }}
+        port=6789
+        timeout=10
+        state=stopped
+
+    - name: Create a backup directory
+      file: >
+        path={{ backup_dir }}/monitors-backups
+        state=directory
+        owner=root
+        group=root
+        mode=0644
+      delegate_to: "{{ item }}"
+      with_items: groups.backup[0]
+
+    - name: Archive monitor stores
+      shell: >
+        tar -cpvzf - --one-file-system . /etc/ceph/ceph.conf | cat > {{ ansible_hostname }}.tar
+        chdir=/var/lib/ceph/
+        creates={{ ansible_hostname }}.tar
+
+    - name: Scp the Monitor store
+      fetch: >
+        src=/var/lib/ceph/{{ ansible_hostname }}.tar
+        dest={{ backup_dir }}/monitors-backups/{{ ansible_hostname }}.tar
+        flat=yes
+
+  tasks:
+    - name: Reboot the server
+      command: reboot
+
+    - name: Wait for the server to come up
+      local_action: >
+        wait_for
+        host={{ ansible_ssh_host | default(inventory_hostname) }}
+        port=22
+        delay=10
+        timeout=3600
+
+    - name: Wait a bit more to be sure that the server is ready
+      pause: seconds=20
+
+    - name: Check if sysvinit
+      stat: >
+        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/sysvinit
+      register: sysvinit
+
+    - name: Check if upstart
+      stat: >
+        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/upstart
+      register: upstart
+
+    - name: Make sure the monitor is stopped (Upstart)
+      service: name=ceph-mon-all state=stopped
+      when: upstart.stat.exists == True
+
+    - name: Make sure the monitor is stopped (Sysvinit)
+      service: name=ceph state=stopped args=mon
+      when: sysvinit.stat.exists == True
+
+    - name: Scp back monitor store
+      copy: >
+        src={{ backup_dir }}/monitors-backups/{{ ansible_hostname }}.tar
+        dest=/var/lib/ceph/{{ ansible_hostname }}.tar
+
+    - name: Untar the monitor store
+      shell: >
+        tar -xzvf {{ ansible_hostname }}.tar --overwrite --overwrite-dir
+        chdir=/var/lib/ceph/
+        creates=etc/ceph/ceph.conf
+
+    - name: Configure RHEL7 for sysvinit
+      shell: find -L /var/lib/ceph/mon/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[A-Za-z0-9]+-[A-Za-z0-9._-]+' -exec touch {}/sysvinit \; -exec rm {}/upstart \;
+
+    - name: Start the monitor
+      service: >
+        name=ceph
+        state=started
+        pattern=/usr/bin/ceph-mon
+        args=mon
+
+    - name: Wait for the Monitor to be up again
+      local_action: >
+        wait_for
+        host={{ ansible_ssh_host | default(inventory_hostname) }}
+        port=6789
+        timeout=10
+
+    - name: Waiting for a quorum...
+      shell: >
+        ceph -s | grep monmap | sed 's/.*quorum//' | egrep -q {{ ansible_hostname }}
+      register: result
+      until: result.rc == 0
+      retries: 5
+      delay: 10
+      delegate_to: "{{ item }}"
+      with_items: groups.backup[0]
+
+- hosts: osds
+  serial: 1
+  sudo: True
+
+  vars:
+    backup_dir: /tmp/
+
+  pre_tasks:
+    - name: Set the noout flag
+      command: ceph osd set noout
+      delegate_to: "{{ item }}"
+      with_items: groups.mons[0]
+
+  tasks:
+    - name: Archive ceph configs
+      shell: >
+        tar -cpvzf - --one-file-system . /etc/ceph/ceph.conf | cat > {{ ansible_hostname }}.tar
+        chdir=/var/lib/ceph/
+        creates={{ ansible_hostname }}.tar
+
+    - name: Create backup directory
+      file: >
+        path={{ backup_dir }}/osds-backups
+        state=directory
+        owner=root
+        group=root
+        mode=0644
+      delegate_to: "{{ item }}"
+      with_items: groups.backup[0]
+
+    - name: Scp OSDs dirs and configs
+      fetch: >
+        src=/var/lib/ceph/{{ ansible_hostname }}.tar
+        dest={{ backup_dir }}/osds-backups/
+        flat=yes
+
+    - name: Reboot the server
+      command: reboot
+
+    - name: Wait for the server to come up
+      local_action: >
+        wait_for
+        host={{ ansible_ssh_host | default(inventory_hostname) }}
+        port=22
+        delay=10
+        timeout=3600
+
+    - name: Wait a bit to be sure that the server is ready for scp
+      pause: seconds=20
+
+    - name: Scp back OSDs dirs and configs
+      copy: >
+        src={{ backup_dir }}/osds-backups/{{ ansible_hostname }}.tar
+        dest=/var/lib/ceph/{{ ansible_hostname }}.tar
+
+    - name: Untar the OSD config
+      shell: >
+        tar -xzvf {{ ansible_hostname }}.tar --overwrite --overwrite-dir
+        chdir=/var/lib/ceph/
+        creates=etc/ceph/ceph.conf
+
+    - name: Configure RHEL with sysvinit
+      shell: find -L /var/lib/ceph/osd/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[A-Za-z0-9]+-[A-Za-z0-9._-]+' -exec touch {}/sysvinit \; -exec rm {}/upstart \;
+
+    - name: Copy ceph.conf
+      command: >
+        cp etc/ceph/ceph.conf /etc/ceph/ceph.conf
+        chdir=/var/lib/ceph/
+
+    - name: Start all the OSDs
+      service: >
+        name=ceph
+        state=started
+        pattern=/usr/bin/ceph-osd
+        args=osd
+
+    - name: Wait for the OSDs to be up again
+      local_action: >
+        wait_for
+        host={{ ansible_ssh_host | default(inventory_hostname) }}
+        port={{ item }}
+        timeout=10
+      with_items:
+        - 6800
+
+    - name: Waiting for clean PGs...
+      shell: >
+        test "$(ceph pg stat | sed 's/^.*pgs://' | sed 's/active+clean.*//' |sed 's/ //')" -eq "$(ceph pg stat | sed 's/pgs.*//' | sed 's/^.*://' | sed 's/ //')" && ceph -s | egrep -q "HEALTH_OK|HEALTH_WARN"
+      register: result
+      until: result.rc == 0
+      retries: 10
+      delay: 10
+      delegate_to: "{{ item }}"
+      with_items: groups.backup[0]
+
+# post_tasks:
+    - name: Unset the noout flag
+      command: ceph osd unset noout
+      delegate_to: "{{ item }}"
+      with_items: groups.mons[0]
diff --git a/operations/purge-cluster.yml b/operations/purge-cluster.yml
new file mode 100644
index 000000000..5848d4a9f
--- /dev/null
+++ b/operations/purge-cluster.yml
@@ -0,0 +1,32 @@
+---
+# This playbook purges Ceph
+# It removes: packages, configuration files and ALL THE DATA
+
+- hosts:
+  - mons
+  - osds
+
+  vars:
+    devices: [ '/dev/sdb', '/dev/sdc', '/dev/sdd', '/dev/sde', '/dev/sdf' ]
+    partitions: [ '1', '2', '3' ]
+
+  tasks:
+
+  - name: Purge Ceph
+    command: ceph-deploy purge {{ ansible_fqdn }}
+    delegate_to: 127.0.0.1
+
+  - name: Remove OSD data
+    shell: rm -rf /var/lib/ceph/osd/*/*
+    ignore_errors: true
+
+  - name: Purge remaining data
+    command: ceph-deploy purgedata {{ ansible_fqdn }}
+    delegate_to: 127.0.0.1
+
+  - name: Purge partitions
+    shell: parted -s {{ item[0] }} rm {{ item[1] }}
+    with_nested:
+      - devices
+      - partitions
+    ignore_errors: true
diff --git a/operations/rolling_update.yml b/operations/rolling_update.yml
new file mode 100644
index 000000000..3c606271c
--- /dev/null
+++ b/operations/rolling_update.yml
@@ -0,0 +1,56 @@
+---
+# This playbook does a rolling update for all the Ceph services
+# Change the value of serial: to adjust the number of server to be updated.
+#
+# The four roles that apply to the ceph hosts will be applied: ceph-common,
+# ceph-mon, ceph-osd and ceph-mds. So any changes to configuration, package updates, etc,
+# will be applied as part of the rolling update process.
+#
+
+# /!\ DO NOT FORGET TO CHANGE THE RELEASE VERSION FIRST! /!\
+
+- hosts:
+  - mons
+  - osds
+  - mdss
+  - rgws
+  sudo: True
+  roles:
+  - ceph-common
+
+- hosts: mons
+  serial: 1
+  sudo: True
+  roles:
+  - ceph-mon
+  post_tasks:
+  - name: restart monitor(s)
+    service: >
+      name=ceph
+      state=restarted
+      args=mon
+
+- hosts: osds
+  serial: 1
+  sudo: True
+  roles:
+  - ceph-osd
+  post_tasks:
+  - name: restart object storage daemon(s)
+    command: service ceph-osd-all restart
+    when: ansible_distribution == "Ubuntu"
+  - name: restart object storage daemon(s)
+    service: name=ceph state=restarted args=osd
+    when: ansible_distribution == "Debian"
+
+- hosts: mdss
+  serial: 1
+  sudo: True
+  roles:
+  - ceph-mds
+  post_tasks:
+  - name: restart metadata server(s)
+    service: >
+      name=ceph
+      state=restarted
+      args=mds
diff --git a/purge.yml b/purge.yml
deleted file mode 100644
index 5848d4a9f..000000000
--- a/purge.yml
+++ /dev/null
@@ -1,32 +0,0 @@
----
-# This playbook purges Ceph
-# It removes: packages, configuration files and ALL THE DATA
-
-- hosts:
-  - mons
-  - osds
-
-  vars:
-    devices: [ '/dev/sdb', '/dev/sdc', '/dev/sdd', '/dev/sde', '/dev/sdf' ]
-    partitions: [ '1', '2', '3' ]
-
-  tasks:
-
-  - name: Purge Ceph
-    command: ceph-deploy purge {{ ansible_fqdn }}
-    delegate_to: 127.0.0.1
-
-  - name: Remove OSD data
-    shell: rm -rf /var/lib/ceph/osd/*/*
-    ignore_errors: true
-
-  - name: Purge remaining data
-    command: ceph-deploy purgedata {{ ansible_fqdn }}
-    delegate_to: 127.0.0.1
-
-  - name: Purge partitions
-    shell: parted -s {{ item[0] }} rm {{ item[1] }}
-    with_nested:
-      - devices
-      - partitions
-    ignore_errors: true
diff --git a/rolling_update.yml b/rolling_update.yml
deleted file mode 100644
index 3c606271c..000000000
--- a/rolling_update.yml
+++ /dev/null
@@ -1,56 +0,0 @@
----
-# This playbook does a rolling update for all the Ceph services
-# Change the value of serial: to adjust the number of server to be updated.
-#
-# The four roles that apply to the ceph hosts will be applied: ceph-common,
-# ceph-mon, ceph-osd and ceph-mds. So any changes to configuration, package updates, etc,
-# will be applied as part of the rolling update process.
-#
-
-# /!\ DO NOT FORGET TO CHANGE THE RELEASE VERSION FIRST! /!\
-
-- hosts:
-  - mons
-  - osds
-  - mdss
-  - rgws
-  sudo: True
-  roles:
-  - ceph-common
-
-- hosts: mons
-  serial: 1
-  sudo: True
-  roles:
-  - ceph-mon
-  post_tasks:
-  - name: restart monitor(s)
-    service: >
-      name=ceph
-      state=restarted
-      args=mon
-
-- hosts: osds
-  serial: 1
-  sudo: True
-  roles:
-  - ceph-osd
-  post_tasks:
-  - name: restart object storage daemon(s)
-    command: service ceph-osd-all restart
-    when: ansible_distribution == "Ubuntu"
-  - name: restart object storage daemon(s)
-    service: name=ceph state=restarted args=osd
-    when: ansible_distribution == "Debian"
-
-- hosts: mdss
-  serial: 1
-  sudo: True
-  roles:
-  - ceph-mds
-  post_tasks:
-  - name: restart metadata server(s)
-    service: >
-      name=ceph
-      state=restarted
-      args=mds