From: Sébastien Han Date: Thu, 26 Mar 2015 17:51:11 +0000 (+0100) Subject: Merge branch 'master' of https://github.com/ceph/ceph-ansible into improve-rolling... X-Git-Tag: v1.0.0~233^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F237%2Fhead;p=ceph-ansible.git Merge branch 'master' of https://github.com/ceph/ceph-ansible into improve-rolling-upgrade --- 6aaea43d1cba1516f2c69489f6e7f080b3c6507a diff --cc cluster-maintenance.yml index 000000000,000000000..3b7d2a9c6 new file mode 100644 --- /dev/null +++ b/cluster-maintenance.yml @@@ -1,0 -1,0 +1,37 @@@ ++--- ++# This playbook was made to automate Ceph server maintenance ++# Typical use case: hardware change ++# By running this playbook you will set the 'noout' flag on your ++# cluster, which means that OSDs **can't** be marked as out ++# of the CRUSH map, but they will be marked as down. ++# Basically we tell the cluster not to move any data since ++# the operation won't last for too long. ++ ++- hosts: ++ gather_facts: False ++ ++ tasks: ++ ++ - name: Set the noout flag ++ command: ceph osd set noout ++ delegate_to: ++ ++ - name: Turn off the server ++ command: poweroff ++ ++ - name: Wait for the server to go down ++ local_action: > ++ wait_for host= ++ port=22 ++ state=stopped ++ ++ - name: Wait for the server to come up ++ local_action: > ++ wait_for host= diff --cc cluster-os-migration.yml index 000000000,000000000..278c0aa40 new file mode 100644 --- /dev/null +++ b/cluster-os-migration.yml @@@ -1,0 -1,0 +1,432 @@@ ++--- ++# This playbook was meant to upgrade a node from Ubuntu to RHEL. ++# We are performing a set of actions prior to rebooting the node. ++# The node reboots via PXE and gets its new operating system. ++# This playbook only works for monitors and OSDs. ++# Note that some of the checks are ugly: ++# i.e. the when migration_completed.stat.exists ++# can be improved with includes, however I wanted to keep a single file... 
#

# Play 1: migrate the monitors one node at a time (serial: 1).
# Every mutating task is guarded by the 'migration_completed' marker file
# that the last task of the play creates, so the play can be re-run.
- hosts: mons
  serial: 1
  sudo: True

  vars:
    backup_dir: /tmp/

  tasks:

    - name: Check if the node has be migrated already
      stat: >
        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/migration_completed
      register: migration_completed
      ignore_errors: True

    - name: Check for failed run
      stat: >
        path=/var/lib/ceph/{{ ansible_hostname }}.tar
      register: mon_archive_leftover

    - fail: msg="Looks like an archive is already there, please remove it!"
      when: migration_completed.stat.exists == False and mon_archive_leftover.stat.exists == True

    - name: Compress the store as much as possible
      command: ceph tell mon.{{ ansible_hostname }} compact
      when: migration_completed.stat.exists == False

    - name: Check if sysvinit
      stat: >
        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/sysvinit
      register: monsysvinit
      changed_when: False

    - name: Check if upstart
      stat: >
        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/upstart
      register: monupstart
      changed_when: False

    # The two probes below run on every node regardless of its init system,
    # so a non-zero rc is expected on some nodes. ignore_errors lets the
    # play reach the 'fail' tasks, which are the ones that decide whether
    # the rc actually matters for this node.
    - name: Check if init does what it is supposed to do (Sysvinit)
      shell: >
        ps faux|grep -sq [c]eph-mon && service ceph status mon >> /dev/null
      register: ceph_status_sysvinit
      ignore_errors: True
      changed_when: False

    # can't complete the condition since the previous tasks never ran...
    - fail: msg="Something is terribly wrong here, sysvinit is configured, the service is started BUT the init script does not return 0, GO FIX YOUR SETUP!"
      when: ceph_status_sysvinit.rc != 0 and migration_completed.stat.exists == False and monsysvinit.stat.exists == True

    - name: Check if init does what it is supposed to do (upstart)
      shell: >
        ps faux|grep -sq [c]eph-mon && status ceph-mon-all >> /dev/null
      register: ceph_status_upstart
      ignore_errors: True
      changed_when: False

    - fail: msg="Something is terribly wrong here, upstart is configured, the service is started BUT the init script does not return 0, GO FIX YOUR SETUP!"
      when: ceph_status_upstart.rc != 0 and migration_completed.stat.exists == False and monupstart.stat.exists == True

    - name: Restart the Monitor after compaction (Upstart)
      service: >
        name=ceph-mon
        state=restarted
        args=id={{ ansible_hostname }}
      when: monupstart.stat.exists == True and migration_completed.stat.exists == False

    - name: Restart the Monitor after compaction (Sysvinit)
      service: >
        name=ceph
        state=restarted
        args=mon
      when: monsysvinit.stat.exists == True and migration_completed.stat.exists == False

    - name: Wait for the monitor to be up again
      local_action: >
        wait_for
        host={{ ansible_ssh_host | default(inventory_hostname) }}
        port=6789
        timeout=10
      when: migration_completed.stat.exists == False

    - name: Stop the monitor (Upstart)
      service: >
        name=ceph-mon
        state=stopped
        args=id={{ ansible_hostname }}
      when: monupstart.stat.exists == True and migration_completed.stat.exists == False

    - name: Stop the monitor (Sysvinit)
      service: >
        name=ceph
        state=stopped
        args=mon
      when: monsysvinit.stat.exists == True and migration_completed.stat.exists == False

    - name: Wait for the monitor to be down
      local_action: >
        wait_for
        host={{ ansible_ssh_host | default(inventory_hostname) }}
        port=6789
        timeout=10
        state=stopped
      when: migration_completed.stat.exists == False

    # Directories need the search (x) bit, hence 0755 rather than 0644.
    # NOTE(review): 'fetch' below writes to the control machine, while this
    # directory is created on groups.backup[0] - confirm both are the same host.
    - name: Create a backup directory
      file: >
        path={{ backup_dir }}/monitors-backups
        state=directory
        owner=root
        group=root
        mode=0755
      delegate_to: "{{ item }}"
      with_items: groups.backup[0]
      when: migration_completed.stat.exists == False

    # NOTE (leseb): should we convert upstart to sysvinit here already?
    - name: Archive monitor stores
      shell: >
        tar -cpvzf - --one-file-system . /etc/ceph/* | cat > {{ ansible_hostname }}.tar
        chdir=/var/lib/ceph/
        creates={{ ansible_hostname }}.tar
      when: migration_completed.stat.exists == False

    - name: Scp the Monitor store
      fetch: >
        src=/var/lib/ceph/{{ ansible_hostname }}.tar
        dest={{ backup_dir }}/monitors-backups/{{ ansible_hostname }}.tar
        flat=yes
      when: migration_completed.stat.exists == False

    - name: Reboot the server
      command: reboot
      when: migration_completed.stat.exists == False

    # wait_for defaults to host=127.0.0.1; since this is a local_action the
    # target node has to be named explicitly, otherwise the control machine's
    # own SSH port is polled and the wait returns right after 'delay'.
    - name: Wait for the server to come up
      local_action: >
        wait_for
        host={{ ansible_ssh_host | default(inventory_hostname) }}
        port=22
        delay=10
        timeout=3600
      when: migration_completed.stat.exists == False

    - name: Wait a bit more to be sure that the server is ready
      pause: seconds=20
      when: migration_completed.stat.exists == False

    # Probe the init markers again: the node has just been reinstalled.
    - name: Check if sysvinit
      stat: >
        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/sysvinit
      register: monsysvinit
      changed_when: False

    - name: Check if upstart
      stat: >
        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/upstart
      register: monupstart
      changed_when: False

    - name: Make sure the monitor is stopped (Upstart)
      service: >
        name=ceph-mon
        state=stopped
        args=id={{ ansible_hostname }}
      when: monupstart.stat.exists == True and migration_completed.stat.exists == False

    - name: Make sure the monitor is stopped (Sysvinit)
      service: >
        name=ceph
        state=stopped
        args=mon
      when: monsysvinit.stat.exists == True and migration_completed.stat.exists == False

    # NOTE (leseb): 'creates' was added in Ansible 1.6
    - name: Copy and unarchive the monitor store
      unarchive: >
        src={{ backup_dir }}/monitors-backups/{{ ansible_hostname }}.tar
        dest=/var/lib/ceph/
        copy=yes
        mode=0600
        creates=etc/ceph/ceph.conf
      when: migration_completed.stat.exists == False

    - name: Copy keys and configs
      shell: >
        cp etc/ceph/* /etc/ceph/
        chdir=/var/lib/ceph/
      when: migration_completed.stat.exists == False

    # Flip every monitor data directory from the upstart marker to sysvinit.
    - name: Configure RHEL7 for sysvinit
      shell: find -L /var/lib/ceph/mon/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[A-Za-z0-9]+-[A-Za-z0-9._-]+' -exec touch {}/sysvinit \; -exec rm {}/upstart \;
      when: migration_completed.stat.exists == False

    # NOTE (leseb): at this point the upstart and sysvinit checks are not necessary
    # so we directly call sysvinit
    - name: Start the monitor
      service: >
        name=ceph
        state=started
        args=mon
      when: migration_completed.stat.exists == False

    - name: Wait for the Monitor to be up again
      local_action: >
        wait_for
        host={{ ansible_ssh_host | default(inventory_hostname) }}
        port=6789
        timeout=10
      when: migration_completed.stat.exists == False

    - name: Waiting for the monitor to join the quorum...
      shell: >
        ceph -s | grep monmap | sed 's/.*quorum//' | egrep -q {{ ansible_hostname }}
      register: result
      until: result.rc == 0
      retries: 5
      delay: 10
      delegate_to: "{{ item }}"
      with_items: groups.backup[0]
      when: migration_completed.stat.exists == False

    - name: Done moving to the next monitor
      file: >
        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/migration_completed
        state=touch
        owner=root
        group=root
        mode=0600
      when: migration_completed.stat.exists == False

# Play 2: migrate the OSD nodes one node at a time (serial: 1), with the
# 'noout' flag set around each node's migration.
- hosts: osds
  serial: 1
  sudo: True

  vars:
    backup_dir: /tmp/

  tasks:
    - name: Check if the node has be migrated already
      stat: >
        path=/var/lib/ceph/migration_completed
      register: migration_completed
      ignore_errors: True

    - name: Check for failed run
      stat: >
        path=/var/lib/ceph/{{ ansible_hostname }}.tar
      register: osd_archive_leftover

    - fail: msg="Looks like an archive is already there, please remove it!"
      when: migration_completed.stat.exists == False and osd_archive_leftover.stat.exists == True

    # The init-system probes run first so that the sanity checks below can
    # use this play's own results (osdsysvinit / osdupstart); the monitor
    # variables are never registered in this play.
    - name: Check if sysvinit
      shell: stat /var/lib/ceph/osd/ceph-*/sysvinit
      register: osdsysvinit
      ignore_errors: True
      changed_when: False

    - name: Check if upstart
      shell: stat /var/lib/ceph/osd/ceph-*/upstart
      register: osdupstart
      ignore_errors: True
      changed_when: False

    # Both probes run on every node, so a non-zero rc is tolerated here and
    # judged by the 'fail' task that follows each of them.
    - name: Check if init does what it is supposed to do (Sysvinit)
      shell: >
        ps faux|grep -sq [c]eph-osd && service ceph status osd >> /dev/null
      register: ceph_status_sysvinit
      ignore_errors: True
      changed_when: False

    # can't complete the condition since the previous tasks never ran...
    - fail: msg="Something is terribly wrong here, sysvinit is configured, the services are started BUT the init script does not return 0, GO FIX YOUR SETUP!"
      when: ceph_status_sysvinit.rc != 0 and migration_completed.stat.exists == False and osdsysvinit.rc == 0

    - name: Check if init does what it is supposed to do (upstart)
      shell: >
        ps faux|grep -sq [c]eph-osd && initctl list|egrep -sq "ceph-osd \(ceph/.\) start/running, process [0-9][0-9][0-9][0-9]"
      register: ceph_status_upstart
      ignore_errors: True
      changed_when: False

    - fail: msg="Something is terribly wrong here, upstart is configured, the services are started BUT the init script does not return 0, GO FIX YOUR SETUP!"
      when: ceph_status_upstart.rc != 0 and migration_completed.stat.exists == False and osdupstart.rc == 0

    - name: Set the noout flag
      command: ceph osd set noout
      delegate_to: "{{ item }}"
      with_items: groups.mons[0]
      when: migration_completed.stat.exists == False

    - name: Archive ceph configs
      shell: >
        tar -cpvzf - --one-file-system . /etc/ceph/ceph.conf | cat > {{ ansible_hostname }}.tar
        chdir=/var/lib/ceph/
        creates={{ ansible_hostname }}.tar
      when: migration_completed.stat.exists == False

    # Directories need the search (x) bit, hence 0755 rather than 0644.
    - name: Create backup directory
      file: >
        path={{ backup_dir }}/osds-backups
        state=directory
        owner=root
        group=root
        mode=0755
      delegate_to: "{{ item }}"
      with_items: groups.backup[0]
      when: migration_completed.stat.exists == False

    - name: Scp OSDs dirs and configs
      fetch: >
        src=/var/lib/ceph/{{ ansible_hostname }}.tar
        dest={{ backup_dir }}/osds-backups/
        flat=yes
      when: migration_completed.stat.exists == False

    # Remember which ports the OSDs listen on so we can wait for them to close.
    - name: Collect OSD ports
      shell: netstat -tlpn | awk -F ":" '/ceph-osd/ { sub (" .*", "", $2); print $2 }' | uniq
      register: osd_ports
      when: migration_completed.stat.exists == False

    - name: Gracefully stop the OSDs (Upstart)
      service: >
        name=ceph-osd-all
        state=stopped
      when: osdupstart.rc == 0 and migration_completed.stat.exists == False

    # args=osd: this play stops OSD daemons, not monitors.
    - name: Gracefully stop the OSDs (Sysvinit)
      service: >
        name=ceph
        state=stopped
        args=osd
      when: osdsysvinit.rc == 0 and migration_completed.stat.exists == False

    - name: Wait for the OSDs to be down
      local_action: >
        wait_for
        host={{ ansible_ssh_host | default(inventory_hostname) }}
        port={{ item }}
        timeout=10
        state=stopped
      with_items:
        - "{{ osd_ports.stdout_lines }}"
      when: migration_completed.stat.exists == False

    - name: Configure RHEL with sysvinit
      shell: find -L /var/lib/ceph/osd/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[A-Za-z0-9]+-[A-Za-z0-9._-]+' -exec touch {}/sysvinit \; -exec rm {}/upstart \;
      when: migration_completed.stat.exists == False

    - name: Reboot the server
      command: reboot
      when: migration_completed.stat.exists == False

    # Name the target host explicitly; wait_for would otherwise poll
    # 127.0.0.1 on the control machine (see the same task in the mons play).
    - name: Wait for the server to come up
      local_action: >
        wait_for
        host={{ ansible_ssh_host | default(inventory_hostname) }}
        port=22
        delay=10
        timeout=3600
      when: migration_completed.stat.exists == False

    - name: Wait a bit to be sure that the server is ready for scp
      pause: seconds=20
      when: migration_completed.stat.exists == False

    # NOTE (leseb): 'creates' was added in Ansible 1.6
    - name: Copy and unarchive the OSD configs
      unarchive: >
        src={{ backup_dir }}/osds-backups/{{ ansible_hostname }}.tar
        dest=/var/lib/ceph/
        copy=yes
        mode=0600
        creates=etc/ceph/ceph.conf
      when: migration_completed.stat.exists == False

    - name: Copy keys and configs
      shell: >
        cp etc/ceph/* /etc/ceph/
        chdir=/var/lib/ceph/
      when: migration_completed.stat.exists == False

    # NOTE (leseb): at this point the upstart and sysvinit checks are not necessary
    # so we directly call sysvinit ('ceph' init script with the 'osd' argument;
    # 'ceph-osd-all' is the upstart job and no longer applies on this node)
    - name: Start all the OSDs
      service: >
        name=ceph
        state=started
        args=osd
      when: migration_completed.stat.exists == False

    # NOTE (leseb): this is tricky unless this is set into the ceph.conf
    # listened ports can't be predicted, thus they will change after each restart
#    - name: Wait for the OSDs to be up again
#      local_action: >
#        wait_for
#        host={{ ansible_ssh_host | default(inventory_hostname) }}
#        port={{ item }}
#        timeout=30
#      with_items:
#        - "{{ osd_ports.stdout_lines }}"

    # Succeeds once the number of active+clean PGs equals the total number
    # of PGs and the cluster reports HEALTH_OK or HEALTH_WARN.
    - name: Waiting for clean PGs...
      shell: >
        test "$(ceph pg stat | sed 's/^.*pgs://' | sed 's/active+clean.*//' |sed 's/ //')" -eq "$(ceph pg stat | sed 's/pgs.*//' | sed 's/^.*://' | sed 's/ //')" && ceph health | egrep -q "HEALTH_OK|HEALTH_WARN"
      register: result
      until: result.rc == 0
      retries: 10
      delay: 10
      delegate_to: "{{ item }}"
      with_items: groups.backup[0]
      when: migration_completed.stat.exists == False

    - name: Done moving to the next OSD
      file: >
        path=/var/lib/ceph/migration_completed
        state=touch
        owner=root
        group=root
        mode=0600
      when: migration_completed.stat.exists == False

    - name: Unset the noout flag
      command: ceph osd unset noout
      delegate_to: "{{ item }}"
      with_items: groups.mons[0]
      when: migration_completed.stat.exists == False

# File: purge-cluster.yml (new file in this commit)
---
# This playbook purges Ceph
# It removes: packages, configuration files and ALL THE DATA

- hosts:
    - mons
    - osds

  vars:
    devices: [ '/dev/sdb', '/dev/sdc', '/dev/sdd', '/dev/sde', '/dev/sdf' ]
    partitions: [ '1', '2', '3' ]

  tasks:

    # ceph-deploy is run from the control machine against each host.
    - name: Purge Ceph
      command: ceph-deploy purge {{ ansible_fqdn }}
      delegate_to: 127.0.0.1

    - name: Remove OSD data
      shell: rm -rf /var/lib/ceph/osd/*/*
      ignore_errors: true

    - name: Purge remaining data
      command: ceph-deploy purgedata {{ ansible_fqdn }}
      delegate_to: 127.0.0.1

    # Removes every listed partition number from every listed device;
    # errors (missing device or partition) are ignored.
    - name: Purge partitions
      shell: parted -s {{ item[0] }} rm {{ item[1] }}
      with_nested:
        - devices
        - partitions
      ignore_errors: true

# File: rolling_update.yml (modified in this commit)
---
# This playbook does a rolling update for all the Ceph services
# Change the value of serial: to adjust the number of server to be updated.
#
# The four roles that apply to the ceph hosts will be applied: ceph-common,
# ceph-mon, ceph-osd and ceph-mds. So any changes to configuration, package updates, etc,
# will be applied as part of the rolling update process.
#

# /!\ DO NOT FORGET TO CHANGE THE RELEASE VERSION FIRST! /!\

# Monitors: compact the store, apply the roles, restart through whichever
# init system manages the daemon, then wait for the node to rejoin the quorum.
- hosts: mons
  serial: 1
  sudo: True

  pre_tasks:
    - name: Compress the store as much as possible
      command: ceph tell mon.{{ ansible_hostname }} compact

  roles:
    - ceph-common
    - ceph-mon

  post_tasks:
    - name: Check if sysvinit
      stat: >
        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/sysvinit
      register: monsysvinit

    - name: Check if upstart
      stat: >
        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/upstart
      register: monupstart

    - name: Restart the monitor after compaction (Upstart)
      service: >
        name=ceph-mon
        state=restarted
        args=id={{ ansible_hostname }}
      when: monupstart.stat.exists == True

    - name: Restart the monitor after compaction (Sysvinit)
      service: >
        name=ceph
        state=restarted
        args=mon
      when: monsysvinit.stat.exists == True

    # The former unconditional 'restart monitor(s)' task was dropped: the two
    # init-aware tasks above already restart the monitor exactly once.

    - name: Waiting for the monitor to join the quorum...
      shell: >
        ceph -s | grep monmap | sed 's/.*quorum//' | egrep -q {{ ansible_hostname }}
      register: result
      until: result.rc == 0
      retries: 5
      delay: 10
      delegate_to: 127.0.0.1


# OSDs: set 'noout', apply the roles, restart the daemons, wait for all PGs
# to be active+clean, then unset 'noout'.
- hosts: osds
  serial: 1
  sudo: True

  pre_tasks:
    - name: Set the noout flag
      command: ceph osd set noout
      delegate_to: "{{ item }}"
      with_items: groups.mons[0]

  roles:
    - ceph-common
    - ceph-osd

  post_tasks:
    - name: Check if sysvinit
      shell: stat /var/lib/ceph/osd/ceph-*/sysvinit
      register: osdsysvinit
      ignore_errors: True

    - name: Check if upstart
      shell: stat /var/lib/ceph/osd/ceph-*/upstart
      register: osdupstart
      ignore_errors: True

    # Despite their names, the next two tasks restart the OSDs (state=restarted).
    - name: Gracefully stop the OSDs (Upstart)
      service: >
        name=ceph-osd-all
        state=restarted
      when: osdupstart.rc == 0

    # args=osd: restart the OSD daemons, not the monitors.
    - name: Gracefully stop the OSDs (Sysvinit)
      service: >
        name=ceph
        state=restarted
        args=osd
      when: osdsysvinit.rc == 0

    # Succeeds once the number of active+clean PGs equals the total number
    # of PGs and the cluster reports HEALTH_OK or HEALTH_WARN.
    - name: Waiting for clean PGs...
      shell: >
        test "$(ceph pg stat | sed 's/^.*pgs://' | sed 's/active+clean.*//' |sed 's/ //')" -eq "$(ceph pg stat | sed 's/pgs.*//' | sed 's/^.*://' | sed 's/ //')" && ceph health | egrep -q "HEALTH_OK|HEALTH_WARN"
      register: result
      until: result.rc == 0
      retries: 10
      delay: 10
      delegate_to: 127.0.0.1

    - name: Unset the noout flag
      command: ceph osd unset noout
      delegate_to: "{{ item }}"
      with_items: groups.mons[0]


# Metadata servers: apply the roles, then restart through the init system
# whose marker file is present in the MDS data directory.
- hosts: mdss
  serial: 1
  sudo: True

  roles:
    - ceph-common
    - ceph-mds

  post_tasks:
    # The markers are looked up under mds/, not mon/.
    # NOTE(review): assumes the MDS id is ansible_hostname - confirm against
    # the ceph-mds role.
    - name: Check if sysvinit
      stat: >
        path=/var/lib/ceph/mds/ceph-{{ ansible_hostname }}/sysvinit
      register: mdssysvinit

    - name: Check if upstart
      stat: >
        path=/var/lib/ceph/mds/ceph-{{ ansible_hostname }}/upstart
      register: mdsupstart

    - name: Restart the metadata server (Upstart)
      service: >
        name=ceph-mds
        state=restarted
        args=id={{ ansible_hostname }}
      when: mdsupstart.stat.exists == True

    - name: Restart the metadata server (Sysvinit)
      service: >
        name=ceph
        state=restarted
        args=mds
      when: mdssysvinit.stat.exists == True


# RADOS gateways: apply the roles, then restart the services that match the
# configured frontend.
- hosts: rgws
  serial: 1
  sudo: True

  roles:
    - ceph-common
    - ceph-radosgw

  post_tasks:
    - name: restart rados gateway server(s)
      service: >
        name={{ item }}
        state=restarted
      with_items:
        - radosgw
      when: radosgw_frontend == 'civetweb'

    - name: restart rados gateway server(s)
      service: >
        name={{ item }}
        state=restarted
      with_items:
        - apache2
        - radosgw
      when: radosgw_frontend == 'apache'