From: Sébastien Han Date: Wed, 30 Aug 2017 21:30:49 +0000 (+0200) Subject: infra playbook: move untested scenario to a new dir X-Git-Tag: v3.0.0rc6~3^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=54d7a81241eac26d87e2bee513df5efd8866a586;p=ceph-ansible.git infra playbook: move untested scenario to a new dir Move untested/with few confidence playbooks in a untested-by-ci directory. Also removing this directory from the package build. Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1461551 Signed-off-by: Sébastien Han --- diff --git a/ceph-ansible.spec.in b/ceph-ansible.spec.in index d2be8ef0d..53c694012 100644 --- a/ceph-ansible.spec.in +++ b/ceph-ansible.spec.in @@ -42,6 +42,7 @@ done pushd %{buildroot}%{_datarootdir}/ceph-ansible rm -r roles/ceph-common-coreos rm group_vars/common-coreoss.yml.sample + rm -r infrastructure-playbooks/untested-by-ci popd # Strip iscsi files. diff --git a/infrastructure-playbooks/cluster-maintenance.yml b/infrastructure-playbooks/cluster-maintenance.yml deleted file mode 100644 index c559ed62f..000000000 --- a/infrastructure-playbooks/cluster-maintenance.yml +++ /dev/null @@ -1,37 +0,0 @@ ---- -# This playbook was made to automate Ceph servers maintenance -# Typical use case: hardware change -# By running this playbook you will set the 'noout' flag on your -# cluster, which means that OSD **can't** be marked as out -# of the CRUSH map, but they will be marked as down. -# Basically we tell the cluster to don't move any data since -# the operation won't last for too long. - -- hosts: - gather_facts: False - - tasks: - - - name: Set the noout flag - command: ceph osd set noout - delegate_to: - - - name: Turn off the server - command: poweroff - - - name: Wait for the server to go down - local_action: > - wait_for host= - port=22 - state=stopped - - - name: Wait for the server to come up - local_action: > - wait_for host= - port=22 - delay=10 - timeout=3600 - - - name: Unset the noout flag - command: ceph osd unset noout - delegate_to: diff --git a/infrastructure-playbooks/cluster-os-migration.yml b/infrastructure-playbooks/cluster-os-migration.yml deleted file mode 100644 index 843056f64..000000000 --- a/infrastructure-playbooks/cluster-os-migration.yml +++ /dev/null @@ -1,555 +0,0 @@ ---- -# This playbook was meant to upgrade a node from Ubuntu to RHEL. -# We are performing a set of actions prior to reboot the node. -# The node reboots via PXE and gets its new operating system. -# This playbook only works for monitors and OSDs. -# Note that some of the checks are ugly: -# ie: the when migration_completed.stat.exists -# can be improved with includes, however I wanted to keep a single file... -# - -- hosts: mons - serial: 1 - sudo: True - - vars: - backup_dir: /tmp/ - - tasks: - - - name: Check if the node has be migrated already - stat: > - path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/migration_completed - register: migration_completed - failed_when: false - - - name: Check for failed run - stat: > - path=/var/lib/ceph/{{ ansible_hostname }}.tar - register: mon_archive_leftover - - - fail: msg="Looks like an archive is already there, please remove it!" 
- when: migration_completed.stat.exists == False and mon_archive_leftover.stat.exists == True - - - name: Compress the store as much as possible - command: ceph tell mon.{{ ansible_hostname }} compact - when: migration_completed.stat.exists == False - - - name: Check if sysvinit - stat: > - path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/sysvinit - register: monsysvinit - changed_when: False - - - name: Check if upstart - stat: > - path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/upstart - register: monupstart - changed_when: False - - - name: Check if init does what it is supposed to do (Sysvinit) - shell: > - ps faux|grep -sq [c]eph-mon && service ceph status mon >> /dev/null - register: ceph_status_sysvinit - changed_when: False - - # can't complete the condition since the previous taks never ran... - - fail: msg="Something is terribly wrong here, sysvinit is configured, the service is started BUT the init script does not return 0, GO FIX YOUR SETUP!" - when: ceph_status_sysvinit.rc != 0 and migration_completed.stat.exists == False and monsysvinit.stat.exists == True - - - name: Check if init does what it is supposed to do (upstart) - shell: > - ps faux|grep -sq [c]eph-mon && status ceph-mon-all >> /dev/null - register: ceph_status_upstart - changed_when: False - - - fail: msg="Something is terribly wrong here, upstart is configured, the service is started BUT the init script does not return 0, GO FIX YOUR SETUP!" - when: ceph_status_upstart.rc != 0 and migration_completed.stat.exists == False and monupstart.stat.exists == True - - - name: Restart the Monitor after compaction (Upstart) - service: > - name=ceph-mon - state=restarted - args=id={{ ansible_hostname }} - when: monupstart.stat.exists == True and migration_completed.stat.exists == False - - - name: Restart the Monitor after compaction (Sysvinit) - service: > - name=ceph - state=restarted - args=mon - when: monsysvinit.stat.exists == True and migration_completed.stat.exists == False - - - name: Wait for the monitor to be up again - local_action: > - wait_for - host={{ ansible_ssh_host | default(inventory_hostname) }} - port=6789 - timeout=10 - when: migration_completed.stat.exists == False - - - name: Stop the monitor (Upstart) - service: > - name=ceph-mon - state=stopped - args=id={{ ansible_hostname }} - when: monupstart.stat.exists == True and migration_completed.stat.exists == False - - - name: Stop the monitor (Sysvinit) - service: > - name=ceph - state=stopped - args=mon - when: monsysvinit.stat.exists == True and migration_completed.stat.exists == False - - - name: Wait for the monitor to be down - local_action: > - wait_for - host={{ ansible_ssh_host | default(inventory_hostname) }} - port=6789 - timeout=10 - state=stopped - when: migration_completed.stat.exists == False - - - name: Create a backup directory - file: > - path={{ backup_dir }}/monitors-backups - state=directory - owner=root - group=root - mode=0644 - delegate_to: "{{ item }}" - with_items: "{{ groups.backup[0] }}" - when: migration_completed.stat.exists == False - - # NOTE (leseb): should we convert upstart to sysvinit here already? - - name: Archive monitor stores - shell: > - tar -cpvzf - --one-file-system . 
/etc/ceph/* | cat > {{ ansible_hostname }}.tar - chdir=/var/lib/ceph/ - creates={{ ansible_hostname }}.tar - when: migration_completed.stat.exists == False - - - name: Scp the Monitor store - fetch: > - src=/var/lib/ceph/{{ ansible_hostname }}.tar - dest={{ backup_dir }}/monitors-backups/{{ ansible_hostname }}.tar - flat=yes - when: migration_completed.stat.exists == False - - - name: Reboot the server - command: reboot - when: migration_completed.stat.exists == False - - - name: Wait for the server to come up - local_action: > - wait_for - port=22 - delay=10 - timeout=3600 - when: migration_completed.stat.exists == False - - - name: Wait a bit more to be sure that the server is ready - pause: seconds=20 - when: migration_completed.stat.exists == False - - - name: Check if sysvinit - stat: > - path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/sysvinit - register: monsysvinit - changed_when: False - - - name: Check if upstart - stat: > - path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/upstart - register: monupstart - changed_when: False - - - name: Make sure the monitor is stopped (Upstart) - service: > - name=ceph-mon - state=stopped - args=id={{ ansible_hostname }} - when: monupstart.stat.exists == True and migration_completed.stat.exists == False - - - name: Make sure the monitor is stopped (Sysvinit) - service: > - name=ceph - state=stopped - args=mon - when: monsysvinit.stat.exists == True and migration_completed.stat.exists == False - - # NOTE (leseb): 'creates' was added in Ansible 1.6 - - name: Copy and unarchive the monitor store - unarchive: > - src={{ backup_dir }}/monitors-backups/{{ ansible_hostname }}.tar - dest=/var/lib/ceph/ - copy=yes - mode=0600 - creates=etc/ceph/ceph.conf - when: migration_completed.stat.exists == False - - - name: Copy keys and configs - shell: > - cp etc/ceph/* /etc/ceph/ - chdir=/var/lib/ceph/ - when: migration_completed.stat.exists == False - - - name: Configure RHEL7 for sysvinit - shell: find -L /var/lib/ceph/mon/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[A-Za-z0-9]+-[A-Za-z0-9._-]+' -exec touch {}/sysvinit \; -exec rm {}/upstart \; - when: migration_completed.stat.exists == False - - # NOTE (leseb): at this point the upstart and sysvinit checks are not necessary - # so we directly call sysvinit - - name: Start the monitor - service: > - name=ceph - state=started - args=mon - when: migration_completed.stat.exists == False - - - name: Wait for the Monitor to be up again - local_action: > - wait_for - host={{ ansible_ssh_host | default(inventory_hostname) }} - port=6789 - timeout=10 - when: migration_completed.stat.exists == False - - - name: Waiting for the monitor to join the quorum... 
- shell: > - ceph -s | grep monmap | sed 's/.*quorum//' | egrep -q {{ ansible_hostname }} - register: result - until: result.rc == 0 - retries: 5 - delay: 10 - delegate_to: "{{ item }}" - with_items: "{{ groups.backup[0] }}" - when: migration_completed.stat.exists == False - - - name: Done moving to the next monitor - file: > - path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/migration_completed - state=touch - owner=root - group=root - mode=0600 - when: migration_completed.stat.exists == False - -- hosts: osds - serial: 1 - sudo: True - - vars: - backup_dir: /tmp/ - - tasks: - - name: Check if the node has be migrated already - stat: > - path=/var/lib/ceph/migration_completed - register: migration_completed - failed_when: false - - - name: Check for failed run - stat: > - path=/var/lib/ceph/{{ ansible_hostname }}.tar - register: osd_archive_leftover - - - fail: msg="Looks like an archive is already there, please remove it!" - when: migration_completed.stat.exists == False and osd_archive_leftover.stat.exists == True - - - name: Check if init does what it is supposed to do (Sysvinit) - shell: > - ps faux|grep -sq [c]eph-osd && service ceph status osd >> /dev/null - register: ceph_status_sysvinit - changed_when: False - - # can't complete the condition since the previous taks never ran... - - fail: msg="Something is terribly wrong here, sysvinit is configured, the services are started BUT the init script does not return 0, GO FIX YOUR SETUP!" - when: ceph_status_sysvinit.rc != 0 and migration_completed.stat.exists == False and monsysvinit.stat.exists == True - - - name: Check if init does what it is supposed to do (upstart) - shell: > - ps faux|grep -sq [c]eph-osd && initctl list|egrep -sq "ceph-osd \(ceph/.\) start/running, process [0-9][0-9][0-9][0-9]" - register: ceph_status_upstart - changed_when: False - - - fail: msg="Something is terribly wrong here, upstart is configured, the services are started BUT the init script does not return 0, GO FIX YOUR SETUP!" - when: ceph_status_upstart.rc != 0 and migration_completed.stat.exists == False and monupstart.stat.exists == True - - - name: Set the noout flag - command: ceph osd set noout - delegate_to: "{{ item }}" - with_items: "{{ groups[mon_group_name][0] }}" - when: migration_completed.stat.exists == False - - - name: Check if sysvinit - shell: stat /var/lib/ceph/osd/ceph-*/sysvinit - register: osdsysvinit - failed_when: false - changed_when: False - - - name: Check if upstart - shell: stat /var/lib/ceph/osd/ceph-*/upstart - register: osdupstart - failed_when: false - changed_when: False - - - name: Archive ceph configs - shell: > - tar -cpvzf - --one-file-system . 
/etc/ceph/ceph.conf | cat > {{ ansible_hostname }}.tar - chdir=/var/lib/ceph/ - creates={{ ansible_hostname }}.tar - when: migration_completed.stat.exists == False - - - name: Create backup directory - file: > - path={{ backup_dir }}/osds-backups - state=directory - owner=root - group=root - mode=0644 - delegate_to: "{{ item }}" - with_items: "{{ groups.backup[0] }}" - when: migration_completed.stat.exists == False - - - name: Scp OSDs dirs and configs - fetch: > - src=/var/lib/ceph/{{ ansible_hostname }}.tar - dest={{ backup_dir }}/osds-backups/ - flat=yes - when: migration_completed.stat.exists == False - - - name: Collect OSD ports - shell: netstat -tlpn | awk -F ":" '/ceph-osd/ { sub (" .*", "", $2); print $2 }' | uniq - register: osd_ports - when: migration_completed.stat.exists == False - - - name: Gracefully stop the OSDs (Upstart) - service: > - name=ceph-osd-all - state=stopped - when: osdupstart.rc == 0 and migration_completed.stat.exists == False - - - name: Gracefully stop the OSDs (Sysvinit) - service: > - name=ceph - state=stopped - args=mon - when: osdsysvinit.rc == 0 and migration_completed.stat.exists == False - - - name: Wait for the OSDs to be down - local_action: > - wait_for - host={{ ansible_ssh_host | default(inventory_hostname) }} - port={{ item }} - timeout=10 - state=stopped - with_items: "{{ osd_ports.stdout_lines }}" - when: migration_completed.stat.exists == False - - - name: Configure RHEL with sysvinit - shell: find -L /var/lib/ceph/osd/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[A-Za-z0-9]+-[A-Za-z0-9._-]+' -exec touch {}/sysvinit \; -exec rm {}/upstart \; - when: migration_completed.stat.exists == False - - - name: Reboot the server - command: reboot - when: migration_completed.stat.exists == False - - - name: Wait for the server to come up - local_action: > - wait_for - port=22 - delay=10 - timeout=3600 - when: migration_completed.stat.exists == False - - - name: Wait a bit to be sure that the server is ready for scp - pause: seconds=20 - when: migration_completed.stat.exists == False - - # NOTE (leseb): 'creates' was added in Ansible 1.6 - - name: Copy and unarchive the OSD configs - unarchive: > - src={{ backup_dir }}/osds-backups/{{ ansible_hostname }}.tar - dest=/var/lib/ceph/ - copy=yes - mode=0600 - creates=etc/ceph/ceph.conf - when: migration_completed.stat.exists == False - - - name: Copy keys and configs - shell: > - cp etc/ceph/* /etc/ceph/ - chdir=/var/lib/ceph/ - when: migration_completed.stat.exists == False - - # NOTE (leseb): at this point the upstart and sysvinit checks are not necessary - # so we directly call sysvinit - - name: Start all the OSDs - service: > - name=ceph-osd-all - state=started - args=osd - when: migration_completed.stat.exists == False - - # NOTE (leseb): this is tricky unless this is set into the ceph.conf - # listened ports can be predicted, thus they will change after each restart -# - name: Wait for the OSDs to be up again -# local_action: > -# wait_for -# host={{ ansible_ssh_host | default(inventory_hostname) }} -# port={{ item }} -# timeout=30 -# with_items: -# - "{{ osd_ports.stdout_lines }}" - - - name: Waiting for clean PGs... 
- shell: > - test "[""$(ceph -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')""]" = "$(ceph -s -f json | python -c 'import sys, json; print [ i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if i["state_name"] == "active+clean"]')" - register: result - until: result.rc == 0 - retries: 10 - delay: 10 - delegate_to: "{{ item }}" - with_items: "{{ groups.backup[0] }}" - when: migration_completed.stat.exists == False - - - name: Done moving to the next OSD - file: > - path=/var/lib/ceph/migration_completed - state=touch - owner=root - group=root - mode=0600 - when: migration_completed.stat.exists == False - - - name: Unset the noout flag - command: ceph osd unset noout - delegate_to: "{{ item }}" - with_items: "{{ groups[mon_group_name][0] }}" - when: migration_completed.stat.exists == False - -- hosts: rgws - serial: 1 - sudo: True - - vars: - backup_dir: /tmp/ - - tasks: - - name: Check if the node has be migrated already - stat: > - path=/var/lib/ceph/radosgw/migration_completed - register: migration_completed - failed_when: false - - - name: Check for failed run - stat: > - path=/var/lib/ceph/{{ ansible_hostname }}.tar - register: rgw_archive_leftover - - - fail: msg="Looks like an archive is already there, please remove it!" - when: migration_completed.stat.exists == False and rgw_archive_leftover.stat.exists == True - - - name: Archive rados gateway configs - shell: > - tar -cpvzf - --one-file-system . /etc/ceph/* | cat > {{ ansible_hostname }}.tar - chdir=/var/lib/ceph/ - creates={{ ansible_hostname }}.tar - when: migration_completed.stat.exists == False - - - name: Create backup directory - file: > - path={{ backup_dir }}/rgws-backups - state=directory - owner=root - group=root - mode=0644 - delegate_to: "{{ item }}" - with_items: "{{ groups.backup[0] }}" - when: migration_completed.stat.exists == False - - - name: Scp RGWs dirs and configs - fetch: > - src=/var/lib/ceph/{{ ansible_hostname }}.tar - dest={{ backup_dir }}/rgws-backups/ - flat=yes - when: migration_completed.stat.exists == False - - - name: Gracefully stop the rados gateway - service: > - name={{ item }} - state=stopped - with_items: - - radosgw - when: migration_completed.stat.exists == False - - - name: Wait for radosgw to be down - local_action: > - wait_for - host={{ ansible_ssh_host | default(inventory_hostname) }} - path=/tmp/radosgw.sock - state=absent - timeout=30 - when: migration_completed.stat.exists == False - - - name: Reboot the server - command: reboot - when: migration_completed.stat.exists == False - - - name: Wait for the server to come up - local_action: > - wait_for - port=22 - delay=10 - timeout=3600 - when: migration_completed.stat.exists == False - - - name: Wait a bit to be sure that the server is ready for scp - pause: seconds=20 - when: migration_completed.stat.exists == False - - # NOTE (leseb): 'creates' was added in Ansible 1.6 - - name: Copy and unarchive the OSD configs - unarchive: > - src={{ backup_dir }}/rgws-backups/{{ ansible_hostname }}.tar - dest=/var/lib/ceph/ - copy=yes - mode=0600 - creates=etc/ceph/ceph.conf - when: migration_completed.stat.exists == False - - - name: Copy keys and configs - shell: > - {{ item }} - chdir=/var/lib/ceph/ - with_items: - - cp etc/ceph/* /etc/ceph/ - when: migration_completed.stat.exists == False - - - name: Start rados gateway - service: > - name={{ item }} - state=started - with_items: - - radosgw - when: migration_completed.stat.exists == False - - - name: Wait for radosgw to be up again 
- local_action: > - wait_for - host={{ ansible_ssh_host | default(inventory_hostname) }} - path=/tmp/radosgw.sock - state=present - timeout=30 - when: migration_completed.stat.exists == False - - - name: Done moving to the next rados gateway - file: > - path=/var/lib/ceph/radosgw/migration_completed - state=touch - owner=root - group=root - mode=0600 - when: migration_completed.stat.exists == False diff --git a/infrastructure-playbooks/make-osd-partitions.yml b/infrastructure-playbooks/make-osd-partitions.yml deleted file mode 100644 index 0fc6892d2..000000000 --- a/infrastructure-playbooks/make-osd-partitions.yml +++ /dev/null @@ -1,99 +0,0 @@ ---- -# This playbook will make custom partition layout for your osd hosts. -# You should define `devices` variable for every host. -# -# For example, in host_vars/hostname1 -# -# devices: -# - device_name: sdb -# partitions: -# - index: 1 -# size: 10G -# type: data -# - index: 2 -# size: 5G -# type: journal -# - device_name: sdc -# partitions: -# - index: 1 -# size: 10G -# type: data -# - index: 2 -# size: 5G -# type: journal -# -- vars: - osd_group_name: osds - journal_typecode: 45b0969e-9b03-4f30-b4c6-b4b80ceff106 - data_typecode: 4fbd7e29-9d25-41b8-afd0-062c0ceff05d - devices: [] - hosts: - - "{{ osd_group_name }}" - - tasks: - - - name: load a variable file for devices partition - include_vars: "{{ item }}" - with_first_found: - - files: - - "host_vars/{{ ansible_hostname }}.yml" - - "host_vars/default.yml" - skip: true - - - name: exit playbook, if devices not defined - fail: - msg: "devices must be define in host_vars/default.yml or host_vars/{{ ansible_hostname }}.yml" - when: devices is not defined - - - name: install sgdisk(gdisk) - package: - name: gdisk - state: present - - - name: erase all previous partitions(dangerous!!!) - shell: sgdisk --zap-all -- /dev/{{item.device_name}} - with_items: "{{ devices }}" - - - name: make osd partitions - shell: > - sgdisk --new={{item.1.index}}:0:+{{item.1.size}} "--change-name={{item.1.index}}:ceph {{item.1.type}}" - "--typecode={{item.1.index}}:{% if item.1.type=='data' %}{{data_typecode}}{% else %}{{journal_typecode}}{% endif %}" - --mbrtogpt -- /dev/{{item.0.device_name}} - with_subelements: - - "{{ devices }}" - - partitions - - - set_fact: - owner: 167 - group: 167 - when: - - ansible_os_family == "RedHat" - - - set_fact: - owner: 64045 - group: 64045 - when: - - ansible_os_family == "Debian" - - - name: change partitions ownership - file: - path: "/dev/{{item.0.device_name}}{{item.1.index}}" - owner: "{{ owner | default('root')}}" - group: "{{ group | default('disk')}}" - with_subelements: - - "{{ devices }}" - - partitions - when: - item.0.device_name | match('/dev/([hsv]d[a-z]{1,2}){1,2}$') - - - name: change partitions ownership - file: - path: "/dev/{{item.0.device_name}}p{{item.1.index}}" - owner: "{{ owner | default('root')}}" - group: "{{ group | default('disk')}}" - with_subelements: - - "{{ devices }}" - - partitions - when: - item.0.device_name | match('/dev/(cciss/c[0-9]d[0-9]|nvme[0-9]n[0-9]){1,2}$') -... \ No newline at end of file diff --git a/infrastructure-playbooks/migrate-journal-to-ssd.yml b/infrastructure-playbooks/migrate-journal-to-ssd.yml deleted file mode 100644 index 44a75e01b..000000000 --- a/infrastructure-playbooks/migrate-journal-to-ssd.yml +++ /dev/null @@ -1,112 +0,0 @@ ---- -# This playbook use to migrate activity osd(s) journal to SSD. -# -# You should define `osds_journal_devices` variable for host which osd(s) journal migrate to. 
-# -# For example in host_vars/hostname1.yml -# -# osds_journal_devices: -# - device_name: /dev/sdd -# partitions: -# - index: 1 -# size: 10G -# osd_id: 0 -# - index: 2 -# size: 10G -# osd_id: 1 -# - device_name: /dev/sdf -# partitions: -# - index: 1 -# size: 10G -# osd_id: 2 -# -# @param device_name: The full device path of new ssd. -# @param partitions: The custom partition layout of ssd. -# @param index: The index of this partition. -# @param size: The size of this partition. -# @param osd_id: Which osds's journal this partition for. -# -# ansible-playbook migrate-journal-to-ssd.yml -# The playbook will migrate osd(s) journal to ssd device which you define in host_vars. - -- vars: - osd_group_name: osds - journal_typecode: 45b0969e-9b03-4f30-b4c6-b4b80ceff106 - osds_journal_devices: [] - hosts: - - "{{ osd_group_name }}" - serial: 1 - tasks: - - - name: get osd(s) if directory stat - stat: - path: "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid" - register: osds_dir_stat - with_subelements: - - "{{ osds_journal_devices }}" - - partitions - - - name: exit playbook osd(s) is not on this host - fail: - msg: exit playbook osd(s) is not on this host - with_items: - osds_dir_stat.results - when: - - osds_dir_stat is defined and item.stat.exists == false - - - name: install sgdisk(gdisk) - package: - name: gdisk - state: present - when: osds_journal_devices is defined - - - name: generate uuid for osds journal - command: uuidgen - register: osds - with_subelements: - - "{{ osds_journal_devices }}" - - partitions - - - name: make osd partitions on ssd - shell: > - sgdisk --new={{item.item[1].index}}:0:+{{item.item[1].size}} "--change-name={{ item.item[1].index }}:ceph journal" - --typecode={{ item.item[1].index }}:{{ journal_typecode }} - --partition-guid={{ item.item[1].index }}:{{ item.stdout }} - --mbrtogpt -- {{ item.item[0].device_name }} - with_items: - - "{{ osds.results }}" - - - name: stop osd(s) service - service: - name: "ceph-osd@{{ item.item[1].osd_id }}" - state: stopped - with_items: - - "{{ osds.results }}" - - - name: flush osd(s) journal - command: ceph-osd -i {{ item.item[1].osd_id }} --flush-journal --cluster {{ cluster }} - with_items: - - "{{ osds.results }}" - when: osds_journal_devices is defined - - - name: update osd(s) journal soft link - command: ln -sf /dev/disk/by-partuuid/{{ item.stdout }} /var/lib/ceph/osd/{{ cluster }}-{{ item.item[1].osd_id }}/journal - with_items: - - "{{ osds.results }}" - - - name: update osd(s) journal uuid - command: echo {{ item.stdout }} > /var/lib/ceph/osd/{{ cluster }}-{{ item.item[1].osd_id }}/journal_uuid - with_items: - - "{{ osds.results }}" - - - name: initialize osd(s) new journal - command: ceph-osd -i {{ item.item[1].osd_id }} --mkjournal --cluster {{ cluster }} - with_items: - - "{{ osds.results }}" - - - name: start osd(s) service - service: - name: "ceph-osd@{{ item.item[1].osd_id }}" - state: started - with_items: - - "{{ osds.results }}" diff --git a/infrastructure-playbooks/purge-multisite.yml b/infrastructure-playbooks/purge-multisite.yml deleted file mode 100644 index 8b78553ac..000000000 --- a/infrastructure-playbooks/purge-multisite.yml +++ /dev/null @@ -1,11 +0,0 @@ ---- -# Nukes a multisite config -- hosts: rgws - become: True - tasks: - - include: roles/ceph-rgw/tasks/multisite/destroy.yml - - handlers: - - include: roles/ceph-rgw/handlers/main.yml - # Ansible 2.1.0 bug will ignore included handlers without this - static: True diff --git 
a/infrastructure-playbooks/recover-osds-after-ssd-journal-failure.yml b/infrastructure-playbooks/recover-osds-after-ssd-journal-failure.yml deleted file mode 100644 index de3b6e86e..000000000 --- a/infrastructure-playbooks/recover-osds-after-ssd-journal-failure.yml +++ /dev/null @@ -1,117 +0,0 @@ ---- -# This playbook use to recover Ceph OSDs after ssd journal failure. -# You will also realise that it’s really simple to bring your -# OSDs back to life after replacing your faulty SSD with a new one. -# -# You should define `dev_ssds` variable for host which changes ssds after -# failure. -# -# For example in host_vars/hostname1.yml -# -# dev_ssds: -# - device_name: /dev/sdd -# partitions: -# - index: 1 -# size: 10G -# osd_id: 0 -# - index: 2 -# size: 10G -# osd_id: 1 -# - device_name: /dev/sdf -# partitions: -# - index: 1 -# size: 10G -# osd_id: 2 -# -# @param device_name: The full device path of new ssd -# @param partitions: The custom partition layout of new ssd -# @param index: The index of this partition -# @param size: The size of this partition -# @param osd_id: Which osds's journal this partition for. -# -# ansible-playbook recover-osds-after-ssd-journal-failure.yml -# Prompts for select which host to recover, defaults to null, -# doesn't select host the recover ssd. Input the hostname -# which to recover osds after ssd journal failure -# -# ansible-playbook -e target_host=hostname \ -# recover-osds-after-ssd-journal-failure.yml -# Overrides the prompt using -e option. Can be used in -# automation scripts to avoid interactive prompt. - -- hosts: localhost - gather_facts: no - vars_prompt: - - name: target_host - prompt: please enter the target hostname which to recover osds after ssd journal failure - private: no - tasks: - - add_host: - name: "{{ target_host }}" - groups: dynamically_created_hosts - -- hosts: dynamically_created_hosts - vars: - journal_typecode: 45b0969e-9b03-4f30-b4c6-b4b80ceff106 - dev_ssds: [] - - tasks: - - fail: msg="please define dev_ssds variable" - when: dev_ssds|length <= 0 - - - name: get osd(s) if directory stat - stat: - path: "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid" - register: osds_dir_stat - with_subelements: - - "{{ dev_ssds }}" - - partitions - - - name: exit playbook osd(s) is not on this host - fail: - msg: exit playbook osds is not no this host - with_items: - osds_dir_stat.results - when: - - osds_dir_stat is defined - - item.stat.exists == false - - - name: install sgdisk(gdisk) - package: - name: gdisk - state: present - - - name: get osd(s) journal uuid - command: cat "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid" - register: osds_uuid - with_subelements: - - "{{ dev_ssds }}" - - partitions - - - name: make partitions on new ssd - shell: > - sgdisk --new={{item.item[1].index}}:0:+{{item.item[1].size}} "--change-name={{ item.item[1].index }}:ceph journal" - --typecode={{ item.item[1].index }}:{{ journal_typecode }} - --partition-guid={{ item.item[1].index }}:{{ item.stdout }} - --mbrtogpt -- {{ item.item[0].device_name }} - with_items: - - "{{ osds_uuid.results }}" - - - name: stop osd(s) service - service: - name: "ceph-osd@{{ item.item[1].osd_id }}" - state: stopped - with_items: - - "{{ osds_uuid.results }}" - - - name: reinitialize osd(s) journal in new ssd - command: ceph-osd -i {{ item.item[1].osd_id }} --mkjournal --cluster {{ cluster }} - with_items: - - "{{ osds_uuid.results }}" - - - name: start osd(s) service - service: - name: "ceph-osd@{{ item.item[1].osd_id }}" - state: started - 
with_items: - - "{{ osds_uuid.results }}" diff --git a/infrastructure-playbooks/untested-by-ci/cluster-maintenance.yml b/infrastructure-playbooks/untested-by-ci/cluster-maintenance.yml new file mode 100644 index 000000000..c559ed62f --- /dev/null +++ b/infrastructure-playbooks/untested-by-ci/cluster-maintenance.yml @@ -0,0 +1,37 @@ +--- +# This playbook was made to automate Ceph servers maintenance +# Typical use case: hardware change +# By running this playbook you will set the 'noout' flag on your +# cluster, which means that OSD **can't** be marked as out +# of the CRUSH map, but they will be marked as down. +# Basically we tell the cluster to don't move any data since +# the operation won't last for too long. + +- hosts: + gather_facts: False + + tasks: + + - name: Set the noout flag + command: ceph osd set noout + delegate_to: + + - name: Turn off the server + command: poweroff + + - name: Wait for the server to go down + local_action: > + wait_for host= + port=22 + state=stopped + + - name: Wait for the server to come up + local_action: > + wait_for host= + port=22 + delay=10 + timeout=3600 + + - name: Unset the noout flag + command: ceph osd unset noout + delegate_to: diff --git a/infrastructure-playbooks/untested-by-ci/cluster-os-migration.yml b/infrastructure-playbooks/untested-by-ci/cluster-os-migration.yml new file mode 100644 index 000000000..843056f64 --- /dev/null +++ b/infrastructure-playbooks/untested-by-ci/cluster-os-migration.yml @@ -0,0 +1,555 @@ +--- +# This playbook was meant to upgrade a node from Ubuntu to RHEL. +# We are performing a set of actions prior to reboot the node. +# The node reboots via PXE and gets its new operating system. +# This playbook only works for monitors and OSDs. +# Note that some of the checks are ugly: +# ie: the when migration_completed.stat.exists +# can be improved with includes, however I wanted to keep a single file... +# + +- hosts: mons + serial: 1 + sudo: True + + vars: + backup_dir: /tmp/ + + tasks: + + - name: Check if the node has be migrated already + stat: > + path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/migration_completed + register: migration_completed + failed_when: false + + - name: Check for failed run + stat: > + path=/var/lib/ceph/{{ ansible_hostname }}.tar + register: mon_archive_leftover + + - fail: msg="Looks like an archive is already there, please remove it!" + when: migration_completed.stat.exists == False and mon_archive_leftover.stat.exists == True + + - name: Compress the store as much as possible + command: ceph tell mon.{{ ansible_hostname }} compact + when: migration_completed.stat.exists == False + + - name: Check if sysvinit + stat: > + path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/sysvinit + register: monsysvinit + changed_when: False + + - name: Check if upstart + stat: > + path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/upstart + register: monupstart + changed_when: False + + - name: Check if init does what it is supposed to do (Sysvinit) + shell: > + ps faux|grep -sq [c]eph-mon && service ceph status mon >> /dev/null + register: ceph_status_sysvinit + changed_when: False + + # can't complete the condition since the previous taks never ran... + - fail: msg="Something is terribly wrong here, sysvinit is configured, the service is started BUT the init script does not return 0, GO FIX YOUR SETUP!" 
+ when: ceph_status_sysvinit.rc != 0 and migration_completed.stat.exists == False and monsysvinit.stat.exists == True + + - name: Check if init does what it is supposed to do (upstart) + shell: > + ps faux|grep -sq [c]eph-mon && status ceph-mon-all >> /dev/null + register: ceph_status_upstart + changed_when: False + + - fail: msg="Something is terribly wrong here, upstart is configured, the service is started BUT the init script does not return 0, GO FIX YOUR SETUP!" + when: ceph_status_upstart.rc != 0 and migration_completed.stat.exists == False and monupstart.stat.exists == True + + - name: Restart the Monitor after compaction (Upstart) + service: > + name=ceph-mon + state=restarted + args=id={{ ansible_hostname }} + when: monupstart.stat.exists == True and migration_completed.stat.exists == False + + - name: Restart the Monitor after compaction (Sysvinit) + service: > + name=ceph + state=restarted + args=mon + when: monsysvinit.stat.exists == True and migration_completed.stat.exists == False + + - name: Wait for the monitor to be up again + local_action: > + wait_for + host={{ ansible_ssh_host | default(inventory_hostname) }} + port=6789 + timeout=10 + when: migration_completed.stat.exists == False + + - name: Stop the monitor (Upstart) + service: > + name=ceph-mon + state=stopped + args=id={{ ansible_hostname }} + when: monupstart.stat.exists == True and migration_completed.stat.exists == False + + - name: Stop the monitor (Sysvinit) + service: > + name=ceph + state=stopped + args=mon + when: monsysvinit.stat.exists == True and migration_completed.stat.exists == False + + - name: Wait for the monitor to be down + local_action: > + wait_for + host={{ ansible_ssh_host | default(inventory_hostname) }} + port=6789 + timeout=10 + state=stopped + when: migration_completed.stat.exists == False + + - name: Create a backup directory + file: > + path={{ backup_dir }}/monitors-backups + state=directory + owner=root + group=root + mode=0644 + delegate_to: "{{ item }}" + with_items: "{{ groups.backup[0] }}" + when: migration_completed.stat.exists == False + + # NOTE (leseb): should we convert upstart to sysvinit here already? + - name: Archive monitor stores + shell: > + tar -cpvzf - --one-file-system . 
/etc/ceph/* | cat > {{ ansible_hostname }}.tar + chdir=/var/lib/ceph/ + creates={{ ansible_hostname }}.tar + when: migration_completed.stat.exists == False + + - name: Scp the Monitor store + fetch: > + src=/var/lib/ceph/{{ ansible_hostname }}.tar + dest={{ backup_dir }}/monitors-backups/{{ ansible_hostname }}.tar + flat=yes + when: migration_completed.stat.exists == False + + - name: Reboot the server + command: reboot + when: migration_completed.stat.exists == False + + - name: Wait for the server to come up + local_action: > + wait_for + port=22 + delay=10 + timeout=3600 + when: migration_completed.stat.exists == False + + - name: Wait a bit more to be sure that the server is ready + pause: seconds=20 + when: migration_completed.stat.exists == False + + - name: Check if sysvinit + stat: > + path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/sysvinit + register: monsysvinit + changed_when: False + + - name: Check if upstart + stat: > + path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/upstart + register: monupstart + changed_when: False + + - name: Make sure the monitor is stopped (Upstart) + service: > + name=ceph-mon + state=stopped + args=id={{ ansible_hostname }} + when: monupstart.stat.exists == True and migration_completed.stat.exists == False + + - name: Make sure the monitor is stopped (Sysvinit) + service: > + name=ceph + state=stopped + args=mon + when: monsysvinit.stat.exists == True and migration_completed.stat.exists == False + + # NOTE (leseb): 'creates' was added in Ansible 1.6 + - name: Copy and unarchive the monitor store + unarchive: > + src={{ backup_dir }}/monitors-backups/{{ ansible_hostname }}.tar + dest=/var/lib/ceph/ + copy=yes + mode=0600 + creates=etc/ceph/ceph.conf + when: migration_completed.stat.exists == False + + - name: Copy keys and configs + shell: > + cp etc/ceph/* /etc/ceph/ + chdir=/var/lib/ceph/ + when: migration_completed.stat.exists == False + + - name: Configure RHEL7 for sysvinit + shell: find -L /var/lib/ceph/mon/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[A-Za-z0-9]+-[A-Za-z0-9._-]+' -exec touch {}/sysvinit \; -exec rm {}/upstart \; + when: migration_completed.stat.exists == False + + # NOTE (leseb): at this point the upstart and sysvinit checks are not necessary + # so we directly call sysvinit + - name: Start the monitor + service: > + name=ceph + state=started + args=mon + when: migration_completed.stat.exists == False + + - name: Wait for the Monitor to be up again + local_action: > + wait_for + host={{ ansible_ssh_host | default(inventory_hostname) }} + port=6789 + timeout=10 + when: migration_completed.stat.exists == False + + - name: Waiting for the monitor to join the quorum... 
+ shell: > + ceph -s | grep monmap | sed 's/.*quorum//' | egrep -q {{ ansible_hostname }} + register: result + until: result.rc == 0 + retries: 5 + delay: 10 + delegate_to: "{{ item }}" + with_items: "{{ groups.backup[0] }}" + when: migration_completed.stat.exists == False + + - name: Done moving to the next monitor + file: > + path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/migration_completed + state=touch + owner=root + group=root + mode=0600 + when: migration_completed.stat.exists == False + +- hosts: osds + serial: 1 + sudo: True + + vars: + backup_dir: /tmp/ + + tasks: + - name: Check if the node has be migrated already + stat: > + path=/var/lib/ceph/migration_completed + register: migration_completed + failed_when: false + + - name: Check for failed run + stat: > + path=/var/lib/ceph/{{ ansible_hostname }}.tar + register: osd_archive_leftover + + - fail: msg="Looks like an archive is already there, please remove it!" + when: migration_completed.stat.exists == False and osd_archive_leftover.stat.exists == True + + - name: Check if init does what it is supposed to do (Sysvinit) + shell: > + ps faux|grep -sq [c]eph-osd && service ceph status osd >> /dev/null + register: ceph_status_sysvinit + changed_when: False + + # can't complete the condition since the previous taks never ran... + - fail: msg="Something is terribly wrong here, sysvinit is configured, the services are started BUT the init script does not return 0, GO FIX YOUR SETUP!" + when: ceph_status_sysvinit.rc != 0 and migration_completed.stat.exists == False and monsysvinit.stat.exists == True + + - name: Check if init does what it is supposed to do (upstart) + shell: > + ps faux|grep -sq [c]eph-osd && initctl list|egrep -sq "ceph-osd \(ceph/.\) start/running, process [0-9][0-9][0-9][0-9]" + register: ceph_status_upstart + changed_when: False + + - fail: msg="Something is terribly wrong here, upstart is configured, the services are started BUT the init script does not return 0, GO FIX YOUR SETUP!" + when: ceph_status_upstart.rc != 0 and migration_completed.stat.exists == False and monupstart.stat.exists == True + + - name: Set the noout flag + command: ceph osd set noout + delegate_to: "{{ item }}" + with_items: "{{ groups[mon_group_name][0] }}" + when: migration_completed.stat.exists == False + + - name: Check if sysvinit + shell: stat /var/lib/ceph/osd/ceph-*/sysvinit + register: osdsysvinit + failed_when: false + changed_when: False + + - name: Check if upstart + shell: stat /var/lib/ceph/osd/ceph-*/upstart + register: osdupstart + failed_when: false + changed_when: False + + - name: Archive ceph configs + shell: > + tar -cpvzf - --one-file-system . 
/etc/ceph/ceph.conf | cat > {{ ansible_hostname }}.tar + chdir=/var/lib/ceph/ + creates={{ ansible_hostname }}.tar + when: migration_completed.stat.exists == False + + - name: Create backup directory + file: > + path={{ backup_dir }}/osds-backups + state=directory + owner=root + group=root + mode=0644 + delegate_to: "{{ item }}" + with_items: "{{ groups.backup[0] }}" + when: migration_completed.stat.exists == False + + - name: Scp OSDs dirs and configs + fetch: > + src=/var/lib/ceph/{{ ansible_hostname }}.tar + dest={{ backup_dir }}/osds-backups/ + flat=yes + when: migration_completed.stat.exists == False + + - name: Collect OSD ports + shell: netstat -tlpn | awk -F ":" '/ceph-osd/ { sub (" .*", "", $2); print $2 }' | uniq + register: osd_ports + when: migration_completed.stat.exists == False + + - name: Gracefully stop the OSDs (Upstart) + service: > + name=ceph-osd-all + state=stopped + when: osdupstart.rc == 0 and migration_completed.stat.exists == False + + - name: Gracefully stop the OSDs (Sysvinit) + service: > + name=ceph + state=stopped + args=mon + when: osdsysvinit.rc == 0 and migration_completed.stat.exists == False + + - name: Wait for the OSDs to be down + local_action: > + wait_for + host={{ ansible_ssh_host | default(inventory_hostname) }} + port={{ item }} + timeout=10 + state=stopped + with_items: "{{ osd_ports.stdout_lines }}" + when: migration_completed.stat.exists == False + + - name: Configure RHEL with sysvinit + shell: find -L /var/lib/ceph/osd/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[A-Za-z0-9]+-[A-Za-z0-9._-]+' -exec touch {}/sysvinit \; -exec rm {}/upstart \; + when: migration_completed.stat.exists == False + + - name: Reboot the server + command: reboot + when: migration_completed.stat.exists == False + + - name: Wait for the server to come up + local_action: > + wait_for + port=22 + delay=10 + timeout=3600 + when: migration_completed.stat.exists == False + + - name: Wait a bit to be sure that the server is ready for scp + pause: seconds=20 + when: migration_completed.stat.exists == False + + # NOTE (leseb): 'creates' was added in Ansible 1.6 + - name: Copy and unarchive the OSD configs + unarchive: > + src={{ backup_dir }}/osds-backups/{{ ansible_hostname }}.tar + dest=/var/lib/ceph/ + copy=yes + mode=0600 + creates=etc/ceph/ceph.conf + when: migration_completed.stat.exists == False + + - name: Copy keys and configs + shell: > + cp etc/ceph/* /etc/ceph/ + chdir=/var/lib/ceph/ + when: migration_completed.stat.exists == False + + # NOTE (leseb): at this point the upstart and sysvinit checks are not necessary + # so we directly call sysvinit + - name: Start all the OSDs + service: > + name=ceph-osd-all + state=started + args=osd + when: migration_completed.stat.exists == False + + # NOTE (leseb): this is tricky unless this is set into the ceph.conf + # listened ports can be predicted, thus they will change after each restart +# - name: Wait for the OSDs to be up again +# local_action: > +# wait_for +# host={{ ansible_ssh_host | default(inventory_hostname) }} +# port={{ item }} +# timeout=30 +# with_items: +# - "{{ osd_ports.stdout_lines }}" + + - name: Waiting for clean PGs... 
+ shell: > + test "[""$(ceph -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')""]" = "$(ceph -s -f json | python -c 'import sys, json; print [ i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if i["state_name"] == "active+clean"]')" + register: result + until: result.rc == 0 + retries: 10 + delay: 10 + delegate_to: "{{ item }}" + with_items: "{{ groups.backup[0] }}" + when: migration_completed.stat.exists == False + + - name: Done moving to the next OSD + file: > + path=/var/lib/ceph/migration_completed + state=touch + owner=root + group=root + mode=0600 + when: migration_completed.stat.exists == False + + - name: Unset the noout flag + command: ceph osd unset noout + delegate_to: "{{ item }}" + with_items: "{{ groups[mon_group_name][0] }}" + when: migration_completed.stat.exists == False + +- hosts: rgws + serial: 1 + sudo: True + + vars: + backup_dir: /tmp/ + + tasks: + - name: Check if the node has be migrated already + stat: > + path=/var/lib/ceph/radosgw/migration_completed + register: migration_completed + failed_when: false + + - name: Check for failed run + stat: > + path=/var/lib/ceph/{{ ansible_hostname }}.tar + register: rgw_archive_leftover + + - fail: msg="Looks like an archive is already there, please remove it!" + when: migration_completed.stat.exists == False and rgw_archive_leftover.stat.exists == True + + - name: Archive rados gateway configs + shell: > + tar -cpvzf - --one-file-system . /etc/ceph/* | cat > {{ ansible_hostname }}.tar + chdir=/var/lib/ceph/ + creates={{ ansible_hostname }}.tar + when: migration_completed.stat.exists == False + + - name: Create backup directory + file: > + path={{ backup_dir }}/rgws-backups + state=directory + owner=root + group=root + mode=0644 + delegate_to: "{{ item }}" + with_items: "{{ groups.backup[0] }}" + when: migration_completed.stat.exists == False + + - name: Scp RGWs dirs and configs + fetch: > + src=/var/lib/ceph/{{ ansible_hostname }}.tar + dest={{ backup_dir }}/rgws-backups/ + flat=yes + when: migration_completed.stat.exists == False + + - name: Gracefully stop the rados gateway + service: > + name={{ item }} + state=stopped + with_items: + - radosgw + when: migration_completed.stat.exists == False + + - name: Wait for radosgw to be down + local_action: > + wait_for + host={{ ansible_ssh_host | default(inventory_hostname) }} + path=/tmp/radosgw.sock + state=absent + timeout=30 + when: migration_completed.stat.exists == False + + - name: Reboot the server + command: reboot + when: migration_completed.stat.exists == False + + - name: Wait for the server to come up + local_action: > + wait_for + port=22 + delay=10 + timeout=3600 + when: migration_completed.stat.exists == False + + - name: Wait a bit to be sure that the server is ready for scp + pause: seconds=20 + when: migration_completed.stat.exists == False + + # NOTE (leseb): 'creates' was added in Ansible 1.6 + - name: Copy and unarchive the OSD configs + unarchive: > + src={{ backup_dir }}/rgws-backups/{{ ansible_hostname }}.tar + dest=/var/lib/ceph/ + copy=yes + mode=0600 + creates=etc/ceph/ceph.conf + when: migration_completed.stat.exists == False + + - name: Copy keys and configs + shell: > + {{ item }} + chdir=/var/lib/ceph/ + with_items: + - cp etc/ceph/* /etc/ceph/ + when: migration_completed.stat.exists == False + + - name: Start rados gateway + service: > + name={{ item }} + state=started + with_items: + - radosgw + when: migration_completed.stat.exists == False + + - name: Wait for radosgw to be up again 
+ local_action: > + wait_for + host={{ ansible_ssh_host | default(inventory_hostname) }} + path=/tmp/radosgw.sock + state=present + timeout=30 + when: migration_completed.stat.exists == False + + - name: Done moving to the next rados gateway + file: > + path=/var/lib/ceph/radosgw/migration_completed + state=touch + owner=root + group=root + mode=0600 + when: migration_completed.stat.exists == False diff --git a/infrastructure-playbooks/untested-by-ci/make-osd-partitions.yml b/infrastructure-playbooks/untested-by-ci/make-osd-partitions.yml new file mode 100644 index 000000000..0fc6892d2 --- /dev/null +++ b/infrastructure-playbooks/untested-by-ci/make-osd-partitions.yml @@ -0,0 +1,99 @@ +--- +# This playbook will make custom partition layout for your osd hosts. +# You should define `devices` variable for every host. +# +# For example, in host_vars/hostname1 +# +# devices: +# - device_name: sdb +# partitions: +# - index: 1 +# size: 10G +# type: data +# - index: 2 +# size: 5G +# type: journal +# - device_name: sdc +# partitions: +# - index: 1 +# size: 10G +# type: data +# - index: 2 +# size: 5G +# type: journal +# +- vars: + osd_group_name: osds + journal_typecode: 45b0969e-9b03-4f30-b4c6-b4b80ceff106 + data_typecode: 4fbd7e29-9d25-41b8-afd0-062c0ceff05d + devices: [] + hosts: + - "{{ osd_group_name }}" + + tasks: + + - name: load a variable file for devices partition + include_vars: "{{ item }}" + with_first_found: + - files: + - "host_vars/{{ ansible_hostname }}.yml" + - "host_vars/default.yml" + skip: true + + - name: exit playbook, if devices not defined + fail: + msg: "devices must be define in host_vars/default.yml or host_vars/{{ ansible_hostname }}.yml" + when: devices is not defined + + - name: install sgdisk(gdisk) + package: + name: gdisk + state: present + + - name: erase all previous partitions(dangerous!!!) + shell: sgdisk --zap-all -- /dev/{{item.device_name}} + with_items: "{{ devices }}" + + - name: make osd partitions + shell: > + sgdisk --new={{item.1.index}}:0:+{{item.1.size}} "--change-name={{item.1.index}}:ceph {{item.1.type}}" + "--typecode={{item.1.index}}:{% if item.1.type=='data' %}{{data_typecode}}{% else %}{{journal_typecode}}{% endif %}" + --mbrtogpt -- /dev/{{item.0.device_name}} + with_subelements: + - "{{ devices }}" + - partitions + + - set_fact: + owner: 167 + group: 167 + when: + - ansible_os_family == "RedHat" + + - set_fact: + owner: 64045 + group: 64045 + when: + - ansible_os_family == "Debian" + + - name: change partitions ownership + file: + path: "/dev/{{item.0.device_name}}{{item.1.index}}" + owner: "{{ owner | default('root')}}" + group: "{{ group | default('disk')}}" + with_subelements: + - "{{ devices }}" + - partitions + when: + item.0.device_name | match('/dev/([hsv]d[a-z]{1,2}){1,2}$') + + - name: change partitions ownership + file: + path: "/dev/{{item.0.device_name}}p{{item.1.index}}" + owner: "{{ owner | default('root')}}" + group: "{{ group | default('disk')}}" + with_subelements: + - "{{ devices }}" + - partitions + when: + item.0.device_name | match('/dev/(cciss/c[0-9]d[0-9]|nvme[0-9]n[0-9]){1,2}$') +... \ No newline at end of file diff --git a/infrastructure-playbooks/untested-by-ci/migrate-journal-to-ssd.yml b/infrastructure-playbooks/untested-by-ci/migrate-journal-to-ssd.yml new file mode 100644 index 000000000..44a75e01b --- /dev/null +++ b/infrastructure-playbooks/untested-by-ci/migrate-journal-to-ssd.yml @@ -0,0 +1,112 @@ +--- +# This playbook use to migrate activity osd(s) journal to SSD. 
+# +# You should define `osds_journal_devices` variable for host which osd(s) journal migrate to. +# +# For example in host_vars/hostname1.yml +# +# osds_journal_devices: +# - device_name: /dev/sdd +# partitions: +# - index: 1 +# size: 10G +# osd_id: 0 +# - index: 2 +# size: 10G +# osd_id: 1 +# - device_name: /dev/sdf +# partitions: +# - index: 1 +# size: 10G +# osd_id: 2 +# +# @param device_name: The full device path of new ssd. +# @param partitions: The custom partition layout of ssd. +# @param index: The index of this partition. +# @param size: The size of this partition. +# @param osd_id: Which osds's journal this partition for. +# +# ansible-playbook migrate-journal-to-ssd.yml +# The playbook will migrate osd(s) journal to ssd device which you define in host_vars. + +- vars: + osd_group_name: osds + journal_typecode: 45b0969e-9b03-4f30-b4c6-b4b80ceff106 + osds_journal_devices: [] + hosts: + - "{{ osd_group_name }}" + serial: 1 + tasks: + + - name: get osd(s) if directory stat + stat: + path: "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid" + register: osds_dir_stat + with_subelements: + - "{{ osds_journal_devices }}" + - partitions + + - name: exit playbook osd(s) is not on this host + fail: + msg: exit playbook osd(s) is not on this host + with_items: + osds_dir_stat.results + when: + - osds_dir_stat is defined and item.stat.exists == false + + - name: install sgdisk(gdisk) + package: + name: gdisk + state: present + when: osds_journal_devices is defined + + - name: generate uuid for osds journal + command: uuidgen + register: osds + with_subelements: + - "{{ osds_journal_devices }}" + - partitions + + - name: make osd partitions on ssd + shell: > + sgdisk --new={{item.item[1].index}}:0:+{{item.item[1].size}} "--change-name={{ item.item[1].index }}:ceph journal" + --typecode={{ item.item[1].index }}:{{ journal_typecode }} + --partition-guid={{ item.item[1].index }}:{{ item.stdout }} + --mbrtogpt -- {{ item.item[0].device_name }} + with_items: + - "{{ osds.results }}" + + - name: stop osd(s) service + service: + name: "ceph-osd@{{ item.item[1].osd_id }}" + state: stopped + with_items: + - "{{ osds.results }}" + + - name: flush osd(s) journal + command: ceph-osd -i {{ item.item[1].osd_id }} --flush-journal --cluster {{ cluster }} + with_items: + - "{{ osds.results }}" + when: osds_journal_devices is defined + + - name: update osd(s) journal soft link + command: ln -sf /dev/disk/by-partuuid/{{ item.stdout }} /var/lib/ceph/osd/{{ cluster }}-{{ item.item[1].osd_id }}/journal + with_items: + - "{{ osds.results }}" + + - name: update osd(s) journal uuid + command: echo {{ item.stdout }} > /var/lib/ceph/osd/{{ cluster }}-{{ item.item[1].osd_id }}/journal_uuid + with_items: + - "{{ osds.results }}" + + - name: initialize osd(s) new journal + command: ceph-osd -i {{ item.item[1].osd_id }} --mkjournal --cluster {{ cluster }} + with_items: + - "{{ osds.results }}" + + - name: start osd(s) service + service: + name: "ceph-osd@{{ item.item[1].osd_id }}" + state: started + with_items: + - "{{ osds.results }}" diff --git a/infrastructure-playbooks/untested-by-ci/purge-multisite.yml b/infrastructure-playbooks/untested-by-ci/purge-multisite.yml new file mode 100644 index 000000000..8b78553ac --- /dev/null +++ b/infrastructure-playbooks/untested-by-ci/purge-multisite.yml @@ -0,0 +1,11 @@ +--- +# Nukes a multisite config +- hosts: rgws + become: True + tasks: + - include: roles/ceph-rgw/tasks/multisite/destroy.yml + + handlers: + - include: roles/ceph-rgw/handlers/main.yml + # Ansible 
2.1.0 bug will ignore included handlers without this + static: True diff --git a/infrastructure-playbooks/untested-by-ci/recover-osds-after-ssd-journal-failure.yml b/infrastructure-playbooks/untested-by-ci/recover-osds-after-ssd-journal-failure.yml new file mode 100644 index 000000000..de3b6e86e --- /dev/null +++ b/infrastructure-playbooks/untested-by-ci/recover-osds-after-ssd-journal-failure.yml @@ -0,0 +1,117 @@ +--- +# This playbook use to recover Ceph OSDs after ssd journal failure. +# You will also realise that it’s really simple to bring your +# OSDs back to life after replacing your faulty SSD with a new one. +# +# You should define `dev_ssds` variable for host which changes ssds after +# failure. +# +# For example in host_vars/hostname1.yml +# +# dev_ssds: +# - device_name: /dev/sdd +# partitions: +# - index: 1 +# size: 10G +# osd_id: 0 +# - index: 2 +# size: 10G +# osd_id: 1 +# - device_name: /dev/sdf +# partitions: +# - index: 1 +# size: 10G +# osd_id: 2 +# +# @param device_name: The full device path of new ssd +# @param partitions: The custom partition layout of new ssd +# @param index: The index of this partition +# @param size: The size of this partition +# @param osd_id: Which osds's journal this partition for. +# +# ansible-playbook recover-osds-after-ssd-journal-failure.yml +# Prompts for select which host to recover, defaults to null, +# doesn't select host the recover ssd. Input the hostname +# which to recover osds after ssd journal failure +# +# ansible-playbook -e target_host=hostname \ +# recover-osds-after-ssd-journal-failure.yml +# Overrides the prompt using -e option. Can be used in +# automation scripts to avoid interactive prompt. + +- hosts: localhost + gather_facts: no + vars_prompt: + - name: target_host + prompt: please enter the target hostname which to recover osds after ssd journal failure + private: no + tasks: + - add_host: + name: "{{ target_host }}" + groups: dynamically_created_hosts + +- hosts: dynamically_created_hosts + vars: + journal_typecode: 45b0969e-9b03-4f30-b4c6-b4b80ceff106 + dev_ssds: [] + + tasks: + - fail: msg="please define dev_ssds variable" + when: dev_ssds|length <= 0 + + - name: get osd(s) if directory stat + stat: + path: "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid" + register: osds_dir_stat + with_subelements: + - "{{ dev_ssds }}" + - partitions + + - name: exit playbook osd(s) is not on this host + fail: + msg: exit playbook osds is not no this host + with_items: + osds_dir_stat.results + when: + - osds_dir_stat is defined + - item.stat.exists == false + + - name: install sgdisk(gdisk) + package: + name: gdisk + state: present + + - name: get osd(s) journal uuid + command: cat "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid" + register: osds_uuid + with_subelements: + - "{{ dev_ssds }}" + - partitions + + - name: make partitions on new ssd + shell: > + sgdisk --new={{item.item[1].index}}:0:+{{item.item[1].size}} "--change-name={{ item.item[1].index }}:ceph journal" + --typecode={{ item.item[1].index }}:{{ journal_typecode }} + --partition-guid={{ item.item[1].index }}:{{ item.stdout }} + --mbrtogpt -- {{ item.item[0].device_name }} + with_items: + - "{{ osds_uuid.results }}" + + - name: stop osd(s) service + service: + name: "ceph-osd@{{ item.item[1].osd_id }}" + state: stopped + with_items: + - "{{ osds_uuid.results }}" + + - name: reinitialize osd(s) journal in new ssd + command: ceph-osd -i {{ item.item[1].osd_id }} --mkjournal --cluster {{ cluster }} + with_items: + - "{{ 
osds_uuid.results }}" + + - name: start osd(s) service + service: + name: "ceph-osd@{{ item.item[1].osd_id }}" + state: started + with_items: + - "{{ osds_uuid.results }}"