infra playbook: move untested scenario to a new dir

author Sébastien Han <seb@redhat.com>

Wed, 30 Aug 2017 21:30:49 +0000 (23:30 +0200)

committer Sébastien Han <seb@redhat.com>

Fri, 1 Sep 2017 17:58:24 +0000 (19:58 +0200)
author Sébastien Han <seb@redhat.com>
Wed, 30 Aug 2017 21:30:49 +0000 (23:30 +0200)
committer Sébastien Han <seb@redhat.com>
Fri, 1 Sep 2017 17:58:24 +0000 (19:58 +0200)
diff --git a/ceph-ansible.spec.in b/ceph-ansible.spec.in

index d2be8ef0d692335f9964a9552cc908e294d4f311..53c694012366f21964a62a9956d23db047401b51 100644 (file)
--- a/ceph-ansible.spec.in
+++ b/ceph-ansible.spec.in
@@ -42,6 +42,7 @@ done
  pushd %{buildroot}%{_datarootdir}/ceph-ansible
    rm -r roles/ceph-common-coreos
    rm group_vars/common-coreoss.yml.sample
+  rm -r infrastructure-playbooks/untested-by-ci
  popd
  
  # Strip iscsi files.
diff --git a/infrastructure-playbooks/cluster-maintenance.yml b/infrastructure-playbooks/cluster-maintenance.yml

deleted file mode 100644 (file)

index c559ed6..0000000
--- a/infrastructure-playbooks/cluster-maintenance.yml
+++ /dev/null
@@ -1,37 +0,0 @@
----
-# This playbook was made to automate Ceph servers maintenance
-# Typical use case: hardware change
-# By running this playbook you will set the 'noout' flag on your
-# cluster, which means that OSD **can't** be marked as out
-# of the CRUSH map, but they will be marked as down.
-# Basically we tell the cluster to don't move any data since
-# the operation won't last for too long.
-
-- hosts: <your_host>
-  gather_facts: False
-
-  tasks:
-
-  - name: Set the noout flag
-    command: ceph osd set noout
-    delegate_to: <your_monitor>
-
-  - name: Turn off the server
-    command: poweroff
-
-  - name: Wait for the server to go down
-    local_action: >
-      wait_for host=<your_host>
-      port=22
-      state=stopped
-
-  - name: Wait for the server to come up
-    local_action: >
-      wait_for host=<your_host>
-      port=22
-      delay=10
-      timeout=3600
-
-  - name: Unset the noout flag
-    command: ceph osd unset noout
-    delegate_to: <your_monitor>
diff --git a/infrastructure-playbooks/cluster-os-migration.yml b/infrastructure-playbooks/cluster-os-migration.yml

deleted file mode 100644 (file)

index 843056f..0000000
--- a/infrastructure-playbooks/cluster-os-migration.yml
+++ /dev/null
@@ -1,555 +0,0 @@
----
-# This playbook was meant to upgrade a node from Ubuntu to RHEL.
-# We are performing a set of actions prior to reboot the node.
-# The node reboots via PXE and gets its new operating system.
-# This playbook only works for monitors and OSDs.
-# Note that some of the checks are ugly:
-#   ie: the when migration_completed.stat.exists
-# can be improved with includes, however I wanted to keep a single file...
-#
-
-- hosts: mons
-  serial: 1
-  sudo: True
-
-  vars:
-    backup_dir: /tmp/
-
-  tasks:
-
-    - name: Check if the node has be migrated already
-      stat: >
-        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/migration_completed
-      register: migration_completed
-      failed_when: false
-
-    - name: Check for failed run
-      stat: >
-        path=/var/lib/ceph/{{ ansible_hostname }}.tar
-      register: mon_archive_leftover
-
-    - fail: msg="Looks like an archive is already there, please remove it!"
-      when: migration_completed.stat.exists == False and mon_archive_leftover.stat.exists == True
-
-    - name: Compress the store as much as possible
-      command: ceph tell mon.{{ ansible_hostname }} compact
-      when: migration_completed.stat.exists == False
-
-    - name: Check if sysvinit
-      stat: >
-        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/sysvinit
-      register: monsysvinit
-      changed_when: False
-
-    - name: Check if upstart
-      stat: >
-        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/upstart
-      register: monupstart
-      changed_when: False
-
-    - name: Check if init does what it is supposed to do (Sysvinit)
-      shell: >
-        ps faux|grep -sq [c]eph-mon && service ceph status mon >> /dev/null
-      register: ceph_status_sysvinit
-      changed_when: False
-
-    # can't complete the condition since the previous taks never ran...
-    - fail: msg="Something is terribly wrong here, sysvinit is configured, the service is started BUT the init script does not return 0, GO FIX YOUR SETUP!"
-      when: ceph_status_sysvinit.rc != 0 and migration_completed.stat.exists == False and monsysvinit.stat.exists == True
-
-    - name: Check if init does what it is supposed to do (upstart)
-      shell: >
-        ps faux|grep -sq [c]eph-mon && status ceph-mon-all >> /dev/null
-      register: ceph_status_upstart
-      changed_when: False
-
-    - fail: msg="Something is terribly wrong here, upstart is configured, the service is started BUT the init script does not return 0, GO FIX YOUR SETUP!"
-      when: ceph_status_upstart.rc != 0 and migration_completed.stat.exists == False and monupstart.stat.exists == True
-
-    - name: Restart the Monitor after compaction (Upstart)
-      service: >
-        name=ceph-mon
-        state=restarted
-        args=id={{ ansible_hostname }}
-      when: monupstart.stat.exists == True and migration_completed.stat.exists == False
-
-    - name: Restart the Monitor after compaction (Sysvinit)
-      service: >
-        name=ceph
-        state=restarted
-        args=mon
-      when: monsysvinit.stat.exists == True and migration_completed.stat.exists == False
-
-    - name: Wait for the monitor to be up again
-      local_action: >
-        wait_for
-        host={{ ansible_ssh_host | default(inventory_hostname) }}
-        port=6789
-        timeout=10
-      when: migration_completed.stat.exists == False
-
-    - name: Stop the monitor (Upstart)
-      service: >
-        name=ceph-mon
-        state=stopped
-        args=id={{ ansible_hostname }}
-      when: monupstart.stat.exists == True and migration_completed.stat.exists == False
-
-    - name: Stop the monitor (Sysvinit)
-      service: >
-        name=ceph
-        state=stopped
-        args=mon
-      when: monsysvinit.stat.exists == True and migration_completed.stat.exists == False
-
-    - name: Wait for the monitor to be down
-      local_action: >
-        wait_for
-        host={{ ansible_ssh_host | default(inventory_hostname) }}
-        port=6789
-        timeout=10
-        state=stopped
-      when: migration_completed.stat.exists == False
-
-    - name: Create a backup directory
-      file: >
-        path={{ backup_dir }}/monitors-backups
-        state=directory
-        owner=root
-        group=root
-        mode=0644
-      delegate_to: "{{ item }}"
-      with_items: "{{ groups.backup[0] }}"
-      when: migration_completed.stat.exists == False
-
-    # NOTE (leseb): should we convert upstart to sysvinit here already?
-    - name: Archive monitor stores
-      shell: >
-        tar -cpvzf - --one-file-system . /etc/ceph/* | cat > {{ ansible_hostname }}.tar
-        chdir=/var/lib/ceph/
-        creates={{ ansible_hostname }}.tar
-      when: migration_completed.stat.exists == False
-
-    - name: Scp the Monitor store
-      fetch: >
-        src=/var/lib/ceph/{{ ansible_hostname }}.tar
-        dest={{ backup_dir }}/monitors-backups/{{ ansible_hostname }}.tar
-        flat=yes
-      when: migration_completed.stat.exists == False
-
-    - name: Reboot the server
-      command: reboot
-      when: migration_completed.stat.exists == False
-
-    - name: Wait for the server to come up
-      local_action: >
-        wait_for
-        port=22
-        delay=10
-        timeout=3600
-      when: migration_completed.stat.exists == False
-
-    - name: Wait a bit more to be sure that the server is ready
-      pause: seconds=20
-      when: migration_completed.stat.exists == False
-
-    - name: Check if sysvinit
-      stat: >
-        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/sysvinit
-      register: monsysvinit
-      changed_when: False
-
-    - name: Check if upstart
-      stat: >
-        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/upstart
-      register: monupstart
-      changed_when: False
-
-    - name: Make sure the monitor is stopped (Upstart)
-      service: >
-        name=ceph-mon
-        state=stopped
-        args=id={{ ansible_hostname }}
-      when: monupstart.stat.exists == True and migration_completed.stat.exists == False
-
-    - name: Make sure the monitor is stopped (Sysvinit)
-      service: >
-        name=ceph
-        state=stopped
-        args=mon
-      when: monsysvinit.stat.exists == True and migration_completed.stat.exists == False
-
-    # NOTE (leseb): 'creates' was added in Ansible 1.6
-    - name: Copy and unarchive the monitor store
-      unarchive: >
-        src={{ backup_dir }}/monitors-backups/{{ ansible_hostname }}.tar
-        dest=/var/lib/ceph/
-        copy=yes
-        mode=0600
-        creates=etc/ceph/ceph.conf
-      when: migration_completed.stat.exists == False
-
-    - name: Copy keys and configs
-      shell: >
-        cp etc/ceph/* /etc/ceph/
-        chdir=/var/lib/ceph/
-      when: migration_completed.stat.exists == False
-
-    - name: Configure RHEL7 for sysvinit
-      shell: find -L /var/lib/ceph/mon/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[A-Za-z0-9]+-[A-Za-z0-9._-]+' -exec touch {}/sysvinit \; -exec rm {}/upstart \;
-      when: migration_completed.stat.exists == False
-
-    # NOTE (leseb): at this point the upstart and sysvinit checks are not necessary
-    # so we directly call sysvinit
-    - name: Start the monitor
-      service: >
-        name=ceph
-        state=started
-        args=mon
-      when: migration_completed.stat.exists == False
-
-    - name: Wait for the Monitor to be up again
-      local_action: >
-        wait_for
-        host={{ ansible_ssh_host | default(inventory_hostname) }}
-        port=6789
-        timeout=10
-      when: migration_completed.stat.exists == False
-
-    - name: Waiting for the monitor to join the quorum...
-      shell: >
-        ceph -s | grep monmap | sed 's/.*quorum//' | egrep -q {{ ansible_hostname }}
-      register: result
-      until: result.rc == 0
-      retries: 5
-      delay: 10
-      delegate_to: "{{ item }}"
-      with_items: "{{ groups.backup[0] }}"
-      when: migration_completed.stat.exists == False
-
-    - name: Done moving to the next monitor
-      file: >
-        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/migration_completed
-        state=touch
-        owner=root
-        group=root
-        mode=0600
-      when: migration_completed.stat.exists == False
-
-- hosts: osds
-  serial: 1
-  sudo: True
-
-  vars:
-    backup_dir: /tmp/
-
-  tasks:
-    - name: Check if the node has be migrated already
-      stat: >
-        path=/var/lib/ceph/migration_completed
-      register: migration_completed
-      failed_when: false
-
-    - name: Check for failed run
-      stat: >
-        path=/var/lib/ceph/{{ ansible_hostname }}.tar
-      register: osd_archive_leftover
-
-    - fail: msg="Looks like an archive is already there, please remove it!"
-      when: migration_completed.stat.exists == False and osd_archive_leftover.stat.exists == True
-
-    - name: Check if init does what it is supposed to do (Sysvinit)
-      shell: >
-        ps faux|grep -sq [c]eph-osd && service ceph status osd >> /dev/null
-      register: ceph_status_sysvinit
-      changed_when: False
-
-    # can't complete the condition since the previous taks never ran...
-    - fail: msg="Something is terribly wrong here, sysvinit is configured, the services are started BUT the init script does not return 0, GO FIX YOUR SETUP!"
-      when: ceph_status_sysvinit.rc != 0 and migration_completed.stat.exists == False and monsysvinit.stat.exists == True
-
-    - name: Check if init does what it is supposed to do (upstart)
-      shell: >
-        ps faux|grep -sq [c]eph-osd && initctl list|egrep -sq "ceph-osd \(ceph/.\) start/running, process [0-9][0-9][0-9][0-9]"
-      register: ceph_status_upstart
-      changed_when: False
-
-    - fail: msg="Something is terribly wrong here, upstart is configured, the services are started BUT the init script does not return 0, GO FIX YOUR SETUP!"
-      when: ceph_status_upstart.rc != 0 and migration_completed.stat.exists == False and monupstart.stat.exists == True
-
-    - name: Set the noout flag
-      command: ceph osd set noout
-      delegate_to: "{{ item }}"
-      with_items: "{{ groups[mon_group_name][0] }}"
-      when: migration_completed.stat.exists == False
-
-    - name: Check if sysvinit
-      shell: stat /var/lib/ceph/osd/ceph-*/sysvinit
-      register: osdsysvinit
-      failed_when: false
-      changed_when: False
-
-    - name: Check if upstart
-      shell: stat /var/lib/ceph/osd/ceph-*/upstart
-      register: osdupstart
-      failed_when: false
-      changed_when: False
-
-    - name: Archive ceph configs
-      shell: >
-        tar -cpvzf - --one-file-system . /etc/ceph/ceph.conf | cat > {{ ansible_hostname }}.tar
-        chdir=/var/lib/ceph/
-        creates={{ ansible_hostname }}.tar
-      when: migration_completed.stat.exists == False
-
-    - name: Create backup directory
-      file: >
-        path={{ backup_dir }}/osds-backups
-        state=directory
-        owner=root
-        group=root
-        mode=0644
-      delegate_to: "{{ item }}"
-      with_items: "{{ groups.backup[0] }}"
-      when: migration_completed.stat.exists == False
-
-    - name: Scp OSDs dirs and configs
-      fetch: >
-        src=/var/lib/ceph/{{ ansible_hostname }}.tar
-        dest={{ backup_dir }}/osds-backups/
-        flat=yes
-      when: migration_completed.stat.exists == False
-
-    - name: Collect OSD ports
-      shell: netstat -tlpn | awk -F ":" '/ceph-osd/ { sub (" .*", "", $2); print $2 }' | uniq
-      register: osd_ports
-      when: migration_completed.stat.exists == False
-
-    - name: Gracefully stop the OSDs (Upstart)
-      service: >
-        name=ceph-osd-all
-        state=stopped
-      when: osdupstart.rc == 0 and migration_completed.stat.exists == False
-
-    - name: Gracefully stop the OSDs (Sysvinit)
-      service: >
-        name=ceph
-        state=stopped
-        args=mon
-      when: osdsysvinit.rc == 0 and migration_completed.stat.exists == False
-
-    - name: Wait for the OSDs to be down
-      local_action: >
-        wait_for
-        host={{ ansible_ssh_host | default(inventory_hostname) }}
-        port={{ item }}
-        timeout=10
-        state=stopped
-      with_items: "{{ osd_ports.stdout_lines }}"
-      when: migration_completed.stat.exists == False
-
-    - name: Configure RHEL with sysvinit
-      shell: find -L /var/lib/ceph/osd/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[A-Za-z0-9]+-[A-Za-z0-9._-]+' -exec touch {}/sysvinit \; -exec rm {}/upstart \;
-      when: migration_completed.stat.exists == False
-
-    - name: Reboot the server
-      command: reboot
-      when: migration_completed.stat.exists == False
-
-    - name: Wait for the server to come up
-      local_action: >
-        wait_for
-        port=22
-        delay=10
-        timeout=3600
-      when: migration_completed.stat.exists == False
-
-    - name: Wait a bit to be sure that the server is ready for scp
-      pause: seconds=20
-      when: migration_completed.stat.exists == False
-
-    # NOTE (leseb): 'creates' was added in Ansible 1.6
-    - name: Copy and unarchive the OSD configs
-      unarchive: >
-        src={{ backup_dir }}/osds-backups/{{ ansible_hostname }}.tar
-        dest=/var/lib/ceph/
-        copy=yes
-        mode=0600
-        creates=etc/ceph/ceph.conf
-      when: migration_completed.stat.exists == False
-
-    - name: Copy keys and configs
-      shell: >
-        cp etc/ceph/* /etc/ceph/
-        chdir=/var/lib/ceph/
-      when: migration_completed.stat.exists == False
-
-    # NOTE (leseb): at this point the upstart and sysvinit checks are not necessary
-    # so we directly call sysvinit
-    - name: Start all the OSDs
-      service: >
-        name=ceph-osd-all
-        state=started
-        args=osd
-      when: migration_completed.stat.exists == False
-
-    # NOTE (leseb): this is tricky unless this is set into the ceph.conf
-    # listened ports can be predicted, thus they will change after each restart
-#    - name: Wait for the OSDs to be up again
-#      local_action: >
-#        wait_for
-#        host={{ ansible_ssh_host | default(inventory_hostname) }}
-#        port={{ item }}
-#        timeout=30
-#      with_items:
-#        - "{{ osd_ports.stdout_lines }}"
-
-    - name: Waiting for clean PGs...
-      shell: >
-        test "[""$(ceph -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')""]" = "$(ceph -s -f json | python -c 'import sys, json; print [ i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if i["state_name"] == "active+clean"]')"
-      register: result
-      until: result.rc == 0
-      retries: 10
-      delay: 10
-      delegate_to: "{{ item }}"
-      with_items: "{{ groups.backup[0] }}"
-      when: migration_completed.stat.exists == False
-
-    - name: Done moving to the next OSD
-      file: >
-        path=/var/lib/ceph/migration_completed
-        state=touch
-        owner=root
-        group=root
-        mode=0600
-      when: migration_completed.stat.exists == False
-
-    - name: Unset the noout flag
-      command: ceph osd unset noout
-      delegate_to: "{{ item }}"
-      with_items: "{{ groups[mon_group_name][0] }}"
-      when: migration_completed.stat.exists == False
-
-- hosts: rgws
-  serial: 1
-  sudo: True
-
-  vars:
-    backup_dir: /tmp/
-
-  tasks:
-    - name: Check if the node has be migrated already
-      stat: >
-        path=/var/lib/ceph/radosgw/migration_completed
-      register: migration_completed
-      failed_when: false
-
-    - name: Check for failed run
-      stat: >
-        path=/var/lib/ceph/{{ ansible_hostname }}.tar
-      register: rgw_archive_leftover
-
-    - fail: msg="Looks like an archive is already there, please remove it!"
-      when: migration_completed.stat.exists == False and rgw_archive_leftover.stat.exists == True
-
-    - name: Archive rados gateway configs
-      shell: >
-        tar -cpvzf - --one-file-system . /etc/ceph/* | cat > {{ ansible_hostname }}.tar
-        chdir=/var/lib/ceph/
-        creates={{ ansible_hostname }}.tar
-      when: migration_completed.stat.exists == False
-
-    - name: Create backup directory
-      file: >
-        path={{ backup_dir }}/rgws-backups
-        state=directory
-        owner=root
-        group=root
-        mode=0644
-      delegate_to: "{{ item }}"
-      with_items: "{{ groups.backup[0] }}"
-      when: migration_completed.stat.exists == False
-
-    - name: Scp RGWs dirs and configs
-      fetch: >
-        src=/var/lib/ceph/{{ ansible_hostname }}.tar
-        dest={{ backup_dir }}/rgws-backups/
-        flat=yes
-      when: migration_completed.stat.exists == False
-
-    - name: Gracefully stop the rados gateway
-      service: >
-        name={{ item }}
-        state=stopped
-      with_items:
-        - radosgw
-      when: migration_completed.stat.exists == False
-
-    - name: Wait for radosgw to be down
-      local_action: >
-        wait_for
-        host={{ ansible_ssh_host | default(inventory_hostname) }}
-        path=/tmp/radosgw.sock
-        state=absent
-        timeout=30
-      when: migration_completed.stat.exists == False
-
-    - name: Reboot the server
-      command: reboot
-      when: migration_completed.stat.exists == False
-
-    - name: Wait for the server to come up
-      local_action: >
-        wait_for
-        port=22
-        delay=10
-        timeout=3600
-      when: migration_completed.stat.exists == False
-
-    - name: Wait a bit to be sure that the server is ready for scp
-      pause: seconds=20
-      when: migration_completed.stat.exists == False
-
-    # NOTE (leseb): 'creates' was added in Ansible 1.6
-    - name: Copy and unarchive the OSD configs
-      unarchive: >
-        src={{ backup_dir }}/rgws-backups/{{ ansible_hostname }}.tar
-        dest=/var/lib/ceph/
-        copy=yes
-        mode=0600
-        creates=etc/ceph/ceph.conf
-      when: migration_completed.stat.exists == False
-
-    - name: Copy keys and configs
-      shell: >
-        {{ item }}
-        chdir=/var/lib/ceph/
-      with_items:
-        - cp etc/ceph/* /etc/ceph/
-      when: migration_completed.stat.exists == False
-
-    - name: Start rados gateway
-      service: >
-        name={{ item }}
-        state=started
-      with_items:
-        - radosgw
-      when: migration_completed.stat.exists == False
-
-    - name: Wait for radosgw to be up again
-      local_action: >
-        wait_for
-        host={{ ansible_ssh_host | default(inventory_hostname) }}
-        path=/tmp/radosgw.sock
-        state=present
-        timeout=30
-      when: migration_completed.stat.exists == False
-
-    - name: Done moving to the next rados gateway
-      file: >
-        path=/var/lib/ceph/radosgw/migration_completed
-        state=touch
-        owner=root
-        group=root
-        mode=0600
-      when: migration_completed.stat.exists == False
diff --git a/infrastructure-playbooks/make-osd-partitions.yml b/infrastructure-playbooks/make-osd-partitions.yml

deleted file mode 100644 (file)

index 0fc6892..0000000
--- a/infrastructure-playbooks/make-osd-partitions.yml
+++ /dev/null
@@ -1,99 +0,0 @@
----
-# This playbook will make custom partition layout for your osd hosts.
-# You should define `devices` variable for every host.
-#
-# For example, in host_vars/hostname1
-#
-# devices:
-# - device_name: sdb
-#   partitions:
-#   - index: 1
-#     size: 10G
-#     type: data
-#   - index: 2
-#     size: 5G
-#     type: journal
-# - device_name: sdc
-#   partitions:
-#   - index: 1
-#     size: 10G
-#     type: data
-#   - index: 2
-#     size: 5G
-#     type: journal
-#
-- vars:
-    osd_group_name: osds
-    journal_typecode: 45b0969e-9b03-4f30-b4c6-b4b80ceff106
-    data_typecode: 4fbd7e29-9d25-41b8-afd0-062c0ceff05d
-    devices: []
-  hosts:
-  - "{{ osd_group_name }}"
-
-  tasks:
-
-  - name: load a variable file for devices partition
-    include_vars: "{{ item }}"
-    with_first_found:
-      - files:
-          - "host_vars/{{ ansible_hostname }}.yml"
-          - "host_vars/default.yml"
-        skip: true
-
-  - name: exit playbook, if devices not defined
-    fail:
-      msg: "devices must be define in host_vars/default.yml or host_vars/{{ ansible_hostname }}.yml"
-    when: devices is not defined
-
-  - name: install sgdisk(gdisk)
-    package:
-      name: gdisk
-      state: present
-
-  - name: erase all previous partitions(dangerous!!!)
-    shell: sgdisk --zap-all -- /dev/{{item.device_name}}
-    with_items: "{{ devices }}"
-
-  - name: make osd partitions
-    shell: >
-           sgdisk --new={{item.1.index}}:0:+{{item.1.size}} "--change-name={{item.1.index}}:ceph {{item.1.type}}"
-           "--typecode={{item.1.index}}:{% if item.1.type=='data' %}{{data_typecode}}{% else %}{{journal_typecode}}{% endif %}"
-           --mbrtogpt -- /dev/{{item.0.device_name}}
-    with_subelements:
-    - "{{ devices }}"
-    - partitions
-
-  - set_fact:
-      owner: 167
-      group: 167
-    when:
-      - ansible_os_family == "RedHat"
-  
-  - set_fact:
-      owner: 64045
-      group: 64045
-    when:
-      - ansible_os_family == "Debian"
-
-  - name: change partitions ownership
-    file:
-      path: "/dev/{{item.0.device_name}}{{item.1.index}}"
-      owner: "{{ owner | default('root')}}"
-      group: "{{ group | default('disk')}}"
-    with_subelements:
-      - "{{ devices }}"
-      - partitions
-    when:
-      item.0.device_name | match('/dev/([hsv]d[a-z]{1,2}){1,2}$')
-
-  - name: change partitions ownership
-    file:
-      path: "/dev/{{item.0.device_name}}p{{item.1.index}}"
-      owner: "{{ owner | default('root')}}"
-      group: "{{ group | default('disk')}}"
-    with_subelements:
-      - "{{ devices }}"
-      - partitions
-    when:
-      item.0.device_name | match('/dev/(cciss/c[0-9]d[0-9]|nvme[0-9]n[0-9]){1,2}$')
-...
-\ No newline at end of file
diff --git a/infrastructure-playbooks/migrate-journal-to-ssd.yml b/infrastructure-playbooks/migrate-journal-to-ssd.yml

deleted file mode 100644 (file)

index 44a75e0..0000000
--- a/infrastructure-playbooks/migrate-journal-to-ssd.yml
+++ /dev/null
@@ -1,112 +0,0 @@
----
-# This playbook use to migrate activity osd(s) journal to SSD.
-#
-# You should define `osds_journal_devices` variable for host which osd(s) journal migrate to.
-# 
-# For example in host_vars/hostname1.yml
-#
-# osds_journal_devices:
-# - device_name: /dev/sdd
-#   partitions:
-#   - index: 1
-#     size: 10G
-#     osd_id: 0
-#   - index: 2
-#     size: 10G
-#     osd_id: 1
-# - device_name: /dev/sdf
-#   partitions:       
-#   - index: 1        
-#     size: 10G       
-#     osd_id: 2       
-#
-# @param device_name: The full device path of new ssd.
-# @param partitions:  The custom partition layout of ssd.
-# @param index:  The index of this partition.
-# @param size:  The size of this partition.
-# @param osd_id: Which osds's journal this partition for.
-#
-# ansible-playbook migrate-journal-to-ssd.yml
-#    The playbook will migrate osd(s) journal to ssd device which you define in host_vars. 
-
-- vars:
-    osd_group_name: osds
-    journal_typecode: 45b0969e-9b03-4f30-b4c6-b4b80ceff106
-    osds_journal_devices: []
-  hosts:
-      - "{{ osd_group_name }}"
-  serial: 1
-  tasks:
-
-  - name: get osd(s) if directory stat
-    stat:
-      path: "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid"
-    register: osds_dir_stat
-    with_subelements:
-      - "{{ osds_journal_devices }}"
-      - partitions
-
-  - name: exit playbook osd(s) is not on this host
-    fail:
-        msg: exit playbook osd(s) is not on this host
-    with_items: 
-        osds_dir_stat.results
-    when:
-      -  osds_dir_stat is defined and item.stat.exists == false
-
-  - name: install sgdisk(gdisk)
-    package:
-      name: gdisk
-      state: present
-    when: osds_journal_devices is defined
-
-  - name: generate uuid for osds journal
-    command: uuidgen
-    register: osds
-    with_subelements:
-      - "{{ osds_journal_devices }}"
-      - partitions
-
-  - name: make osd partitions on ssd
-    shell: >
-      sgdisk --new={{item.item[1].index}}:0:+{{item.item[1].size}} "--change-name={{ item.item[1].index }}:ceph journal" 
-      --typecode={{ item.item[1].index }}:{{ journal_typecode }} 
-      --partition-guid={{ item.item[1].index }}:{{ item.stdout }} 
-      --mbrtogpt -- {{ item.item[0].device_name }}
-    with_items: 
-      - "{{ osds.results }}"
-
-  - name: stop osd(s) service
-    service:
-      name: "ceph-osd@{{ item.item[1].osd_id }}"
-      state: stopped
-    with_items:
-      - "{{ osds.results }}"
-
-  - name: flush osd(s) journal
-    command: ceph-osd -i {{ item.item[1].osd_id }} --flush-journal --cluster {{ cluster }} 
-    with_items:
-      - "{{ osds.results }}"
-    when: osds_journal_devices is defined
-
-  - name: update osd(s) journal soft link
-    command: ln -sf /dev/disk/by-partuuid/{{ item.stdout }} /var/lib/ceph/osd/{{ cluster }}-{{ item.item[1].osd_id }}/journal
-    with_items:
-      - "{{ osds.results }}"
-
-  - name: update osd(s) journal uuid
-    command: echo {{ item.stdout }} > /var/lib/ceph/osd/{{ cluster }}-{{ item.item[1].osd_id }}/journal_uuid
-    with_items:
-      - "{{ osds.results }}"
-
-  - name: initialize osd(s) new journal 
-    command: ceph-osd -i {{ item.item[1].osd_id }} --mkjournal --cluster {{ cluster }}
-    with_items:
-      - "{{ osds.results }}"
-
-  - name: start osd(s) service
-    service:
-      name: "ceph-osd@{{ item.item[1].osd_id }}"
-      state: started
-    with_items:
-      - "{{ osds.results }}"
diff --git a/infrastructure-playbooks/purge-multisite.yml b/infrastructure-playbooks/purge-multisite.yml

deleted file mode 100644 (file)

index 8b78553..0000000
--- a/infrastructure-playbooks/purge-multisite.yml
+++ /dev/null
@@ -1,11 +0,0 @@
----
-# Nukes a multisite config
-- hosts: rgws
-  become: True
-  tasks:
-  - include: roles/ceph-rgw/tasks/multisite/destroy.yml
-
-  handlers:
-  - include: roles/ceph-rgw/handlers/main.yml
-    # Ansible 2.1.0 bug will ignore included handlers without this
-    static: True
diff --git a/infrastructure-playbooks/recover-osds-after-ssd-journal-failure.yml b/infrastructure-playbooks/recover-osds-after-ssd-journal-failure.yml

deleted file mode 100644 (file)

index de3b6e8..0000000
--- a/infrastructure-playbooks/recover-osds-after-ssd-journal-failure.yml
+++ /dev/null
@@ -1,117 +0,0 @@
----
-# This playbook use to recover Ceph OSDs after ssd journal failure.
-# You will also realise that it’s really simple to bring your 
-# OSDs back to life after replacing your faulty SSD with a new one.
-#
-# You should define `dev_ssds` variable for host which changes ssds after
-# failure. 
-# 
-# For example in host_vars/hostname1.yml
-#
-# dev_ssds:
-# - device_name: /dev/sdd
-#   partitions:
-#   - index: 1
-#     size: 10G
-#     osd_id: 0
-#   - index: 2
-#     size: 10G
-#     osd_id: 1
-# - device_name: /dev/sdf
-#   partitions:       
-#   - index: 1        
-#     size: 10G       
-#     osd_id: 2       
-#
-# @param device_name: The full device path of new ssd
-# @param partitions:  The custom partition layout of new ssd
-# @param index:  The index of this partition
-# @param size:  The size of this partition
-# @param osd_id: Which osds's journal this partition for.
-#
-# ansible-playbook recover-osds-after-ssd-journal-failure.yml
-#     Prompts for select which host to recover, defaults to null,  
-#     doesn't select host the recover ssd. Input the hostname
-#     which to recover osds after ssd journal failure
-#
-# ansible-playbook -e target_host=hostname \
-#     recover-osds-after-ssd-journal-failure.yml
-#     Overrides the prompt using -e option. Can be used in
-#     automation scripts to avoid interactive prompt.
-
-- hosts: localhost
-  gather_facts: no
-  vars_prompt:
-  - name: target_host
-    prompt: please enter the target hostname which to recover osds after ssd journal failure
-    private: no
-  tasks:
-    - add_host:
-        name: "{{ target_host }}"
-        groups: dynamically_created_hosts
-
-- hosts: dynamically_created_hosts
-  vars:
-   journal_typecode: 45b0969e-9b03-4f30-b4c6-b4b80ceff106
-   dev_ssds: []
-
-  tasks:
-  - fail: msg="please define dev_ssds variable"
-    when: dev_ssds|length <= 0
- 
-  - name: get osd(s) if directory stat
-    stat:
-      path: "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid"
-    register: osds_dir_stat
-    with_subelements:
-      - "{{ dev_ssds }}"
-      - partitions
- 
-  - name: exit playbook osd(s) is not on this host
-    fail:
-        msg: exit playbook osds is not no this host
-    with_items:
-        osds_dir_stat.results
-    when:
-      - osds_dir_stat is defined 
-      - item.stat.exists == false
-
-  - name: install sgdisk(gdisk)
-    package:
-      name: gdisk
-      state: present
-    
-  - name: get osd(s) journal uuid
-    command: cat "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid"
-    register: osds_uuid
-    with_subelements:
-      - "{{ dev_ssds }}"
-      - partitions
-
-  - name: make partitions on new ssd
-    shell: >
-      sgdisk --new={{item.item[1].index}}:0:+{{item.item[1].size}} "--change-name={{ item.item[1].index }}:ceph journal" 
-      --typecode={{ item.item[1].index }}:{{ journal_typecode }} 
-      --partition-guid={{ item.item[1].index }}:{{ item.stdout }} 
-      --mbrtogpt -- {{ item.item[0].device_name }}
-    with_items:
-      - "{{ osds_uuid.results }}"
-
-  - name: stop osd(s) service
-    service:
-      name: "ceph-osd@{{ item.item[1].osd_id }}"
-      state: stopped
-    with_items:
-      - "{{ osds_uuid.results }}"
-
-  - name: reinitialize osd(s) journal in new ssd
-    command: ceph-osd -i {{ item.item[1].osd_id }} --mkjournal --cluster {{ cluster }}
-    with_items:
-      - "{{ osds_uuid.results }}"
- 
-  - name: start osd(s) service
-    service:
-      name: "ceph-osd@{{ item.item[1].osd_id }}"
-      state: started
-    with_items:
-       - "{{ osds_uuid.results }}"
diff --git a/infrastructure-playbooks/untested-by-ci/cluster-maintenance.yml b/infrastructure-playbooks/untested-by-ci/cluster-maintenance.yml

new file mode 100644 (file)

index 0000000..c559ed6
--- /dev/null
+++ b/infrastructure-playbooks/untested-by-ci/cluster-maintenance.yml
@@ -0,0 +1,37 @@
+---
+# This playbook was made to automate Ceph server maintenance.
+# Typical use case: hardware change.
+# By running this playbook you will set the 'noout' flag on your
+# cluster, which means that OSDs **can't** be marked out
+# of the CRUSH map, but they will still be marked down.
+# Basically we tell the cluster not to move any data since
+# the operation won't last long.
+
+- hosts: <your_host>
+  gather_facts: False
+
+  tasks:
+
+  - name: Set the noout flag
+    command: ceph osd set noout
+    delegate_to: <your_monitor>
+
+  - name: Turn off the server
+    command: poweroff
+
+  - name: Wait for the server to go down
+    local_action: >
+      wait_for host=<your_host>
+      port=22
+      state=stopped
+
+  - name: Wait for the server to come up
+    local_action: >
+      wait_for host=<your_host>
+      port=22
+      delay=10
+      timeout=3600
+
+  - name: Unset the noout flag
+    command: ceph osd unset noout
+    delegate_to: <your_monitor>
diff --git a/infrastructure-playbooks/untested-by-ci/cluster-os-migration.yml b/infrastructure-playbooks/untested-by-ci/cluster-os-migration.yml

new file mode 100644 (file)

index 0000000..843056f
--- /dev/null
+++ b/infrastructure-playbooks/untested-by-ci/cluster-os-migration.yml
@@ -0,0 +1,555 @@
+---
+# This playbook was meant to upgrade a node from Ubuntu to RHEL.
+# We perform a set of actions prior to rebooting the node.
+# The node reboots via PXE and gets its new operating system.
+# This playbook only works for monitors and OSDs.
+# Note that some of the checks are ugly,
+#   e.g. the repeated `when: migration_completed.stat.exists` conditions;
+# they could be improved with includes, however I wanted to keep a single file...
+#
+
+- hosts: mons
+  serial: 1
+  sudo: True
+
+  vars:
+    backup_dir: /tmp/
+
+  tasks:
+
+    - name: Check if the node has be migrated already
+      stat: >
+        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/migration_completed
+      register: migration_completed
+      failed_when: false
+
+    - name: Check for failed run
+      stat: >
+        path=/var/lib/ceph/{{ ansible_hostname }}.tar
+      register: mon_archive_leftover
+
+    - fail: msg="Looks like an archive is already there, please remove it!"
+      when: migration_completed.stat.exists == False and mon_archive_leftover.stat.exists == True
+
+    - name: Compress the store as much as possible
+      command: ceph tell mon.{{ ansible_hostname }} compact
+      when: migration_completed.stat.exists == False
+
+    - name: Check if sysvinit
+      stat: >
+        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/sysvinit
+      register: monsysvinit
+      changed_when: False
+
+    - name: Check if upstart
+      stat: >
+        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/upstart
+      register: monupstart
+      changed_when: False
+
+    # The probe for the init system that is NOT in use returns non-zero, so
+    # failed_when is disabled here: the rc is only recorded and the fail task
+    # below decides whether it is fatal for the init system actually configured.
+    - name: Check if init does what it is supposed to do (Sysvinit)
+      shell: >
+        ps faux|grep -sq [c]eph-mon && service ceph status mon >> /dev/null
+      register: ceph_status_sysvinit
+      failed_when: false
+      changed_when: False
+
+    # can't complete the condition since the previous task never ran...
+    - fail: msg="Something is terribly wrong here, sysvinit is configured, the service is started BUT the init script does not return 0, GO FIX YOUR SETUP!"
+      when: ceph_status_sysvinit.rc != 0 and migration_completed.stat.exists == False and monsysvinit.stat.exists == True
+
+    # Same pattern for upstart: record the rc, let the fail task judge it.
+    - name: Check if init does what it is supposed to do (upstart)
+      shell: >
+        ps faux|grep -sq [c]eph-mon && status ceph-mon-all >> /dev/null
+      register: ceph_status_upstart
+      failed_when: false
+      changed_when: False
+
+    - fail: msg="Something is terribly wrong here, upstart is configured, the service is started BUT the init script does not return 0, GO FIX YOUR SETUP!"
+      when: ceph_status_upstart.rc != 0 and migration_completed.stat.exists == False and monupstart.stat.exists == True
+
+    - name: Restart the Monitor after compaction (Upstart)
+      service: >
+        name=ceph-mon
+        state=restarted
+        args=id={{ ansible_hostname }}
+      when: monupstart.stat.exists == True and migration_completed.stat.exists == False
+
+    - name: Restart the Monitor after compaction (Sysvinit)
+      service: >
+        name=ceph
+        state=restarted
+        args=mon
+      when: monsysvinit.stat.exists == True and migration_completed.stat.exists == False
+
+    - name: Wait for the monitor to be up again
+      local_action: >
+        wait_for
+        host={{ ansible_ssh_host | default(inventory_hostname) }}
+        port=6789
+        timeout=10
+      when: migration_completed.stat.exists == False
+
+    - name: Stop the monitor (Upstart)
+      service: >
+        name=ceph-mon
+        state=stopped
+        args=id={{ ansible_hostname }}
+      when: monupstart.stat.exists == True and migration_completed.stat.exists == False
+
+    - name: Stop the monitor (Sysvinit)
+      service: >
+        name=ceph
+        state=stopped
+        args=mon
+      when: monsysvinit.stat.exists == True and migration_completed.stat.exists == False
+
+    - name: Wait for the monitor to be down
+      local_action: >
+        wait_for
+        host={{ ansible_ssh_host | default(inventory_hostname) }}
+        port=6789
+        timeout=10
+        state=stopped
+      when: migration_completed.stat.exists == False
+
+    - name: Create a backup directory
+      file: >
+        path={{ backup_dir }}/monitors-backups
+        state=directory
+        owner=root
+        group=root
+        mode=0644
+      delegate_to: "{{ item }}"
+      with_items: "{{ groups.backup[0] }}"
+      when: migration_completed.stat.exists == False
+
+    # NOTE (leseb): should we convert upstart to sysvinit here already?
+    - name: Archive monitor stores
+      shell: >
+        tar -cpvzf - --one-file-system . /etc/ceph/* | cat > {{ ansible_hostname }}.tar
+        chdir=/var/lib/ceph/
+        creates={{ ansible_hostname }}.tar
+      when: migration_completed.stat.exists == False
+
+    - name: Scp the Monitor store
+      fetch: >
+        src=/var/lib/ceph/{{ ansible_hostname }}.tar
+        dest={{ backup_dir }}/monitors-backups/{{ ansible_hostname }}.tar
+        flat=yes
+      when: migration_completed.stat.exists == False
+
+    - name: Reboot the server
+      command: reboot
+      when: migration_completed.stat.exists == False
+
+    # Runs on the control machine, so the rebooted node must be named
+    # explicitly; without host= wait_for polls the local machine instead.
+    - name: Wait for the server to come up
+      local_action: >
+        wait_for
+        host={{ ansible_ssh_host | default(inventory_hostname) }}
+        port=22
+        delay=10
+        timeout=3600
+      when: migration_completed.stat.exists == False
+
+    - name: Wait a bit more to be sure that the server is ready
+      pause: seconds=20
+      when: migration_completed.stat.exists == False
+
+    - name: Check if sysvinit
+      stat: >
+        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/sysvinit
+      register: monsysvinit
+      changed_when: False
+
+    - name: Check if upstart
+      stat: >
+        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/upstart
+      register: monupstart
+      changed_when: False
+
+    - name: Make sure the monitor is stopped (Upstart)
+      service: >
+        name=ceph-mon
+        state=stopped
+        args=id={{ ansible_hostname }}
+      when: monupstart.stat.exists == True and migration_completed.stat.exists == False
+
+    - name: Make sure the monitor is stopped (Sysvinit)
+      service: >
+        name=ceph
+        state=stopped
+        args=mon
+      when: monsysvinit.stat.exists == True and migration_completed.stat.exists == False
+
+    # NOTE (leseb): 'creates' was added in Ansible 1.6
+    - name: Copy and unarchive the monitor store
+      unarchive: >
+        src={{ backup_dir }}/monitors-backups/{{ ansible_hostname }}.tar
+        dest=/var/lib/ceph/
+        copy=yes
+        mode=0600
+        creates=etc/ceph/ceph.conf
+      when: migration_completed.stat.exists == False
+
+    - name: Copy keys and configs
+      shell: >
+        cp etc/ceph/* /etc/ceph/
+        chdir=/var/lib/ceph/
+      when: migration_completed.stat.exists == False
+
+    - name: Configure RHEL7 for sysvinit
+      shell: find -L /var/lib/ceph/mon/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[A-Za-z0-9]+-[A-Za-z0-9._-]+' -exec touch {}/sysvinit \; -exec rm {}/upstart \;
+      when: migration_completed.stat.exists == False
+
+    # NOTE (leseb): at this point the upstart and sysvinit checks are not necessary
+    # so we directly call sysvinit
+    - name: Start the monitor
+      service: >
+        name=ceph
+        state=started
+        args=mon
+      when: migration_completed.stat.exists == False
+
+    - name: Wait for the Monitor to be up again
+      local_action: >
+        wait_for
+        host={{ ansible_ssh_host | default(inventory_hostname) }}
+        port=6789
+        timeout=10
+      when: migration_completed.stat.exists == False
+
+    - name: Waiting for the monitor to join the quorum...
+      shell: >
+        ceph -s | grep monmap | sed 's/.*quorum//' | egrep -q {{ ansible_hostname }}
+      register: result
+      until: result.rc == 0
+      retries: 5
+      delay: 10
+      delegate_to: "{{ item }}"
+      with_items: "{{ groups.backup[0] }}"
+      when: migration_completed.stat.exists == False
+
+    - name: Done moving to the next monitor
+      file: >
+        path=/var/lib/ceph/mon/ceph-{{ ansible_hostname }}/migration_completed
+        state=touch
+        owner=root
+        group=root
+        mode=0600
+      when: migration_completed.stat.exists == False
+
+- hosts: osds
+  serial: 1
+  sudo: True
+
+  vars:
+    backup_dir: /tmp/
+
+  tasks:
+    - name: Check if the node has be migrated already
+      stat: >
+        path=/var/lib/ceph/migration_completed
+      register: migration_completed
+      failed_when: false
+
+    - name: Check for failed run
+      stat: >
+        path=/var/lib/ceph/{{ ansible_hostname }}.tar
+      register: osd_archive_leftover
+
+    - fail: msg="Looks like an archive is already there, please remove it!"
+      when: migration_completed.stat.exists == False and osd_archive_leftover.stat.exists == True
+
+    # Detect the init system first: the sanity checks and the stop tasks
+    # further down key off osdsysvinit.rc / osdupstart.rc.
+    - name: Check if sysvinit
+      shell: stat /var/lib/ceph/osd/ceph-*/sysvinit
+      register: osdsysvinit
+      failed_when: false
+      changed_when: False
+
+    - name: Check if upstart
+      shell: stat /var/lib/ceph/osd/ceph-*/upstart
+      register: osdupstart
+      failed_when: false
+      changed_when: False
+
+    # The probe for the init system that is NOT in use returns non-zero, so
+    # failed_when is disabled: the rc is only recorded and the fail task below
+    # decides whether it is fatal for the init system actually configured.
+    - name: Check if init does what it is supposed to do (Sysvinit)
+      shell: >
+        ps faux|grep -sq [c]eph-osd && service ceph status osd >> /dev/null
+      register: ceph_status_sysvinit
+      failed_when: false
+      changed_when: False
+
+    - fail: msg="Something is terribly wrong here, sysvinit is configured, the services are started BUT the init script does not return 0, GO FIX YOUR SETUP!"
+      when: ceph_status_sysvinit.rc != 0 and migration_completed.stat.exists == False and osdsysvinit.rc == 0
+
+    - name: Check if init does what it is supposed to do (upstart)
+      shell: >
+        ps faux|grep -sq [c]eph-osd && initctl list|egrep -sq "ceph-osd \(ceph/.\) start/running, process [0-9][0-9][0-9][0-9]"
+      register: ceph_status_upstart
+      failed_when: false
+      changed_when: False
+
+    - fail: msg="Something is terribly wrong here, upstart is configured, the services are started BUT the init script does not return 0, GO FIX YOUR SETUP!"
+      when: ceph_status_upstart.rc != 0 and migration_completed.stat.exists == False and osdupstart.rc == 0
+
+    # Issued on the first monitor so data is not rebalanced while this node is down.
+    - name: Set the noout flag
+      command: ceph osd set noout
+      delegate_to: "{{ item }}"
+      with_items: "{{ groups[mon_group_name][0] }}"
+      when: migration_completed.stat.exists == False
+
+    - name: Archive ceph configs
+      shell: >
+        tar -cpvzf - --one-file-system . /etc/ceph/ceph.conf | cat > {{ ansible_hostname }}.tar
+        chdir=/var/lib/ceph/
+        creates={{ ansible_hostname }}.tar
+      when: migration_completed.stat.exists == False
+
+    - name: Create backup directory
+      file: >
+        path={{ backup_dir }}/osds-backups
+        state=directory
+        owner=root
+        group=root
+        mode=0644
+      delegate_to: "{{ item }}"
+      with_items: "{{ groups.backup[0] }}"
+      when: migration_completed.stat.exists == False
+
+    - name: Scp OSDs dirs and configs
+      fetch: >
+        src=/var/lib/ceph/{{ ansible_hostname }}.tar
+        dest={{ backup_dir }}/osds-backups/
+        flat=yes
+      when: migration_completed.stat.exists == False
+
+    - name: Collect OSD ports
+      shell: netstat -tlpn | awk -F ":" '/ceph-osd/ { sub (" .*", "", $2); print $2 }' | uniq
+      register: osd_ports
+      when: migration_completed.stat.exists == False
+
+    - name: Gracefully stop the OSDs (Upstart)
+      service: >
+        name=ceph-osd-all
+        state=stopped
+      when: osdupstart.rc == 0 and migration_completed.stat.exists == False
+
+    # args is passed to the sysvinit 'ceph' script to restrict the action to
+    # the OSD daemons of this node.
+    - name: Gracefully stop the OSDs (Sysvinit)
+      service: >
+        name=ceph
+        state=stopped
+        args=osd
+      when: osdsysvinit.rc == 0 and migration_completed.stat.exists == False
+
+    - name: Wait for the OSDs to be down
+      local_action: >
+        wait_for
+        host={{ ansible_ssh_host | default(inventory_hostname) }}
+        port={{ item }}
+        timeout=10
+        state=stopped
+      with_items: "{{ osd_ports.stdout_lines }}"
+      when: migration_completed.stat.exists == False
+
+    - name: Configure RHEL with sysvinit
+      shell: find -L /var/lib/ceph/osd/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[A-Za-z0-9]+-[A-Za-z0-9._-]+' -exec touch {}/sysvinit \; -exec rm {}/upstart \;
+      when: migration_completed.stat.exists == False
+
+    - name: Reboot the server
+      command: reboot
+      when: migration_completed.stat.exists == False
+
+    # Runs on the control machine, so the rebooted node must be named
+    # explicitly; without host= wait_for polls the local machine instead.
+    - name: Wait for the server to come up
+      local_action: >
+        wait_for
+        host={{ ansible_ssh_host | default(inventory_hostname) }}
+        port=22
+        delay=10
+        timeout=3600
+      when: migration_completed.stat.exists == False
+
+    - name: Wait a bit to be sure that the server is ready for scp
+      pause: seconds=20
+      when: migration_completed.stat.exists == False
+
+    # NOTE (leseb): 'creates' was added in Ansible 1.6
+    - name: Copy and unarchive the OSD configs
+      unarchive: >
+        src={{ backup_dir }}/osds-backups/{{ ansible_hostname }}.tar
+        dest=/var/lib/ceph/
+        copy=yes
+        mode=0600
+        creates=etc/ceph/ceph.conf
+      when: migration_completed.stat.exists == False
+
+    - name: Copy keys and configs
+      shell: >
+        cp etc/ceph/* /etc/ceph/
+        chdir=/var/lib/ceph/
+      when: migration_completed.stat.exists == False
+
+    # NOTE (leseb): at this point the upstart and sysvinit checks are not necessary
+    # so we directly call sysvinit: the 'ceph' init script with args=osd,
+    # mirroring how the monitor is started in the mons play.
+    - name: Start all the OSDs
+      service: >
+        name=ceph
+        state=started
+        args=osd
+      when: migration_completed.stat.exists == False
+
+    # NOTE (leseb): this is tricky: unless they are set in ceph.conf, the
+    # listening ports cannot be predicted, since they change after each restart
+#    - name: Wait for the OSDs to be up again
+#      local_action: >
+#        wait_for
+#        host={{ ansible_ssh_host | default(inventory_hostname) }}
+#        port={{ item }}
+#        timeout=30
+#      with_items:
+#        - "{{ osd_ports.stdout_lines }}"
+
+    - name: Waiting for clean PGs...
+      shell: >
+        test "[""$(ceph -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')""]" = "$(ceph -s -f json | python -c 'import sys, json; print [ i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if i["state_name"] == "active+clean"]')"
+      register: result
+      until: result.rc == 0
+      retries: 10
+      delay: 10
+      delegate_to: "{{ item }}"
+      with_items: "{{ groups.backup[0] }}"
+      when: migration_completed.stat.exists == False
+
+    - name: Done moving to the next OSD
+      file: >
+        path=/var/lib/ceph/migration_completed
+        state=touch
+        owner=root
+        group=root
+        mode=0600
+      when: migration_completed.stat.exists == False
+
+    - name: Unset the noout flag
+      command: ceph osd unset noout
+      delegate_to: "{{ item }}"
+      with_items: "{{ groups[mon_group_name][0] }}"
+      when: migration_completed.stat.exists == False
+
+- hosts: rgws
+  serial: 1
+  sudo: True
+
+  vars:
+    backup_dir: /tmp/
+
+  tasks:
+    - name: Check if the node has be migrated already
+      stat: >
+        path=/var/lib/ceph/radosgw/migration_completed
+      register: migration_completed
+      failed_when: false
+
+    - name: Check for failed run
+      stat: >
+        path=/var/lib/ceph/{{ ansible_hostname }}.tar
+      register: rgw_archive_leftover
+
+    - fail: msg="Looks like an archive is already there, please remove it!"
+      when: migration_completed.stat.exists == False and rgw_archive_leftover.stat.exists == True
+
+    - name: Archive rados gateway configs
+      shell: >
+        tar -cpvzf - --one-file-system . /etc/ceph/* | cat > {{ ansible_hostname }}.tar
+        chdir=/var/lib/ceph/
+        creates={{ ansible_hostname }}.tar
+      when: migration_completed.stat.exists == False
+
+    - name: Create backup directory
+      file: >
+        path={{ backup_dir }}/rgws-backups
+        state=directory
+        owner=root
+        group=root
+        mode=0644
+      delegate_to: "{{ item }}"
+      with_items: "{{ groups.backup[0] }}"
+      when: migration_completed.stat.exists == False
+
+    - name: Scp RGWs dirs and configs
+      fetch: >
+        src=/var/lib/ceph/{{ ansible_hostname }}.tar
+        dest={{ backup_dir }}/rgws-backups/
+        flat=yes
+      when: migration_completed.stat.exists == False
+
+    - name: Gracefully stop the rados gateway
+      service: >
+        name={{ item }}
+        state=stopped
+      with_items:
+        - radosgw
+      when: migration_completed.stat.exists == False
+
+    # wait_for checks 'path' on the machine where the module runs, so this
+    # must execute on the gateway itself rather than through local_action.
+    - name: Wait for radosgw to be down
+      wait_for: >
+        path=/tmp/radosgw.sock
+        state=absent
+        timeout=30
+      when: migration_completed.stat.exists == False
+
+    - name: Reboot the server
+      command: reboot
+      when: migration_completed.stat.exists == False
+
+    # Runs on the control machine, so the rebooted node must be named
+    # explicitly; without host= wait_for polls the local machine instead.
+    - name: Wait for the server to come up
+      local_action: >
+        wait_for
+        host={{ ansible_ssh_host | default(inventory_hostname) }}
+        port=22
+        delay=10
+        timeout=3600
+      when: migration_completed.stat.exists == False
+
+    - name: Wait a bit to be sure that the server is ready for scp
+      pause: seconds=20
+      when: migration_completed.stat.exists == False
+
+    # NOTE (leseb): 'creates' was added in Ansible 1.6
+    # Pushes the archive saved under rgws-backups back to the node and
+    # extracts it into /var/lib/ceph/.
+    - name: Copy and unarchive the RGW configs
+      unarchive: >
+        src={{ backup_dir }}/rgws-backups/{{ ansible_hostname }}.tar
+        dest=/var/lib/ceph/
+        copy=yes
+        mode=0600
+        creates=etc/ceph/ceph.conf
+      when: migration_completed.stat.exists == False
+
+    - name: Copy keys and configs
+      shell: >
+        {{ item }}
+        chdir=/var/lib/ceph/
+      with_items:
+        - cp etc/ceph/* /etc/ceph/
+      when: migration_completed.stat.exists == False
+
+    - name: Start rados gateway
+      service: >
+        name={{ item }}
+        state=started
+      with_items:
+        - radosgw
+      when: migration_completed.stat.exists == False
+
+    # wait_for checks 'path' on the machine where the module runs, so this
+    # must execute on the gateway itself rather than through local_action.
+    - name: Wait for radosgw to be up again
+      wait_for: >
+        path=/tmp/radosgw.sock
+        state=present
+        timeout=30
+      when: migration_completed.stat.exists == False
+
+    - name: Done moving to the next rados gateway
+      file: >
+        path=/var/lib/ceph/radosgw/migration_completed
+        state=touch
+        owner=root
+        group=root
+        mode=0600
+      when: migration_completed.stat.exists == False
diff --git a/infrastructure-playbooks/untested-by-ci/make-osd-partitions.yml b/infrastructure-playbooks/untested-by-ci/make-osd-partitions.yml

new file mode 100644 (file)

index 0000000..0fc6892
--- /dev/null
+++ b/infrastructure-playbooks/untested-by-ci/make-osd-partitions.yml
@@ -0,0 +1,99 @@
+---
+# This playbook will create a custom partition layout on your OSD hosts.
+# You should define the `devices` variable for every host.
+#
+# For example, in host_vars/hostname1
+#
+# devices:
+# - device_name: sdb
+#   partitions:
+#   - index: 1
+#     size: 10G
+#     type: data
+#   - index: 2
+#     size: 5G
+#     type: journal
+# - device_name: sdc
+#   partitions:
+#   - index: 1
+#     size: 10G
+#     type: data
+#   - index: 2
+#     size: 5G
+#     type: journal
+#
+- vars:
+    osd_group_name: osds
+    journal_typecode: 45b0969e-9b03-4f30-b4c6-b4b80ceff106
+    data_typecode: 4fbd7e29-9d25-41b8-afd0-062c0ceff05d
+    devices: []
+  hosts:
+  - "{{ osd_group_name }}"
+
+  tasks:
+
+  - name: load a variable file for devices partition
+    include_vars: "{{ item }}"
+    with_first_found:
+      - files:
+          - "host_vars/{{ ansible_hostname }}.yml"
+          - "host_vars/default.yml"
+        skip: true
+
+  - name: exit playbook, if devices not defined
+    fail:
+      msg: "devices must be define in host_vars/default.yml or host_vars/{{ ansible_hostname }}.yml"
+    when: devices is not defined
+
+  - name: install sgdisk(gdisk)
+    package:
+      name: gdisk
+      state: present
+
+  - name: erase all previous partitions(dangerous!!!)
+    shell: sgdisk --zap-all -- /dev/{{item.device_name}}
+    with_items: "{{ devices }}"
+
+  - name: make osd partitions
+    shell: >
+           sgdisk --new={{item.1.index}}:0:+{{item.1.size}} "--change-name={{item.1.index}}:ceph {{item.1.type}}"
+           "--typecode={{item.1.index}}:{% if item.1.type=='data' %}{{data_typecode}}{% else %}{{journal_typecode}}{% endif %}"
+           --mbrtogpt -- /dev/{{item.0.device_name}}
+    with_subelements:
+    - "{{ devices }}"
+    - partitions
+
+  - set_fact:
+      owner: 167
+      group: 167
+    when:
+      - ansible_os_family == "RedHat"
+  
+  - set_fact:
+      owner: 64045
+      group: 64045
+    when:
+      - ansible_os_family == "Debian"
+
+  # device_name holds the bare device name (e.g. sdb); '/dev/' is prepended
+  # when building paths, so the patterns below match the bare name.
+  # hd*/sd*/vd* disks: the partition number directly follows the device name.
+  - name: change partitions ownership
+    file:
+      path: "/dev/{{item.0.device_name}}{{item.1.index}}"
+      owner: "{{ owner | default('root')}}"
+      group: "{{ group | default('disk')}}"
+    with_subelements:
+      - "{{ devices }}"
+      - partitions
+    when:
+      item.0.device_name | match('([hsv]d[a-z]{1,2}){1,2}$')
+
+  # cciss and nvme devices: a 'p' separates the device name from the
+  # partition number.
+  - name: change partitions ownership
+    file:
+      path: "/dev/{{item.0.device_name}}p{{item.1.index}}"
+      owner: "{{ owner | default('root')}}"
+      group: "{{ group | default('disk')}}"
+    with_subelements:
+      - "{{ devices }}"
+      - partitions
+    when:
+      item.0.device_name | match('(cciss/c[0-9]d[0-9]|nvme[0-9]n[0-9]){1,2}$')
+...
+\ No newline at end of file
diff --git a/infrastructure-playbooks/untested-by-ci/migrate-journal-to-ssd.yml b/infrastructure-playbooks/untested-by-ci/migrate-journal-to-ssd.yml

new file mode 100644 (file)

index 0000000..44a75e0
--- /dev/null
+++ b/infrastructure-playbooks/untested-by-ci/migrate-journal-to-ssd.yml
@@ -0,0 +1,112 @@
+---
+# This playbook is used to migrate the journal(s) of active OSD(s) to an SSD.
+#
+# You should define the `osds_journal_devices` variable for each host whose OSD journal(s) are to be migrated.
+# 
+# For example in host_vars/hostname1.yml
+#
+# osds_journal_devices:
+# - device_name: /dev/sdd
+#   partitions:
+#   - index: 1
+#     size: 10G
+#     osd_id: 0
+#   - index: 2
+#     size: 10G
+#     osd_id: 1
+# - device_name: /dev/sdf
+#   partitions:       
+#   - index: 1        
+#     size: 10G       
+#     osd_id: 2       
+#
+# @param device_name: The full device path of new ssd.
+# @param partitions:  The custom partition layout of ssd.
+# @param index:  The index of this partition.
+# @param size:  The size of this partition.
+# @param osd_id: The id of the OSD whose journal goes on this partition.
+#
+# ansible-playbook migrate-journal-to-ssd.yml
+#    The playbook will migrate the OSD journal(s) to the SSD device(s) defined in host_vars.
+
+- vars:
+    osd_group_name: osds
+    journal_typecode: 45b0969e-9b03-4f30-b4c6-b4b80ceff106
+    osds_journal_devices: []
+  hosts:
+      - "{{ osd_group_name }}"
+  serial: 1
+  tasks:
+
+  - name: get osd(s) if directory stat
+    stat:
+      path: "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid"
+    register: osds_dir_stat
+    with_subelements:
+      - "{{ osds_journal_devices }}"
+      - partitions
+
+  # Abort when any of the journal_uuid files probed by the previous stat
+  # task does not exist. The loop list is templated explicitly, matching
+  # the other with_items loops in this playbook.
+  - name: exit playbook osd(s) is not on this host
+    fail:
+        msg: exit playbook osd(s) is not on this host
+    with_items: "{{ osds_dir_stat.results }}"
+    when:
+      -  osds_dir_stat is defined and item.stat.exists == false
+
+  - name: install sgdisk(gdisk)
+    package:
+      name: gdisk
+      state: present
+    when: osds_journal_devices is defined
+
+  - name: generate uuid for osds journal
+    command: uuidgen
+    register: osds
+    with_subelements:
+      - "{{ osds_journal_devices }}"
+      - partitions
+
+  - name: make osd partitions on ssd
+    shell: >
+      sgdisk --new={{item.item[1].index}}:0:+{{item.item[1].size}} "--change-name={{ item.item[1].index }}:ceph journal" 
+      --typecode={{ item.item[1].index }}:{{ journal_typecode }} 
+      --partition-guid={{ item.item[1].index }}:{{ item.stdout }} 
+      --mbrtogpt -- {{ item.item[0].device_name }}
+    with_items: 
+      - "{{ osds.results }}"
+
+  - name: stop osd(s) service
+    service:
+      name: "ceph-osd@{{ item.item[1].osd_id }}"
+      state: stopped
+    with_items:
+      - "{{ osds.results }}"
+
+  - name: flush osd(s) journal
+    command: ceph-osd -i {{ item.item[1].osd_id }} --flush-journal --cluster {{ cluster }} 
+    with_items:
+      - "{{ osds.results }}"
+    when: osds_journal_devices is defined
+
+  - name: update osd(s) journal soft link
+    command: ln -sf /dev/disk/by-partuuid/{{ item.stdout }} /var/lib/ceph/osd/{{ cluster }}-{{ item.item[1].osd_id }}/journal
+    with_items:
+      - "{{ osds.results }}"
+
+  # Needs the shell module: the '>' redirection is not interpreted by
+  # the command module, which would pass it to echo as a plain argument.
+  - name: update osd(s) journal uuid
+    shell: echo {{ item.stdout }} > /var/lib/ceph/osd/{{ cluster }}-{{ item.item[1].osd_id }}/journal_uuid
+    with_items:
+      - "{{ osds.results }}"
+
+  - name: initialize osd(s) new journal 
+    command: ceph-osd -i {{ item.item[1].osd_id }} --mkjournal --cluster {{ cluster }}
+    with_items:
+      - "{{ osds.results }}"
+
+  - name: start osd(s) service
+    service:
+      name: "ceph-osd@{{ item.item[1].osd_id }}"
+      state: started
+    with_items:
+      - "{{ osds.results }}"
diff --git a/infrastructure-playbooks/untested-by-ci/purge-multisite.yml b/infrastructure-playbooks/untested-by-ci/purge-multisite.yml

new file mode 100644 (file)

index 0000000..8b78553
--- /dev/null
+++ b/infrastructure-playbooks/untested-by-ci/purge-multisite.yml
@@ -0,0 +1,11 @@
+---
+# Nukes a multisite config
+- hosts: rgws
+  become: True
+  tasks:
+  - include: roles/ceph-rgw/tasks/multisite/destroy.yml
+
+  handlers:
+  - include: roles/ceph-rgw/handlers/main.yml
+    # Ansible 2.1.0 bug will ignore included handlers without this
+    static: True
diff --git a/infrastructure-playbooks/untested-by-ci/recover-osds-after-ssd-journal-failure.yml b/infrastructure-playbooks/untested-by-ci/recover-osds-after-ssd-journal-failure.yml

new file mode 100644 (file)

index 0000000..de3b6e8
--- /dev/null
+++ b/infrastructure-playbooks/untested-by-ci/recover-osds-after-ssd-journal-failure.yml
@@ -0,0 +1,117 @@
+---
+# This playbook is used to recover Ceph OSDs after an SSD journal failure.
+# You will also realise that it’s really simple to bring your 
+# OSDs back to life after replacing your faulty SSD with a new one.
+#
+# You should define the `dev_ssds` variable for each host whose SSDs were
+# replaced after the failure.
+#
+# For example in host_vars/hostname1.yml
+#
+# dev_ssds:
+# - device_name: /dev/sdd
+#   partitions:
+#   - index: 1
+#     size: 10G
+#     osd_id: 0
+#   - index: 2
+#     size: 10G
+#     osd_id: 1
+# - device_name: /dev/sdf
+#   partitions:       
+#   - index: 1        
+#     size: 10G       
+#     osd_id: 2       
+#
+# @param device_name: The full device path of new ssd
+# @param partitions:  The custom partition layout of new ssd
+# @param index:  The index of this partition
+# @param size:  The size of this partition
+# @param osd_id: The id of the OSD whose journal is stored on this partition.
+#
+# ansible-playbook recover-osds-after-ssd-journal-failure.yml
+#     Prompts for the host to recover; the default is empty, which
+#     selects no host. Enter the hostname on which the OSDs must be
+#     recovered after the ssd journal failure.
+#
+# ansible-playbook -e target_host=hostname \
+#     recover-osds-after-ssd-journal-failure.yml
+#     Overrides the prompt using -e option. Can be used in
+#     automation scripts to avoid interactive prompt.
+
+- hosts: localhost
+  gather_facts: false
+  vars_prompt:
+    - name: target_host
+      prompt: please enter the target hostname which to recover osds after ssd journal failure
+      private: false  # echo the typed hostname
+  tasks:
+    - add_host:  # put the chosen host in the group targeted by the next play
+        name: "{{ target_host }}"
+        groups: dynamically_created_hosts
+
+# Second play: rebuild the journal partitions on the replacement SSD(s) of the
+# selected host and restart every OSD listed in `dev_ssds` on its new journal.
+- hosts: dynamically_created_hosts
+  vars:
+    journal_typecode: "45b0969e-9b03-4f30-b4c6-b4b80ceff106"
+
+  tasks:
+    # No play-level default for `dev_ssds`: play vars outrank host_vars, so a
+    # default of [] here would always mask the per-host definition.
+    - fail:
+        msg: "please define dev_ssds variable"
+      when: dev_ssds | default([]) | length == 0
+
+    - name: get osd(s) if directory stat
+      stat:
+        path: "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid"
+      register: osds_dir_stat
+      with_subelements:
+        - "{{ dev_ssds }}"
+        - partitions
+
+    # Loop source is templated: with_items does not expand a bare variable name.
+    - name: exit playbook osd(s) is not on this host
+      fail:
+        msg: exit playbook osds is not on this host
+      with_items: "{{ osds_dir_stat.results }}"
+      when: not item.stat.exists
+
+    - name: install sgdisk(gdisk)
+      package:
+        name: gdisk
+        state: present
+
+    - name: get osd(s) journal uuid
+      command: cat "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid"
+      register: osds_uuid
+      with_subelements:
+        - "{{ dev_ssds }}"
+        - partitions
+
+    # Each new partition reuses the GUID read from the OSD's journal_uuid file.
+    - name: make partitions on new ssd
+      shell: >
+        sgdisk --new={{item.item[1].index}}:0:+{{item.item[1].size}} "--change-name={{ item.item[1].index }}:ceph journal"
+        --typecode={{ item.item[1].index }}:{{ journal_typecode }}
+        --partition-guid={{ item.item[1].index }}:{{ item.stdout }}
+        --mbrtogpt -- {{ item.item[0].device_name }}
+      with_items: "{{ osds_uuid.results }}"
+
+    # Stop each OSD before its journal is re-created, then start it again.
+    - name: stop osd(s) service
+      service:
+        name: "ceph-osd@{{ item.item[1].osd_id }}"
+        state: stopped
+      with_items: "{{ osds_uuid.results }}"
+
+    - name: reinitialize osd(s) journal in new ssd
+      command: ceph-osd -i {{ item.item[1].osd_id }} --mkjournal --cluster {{ cluster }}
+      with_items: "{{ osds_uuid.results }}"
+
+    - name: start osd(s) service
+      service:
+        name: "ceph-osd@{{ item.item[1].osd_id }}"
+        state: started
+      with_items: "{{ osds_uuid.results }}"
author	Sébastien Han <seb@redhat.com>
	Wed, 30 Aug 2017 21:30:49 +0000 (23:30 +0200)
committer	Sébastien Han <seb@redhat.com>
	Fri, 1 Sep 2017 17:58:24 +0000 (19:58 +0200)
ceph-ansible.spec.in		patch \| blob \| history
infrastructure-playbooks/cluster-maintenance.yml	[deleted file]	patch \| blob \| history
infrastructure-playbooks/cluster-os-migration.yml	[deleted file]	patch \| blob \| history
infrastructure-playbooks/make-osd-partitions.yml	[deleted file]	patch \| blob \| history
infrastructure-playbooks/migrate-journal-to-ssd.yml	[deleted file]	patch \| blob \| history
infrastructure-playbooks/purge-multisite.yml	[deleted file]	patch \| blob \| history
infrastructure-playbooks/recover-osds-after-ssd-journal-failure.yml	[deleted file]	patch \| blob \| history
infrastructure-playbooks/untested-by-ci/cluster-maintenance.yml	[new file with mode: 0644]	patch \| blob
infrastructure-playbooks/untested-by-ci/cluster-os-migration.yml	[new file with mode: 0644]	patch \| blob
infrastructure-playbooks/untested-by-ci/make-osd-partitions.yml	[new file with mode: 0644]	patch \| blob
infrastructure-playbooks/untested-by-ci/migrate-journal-to-ssd.yml	[new file with mode: 0644]	patch \| blob
infrastructure-playbooks/untested-by-ci/purge-multisite.yml	[new file with mode: 0644]	patch \| blob
infrastructure-playbooks/untested-by-ci/recover-osds-after-ssd-journal-failure.yml	[new file with mode: 0644]	patch \| blob