From 6bac61361194154971370803119983b3e7c23f39 Mon Sep 17 00:00:00 2001
From: =?utf8?q?S=C3=A9bastien=20Han?=
Date: Mon, 18 Sep 2017 17:45:08 +0200
Subject: [PATCH] shrink: support for container
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

We can now shrink mons and osds on containerized deployments.

Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1492115

Signed-off-by: Sébastien Han
---
 infrastructure-playbooks/shrink-mon.yml | 21 +++++---
 infrastructure-playbooks/shrink-osd.yml | 65 ++++++++++++++++++++-----
 tox.ini                                 | 11 ++++-
 3 files changed, 75 insertions(+), 22 deletions(-)

diff --git a/infrastructure-playbooks/shrink-mon.yml b/infrastructure-playbooks/shrink-mon.yml
index 365f5b2b2..8e43748d5 100644
--- a/infrastructure-playbooks/shrink-mon.yml
+++ b/infrastructure-playbooks/shrink-mon.yml
@@ -75,20 +75,27 @@
 
   post_tasks:
     - name: pick a monitor different than the one we want to remove
-      set_fact: mon_host={{ item }}
+      set_fact:
+        mon_host: "{{ item }}"
       with_items: "{{ groups[mon_group_name] }}"
       when:
         - item != mon_to_kill
 
+    - name: set_fact docker_exec_cmd build docker exec command (containerized)
+      set_fact:
+        docker_exec_cmd: "docker exec ceph-mon-{{ hostvars[mon_host]['ansible_hostname'] }}"
+      when: containerized_deployment
+
     - name: exit playbook, if can not connect to the cluster
-      command: timeout 5 ceph --cluster {{ cluster }} health
+      command: "{{ docker_exec_cmd }} timeout 5 ceph --cluster {{ cluster }} health"
       register: ceph_health
       until: ceph_health.stdout.find("HEALTH") > -1
       delegate_to: "{{ mon_host }}"
       retries: 5
       delay: 2
 
-    - set_fact:
+    - name: set_fact mon_to_kill_hostname
+      set_fact:
         mon_to_kill_hostname: "{{ hostvars[mon_to_kill]['ansible_hostname'] }}"
 
     - name: stop monitor service(s)
@@ -106,7 +113,7 @@
       delegate_to: "{{ mon_to_kill }}"
 
     - name: remove monitor from the quorum
-      command: ceph --cluster {{ cluster }} mon remove {{ mon_to_kill_hostname }}
+      command: "{{ docker_exec_cmd }} ceph --cluster {{ cluster }} mon remove {{ mon_to_kill_hostname }}"
       failed_when: false
       delegate_to: "{{ mon_host }}"
 
@@ -116,7 +123,7 @@
     # 'sleep 5' is not that bad and should be sufficient
    - name: verify the monitor is out of the cluster
       shell: |
-        ceph --cluster {{ cluster }} -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["quorum_names"])'
+        {{ docker_exec_cmd }} ceph --cluster {{ cluster }} -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["quorum_names"])'
       delegate_to: "{{ mon_host }}"
       failed_when: false
       register: result
@@ -140,9 +147,9 @@
         - mon_to_kill_hostname in result.stdout
 
     - name: show ceph health
-      command: ceph --cluster {{ cluster }} -s
+      command: "{{ docker_exec_cmd }} ceph --cluster {{ cluster }} -s"
       delegate_to: "{{ mon_host }}"
 
     - name: show ceph mon status
-      command: ceph --cluster {{ cluster }} mon stat
+      command: "{{ docker_exec_cmd }} ceph --cluster {{ cluster }} mon stat"
       delegate_to: "{{ mon_host }}"
diff --git a/infrastructure-playbooks/shrink-osd.yml b/infrastructure-playbooks/shrink-osd.yml
index cc014df4c..842c79cfd 100644
--- a/infrastructure-playbooks/shrink-osd.yml
+++ b/infrastructure-playbooks/shrink-osd.yml
@@ -60,8 +60,13 @@
 
   post_tasks:
 
+    - name: set_fact docker_exec_cmd build docker exec command (containerized)
+      set_fact:
+        docker_exec_cmd: "docker exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_hostname'] }}"
+      when: containerized_deployment
+
     - name: exit playbook, if can not connect to the cluster
-      command: timeout 5 ceph --cluster {{ cluster }} health
+      command: "{{ docker_exec_cmd }} timeout 5 ceph --cluster {{ cluster }} health"
       register: ceph_health
       until: ceph_health.stdout.find("HEALTH") > -1
       delegate_to: "{{ groups[mon_group_name][0] }}"
@@ -69,12 +74,13 @@
       delay: 2
 
     - name: find the host(s) where the osd(s) is/are running on
-      command: ceph --cluster {{ cluster }} osd find {{ item }}
+      command: "{{ docker_exec_cmd }} ceph --cluster {{ cluster }} osd find {{ item }}"
       with_items: "{{ osd_to_kill.split(',') }}"
       delegate_to: "{{ groups[mon_group_name][0] }}"
       register: find_osd_hosts
 
-    - set_fact:
+    - name: set_fact osd_hosts
+      set_fact:
         osd_hosts: "{{ osd_hosts | default([]) + [ (item.stdout | from_json).crush_location.host ] }}"
       with_items: "{{ find_osd_hosts.results }}"
 
@@ -86,12 +92,39 @@
       delegate_to: "{{ item }}"
       failed_when: false
 
-    - fail:
+    - name: fail when admin key is not present
+      fail:
         msg: "The Ceph admin key is not present on the OSD node, please add it and remove it after the playbook is done."
       with_items: "{{ ceph_admin_key.results }}"
       when:
         - item.stat.exists == false
 
+    # NOTE(leseb): using '>' is the only way I could have the command working
+    - name: find osd device based on the id
+      shell: >
+        docker run --privileged=true -v /dev:/dev --entrypoint /usr/sbin/ceph-disk
+        {{ ceph_docker_registry}}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }}
+        list | awk -v pattern=osd.{{ item.0 }} '$0 ~ pattern {print $1}'
+      with_together:
+        - "{{ osd_to_kill.split(',') }}"
+        - "{{ osd_hosts }}"
+      register: osd_to_kill_disks
+      delegate_to: "{{ item.1 }}"
+      when:
+        - containerized_deployment
+
+    - name: stop osd services (container)
+      service:
+        name: "ceph-osd@{{ item.0.stdout[:-1] | regex_replace('/dev/', '') }}"
+        state: stopped
+        enabled: no
+      with_together:
+        - "{{ osd_to_kill_disks.results }}"
+        - "{{ osd_hosts }}"
+      delegate_to: "{{ item.1 }}"
+      when:
+        - containerized_deployment
+
     - name: deactivating osd(s)
       command: ceph-disk deactivate --cluster {{ cluster }} --deactivate-by-id {{ item.0 }} --mark-out
       register: deactivate
@@ -101,14 +134,18 @@
         - "{{ osd_to_kill.split(',') }}"
         - "{{ osd_hosts }}"
       delegate_to: "{{ item.1 }}"
+      when:
+        - not containerized_deployment
 
     - name: set osd(s) out when ceph-disk deactivating fail
-      command: ceph --cluster {{ cluster }} osd out osd.{{ item.0 }}
+      command: "{{ docker_exec_cmd }} ceph --cluster {{ cluster }} osd out osd.{{ item.0 }}"
       delegate_to: "{{ groups[mon_group_name][0] }}"
       with_together:
         - "{{ osd_to_kill.split(',') }}"
         - "{{ deactivate.results }}"
       when:
+        - not containerized_deployment
+        - not item.1.get("skipped")
         - item.1.stderr|length > 0
 
     - name: destroying osd(s)
@@ -120,39 +157,41 @@
         - "{{ osd_to_kill.split(',') }}"
         - "{{ osd_hosts }}"
       delegate_to: "{{ item.1 }}"
+      when:
+        - not containerized_deployment
 
     - name: remove osd(s) from crush_map when ceph-disk destroy fail
-      command: ceph --cluster {{ cluster }} osd crush remove osd.{{ item.0 }}
+      command: "{{ docker_exec_cmd }} ceph --cluster {{ cluster }} osd crush remove osd.{{ item.0 }}"
       run_once: true
       delegate_to: "{{ groups[mon_group_name][0] }}"
       with_together:
         - "{{ osd_to_kill.split(',') }}"
         - "{{ destroy.results }}"
       when:
-        - item.1.stderr|length > 0
+        - (item.1.get("skipped") or item.1.stderr|length > 0)
 
     - name: delete osd(s) auth key when ceph-disk destroy fail
-      command: ceph --cluster {{ cluster }} auth del osd.{{ item.0 }}
+      command: "{{ docker_exec_cmd }} ceph --cluster {{ cluster }} auth del osd.{{ item.0 }}"
       delegate_to: "{{ groups[mon_group_name][0] }}"
       with_together:
         - "{{ osd_to_kill.split(',') }}"
         - "{{ destroy.results }}"
       when:
-        - item.1.stderr|length > 0
+        - (item.1.get("skipped") or item.1.stderr|length > 0)
 
     - name: deallocate osd(s) id when ceph-disk destroy fail
-      command: ceph --cluster {{ cluster }} osd rm {{ item.0 }}
+      command: "{{ docker_exec_cmd }} ceph --cluster {{ cluster }} osd rm {{ item.0 }}"
       delegate_to: "{{ groups[mon_group_name][0] }}"
       with_together:
         - "{{ osd_to_kill.split(',') }}"
         - "{{ destroy.results }}"
       when:
-        - item.1.stderr|length > 0
+        - (item.1.get("skipped") or item.1.stderr|length > 0)
 
     - name: show ceph health
-      command: ceph --cluster {{ cluster }} -s
+      command: "{{ docker_exec_cmd }} ceph --cluster {{ cluster }} -s"
       delegate_to: "{{ groups[mon_group_name][0] }}"
 
     - name: show ceph osd tree
-      command: ceph --cluster {{ cluster }} osd tree
+      command: "{{ docker_exec_cmd }} ceph --cluster {{ cluster }} osd tree"
       delegate_to: "{{ groups[mon_group_name][0] }}"
diff --git a/tox.ini b/tox.ini
index 58912e534..2143b2a8c 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,6 +1,6 @@
 [tox]
 envlist = {dev,jewel,luminous,rhcs}-{ansible2.2,ansible2.3}-{xenial_cluster,journal_collocation,centos7_cluster,dmcrypt_journal,dmcrypt_journal_collocation,docker_cluster,purge_cluster,purge_dmcrypt,docker_dedicated_journal,docker_dmcrypt_journal_collocation,update_dmcrypt,update_cluster,cluster,purge_docker_cluster,update_docker_cluster,switch_to_containers}
-          {dev,luminous}-{ansible2.2,ansible2.3}-{bluestore_journal_collocation,bluestore_cluster,bluestore_dmcrypt_journal,bluestore_dmcrypt_journal_collocation,bluestore_docker_dedicated_journal,bluestore_docker_dmcrypt_journal_collocation,lvm_osds,purge_lvm_osds,shrink_mon,shrink_osd,journal_collocation_auto,journal_collocation_auto_dmcrypt}
+          {dev,luminous}-{ansible2.2,ansible2.3}-{bluestore_journal_collocation,bluestore_cluster,bluestore_dmcrypt_journal,bluestore_dmcrypt_journal_collocation,bluestore_docker_dedicated_journal,bluestore_docker_dmcrypt_journal_collocation,lvm_osds,purge_lvm_osds,shrink_mon,shrink_osd,journal_collocation_auto,journal_collocation_auto_dmcrypt,shrink_mon_container,shrink_osd_container}
 
 skipsdist = True
 
@@ -86,7 +86,7 @@ commands=
     cp {toxinidir}/infrastructure-playbooks/shrink-mon.yml {toxinidir}/shrink-mon.yml
     ansible-playbook -vv -i {changedir}/hosts {toxinidir}/shrink-mon.yml --extra-vars "\
       ireallymeanit=yes \
-      mon_to_kill=ceph-mon2 \
+      mon_to_kill={env:MON_TO_KILL:ceph-mon2} \
   "
 
 [shrink-osd]
 commands=
@@ -132,6 +132,9 @@ setenv=
   docker_dmcrypt_journal_collocation: PLAYBOOK = site-docker.yml.sample
   bluestore_docker_dedicated_journal: PLAYBOOK = site-docker.yml.sample
   bluestore_docker_dmcrypt_journal_collocation: PLAYBOOK = site-docker.yml.sample
+  shrink_mon_container: PLAYBOOK = site-docker.yml.sample
+  shrink_mon_container: MON_TO_KILL = mon2
+  shrink_osd_container: PLAYBOOK = site-docker.yml.sample
 
   rhcs: CEPH_STABLE_RELEASE = luminous
   jewel: CEPH_STABLE_RELEASE = jewel
@@ -162,7 +165,9 @@ changedir=
   # tests a 1 mon, 1 osd, 1 mds and 1 rgw centos7 cluster using non-collocated OSD scenario
   centos7_cluster: {toxinidir}/tests/functional/centos/7/cluster
   shrink_mon: {toxinidir}/tests/functional/centos/7/cluster
+  shrink_mon_container: {toxinidir}/tests/functional/centos/7/docker
   shrink_osd: {toxinidir}/tests/functional/centos/7/cluster
+  shrink_osd_container: {toxinidir}/tests/functional/centos/7/docker
   # an alias for centos7_cluster, this makes the name better suited for rhcs testing
   cluster: {toxinidir}/tests/functional/centos/7/cluster
   # tests a 1 mon, 1 osd, 1 mds and 1 rgw centos7 cluster using docker
@@ -220,6 +225,8 @@ commands=
   update_cluster: {[update]commands}
   update_docker_cluster: {[update]commands}
   shrink_mon: {[shrink-mon]commands}
+  shrink_mon_container: {[shrink-mon]commands}
   shrink_osd: {[shrink-osd]commands}
+  shrink_osd_container: {[shrink-osd]commands}
 
   vagrant destroy --force
-- 
2.47.3
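
Usage note: with this patch applied, the same shrink playbooks drive both bare-metal and containerized clusters. Below is a minimal sketch of manual invocations, modeled on the [shrink-mon] and [shrink-osd] tox commands above; the inventory path "hosts" and the target mon/osd names are illustrative assumptions, not values mandated by the patch.

    # Remove monitor "mon2" from a containerized cluster; ireallymeanit=yes
    # skips the interactive confirmation, as in the tox scenario above.
    ansible-playbook -vv -i hosts infrastructure-playbooks/shrink-mon.yml \
        --extra-vars "ireallymeanit=yes mon_to_kill=mon2"

    # Remove OSDs 0 and 1; osd_to_kill takes a comma-separated list of ids,
    # matching the osd_to_kill.split(',') loops in shrink-osd.yml.
    ansible-playbook -vv -i hosts infrastructure-playbooks/shrink-osd.yml \
        --extra-vars "ireallymeanit=yes osd_to_kill=0,1"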