From ec648981e6739eef90ee473c401226ccce4416ec Mon Sep 17 00:00:00 2001
From: Dimitri Savineau
Date: Mon, 5 Jul 2021 14:07:05 -0400
Subject: [PATCH] infra: add playbook to purge dashboard/monitoring

The dashboard/monitoring stack can be deployed via the dashboard_enabled
variable. But there's nothing similar if we want to remove that part only
and keep the ceph cluster up and running.

The current purge playbooks remove everything.

Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1786691

Signed-off-by: Dimitri Savineau
(cherry picked from commit 8e4ef7d6da5bc73d47be93281ecae2b3c6fa826f)
---
 infrastructure-playbooks/purge-dashboard.yml | 204 +++++++++++++++++++
 tox.ini                                      |  23 ++-
 2 files changed, 226 insertions(+), 1 deletion(-)
 create mode 100644 infrastructure-playbooks/purge-dashboard.yml

diff --git a/infrastructure-playbooks/purge-dashboard.yml b/infrastructure-playbooks/purge-dashboard.yml
new file mode 100644
index 000000000..c083aeacc
--- /dev/null
+++ b/infrastructure-playbooks/purge-dashboard.yml
@@ -0,0 +1,204 @@
+---
+# This playbook purges the Ceph MGR Dashboard and Monitoring
+# (alertmanager/prometheus/grafana/node-exporter) stack.
+# It removes: packages, configuration files and ALL THE DATA
+#
+# Use it like this:
+# ansible-playbook purge-dashboard.yml
+#     Prompts for confirmation to purge, defaults to no and
+#     doesn't purge anything. yes purges the dashboard and
+#     monitoring stack.
+#
+# ansible-playbook -e ireallymeanit=yes|no purge-dashboard.yml
+#     Overrides the prompt using -e option. Can be used in
+#     automation scripts to avoid interactive prompt.
+
+- name: confirm whether user really meant to purge the dashboard
+  hosts: localhost
+  gather_facts: false
+  vars_prompt:
+    - name: ireallymeanit
+      prompt: Are you sure you want to purge the dashboard?
+      default: 'no'
+      private: no
+  tasks:
+    - name: exit playbook, if user did not mean to purge dashboard
+      fail:
+        msg: >
+          "Exiting purge-dashboard playbook, dashboard was NOT purged.
+           To purge the dashboard, either say 'yes' on the prompt
+           or use `-e ireallymeanit=yes` on the command line when
+           invoking the playbook"
+      when: ireallymeanit != 'yes'
+
+- name: gather facts on all hosts
+  hosts:
+    - "{{ mon_group_name|default('mons') }}"
+    - "{{ osd_group_name|default('osds') }}"
+    - "{{ mds_group_name|default('mdss') }}"
+    - "{{ rgw_group_name|default('rgws') }}"
+    - "{{ rbdmirror_group_name|default('rbdmirrors') }}"
+    - "{{ nfs_group_name|default('nfss') }}"
+    - "{{ client_group_name|default('clients') }}"
+    - "{{ mgr_group_name|default('mgrs') }}"
+    - "{{ monitoring_group_name | default('monitoring') }}"
+  become: true
+  tasks:
+    - debug: msg="gather facts on all Ceph hosts for following reference"
+
+- name: purge node exporter
+  hosts:
+    - "{{ mon_group_name|default('mons') }}"
+    - "{{ osd_group_name|default('osds') }}"
+    - "{{ mds_group_name|default('mdss') }}"
+    - "{{ rgw_group_name|default('rgws') }}"
+    - "{{ rbdmirror_group_name|default('rbdmirrors') }}"
+    - "{{ nfs_group_name|default('nfss') }}"
+    - "{{ client_group_name|default('clients') }}"
+    - "{{ mgr_group_name|default('mgrs') }}"
+    - "{{ monitoring_group_name | default('monitoring') }}"
+  gather_facts: false
+  become: true
+  tasks:
+    - import_role:
+        name: ceph-defaults
+
+    - import_role:
+        name: ceph-facts
+        tasks_from: container_binary
+
+    - name: disable node_exporter service
+      service:
+        name: node_exporter
+        state: stopped
+        enabled: no
+      failed_when: false
+
+    - name: remove node_exporter service file
+      file:
+        name: /etc/systemd/system/node_exporter.service
+        state: absent
+
+    - name: remove node-exporter image
+      command: "{{ container_binary }} rmi {{ node_exporter_container_image }}"
+      changed_when: false
+      failed_when: false
+
+- name: purge ceph monitoring
+  hosts: "{{ monitoring_group_name | default('monitoring') }}"
+  gather_facts: false
+  become: true
+  tasks:
+    - import_role:
+        name: ceph-defaults
+
+    - import_role:
+        name: ceph-facts
+        tasks_from: container_binary
+
+    - name: stop services
+      service:
+        name: "{{ item }}"
+        state: stopped
+        enabled: no
+      failed_when: false
+      loop:
+        - alertmanager
+        - prometheus
+        - grafana-server
+
+    - name: remove systemd service files
+      file:
+        name: "/etc/systemd/system/{{ item }}.service"
+        state: absent
+      loop:
+        - alertmanager
+        - prometheus
+        - grafana-server
+
+    - name: remove ceph dashboard container images
+      command: "{{ container_binary }} rmi {{ item }}"
+      loop:
+        - "{{ alertmanager_container_image }}"
+        - "{{ prometheus_container_image }}"
+        - "{{ grafana_container_image }}"
+      changed_when: false
+      failed_when: false
+
+    - name: remove ceph-grafana-dashboards package on RedHat or SUSE
+      package:
+        name: ceph-grafana-dashboards
+        state: absent
+      when:
+        - not containerized_deployment | bool
+        - ansible_facts['os_family'] in ['RedHat', 'Suse']
+
+    - name: remove data
+      file:
+        name: "{{ item }}"
+        state: absent
+      loop:
+        - "{{ alertmanager_conf_dir }}"
+        - "{{ prometheus_conf_dir }}"
+        - /etc/grafana
+        - "{{ alertmanager_data_dir }}"
+        - "{{ prometheus_data_dir }}"
+        - /var/lib/grafana
+
+- name: purge ceph dashboard
+  hosts: "{{ groups[mgr_group_name] | default(groups[mon_group_name]) | default(omit) }}"
+  gather_facts: false
+  become: true
+  environment:
+    CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}"
+    CEPH_CONTAINER_BINARY: "{{ container_binary }}"
+  tasks:
+    - import_role:
+        name: ceph-defaults
+
+    - import_role:
+        name: ceph-facts
+        tasks_from: container_binary
+
+    - name: remove the dashboard admin user
+      ceph_dashboard_user:
+        name: "{{ dashboard_admin_user }}"
+        cluster: "{{ cluster }}"
+        state: absent
+      run_once: true
+      delegate_to: "{{ groups[mon_group_name][0] }}"
+
+    - name: remove radosgw system user
+      radosgw_user:
+        name: "{{ dashboard_rgw_api_user_id }}"
+        cluster: "{{ cluster }}"
+        state: absent
+      run_once: true
+      delegate_to: "{{ groups[mon_group_name][0] }}"
+      when: groups.get(rgw_group_name, []) | length > 0
+
+    - name: disable mgr dashboard and prometheus modules
+      ceph_mgr_module:
+        name: "{{ item }}"
+        cluster: "{{ cluster }}"
+        state: disable
+      run_once: true
+      delegate_to: "{{ groups[mon_group_name][0] }}"
+      loop:
+        - dashboard
+        - prometheus
+
+    - name: remove TLS certificate and key files
+      file:
+        name: "/etc/ceph/ceph-dashboard.{{ item }}"
+        state: absent
+      loop:
+        - crt
+        - key
+      when: dashboard_protocol == "https"
+
+    - name: remove ceph-mgr-dashboard package
+      package:
+        name: ceph-mgr-dashboard
+        state: absent
+      when: not containerized_deployment | bool
diff --git a/tox.ini b/tox.ini
index eab18d582..3fe44e185 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = centos-{container,non_container}-{all_daemons,collocation,lvm_osds,shrink_mon,shrink_mgr,shrink_mds,shrink_rbdmirror,shrink_rgw,lvm_batch,add_mons,add_mgrs,add_mdss,add_rbdmirrors,add_rgws,rgw_multisite,purge,storage_inventory,lvm_auto_discovery,all_in_one,cephadm_adopt}
+envlist = centos-{container,non_container}-{all_daemons,collocation,lvm_osds,shrink_mon,shrink_mgr,shrink_mds,shrink_rbdmirror,shrink_rgw,lvm_batch,add_mons,add_mgrs,add_mdss,add_rbdmirrors,add_rgws,rgw_multisite,purge,storage_inventory,lvm_auto_discovery,all_in_one,cephadm_adopt,purge_dashboard}
   centos-non_container-{switch_to_containers}
   infra_lv_create
   migrate_ceph_disk_to_ceph_volume
@@ -69,6 +69,25 @@ commands=
   # test that the cluster can be redeployed in a healthy state
   py.test --reruns 5 --reruns-delay 1 -n 8 --durations=0 --sudo -v --connection=ansible --ansible-inventory={changedir}/{env:INVENTORY} --ssh-config={changedir}/vagrant_ssh_config {toxinidir}/tests/functional/tests
 
+[purge-dashboard]
+commands=
+  ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/infrastructure-playbooks/purge-dashboard.yml --extra-vars "\
+      ireallymeanit=yes \
+      ceph_docker_registry={env:CEPH_DOCKER_REGISTRY:quay.ceph.io} \
+      ceph_docker_image={env:CEPH_DOCKER_IMAGE:ceph-ci/daemon} \
+      ceph_docker_image_tag={env:CEPH_DOCKER_IMAGE_TAG:latest-pacific} \
+  "
+
+  # set up the cluster again
+  ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/{env:PLAYBOOK:site.yml.sample} --extra-vars @ceph-override.json --extra-vars "\
+      ceph_stable_release={env:CEPH_STABLE_RELEASE:pacific} \
+      ceph_docker_registry_auth=True \
+      ceph_docker_registry_username={env:DOCKER_HUB_USERNAME} \
+      ceph_docker_registry_password={env:DOCKER_HUB_PASSWORD} \
+  "
+  # test that the cluster can be redeployed in a healthy state
+  py.test --reruns 5 --reruns-delay 1 -n 8 --durations=0 --sudo -v --connection=ansible --ansible-inventory={changedir}/{env:INVENTORY} --ssh-config={changedir}/vagrant_ssh_config {toxinidir}/tests/functional/tests
+
 [purge-lvm]
 commands=
   ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/infrastructure-playbooks/{env:PURGE_PLAYBOOK:purge-cluster.yml} --extra-vars "\
@@ -306,6 +325,7 @@ changedir=
   # tests a 1 mon, 1 osd, 1 mds and 1 rgw centos7 cluster using docker
   collocation: {toxinidir}/tests/functional/collocation{env:CONTAINER_DIR:}
   purge: {toxinidir}/tests/functional/all_daemons{env:CONTAINER_DIR:}
+  purge_dashboard: {toxinidir}/tests/functional/all_daemons{env:CONTAINER_DIR:}
   switch_to_containers: {toxinidir}/tests/functional/all_daemons
   lvm_osds: {toxinidir}/tests/functional/lvm-osds{env:CONTAINER_DIR:}
   lvm_batch: {toxinidir}/tests/functional/lvm-batch{env:CONTAINER_DIR:}
@@ -355,6 +375,7 @@ commands=
   all_daemons,all_in_one,collocation: ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/{env:PLAYBOOK:site.yml.sample} --extra-vars "delegate_facts_host={env:DELEGATE_FACTS_HOST:True} ceph_docker_image_tag={env:CEPH_DOCKER_IMAGE_TAG_BIS:latest-bis-pacific} ceph_stable_release={env:CEPH_STABLE_RELEASE:pacific}" --extra-vars @ceph-override.json
 
   purge: {[purge]commands}
+  purge_dashboard: {[purge-dashboard]commands}
   switch_to_containers: {[switch-to-containers]commands}
   shrink_mon: {[shrink-mon]commands}
   shrink_osd: {[shrink-osd]commands}
-- 
2.47.3
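
Example usage (illustrative sketch, not part of the patch): with the playbook in place, the dashboard/monitoring stack can be purged without touching the rest of the cluster, and redeployed later via the dashboard_enabled variable mentioned in the commit message. The inventory file name "hosts" below is a placeholder.

  # purge only the dashboard/monitoring stack, skipping the interactive prompt
  ansible-playbook -i hosts infrastructure-playbooks/purge-dashboard.yml -e ireallymeanit=yes

  # redeploy it later by re-running the site playbook with the dashboard enabled
  ansible-playbook -i hosts site.yml.sample -e dashboard_enabled=true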