shrink-mon: support updating ceph config file after mon removal (refs: shrink-mon-update-config, 7260/head)
author    Guillaume Abrioux <gabrioux@redhat.com>
Thu, 28 Jul 2022 12:15:38 +0000 (14:15 +0200)
committer Guillaume Abrioux <gabrioux@redhat.com>
Tue, 9 Aug 2022 05:20:53 +0000 (07:20 +0200)
This adds the possibility for users to opt in or out of removing the
monitor from the ceph config file on all nodes.

By default, the playbook won't update the ceph config file. If you want it
to be updated, pass the extra variable `-e shrink_mon_update_cfg=true`.

Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com>
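
For reference, an invocation enabling the config file update might look like
the following (inventory path and monitor name are illustrative; compare the
tox.ini scenario at the end of this diff):

  ansible-playbook -i hosts infrastructure-playbooks/shrink-mon.yml \
      -e ireallymeanit=yes \
      -e mon_to_kill=ceph-mon01 \
      -e shrink_mon_update_cfg=true
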
infrastructure-playbooks/shrink-mon.yml
roles/ceph-config/templates/ceph.conf.j2
roles/ceph-facts/tasks/set_monitor_address.yml
tox.ini

diff --git a/infrastructure-playbooks/shrink-mon.yml b/infrastructure-playbooks/shrink-mon.yml
index 05d6c2be543e1483cf028d85ec5a0e2ee864f623..a44fdc4d7cb8a4647879d05a09180225cb8351e1 100644
--- a/infrastructure-playbooks/shrink-mon.yml
+++ b/infrastructure-playbooks/shrink-mon.yml
@@ -22,7 +22,7 @@
     - debug: msg="gather facts on all Ceph hosts for following reference"
 
 - name: confirm whether user really meant to remove monitor from the ceph cluster
-  hosts: "{{ groups[mon_group_name][0] }}"
+  hosts: localhost
   become: true
   vars_prompt:
     - name: ireallymeanit
     mon_group_name: mons
 
   pre_tasks:
-    - name: exit playbook, if only one monitor is present in cluster
-      fail:
-        msg: "You are about to shrink the only monitor present in the cluster.
-              If you really want to do that, please use the purge-cluster playbook."
-      when: groups[mon_group_name] | length | int == 1
-
-    - name: exit playbook, if no monitor was given
-      fail:
-        msg: "mon_to_kill must be declared
-          Exiting shrink-cluster playbook, no monitor was removed.
-           On the command line when invoking the playbook, you can use
-           -e mon_to_kill=ceph-mon01 argument. You can only remove a single monitor each time the playbook runs."
-      when: mon_to_kill is not defined
-
-    - name: exit playbook, if the monitor is not part of the inventory
-      fail:
-        msg: "It seems that the host given is not part of your inventory, please make sure it is."
-      when: mon_to_kill not in groups[mon_group_name]
-
-    - name: exit playbook, if user did not mean to shrink cluster
-      fail:
-        msg: "Exiting shrink-mon playbook, no monitor was removed.
-           To shrink the cluster, either say 'yes' on the prompt or
-           or use `-e ireallymeanit=yes` on the command line when
-           invoking the playbook"
-      when: ireallymeanit != 'yes'
-
-    - import_role:
-        name: ceph-defaults
-
-    - import_role:
-        name: ceph-facts
-        tasks_from: container_binary
+    - name: get current monitor status
+      delegate_to: "{{ groups.get(mon_group_name)[0] }}"
+      block:
+        - import_role:
+            name: ceph-defaults
+
+        - import_role:
+            name: ceph-facts
+            tasks_from: container_binary
+
+        - name: "set_fact container_exec_cmd build {{ container_binary }} exec command (containerized)"
+          set_fact:
+            container_exec_cmd: "{{ container_binary }} exec ceph-mon-{{ hostvars[groups.get(mon_group_name)[0]]['ansible_facts']['hostname'] }}"
+          when: containerized_deployment | bool
+
+        - name: get current quorum status
+          command: "{{ container_exec_cmd }} ceph --cluster {{ cluster }} quorum_status -f json"
+          changed_when: false
+          failed_when: false
+          register: current_quorum_status
 
-  tasks:
     - name: pick a monitor different than the one we want to remove
       set_fact:
         mon_host: "{{ item }}"
-      with_items: "{{ groups[mon_group_name] }}"
+      loop: "{{ (current_quorum_status.stdout | from_json)['quorum_names'] }}"
       when: item != mon_to_kill
 
-    - name: "set_fact container_exec_cmd build {{ container_binary }} exec command (containerized)"
-      set_fact:
-        container_exec_cmd: "{{ container_binary }} exec ceph-mon-{{ hostvars[mon_host]['ansible_facts']['hostname'] }}"
-      when: containerized_deployment | bool
-
-    - name: exit playbook, if can not connect to the cluster
-      command: "{{ container_exec_cmd }} timeout 5 ceph --cluster {{ cluster }} health"
-      register: ceph_health
-      changed_when: false
-      until: ceph_health.stdout.find("HEALTH") > -1
+    - name: fail if basic requirements aren't satisfied
       delegate_to: "{{ mon_host }}"
-      retries: 5
-      delay: 2
-
-    - name: set_fact mon_to_kill_hostname
+      block:
+        - import_role:
+            name: ceph-defaults
+
+        - import_role:
+            name: ceph-facts
+            tasks_from: container_binary
+
+        - name: "set_fact container_exec_cmd build {{ container_binary }} exec command (containerized)"
+          set_fact:
+            container_exec_cmd: "{{ container_binary }} exec ceph-mon-{{ hostvars[mon_host]['ansible_facts']['hostname'] }}"
+          when: containerized_deployment | bool
+
+        - name: exit playbook, if only one monitor is present in cluster
+          fail:
+            msg: "You are about to shrink the only monitor present in the cluster.
+                  If you really want to do that, please use the purge-cluster playbook."
+          when: groups[mon_group_name] | length | int == 1
+
+        - name: exit playbook, if no monitor was given
+          fail:
+            msg: "mon_to_kill must be declared
+              Exiting shrink-cluster playbook, no monitor was removed.
+               On the command line when invoking the playbook, you can use
+               -e mon_to_kill=ceph-mon01 argument. You can only remove a single monitor each time the playbook runs."
+          when: mon_to_kill is not defined
+
+        - name: exit playbook, if the monitor is not part of the inventory
+          fail:
+            msg: "It seems that the host given is not part of your inventory, please make sure it is."
+          when: mon_to_kill not in groups[mon_group_name]
+
+        - name: exit playbook, if user did not mean to shrink cluster
+          fail:
+            msg: "Exiting shrink-mon playbook, no monitor was removed.
+               To shrink the cluster, either say 'yes' on the prompt
+               or use `-e ireallymeanit=yes` on the command line when
+               invoking the playbook"
+          when: ireallymeanit != 'yes'
+
+        - name: set_fact mon_to_kill_hostname
+          set_fact:
+            mon_to_kill_hostname: "{{ hostvars[mon_to_kill]['ansible_facts']['hostname'] }}"
+
+    - name: set_fact valid_mon_to_kill
       set_fact:
-        mon_to_kill_hostname: "{{ hostvars[mon_to_kill]['ansible_facts']['hostname'] }}"
-
-    - name: stop monitor service(s)
-      service:
-        name: ceph-mon@{{ mon_to_kill_hostname }}
-        state: stopped
-        enabled: no
-      delegate_to: "{{ mon_to_kill }}"
-      failed_when: false
-
-    - name: purge monitor store
-      file:
-        path: /var/lib/ceph/mon/{{ cluster }}-{{ mon_to_kill_hostname }}
-        state: absent
-      delegate_to: "{{ mon_to_kill }}"
-
-    - name: remove monitor from the quorum
-      command: "{{ container_exec_cmd }} ceph --cluster {{ cluster }} mon remove {{ mon_to_kill_hostname }}"
-      changed_when: false
-      failed_when: false
+        valid_mon_to_kill: "{{ mon_to_kill_hostname in (current_quorum_status.stdout | from_json)['quorum_names'] }}"
+
+
+  tasks:
+    - name: shrink selected monitor
       delegate_to: "{{ mon_host }}"
+      when: valid_mon_to_kill | bool
+      block:
+        - name: exit playbook, if can not connect to the cluster
+          command: "{{ container_exec_cmd }} timeout 5 ceph --cluster {{ cluster }} health"
+          register: ceph_health
+          changed_when: false
+          until: ceph_health.stdout.find("HEALTH") > -1
+          retries: 5
+          delay: 2
+
+        - name: stop monitor service(s)
+          service:
+            name: ceph-mon@{{ mon_to_kill_hostname }}
+            state: stopped
+            enabled: no
+          delegate_to: "{{ mon_to_kill }}"
+          failed_when: false
+
+        - name: purge monitor store
+          file:
+            path: /var/lib/ceph/mon/{{ cluster }}-{{ mon_to_kill_hostname }}
+            state: absent
+          delegate_to: "{{ mon_to_kill }}"
+
+        - name: remove monitor from the quorum
+          command: "{{ container_exec_cmd }} ceph --cluster {{ cluster }} mon remove {{ mon_to_kill_hostname }}"
+          changed_when: false
+          failed_when: false
 
   post_tasks:
-    - name: verify the monitor is out of the cluster
-      command: "{{ container_exec_cmd }} ceph --cluster {{ cluster }} quorum_status -f json"
+    - name: post verifications
       delegate_to: "{{ mon_host }}"
-      changed_when: false
-      failed_when: false
-      register: result
-      until: mon_to_kill_hostname not in (result.stdout | from_json)['quorum_names']
-      retries: 2
-      delay: 10
-
-    - name: please remove the monitor from your ceph configuration file
-      debug:
-          msg: "The monitor has been successfully removed from the cluster.
-          Please remove the monitor entry from the rest of your ceph configuration files, cluster wide."
-      run_once: true
-      when: mon_to_kill_hostname not in (result.stdout | from_json)['quorum_names']
-
-    - name: fail if monitor is still part of the cluster
-      fail:
-          msg: "Monitor appears to still be part of the cluster, please check what happened."
-      run_once: true
-      when: mon_to_kill_hostname in (result.stdout | from_json)['quorum_names']
-
-    - name: show ceph health
-      command: "{{ container_exec_cmd }} ceph --cluster {{ cluster }} -s"
-      delegate_to: "{{ mon_host }}"
-      changed_when: false
-
-    - name: show ceph mon status
-      command: "{{ container_exec_cmd }} ceph --cluster {{ cluster }} mon stat"
+      when: valid_mon_to_kill | bool
+      block:
+        - name: verify the monitor is out of the cluster
+          command: "{{ container_exec_cmd }} ceph --cluster {{ cluster }} quorum_status -f json"
+          changed_when: false
+          failed_when: false
+          register: result
+          until: mon_to_kill_hostname not in (result.stdout | from_json)['quorum_names']
+          retries: 2
+          delay: 10
+
+        - name: fail if monitor is still part of the cluster
+          fail:
+              msg: "Monitor appears to still be part of the cluster, please check what happened."
+          run_once: true
+          when: mon_to_kill_hostname in (result.stdout | from_json)['quorum_names']
+
+- name: remove monitor entry from ceph config file
+  hosts:
+    - mons
+    - osds
+    - mdss
+    - rgws
+    - nfss
+    - rbdmirrors
+    - clients
+    - iscsigws
+    - mgrs
+    - monitoring
+  gather_facts: false
+  become: True
+  any_errors_fatal: true
+  tasks:
+    - name: update ceph config file
+      when:
+        - shrink_mon_update_cfg | default(false) | bool
+        - hostvars['localhost']['valid_mon_to_kill'] | bool
+      block:
+        - name: gather and delegate facts
+          setup:
+            gather_subset:
+              - 'all'
+              - '!facter'
+              - '!ohai'
+          delegate_to: "{{ item }}"
+          delegate_facts: True
+          with_items: "{{ groups['all'] | difference(groups.get('clients', [])) }}"
+          run_once: true
+          tags: always
+
+        - import_role:
+            name: ceph-defaults
+
+        - import_role:
+            name: ceph-facts
+
+        - import_role:
+            name: ceph-handler
+
+        - import_role:
+            name: ceph-config
+
+
+- name: show ceph status
+  hosts: localhost
+  become: true
+  tasks:
+    - name: show ceph status
       delegate_to: "{{ mon_host }}"
-      changed_when: false
\ No newline at end of file
+      block:
+        - import_role:
+            name: ceph-defaults
+
+        - name: set_fact ceph_cmd
+          set_fact:
+            ceph_cmd: "{{ container_binary + ' run --rm --net=host -v /etc/ceph:/etc/ceph:z -v /var/lib/ceph:/var/lib/ceph:ro -v /var/run/ceph:/var/run/ceph:z --entrypoint=ceph ' + ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else 'ceph' }} --cluster {{ cluster }}"
+
+        - name: show ceph mon status
+          command: "{{ ceph_cmd }} mon stat"
+          changed_when: false
+
+        - name: show ceph health
+          command: "{{ ceph_cmd }} -s"
+          changed_when: false
+
+        - name: warn about ceph config file
+          fail:
+            msg: |
+              `shrink_mon_update_cfg` wasn't set to `true`.
+              Please update your ceph config file manually on all nodes, or rerun this playbook with `-e shrink_mon_update_cfg=true`
+          when: not shrink_mon_update_cfg | default(false) | bool
+          ignore_errors: true
diff --git a/roles/ceph-config/templates/ceph.conf.j2 b/roles/ceph-config/templates/ceph.conf.j2
index 3d5183810f89d95398b8f924b0af1f2583ac23a0..909d54a61f4b42b9a1027f08399d8423af6fa737 100644
--- a/roles/ceph-config/templates/ceph.conf.j2
+++ b/roles/ceph-config/templates/ceph.conf.j2
@@ -23,7 +23,7 @@ osd crush chooseleaf type = 0
 {% endif %}
 
 {% if nb_mon > 0 and inventory_hostname in groups.get(mon_group_name, []) %}
-mon initial members = {% for host in groups[mon_group_name] %}
+mon initial members = {% for host in groups[mon_group_name] | difference([mon_to_kill | default('')]) %}
       {% if hostvars[host]['ansible_facts']['hostname'] is defined -%}
         {{ hostvars[host]['ansible_facts']['hostname'] }}
       {%- endif %}
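
As a rough illustration of the template change above (hostnames hypothetical,
and assuming the stock template joins the loop entries with commas): with
monitors mon1, mon2 and mon3 in the mons group and `-e mon_to_kill=mon2`, the
rendered line becomes:

  mon initial members = mon1,mon3
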
diff --git a/roles/ceph-facts/tasks/set_monitor_address.yml b/roles/ceph-facts/tasks/set_monitor_address.yml
index b1cb3466752568c720443ccf029d5137154b9f92..b6d4a9461166b5197b1bc34184458d704ffb419e 100644
--- a/roles/ceph-facts/tasks/set_monitor_address.yml
+++ b/roles/ceph-facts/tasks/set_monitor_address.yml
@@ -2,7 +2,7 @@
 - name: set_fact _monitor_addresses to monitor_address_block ipv4
   set_fact:
     _monitor_addresses: "{{ _monitor_addresses | default([]) + [{ 'name': item, 'addr': hostvars[item]['ansible_facts']['all_ipv4_addresses'] | ips_in_ranges(hostvars[item]['monitor_address_block'].split(',')) | first }] }}"
-  with_items: "{{ groups.get(mon_group_name, []) }}"
+  loop: "{{ groups.get(mon_group_name, []) | difference([mon_to_kill | default('')]) }}"
   when:
     - "item not in _monitor_addresses | default([]) | selectattr('name', 'defined') |  map(attribute='name') | list"
     - hostvars[item]['monitor_address_block'] is defined
@@ -12,7 +12,7 @@
 - name: set_fact _monitor_addresses to monitor_address_block ipv6
   set_fact:
     _monitor_addresses: "{{ _monitor_addresses | default([]) + [{ 'name': item, 'addr': hostvars[item]['ansible_facts']['all_ipv6_addresses'] | ips_in_ranges(hostvars[item]['monitor_address_block'].split(',')) | last | ansible.utils.ipwrap }] }}"
-  with_items: "{{ groups.get(mon_group_name, []) }}"
+  loop: "{{ groups.get(mon_group_name, []) | difference([mon_to_kill | default('')]) }}"
   when:
     - "item not in _monitor_addresses | default([]) | selectattr('name', 'defined') |  map(attribute='name') | list"
     - hostvars[item]['monitor_address_block'] is defined
@@ -22,7 +22,7 @@
 - name: set_fact _monitor_addresses to monitor_address
   set_fact:
     _monitor_addresses: "{{ _monitor_addresses | default([]) + [{ 'name': item, 'addr': hostvars[item]['monitor_address'] | ansible.utils.ipwrap}] }}"
-  with_items: "{{ groups.get(mon_group_name, []) }}"
+  loop: "{{ groups.get(mon_group_name, []) | difference([mon_to_kill | default('')]) }}"
   when:
     - "item not in _monitor_addresses | default([]) | selectattr('name', 'defined') | map(attribute='name') | list"
     - hostvars[item]['monitor_address'] is defined
@@ -31,7 +31,7 @@
 - name: set_fact _monitor_addresses to monitor_interface - ipv4
   set_fact:
     _monitor_addresses: "{{ _monitor_addresses | default([]) + [{ 'name': item, 'addr': hostvars[item]['ansible_facts'][(hostvars[item]['monitor_interface']|replace('-', '_'))][ip_version]['address'] | ansible.utils.ipwrap }]  }}"
-  with_items: "{{ groups.get(mon_group_name, []) }}"
+  loop: "{{ groups.get(mon_group_name, []) | difference([mon_to_kill | default('')]) }}"
   when:
     - "item not in _monitor_addresses | default([]) | selectattr('name', 'defined') | map(attribute='name') | list"
     - ip_version == 'ipv4'
@@ -42,7 +42,7 @@
 - name: set_fact _monitor_addresses to monitor_interface - ipv6
   set_fact:
     _monitor_addresses: "{{ _monitor_addresses | default([]) + [{ 'name': item, 'addr': hostvars[item]['ansible_facts'][(hostvars[item]['monitor_interface']|replace('-', '_'))][ip_version][0]['address'] | ansible.utils.ipwrap }] }}"
-  with_items: "{{ groups.get(mon_group_name, []) }}"
+  loop: "{{ groups.get(mon_group_name, []) | difference([mon_to_kill | default('')]) }}"
   when:
     - "item not in _monitor_addresses | default([]) | selectattr('name', 'defined') | map(attribute='name') | list"
     - ip_version == 'ipv6'
@@ -56,4 +56,4 @@
   with_items: "{{ _monitor_addresses }}"
   when:
     - (inventory_hostname == item.name and not rolling_update | default(False) | bool)
-      or (rolling_update | default(False) | bool and item.name == groups.get(mon_group_name, [])[0])
\ No newline at end of file
+      or (rolling_update | default(False) | bool and item.name == groups.get(mon_group_name, [])[0])
diff --git a/tox.ini b/tox.ini
index fabcca9977b440accbf9f092f81557a697923370..366d285fe6627dc04b67e365f66731f68802de13 100644
--- a/tox.ini
+++ b/tox.ini
@@ -118,6 +118,7 @@ commands=
   ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/infrastructure-playbooks/shrink-mon.yml --extra-vars "\
       ireallymeanit=yes \
       mon_to_kill={env:MON_TO_KILL:mon2} \
+      shrink_mon_update_cfg=true \
   "
 [shrink-osd]
 commands=