git.apps.os.sepia.ceph.com Git - ceph-ansible.git/commitdiff
ceph-crash: introduce new role ceph-crash
author: Guillaume Abrioux <gabrioux@redhat.com>
Fri, 3 Jul 2020 08:21:49 +0000 (10:21 +0200)
committer: Dimitri Savineau <savineau.dimitri@gmail.com>
Wed, 22 Jul 2020 22:47:01 +0000 (18:47 -0400)
This commit introduces a new role `ceph-crash` in order to deploy
everything needed for the ceph-crash daemon.

Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com>
(cherry picked from commit 9d2f2108e1c9b6ae42b3133bb9ac37d4765e5e07)

19 files changed:
infrastructure-playbooks/docker-to-podman.yml
infrastructure-playbooks/purge-cluster.yml
infrastructure-playbooks/purge-container-cluster.yml
infrastructure-playbooks/rolling_update.yml
infrastructure-playbooks/switch-from-non-containerized-to-containerized-ceph-daemons.yml
roles/ceph-container-common/tasks/fetch_image.yml
roles/ceph-crash/meta/main.yml [new file with mode: 0644]
roles/ceph-crash/tasks/main.yml [new file with mode: 0644]
roles/ceph-crash/tasks/systemd.yml [new file with mode: 0644]
roles/ceph-crash/templates/ceph-crash.service.j2 [new file with mode: 0644]
roles/ceph-handler/handlers/main.yml
roles/ceph-handler/tasks/check_running_containers.yml
roles/ceph-handler/tasks/check_socket_non_container.yml
roles/ceph-handler/tasks/handler_crash.yml [new file with mode: 0644]
roles/ceph-handler/tasks/main.yml
site-container.yml.sample
site.yml.sample
tests/conftest.py
tests/functional/tests/test_install.py

index 5e9e048c63a3127dd9a7b65414805d2c07e01baa..a6d2cb703834989057bbfd978aab415f1f45751f 100644 (file)
         tasks_from: systemd.yml
       when: inventory_hostname in groups.get(rgw_group_name, [])
 
+    - import_role:
+        name: ceph-crash
+        tasks_from: systemd.yml
+      when: inventory_hostname in groups.get(mon_group_name, []) or
+            inventory_hostname in groups.get(osd_group_name, []) or
+            inventory_hostname in groups.get(mds_group_name, []) or
+            inventory_hostname in groups.get(rgw_group_name, []) or
+            inventory_hostname in groups.get(mgr_group_name, []) or
+            inventory_hostname in groups.get(rbdmirror_group_name, [])
+
     - name: dashboard configuration
       when: dashboard_enabled | bool
       block:
index 93e43cf42088882def8425ecc0d53fbb8ea73141..7cecdf39d97ce1652683475ee33cc1ab7a47963f 100644 (file)
       - /var/lib/ceph/bootstrap-mgr
       - /var/lib/ceph/tmp
 
+- name: purge ceph-crash daemons
+  hosts:
+    - "{{ mon_group_name | default('mons') }}"
+    - "{{ osd_group_name | default('osds') }}"
+    - "{{ mds_group_name | default('mdss') }}"
+    - "{{ rgw_group_name | default('rgws') }}"
+    - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
+    - "{{ mgr_group_name | default('mgrs') }}"
+  gather_facts: false
+  become: true
+  tasks:
+    - name: stop ceph-crash service
+      service:
+        name: ceph-crash.service
+        state: stopped
+        enabled: no
+      failed_when: false
+
+    - name: remove /var/lib/ceph/crash
+      file:
+        path: /var/lib/ceph/crash
+        state: absent
+
 
 - name: final cleanup - check any running ceph, purge ceph packages, purge config and remove data
 
index c326735a5b265f71a39e812bd28db181de7e5dbb..cebad947dc26aa2cfc637fdcae67e044e050018f 100644 (file)
           failed_when: false
       when: dashboard_enabled | bool
 
+- name: purge ceph-crash containers
+  hosts:
+    - "{{ mon_group_name | default('mons') }}"
+    - "{{ osd_group_name | default('osds') }}"
+    - "{{ mds_group_name | default('mdss') }}"
+    - "{{ rgw_group_name | default('rgws') }}"
+    - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
+    - "{{ mgr_group_name | default('mgrs') }}"
+  gather_facts: false
+  become: true
+  tasks:
+    - name: stop ceph-crash container
+      service:
+        name: "ceph-crash@{{ ansible_hostname }}"
+        state: stopped
+        enabled: no
+      failed_when: false
+
+    - name: remove service file
+      file:
+        name: "/etc/systemd/system/ceph-crash.service"
+        state: absent
+      failed_when: false
+
+    - name: remove /var/lib/ceph/crash
+      file:
+        path: /var/lib/ceph/crash
+        state: absent
+
 - name: check container hosts
 
   hosts:
index 6ff628d23c441019b9e7e56514b28188dd3dc022..f6c0918c4bac02ec5053cef6066d0cb7e83273cf 100644 (file)
     - import_role:
         name: ceph-client
 
+- name: upgrade ceph-crash daemons
+  hosts:
+    - "{{ mon_group_name | default('mons') }}"
+    - "{{ osd_group_name | default('osds') }}"
+    - "{{ mds_group_name | default('mdss') }}"
+    - "{{ rgw_group_name | default('rgws') }}"
+    - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
+    - "{{ mgr_group_name | default('mgrs') }}"
+  gather_facts: false
+  become: true
+  tasks:
+    - import_role:
+        name: ceph-defaults
+    - import_role:
+        name: ceph-facts
+        tasks_from: container_binary.yml
+    - import_role:
+        name: ceph-handler
+    - import_role:
+        name: ceph-crash
+
 - name: complete upgrade
   hosts:
   - "{{ mon_group_name | default('mons') }}"
index b5ffcf1029ea5c3bd0d6f49865745f5fd94112ee..5dee84924548eab5e03bdbdd1e4fd4db10cb92c3 100644 (file)
 
     - import_role:
         name: ceph-nfs
+
+- name: switching from non-containerized to containerized ceph-crash
+
+  hosts:
+    - "{{ mon_group_name | default('mons') }}"
+    - "{{ osd_group_name | default('osds') }}"
+    - "{{ mds_group_name | default('mdss') }}"
+    - "{{ rgw_group_name | default('rgws') }}"
+    - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
+    - "{{ mgr_group_name | default('mgrs') }}"
+
+  vars:
+    containerized_deployment: true
+  serial: 1
+  become: true
+  tasks:
+    - name: stop non-containerized ceph-crash
+      service:
+        name: ceph-crash
+        state: stopped
+        enabled: no
+
+    - import_role:
+        name: ceph-defaults
+
+    - import_role:
+        name: ceph-facts
+        tasks_from: container_binary.yml
+
+    - import_role:
+        name: ceph-handler
+
+    - import_role:
+        name: ceph-crash
\ No newline at end of file
index 9748298d66054c5aa6d9b84424bffbd92e899e75..8ae6150375341129e07e31aba6621171afae72aa 100644 (file)
     - ceph_nfs_container_stat.get('rc') == 0
     - ceph_nfs_container_stat.get('stdout_lines', [])|length != 0
 
+- name: inspect ceph crash container
+  command: "{{ container_binary }} inspect {{ ceph_crash_container_stat.stdout }}"
+  changed_when: false
+  register: ceph_crash_inspect
+  when:
+    - ceph_crash_container_stat.get('rc') == 0
+    - ceph_crash_container_stat.get('stdout_lines', [])|length != 0
+
 # NOTE(leseb): using failed_when to handle the case when the image is not present yet
 - name: "inspecting ceph mon container image before pulling"
   command: "{{ container_binary }} inspect {{ (ceph_mon_inspect.stdout | from_json)[0].Image }}"
     - nfs_group_name in group_names
     - ceph_nfs_inspect.get('rc') == 0
 
+- name: "inspecting ceph crash container image before pulling"
+  command: "{{ container_binary }} inspect {{ (ceph_crash_inspect.stdout | from_json)[0].Image }}"
+  changed_when: false
+  failed_when: false
+  register: ceph_crash_container_inspect_before_pull
+  when: ceph_crash_inspect.get('rc') == 0
+
 - name: set_fact ceph_mon_image_repodigest_before_pulling
   set_fact:
     ceph_mon_image_repodigest_before_pulling: "{{ (ceph_mon_container_inspect_before_pull.stdout | from_json)[0].Id }}"
     - mgr_group_name in group_names
     - ceph_mgr_container_inspect_before_pull.get('rc') == 0
 
+- name: set_fact ceph_crash_image_repodigest_before_pulling
+  set_fact:
+    ceph_crash_image_repodigest_before_pulling: "{{ (ceph_crash_container_inspect_before_pull.stdout | from_json)[0].Id }}"
+  when: ceph_crash_container_inspect_before_pull.get('rc') == 0
+
 - name: set_fact ceph_rbd_mirror_image_repodigest_before_pulling
   set_fact:
     ceph_rbd_mirror_image_repodigest_before_pulling: "{{ (ceph_rbd_mirror_container_inspect_before_pull.stdout | from_json)[0].Id }}"
     - ceph_nfs_container_inspect_before_pull.get('rc') == 0
     - ceph_nfs_image_repodigest_before_pulling != image_repodigest_after_pulling
 
+- name: set_fact ceph_crash_image_updated
+  set_fact:
+    ceph_crash_image_updated: "{{ ceph_crash_image_repodigest_before_pulling != image_repodigest_after_pulling }}"
+  changed_when: true
+  notify: restart ceph crash
+  when:
+    - ceph_crash_container_inspect_before_pull.get('rc') == 0
+    - ceph_crash_image_repodigest_before_pulling != image_repodigest_after_pulling
+
 - name: export local ceph dev image
   command: >
     {{ container_binary }} save -o "/tmp/{{ ceph_docker_username }}-{{ ceph_docker_imagename }}-{{ ceph_docker_image_tag }}.tar"
diff --git a/roles/ceph-crash/meta/main.yml b/roles/ceph-crash/meta/main.yml
new file mode 100644 (file)
index 0000000..43578ce
--- /dev/null
@@ -0,0 +1,15 @@
+---
+galaxy_info:
+  company: Red Hat
+  author: Guillaume Abrioux
+  description: Deploy ceph-crash
+  license: Apache
+  min_ansible_version: 2.7
+  platforms:
+    - name: EL
+      versions:
+        - 7
+        - 8
+  galaxy_tags:
+    - system
+dependencies: []
diff --git a/roles/ceph-crash/tasks/main.yml b/roles/ceph-crash/tasks/main.yml
new file mode 100644 (file)
index 0000000..2b360a6
--- /dev/null
@@ -0,0 +1,71 @@
+---
+- name: create and copy client.crash keyring
+  when: cephx | bool
+  block:
+    - name: create client.crash keyring
+      ceph_key:
+        state: present
+        name: "client.crash"
+        caps: "{{ {'mon': 'allow profile crash', 'mgr': 'allow profile crash'} }}"
+        cluster: "{{ cluster }}"
+        dest: "{{ ceph_conf_key_directory }}"
+        import_key: True
+        mode: "{{ ceph_keyring_permissions }}"
+        owner: "{{ ceph_uid if containerized_deployment else 'ceph' }}"
+        group: "{{ ceph_uid if containerized_deployment else 'ceph' }}"
+      environment:
+        CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment else None }}"
+        CEPH_CONTAINER_BINARY: "{{ container_binary }}"
+      delegate_to: "{{ groups.get(mon_group_name, [])[0] }}"
+      run_once: True
+
+    - name: get keys from monitors
+      command: "{{ hostvars[groups[mon_group_name][0]]['container_exec_cmd'] | default('') }} ceph --cluster {{ cluster }} auth get client.crash"
+      register: _crash_keys
+      delegate_to: "{{ groups.get(mon_group_name)[0] }}"
+      run_once: true
+
+    - name: get a list of node where the keyring should be copied
+      set_fact:
+        list_target_node: "{{ list_target_node | default([]) | union(((groups.get('all') | difference(groups.get(grafana_server_group_name, []) + groups.get(client_group_name, []) + groups.get(nfs_group_name, []) + groups.get(iscsi_gw_group_name, []))) + groups.get(item, [])) | unique) }}"
+      run_once: True
+      with_items:
+        - "{{ mon_group_name if groups.get(mon_group_name, []) | length > 0 else [] }}"
+        - "{{ osd_group_name if groups.get(osd_group_name, []) | length > 0 else [] }}"
+        - "{{ mds_group_name if groups.get(mds_group_name, []) | length > 0 else [] }}"
+        - "{{ rgw_group_name if groups.get(rgw_group_name, []) | length > 0 else [] }}"
+        - "{{ rbdmirror_group_name if groups.get(rbdmirror_group_name, []) | length > 0 else [] }}"
+        - "{{ mgr_group_name if groups.get(mgr_group_name, []) | length > 0 else [] }}"
+
+    - name: copy ceph key(s) if needed
+      copy:
+        dest: "{{ ceph_conf_key_directory }}/{{ cluster }}.client.crash.keyring"
+        content: "{{ _crash_keys.stdout + '\n' }}"
+        owner: "{{ ceph_uid if containerized_deployment else 'ceph' }}"
+        group: "{{ ceph_uid if containerized_deployment else 'ceph' }}"
+        mode: "{{ ceph_keyring_permissions }}"
+      with_items: "{{ list_target_node }}"
+      delegate_to: "{{ item }}"
+      run_once: True
+
+- name: start ceph-crash daemon
+  when: containerized_deployment | bool
+  block:
+    - name: create /var/lib/ceph/crash/posted
+      file:
+        path: /var/lib/ceph/crash/posted
+        state: directory
+        mode: '0755'
+        owner: "{{ ceph_uid }}"
+        group: "{{ ceph_uid }}"
+
+    - name: include_tasks systemd.yml
+      include_tasks: systemd.yml
+
+- name: start the ceph-crash service
+  systemd:
+    name: "{{ 'ceph-crash@' + ansible_hostname if containerized_deployment | bool else 'ceph-crash.service' }}"
+    state: started
+    enabled: yes
+    masked: no
+    daemon_reload: yes
\ No newline at end of file
diff --git a/roles/ceph-crash/tasks/systemd.yml b/roles/ceph-crash/tasks/systemd.yml
new file mode 100644 (file)
index 0000000..3b2ded8
--- /dev/null
@@ -0,0 +1,9 @@
+---
+- name: generate systemd unit file for ceph-crash container
+  template:
+    src: "{{ role_path }}/templates/ceph-crash.service.j2"
+    dest: /etc/systemd/system/ceph-crash@.service
+    owner: "root"
+    group: "root"
+    mode: "0644"
+  notify: restart ceph crash
\ No newline at end of file
diff --git a/roles/ceph-crash/templates/ceph-crash.service.j2 b/roles/ceph-crash/templates/ceph-crash.service.j2
new file mode 100644 (file)
index 0000000..ed06ef0
--- /dev/null
@@ -0,0 +1,41 @@
+[Unit]
+Description=Ceph crash dump collector
+{% if container_binary == 'docker' %}
+After=docker.service
+Requires=docker.service
+{% else %}
+After=network.target
+{% endif %}
+
+[Service]
+{% if container_binary == 'podman' %}
+ExecStartPre=-/usr/bin/rm -f /%t/%n-pid /%t/%n-cid
+ExecStartPre=-/usr/bin/{{ container_binary }} rm -f ceph-crash-%i
+{% endif %}
+ExecStart=/usr/bin/{{ container_binary }} run --rm --name ceph-crash-%i \
+{% if container_binary == 'podman' %}
+-d --conmon-pidfile /%t/%n-pid --cidfile /%t/%n-cid \
+{% endif %}
+--net=host \
+-v /var/lib/ceph:/var/lib/ceph:z \
+-v /etc/localtime:/etc/localtime:ro \
+--entrypoint=/usr/bin/ceph-crash {{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }}
+{% if container_binary == 'podman' %}
+ExecStop=-/usr/bin/sh -c "/usr/bin/{{ container_binary }} rm -f `cat /%t/%n-cid`"
+{% else %}
+ExecStop=-/usr/bin/{{ container_binary }} stop ceph-crash-%i
+{% endif %}
+StartLimitInterval=10min
+StartLimitBurst=30
+{% if container_binary == 'podman' %}
+Type=forking
+PIDFile=/%t/%n-pid
+{% endif %}
+KillMode=none
+Restart=always
+RestartSec=10s
+TimeoutStartSec=120
+TimeoutStopSec=10
+
+[Install]
+WantedBy=multi-user.target
index a6f33e24bfe1864b001cfe66d7ad70e01f42ae45..e375c94146480321a9d1db5924c5e4ac97f0c477 100644 (file)
@@ -55,3 +55,7 @@
       include_tasks: handler_rbd_target_api_gw.yml
       when: iscsi_gw_group_name in group_names
       listen: "restart ceph rbd-target-api-gw"
+
+    - name: ceph crash handler
+      include_tasks: handler_crash.yml
+      listen: "restart ceph crash"
index cd6539235a38220311b35e4dd4b6f89192e1a11d..86d488de6648fcde6e31b22f347cd16c4a1fe62f 100644 (file)
   failed_when: false
   check_mode: no
   when: inventory_hostname in groups.get(iscsi_gw_group_name, [])
+
+- name: check for a ceph-crash container
+  command: "{{ container_binary }} ps -q --filter='name=ceph-crash-{{ ansible_hostname }}'"
+  register: ceph_crash_container_stat
+  changed_when: false
+  failed_when: false
+  check_mode: no
\ No newline at end of file
index 713fb4f71eed3ad99be514f877b40be33ca3ce54..20f33efc848ac6b618001c0c0c4a8c5978ddc147 100644 (file)
   failed_when: false
   check_mode: no
   when: inventory_hostname in groups.get(iscsi_gw_group_name, [])
+
+- name: check for a ceph-crash process
+  command: pgrep ceph-crash
+  changed_when: false
+  failed_when: false
+  check_mode: no
+  register: crash_process
\ No newline at end of file
diff --git a/roles/ceph-handler/tasks/handler_crash.yml b/roles/ceph-handler/tasks/handler_crash.yml
new file mode 100644 (file)
index 0000000..6187fca
--- /dev/null
@@ -0,0 +1,18 @@
+---
+- name: set _crash_handler_called before restart
+  set_fact:
+    _crash_handler_called: True
+
+- name: restart the ceph-crash service
+  systemd:
+    name: ceph-crash@{{ ansible_hostname }}
+    state: restarted
+    enabled: yes
+    masked: no
+    daemon_reload: yes
+  ignore_errors: true
+  when: hostvars[inventory_hostname]['_crash_handler_called'] | default(False) | bool
+
+- name: set _crash_handler_called after restart
+  set_fact:
+    _crash_handler_called: False
index bdad343fad5ff3929c33029ca2d988ed4f1ee9d3..0517c6f3b7644c7ed2b0c02fc248af8dd366714e 100644 (file)
 - name: set_fact handler_mgr_status
   set_fact:
     handler_mgr_status: "{{ (mgr_socket_stat.get('rc') == 0) if not containerized_deployment | bool else (ceph_mgr_container_stat.get('rc') == 0 and ceph_mgr_container_stat.get('stdout_lines', []) | length != 0) }}"
-  when: inventory_hostname in groups.get(mgr_group_name, [])
\ No newline at end of file
+  when: inventory_hostname in groups.get(mgr_group_name, [])
+
+- name: set_fact handler_crash_status
+  set_fact:
+    handler_crash_status: "{{ crash_process.get('rc') == 0 if not containerized_deployment | bool else (ceph_crash_container_stat.get('rc') == 0 and ceph_crash_container_stat.get('stdout_lines', []) | length != 0) }}"
+  when:
+    - inventory_hostname in groups.get(mon_group_name, [])
+      or inventory_hostname in groups.get(mgr_group_name, [])
+      or inventory_hostname in groups.get(osd_group_name, [])
+      or inventory_hostname in groups.get(mds_group_name, [])
+      or inventory_hostname in groups.get(rgw_group_name, [])
+      or inventory_hostname in groups.get(rbdmirror_group_name, [])
\ No newline at end of file
index ed2595ee16b384fb81238f105f238ef016ce4193..cdffea8e5ffb534166f3adfdf863eec654e15a0a 100644 (file)
     - dashboard_enabled | bool
     - groups.get(grafana_server_group_name, []) | length > 0
 
+- hosts:
+  - mons
+  - osds
+  - mdss
+  - rgws
+  - rbdmirrors
+  - mgrs
+
+  gather_facts: false
+  become: True
+  any_errors_fatal: true
+
+  tasks:
+    - import_role:
+        name: ceph-defaults
+    - import_role:
+        name: ceph-facts
+        tasks_from: container_binary.yml
+    - import_role:
+        name: ceph-handler
+    - import_role:
+        name: ceph-crash
+
+
 - hosts: mons
   gather_facts: false
   become: True
index 49a89900992279fad56f4026549e6f91b031261a..374ce878ec499d24d5803bc73d0d122a97d260f9 100644 (file)
     - dashboard_enabled | bool
     - groups.get(grafana_server_group_name, []) | length > 0
 
+- hosts:
+  - mons
+  - osds
+  - mdss
+  - rgws
+  - rbdmirrors
+  - mgrs
+
+  gather_facts: false
+  become: True
+  any_errors_fatal: true
+
+  tasks:
+    - import_role:
+        name: ceph-defaults
+    - import_role:
+        name: ceph-facts
+        tasks_from: container_binary.yml
+    - import_role:
+        name: ceph-handler
+    - import_role:
+        name: ceph-crash
+
 - hosts: mons
   gather_facts: false
   become: True
index 6853d64e47d015428329bd5a4ff0d00565ab2f65..74d378ccc6603de07f2d4128ebd52d27e23041a1 100644 (file)
@@ -127,6 +127,9 @@ def node(host, request):
             request.function, group_names)
         pytest.skip(reason)
 
+    if request.node.get_closest_marker('ceph_crash') and group_names in [['nfss'], ['iscsigws'], ['clients'], ['grafana-server']]:
+        pytest.skip('Not a valid test for nfs, client or iscsigw nodes')
+
     if request.node.get_closest_marker("no_docker") and docker:
         pytest.skip(
             "Not a valid test for containerized deployments or atomic hosts")
index 184002dbadd7f91d9ade7e6766912ce3d4fb81c8..7e7c6de5db6265a8aa16b1aa9c8ab08f267f60aa 100644 (file)
@@ -29,3 +29,18 @@ class TestCephConf(object):
             if pattern.search(mon_host_line) is None:
                 result = False
             assert result
+
+class TestCephCrash(object):
+    @pytest.mark.no_docker
+    @pytest.mark.ceph_crash
+    def test_ceph_crash_service_enabled_and_running(self, node, host):
+        s = host.service("ceph-crash")
+        assert s.is_enabled
+        assert s.is_running
+
+    @pytest.mark.docker
+    @pytest.mark.ceph_crash
+    def test_ceph_crash_service_enabled_and_running_container(self, node, host):
+        s = host.service("ceph-crash@{hostname}".format(hostname=node["vars"]["inventory_hostname"]))
+        assert s.is_enabled
+        assert s.is_running
\ No newline at end of file