From e6059fdcd3810f89082447b513421147e342bce3 Mon Sep 17 00:00:00 2001 From: Guillaume Abrioux Date: Fri, 3 Jul 2020 10:21:49 +0200 Subject: [PATCH] ceph-crash: introduce new role ceph-crash This commit introduces a new role `ceph-crash` in order to deploy everything needed for the ceph-crash daemon. Signed-off-by: Guillaume Abrioux (cherry picked from commit 9d2f2108e1c9b6ae42b3133bb9ac37d4765e5e07) --- infrastructure-playbooks/docker-to-podman.yml | 10 +++ infrastructure-playbooks/purge-cluster.yml | 23 ++++++ .../purge-container-cluster.yml | 29 ++++++++ infrastructure-playbooks/rolling_update.yml | 21 ++++++ ...inerized-to-containerized-ceph-daemons.yml | 34 +++++++++ .../tasks/fetch_image.yml | 29 ++++++++ roles/ceph-crash/meta/main.yml | 15 ++++ roles/ceph-crash/tasks/main.yml | 71 +++++++++++++++++++ roles/ceph-crash/tasks/systemd.yml | 9 +++ .../templates/ceph-crash.service.j2 | 41 +++++++++++ roles/ceph-handler/handlers/main.yml | 4 ++ .../tasks/check_running_containers.yml | 7 ++ .../tasks/check_socket_non_container.yml | 7 ++ roles/ceph-handler/tasks/handler_crash.yml | 18 +++++ roles/ceph-handler/tasks/main.yml | 13 +++- site-container.yml.sample | 24 +++++++ site.yml.sample | 23 ++++++ tests/conftest.py | 3 + tests/functional/tests/test_install.py | 15 ++++ 19 files changed, 395 insertions(+), 1 deletion(-) create mode 100644 roles/ceph-crash/meta/main.yml create mode 100644 roles/ceph-crash/tasks/main.yml create mode 100644 roles/ceph-crash/tasks/systemd.yml create mode 100644 roles/ceph-crash/templates/ceph-crash.service.j2 create mode 100644 roles/ceph-handler/tasks/handler_crash.yml diff --git a/infrastructure-playbooks/docker-to-podman.yml b/infrastructure-playbooks/docker-to-podman.yml index 5e9e048c6..a6d2cb703 100644 --- a/infrastructure-playbooks/docker-to-podman.yml +++ b/infrastructure-playbooks/docker-to-podman.yml @@ -173,6 +173,16 @@ tasks_from: systemd.yml when: inventory_hostname in groups.get(rgw_group_name, []) + - import_role: + 
name: ceph-crash + tasks_from: systemd.yml + when: inventory_hostname in groups.get(mon_group_name, []) or + inventory_hostname in groups.get(osd_group_name, []) or + inventory_hostname in groups.get(mds_group_name, []) or + inventory_hostname in groups.get(rgw_group_name, []) or + inventory_hostname in groups.get(mgr_group_name, []) or + inventory_hostname in groups.get(rbdmirror_group_name, []) + - name: dashboard configuration when: dashboard_enabled | bool block: diff --git a/infrastructure-playbooks/purge-cluster.yml b/infrastructure-playbooks/purge-cluster.yml index 93e43cf42..7cecdf39d 100644 --- a/infrastructure-playbooks/purge-cluster.yml +++ b/infrastructure-playbooks/purge-cluster.yml @@ -610,6 +610,29 @@ - /var/lib/ceph/bootstrap-mgr - /var/lib/ceph/tmp +- name: purge ceph-crash daemons + hosts: + - "{{ mon_group_name | default('mons') }}" + - "{{ osd_group_name | default('osds') }}" + - "{{ mds_group_name | default('mdss') }}" + - "{{ rgw_group_name | default('rgws') }}" + - "{{ rbdmirror_group_name | default('rbdmirrors') }}" + - "{{ mgr_group_name | default('mgrs') }}" + gather_facts: false + become: true + tasks: + - name: stop ceph-crash service + service: + name: ceph-crash.service + state: stopped + enabled: no + failed_when: false + + - name: remove /var/lib/ceph/crash + file: + path: /var/lib/ceph/crash + state: absent + - name: final cleanup - check any running ceph, purge ceph packages, purge config and remove data diff --git a/infrastructure-playbooks/purge-container-cluster.yml b/infrastructure-playbooks/purge-container-cluster.yml index c326735a5..cebad947d 100644 --- a/infrastructure-playbooks/purge-container-cluster.yml +++ b/infrastructure-playbooks/purge-container-cluster.yml @@ -468,6 +468,35 @@ failed_when: false when: dashboard_enabled | bool +- name: purge ceph-crash containers + hosts: + - "{{ mon_group_name | default('mons') }}" + - "{{ osd_group_name | default('osds') }}" + - "{{ mds_group_name | default('mdss') }}" + - "{{ 
rgw_group_name | default('rgws') }}" + - "{{ rbdmirror_group_name | default('rbdmirrors') }}" + - "{{ mgr_group_name | default('mgrs') }}" + gather_facts: false + become: true + tasks: + - name: stop ceph-crash container + service: + name: "ceph-crash@{{ ansible_hostname }}" + state: stopped + enabled: no + failed_when: false + + - name: remove service file + file: + name: "/etc/systemd/system/ceph-crash@.service" + state: absent + failed_when: false + + - name: remove /var/lib/ceph/crash + file: + path: /var/lib/ceph/crash + state: absent + - name: check container hosts hosts: diff --git a/infrastructure-playbooks/rolling_update.yml b/infrastructure-playbooks/rolling_update.yml index 6ff628d23..f6c0918c4 100644 --- a/infrastructure-playbooks/rolling_update.yml +++ b/infrastructure-playbooks/rolling_update.yml @@ -912,6 +912,27 @@ - import_role: name: ceph-client +- name: upgrade ceph-crash daemons + hosts: + - "{{ mon_group_name | default('mons') }}" + - "{{ osd_group_name | default('osds') }}" + - "{{ mds_group_name | default('mdss') }}" + - "{{ rgw_group_name | default('rgws') }}" + - "{{ rbdmirror_group_name | default('rbdmirrors') }}" + - "{{ mgr_group_name | default('mgrs') }}" + gather_facts: false + become: true + tasks: + - import_role: + name: ceph-defaults + - import_role: + name: ceph-facts + tasks_from: container_binary.yml + - import_role: + name: ceph-handler + - import_role: + name: ceph-crash + - name: complete upgrade hosts: - "{{ mon_group_name | default('mons') }}" diff --git a/infrastructure-playbooks/switch-from-non-containerized-to-containerized-ceph-daemons.yml b/infrastructure-playbooks/switch-from-non-containerized-to-containerized-ceph-daemons.yml index b5ffcf102..5dee84924 100644 --- a/infrastructure-playbooks/switch-from-non-containerized-to-containerized-ceph-daemons.yml +++ b/infrastructure-playbooks/switch-from-non-containerized-to-containerized-ceph-daemons.yml @@ -546,3 +546,37 @@ - import_role: name: ceph-nfs + +- name: switching 
from non-containerized to containerized ceph-crash + + hosts: + - "{{ mon_group_name | default('mons') }}" + - "{{ osd_group_name | default('osds') }}" + - "{{ mds_group_name | default('mdss') }}" + - "{{ rgw_group_name | default('rgws') }}" + - "{{ rbdmirror_group_name | default('rbdmirrors') }}" + - "{{ mgr_group_name | default('mgrs') }}" + + vars: + containerized_deployment: true + serial: 1 + become: true + tasks: + - name: stop non-containerized ceph-crash + service: + name: ceph-crash + state: stopped + enabled: no + + - import_role: + name: ceph-defaults + + - import_role: + name: ceph-facts + tasks_from: container_binary.yml + + - import_role: + name: ceph-handler + + - import_role: + name: ceph-crash \ No newline at end of file diff --git a/roles/ceph-container-common/tasks/fetch_image.yml b/roles/ceph-container-common/tasks/fetch_image.yml index 9748298d6..8ae615037 100644 --- a/roles/ceph-container-common/tasks/fetch_image.yml +++ b/roles/ceph-container-common/tasks/fetch_image.yml @@ -63,6 +63,14 @@ - ceph_nfs_container_stat.get('rc') == 0 - ceph_nfs_container_stat.get('stdout_lines', [])|length != 0 +- name: inspect ceph crash container + command: "{{ container_binary }} inspect {{ ceph_crash_container_stat.stdout }}" + changed_when: false + register: ceph_crash_inspect + when: + - ceph_crash_container_stat.get('rc') == 0 + - ceph_crash_container_stat.get('stdout_lines', [])|length != 0 + # NOTE(leseb): using failed_when to handle the case when the image is not present yet - name: "inspecting ceph mon container image before pulling" command: "{{ container_binary }} inspect {{ (ceph_mon_inspect.stdout | from_json)[0].Image }}" @@ -127,6 +135,13 @@ - nfs_group_name in group_names - ceph_nfs_inspect.get('rc') == 0 +- name: "inspecting ceph crash container image before pulling" + command: "{{ container_binary }} inspect {{ (ceph_crash_inspect.stdout | from_json)[0].Image }}" + changed_when: false + failed_when: false + register: 
ceph_crash_container_inspect_before_pull + when: ceph_crash_inspect.get('rc') == 0 + - name: set_fact ceph_mon_image_repodigest_before_pulling set_fact: ceph_mon_image_repodigest_before_pulling: "{{ (ceph_mon_container_inspect_before_pull.stdout | from_json)[0].Id }}" @@ -162,6 +177,11 @@ - mgr_group_name in group_names - ceph_mgr_container_inspect_before_pull.get('rc') == 0 +- name: set_fact ceph_crash_image_repodigest_before_pulling + set_fact: + ceph_crash_image_repodigest_before_pulling: "{{ (ceph_crash_container_inspect_before_pull.stdout | from_json)[0].Id }}" + when: ceph_crash_container_inspect_before_pull.get('rc') == 0 + - name: set_fact ceph_rbd_mirror_image_repodigest_before_pulling set_fact: ceph_rbd_mirror_image_repodigest_before_pulling: "{{ (ceph_rbd_mirror_container_inspect_before_pull.stdout | from_json)[0].Id }}" @@ -266,6 +286,15 @@ - ceph_nfs_container_inspect_before_pull.get('rc') == 0 - ceph_nfs_image_repodigest_before_pulling != image_repodigest_after_pulling +- name: set_fact ceph_crash_image_updated + set_fact: + ceph_crash_image_updated: "{{ ceph_crash_image_repodigest_before_pulling != image_repodigest_after_pulling }}" + changed_when: true + notify: restart ceph crash + when: + - ceph_crash_container_inspect_before_pull.get('rc') == 0 + - ceph_crash_image_repodigest_before_pulling != image_repodigest_after_pulling + - name: export local ceph dev image command: > {{ container_binary }} save -o "/tmp/{{ ceph_docker_username }}-{{ ceph_docker_imagename }}-{{ ceph_docker_image_tag }}.tar" diff --git a/roles/ceph-crash/meta/main.yml b/roles/ceph-crash/meta/main.yml new file mode 100644 index 000000000..43578ce14 --- /dev/null +++ b/roles/ceph-crash/meta/main.yml @@ -0,0 +1,15 @@ +--- +galaxy_info: + company: Red Hat + author: Guillaume Abrioux + description: Deploy ceph-crash + license: Apache + min_ansible_version: 2.7 + platforms: + - name: EL + versions: + - 7 + - 8 + galaxy_tags: + - system +dependencies: [] diff --git 
a/roles/ceph-crash/tasks/main.yml b/roles/ceph-crash/tasks/main.yml new file mode 100644 index 000000000..2b360a65e --- /dev/null +++ b/roles/ceph-crash/tasks/main.yml @@ -0,0 +1,71 @@ +--- +- name: create and copy client.crash keyring + when: cephx | bool + block: + - name: create client.crash keyring + ceph_key: + state: present + name: "client.crash" + caps: "{{ {'mon': 'allow profile crash', 'mgr': 'allow profile crash'} }}" + cluster: "{{ cluster }}" + dest: "{{ ceph_conf_key_directory }}" + import_key: True + mode: "{{ ceph_keyring_permissions }}" + owner: "{{ ceph_uid if containerized_deployment else 'ceph' }}" + group: "{{ ceph_uid if containerized_deployment else 'ceph' }}" + environment: + CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment else None }}" + CEPH_CONTAINER_BINARY: "{{ container_binary }}" + delegate_to: "{{ groups.get(mon_group_name, [])[0] }}" + run_once: True + + - name: get keys from monitors + command: "{{ hostvars[groups[mon_group_name][0]]['container_exec_cmd'] | default('') }} ceph --cluster {{ cluster }} auth get client.crash" + register: _crash_keys + delegate_to: "{{ groups.get(mon_group_name)[0] }}" + run_once: true + + - name: get a list of node where the keyring should be copied + set_fact: + list_target_node: "{{ list_target_node | default([]) | union(((groups.get('all') | difference(groups.get(grafana_server_group_name, []) + groups.get(client_group_name, []) + groups.get(nfs_group_name, []) + groups.get(iscsi_gw_group_name, []))) + groups.get(item, [])) | unique) }}" + run_once: True + with_items: + - "{{ mon_group_name if groups.get(mon_group_name, []) | length > 0 else [] }}" + - "{{ osd_group_name if groups.get(osd_group_name, []) | length > 0 else [] }}" + - "{{ mds_group_name if groups.get(mds_group_name, []) | length > 0 else [] }}" + - "{{ rgw_group_name if groups.get(rgw_group_name, []) | length > 0 else [] }}" + - "{{ rbdmirror_group_name 
if groups.get(rbdmirror_group_name, []) | length > 0 else [] }}" + - "{{ mgr_group_name if groups.get(mgr_group_name, []) | length > 0 else [] }}" + + - name: copy ceph key(s) if needed + copy: + dest: "{{ ceph_conf_key_directory }}/{{ cluster }}.client.crash.keyring" + content: "{{ _crash_keys.stdout + '\n' }}" + owner: "{{ ceph_uid if containerized_deployment else 'ceph' }}" + group: "{{ ceph_uid if containerized_deployment else 'ceph' }}" + mode: "{{ ceph_keyring_permissions }}" + with_items: "{{ list_target_node }}" + delegate_to: "{{ item }}" + run_once: True + +- name: start ceph-crash daemon + when: containerized_deployment | bool + block: + - name: create /var/lib/ceph/crash/posted + file: + path: /var/lib/ceph/crash/posted + state: directory + mode: '0755' + owner: "{{ ceph_uid }}" + group: "{{ ceph_uid }}" + + - name: include_tasks systemd.yml + include_tasks: systemd.yml + +- name: start the ceph-crash service + systemd: + name: "{{ 'ceph-crash@' + ansible_hostname if containerized_deployment | bool else 'ceph-crash.service' }}" + state: started + enabled: yes + masked: no + daemon_reload: yes \ No newline at end of file diff --git a/roles/ceph-crash/tasks/systemd.yml b/roles/ceph-crash/tasks/systemd.yml new file mode 100644 index 000000000..3b2ded807 --- /dev/null +++ b/roles/ceph-crash/tasks/systemd.yml @@ -0,0 +1,9 @@ +--- +- name: generate systemd unit file for ceph-crash container + template: + src: "{{ role_path }}/templates/ceph-crash.service.j2" + dest: /etc/systemd/system/ceph-crash@.service + owner: "root" + group: "root" + mode: "0644" + notify: restart ceph crash \ No newline at end of file diff --git a/roles/ceph-crash/templates/ceph-crash.service.j2 b/roles/ceph-crash/templates/ceph-crash.service.j2 new file mode 100644 index 000000000..ed06ef077 --- /dev/null +++ b/roles/ceph-crash/templates/ceph-crash.service.j2 @@ -0,0 +1,41 @@ +[Unit] +Description=Ceph crash dump collector +{% if container_binary == 'docker' %} +After=docker.service 
+Requires=docker.service +{% else %} +After=network.target +{% endif %} + +[Service] +{% if container_binary == 'podman' %} +ExecStartPre=-/usr/bin/rm -f /%t/%n-pid /%t/%n-cid +ExecStartPre=-/usr/bin/{{ container_binary }} rm -f ceph-crash-%i +{% endif %} +ExecStart=/usr/bin/{{ container_binary }} run --rm --name ceph-crash-%i \ +{% if container_binary == 'podman' %} +-d --conmon-pidfile /%t/%n-pid --cidfile /%t/%n-cid \ +{% endif %} +--net=host \ +-v /var/lib/ceph:/var/lib/ceph:z \ +-v /etc/localtime:/etc/localtime:ro \ +--entrypoint=/usr/bin/ceph-crash {{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }} +{% if container_binary == 'podman' %} +ExecStop=-/usr/bin/sh -c "/usr/bin/{{ container_binary }} rm -f `cat /%t/%n-cid`" +{% else %} +ExecStop=-/usr/bin/{{ container_binary }} stop ceph-crash-%i +{% endif %} +StartLimitInterval=10min +StartLimitBurst=30 +{% if container_binary == 'podman' %} +Type=forking +PIDFile=/%t/%n-pid +{% endif %} +KillMode=none +Restart=always +RestartSec=10s +TimeoutStartSec=120 +TimeoutStopSec=10 + +[Install] +WantedBy=multi-user.target diff --git a/roles/ceph-handler/handlers/main.yml b/roles/ceph-handler/handlers/main.yml index a6f33e24b..e375c9414 100644 --- a/roles/ceph-handler/handlers/main.yml +++ b/roles/ceph-handler/handlers/main.yml @@ -55,3 +55,7 @@ include_tasks: handler_rbd_target_api_gw.yml when: iscsi_gw_group_name in group_names listen: "restart ceph rbd-target-api-gw" + + - name: ceph crash handler + include_tasks: handler_crash.yml + listen: "restart ceph crash" diff --git a/roles/ceph-handler/tasks/check_running_containers.yml b/roles/ceph-handler/tasks/check_running_containers.yml index cd6539235..86d488de6 100644 --- a/roles/ceph-handler/tasks/check_running_containers.yml +++ b/roles/ceph-handler/tasks/check_running_containers.yml @@ -78,3 +78,10 @@ failed_when: false check_mode: no when: inventory_hostname in groups.get(iscsi_gw_group_name, []) + +- name: check for a ceph-crash container + 
command: "{{ container_binary }} ps -q --filter='name=ceph-crash-{{ ansible_hostname }}'" + register: ceph_crash_container_stat + changed_when: false + failed_when: false + check_mode: no \ No newline at end of file diff --git a/roles/ceph-handler/tasks/check_socket_non_container.yml b/roles/ceph-handler/tasks/check_socket_non_container.yml index 713fb4f71..20f33efc8 100644 --- a/roles/ceph-handler/tasks/check_socket_non_container.yml +++ b/roles/ceph-handler/tasks/check_socket_non_container.yml @@ -216,3 +216,10 @@ failed_when: false check_mode: no when: inventory_hostname in groups.get(iscsi_gw_group_name, []) + +- name: check for a ceph-crash process + command: pgrep ceph-crash + changed_when: false + failed_when: false + check_mode: no + register: crash_process \ No newline at end of file diff --git a/roles/ceph-handler/tasks/handler_crash.yml b/roles/ceph-handler/tasks/handler_crash.yml new file mode 100644 index 000000000..6187fca79 --- /dev/null +++ b/roles/ceph-handler/tasks/handler_crash.yml @@ -0,0 +1,18 @@ +--- +- name: set _crash_handler_called before restart + set_fact: + _crash_handler_called: True + +- name: restart the ceph-crash service + systemd: + name: ceph-crash@{{ ansible_hostname }} + state: restarted + enabled: yes + masked: no + daemon_reload: yes + ignore_errors: true + when: hostvars[inventory_hostname]['_crash_handler_called'] | default(False) | bool + +- name: set _crash_handler_called after restart + set_fact: + _crash_handler_called: False diff --git a/roles/ceph-handler/tasks/main.yml b/roles/ceph-handler/tasks/main.yml index bdad343fa..0517c6f3b 100644 --- a/roles/ceph-handler/tasks/main.yml +++ b/roles/ceph-handler/tasks/main.yml @@ -36,4 +36,15 @@ - name: set_fact handler_mgr_status set_fact: handler_mgr_status: "{{ (mgr_socket_stat.get('rc') == 0) if not containerized_deployment | bool else (ceph_mgr_container_stat.get('rc') == 0 and ceph_mgr_container_stat.get('stdout_lines', []) | length != 0) }}" - when: inventory_hostname in 
groups.get(mgr_group_name, []) \ No newline at end of file + when: inventory_hostname in groups.get(mgr_group_name, []) + +- name: set_fact handler_crash_status + set_fact: + handler_crash_status: "{{ crash_process.get('rc') == 0 if not containerized_deployment | bool else (ceph_crash_container_stat.get('rc') == 0 and ceph_crash_container_stat.get('stdout_lines', []) | length != 0) }}" + when: + - inventory_hostname in groups.get(mon_group_name, []) + or inventory_hostname in groups.get(mgr_group_name, []) + or inventory_hostname in groups.get(osd_group_name, []) + or inventory_hostname in groups.get(mds_group_name, []) + or inventory_hostname in groups.get(rgw_group_name, []) + or inventory_hostname in groups.get(rbdmirror_group_name, []) \ No newline at end of file diff --git a/site-container.yml.sample b/site-container.yml.sample index ed2595ee1..cdffea8e5 100644 --- a/site-container.yml.sample +++ b/site-container.yml.sample @@ -426,6 +426,30 @@ - dashboard_enabled | bool - groups.get(grafana_server_group_name, []) | length > 0 +- hosts: + - mons + - osds + - mdss + - rgws + - rbdmirrors + - mgrs + + gather_facts: false + become: True + any_errors_fatal: true + + tasks: + - import_role: + name: ceph-defaults + - import_role: + name: ceph-facts + tasks_from: container_binary.yml + - import_role: + name: ceph-handler + - import_role: + name: ceph-crash + + - hosts: mons gather_facts: false become: True diff --git a/site.yml.sample b/site.yml.sample index 49a899009..374ce878e 100644 --- a/site.yml.sample +++ b/site.yml.sample @@ -448,6 +448,29 @@ - dashboard_enabled | bool - groups.get(grafana_server_group_name, []) | length > 0 +- hosts: + - mons + - osds + - mdss + - rgws + - rbdmirrors + - mgrs + + gather_facts: false + become: True + any_errors_fatal: true + + tasks: + - import_role: + name: ceph-defaults + - import_role: + name: ceph-facts + tasks_from: container_binary.yml + - import_role: + name: ceph-handler + - import_role: + name: ceph-crash + - hosts: 
mons gather_facts: false become: True diff --git a/tests/conftest.py b/tests/conftest.py index 6853d64e4..74d378ccc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -127,6 +127,9 @@ def node(host, request): request.function, group_names) pytest.skip(reason) + if request.node.get_closest_marker('ceph_crash') and group_names in [['nfss'], ['iscsigws'], ['clients'], ['grafana-server']]: + pytest.skip('Not a valid test for nfs, client, iscsigw or grafana-server nodes') + if request.node.get_closest_marker("no_docker") and docker: pytest.skip( "Not a valid test for containerized deployments or atomic hosts") diff --git a/tests/functional/tests/test_install.py b/tests/functional/tests/test_install.py index 184002dba..7e7c6de5d 100644 --- a/tests/functional/tests/test_install.py +++ b/tests/functional/tests/test_install.py @@ -29,3 +29,18 @@ class TestCephConf(object): if pattern.search(mon_host_line) is None: result = False assert result + +class TestCephCrash(object): + @pytest.mark.no_docker + @pytest.mark.ceph_crash + def test_ceph_crash_service_enabled_and_running(self, node, host): + s = host.service("ceph-crash") + assert s.is_enabled + assert s.is_running + + @pytest.mark.docker + @pytest.mark.ceph_crash + def test_ceph_crash_service_enabled_and_running_container(self, node, host): + s = host.service("ceph-crash@{hostname}".format(hostname=node["vars"]["inventory_hostname"])) + assert s.is_enabled + assert s.is_running \ No newline at end of file -- 2.39.5