From: Dimitri Savineau Date: Thu, 13 Feb 2020 20:56:23 +0000 (-0500) Subject: ceph-prometheus: add alertmanager HA config X-Git-Tag: v6.0.0alpha1~68 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=b9d975385c2dceca3b06c18d4c37eadbe9f48c92;p=ceph-ansible.git ceph-prometheus: add alertmanager HA config When using multiple alertmanager nodes (via the grafana-server group), we need to specify the other peers in the configuration. https://prometheus.io/docs/alerting/alertmanager/#high-availability https://github.com/prometheus/alertmanager#high-availability Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1792225 Signed-off-by: Dimitri Savineau --- diff --git a/group_vars/all.yml.sample b/group_vars/all.yml.sample index 09cbc7321..25f3ff7a0 100644 --- a/group_vars/all.yml.sample +++ b/group_vars/all.yml.sample @@ -777,6 +777,7 @@ dummy: #alertmanager_data_dir: /var/lib/alertmanager #alertmanager_conf_dir: /etc/alertmanager #alertmanager_port: 9093 +#alertmanager_cluster_port: 9094 ################################## diff --git a/group_vars/rhcs.yml.sample b/group_vars/rhcs.yml.sample index 9300845af..595201bde 100644 --- a/group_vars/rhcs.yml.sample +++ b/group_vars/rhcs.yml.sample @@ -777,6 +777,7 @@ alertmanager_container_image: registry.redhat.io/openshift4/ose-prometheus-alert #alertmanager_data_dir: /var/lib/alertmanager #alertmanager_conf_dir: /etc/alertmanager #alertmanager_port: 9093 +#alertmanager_cluster_port: 9094 ################################## diff --git a/roles/ceph-defaults/defaults/main.yml b/roles/ceph-defaults/defaults/main.yml index 296993d44..82ea17b75 100644 --- a/roles/ceph-defaults/defaults/main.yml +++ b/roles/ceph-defaults/defaults/main.yml @@ -769,6 +769,7 @@ alertmanager_container_memory: 4 alertmanager_data_dir: /var/lib/alertmanager alertmanager_conf_dir: /etc/alertmanager alertmanager_port: 9093 +alertmanager_cluster_port: 9094 ################################## diff --git 
a/roles/ceph-infra/tasks/dashboard_firewall.yml b/roles/ceph-infra/tasks/dashboard_firewall.yml index f3166355d..d598b9331 100644 --- a/roles/ceph-infra/tasks/dashboard_firewall.yml +++ b/roles/ceph-infra/tasks/dashboard_firewall.yml @@ -52,6 +52,17 @@ permanent: true immediate: true state: enabled + + - name: open alertmanager cluster port + firewalld: + port: "{{ alertmanager_cluster_port }}/{{ item }}" + zone: "{{ ceph_dashboard_firewall_zone }}" + permanent: true + immediate: true + state: enabled + with_items: + - "tcp" + - "udp" when: - grafana_server_group_name is defined - grafana_server_group_name in group_names diff --git a/roles/ceph-prometheus/templates/alertmanager.service.j2 b/roles/ceph-prometheus/templates/alertmanager.service.j2 index c905cd11d..2c787cb36 100644 --- a/roles/ceph-prometheus/templates/alertmanager.service.j2 +++ b/roles/ceph-prometheus/templates/alertmanager.service.j2 @@ -22,9 +22,13 @@ ExecStart=/usr/bin/{{ container_binary }} run --rm --name=alertmanager \ --memory-swap={{ alertmanager_container_memory * 2 }}GB \ {{ alertmanager_container_image }} \ --config.file=/etc/alertmanager/alertmanager.yml \ + --cluster.listen-address={{ grafana_server_addr }}:{{ alertmanager_cluster_port }} \ +{% for peer in grafana_server_addrs|difference(grafana_server_addr) %} + --cluster.peer={{ peer }}:{{ alertmanager_cluster_port }} \ +{% endfor %} --storage.path=/alertmanager \ --web.external-url=http://{{ ansible_fqdn }}:{{ alertmanager_port }}/ \ - --web.listen-address=:{{ alertmanager_port }} + --web.listen-address={{ grafana_server_addr }}:{{ alertmanager_port }} ExecStop=/usr/bin/{{ container_binary }} stop alertmanager KillMode=none Restart=always