From: Boris Ranto Date: Wed, 5 Dec 2018 00:54:27 +0000 (+0100) Subject: Initial support for alertmanager X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fheads%2Fwip-alertmanager;p=cephmetrics.git Initial support for alertmanager Signed-off-by: Boris Ranto --- diff --git a/ansible/roles/ceph-prometheus/defaults/main.yml b/ansible/roles/ceph-prometheus/defaults/main.yml index b45a5e4..f4806cb 100644 --- a/ansible/roles/ceph-prometheus/defaults/main.yml +++ b/ansible/roles/ceph-prometheus/defaults/main.yml @@ -6,5 +6,14 @@ defaults: container_cpu_cores: 2 # container_memory is in GB container_memory: 4 - data_dir: /var/lib/cephmetrics + data_dir: /var/lib/prometheus + conf_dir: /etc/prometheus user_id: '65534' # This is the UID used by the prom/prometheus docker image + alertmanager: + container_image: prom/alertmanager:latest + container_cpu_period: 100000 + container_cpu_cores: 2 + # container_memory is in GB + container_memory: 4 + data_dir: /var/lib/alertmanager + conf_dir: /etc/alertmanager diff --git a/ansible/roles/ceph-prometheus/files/alertmanager.service b/ansible/roles/ceph-prometheus/files/alertmanager.service new file mode 100644 index 0000000..2683c23 --- /dev/null +++ b/ansible/roles/ceph-prometheus/files/alertmanager.service @@ -0,0 +1,17 @@ +# This file is managed by ansible, don't make changes here - they will be +# overwritten. +[Unit] +Description=alertmanager +After=docker.service + +[Service] +EnvironmentFile=-/etc/environment +ExecStart=/usr/bin/docker start --attach alertmanager +ExecStop=/usr/bin/docker stop alertmanager +Restart=always +RestartSec=10s +TimeoutStartSec=120 +TimeoutStopSec=15 + +[Install] +WantedBy=multi-user.target diff --git a/ansible/roles/ceph-prometheus/files/prometheus.service b/ansible/roles/ceph-prometheus/files/prometheus.service index 7b6c8ef..1470935 100644 --- a/ansible/roles/ceph-prometheus/files/prometheus.service +++ b/ansible/roles/ceph-prometheus/files/prometheus.service @@ -7,7 +7,7 @@ After=docker.service [Service] EnvironmentFile=-/etc/environment ExecStart=/usr/bin/docker start --attach prometheus -ExecStop=-/usr/bin/docker stop prometheus +ExecStop=/usr/bin/docker stop prometheus Restart=always RestartSec=10s TimeoutStartSec=120 diff --git a/ansible/roles/ceph-prometheus/handlers/main.yml b/ansible/roles/ceph-prometheus/handlers/main.yml index 421bea5..8fd15fa 100644 --- a/ansible/roles/ceph-prometheus/handlers/main.yml +++ b/ansible/roles/ceph-prometheus/handlers/main.yml @@ -3,7 +3,10 @@ # We use the systemd module here so we can use the daemon_reload feature, # since we're shipping the .service file ourselves systemd: - name: prometheus + name: "{{ item }}" daemon_reload: true enabled: true state: restarted + with_items: + - 'alertmanager' + - 'prometheus' diff --git a/ansible/roles/ceph-prometheus/tasks/main.yml b/ansible/roles/ceph-prometheus/tasks/main.yml index 813d398..f1ea627 100644 --- a/ansible/roles/ceph-prometheus/tasks/main.yml +++ b/ansible/roles/ceph-prometheus/tasks/main.yml @@ -1,19 +1,38 @@ --- - include: merge_vars.yml -- name: Create prometheus data directory +- name: Create prometheus directories file: - path: "{{ prometheus.data_dir }}" + path: "{{ item }}" state: directory owner: "{{ prometheus.user_id }}" + with_items: + - "{{ prometheus.conf_dir }}" + - "{{ prometheus.data_dir }}" -- name: Write config file +- name: Write prometheus config file template: src: prometheus.yml - dest: "{{ prometheus.data_dir }}/" + dest: "{{ prometheus.conf_dir }}/" owner: "{{ prometheus.user_id }}" notify: Service handler +- name: Create alertmanager directories + file: + path: "{{ item }}" + state: directory + owner: "root" + with_items: + - "{{ alertmanager.conf_dir }}" + - "{{ alertmanager.data_dir }}" + +- name: Write alertmanager config file + template: + src: alertmanager.yml + dest: "{{ alertmanager.conf_dir }}/" + owner: "root" + notify: Service handler + - include: setup_container.yml when: containerized diff --git a/ansible/roles/ceph-prometheus/tasks/setup_container.yml b/ansible/roles/ceph-prometheus/tasks/setup_container.yml index 2fc8a73..6e0816d 100644 --- a/ansible/roles/ceph-prometheus/tasks/setup_container.yml +++ b/ansible/roles/ceph-prometheus/tasks/setup_container.yml @@ -5,18 +5,57 @@ allow_duplicates: false when: containerized +- name: Make sure the alertmanager service is down + service: + name: alertmanager + state: stopped + failed_when: false + +- name: Start alertmanager container + docker_container: + name: alertmanager + image: "{{ alertmanager.container_image }}" + state: started + command: + - '--config.file=/etc/alertmanager/alertmanager.yml' + - '--storage.path=/alertmanager' + # restart to allow updates + restart: true + restart_policy: no + force_kill: yes + published_ports: '9093:9093' + detach: true + volumes: + - "{{ alertmanager.conf_dir }}:/etc/alertmanager:Z" + - "{{ alertmanager.data_dir }}:/alertmanager:Z" + networks: + - name: "{{ docker.network_name }}" + keep_volumes: true + pull: true + cpu_period: "{{ alertmanager.container_cpu_period }}" + # As of ansible-2.5.2, this module doesn't support the equivalent of the + # --cpus flag, so we must use period/quota for now + cpu_quota: "{{ alertmanager.container_cpu_period * alertmanager.container_cpu_cores }}" + #memory: 0 + #memory_swap: 0 + memory: "{{ alertmanager.container_memory }}GB" + memory_swap: "{{ alertmanager.container_memory * 2 }}GB" + notify: Service handler + - name: Make sure the prometheus service is down service: name: prometheus state: stopped failed_when: false -- name: Start docker container +- name: Start prometheus docker container docker_container: name: prometheus image: "{{ prometheus.container_image }}" state: started - command: "--config.file=/prometheus/prometheus.yml" + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' # restart to allow updates restart: true restart_policy: no @@ -24,6 +63,7 @@ published_ports: '9090:9090' detach: true volumes: + - "{{ prometheus.conf_dir }}:/etc/prometheus:Z" - "{{ prometheus.data_dir }}:/prometheus:Z" networks: - name: "{{ docker.network_name }}" @@ -40,11 +80,14 @@ memory_swap: "{{ prometheus.container_memory * 2 }}GB" notify: Service handler -- name: Ship systemd service +- name: Ship systemd services copy: - src: prometheus.service + src: "{{ item }}" dest: "/etc/systemd/system/" owner: root group: root mode: 0644 + with_items: + - 'alertmanager.service' + - 'prometheus.service' notify: Service handler diff --git a/ansible/roles/ceph-prometheus/templates/alertmanager.yml b/ansible/roles/ceph-prometheus/templates/alertmanager.yml new file mode 100644 index 0000000..48ef776 --- /dev/null +++ b/ansible/roles/ceph-prometheus/templates/alertmanager.yml @@ -0,0 +1,15 @@ +global: + resolve_timeout: 5m + +route: + group_by: ['alertname'] + group_wait: 10s + group_interval: 10s + repeat_interval: 1h + receiver: 'ceph-dashboard' +receivers: +- name: 'ceph-dashboard' + webhook_configs: +{% for host in groups['mgrs'] %} + - url: 'http://{{ host }}:{{ dashboard.port }}/api/prometheus_receiver' +{% endfor %} diff --git a/ansible/roles/ceph-prometheus/templates/prometheus.yml b/ansible/roles/ceph-prometheus/templates/prometheus.yml index 398127b..aaa7173 100644 --- a/ansible/roles/ceph-prometheus/templates/prometheus.yml +++ b/ansible/roles/ceph-prometheus/templates/prometheus.yml @@ -2,6 +2,9 @@ global: scrape_interval: 15s evaluation_interval: 15s +rule_files: + - '/etc/prometheus/alerts/*' + scrape_configs: - job_name: 'prometheus' static_configs: @@ -37,3 +40,8 @@ scrape_configs: instance: "{{ hostvars[host]['ansible_nodename'] }}" {% endfor %} {% endif %} +alerting: + alertmanagers: + - scheme: http + static_configs: + - targets: ['alertmanager:9093'] diff --git a/dashboard-ansible.spec.in b/dashboard-ansible.spec.in index 03b8ba7..c59a5e7 100644 --- a/dashboard-ansible.spec.in +++ b/dashboard-ansible.spec.in @@ -35,18 +35,16 @@ patch -p1 < patches/0001-ansible-Disable-devel_mode.patch sed -i -e 's/devel_mode: true/devel_mode: false/' ansible/roles/*/defaults/main.yml # Change the prometheus container location/version -sed -i -e 's|container_image: .*$|container_image: registry.access.redhat.com/openshift3/prometheus:v3.10|' ansible/roles/ceph-prometheus/defaults/main.yml -#sed -i -e 's|version: .*$|version: v3.9|' ansible/roles/ceph-prometheus/defaults/main.yml +sed -i -e 's|container_image: prom/prometheus:.*$|container_image: registry.access.redhat.com/openshift3/prometheus:v3.10|' ansible/roles/ceph-prometheus/defaults/main.yml -# Change the grafana container location/version -sed -i -e 's|container_image: .*$|container_image: registry.access.redhat.com/rhceph/rhceph-3-dashboard-rhel7:3|' ansible/roles/ceph-grafana/defaults/main.yml -#sed -i -e 's|version: .*$|version: 3|' ansible/roles/ceph-grafana/defaults/main.yml +# Change the alertmanager container location/version +sed -i -e 's|container_image: prom/alertmanager:.*$|container_image: registry.access.redhat.com/openshift3/prometheus-alertmanager:v3.10|' ansible/roles/ceph-prometheus/defaults/main.yml # Change the node_exporter container location/version sed -i -e 's|container_image: .*$|container_image: registry.access.redhat.com/openshift3/prometheus-node-exporter:v3.10|' ansible/roles/ceph-node-exporter/defaults/main.yml -# Change the service_name for node_exporter -#sed -i -e 's|service_name: .*|service_name: prometheus-node-exporter|' ansible/roles/ceph-node-exporter/defaults/main.yml +# Change the grafana container location/version +sed -i -e 's|container_image: .*$|container_image: registry.access.redhat.com/rhceph/rhceph-3-dashboard-rhel7:3|' ansible/roles/ceph-grafana/defaults/main.yml %install