Initial support for alertmanager

author Boris Ranto <branto@redhat.com>
Wed, 5 Dec 2018 00:54:27 +0000 (01:54 +0100)

committer Boris Ranto <branto@redhat.com>
Wed, 5 Dec 2018 01:45:45 +0000 (02:45 +0100)
diff --git a/ansible/roles/ceph-prometheus/defaults/main.yml b/ansible/roles/ceph-prometheus/defaults/main.yml

index b45a5e413f4c3066df681500d31963a73f91dc09..f4806cba3f01ecd0559e7639154207dcfcbf088d 100644 (file)
--- a/ansible/roles/ceph-prometheus/defaults/main.yml
+++ b/ansible/roles/ceph-prometheus/defaults/main.yml
@@ -6,5 +6,14 @@ defaults:
      container_cpu_cores: 2
      # container_memory is in GB
      container_memory: 4
-    data_dir: /var/lib/cephmetrics
+    data_dir: /var/lib/prometheus
+    conf_dir: /etc/prometheus
      user_id: '65534'  # This is the UID used by the prom/prometheus docker image
+  alertmanager:
+    container_image: prom/alertmanager:latest
+    container_cpu_period: 100000
+    container_cpu_cores: 2
+    # container_memory is in GB
+    container_memory: 4
+    data_dir: /var/lib/alertmanager
+    conf_dir: /etc/alertmanager
diff --git a/ansible/roles/ceph-prometheus/files/alertmanager.service b/ansible/roles/ceph-prometheus/files/alertmanager.service

new file mode 100644 (file)

index 0000000..2683c23
--- /dev/null
+++ b/ansible/roles/ceph-prometheus/files/alertmanager.service
@@ -0,0 +1,17 @@
+# NOTE: ansible manages this file; do not edit it by hand, because any
+# local change will be overwritten.
+[Unit]
+After=docker.service
+Description=alertmanager
+
+[Service]
+EnvironmentFile=-/etc/environment
+Restart=always
+RestartSec=10s
+TimeoutStartSec=120
+TimeoutStopSec=15
+ExecStart=/usr/bin/docker start --attach alertmanager
+ExecStop=/usr/bin/docker stop alertmanager
+
+[Install]
+WantedBy=multi-user.target
diff --git a/ansible/roles/ceph-prometheus/files/prometheus.service b/ansible/roles/ceph-prometheus/files/prometheus.service

index 7b6c8efa83acf240ad168314d10320272e60259a..147093542c9c8985162da379a6208325824c0586 100644 (file)
--- a/ansible/roles/ceph-prometheus/files/prometheus.service
+++ b/ansible/roles/ceph-prometheus/files/prometheus.service
@@ -7,7 +7,7 @@ After=docker.service
  [Service]
  EnvironmentFile=-/etc/environment
  ExecStart=/usr/bin/docker start --attach prometheus
-ExecStop=-/usr/bin/docker stop prometheus
+ExecStop=/usr/bin/docker stop prometheus
  Restart=always
  RestartSec=10s
  TimeoutStartSec=120
diff --git a/ansible/roles/ceph-prometheus/handlers/main.yml b/ansible/roles/ceph-prometheus/handlers/main.yml

index 421bea57704311656174f601987d3109c11a9495..8fd15fa1a86bd82cef23757b8d6cafa689c3e138 100644 (file)
--- a/ansible/roles/ceph-prometheus/handlers/main.yml
+++ b/ansible/roles/ceph-prometheus/handlers/main.yml
@@ -3,7 +3,10 @@
    # We use the systemd module here so we can use the daemon_reload feature,
    # since we're shipping the .service file ourselves
    systemd:
-    name: prometheus
+    name: "{{ item }}"
      daemon_reload: true
      enabled: true
      state: restarted
+  with_items:
+    - 'alertmanager'
+    - 'prometheus'
diff --git a/ansible/roles/ceph-prometheus/tasks/main.yml b/ansible/roles/ceph-prometheus/tasks/main.yml

index 813d398767e155d80ee5d134b718bb78b795be42..f1ea6270d1258a547c64de4fd4097902b3da1111 100644 (file)
--- a/ansible/roles/ceph-prometheus/tasks/main.yml
+++ b/ansible/roles/ceph-prometheus/tasks/main.yml
@@ -1,19 +1,38 @@
  ---
  - include: merge_vars.yml
  
-- name: Create prometheus data directory
+- name: Create prometheus directories
    file:
-    path: "{{ prometheus.data_dir }}"
+    path: "{{ item }}"
      state: directory
      owner: "{{ prometheus.user_id }}"
+  with_items:
+   - "{{ prometheus.conf_dir }}"
+   - "{{ prometheus.data_dir }}"
  
-- name: Write config file
+- name: Write prometheus config file
    template:
      src: prometheus.yml
-    dest: "{{ prometheus.data_dir }}/"
+    dest: "{{ prometheus.conf_dir }}/"
      owner: "{{ prometheus.user_id }}"
    notify: Service handler
  
+- name: Create alertmanager directories
+  file:
+    path: "{{ item }}"
+    state: directory
+    owner: "root"
+  with_items:
+   - "{{ alertmanager.conf_dir }}"
+   - "{{ alertmanager.data_dir }}"
+
+- name: Write alertmanager config file
+  template:
+    src: alertmanager.yml
+    dest: "{{ alertmanager.conf_dir }}/"
+    owner: "root"
+  notify: Service handler
+
  - include: setup_container.yml
    when: containerized
  
diff --git a/ansible/roles/ceph-prometheus/tasks/setup_container.yml b/ansible/roles/ceph-prometheus/tasks/setup_container.yml

index 2fc8a7383ef8f5a3c06ccbfca0b6d7fed96fac84..6e0816d338a2d4b2af209d621d0c322f4c780532 100644 (file)
--- a/ansible/roles/ceph-prometheus/tasks/setup_container.yml
+++ b/ansible/roles/ceph-prometheus/tasks/setup_container.yml
@@ -5,18 +5,57 @@
      allow_duplicates: false
    when: containerized
  
+- name: Make sure the alertmanager service is down
+  service:
+    name: alertmanager
+    state: stopped
+  failed_when: false
+
+- name: Start alertmanager container
+  docker_container:
+    name: alertmanager
+    image: "{{ alertmanager.container_image }}"
+    state: started
+    command:
+      - '--config.file=/etc/alertmanager/alertmanager.yml'
+      - '--storage.path=/alertmanager'
+    # restart to allow updates
+    restart: true
+    restart_policy: no
+    force_kill: yes
+    published_ports: '9093:9093'
+    detach: true
+    volumes:
+      - "{{ alertmanager.conf_dir }}:/etc/alertmanager:Z"
+      - "{{ alertmanager.data_dir }}:/alertmanager:Z"
+    networks:
+      - name: "{{ docker.network_name }}"
+    keep_volumes: true
+    pull: true
+    cpu_period: "{{ alertmanager.container_cpu_period }}"
+    # As of ansible-2.5.2, this module doesn't support the equivalent of the
+    # --cpus flag, so we must use period/quota for now
+    cpu_quota: "{{ alertmanager.container_cpu_period * alertmanager.container_cpu_cores }}"
+    #memory: 0
+    #memory_swap: 0
+    memory: "{{ alertmanager.container_memory }}GB"
+    memory_swap: "{{ alertmanager.container_memory * 2 }}GB"
+  notify: Service handler
+
  - name: Make sure the prometheus service is down
    service:
      name: prometheus
      state: stopped
    failed_when: false
  
-- name: Start docker container
+- name: Start prometheus docker container
    docker_container:
      name: prometheus
      image: "{{ prometheus.container_image }}"
      state: started
-    command: "--config.file=/prometheus/prometheus.yml"
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
      # restart to allow updates
      restart: true
      restart_policy: no
@@ -24,6 +63,7 @@
      published_ports: '9090:9090'
      detach: true
      volumes:
+      - "{{ prometheus.conf_dir }}:/etc/prometheus:Z"
        - "{{ prometheus.data_dir }}:/prometheus:Z"
      networks:
        - name: "{{ docker.network_name }}"
@@ -40,11 +80,14 @@
      memory_swap: "{{ prometheus.container_memory * 2 }}GB"
    notify: Service handler
  
-- name: Ship systemd service
+- name: Ship systemd services
    copy:
-    src: prometheus.service
+    src: "{{ item }}"
      dest: "/etc/systemd/system/"
      owner: root
      group: root
      mode: 0644
+  with_items:
+    - 'alertmanager.service'
+    - 'prometheus.service'
    notify: Service handler
diff --git a/ansible/roles/ceph-prometheus/templates/alertmanager.yml b/ansible/roles/ceph-prometheus/templates/alertmanager.yml

new file mode 100644 (file)

index 0000000..48ef776
--- /dev/null
+++ b/ansible/roles/ceph-prometheus/templates/alertmanager.yml
@@ -0,0 +1,19 @@
+# Alertmanager configuration rendered by the ceph-prometheus role.
+# All alerts go to the 'ceph-dashboard' receiver, which posts to the dashboard
+# webhook on every host of the 'mgrs' inventory group; if that group is not
+# defined, no webhook is configured instead of failing the template.
+global:
+  resolve_timeout: 5m
+
+route:
+  group_by: ['alertname']
+  group_wait: 10s
+  group_interval: 10s
+  repeat_interval: 1h
+  receiver: 'ceph-dashboard'
+receivers:
+- name: 'ceph-dashboard'
+  webhook_configs:
+{% for host in groups['mgrs'] | default([]) %}
+  - url: 'http://{{ host }}:{{ dashboard.port }}/api/prometheus_receiver'
+{% endfor %}
diff --git a/ansible/roles/ceph-prometheus/templates/prometheus.yml b/ansible/roles/ceph-prometheus/templates/prometheus.yml

index 398127b5bb488aa5e03a8b4062f34756d7ee3285..aaa7173cb78cf99e426229de37384a47c8ab95eb 100644 (file)
--- a/ansible/roles/ceph-prometheus/templates/prometheus.yml
+++ b/ansible/roles/ceph-prometheus/templates/prometheus.yml
@@ -2,6 +2,9 @@ global:
    scrape_interval: 15s
    evaluation_interval: 15s
  
+rule_files:
+  - '/etc/prometheus/alerts/*'
+
  scrape_configs:
    - job_name: 'prometheus'
      static_configs:
@@ -37,3 +40,8 @@ scrape_configs:
            instance: "{{ hostvars[host]['ansible_nodename'] }}"
  {% endfor %}
  {% endif %}
+alerting:
+  alertmanagers:
+  - scheme: http
+    static_configs:
+    - targets: ['alertmanager:9093']
diff --git a/dashboard-ansible.spec.in b/dashboard-ansible.spec.in

index 03b8ba72be15191b75f5005bff56b689a830aed8..c59a5e7e2feb9adb07037d44e58c0ed50e38b240 100644 (file)
--- a/dashboard-ansible.spec.in
+++ b/dashboard-ansible.spec.in
@@ -35,18 +35,16 @@ patch -p1 < patches/0001-ansible-Disable-devel_mode.patch
  sed -i -e 's/devel_mode: true/devel_mode: false/' ansible/roles/*/defaults/main.yml
  
  # Change the prometheus container location/version
-sed -i -e 's|container_image: .*$|container_image: registry.access.redhat.com/openshift3/prometheus:v3.10|' ansible/roles/ceph-prometheus/defaults/main.yml
-#sed -i -e 's|version: .*$|version: v3.9|' ansible/roles/ceph-prometheus/defaults/main.yml
+sed -i -e 's|container_image: prom/prometheus:.*$|container_image: registry.access.redhat.com/openshift3/prometheus:v3.10|' ansible/roles/ceph-prometheus/defaults/main.yml
  
-# Change the grafana container location/version
-sed -i -e 's|container_image: .*$|container_image: registry.access.redhat.com/rhceph/rhceph-3-dashboard-rhel7:3|' ansible/roles/ceph-grafana/defaults/main.yml
-#sed -i -e 's|version: .*$|version: 3|' ansible/roles/ceph-grafana/defaults/main.yml
+# Change the alertmanager container location/version
+sed -i -e 's|container_image: prom/alertmanager:.*$|container_image: registry.access.redhat.com/openshift3/prometheus-alertmanager:v3.10|' ansible/roles/ceph-prometheus/defaults/main.yml
  
  # Change the node_exporter container location/version
  sed -i -e 's|container_image: .*$|container_image: registry.access.redhat.com/openshift3/prometheus-node-exporter:v3.10|' ansible/roles/ceph-node-exporter/defaults/main.yml
  
-# Change the service_name for node_exporter
-#sed -i -e 's|service_name: .*|service_name: prometheus-node-exporter|' ansible/roles/ceph-node-exporter/defaults/main.yml
+# Change the grafana container location/version
+sed -i -e 's|container_image: .*$|container_image: registry.access.redhat.com/rhceph/rhceph-3-dashboard-rhel7:3|' ansible/roles/ceph-grafana/defaults/main.yml
  
  
  %install
author	Boris Ranto <branto@redhat.com>
	Wed, 5 Dec 2018 00:54:27 +0000 (01:54 +0100)
committer	Boris Ranto <branto@redhat.com>
	Wed, 5 Dec 2018 01:45:45 +0000 (02:45 +0100)
ansible/roles/ceph-prometheus/defaults/main.yml		patch \| blob \| history
ansible/roles/ceph-prometheus/files/alertmanager.service	[new file with mode: 0644]	patch \| blob
ansible/roles/ceph-prometheus/files/prometheus.service		patch \| blob \| history
ansible/roles/ceph-prometheus/handlers/main.yml		patch \| blob \| history
ansible/roles/ceph-prometheus/tasks/main.yml		patch \| blob \| history
ansible/roles/ceph-prometheus/tasks/setup_container.yml		patch \| blob \| history
ansible/roles/ceph-prometheus/templates/alertmanager.yml	[new file with mode: 0644]	patch \| blob
ansible/roles/ceph-prometheus/templates/prometheus.yml		patch \| blob \| history
dashboard-ansible.spec.in		patch \| blob \| history