git-server-git.apps.pok.os.sepia.ceph.com Git - cephmetrics.git/commitdiff
Initial support for alertmanager wip-alertmanager
author Boris Ranto <branto@redhat.com>
Wed, 5 Dec 2018 00:54:27 +0000 (01:54 +0100)
committer Boris Ranto <branto@redhat.com>
Wed, 5 Dec 2018 01:45:45 +0000 (02:45 +0100)
Signed-off-by: Boris Ranto <branto@redhat.com>
ansible/roles/ceph-prometheus/defaults/main.yml
ansible/roles/ceph-prometheus/files/alertmanager.service [new file with mode: 0644]
ansible/roles/ceph-prometheus/files/prometheus.service
ansible/roles/ceph-prometheus/handlers/main.yml
ansible/roles/ceph-prometheus/tasks/main.yml
ansible/roles/ceph-prometheus/tasks/setup_container.yml
ansible/roles/ceph-prometheus/templates/alertmanager.yml [new file with mode: 0644]
ansible/roles/ceph-prometheus/templates/prometheus.yml
dashboard-ansible.spec.in

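diff --git a/ansible/roles/ceph-prometheus/defaults/main.yml b/ansible/roles/ceph-prometheus/defaults/main.yml
index b45a5e413f4c3066df681500d31963a73f91dc09..f4806cba3f01ecd0559e7639154207dcfcbf088d 100644
--- a/ansible/roles/ceph-prometheus/defaults/main.yml
+++ b/ansible/roles/ceph-prometheus/defaults/main.yml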
@@ -6,5 +6,14 @@ defaults:
     container_cpu_cores: 2
     # container_memory is in GB
     container_memory: 4
-    data_dir: /var/lib/cephmetrics
+    data_dir: /var/lib/prometheus
+    conf_dir: /etc/prometheus
     user_id: '65534'  # This is the UID used by the prom/prometheus docker image
+  alertmanager:
+    container_image: prom/alertmanager:latest
+    container_cpu_period: 100000
+    container_cpu_cores: 2
+    # container_memory is in GB
+    container_memory: 4
+    data_dir: /var/lib/alertmanager
+    conf_dir: /etc/alertmanager
diff --git a/ansible/roles/ceph-prometheus/files/alertmanager.service b/ansible/roles/ceph-prometheus/files/alertmanager.service
new file mode 100644
index 0000000..2683c23
--- /dev/null
+++ b/ansible/roles/ceph-prometheus/files/alertmanager.service
@@ -0,0 +1,17 @@
+# This file is managed by ansible, don't make changes here - they will be
+# overwritten.
+[Unit]
+Description=alertmanager
+After=docker.service
+
+[Service]
+EnvironmentFile=-/etc/environment
+ExecStart=/usr/bin/docker start --attach alertmanager
+ExecStop=/usr/bin/docker stop alertmanager
+Restart=always
+RestartSec=10s
+TimeoutStartSec=120
+TimeoutStopSec=15
+
+[Install]
+WantedBy=multi-user.target
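diff --git a/ansible/roles/ceph-prometheus/files/prometheus.service b/ansible/roles/ceph-prometheus/files/prometheus.service
index 7b6c8efa83acf240ad168314d10320272e60259a..147093542c9c8985162da379a6208325824c0586 100644
--- a/ansible/roles/ceph-prometheus/files/prometheus.service
+++ b/ansible/roles/ceph-prometheus/files/prometheus.service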
@@ -7,7 +7,7 @@ After=docker.service
 [Service]
 EnvironmentFile=-/etc/environment
 ExecStart=/usr/bin/docker start --attach prometheus
-ExecStop=-/usr/bin/docker stop prometheus
+ExecStop=/usr/bin/docker stop prometheus
 Restart=always
 RestartSec=10s
 TimeoutStartSec=120
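diff --git a/ansible/roles/ceph-prometheus/handlers/main.yml b/ansible/roles/ceph-prometheus/handlers/main.yml
index 421bea57704311656174f601987d3109c11a9495..8fd15fa1a86bd82cef23757b8d6cafa689c3e138 100644
--- a/ansible/roles/ceph-prometheus/handlers/main.yml
+++ b/ansible/roles/ceph-prometheus/handlers/main.yml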
@@ -3,7 +3,10 @@
   # We use the systemd module here so we can use the daemon_reload feature,
   # since we're shipping the .service file ourselves
   systemd:
-    name: prometheus
+    name: "{{ item }}"
     daemon_reload: true
     enabled: true
     state: restarted
+  with_items:
+    - 'alertmanager'
+    - 'prometheus'
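diff --git a/ansible/roles/ceph-prometheus/tasks/main.yml b/ansible/roles/ceph-prometheus/tasks/main.yml
index 813d398767e155d80ee5d134b718bb78b795be42..f1ea6270d1258a547c64de4fd4097902b3da1111 100644
--- a/ansible/roles/ceph-prometheus/tasks/main.yml
+++ b/ansible/roles/ceph-prometheus/tasks/main.yml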
@@ -1,19 +1,38 @@
 ---
 - include: merge_vars.yml
 
-- name: Create prometheus data directory
+- name: Create prometheus directories
   file:
-    path: "{{ prometheus.data_dir }}"
+    path: "{{ item }}"
     state: directory
     owner: "{{ prometheus.user_id }}"
+  with_items:
+   - "{{ prometheus.conf_dir }}"
+   - "{{ prometheus.data_dir }}"
 
-- name: Write config file
+- name: Write prometheus config file
   template:
     src: prometheus.yml
-    dest: "{{ prometheus.data_dir }}/"
+    dest: "{{ prometheus.conf_dir }}/"
     owner: "{{ prometheus.user_id }}"
   notify: Service handler
 
+- name: Create alertmanager directories
+  file:
+    path: "{{ item }}"
+    state: directory
+    owner: "root"
+  with_items:
+   - "{{ alertmanager.conf_dir }}"
+   - "{{ alertmanager.data_dir }}"
+
+- name: Write alertmanager config file
+  template:
+    src: alertmanager.yml
+    dest: "{{ alertmanager.conf_dir }}/"
+    owner: "root"
+  notify: Service handler
+
 - include: setup_container.yml
   when: containerized
 
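diff --git a/ansible/roles/ceph-prometheus/tasks/setup_container.yml b/ansible/roles/ceph-prometheus/tasks/setup_container.yml
index 2fc8a7383ef8f5a3c06ccbfca0b6d7fed96fac84..6e0816d338a2d4b2af209d621d0c322f4c780532 100644
--- a/ansible/roles/ceph-prometheus/tasks/setup_container.yml
+++ b/ansible/roles/ceph-prometheus/tasks/setup_container.yml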
@@ -5,18 +5,57 @@
     allow_duplicates: false
   when: containerized
 
+- name: Make sure the alertmanager service is down
+  service:
+    name: alertmanager
+    state: stopped
+  failed_when: false
+
+- name: Start alertmanager container
+  docker_container:
+    name: alertmanager
+    image: "{{ alertmanager.container_image }}"
+    state: started
+    command:
+      - '--config.file=/etc/alertmanager/alertmanager.yml'
+      - '--storage.path=/alertmanager'
+    # restart to allow updates
+    restart: true
+    restart_policy: no
+    force_kill: yes
+    published_ports: '9093:9093'
+    detach: true
+    volumes:
+      - "{{ alertmanager.conf_dir }}:/etc/alertmanager:Z"
+      - "{{ alertmanager.data_dir }}:/alertmanager:Z"
+    networks:
+      - name: "{{ docker.network_name }}"
+    keep_volumes: true
+    pull: true
+    cpu_period: "{{ alertmanager.container_cpu_period }}"
+    # As of ansible-2.5.2, this module doesn't support the equivalent of the
+    # --cpus flag, so we must use period/quota for now
+    cpu_quota: "{{ alertmanager.container_cpu_period * alertmanager.container_cpu_cores }}"
+    #memory: 0
+    #memory_swap: 0
+    memory: "{{ alertmanager.container_memory }}GB"
+    memory_swap: "{{ alertmanager.container_memory * 2 }}GB"
+  notify: Service handler
+
 - name: Make sure the prometheus service is down
   service:
     name: prometheus
     state: stopped
   failed_when: false
 
-- name: Start docker container
+- name: Start prometheus docker container
   docker_container:
     name: prometheus
     image: "{{ prometheus.container_image }}"
     state: started
-    command: "--config.file=/prometheus/prometheus.yml"
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
     # restart to allow updates
     restart: true
     restart_policy: no
@@ -24,6 +63,7 @@
     published_ports: '9090:9090'
     detach: true
     volumes:
+      - "{{ prometheus.conf_dir }}:/etc/prometheus:Z"
       - "{{ prometheus.data_dir }}:/prometheus:Z"
     networks:
       - name: "{{ docker.network_name }}"
     memory_swap: "{{ prometheus.container_memory * 2 }}GB"
   notify: Service handler
 
-- name: Ship systemd service
+- name: Ship systemd services
   copy:
-    src: prometheus.service
+    src: "{{ item }}"
     dest: "/etc/systemd/system/"
     owner: root
     group: root
     mode: 0644
+  with_items:
+    - 'alertmanager.service'
+    - 'prometheus.service'
   notify: Service handler
diff --git a/ansible/roles/ceph-prometheus/templates/alertmanager.yml b/ansible/roles/ceph-prometheus/templates/alertmanager.yml
new file mode 100644
index 0000000..48ef776
--- /dev/null
+++ b/ansible/roles/ceph-prometheus/templates/alertmanager.yml
@@ -0,0 +1,15 @@
+global:
+  resolve_timeout: 5m
+
+route:
+  group_by: ['alertname']
+  group_wait: 10s
+  group_interval: 10s
+  repeat_interval: 1h
+  receiver: 'ceph-dashboard'
+receivers:
+- name: 'ceph-dashboard'
+  webhook_configs:
+{% for host in groups['mgrs'] %}
+  - url: 'http://{{ host }}:{{ dashboard.port }}/api/prometheus_receiver'
+{% endfor %}
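diff --git a/ansible/roles/ceph-prometheus/templates/prometheus.yml b/ansible/roles/ceph-prometheus/templates/prometheus.yml
index 398127b5bb488aa5e03a8b4062f34756d7ee3285..aaa7173cb78cf99e426229de37384a47c8ab95eb 100644
--- a/ansible/roles/ceph-prometheus/templates/prometheus.yml
+++ b/ansible/roles/ceph-prometheus/templates/prometheus.yml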
@@ -2,6 +2,9 @@ global:
   scrape_interval: 15s
   evaluation_interval: 15s
 
+rule_files:
+  - '/etc/prometheus/alerts/*'
+
 scrape_configs:
   - job_name: 'prometheus'
     static_configs:
@@ -37,3 +40,8 @@ scrape_configs:
           instance: "{{ hostvars[host]['ansible_nodename'] }}"
 {% endfor %}
 {% endif %}
+alerting:
+  alertmanagers:
+  - scheme: http
+    static_configs:
+    - targets: ['alertmanager:9093']
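diff --git a/dashboard-ansible.spec.in b/dashboard-ansible.spec.in
index 03b8ba72be15191b75f5005bff56b689a830aed8..c59a5e7e2feb9adb07037d44e58c0ed50e38b240 100644
--- a/dashboard-ansible.spec.in
+++ b/dashboard-ansible.spec.in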
@@ -35,18 +35,16 @@ patch -p1 < patches/0001-ansible-Disable-devel_mode.patch
 sed -i -e 's/devel_mode: true/devel_mode: false/' ansible/roles/*/defaults/main.yml
 
 # Change the prometheus container location/version
-sed -i -e 's|container_image: .*$|container_image: registry.access.redhat.com/openshift3/prometheus:v3.10|' ansible/roles/ceph-prometheus/defaults/main.yml
-#sed -i -e 's|version: .*$|version: v3.9|' ansible/roles/ceph-prometheus/defaults/main.yml
+sed -i -e 's|container_image: prom/prometheus:.*$|container_image: registry.access.redhat.com/openshift3/prometheus:v3.10|' ansible/roles/ceph-prometheus/defaults/main.yml
 
-# Change the grafana container location/version
-sed -i -e 's|container_image: .*$|container_image: registry.access.redhat.com/rhceph/rhceph-3-dashboard-rhel7:3|' ansible/roles/ceph-grafana/defaults/main.yml
-#sed -i -e 's|version: .*$|version: 3|' ansible/roles/ceph-grafana/defaults/main.yml
+# Change the alertmanager container location/version
+sed -i -e 's|container_image: prom/alertmanager:.*$|container_image: registry.access.redhat.com/openshift3/prometheus-alertmanager:v3.10|' ansible/roles/ceph-prometheus/defaults/main.yml
 
 # Change the node_exporter container location/version
 sed -i -e 's|container_image: .*$|container_image: registry.access.redhat.com/openshift3/prometheus-node-exporter:v3.10|' ansible/roles/ceph-node-exporter/defaults/main.yml
 
-# Change the service_name for node_exporter
-#sed -i -e 's|service_name: .*|service_name: prometheus-node-exporter|' ansible/roles/ceph-node-exporter/defaults/main.yml
+# Change the grafana container location/version
+sed -i -e 's|container_image: .*$|container_image: registry.access.redhat.com/rhceph/rhceph-3-dashboard-rhel7:3|' ansible/roles/ceph-grafana/defaults/main.yml
 
 
 %install