container_cpu_cores: 2
# container_memory is in GB
container_memory: 4
- data_dir: /var/lib/cephmetrics
+ data_dir: /var/lib/prometheus
+ conf_dir: /etc/prometheus
user_id: '65534' # This is the UID used by the prom/prometheus docker image
+ alertmanager:  # defaults consumed by the alertmanager directory, config and container tasks
+ container_image: prom/alertmanager:latest  # the RPM spec rewrites this via sed matching 'prom/alertmanager:' - keep that prefix
+ container_cpu_period: 100000  # multiplied by container_cpu_cores to derive the container's cpu_quota
+ container_cpu_cores: 2
+ # container_memory is in GB; the container's memory_swap is set to twice this value
+ container_memory: 4
+ data_dir: /var/lib/alertmanager  # host directory mounted at /alertmanager in the container
+ conf_dir: /etc/alertmanager  # host directory mounted at /etc/alertmanager; alertmanager.yml is templated into it
--- /dev/null
+# Managed by ansible: do not edit this file by hand, local changes will be
+# overwritten.
+[Unit]
+Description=alertmanager
+# Order this unit after docker.service.
+After=docker.service
+
+[Service]
+# The leading "-" makes a missing environment file non-fatal.
+EnvironmentFile=-/etc/environment
+Restart=always
+RestartSec=10s
+TimeoutStartSec=120
+TimeoutStopSec=15
+# Start the existing "alertmanager" container and stay attached to it, so
+# the docker client process is what systemd supervises.
+ExecStart=/usr/bin/docker start --attach alertmanager
+ExecStop=/usr/bin/docker stop alertmanager
+
+[Install]
+WantedBy=multi-user.target
[Service]
EnvironmentFile=-/etc/environment
ExecStart=/usr/bin/docker start --attach prometheus
-ExecStop=-/usr/bin/docker stop prometheus
+ExecStop=/usr/bin/docker stop prometheus
Restart=always
RestartSec=10s
TimeoutStartSec=120
# We use the systemd module here so we can use the daemon_reload feature,
# since we're shipping the .service file ourselves
systemd:
- name: prometheus
+ name: "{{ item }}"
daemon_reload: true
enabled: true
state: restarted
+ with_items:
+ - 'alertmanager'
+ - 'prometheus'
---
- include: merge_vars.yml
-- name: Create prometheus data directory
+- name: Create prometheus directories
file:
- path: "{{ prometheus.data_dir }}"
+ path: "{{ item }}"
state: directory
owner: "{{ prometheus.user_id }}"
+ with_items:  # both directories are bind-mounted into the prometheus container
+ - "{{ prometheus.conf_dir }}"  # receives the templated prometheus.yml
+ - "{{ prometheus.data_dir }}"  # mounted at /prometheus, the --storage.tsdb.path
-- name: Write config file
+- name: Write prometheus config file
template:
src: prometheus.yml
- dest: "{{ prometheus.data_dir }}/"
+ dest: "{{ prometheus.conf_dir }}/"  # conf_dir is mounted at /etc/prometheus, where --config.file points
owner: "{{ prometheus.user_id }}"
notify: Service handler
+# Host-side directories that are bind-mounted into the alertmanager
+# container: one for its configuration, one for its storage.
+# NOTE(review): these are owned by root, whereas the prometheus directories
+# are owned by prometheus.user_id - confirm the alertmanager image runs as
+# root, otherwise the storage directory will not be writable.
+- name: Create alertmanager directories
+  file:
+    state: directory
+    owner: root
+    path: "{{ item }}"
+  with_items:
+    - "{{ alertmanager.conf_dir }}"
+    - "{{ alertmanager.data_dir }}"
+
+# Render alertmanager.yml into the configuration directory, which the
+# container mounts at /etc/alertmanager. A change notifies "Service handler".
+- name: Write alertmanager config file
+  template:
+    owner: root
+    src: alertmanager.yml
+    dest: "{{ alertmanager.conf_dir }}/"
+  notify: Service handler
+
- include: setup_container.yml
when: containerized
allow_duplicates: false
when: containerized
+# Stop the systemd unit before the container is (re)created.
+# failed_when: false means this task never reports a failure - presumably
+# to tolerate a first run where the unit is not installed yet (confirm).
+- name: Make sure the alertmanager service is down
+  service:
+    state: stopped
+    name: alertmanager
+  failed_when: false
+
+# Create (or recreate) the alertmanager container with its configuration and
+# storage directories mounted from the host, attached to the shared docker
+# network and limited in CPU and memory according to the role defaults.
+- name: Start alertmanager container
+  docker_container:
+    name: alertmanager
+    image: "{{ alertmanager.container_image }}"
+    state: started
+    # These paths are container-side; they match the volume targets below
+    command:
+      - '--config.file=/etc/alertmanager/alertmanager.yml'
+      - '--storage.path=/alertmanager'
+    # restart to allow updates
+    restart: true
+    # Must be quoted: a bare no is parsed by YAML as boolean false rather
+    # than the policy name. Restarting is left to the systemd unit
+    # (Restart=always).
+    restart_policy: 'no'
+    force_kill: true
+    published_ports: '9093:9093'
+    detach: true
+    volumes:
+      - "{{ alertmanager.conf_dir }}:/etc/alertmanager:Z"
+      - "{{ alertmanager.data_dir }}:/alertmanager:Z"
+    networks:
+      - name: "{{ docker.network_name }}"
+    keep_volumes: true
+    pull: true
+    cpu_period: "{{ alertmanager.container_cpu_period }}"
+    # As of ansible-2.5.2, this module doesn't support the equivalent of the
+    # --cpus flag, so we must use period/quota for now
+    cpu_quota: "{{ alertmanager.container_cpu_period * alertmanager.container_cpu_cores }}"
+    #memory: 0
+    #memory_swap: 0
+    # container_memory is expressed in GB; swap is capped at twice that
+    memory: "{{ alertmanager.container_memory }}GB"
+    memory_swap: "{{ alertmanager.container_memory * 2 }}GB"
+  notify: Service handler
+
- name: Make sure the prometheus service is down
service:
name: prometheus
state: stopped
failed_when: false
-- name: Start docker container
+- name: Start prometheus docker container
docker_container:
name: prometheus
image: "{{ prometheus.container_image }}"
state: started
- command: "--config.file=/prometheus/prometheus.yml"
+ command:
+ - '--config.file=/etc/prometheus/prometheus.yml'
+ - '--storage.tsdb.path=/prometheus'
# restart to allow updates
restart: true
restart_policy: no
published_ports: '9090:9090'
detach: true
volumes:
+ - "{{ prometheus.conf_dir }}:/etc/prometheus:Z"
- "{{ prometheus.data_dir }}:/prometheus:Z"
networks:
- name: "{{ docker.network_name }}"
memory_swap: "{{ prometheus.container_memory * 2 }}GB"
notify: Service handler
-- name: Ship systemd service
+- name: Ship systemd services
copy:
- src: prometheus.service
+ src: "{{ item }}"  # one unit file per container
dest: "/etc/systemd/system/"
owner: root
group: root
mode: 0644
+ with_items:  # keep in sync with the unit names restarted by the Service handler
+ - 'alertmanager.service'
+ - 'prometheus.service'
notify: Service handler
--- /dev/null
+# Alertmanager configuration; this file is a template rendered by ansible.
+global:
+  resolve_timeout: 5m
+
+# Every alert goes to the single receiver defined below, grouped by alert
+# name.
+route:
+  receiver: ceph-dashboard
+  group_by:
+    - alertname
+  group_wait: 10s
+  group_interval: 10s
+  repeat_interval: 1h
+
+receivers:
+  - name: ceph-dashboard
+    # One webhook per host in the "mgrs" inventory group.
+    webhook_configs:
+{% for host in groups['mgrs'] %}
+      - url: 'http://{{ host }}:{{ dashboard.port }}/api/prometheus_receiver'
+{% endfor %}
scrape_interval: 15s
evaluation_interval: 15s
+rule_files:
+ - '/etc/prometheus/alerts/*'
+
scrape_configs:
- job_name: 'prometheus'
static_configs:
instance: "{{ hostvars[host]['ansible_nodename'] }}"
{% endfor %}
{% endif %}
+alerting:
+ alertmanagers:
+ - scheme: http
+ static_configs:
+ - targets: ['alertmanager:9093']  # name and published port of the alertmanager container; presumably resolved over the shared docker network - confirm
sed -i -e 's/devel_mode: true/devel_mode: false/' ansible/roles/*/defaults/main.yml
# Change the prometheus container location/version
-sed -i -e 's|container_image: .*$|container_image: registry.access.redhat.com/openshift3/prometheus:v3.10|' ansible/roles/ceph-prometheus/defaults/main.yml
-#sed -i -e 's|version: .*$|version: v3.9|' ansible/roles/ceph-prometheus/defaults/main.yml
+sed -i -e 's|container_image: prom/prometheus:.*$|container_image: registry.access.redhat.com/openshift3/prometheus:v3.10|' ansible/roles/ceph-prometheus/defaults/main.yml
-# Change the grafana container location/version
-sed -i -e 's|container_image: .*$|container_image: registry.access.redhat.com/rhceph/rhceph-3-dashboard-rhel7:3|' ansible/roles/ceph-grafana/defaults/main.yml
-#sed -i -e 's|version: .*$|version: 3|' ansible/roles/ceph-grafana/defaults/main.yml
+# Change the alertmanager container location/version
+sed -i -e 's|container_image: prom/alertmanager:.*$|container_image: registry.access.redhat.com/openshift3/prometheus-alertmanager:v3.10|' ansible/roles/ceph-prometheus/defaults/main.yml
# Change the node_exporter container location/version
sed -i -e 's|container_image: .*$|container_image: registry.access.redhat.com/openshift3/prometheus-node-exporter:v3.10|' ansible/roles/ceph-node-exporter/defaults/main.yml
-# Change the service_name for node_exporter
-#sed -i -e 's|service_name: .*|service_name: prometheus-node-exporter|' ansible/roles/ceph-node-exporter/defaults/main.yml
+# Change the grafana container location/version
+sed -i -e 's|container_image: .*$|container_image: registry.access.redhat.com/rhceph/rhceph-3-dashboard-rhel7:3|' ansible/roles/ceph-grafana/defaults/main.yml
%install