From: Brad Hubbard
Date: Thu, 25 Mar 2021 23:57:14 +0000 (+1000)
Subject: Revert "mgr/dashboard:test prometheus rules through promtool"
X-Git-Tag: v16.2.0~21^2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=334e92b9b4811ab2d8283fca22d523a9ed35beed;p=ceph.git

Revert "mgr/dashboard:test prometheus rules through promtool"

Reverts: https://github.com/ceph/ceph/pull/39983

This is currently blocking testing on Ubuntu on the eve of the Pacific
release. The problems associated with this PR have been resolved upstream,
but the fixes are non-trivial and have not yet been backported.

This reverts commit be7f9e704c8d9ab70713a78c9a83481b5e26ee79.

Signed-off-by: Brad Hubbard
---

diff --git a/install-deps.sh b/install-deps.sh
index 14d64724d9666..73242df8fb8ba 100755
--- a/install-deps.sh
+++ b/install-deps.sh
@@ -304,9 +304,6 @@ else
     case "$ID" in
     debian|ubuntu|devuan|elementary)
         echo "Using apt-get to install dependencies"
-        $SUDO apt install -y docker.io
-        $SUDO systemctl start docker
-        $SUDO systemctl enable docker
         $SUDO apt-get install -y devscripts equivs
         $SUDO apt-get install -y dpkg-dev
         ensure_python3_sphinx_on_ubuntu
@@ -347,9 +344,6 @@ else
     case "$ID" in
     fedora)
        $SUDO dnf install -y dnf-utils
-       $SUDO dnf install -y docker-ce docker-ce-cli containerd.io
-       $SUDO systemctl start docker
-       $SUDO systemctl enable docker
        ;;
     centos|rhel|ol|virtuozzo)
        MAJOR_VERSION="$(echo $VERSION_ID | cut -d. -f1)"
@@ -444,7 +438,6 @@ function preload_wheels_for_tox() {
        mv $wip_wheelhouse wheelhouse
        md5sum $require_files $constraint_files > $md5
     fi
-    popd > /dev/null
 }

diff --git a/monitoring/prometheus/alerts/test_alerts.yml b/monitoring/prometheus/alerts/test_alerts.yml
deleted file mode 100644
index cc246954388fd..0000000000000
--- a/monitoring/prometheus/alerts/test_alerts.yml
+++ /dev/null
@@ -1,774 +0,0 @@
-rule_files:
-  - ceph_default_alerts.yml
-evaluation_interval: 5m
-tests:
-  # health error
-  - interval: 5m
-    input_series:
-      - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
-        values: '2 2 2 2 2 2 2'
-    promql_expr_test:
-      - expr: ceph_health_status == 2
-        eval_time: 5m
-        exp_samples:
-          - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
-            value: 2
-    alert_rule_test:
-      - eval_time: 1m
-        alertname: health error
-      - eval_time: 6m
-        alertname: health error
-        exp_alerts:
-          - exp_labels:
-              instance: ceph:9283
-              job: ceph
-              oid: 1.3.6.1.4.1.50495.15.1.2.2.1
-              type: ceph_default
-              severity: critical
-            exp_annotations:
-              description: >
-                Ceph in HEALTH_ERROR state for more than 5 minutes.
-                Please check "ceph health detail" for more information.
-
-  # health warning
-  - interval: 5m
-    input_series:
-      - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
-        values: '1 1 1 1 1 1 1 1 1 1'
-    promql_expr_test:
-      - expr: ceph_health_status == 1
-        eval_time: 15m
-        exp_samples:
-          - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
-            value: 1
-    alert_rule_test:
-      - eval_time: 10m
-        alertname: health warn
-      - eval_time: 20m
-        alertname: health warn
-        exp_alerts:
-          - exp_labels:
-              instance: ceph:9283
-              job: ceph
-              oid: 1.3.6.1.4.1.50495.15.1.2.2.2
-              type: ceph_default
-              severity: warning
-            exp_annotations:
-              description: >
-                Ceph has been in HEALTH_WARN for more than 15 minutes.
-                Please check "ceph health detail" for more information.
-
-  # low monitor quorum count
-  - interval: 1m
-    input_series:
-      - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a",instance="ceph:9283", job="ceph"}'
-        values: '1 1 1 1 1'
-      - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b",instance="ceph:9283", job="ceph"}'
-        values: '1 1 1 1 1'
-      - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c",instance="ceph:9283", job="ceph"}'
-        values: '0 0 0 0 0'
-      - series: 'ceph_mon_metadata{ceph_daemon="mon.a",ceph_version="ceph version 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",hostname="ceph",instance="ceph:9283",job="ceph", public_addr="172.20.0.2",rank="0"}'
-        values: '1 1 1 1 1'
-      - series: 'ceph_mon_metadata{ceph_daemon="mon.b",ceph_version="ceph version 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",hostname="ceph",instance="ceph:9283",job="ceph", public_addr="172.20.0.2",rank="1"}'
-        values: '1 1 1 1 1'
-      - series: 'ceph_mon_metadata{ceph_daemon="mon.c",ceph_version="ceph version 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",hostname="ceph",instance="ceph:9283",job="ceph", public_addr="172.20.0.2",rank="2"}'
-        values: '1 1 1 1 1'
-    promql_expr_test:
-      - expr: sum(ceph_mon_quorum_status) < 3
-        eval_time: 1m
-        exp_samples:
-          - labels: '{}'
-            value: 2
-    alert_rule_test:
-      - eval_time: 1m
-        alertname: low monitor quorum count
-        exp_alerts:
-          - exp_labels:
-              oid: 1.3.6.1.4.1.50495.15.1.2.3.1
-              type: ceph_default
-              severity: critical
-            exp_annotations:
-              description: |
-                Monitor count in quorum is below three.
-
-                Only 2 of 3 monitors are active.
-
-                The following monitors are down:
-                  - mon.c on ceph
-
-
-  # 10% OSDs down
-  - interval: 1m
-    input_series:
-      - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
-        values: '1 1 1 1 1'
-      - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
-        values: '0 0 0 0 0'
-      - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
-        values: '1 1 1 1 1'
-      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", ceph_version="ceph version 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", public_addr="172.20.0.2"}'
-        values: '1 1 1 1 1'
-      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", ceph_version="ceph version 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", public_addr="172.20.0.2"}'
-        values: '1 1 1 1 1'
-      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", ceph_version="ceph version 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", public_addr="172.20.0.2"}'
-        values: '1 1 1 1 1'
-    promql_expr_test:
-      - expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
-        eval_time: 1m
-        exp_samples:
-          - labels: '{}'
-            value: 3.333333333333333E+01
-    alert_rule_test:
-      - eval_time: 1m
-        alertname: 10% OSDs down
-        exp_alerts:
-          - exp_labels:
-              oid: 1.3.6.1.4.1.50495.15.1.2.4.1
-              type: ceph_default
-              severity: critical
-            exp_annotations:
-              description: |
-                33.33% or 1 of 3 OSDs are down (≥ 10%).
-
-                The following OSDs are down:
-                  - osd.1 on ceph
-
-  # OSD down
-  - interval: 1m
-    input_series:
-      - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
-        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-      - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
-        values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
-      - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
-        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", ceph_version="ceph version 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", public_addr="172.20.0.2"}'
-        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", ceph_version="ceph version 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", public_addr="172.20.0.2"}'
-        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", ceph_version="ceph version 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", public_addr="172.20.0.2"}'
-        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-    promql_expr_test:
-      - expr: count(ceph_osd_up == 0) > 0
-        eval_time: 1m
-        exp_samples:
-          - labels: '{}'
-            value: 1
-    alert_rule_test:
-      - eval_time: 15m
-        alertname: OSD down
-        exp_alerts:
-          - exp_labels:
-              oid: 1.3.6.1.4.1.50495.15.1.2.4.2
-              type: ceph_default
-              severity: warning
-            exp_annotations:
-              description: |
-
-                1 OSD down for more than 15 minutes.
-
-                1 of 3 OSDs are down.
-
-                The following OSD is down:
-                  - osd.1 on ceph
-
-  # OSDs near full
-  - interval: 1m
-    input_series:
-      - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.0",instance="ceph:9283" ,job="ceph"}'
-        values: '1076310016 1076310016 1076310016 1076310016 1076310016 1076310016'
-      - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.1",instance="ceph:9283" ,job="ceph"}'
-        values: '1076310016 1076310016 1076310016 1076310016 1076310016 1076310016'
-      - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.2",instance="ceph:9283" ,job="ceph"}'
-        values: '1076310016 1076310016 1076310016 1076310016 1076310016 106447810032'
-      - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.0",instance="ceph:9283" ,job="ceph"}'
-        values: '108447916032 108447916032 108447916032 108447916032 108447916032 108447916032'
-      - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.1",instance="ceph:9283" ,job="ceph"}'
-        values: '108447916032 108447916032 108447916032 108447916032 108447916032 108447916032'
-      - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.2",instance="ceph:9283" ,job="ceph"}'
-        values: '108447916032 108447916032 108447916032 108447916032 108447916032 108447916032'
-      - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
-        values: '1 1 1 1 1 1'
-      - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
-        values: '1 1 1 1 1 1'
-      - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
-        values: '1 1 1 1 1 1'
-      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", ceph_version="ceph version 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", public_addr="172.20.0.2"}'
-        values: '1 1 1 1 1 1'
-      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", ceph_version="ceph version 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", public_addr="172.20.0.2"}'
-        values: '1 1 1 1 1 1'
-      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", ceph_version="ceph version 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", public_addr="172.20.0.2"}'
-        values: '1 1 1 1 1 1'
-    promql_expr_test:
-      - expr: |
-          (
-            ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon)
-            ceph_osd_up == 1) * on(ceph_daemon) group_left(hostname)
-            ceph_osd_metadata
-          ) * 100 > 90
-
-        eval_time: 5m
-        exp_samples:
-          - labels: '{ceph_daemon="osd.2",hostname="ceph",instance="ceph:9283", job="ceph"}'
-            value: 9.815569899986845E+01
-    alert_rule_test:
-      - eval_time: 10m
-        alertname: OSDs near full
-        exp_alerts:
-          - exp_labels:
-              ceph_daemon: osd.2
-              hostname: ceph
-              instance: ceph:9283
-              job: ceph
-              oid: 1.3.6.1.4.1.50495.15.1.2.4.3
-              type: ceph_default
-              severity: critical
-            exp_annotations:
-              description: >
-                OSD osd.2 on ceph is dangerously full: 98.16%
-
-  # flapping OSD
-  - interval: 1s
-    input_series:
-      - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
-        values: '1+1x100'
-      - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
-        values: '1+0x100'
-      - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
-        values: '1+0x100'
-      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", ceph_version="ceph version 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", public_addr="172.20.0.2"}'
-        values: '1 1 1 1 1 1'
-      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", ceph_version="ceph version 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", public_addr="172.20.0.2"}'
-        values: '1 1 1 1 1 1'
-      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", ceph_version="ceph version 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", public_addr="172.20.0.2"}'
-        values: '1 1 1 1 1 1'
-    promql_expr_test:
-      - expr: |
-          (
-            rate(ceph_osd_up[5m])
-            * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
-          ) * 60 > 1
-        eval_time: 1m
-        exp_samples:
-          - labels: '{ceph_daemon="osd.0", hostname="ceph", instance="ceph:9283", job="ceph"}'
-            value: 1.2200000000000001E+01
-    alert_rule_test:
-      - eval_time: 5m
-        alertname: flapping OSD
-        exp_alerts:
-          - exp_labels:
-              ceph_daemon: osd.0
-              hostname: ceph
-              instance: ceph:9283
-              job: ceph
-              oid: 1.3.6.1.4.1.50495.15.1.2.4.4
-              severity: warning
-              type: ceph_default
-            exp_annotations:
-              description: >
-                OSD osd.0 on ceph was
-                marked down and back up at 20.1 times once a
-                minute for 5 minutes.
-
-  # high pg count deviation
-  - interval: 1m
-    input_series:
-      - series: 'ceph_osd_numpg{ceph_daemon="osd.0",instance="ceph:9283", job="ceph"}'
-        values: '169 169 169 169 169 169'
-      - series: 'ceph_osd_numpg{ceph_daemon="osd.1",instance="ceph:9283", job="ceph"}'
-        values: '169 169 169 169 169 90'
-      - series: 'ceph_osd_numpg{ceph_daemon="osd.2",instance="ceph:9283", job="ceph"}'
-        values: '169 169 169 169 169 169'
-      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", ceph_version="ceph version 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", public_addr="172.20.0.2"}'
-        values: '1 1 1 1 1 1'
-      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", ceph_version="ceph version 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", public_addr="172.20.0.2"}'
-        values: '1 1 1 1 1 1'
-      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", ceph_version="ceph version 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", public_addr="172.20.0.2"}'
-        values: '1 1 1 1 1 1'
-    promql_expr_test:
-      - expr: |
-          abs(
-            (
-              (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0)
-              by (job)
-            ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
-          ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
-
-        eval_time: 5m
-        exp_samples:
-          - labels: '{ceph_daemon="osd.1", hostname="ceph", instance="ceph:9283", job="ceph"}'
-            value: 3.691588785046729E-01
-    alert_rule_test:
-      - eval_time: 10m
-        alertname: high pg count deviation
-        exp_alerts:
-          - exp_labels:
-              ceph_daemon: osd.1
-              hostname: ceph
-              instance: ceph:9283
-              job: ceph
-              oid: 1.3.6.1.4.1.50495.15.1.2.4.5
-              severity: warning
-              type: ceph_default
-            exp_annotations:
-              description: >
-                OSD osd.1 on ceph deviates
-                by more than 30% from average PG count.
-
-  # pgs inactive
-  - interval: 1m
-    input_series:
-      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", name="device_health_metrics",pool_id="1"}'
-        values: '1 1 1 1 1 1 1 1'
-      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", name="device_health_metrics",pool_id="2"}'
-        values: '1 1 1 1 1 1 1 1'
-      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", name="device_health_metrics",pool_id="3"}'
-        values: '1 1 1 1 1 1 1 1'
-      - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
-        values: '1 1 1 1 1 1 1 1'
-      - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
-        values: '32 32 32 32 32 32 32 32'
-      - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
-        values: '33 32 32 32 32 33 33 32'
-      - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="1"}'
-        values: '1 1 1 1 1 1 1 1 1'
-      - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="2"}'
-        values: '32 32 32 32 32 32 32 32'
-      - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="3"}'
-        values: '32 32 32 32 32 32 32 32'
-    promql_expr_test:
-      - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
-          (ceph_pg_total - ceph_pg_active) > 0
-        eval_time: 5m
-        exp_samples:
-          - labels: '{instance="ceph:9283", job="ceph", name="device_health_metrics", pool_id="3"}'
-            value: 1
-    alert_rule_test:
-      - eval_time: 5m
-        alertname: pgs inactive
-        exp_alerts:
-          - exp_labels:
-              instance: ceph:9283
-              job: ceph
-              name: device_health_metrics
-              oid: 1.3.6.1.4.1.50495.15.1.2.7.1
-              pool_id: 3
-              severity: critical
-              type: ceph_default
-            exp_annotations:
-              description: >
-                1 PGs have been inactive for more than 5 minutes in pool
-                device_health_metrics.
-                Inactive placement groups aren't able to serve read/write
-                requests.
-
-  # pgs unclean
-  - interval: 1m
-    input_series:
-      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", name="device_health_metrics",pool_id="1"}'
-        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", name="device_health_metrics",pool_id="2"}'
-        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", name="device_health_metrics",pool_id="3"}'
-        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-      - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
-        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-      - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
-        values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32'
-      - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
-        values: '33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33'
-      - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="1"}'
-        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-      - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="2"}'
-        values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32'
-      - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="3"}'
-        values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32'
-    promql_expr_test:
-      - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
-          (ceph_pg_total - ceph_pg_clean) > 0
-        eval_time: 15m
-        exp_samples:
-          - labels: '{instance="ceph:9283", job="ceph", name="device_health_metrics", pool_id="3"}'
-            value: 1
-    alert_rule_test:
-      - eval_time: 16m
-        alertname: pgs unclean
-        exp_alerts:
-          - exp_labels:
-              instance: ceph:9283
-              job: ceph
-              name: device_health_metrics
-              oid: 1.3.6.1.4.1.50495.15.1.2.7.2
-              pool_id: 3
-              severity: warning
-              type: ceph_default
-            exp_annotations:
-              description: >
-                1 PGs haven't been clean for more than 15 minutes in pool
-                device_health_metrics.
-                Unclean PGs haven't been able to completely recover from a
-                previous failure.
-
-  # root volume full
-  - interval: 1m
-    input_series:
-      - series: 'node_filesystem_avail_bytes{device="/dev/mapper/fedora_localhost --live-home",fstype="ext4",instance="node-exporter",job="node-exporter", mountpoint="/"}'
-        values: '35336400896 35336400896 35336400896 35336400896 35336400896 3533640089 3533640089'
-      - series: 'node_filesystem_size_bytes{device="/dev/mapper/fedora_localhost --live-home",fstype="ext4",instance="node-exporter",job="node-exporter", mountpoint="/"}'
-        values: '73445531648 73445531648 73445531648 73445531648 73445531648 73445531648 73445531648'
-    promql_expr_test:
-      - expr: node_filesystem_avail_bytes{mountpoint="/"} /
-          node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
-        eval_time: 5m
-        exp_samples:
-          - labels: '{device="/dev/mapper/fedora_localhost --live-home", fstype="ext4", instance="node-exporter", job="node-exporter", mountpoint="/"}'
-            value: 4.8112390362092565E+00
-    alert_rule_test:
-      - eval_time: 10m
-        alertname: root volume full
-        exp_alerts:
-          - exp_labels:
-              device: /dev/mapper/fedora_localhost --live-home
-              fstype: ext4
-              instance: node-exporter
-              job: node-exporter
-              mountpoint: /
-              oid: 1.3.6.1.4.1.50495.15.1.2.8.1
-              severity: critical
-              type: ceph_default
-            exp_annotations:
-              description: >
-                Root volume (OSD and MON store) is dangerously full: 4.811% free.
-
-  # network packets dropped
-  - interval: 1s
-    input_series:
-      - series: 'node_network_receive_drop_total{device="eth0", instance="node-exporter",job="node-exporter"}'
-        values: '1+1x500'
-      - series: 'node_network_transmit_drop_total{device="eth0", instance="node-exporter",job="node-exporter"}'
-        values: '1+1x500'
-    promql_expr_test:
-      - expr: |
-          (
-            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
-            increase(node_network_transmit_drop_total{device!="lo"}[1m])
-          ) / (
-            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
-            increase(node_network_transmit_packets_total{device!="lo"}[1m])
-          ) >= 0.0001 or (
-            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
-            increase(node_network_transmit_drop_total{device!="lo"}[1m])
-          ) >= 10
-
-        eval_time: 5m
-        exp_samples:
-          - labels: '{device="eth0", instance="node-exporter", job="node-exporter"}'
-            value: 1.2E+02
-    alert_rule_test:
-      - eval_time: 5m
-        alertname: network packets dropped
-        exp_alerts:
-          - exp_labels:
-              device: eth0
-              instance: node-exporter
-              job: node-exporter
-              oid: 1.3.6.1.4.1.50495.15.1.2.8.2
-              severity: warning
-              type: ceph_default
-            exp_annotations:
-              description: >
-                Node node-exporter experiences packet drop > 0.01% or >
-                10 packets/s on interface eth0.
-
-  # network packets errors
-  - interval: 1s
-    input_series:
-      - series: 'node_network_receive_errs_total{device="eth0", instance="node-exporter",job="node-exporter"}'
-        values: '1+1x500'
-      - series: 'node_network_transmit_errs_total{device="eth0", instance="node-exporter",job="node-exporter"}'
-        values: '1+1x500'
-    promql_expr_test:
-      - expr: |
-          (
-            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
-            increase(node_network_transmit_errs_total{device!="lo"}[1m])
-          ) / (
-            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
-            increase(node_network_transmit_packets_total{device!="lo"}[1m])
-          ) >= 0.0001 or (
-            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
-            increase(node_network_transmit_errs_total{device!="lo"}[1m])
-          ) >= 10
-
-        eval_time: 5m
-        exp_samples:
-          - labels: '{device="eth0", instance="node-exporter", job="node-exporter"}'
-            value: 1.2E+02
-    alert_rule_test:
-      - eval_time: 5m
-        alertname: network packet errors
-        exp_alerts:
-          - exp_labels:
-              device: eth0
-              instance: node-exporter
-              job: node-exporter
-              oid: 1.3.6.1.4.1.50495.15.1.2.8.3
-              severity: warning
-              type: ceph_default
-            exp_annotations:
-              description: >
-                Node node-exporter experiences packet errors > 0.01% or > 10
-                packets/s on interface eth0.
-
-  # MTU Mismatch
-  - interval: 1m
-    input_series:
-      - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter", job="node-exporter"}'
-        values: '1500 1500 1500 1500 1500'
-      - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter", job="node-exporter"}'
-        values: '1500 1500 1500 1500 1500'
-      - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter", job="node-exporter"}'
-        values: '1500 1500 1500 1500 1500'
-      - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter", job="node-exporter"}'
-        values: '1500 1500 1500 1500 1500'
-      - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter", job="node-exporter"}'
-        values: '9000 9000 9000 9000 9000'
-    promql_expr_test:
-      - expr: node_network_mtu_bytes{device!="lo"} != on() group_left()
-          (quantile(0.5, node_network_mtu_bytes{device!="lo"}))
-        eval_time: 1m
-        exp_samples:
-          - labels: '{__name__="node_network_mtu_bytes", device="eth4", instance="node-exporter", job="node-exporter"}'
-            value: 9000
-    alert_rule_test:
-      - eval_time: 1m
-        alertname: MTU Mismatch
-        exp_alerts:
-          - exp_labels:
-              device: eth4
-              instance: node-exporter
-              job: node-exporter
-              oid: 1.3.6.1.4.1.50495.15.1.2.8.5
-              severity: warning
-              type: ceph_default
-            exp_annotations:
-              description: >
-                Node node-exporter has a different MTU size (9000)
-                than the median value on device eth4.
-
-  # pool full
-  - interval: 1m
-    input_series:
-      - series: 'ceph_pool_stored{instance="ceph:9283",job="ceph",pool_id="1"}'
-        values: '0 0 0 0 0 0 0 0 0'
-      - series: 'ceph_pool_stored{instance="ceph:9283",job="ceph",pool_id="2"}'
-        values: '1850 1850 1850 1850 1850 1850 1850'
-      - series: 'ceph_pool_stored{instance="ceph:9283",job="ceph",pool_id="3"}'
-        values: '10628706304000 10628706304000 23524 23524 23524 23524 23524 23524 23524'
-      - series: 'ceph_pool_max_avail{instance="ceph:9283",job="ceph",pool_id="1"}'
-        values: '106287063040 106287063040 106287063040 106287063040 106287063040 106287063040 106287063040'
-      - series: 'ceph_pool_max_avail{instance="ceph:9283",job="ceph",pool_id="2"}'
-        values: '106287063040 106287063040 106287063040 106287063040 106287063040 106287063040 106287063040'
-      - series: 'ceph_pool_max_avail{instance="ceph:9283",job="ceph",pool_id="3"}'
-        values: '106287063040 1 106287063040 106287063040 106287063040 106287063040 106287063040'
-      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", name="device_health_metrics",pool_id="1"}'
-        values: '1 1 1 1 1 1 1 1 1'
-      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", name=".rgw.root",pool_id="2"}'
-        values: '1 1 1 1 1 1 1 1 1'
-      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", name="default.rgw.log",pool_id="3"}'
-        values: '1 1 1 1 1 1 1 1 1'
-    promql_expr_test:
-      - expr: |
-          ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)
-          * on(pool_id) group_right ceph_pool_metadata * 100 > 90
-
-        eval_time: 1m
-        exp_samples:
-          - labels: '{instance="ceph:9283", job="ceph", name="default.rgw.log", pool_id="3"}'
-            value: 9.999999999999059E+01
-    alert_rule_test:
-      - eval_time: 2m
-        alertname: pool full
-        exp_alerts:
-          - exp_labels:
-              instance: ceph:9283
-              job: ceph
-              name: default.rgw.log
-              oid: 1.3.6.1.4.1.50495.15.1.2.9.1
-              pool_id: 3
-              severity: critical
-              type: ceph_default
-            exp_annotations:
-              description: Pool default.rgw.log at 99.01% capacity.
-
-  # slow OSD ops
-  - interval : 1m
-    input_series:
-      - series: 'ceph_healthcheck_slow_ops{instance="ceph:9283",job="ceph"}'
-        values: '1+0x120'
-    promql_expr_test:
-      - expr: ceph_healthcheck_slow_ops > 0
-        eval_time: 1m
-        exp_samples:
-          - labels: '{__name__="ceph_healthcheck_slow_ops", instance="ceph:9283", job="ceph"}'
-            value: 1
-    alert_rule_test:
-      - eval_time: 20m
-        alertname: Slow OSD Ops
-        exp_alerts:
-          - exp_labels:
-              instance: ceph:9283
-              job: ceph
-              severity: warning
-              type: ceph_default
-            exp_annotations:
-              description: >
-                1 OSD requests are taking too long to process
-                (osd_op_complaint_time exceeded)

diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index cae51bb984054..2808f036f44c3 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -591,8 +591,6 @@ add_ceph_test(run-cli-tests ${CMAKE_CURRENT_SOURCE_DIR}/run-cli-tests)

 add_ceph_test(smoke.sh ${CMAKE_CURRENT_SOURCE_DIR}/smoke.sh)

-add_ceph_test(run-promtool-unittests.sh ${CMAKE_CURRENT_SOURCE_DIR}/run-promtool-unittests.sh)
-
 set_property(
   TEST ${tox_tests}
   PROPERTY ENVIRONMENT ${env_vars_for_tox_tests})

diff --git a/src/test/run-promtool-unittests.sh b/src/test/run-promtool-unittests.sh
deleted file mode 100755
index 84fc1d7d3ccd9..0000000000000
--- a/src/test/run-promtool-unittests.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env bash
-
-set -ex
-
-SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
-: ${CEPH_ROOT:=$SCRIPTPATH/../../}
-
-sudo docker run --rm \
-    -v "$CEPH_ROOT":/ceph \
-    --name=promtool \
-    --network=host \
-    dnanexus/promtool:2.9.2 \
-    test rules /ceph/monitoring/prometheus/alerts/test_alerts.yml
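For anyone who wants to keep exercising these alert rules while the revert is
in place, the docker dependency is not strictly necessary: promtool can be run
directly. A minimal sketch, assuming a checkout that still contains
monitoring/prometheus/alerts/test_alerts.yml and a promtool binary on $PATH
that supports the "test rules" subcommand (the dnanexus/promtool:2.9.2 image
pinned above ships one):

    #!/usr/bin/env bash

    set -ex

    SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
    : ${CEPH_ROOT:=$SCRIPTPATH/../../}

    # Run the rule unit tests with a local promtool instead of the docker
    # image; "test rules" evaluates the promql_expr_test and alert_rule_test
    # cases in the given file against the rules it references.
    promtool test rules "$CEPH_ROOT"/monitoring/prometheus/alerts/test_alerts.yml

Because this needs no docker packages, nothing would have to be added back to
install-deps.sh; wiring such a script into the build is left to the pending
upstream fixes mentioned above.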