git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
ceph-mixin: fix CephNodeNetworkPacket alerts 47707/head
author Aswin Toni <aswin.toni@cern.ch>
Tue, 23 Aug 2022 08:30:12 +0000 (10:30 +0200)
committer Aswin Toni <aswin.toni@cern.ch>
Tue, 23 Aug 2022 13:26:52 +0000 (15:26 +0200)
Signed-off-by: Aswin Toni <aswin.toni@cern.ch>
monitoring/ceph-mixin/config.libsonnet
monitoring/ceph-mixin/prometheus_alerts.libsonnet
monitoring/ceph-mixin/prometheus_alerts.yml
monitoring/ceph-mixin/tests_alerts/test_alerts.yml

index 7c3216b36d489296bb960e34783080ac43fe49c7..7ee1210b043310cf23652ebc95191e43ac4b9252 100644 (file)
@@ -4,5 +4,8 @@
 
     clusterLabel: 'cluster',
     showMultiCluster: false,
+
+    CephNodeNetworkPacketDropsThreshold: 0.005,
+    CephNodeNetworkPacketDropsPerSec: 10,
   },
 }
index 6107c3e54d47f3006985a8c84e46205be0b4611f..8927305b11f3d3aa06498b277a7f1005bdf222e3 100644 (file)
         },
         {
           alert: 'CephNodeNetworkPacketDrops',
-          expr: '(  increase(node_network_receive_drop_total{device!="lo"}[1m]) +  increase(node_network_transmit_drop_total{device!="lo"}[1m])) / (  increase(node_network_receive_packets_total{device!="lo"}[1m]) +  increase(node_network_transmit_packets_total{device!="lo"}[1m])) >= 0.0001 or (  increase(node_network_receive_drop_total{device!="lo"}[1m]) +  increase(node_network_transmit_drop_total{device!="lo"}[1m])) >= 10',
+          expr: |||
+            (
+              rate(node_network_receive_drop_total{device!="lo"}[1m]) +
+              rate(node_network_transmit_drop_total{device!="lo"}[1m])
+            ) / (
+              rate(node_network_receive_packets_total{device!="lo"}[1m]) +
+              rate(node_network_transmit_packets_total{device!="lo"}[1m])
+            ) >= %(CephNodeNetworkPacketDropsThreshold)s and (
+              rate(node_network_receive_drop_total{device!="lo"}[1m]) +
+              rate(node_network_transmit_drop_total{device!="lo"}[1m])
+            ) >= %(CephNodeNetworkPacketDropsPerSec)s
+          ||| % $._config,
           labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.2' },
           annotations: {
             summary: 'One or more NICs reports packet drops%(cluster)s' % $.MultiClusterSummary(),
-            description: 'Node {{ $labels.instance }} experiences packet drop > 0.01% or > 10 packets/s on interface {{ $labels.device }}.',
+            description: 'Node {{ $labels.instance }} experiences packet drop > %(CephNodeNetworkPacketDropsThreshold)s%% or > %(CephNodeNetworkPacketDropsPerSec)s packets/s on interface {{ $labels.device }}.' % { CephNodeNetworkPacketDropsThreshold: $._config.CephNodeNetworkPacketDropsThreshold * 100, CephNodeNetworkPacketDropsPerSec: $._config.CephNodeNetworkPacketDropsPerSec },
           },
         },
         {
           alert: 'CephNodeNetworkPacketErrors',
-          expr: '(  increase(node_network_receive_errs_total{device!="lo"}[1m]) +  increase(node_network_transmit_errs_total{device!="lo"}[1m])) / (  increase(node_network_receive_packets_total{device!="lo"}[1m]) +  increase(node_network_transmit_packets_total{device!="lo"}[1m])) >= 0.0001 or (  increase(node_network_receive_errs_total{device!="lo"}[1m]) +  increase(node_network_transmit_errs_total{device!="lo"}[1m])) >= 10',
+          expr: |||
+            (
+              rate(node_network_receive_errs_total{device!="lo"}[1m]) +
+              rate(node_network_transmit_errs_total{device!="lo"}[1m])
+            ) / (
+              rate(node_network_receive_packets_total{device!="lo"}[1m]) +
+              rate(node_network_transmit_packets_total{device!="lo"}[1m])
+            ) >= 0.0001 or (
+              rate(node_network_receive_errs_total{device!="lo"}[1m]) +
+              rate(node_network_transmit_errs_total{device!="lo"}[1m])
+            ) >= 10
+          |||,
           labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.3' },
           annotations: {
             summary: 'One or more NICs reports packet errors%(cluster)s' % $.MultiClusterSummary(),
index 7602d48a414d78e25221a9071b576cb695975999..a544d41eb0ee0ac15806a52c93ea5c36856c20cf 100644 (file)
@@ -459,9 +459,19 @@ groups:
           type: "ceph_default"
       - alert: "CephNodeNetworkPacketDrops"
         annotations:
-          description: "Node {{ $labels.instance }} experiences packet drop > 0.01% or > 10 packets/s on interface {{ $labels.device }}."
+          description: "Node {{ $labels.instance }} experiences packet drop > 0.5% or > 10 packets/s on interface {{ $labels.device }}."
           summary: "One or more NICs reports packet drops"
-        expr: "(  increase(node_network_receive_drop_total{device!=\"lo\"}[1m]) +  increase(node_network_transmit_drop_total{device!=\"lo\"}[1m])) / (  increase(node_network_receive_packets_total{device!=\"lo\"}[1m]) +  increase(node_network_transmit_packets_total{device!=\"lo\"}[1m])) >= 0.0001 or (  increase(node_network_receive_drop_total{device!=\"lo\"}[1m]) +  increase(node_network_transmit_drop_total{device!=\"lo\"}[1m])) >= 10"
+        expr: |
+          (
+            rate(node_network_receive_drop_total{device!="lo"}[1m]) +
+            rate(node_network_transmit_drop_total{device!="lo"}[1m])
+          ) / (
+            rate(node_network_receive_packets_total{device!="lo"}[1m]) +
+            rate(node_network_transmit_packets_total{device!="lo"}[1m])
+          ) >= 0.0050000000000000001 and (
+            rate(node_network_receive_drop_total{device!="lo"}[1m]) +
+            rate(node_network_transmit_drop_total{device!="lo"}[1m])
+          ) >= 10
         labels:
           oid: "1.3.6.1.4.1.50495.1.2.1.8.2"
           severity: "warning"
@@ -470,7 +480,17 @@ groups:
         annotations:
           description: "Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}."
           summary: "One or more NICs reports packet errors"
-        expr: "(  increase(node_network_receive_errs_total{device!=\"lo\"}[1m]) +  increase(node_network_transmit_errs_total{device!=\"lo\"}[1m])) / (  increase(node_network_receive_packets_total{device!=\"lo\"}[1m]) +  increase(node_network_transmit_packets_total{device!=\"lo\"}[1m])) >= 0.0001 or (  increase(node_network_receive_errs_total{device!=\"lo\"}[1m]) +  increase(node_network_transmit_errs_total{device!=\"lo\"}[1m])) >= 10"
+        expr: |
+          (
+            rate(node_network_receive_errs_total{device!="lo"}[1m]) +
+            rate(node_network_transmit_errs_total{device!="lo"}[1m])
+          ) / (
+            rate(node_network_receive_packets_total{device!="lo"}[1m]) +
+            rate(node_network_transmit_packets_total{device!="lo"}[1m])
+          ) >= 0.0001 or (
+            rate(node_network_receive_errs_total{device!="lo"}[1m]) +
+            rate(node_network_transmit_errs_total{device!="lo"}[1m])
+          ) >= 10
         labels:
           oid: "1.3.6.1.4.1.50495.1.2.1.8.3"
           severity: "warning"
index 847cf3b7e3d348ea4e683642b9f0425d74246e46..7b7e7db7301bddc9f0d55e2ca2b56198fb89652c 100644 (file)
@@ -375,32 +375,38 @@ tests:
            description: "Root volume is dangerously full: 4.811% free."
 
  # network packets dropped
- - interval: 1s
+ - interval: 1m
    input_series:
     - series: 'node_network_receive_drop_total{device="eth0",
       instance="node-exporter",job="node-exporter"}'
-      values: '1+1x500'
+      values: '0+600x10'
     - series: 'node_network_transmit_drop_total{device="eth0",
       instance="node-exporter",job="node-exporter"}'
-      values: '1+1x500'
+      values: '0+600x10'
+    - series: 'node_network_receive_packets_total{device="eth0",
+      instance="node-exporter",job="node-exporter"}'
+      values: '0+750x10'
+    - series: 'node_network_transmit_packets_total{device="eth0",
+      instance="node-exporter",job="node-exporter"}'
+      values: '0+750x10'
    promql_expr_test:
      - expr: |
          (
-           increase(node_network_receive_drop_total{device!="lo"}[1m]) +
-           increase(node_network_transmit_drop_total{device!="lo"}[1m])
+           rate(node_network_receive_drop_total{device!="lo"}[1m]) +
+           rate(node_network_transmit_drop_total{device!="lo"}[1m])
          ) / (
-           increase(node_network_receive_packets_total{device!="lo"}[1m]) +
-           increase(node_network_transmit_packets_total{device!="lo"}[1m])
-         ) >= 0.0001 or (
-           increase(node_network_receive_drop_total{device!="lo"}[1m]) +
-           increase(node_network_transmit_drop_total{device!="lo"}[1m])
+           rate(node_network_receive_packets_total{device!="lo"}[1m]) +
+           rate(node_network_transmit_packets_total{device!="lo"}[1m])
+         ) >= 0.0050000000000000001 and (
+           rate(node_network_receive_drop_total{device!="lo"}[1m]) +
+           rate(node_network_transmit_drop_total{device!="lo"}[1m])
          ) >= 10
 
        eval_time: 5m
        exp_samples:
          - labels: '{device="eth0", instance="node-exporter",
            job="node-exporter"}'
-           value: 1.2E+02
+           value: 8E-1
    alert_rule_test:
      - eval_time: 5m
        alertname: CephNodeNetworkPacketDrops
@@ -414,35 +420,41 @@ tests:
            type: ceph_default
          exp_annotations:
            summary: One or more NICs reports packet drops
-           description: "Node node-exporter experiences packet drop > 0.01% or > 10 packets/s on interface eth0."
+           description: "Node node-exporter experiences packet drop > 0.5% or > 10 packets/s on interface eth0."
 
  # network packets errors
- - interval: 1s
+ - interval: 1m
    input_series:
     - series: 'node_network_receive_errs_total{device="eth0",
       instance="node-exporter",job="node-exporter"}'
-      values: '1+1x500'
+      values: '0+600x10'
     - series: 'node_network_transmit_errs_total{device="eth0",
       instance="node-exporter",job="node-exporter"}'
-      values: '1+1x500'
+      values: '0+600x10'
+    - series: 'node_network_transmit_packets_total{device="eth0",
+      instance="node-exporter",job="node-exporter"}'
+      values: '0+750x10'
+    - series: 'node_network_receive_packets_total{device="eth0",
+      instance="node-exporter",job="node-exporter"}'
+      values: '0+750x10'
    promql_expr_test:
      - expr: |
          (
-           increase(node_network_receive_errs_total{device!="lo"}[1m]) +
-           increase(node_network_transmit_errs_total{device!="lo"}[1m])
+           rate(node_network_receive_errs_total{device!="lo"}[1m]) +
+           rate(node_network_transmit_errs_total{device!="lo"}[1m])
          ) / (
-           increase(node_network_receive_packets_total{device!="lo"}[1m]) +
-           increase(node_network_transmit_packets_total{device!="lo"}[1m])
+           rate(node_network_receive_packets_total{device!="lo"}[1m]) +
+           rate(node_network_transmit_packets_total{device!="lo"}[1m])
          ) >= 0.0001 or (
-           increase(node_network_receive_errs_total{device!="lo"}[1m]) +
-           increase(node_network_transmit_errs_total{device!="lo"}[1m])
+           rate(node_network_receive_errs_total{device!="lo"}[1m]) +
+           rate(node_network_transmit_errs_total{device!="lo"}[1m])
          ) >= 10
 
        eval_time: 5m
        exp_samples:
          - labels: '{device="eth0", instance="node-exporter",
            job="node-exporter"}'
-           value: 1.2E+02
+           value: 8E-01
    alert_rule_test:
      - eval_time: 5m
        alertname: CephNodeNetworkPacketErrors