clusterLabel: 'cluster',
showMultiCluster: false,
+
+ CephNodeNetworkPacketDropsThreshold: 0.005,
+ CephNodeNetworkPacketDropsPerSec: 10,
},
}
},
{
alert: 'CephNodeNetworkPacketDrops',
- expr: '( increase(node_network_receive_drop_total{device!="lo"}[1m]) + increase(node_network_transmit_drop_total{device!="lo"}[1m])) / ( increase(node_network_receive_packets_total{device!="lo"}[1m]) + increase(node_network_transmit_packets_total{device!="lo"}[1m])) >= 0.0001 or ( increase(node_network_receive_drop_total{device!="lo"}[1m]) + increase(node_network_transmit_drop_total{device!="lo"}[1m])) >= 10',
+ expr: |||
+ (
+ rate(node_network_receive_drop_total{device!="lo"}[1m]) +
+ rate(node_network_transmit_drop_total{device!="lo"}[1m])
+ ) / (
+ rate(node_network_receive_packets_total{device!="lo"}[1m]) +
+ rate(node_network_transmit_packets_total{device!="lo"}[1m])
+ ) >= %(CephNodeNetworkPacketDropsThreshold)s and (
+ rate(node_network_receive_drop_total{device!="lo"}[1m]) +
+ rate(node_network_transmit_drop_total{device!="lo"}[1m])
+ ) >= %(CephNodeNetworkPacketDropsPerSec)s
+ ||| % $._config,
labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.2' },
annotations: {
summary: 'One or more NICs reports packet drops%(cluster)s' % $.MultiClusterSummary(),
- description: 'Node {{ $labels.instance }} experiences packet drop > 0.01% or > 10 packets/s on interface {{ $labels.device }}.',
+ description: 'Node {{ $labels.instance }} experiences packet drop > %(CephNodeNetworkPacketDropsThreshold)s%% and > %(CephNodeNetworkPacketDropsPerSec)s packets/s on interface {{ $labels.device }}.' % { CephNodeNetworkPacketDropsThreshold: $._config.CephNodeNetworkPacketDropsThreshold * 100, CephNodeNetworkPacketDropsPerSec: $._config.CephNodeNetworkPacketDropsPerSec },
},
},
{
alert: 'CephNodeNetworkPacketErrors',
- expr: '( increase(node_network_receive_errs_total{device!="lo"}[1m]) + increase(node_network_transmit_errs_total{device!="lo"}[1m])) / ( increase(node_network_receive_packets_total{device!="lo"}[1m]) + increase(node_network_transmit_packets_total{device!="lo"}[1m])) >= 0.0001 or ( increase(node_network_receive_errs_total{device!="lo"}[1m]) + increase(node_network_transmit_errs_total{device!="lo"}[1m])) >= 10',
+ expr: |||
+ (
+ rate(node_network_receive_errs_total{device!="lo"}[1m]) +
+ rate(node_network_transmit_errs_total{device!="lo"}[1m])
+ ) / (
+ rate(node_network_receive_packets_total{device!="lo"}[1m]) +
+ rate(node_network_transmit_packets_total{device!="lo"}[1m])
+ ) >= 0.0001 or (
+ rate(node_network_receive_errs_total{device!="lo"}[1m]) +
+ rate(node_network_transmit_errs_total{device!="lo"}[1m])
+ ) >= 10
+ |||,
labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.3' },
annotations: {
summary: 'One or more NICs reports packet errors%(cluster)s' % $.MultiClusterSummary(),
type: "ceph_default"
- alert: "CephNodeNetworkPacketDrops"
annotations:
- description: "Node {{ $labels.instance }} experiences packet drop > 0.01% or > 10 packets/s on interface {{ $labels.device }}."
+ description: "Node {{ $labels.instance }} experiences packet drop > 0.5% and > 10 packets/s on interface {{ $labels.device }}."
summary: "One or more NICs reports packet drops"
- expr: "( increase(node_network_receive_drop_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_drop_total{device!=\"lo\"}[1m])) / ( increase(node_network_receive_packets_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_packets_total{device!=\"lo\"}[1m])) >= 0.0001 or ( increase(node_network_receive_drop_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_drop_total{device!=\"lo\"}[1m])) >= 10"
+ expr: |
+ (
+ rate(node_network_receive_drop_total{device!="lo"}[1m]) +
+ rate(node_network_transmit_drop_total{device!="lo"}[1m])
+ ) / (
+ rate(node_network_receive_packets_total{device!="lo"}[1m]) +
+ rate(node_network_transmit_packets_total{device!="lo"}[1m])
+ ) >= 0.0050000000000000001 and (
+ rate(node_network_receive_drop_total{device!="lo"}[1m]) +
+ rate(node_network_transmit_drop_total{device!="lo"}[1m])
+ ) >= 10
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.8.2"
severity: "warning"
annotations:
description: "Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}."
summary: "One or more NICs reports packet errors"
- expr: "( increase(node_network_receive_errs_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_errs_total{device!=\"lo\"}[1m])) / ( increase(node_network_receive_packets_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_packets_total{device!=\"lo\"}[1m])) >= 0.0001 or ( increase(node_network_receive_errs_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_errs_total{device!=\"lo\"}[1m])) >= 10"
+ expr: |
+ (
+ rate(node_network_receive_errs_total{device!="lo"}[1m]) +
+ rate(node_network_transmit_errs_total{device!="lo"}[1m])
+ ) / (
+ rate(node_network_receive_packets_total{device!="lo"}[1m]) +
+ rate(node_network_transmit_packets_total{device!="lo"}[1m])
+ ) >= 0.0001 or (
+ rate(node_network_receive_errs_total{device!="lo"}[1m]) +
+ rate(node_network_transmit_errs_total{device!="lo"}[1m])
+ ) >= 10
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.8.3"
severity: "warning"
description: "Root volume is dangerously full: 4.811% free."
# network packets dropped
- - interval: 1s
+ - interval: 1m
input_series:
- series: 'node_network_receive_drop_total{device="eth0",
instance="node-exporter",job="node-exporter"}'
- values: '1+1x500'
+ values: '0+600x10'
- series: 'node_network_transmit_drop_total{device="eth0",
instance="node-exporter",job="node-exporter"}'
- values: '1+1x500'
+ values: '0+600x10'
+ - series: 'node_network_receive_packets_total{device="eth0",
+ instance="node-exporter",job="node-exporter"}'
+ values: '0+750x10'
+ - series: 'node_network_transmit_packets_total{device="eth0",
+ instance="node-exporter",job="node-exporter"}'
+ values: '0+750x10'
promql_expr_test:
- expr: |
(
- increase(node_network_receive_drop_total{device!="lo"}[1m]) +
- increase(node_network_transmit_drop_total{device!="lo"}[1m])
+ rate(node_network_receive_drop_total{device!="lo"}[1m]) +
+ rate(node_network_transmit_drop_total{device!="lo"}[1m])
) / (
- increase(node_network_receive_packets_total{device!="lo"}[1m]) +
- increase(node_network_transmit_packets_total{device!="lo"}[1m])
- ) >= 0.0001 or (
- increase(node_network_receive_drop_total{device!="lo"}[1m]) +
- increase(node_network_transmit_drop_total{device!="lo"}[1m])
+ rate(node_network_receive_packets_total{device!="lo"}[1m]) +
+ rate(node_network_transmit_packets_total{device!="lo"}[1m])
+ ) >= 0.0050000000000000001 and (
+ rate(node_network_receive_drop_total{device!="lo"}[1m]) +
+ rate(node_network_transmit_drop_total{device!="lo"}[1m])
) >= 10
eval_time: 5m
exp_samples:
- labels: '{device="eth0", instance="node-exporter",
job="node-exporter"}'
- value: 1.2E+02
+ value: 8E-1
alert_rule_test:
- eval_time: 5m
alertname: CephNodeNetworkPacketDrops
type: ceph_default
exp_annotations:
summary: One or more NICs reports packet drops
- description: "Node node-exporter experiences packet drop > 0.01% or > 10 packets/s on interface eth0."
+ description: "Node node-exporter experiences packet drop > 0.5% and > 10 packets/s on interface eth0."
# network packets errors
- - interval: 1s
+ - interval: 1m
input_series:
- series: 'node_network_receive_errs_total{device="eth0",
instance="node-exporter",job="node-exporter"}'
- values: '1+1x500'
+ values: '0+600x10'
- series: 'node_network_transmit_errs_total{device="eth0",
instance="node-exporter",job="node-exporter"}'
- values: '1+1x500'
+ values: '0+600x10'
+ - series: 'node_network_transmit_packets_total{device="eth0",
+ instance="node-exporter",job="node-exporter"}'
+ values: '0+750x10'
+ - series: 'node_network_receive_packets_total{device="eth0",
+ instance="node-exporter",job="node-exporter"}'
+ values: '0+750x10'
promql_expr_test:
- expr: |
(
- increase(node_network_receive_errs_total{device!="lo"}[1m]) +
- increase(node_network_transmit_errs_total{device!="lo"}[1m])
+ rate(node_network_receive_errs_total{device!="lo"}[1m]) +
+ rate(node_network_transmit_errs_total{device!="lo"}[1m])
) / (
- increase(node_network_receive_packets_total{device!="lo"}[1m]) +
- increase(node_network_transmit_packets_total{device!="lo"}[1m])
+ rate(node_network_receive_packets_total{device!="lo"}[1m]) +
+ rate(node_network_transmit_packets_total{device!="lo"}[1m])
) >= 0.0001 or (
- increase(node_network_receive_errs_total{device!="lo"}[1m]) +
- increase(node_network_transmit_errs_total{device!="lo"}[1m])
+ rate(node_network_receive_errs_total{device!="lo"}[1m]) +
+ rate(node_network_transmit_errs_total{device!="lo"}[1m])
) >= 10
eval_time: 5m
exp_samples:
- labels: '{device="eth0", instance="node-exporter",
job="node-exporter"}'
- value: 1.2E+02
+ value: 8E-01
alert_rule_test:
- eval_time: 5m
alertname: CephNodeNetworkPacketErrors