From 0d5292e1d488ec128fd8d5576636edc50da4ff6f Mon Sep 17 00:00:00 2001 From: Arun Kumar Mohan Date: Wed, 15 Feb 2023 23:43:08 +0530 Subject: [PATCH] ceph-mixin: add RBD Mirror monitoring alerts Signed-off-by: Arun Kumar Mohan (cherry picked from commit 5c21134064dc0a452ecfa0b3273b709b2fa3d150) --- monitoring/ceph-mixin/config.libsonnet | 2 + .../ceph-mixin/prometheus_alerts.libsonnet | 45 +++++ monitoring/ceph-mixin/prometheus_alerts.yml | 42 ++++ .../ceph-mixin/tests_alerts/test_alerts.yml | 189 ++++++++++++++++++ monitoring/snmp/CEPH-MIB.txt | 24 +++ 5 files changed, 302 insertions(+) diff --git a/monitoring/ceph-mixin/config.libsonnet b/monitoring/ceph-mixin/config.libsonnet index 7ee1210b043..e14bce4a424 100644 --- a/monitoring/ceph-mixin/config.libsonnet +++ b/monitoring/ceph-mixin/config.libsonnet @@ -7,5 +7,7 @@ CephNodeNetworkPacketDropsThreshold: 0.005, CephNodeNetworkPacketDropsPerSec: 10, + CephRBDMirrorImageTransferBandwidthThreshold: 0.8, + CephRBDMirrorImagesPerDaemonThreshold: 100, }, } diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index b7558a70fa8..82513e50506 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -801,5 +801,50 @@ }, ], }, + { + name: 'rbdmirror', + rules: [ + { + alert: 'CephRBDMirrorImagesPerDaemonHigh', + 'for': '1m', + expr: 'sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > %(CephRBDMirrorImagesPerDaemonThreshold)s' % $._config, + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.2' }, + annotations: { + summary: 'Number of image replications is now above %(CephRBDMirrorImagesPerDaemonThreshold)s' % $._config, + description: 'Number of image replications per daemon is not supposed to go beyond threshold %(CephRBDMirrorImagesPerDaemonThreshold)s' % $._config, + }, + }, + { + alert: 'CephRBDMirrorImagesNotInSync', + 'for': '1m', 
+ expr: 'sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.3' }, + annotations: { + summary: 'Some of the RBD mirror images are not in sync with the remote counterparts.', + description: 'Both local and remote RBD mirror images should be in sync.', + }, + }, + { + alert: 'CephRBDMirrorImagesNotInSyncVeryHigh', + 'for': '1m', + expr: 'count by (ceph_daemon) ((topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) > (sum by (ceph_daemon) (ceph_rbd_mirror_snapshot_snapshots)*.1)', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.4' }, + annotations: { + summary: 'Number of unsynchronized images are very high.', + description: 'More than 10% of the images have synchronization problems', + }, + }, + { + alert: 'CephRBDMirrorImageTransferBandwidthHigh', + 'for': '1m', + expr: 'rate(ceph_rbd_mirror_journal_replay_bytes[30m]) > %.2f' % [$._config.CephRBDMirrorImageTransferBandwidthThreshold], + labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.5' }, + annotations: { + summary: 'The replication network usage has been increased over %d%s in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes' % [$._config.CephRBDMirrorImageTransferBandwidthThreshold * 100, '%'], + description: 'Detected a heavy increase in bandwidth for rbd replications (over %d%s) in the last 30 min. 
This might not be a problem, but it is good to review the number of images being replicated simultaneously' % [$._config.CephRBDMirrorImageTransferBandwidthThreshold * 100, '%'], + }, + }, + ], + }, ], } diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index 49c38ebd355..9bccefb9fc4 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -714,3 +714,45 @@ groups: oid: "1.3.6.1.4.1.50495.1.2.1.1.2" severity: "critical" type: "ceph_default" + - name: "rbdmirror" + rules: + - alert: "CephRBDMirrorImagesPerDaemonHigh" + annotations: + description: "Number of image replications per daemon is not supposed to go beyond threshold 100" + summary: "Number of image replications is now above 100" + expr: "sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.10.2" + severity: "critical" + type: "ceph_default" + - alert: "CephRBDMirrorImagesNotInSync" + annotations: + description: "Both local and remote RBD mirror images should be in sync." + summary: "Some of the RBD mirror images are not in sync with the remote counterparts." + expr: "sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.10.3" + severity: "critical" + type: "ceph_default" + - alert: "CephRBDMirrorImagesNotInSyncVeryHigh" + annotations: + description: "More than 10% of the images have synchronization problems" + summary: "Number of unsynchronized images are very high." 
+ expr: "count by (ceph_daemon) ((topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) > (sum by (ceph_daemon) (ceph_rbd_mirror_snapshot_snapshots)*.1)" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.10.4" + severity: "critical" + type: "ceph_default" + - alert: "CephRBDMirrorImageTransferBandwidthHigh" + annotations: + description: "Detected a heavy increase in bandwidth for rbd replications (over 80%) in the last 30 min. This might not be a problem, but it is good to review the number of images being replicated simultaneously" + summary: "The replication network usage has been increased over 80% in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes" + expr: "rate(ceph_rbd_mirror_journal_replay_bytes[30m]) > 0.80" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.10.5" + severity: "warning" + type: "ceph_default" diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index 4768af7de40..903a1480cea 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -2030,3 +2030,192 @@ tests: exp_annotations: summary: Fan error(s) detected description: "Fan error(s) detected. Check `ceph health detail`." 
+ + # new rbdmirror alerts tests + # RBD Mirror Alerts + # alert: CephRBDMirrorImagesPerDaemonHigh + - interval: 1m + input_series: + - series: 'ceph_rbd_mirror_snapshot_image_snapshots{ceph_daemon="client.admin.40628", image="image1", namespace="default", pool="data"}' + values: '0+0x20 1+1x130' + - series: 'ceph_rbd_mirror_snapshot_image_snapshots{ceph_daemon="client.admin.40628", image="image2", namespace="default", pool="data"}' + values: '1+1x130 131+0x20' + # prometheus query test + promql_expr_test: + # negative test where there are no samples + - expr: sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100 + eval_time: 50m + exp_samples: + # second positive test + - expr: sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100 + eval_time: 70m + exp_samples: + - labels: '{ceph_daemon="client.admin.40628", namespace="default"}' + value: 121 + # prometheus alert test + alert_rule_test: + # negative test + - eval_time: 30m + alertname: CephRBDMirrorImagesPerDaemonHigh + exp_alerts: + # positive test where alert is fired + - eval_time: 70m + alertname: CephRBDMirrorImagesPerDaemonHigh + exp_alerts: + - exp_labels: + oid: "1.3.6.1.4.1.50495.1.2.1.10.2" + severity: "critical" + type: "ceph_default" + ceph_daemon: "client.admin.40628" + namespace: "default" + exp_annotations: + description: "Number of image replications per daemon is not supposed to go beyond threshold 100" + summary: "Number of image replications is now above 100" + + # alert: CephRBDMirrorImagesNotInSync + - interval: 1m + input_series: + - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}' + values: '1.678+0x20 2.03+0x20 3.21+0x20' + - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}' + values: '1.678+0x20 2.03+0x20 2.03+0x20' + # prometheus query test + 
promql_expr_test: + # negative test where there are no samples + - expr: sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0 + eval_time: 30m + exp_samples: + # second positive test + - expr: sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0 + eval_time: 45m + exp_samples: + - labels: '{ceph_daemon="client.admin.40628", image="image1", namespace="default", pool="data"}' + value: 1.1800000000000002 + # prometheus alert test + alert_rule_test: + # negative test + - eval_time: 20m + alertname: CephRBDMirrorImagesNotInSync + exp_alerts: + # positive test where alert is fired + - eval_time: 50m + alertname: CephRBDMirrorImagesNotInSync + exp_alerts: + - exp_labels: + image: "image1" + pool: "data" + oid: "1.3.6.1.4.1.50495.1.2.1.10.3" + severity: "critical" + type: "ceph_default" + ceph_daemon: "client.admin.40628" + namespace: "default" + exp_annotations: + description: "Both local and remote RBD mirror images should be in sync." + summary: "Some of the RBD mirror images are not in sync with the remote counterparts." 
+ + # alert: CephRBDMirrorImagesNotInSyncVeryHigh + - interval: 1m + input_series: + - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}' + values: '1.678+0x20 2.03+0x20 3.21+0x20' + - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}' + values: '1.678+0x20 2.03+0x20 2.03+0x20' + - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image2",namespace="default",pool="data"}' + values: '2.189+0x20 3.301+0x14 3.301+0x26' + - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image2",namespace="default",pool="data"}' + values: '2.189+0x20 3.301+0x14 7.13+0x26' + - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image3",namespace="default",pool="data"}' + values: '2.189+0x20 3.301+0x14 3.301+0x26' + - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image3",namespace="default",pool="data"}' + values: '2.189+0x20 3.301+0x14 7.13+0x26' + - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image4",namespace="default",pool="data"}' + values: '2.189+0x65' + - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image4",namespace="default",pool="data"}' + values: '2.189+0x65' + - series: 'ceph_rbd_mirror_snapshot_snapshots{ceph_daemon="client.admin.40628"}' + values: '1+0x20 2+0x45' + # prometheus query test + promql_expr_test: + # test each query individually + # query 1 + - expr: count by (ceph_daemon) ((topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) + eval_time: 45m + 
exp_samples: + - labels: '{ceph_daemon="client.admin.40628"}' + value: 3 + # query 2 + - expr: sum by (ceph_daemon) (ceph_rbd_mirror_snapshot_snapshots) * .1 + eval_time: 45m + exp_samples: + - labels: '{ceph_daemon="client.admin.40628"}' + value: 0.2 + # prometheus alert test + alert_rule_test: + # negative test + - eval_time: 2m + alertname: CephRBDMirrorImagesNotInSyncVeryHigh + exp_alerts: + # positive test where alert is fired + - eval_time: 50m + alertname: CephRBDMirrorImagesNotInSyncVeryHigh + exp_alerts: + - exp_labels: + ceph_daemon: "client.admin.40628" + oid: "1.3.6.1.4.1.50495.1.2.1.10.4" + severity: "critical" + type: "ceph_default" + exp_annotations: + description: "More than 10% of the images have synchronization problems" + summary: "Number of unsynchronized images are very high." + + # alert: "CephRBDMirrorImageTransferBandwidthHigh" + - interval: 1m + input_series: + - series: 'ceph_rbd_mirror_journal_replay_bytes{ceph_daemon="client.admin.40628"}' + values: '0+0x10 1+0x5 10+30x25 736+200x30' + # prometheus query test + promql_expr_test: + # test each couple of rates + # rate 1 + - expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m]) + eval_time: 5m + exp_samples: + - labels: '{ceph_daemon="client.admin.40628"}' + value: 0.0 + # rate 2 + - expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m]) + eval_time: 20m + exp_samples: + - labels: '{ceph_daemon="client.admin.40628"}' + value: 0.33 + # rate 3 + - expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m]) + eval_time: 40m + exp_samples: + - labels: '{ceph_daemon="client.admin.40628"}' + value: 0.5 + # rate 4 + - expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m]) + eval_time: 50m + exp_samples: + - labels: '{ceph_daemon="client.admin.40628"}' + value: 3.3333333333333335 + # prometheus alert test + alert_rule_test: + # negative test + - eval_time: 2m + alertname: CephRBDMirrorImageTransferBandwidthHigh + exp_alerts: + # positive test where alert is fired + - eval_time: 50m + alertname: 
CephRBDMirrorImageTransferBandwidthHigh + exp_alerts: + - exp_labels: + ceph_daemon: "client.admin.40628" + oid: "1.3.6.1.4.1.50495.1.2.1.10.5" + severity: "warning" + type: "ceph_default" + exp_annotations: + description: "Detected a heavy increase in bandwidth for rbd replications (over 80%) in the last 30 min. This might not be a problem, but it is good to review the number of images being replicated simultaneously" + summary: "The replication network usage has been increased over 80% in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes" + diff --git a/monitoring/snmp/CEPH-MIB.txt b/monitoring/snmp/CEPH-MIB.txt index f54cb361037..5f0e5b2cbcd 100644 --- a/monitoring/snmp/CEPH-MIB.txt +++ b/monitoring/snmp/CEPH-MIB.txt @@ -245,6 +245,26 @@ promRadosUnfound NOTIFICATION-TYPE DESCRIPTION "A RADOS object can not be found, even though all OSDs are online." ::= { promRados 1 } +promRadosRBDMirrorImagesVeryHigh NOTIFICATION-TYPE + STATUS current + DESCRIPTION "Number of RBD image replications is very high." +::= { promRados 2 } + +promRadosRBDMirrorUnsyncImages NOTIFICATION-TYPE + STATUS current + DESCRIPTION "Local RBD images are not in sync with the remote counterparts" +::= { promRados 3 } + +promRadosRBDMirrorUnsyncImagesHigh NOTIFICATION-TYPE + STATUS current + DESCRIPTION "There is a high percentage of unsynchronized RBD images." +::= { promRados 4 } + +promRadosRBDMirrorHighBandwidth NOTIFICATION-TYPE + STATUS current + DESCRIPTION "A high bandwidth usage is detected during RBD image transfers." +::= { promRados 5 } + promCephadmDaemonDown NOTIFICATION-TYPE STATUS current DESCRIPTION "Cephadm has determined that a daemon is down." 
@@ -310,6 +330,10 @@ cephNotificationGroup NOTIFICATION-GROUP promPoolFull, promPoolFilling, promRadosUnfound, + promRadosRBDMirrorImagesVeryHigh, + promRadosRBDMirrorUnsyncImages, + promRadosRBDMirrorUnsyncImagesHigh, + promRadosRBDMirrorHighBandwidth, promCephadmDaemonDown, promCephadmUpgradeFailure, promPrometheusJobMissing -- 2.39.5