ceph-mixin: add RBD Mirror monitoring alerts 56552/head
author    Arun Kumar Mohan <amohan@redhat.com>
          Wed, 15 Feb 2023 18:13:08 +0000 (23:43 +0530)
committer Arun Kumar Mohan <amohan@redhat.com>
          Thu, 28 Mar 2024 09:22:20 +0000 (14:52 +0530)
Signed-off-by: Arun Kumar Mohan <amohan@redhat.com>
(cherry picked from commit 5c21134064dc0a452ecfa0b3273b709b2fa3d150)

monitoring/ceph-mixin/config.libsonnet
monitoring/ceph-mixin/prometheus_alerts.libsonnet
monitoring/ceph-mixin/prometheus_alerts.yml
monitoring/ceph-mixin/tests_alerts/test_alerts.yml
monitoring/snmp/CEPH-MIB.txt

monitoring/ceph-mixin/config.libsonnet
index 7ee1210b043310cf23652ebc95191e43ac4b9252..e14bce4a424ec071fe6b18ba53f0676b0b2fdb51 100644 (file)
@@ -7,5 +7,7 @@
 
     CephNodeNetworkPacketDropsThreshold: 0.005,
     CephNodeNetworkPacketDropsPerSec: 10,
+    CephRBDMirrorImageTransferBandwidthThreshold: 0.8,
+    CephRBDMirrorImagesPerDaemonThreshold: 100,
   },
 }
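
Both thresholds follow the existing config.libsonnet pattern: the alert
expressions below substitute them through Jsonnet's %-formatting operator.
A minimal standalone sketch (hypothetical file, not part of this commit):

    // sketch.jsonnet -- illustration only, not part of the commit
    local _config = { CephRBDMirrorImagesPerDaemonThreshold: 100 };
    {
      // '%(key)s' is filled in from _config, rendering "... > 100"
      expr: 'sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > %(CephRBDMirrorImagesPerDaemonThreshold)s' % _config,
    }

Running "jsonnet sketch.jsonnet" prints the object with the fully rendered
PromQL string, which is what lands in prometheus_alerts.yml below.
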
monitoring/ceph-mixin/prometheus_alerts.libsonnet
index b7558a70fa87e11f51ef5b556438861cb8e67768..82513e5050602b6a3f82a9f15f866e276ed42077 100644 (file)
         },
       ],
     },
+    {
+      name: 'rbdmirror',
+      rules: [
+        {
+          alert: 'CephRBDMirrorImagesPerDaemonHigh',
+          'for': '1m',
+          expr: 'sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > %(CephRBDMirrorImagesPerDaemonThreshold)s' % $._config,
+          labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.2' },
+          annotations: {
+            summary: 'Number of image replications is now above %(CephRBDMirrorImagesPerDaemonThreshold)s.' % $._config,
+            description: 'Number of image replications per daemon is not supposed to go beyond the threshold of %(CephRBDMirrorImagesPerDaemonThreshold)s.' % $._config,
+          },
+        },
+        {
+          alert: 'CephRBDMirrorImagesNotInSync',
+          'for': '1m',
+          expr: 'sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0',
+          labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.3' },
+          annotations: {
+            summary: 'Some of the RBD mirror images are not in sync with the remote counterparts.',
+            description: 'Both local and remote RBD mirror images should be in sync.',
+          },
+        },
+        {
+          alert: 'CephRBDMirrorImagesNotInSyncVeryHigh',
+          'for': '1m',
+          expr: 'count by (ceph_daemon) ((topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) > (sum by (ceph_daemon) (ceph_rbd_mirror_snapshot_snapshots)*.1)',
+          labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.4' },
+          annotations: {
+            summary: 'Number of unsynchronized images is very high.',
+            description: 'More than 10% of the images have synchronization problems.',
+          },
+        },
+        {
+          alert: 'CephRBDMirrorImageTransferBandwidthHigh',
+          'for': '1m',
+          expr: 'rate(ceph_rbd_mirror_journal_replay_bytes[30m]) > %.2f' % [$._config.CephRBDMirrorImageTransferBandwidthThreshold],
+          labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.5' },
+          annotations: {
+            summary: 'The replication network usage has increased above %d%s in the last 30 minutes. Review the number of images being replicated. This alert will be cleared automatically after 30 minutes.' % [$._config.CephRBDMirrorImageTransferBandwidthThreshold * 100, '%'],
+            description: 'Detected a heavy increase in bandwidth for RBD replications (over %d%s) in the last 30 minutes. This might not be a problem, but it is good to review the number of images being replicated simultaneously.' % [$._config.CephRBDMirrorImageTransferBandwidthThreshold * 100, '%'],
+          },
+        },
+      ],
+    },
   ],
 }
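
A note on the two "NotInSync" expressions above: wrapping each timestamp
metric in topk by (...) (1, ...) keeps exactly one sample per (ceph_daemon,
image, namespace, pool) group, so the local/remote subtraction pairs series
one-to-one even if a metric shows up with extra labels or from duplicate
scrapes. An annotated reading of the core comparison (a sketch, not new code):

      topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp)
    - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)
    != 0  # any non-zero delta: the local snapshot timestamp diverges from its remote peer

CephRBDMirrorImagesNotInSyncVeryHigh then counts these non-zero deltas per
daemon and fires once they exceed 10% of that daemon's snapshot total, i.e.
sum by (ceph_daemon) (ceph_rbd_mirror_snapshot_snapshots) * .1.
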
monitoring/ceph-mixin/prometheus_alerts.yml
index 49c38ebd3555f9d2f2fb40507e6d7a59faf2fff6..9bccefb9fc49efdb075938fbd55b5bba698486c5 100644 (file)
@@ -714,3 +714,45 @@ groups:
           oid: "1.3.6.1.4.1.50495.1.2.1.1.2"
           severity: "critical"
           type: "ceph_default"
+  - name: "rbdmirror"
+    rules:
+      - alert: "CephRBDMirrorImagesPerDaemonHigh"
+        annotations:
+          description: "Number of image replications per daemon is not suppossed to go beyond threshold 100"
+          summary: "Number of image replications are now above 100"
+        expr: "sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100"
+        for: "1m"
+        labels:
+          oid: "1.3.6.1.4.1.50495.1.2.1.10.2"
+          severity: "critical"
+          type: "ceph_default"
+      - alert: "CephRBDMirrorImagesNotInSync"
+        annotations:
+          description: "Both local and remote RBD mirror images should be in sync."
+          summary: "Some of the RBD mirror images are not in sync with the remote counter parts."
+        expr: "sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0"
+        for: "1m"
+        labels:
+          oid: "1.3.6.1.4.1.50495.1.2.1.10.3"
+          severity: "critical"
+          type: "ceph_default"
+      - alert: "CephRBDMirrorImagesNotInSyncVeryHigh"
+        annotations:
+          description: "More than 10% of the images have synchronization problems"
+          summary: "Number of unsynchronized images are very high."
+        expr: "count by (ceph_daemon) ((topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) > (sum by (ceph_daemon) (ceph_rbd_mirror_snapshot_snapshots)*.1)"
+        for: "1m"
+        labels:
+          oid: "1.3.6.1.4.1.50495.1.2.1.10.4"
+          severity: "critical"
+          type: "ceph_default"
+      - alert: "CephRBDMirrorImageTransferBandwidthHigh"
+        annotations:
+          description: "Detected a heavy increase in bandwidth for rbd replications (over 80%) in the last 30 min. This might not be a problem, but it is good to review the number of images being replicated simultaneously"
+          summary: "The replication network usage has been increased over 80% in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes"
+        expr: "rate(ceph_rbd_mirror_journal_replay_bytes[30m]) > 0.80"
+        for: "1m"
+        labels:
+          oid: "1.3.6.1.4.1.50495.1.2.1.10.5"
+          severity: "warning"
+          type: "ceph_default"
monitoring/ceph-mixin/tests_alerts/test_alerts.yml
index 4768af7de408279ed444d9758ad0691798ccdeeb..903a1480cea9d5d3183d7d01b201f74541ea5521 100644 (file)
@@ -2030,3 +2030,192 @@ tests:
         exp_annotations:
           summary: Fan error(s) detected
           description: "Fan error(s) detected. Check `ceph health detail`."
+
+  # RBD Mirror alert tests
+  # alert: CephRBDMirrorImagesPerDaemonHigh
+ - interval: 1m
+   input_series:
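+     # values use promtool's expanding notation: 'a+bxN' means start at a and
+     # add b for each of the next N samples (so '1+1x130' is 1,2,...,131)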
+     - series: 'ceph_rbd_mirror_snapshot_image_snapshots{ceph_daemon="client.admin.40628", image="image1", namespace="default", pool="data"}'
+       values: '0+0x20 1+1x130'
+     - series: 'ceph_rbd_mirror_snapshot_image_snapshots{ceph_daemon="client.admin.40628", image="image2", namespace="default", pool="data"}'
+       values: '1+1x130 131+0x20'
+   # prometheus query test
+   promql_expr_test:
+     # negative test where there are no samples
+     - expr: sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100
+       eval_time: 50m
+       exp_samples:
+     # positive test
+     - expr: sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100
+       eval_time: 70m
+       exp_samples:
+         - labels: '{ceph_daemon="client.admin.40628", namespace="default"}'
+           value: 121
+   # prometheus alert test
+   alert_rule_test:
+     # negative test
+     - eval_time: 30m
+       alertname: CephRBDMirrorImagesPerDaemonHigh
+       exp_alerts:
+     # positive test where alert is fired
+     - eval_time: 70m
+       alertname: CephRBDMirrorImagesPerDaemonHigh
+       exp_alerts:
+       - exp_labels:
+           oid: "1.3.6.1.4.1.50495.1.2.1.10.2"
+           severity: "critical"
+           type: "ceph_default"
+           ceph_daemon: "client.admin.40628"
+           namespace: "default"
+         exp_annotations:
+           description: "Number of image replications per daemon is not suppossed to go beyond threshold 100"
+           summary: "Number of image replications are now above 100"
+
+ # alert: CephRBDMirrorImagesNotInSync
+ - interval: 1m
+   input_series:
+     - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}'
+       values: '1.678+0x20 2.03+0x20 3.21+0x20'
+     - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}'
+       values: '1.678+0x20 2.03+0x20 2.03+0x20'
+   # prometheus query test
+   promql_expr_test:
+     # negative test where there are no samples
+     - expr: sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0
+       eval_time: 30m
+       exp_samples:
+     # positive test
+     - expr: sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0
+       eval_time: 45m
+       exp_samples:
+         - labels: '{ceph_daemon="client.admin.40628", image="image1", namespace="default", pool="data"}'
+           value: 1.1800000000000002
+   # prometheus alert test
+   alert_rule_test:
+     # negative test
+     - eval_time: 20m
+       alertname: CephRBDMirrorImagesNotInSync
+       exp_alerts:
+     # positive test where alert is fired
+     - eval_time: 50m
+       alertname: CephRBDMirrorImagesNotInSync
+       exp_alerts:
+         - exp_labels:
+             image: "image1"
+             pool: "data"
+             oid: "1.3.6.1.4.1.50495.1.2.1.10.3"
+             severity: "critical"
+             type: "ceph_default"
+             ceph_daemon: "client.admin.40628"
+             namespace: "default"
+           exp_annotations:
+             description: "Both local and remote RBD mirror images should be in sync."
+             summary: "Some of the RBD mirror images are not in sync with the remote counter parts."
+
+ # alert: CephRBDMirrorImagesNotInSyncVeryHigh
+ - interval: 1m
+   input_series:
+     - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}'
+       values: '1.678+0x20 2.03+0x20 3.21+0x20'
+     - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}'
+       values: '1.678+0x20 2.03+0x20 2.03+0x20'
+     - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image2",namespace="default",pool="data"}'
+       values: '2.189+0x20 3.301+0x14 3.301+0x26'
+     - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image2",namespace="default",pool="data"}'
+       values: '2.189+0x20 3.301+0x14 7.13+0x26'
+     - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image3",namespace="default",pool="data"}'
+       values: '2.189+0x20 3.301+0x14 3.301+0x26'
+     - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image3",namespace="default",pool="data"}'
+       values: '2.189+0x20 3.301+0x14 7.13+0x26'
+     - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image4",namespace="default",pool="data"}'
+       values: '2.189+0x65'
+     - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image4",namespace="default",pool="data"}'
+       values: '2.189+0x65'
+     - series: 'ceph_rbd_mirror_snapshot_snapshots{ceph_daemon="client.admin.40628"}'
+       values: '1+0x20 2+0x45'
+   # prometheus query test
+   promql_expr_test:
+     # test each query individually
+     # query 1
+     - expr: count by (ceph_daemon) ((topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0)
+       eval_time: 45m
+       exp_samples:
+         - labels: '{ceph_daemon="client.admin.40628"}'
+           value: 3
+     # query 2
+     - expr: sum by (ceph_daemon) (ceph_rbd_mirror_snapshot_snapshots) * .1
+       eval_time: 45m
+       exp_samples:
+         - labels: '{ceph_daemon="client.admin.40628"}'
+           value: 0.2
+   # prometheus alert test
+   alert_rule_test:
+     # negative test
+     - eval_time: 2m
+       alertname: CephRBDMirrorImagesNotInSyncVeryHigh
+       exp_alerts:
+     # positive test where alert is fired
+     - eval_time: 50m
+       alertname: CephRBDMirrorImagesNotInSyncVeryHigh
+       exp_alerts:
+         - exp_labels:
+             ceph_daemon: "client.admin.40628"
+             oid: "1.3.6.1.4.1.50495.1.2.1.10.4"
+             severity: "critical"
+             type: "ceph_default"
+           exp_annotations:
+             description: "More than 10% of the images have synchronization problems"
+             summary: "Number of unsynchronized images are very high."
+
+ # alert: "CephRBDMirrorImageTransferBandwidthHigh"
+ - interval: 1m
+   input_series:
+     - series: 'ceph_rbd_mirror_journal_replay_bytes{ceph_daemon="client.admin.40628"}'
+       values: '0+0x10 1+0x5 10+30x25 736+200x30'
+   # prometheus query test
+   promql_expr_test:
+     # test each couple of rates
+     # rate 1
+     - expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m])
+       eval_time: 5m
+       exp_samples:
+         - labels: '{ceph_daemon="client.admin.40628"}'
+           value: 0.0
+     # rate 2
+     - expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m])
+       eval_time: 20m
+       exp_samples:
+         - labels: '{ceph_daemon="client.admin.40628"}'
+           value: 0.33
+     # rate 3
+     - expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m])
+       eval_time: 40m
+       exp_samples:
+         - labels: '{ceph_daemon="client.admin.40628"}'
+           value: 0.5
+     # rate 4
+     - expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m])
+       eval_time: 50m
+       exp_samples:
+         - labels: '{ceph_daemon="client.admin.40628"}'
+           value: 3.3333333333333335
+   # prometheus alert test
+   alert_rule_test:
+     # negative test
+     - eval_time: 2m
+       alertname: CephRBDMirrorImageTransferBandwidthHigh
+       exp_alerts:
+     # positive test where alert is fired
+     - eval_time: 50m
+       alertname: CephRBDMirrorImageTransferBandwidthHigh
+       exp_alerts:
+       - exp_labels:
+           ceph_daemon: "client.admin.40628"
+           oid: "1.3.6.1.4.1.50495.1.2.1.10.5"
+           severity: "warning"
+           type: "ceph_default"
+         exp_annotations:
+           description: "Detected a heavy increase in bandwidth for rbd replications (over 80%) in the last 30 min. This might not be a problem, but it is good to review the number of images being replicated simultaneously"
+           summary: "The replication network usage has been increased over 80% in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes"
+
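These unit tests use Prometheus' rule-testing format and can be run with
promtool. A minimal sketch, assuming the working directory is
monitoring/ceph-mixin (the repo may wrap this in its own test script):

    promtool test rules tests_alerts/test_alerts.yml

An empty exp_samples or exp_alerts entry asserts that the query returns no
samples, or that the alert stays inactive, at that eval_time; that is how the
negative cases above are written.
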
monitoring/snmp/CEPH-MIB.txt
index f54cb3610377a68f29f5310d704559dff80c851b..5f0e5b2cbcda50d3e338aa0a8e6b52fec4be18c6 100644 (file)
@@ -245,6 +245,26 @@ promRadosUnfound NOTIFICATION-TYPE
     DESCRIPTION "A RADOS object can not be found, even though all OSDs are online."
 ::= { promRados 1 }
 
+promRadosRBDMirrorImagesVeryHigh NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Number of RBD image replications are very high."
+::= { promRados 2 }
+
+promRadosRBDMirrorUnsyncImages NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Local RBD images are not in sync with the remote counter parts"
+::= { promRados 3 }
+
+promRadosRBDMirrorUnsyncImagesHigh NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "There is a high percentage of un-sync RBD images."
+::= { promRados 4 }
+
+promRadosRBDMirrorHighBandwidth NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "A high bandwidth usage is detected during RBD image transfers."
+::= { promRados 5 }
+
 promCephadmDaemonDown NOTIFICATION-TYPE
     STATUS      current
     DESCRIPTION "Cephadm has determined that a daemon is down."
@@ -310,6 +330,10 @@ cephNotificationGroup NOTIFICATION-GROUP
         promPoolFull,
         promPoolFilling,
         promRadosUnfound,
+        promRadosRBDMirrorImagesVeryHigh,
+        promRadosRBDMirrorUnsyncImages,
+        promRadosRBDMirrorUnsyncImagesHigh,
+        promRadosRBDMirrorHighBandwidth,
         promCephadmDaemonDown,
         promCephadmUpgradeFailure,
         promPrometheusJobMissing
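
The trailing arcs of the new notifications line up with the oid labels on the
alerts above, which suggests promRados sits at 1.3.6.1.4.1.50495.1.2.1.10 in
the Ceph enterprise subtree (an inference from the matching suffixes, not
stated in this diff):

    alert oid label                       MIB notification
    1.3.6.1.4.1.50495.1.2.1.10.2   <->    promRadosRBDMirrorImagesVeryHigh    ::= { promRados 2 }
    1.3.6.1.4.1.50495.1.2.1.10.3   <->    promRadosRBDMirrorUnsyncImages      ::= { promRados 3 }
    1.3.6.1.4.1.50495.1.2.1.10.4   <->    promRadosRBDMirrorUnsyncImagesHigh  ::= { promRados 4 }
    1.3.6.1.4.1.50495.1.2.1.10.5   <->    promRadosRBDMirrorHighBandwidth     ::= { promRados 5 }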