From f9213ad9cf438afc81a1bf0a53c00184dfaf7b1b Mon Sep 17 00:00:00 2001 From: Paul Cuzner Date: Thu, 23 Sep 2021 09:43:57 +1200 Subject: [PATCH] monitoring:Adding the Ceph MIB The ceph MIB has been created and maintained in a a separate repo: https://github.com/SUSE/prometheus-webhook-snmp This patch brings this MIB into the main ceph repo, so alert changes can target prometheus and potentially SNMP environments within the same PR. Kudos to Volker Theile for creating the MIB. Fixes: https://tracker.ceph.com/issues/52708 Signed-off-by: Paul Cuzner --- monitoring/prometheus/README.md | 7 + monitoring/snmp/CEPH-PROMETHEUS-ALERT-MIB.txt | 377 ++++++++++++++++++ monitoring/snmp/README.md | 24 ++ 3 files changed, 408 insertions(+) create mode 100644 monitoring/snmp/CEPH-PROMETHEUS-ALERT-MIB.txt create mode 100644 monitoring/snmp/README.md diff --git a/monitoring/prometheus/README.md b/monitoring/prometheus/README.md index fde63a35fe2ee..205e373b19c0e 100644 --- a/monitoring/prometheus/README.md +++ b/monitoring/prometheus/README.md @@ -5,3 +5,10 @@ In monitoring/prometheus/alerts you'll find a set of Prometheus alert rules that should provide a decent set of default alerts for a Ceph cluster. Just put this file in a place according to your Prometheus configuration (wherever the `rules` configuration stanza points). + +### SNMP +Ceph provides a MIB (CEPH-PROMETHEUS-ALERT-MIB.txt) to support sending prometheus +alerts through to an SNMP management platform. The translation from prometheus +alert to SNMP trap requires the prometheus alert to contain an OID that maps to +a definition within the MIB. When making changes to the prometheus alert rules +file, developers should include any necessary changes to the MIB. diff --git a/monitoring/snmp/CEPH-PROMETHEUS-ALERT-MIB.txt b/monitoring/snmp/CEPH-PROMETHEUS-ALERT-MIB.txt new file mode 100644 index 0000000000000..0eeda4471cb7d --- /dev/null +++ b/monitoring/snmp/CEPH-PROMETHEUS-ALERT-MIB.txt @@ -0,0 +1,377 @@ +CEPH-PROMETHEUS-ALERT-MIB DEFINITIONS ::= BEGIN + +IMPORTS + MODULE-IDENTITY, OBJECT-TYPE, NOTIFICATION-TYPE, enterprises, TimeTicks + FROM SNMPv2-SMI + DisplayString + FROM SNMPv2-TC +; + +ceph OBJECT IDENTIFIER ::= { enterprises 50495 } +prometheus OBJECT IDENTIFIER ::= { ceph 15 } + +prometheusAlert MODULE-IDENTITY + LAST-UPDATED "201904010000Z" -- 1. Apr 2019 + ORGANIZATION "The Ceph Project" + CONTACT-INFO "https://ceph.com" + DESCRIPTION "Prometheus Alert SNMP MIB" + REVISION "201904010000Z" -- 1. Apr 2019 + DESCRIPTION "Initial version." + ::= { prometheus 1 } + +prometheusAlertObjects OBJECT IDENTIFIER ::= { prometheusAlert 1 } +prometheusAlertTraps OBJECT IDENTIFIER ::= { prometheusAlert 2 } + +-- +-- Objects +-- + +prometheusAlertNotificationAlertName OBJECT-TYPE + SYNTAX DisplayString + MAX-ACCESS accessible-for-notify + STATUS current + DESCRIPTION "The name of the Prometheus alert." +::= { prometheusAlertObjects 1 } + +prometheusAlertNotificationStatus OBJECT-TYPE + SYNTAX DisplayString + MAX-ACCESS accessible-for-notify + STATUS current + DESCRIPTION "The status of the Prometheus alert." +::= { prometheusAlertObjects 2 } + +prometheusAlertNotificationSeverity OBJECT-TYPE + SYNTAX DisplayString + MAX-ACCESS accessible-for-notify + STATUS current + DESCRIPTION "The severity of the Prometheus alert." +::= { prometheusAlertObjects 3 } + +prometheusAlertNotificationInstance OBJECT-TYPE + SYNTAX DisplayString + MAX-ACCESS accessible-for-notify + STATUS current + DESCRIPTION "Unique identifier for the Prometheus instance." +::= { prometheusAlertObjects 4 } + +prometheusAlertNotificationJob OBJECT-TYPE + SYNTAX DisplayString + MAX-ACCESS accessible-for-notify + STATUS current + DESCRIPTION "The name of the Prometheus job." +::= { prometheusAlertObjects 5 } + +prometheusAlertNotificationDescription OBJECT-TYPE + SYNTAX DisplayString + MAX-ACCESS accessible-for-notify + STATUS current + DESCRIPTION "The Prometheus alert description field." +::= { prometheusAlertObjects 6 } + +prometheusAlertNotificationLabels OBJECT-TYPE + SYNTAX DisplayString + MAX-ACCESS accessible-for-notify + STATUS current + DESCRIPTION "Additional Prometheus alert labels as JSON string." +::= { prometheusAlertObjects 7 } + +prometheusAlertNotificationTimestamp OBJECT-TYPE + SYNTAX TimeTicks + MAX-ACCESS accessible-for-notify + STATUS current + DESCRIPTION "The time when the Prometheus alert occurred." +::= { prometheusAlertObjects 8 } + +prometheusAlertNotificationRawData OBJECT-TYPE + SYNTAX DisplayString + MAX-ACCESS accessible-for-notify + STATUS current + DESCRIPTION "The raw Prometheus alert as JSON string." +::= { prometheusAlertObjects 9 } + +-- +-- Traps +-- + +prometheusAlertTrapDefault NOTIFICATION-TYPE + OBJECTS { + prometheusAlertNotificationAlertName, + prometheusAlertNotificationStatus, + prometheusAlertNotificationSeverity, + prometheusAlertNotificationInstance, + prometheusAlertNotificationJob, + prometheusAlertNotificationDescription, + prometheusAlertNotificationLabels, + prometheusAlertNotificationTimestamp, + prometheusAlertNotificationRawData + } + STATUS current + DESCRIPTION "Default trap." +::= { prometheusAlertTraps 1 } + +prometheusAlertClusterHealthTraps OBJECT IDENTIFIER ::= { prometheusAlertTraps 2 } +prometheusAlertMonTraps OBJECT IDENTIFIER ::= { prometheusAlertTraps 3 } +prometheusAlertOsdTraps OBJECT IDENTIFIER ::= { prometheusAlertTraps 4 } +prometheusAlertMdsTraps OBJECT IDENTIFIER ::= { prometheusAlertTraps 5 } +prometheusAlertMgrTraps OBJECT IDENTIFIER ::= { prometheusAlertTraps 6 } +prometheusAlertPgsTraps OBJECT IDENTIFIER ::= { prometheusAlertTraps 7 } +prometheusAlertNodesTraps OBJECT IDENTIFIER ::= { prometheusAlertTraps 8 } +prometheusAlertPoolsTraps OBJECT IDENTIFIER ::= { prometheusAlertTraps 9 } + +prometheusAlertClusterHealthTrapHealthError NOTIFICATION-TYPE + OBJECTS { + prometheusAlertNotificationAlertName, + prometheusAlertNotificationStatus, + prometheusAlertNotificationSeverity, + prometheusAlertNotificationInstance, + prometheusAlertNotificationJob, + prometheusAlertNotificationDescription, + prometheusAlertNotificationLabels, + prometheusAlertNotificationTimestamp, + prometheusAlertNotificationRawData + } + STATUS current + DESCRIPTION "Ceph in health_error state for more than 5m." +::= { prometheusAlertClusterHealthTraps 1 } + +prometheusAlertClusterHealthTrapHealthWarn NOTIFICATION-TYPE + OBJECTS { + prometheusAlertNotificationAlertName, + prometheusAlertNotificationStatus, + prometheusAlertNotificationSeverity, + prometheusAlertNotificationInstance, + prometheusAlertNotificationJob, + prometheusAlertNotificationDescription, + prometheusAlertNotificationLabels, + prometheusAlertNotificationTimestamp, + prometheusAlertNotificationRawData + } + STATUS current + DESCRIPTION "Ceph in health_warn for more than 15m." +::= { prometheusAlertClusterHealthTraps 2 } + +prometheusAlertMonTrapLowMonitorQuorumCount NOTIFICATION-TYPE + OBJECTS { + prometheusAlertNotificationAlertName, + prometheusAlertNotificationStatus, + prometheusAlertNotificationSeverity, + prometheusAlertNotificationInstance, + prometheusAlertNotificationJob, + prometheusAlertNotificationDescription, + prometheusAlertNotificationLabels, + prometheusAlertNotificationTimestamp, + prometheusAlertNotificationRawData + } + STATUS current + DESCRIPTION "Monitor count in quorum is low." +::= { prometheusAlertMonTraps 1 } + +prometheusAlertOsdTrap10PercentOsdsDown NOTIFICATION-TYPE + OBJECTS { + prometheusAlertNotificationAlertName, + prometheusAlertNotificationStatus, + prometheusAlertNotificationSeverity, + prometheusAlertNotificationInstance, + prometheusAlertNotificationJob, + prometheusAlertNotificationDescription, + prometheusAlertNotificationLabels, + prometheusAlertNotificationTimestamp, + prometheusAlertNotificationRawData + } + STATUS current + DESCRIPTION "More than 10% of OSDs are down." +::= { prometheusAlertOsdTraps 1 } + +prometheusAlertOsdTrapOsdDown NOTIFICATION-TYPE + OBJECTS { + prometheusAlertNotificationAlertName, + prometheusAlertNotificationStatus, + prometheusAlertNotificationSeverity, + prometheusAlertNotificationInstance, + prometheusAlertNotificationJob, + prometheusAlertNotificationDescription, + prometheusAlertNotificationLabels, + prometheusAlertNotificationTimestamp, + prometheusAlertNotificationRawData + } + STATUS current + DESCRIPTION "One or more OSDs down for more than 15 minutes." +::= { prometheusAlertOsdTraps 2 } + +prometheusAlertOsdTrapOsdsNearFull NOTIFICATION-TYPE + OBJECTS { + prometheusAlertNotificationAlertName, + prometheusAlertNotificationStatus, + prometheusAlertNotificationSeverity, + prometheusAlertNotificationInstance, + prometheusAlertNotificationJob, + prometheusAlertNotificationDescription, + prometheusAlertNotificationLabels, + prometheusAlertNotificationTimestamp, + prometheusAlertNotificationRawData + } + STATUS current + DESCRIPTION "An OSD is dangerously full, over 80%." +::= { prometheusAlertOsdTraps 3 } + +prometheusAlertOsdTrapFlapOsd NOTIFICATION-TYPE + OBJECTS { + prometheusAlertNotificationAlertName, + prometheusAlertNotificationStatus, + prometheusAlertNotificationSeverity, + prometheusAlertNotificationInstance, + prometheusAlertNotificationJob, + prometheusAlertNotificationDescription, + prometheusAlertNotificationLabels, + prometheusAlertNotificationTimestamp, + prometheusAlertNotificationRawData + } + STATUS current + DESCRIPTION "An OSD was marked down at back up at least once a minute for 5 minutes." +::= { prometheusAlertOsdTraps 4 } + +prometheusAlertOsdTrapHighPgCountDeviation NOTIFICATION-TYPE + OBJECTS { + prometheusAlertNotificationAlertName, + prometheusAlertNotificationStatus, + prometheusAlertNotificationSeverity, + prometheusAlertNotificationInstance, + prometheusAlertNotificationJob, + prometheusAlertNotificationDescription, + prometheusAlertNotificationLabels, + prometheusAlertNotificationTimestamp, + prometheusAlertNotificationRawData + } + STATUS current + DESCRIPTION "An OSD deviates by more then 30% from average PG count." +::= { prometheusAlertOsdTraps 5 } + +prometheusAlertPgsTrapPgsInactive NOTIFICATION-TYPE + OBJECTS { + prometheusAlertNotificationAlertName, + prometheusAlertNotificationStatus, + prometheusAlertNotificationSeverity, + prometheusAlertNotificationInstance, + prometheusAlertNotificationJob, + prometheusAlertNotificationDescription, + prometheusAlertNotificationLabels, + prometheusAlertNotificationTimestamp, + prometheusAlertNotificationRawData + } + STATUS current + DESCRIPTION "One or more PGs are inactive for more than 5 minutes." +::= { prometheusAlertPgsTraps 1 } + +prometheusAlertPgsTrapPgsUnclean NOTIFICATION-TYPE + OBJECTS { + prometheusAlertNotificationAlertName, + prometheusAlertNotificationStatus, + prometheusAlertNotificationSeverity, + prometheusAlertNotificationInstance, + prometheusAlertNotificationJob, + prometheusAlertNotificationDescription, + prometheusAlertNotificationLabels, + prometheusAlertNotificationTimestamp, + prometheusAlertNotificationRawData + } + STATUS current + DESCRIPTION "One or more PGs are not clean for more than 15 minutes." +::= { prometheusAlertPgsTraps 2 } + +prometheusAlertNodesTrapRootVolumeFull NOTIFICATION-TYPE + OBJECTS { + prometheusAlertNotificationAlertName, + prometheusAlertNotificationStatus, + prometheusAlertNotificationSeverity, + prometheusAlertNotificationInstance, + prometheusAlertNotificationJob, + prometheusAlertNotificationDescription, + prometheusAlertNotificationLabels, + prometheusAlertNotificationTimestamp, + prometheusAlertNotificationRawData + } + STATUS current + DESCRIPTION "Root volume (OSD and MON store) is dangerously full (< 5% free)." +::= { prometheusAlertNodesTraps 1 } + +prometheusAlertNodesTrapNetworkPacketsDropped NOTIFICATION-TYPE + OBJECTS { + prometheusAlertNotificationAlertName, + prometheusAlertNotificationStatus, + prometheusAlertNotificationSeverity, + prometheusAlertNotificationInstance, + prometheusAlertNotificationJob, + prometheusAlertNotificationDescription, + prometheusAlertNotificationLabels, + prometheusAlertNotificationTimestamp, + prometheusAlertNotificationRawData + } + STATUS current + DESCRIPTION "A node experiences packet drop > 1 packet/s on an interface." +::= { prometheusAlertNodesTraps 2 } + +prometheusAlertNodesTrapNetworkPacketErrors NOTIFICATION-TYPE + OBJECTS { + prometheusAlertNotificationAlertName, + prometheusAlertNotificationStatus, + prometheusAlertNotificationSeverity, + prometheusAlertNotificationInstance, + prometheusAlertNotificationJob, + prometheusAlertNotificationDescription, + prometheusAlertNotificationLabels, + prometheusAlertNotificationTimestamp, + prometheusAlertNotificationRawData + } + STATUS current + DESCRIPTION "A node experiences packet errors > 1 packet/s on an interface." +::= { prometheusAlertNodesTraps 3 } + +prometheusAlertNodesTrapStorageFilling NOTIFICATION-TYPE + OBJECTS { + prometheusAlertNotificationAlertName, + prometheusAlertNotificationStatus, + prometheusAlertNotificationSeverity, + prometheusAlertNotificationInstance, + prometheusAlertNotificationJob, + prometheusAlertNotificationDescription, + prometheusAlertNotificationLabels, + prometheusAlertNotificationTimestamp, + prometheusAlertNotificationRawData + } + STATUS current + DESCRIPTION "A mountpoint will be full in less then 5 days assuming the average fillup rate of the past 48 hours." +::= { prometheusAlertNodesTraps 4 } + +prometheusAlertPoolsTrapPoolFull NOTIFICATION-TYPE + OBJECTS { + prometheusAlertNotificationAlertName, + prometheusAlertNotificationStatus, + prometheusAlertNotificationSeverity, + prometheusAlertNotificationInstance, + prometheusAlertNotificationJob, + prometheusAlertNotificationDescription, + prometheusAlertNotificationLabels, + prometheusAlertNotificationTimestamp, + prometheusAlertNotificationRawData + } + STATUS current + DESCRIPTION "A pool is at 90% capacity or over." +::= { prometheusAlertPoolsTraps 1 } + +prometheusAlertPoolsTrapPoolFillingUp NOTIFICATION-TYPE + OBJECTS { + prometheusAlertNotificationAlertName, + prometheusAlertNotificationStatus, + prometheusAlertNotificationSeverity, + prometheusAlertNotificationInstance, + prometheusAlertNotificationJob, + prometheusAlertNotificationDescription, + prometheusAlertNotificationLabels, + prometheusAlertNotificationTimestamp, + prometheusAlertNotificationRawData + } + STATUS current + DESCRIPTION "A pool will be full in less then 5 days assuming the average fillup rate of the past 48 hours." +::= { prometheusAlertPoolsTraps 2 } + +END diff --git a/monitoring/snmp/README.md b/monitoring/snmp/README.md new file mode 100644 index 0000000000000..dccef1908f89d --- /dev/null +++ b/monitoring/snmp/README.md @@ -0,0 +1,24 @@ +# SNMP schema + +## Traps + +| OID | Description | +| :--- | :--- | +| 1.3.6.1.4.1.50495.15.1.2.1 | The default trap. This is used if no OID is specified in the alert labels. | +| 1.3.6.1.4.1.50495.15.1.2.[2...N] | Custom traps. | + +## Objects + +The following objects are appended as variable binds to an SNMP trap. + +| OID | Type | Description | +| :--- | :---: | :--- | +| 1.3.6.1.4.1.50495.15.1.1.1 | String | The name of the Prometheus alert. | +| 1.3.6.1.4.1.50495.15.1.1.2 | String | The status of the Prometheus alert. | +| 1.3.6.1.4.1.50495.15.1.1.3 | String | The severity of the Prometheus alert. | +| 1.3.6.1.4.1.50495.15.1.1.4 | String | Unique identifier for the Prometheus instance. | +| 1.3.6.1.4.1.50495.15.1.1.5 | String | The name of the Prometheus job. | +| 1.3.6.1.4.1.50495.15.1.1.6 | String | The Prometheus alert description field. | +| 1.3.6.1.4.1.50495.15.1.1.7 | String | Additional Prometheus alert labels as JSON string. | +| 1.3.6.1.4.1.50495.15.1.1.8 | Unix timestamp | The time when the Prometheus alert occurred. | +| 1.3.6.1.4.1.50495.15.1.1.9 | String | The raw Prometheus alert as JSON string. | \ No newline at end of file -- 2.39.5