git.apps.os.sepia.ceph.com Git - ceph-cm-ansible.git/commitdiff
common: Add support for NVMe drives in smart.sh 318/head
author David Galloway <dgallowa@redhat.com>
Thu, 13 Apr 2017 19:38:15 +0000 (15:38 -0400)
committer David Galloway <dgallowa@redhat.com>
Thu, 13 Apr 2017 20:45:17 +0000 (16:45 -0400)
Signed-off-by: David Galloway <dgallowa@redhat.com>
roles/common/files/libexec/smart.sh

index 810f3505f8a539404d00ab79f4a2db52435d1531..84c8319142e9a5e4663c367c379d5d1cf5052f24 100755 (executable)
@@ -23,6 +23,8 @@ failingdrives=0
 unknownmsg="Unknown error"
 # Return code for nagios (Default to SUCCESS)
 rc=0
+# Path to the nvme-cli executable (must be executable whenever NVMe devices are present)
+nvmecli="/usr/sbin/nvme"
 # Array of messages indicating drive health.  Output after nagios status.
 declare -a messages
 
@@ -43,6 +45,11 @@ main ()
     exit 3
   fi
 
+  if [ "$nvme" = true ]
+  then
+    nvme_smart
+  fi
+
   ## Return UNKNOWN if no drives found
   if [ "$numdrives" -eq "0" ]
   then
@@ -103,6 +110,17 @@ preflight ()
     echo "yum/apt-get install smartmontools"
     exit 3
   fi
+
+  # Check for nvme devices and nvme-cli executable
+  if cat /proc/partitions | grep -q nvme
+  then
+    nvme=true
+    if ! [ -x "$nvmecli" ]
+    then
+      echo "ERROR - NVMe Device detected but no nvme-cli executable"
+      exit 3
+    fi
+  fi
 }
 
 # Gather smart data for drives behind Areca RAID controller
@@ -288,5 +306,72 @@ normal_smart ()
   done
 }
 
+nvme_smart ()
+{
+  # Loop through NVMe devices
+  for nvmedisk in $(sudo $nvmecli list | grep nvme | awk '{ print $1 }')
+  do
+    # Include NVMe devices in overall drive count
+    let "numdrives+=1"
+    # Clear output variable from any previous disk checks
+    output=""
+    output=$(sudo $nvmecli smart-log $nvmedisk | \
+             grep -E "^"critical_warning"|^"percentage_used"|^"media_errors"|^"num_err_log_entries"" \
+             | awk '{ print $NF }' | sed 's/%//' | tr '\n' ' ')
+    outputcount=$(echo $output | wc -w)
+    # Only continue if we received 4 SMART data points
+    if [ "$outputcount" = "4" ]
+    then
+      read critical_warning percentage_used media_errors num_err_log_entries <<< $output
+      # Check for critical warnings
+      if [ "$critical_warning" != "0" ]
+      then
+        messages+=("$nvmedrive indicates there is a critical warning")
+        failed=true
+        rc=1
+      fi
+      # Alert if >= 90% of manufacturer predicted life consumed
+      if [ "$percentage_used" -ge 90 ] && [ "$percentage_used" -lt 100 ]
+      then
+        messages+=("$nvmedisk has estimated $(expr 100 - $percentage_used)% life remaining")
+        failed=true
+        rc=1 # Warn if >= 90 and < 100
+      elif [ "$percentage_used" -ge 100 ]
+      then
+        messages+=("$nvmedisk has consumed $percentage_used% of its estimated life")
+        failed=true
+        rc=2 # Crit if > 100
+      fi
+      # Check for media errors
+      if [ "$media_errors" != "0" ]
+      then
+        messages+=("$nvmedisk indicates there are $media_errors media errors")
+        failed=true
+        rc=2
+      fi
+      # Check for error log entries
+      if [ "$num_err_log_entries" != "0" ]
+      then
+        messages+=("$nvmedisk indicates there are $num_err_log_entries error log entries")
+        failed=true
+        rc=2
+      fi
+    elif [ "$outputcount" != "4" ]
+    then
+      messages+=("$nvmedisk returned $outputcount of 4 expected attributes")
+      unknownmsg="SMART data could not be read for one or more drives"
+      rc=3
+    else
+      messages+=("Error processing data for $nvmedisk")
+      rc=3
+    fi
+    # Make sure NVMe devices with more than one type of error only get counted once
+    if [ "$failed" = true ]
+    then
+      let "failingdrives+=1"
+    fi
+  done
+}
+
 ## Call main() function
 main