]> git.apps.os.sepia.ceph.com Git - ceph-cm-ansible.git/commitdiff
testnode: Replace old smart.pl with rewritten smart.sh script 245/head
author David Galloway <dgallowa@redhat.com>
Tue, 31 May 2016 22:44:47 +0000 (18:44 -0400)
committer David Galloway <dgallowa@redhat.com>
Fri, 3 Jun 2016 18:36:33 +0000 (14:36 -0400)
Fixes: http://tracker.ceph.com/issues/14682
Signed-off-by: David Galloway <dgallowa@redhat.com>
roles/testnode/files/libexec/smart.pl [deleted file]
roles/testnode/files/libexec/smart.sh [new file with mode: 0755]
roles/testnode/tasks/disk_monitoring.yml
roles/testnode/templates/nagios/nrpe.cfg

diff --git a/roles/testnode/files/libexec/smart.pl b/roles/testnode/files/libexec/smart.pl
deleted file mode 100644 (file)
index eba57dc..0000000
+++ /dev/null
@@ -1,257 +0,0 @@
-#!/usr/bin/perl
-
-# {{ ansible_managed }}
-
-use strict;
-
-my $warn;
-my $crit;
-my $out;
-
-my @out; # array of output messages
-my @failedout; # array of failed drive numbers
-my $drives;
-my $pci;
-my $type;
-my $mdadm;
-my $fullcommand;
-my $message;
-
-my $hostname = `uname -n`;
-chomp $hostname;
-my $pci = `lspci | /bin/grep -i raid | /bin/grep -v PATA | /usr/bin/head -1`;
-
-my $smartctl = "/usr/sbin/smartctl";
-
-our $realloc = '50';
-our $pend = '1';
-our $uncorrect = '1';
-
-if ( $hostname =~ /mira/i )
-{
-       $realloc = '200';
-       $pend = '1';
-        $uncorrect = '1';
-}
-
-unless ( -x $smartctl )
-{
-       $crit = 1;
-       push(@out,"smartmontools package missing or broken");
-}
-
-
-sub smartctl
-{
-       my $command=$_[0];
-       my $raidtype=$_[1];
-       my $drive=$_[2];
-       my $scsidev=$_[3];
-
-       if ( $raidtype =~ /areca/i )
-       {
-               $fullcommand = "sudo $command -a -d areca,$drive $scsidev |";
-       }
-        if ( $raidtype =~ /mdadm/i )
-        {
-                $fullcommand = "sudo $command -a -d ata /dev/$drive|";
-        }
-       if ( $raidtype =~ /none/i )
-       {
-               $fullcommand = "sudo $command -a -d sat /dev/$drive|";
-       }
-
-       open(SMART,$fullcommand);
-       while (<SMART>)
-       {
-               if ( $_ =~ /FAILING_NOW/ )
-               {
-                       my @fail = split;
-                       $message = "Drive $drive is S.M.A.R.T. failing for $fail[1]";
-                       $crit = 1;
-                       push(@out,$message);
-                       push(@failedout,$drive);
-               }
-               if (( $_ =~ /_sector/i ) || ( $_ =~ /d_uncorrect/i ))
-               {
-                       my @sector = split;
-                       if ( $sector[1] =~ /reallocated/i  )
-                       {
-                               $type = "reallocated";
-                       }
-                       if ( $sector[1] =~ /pending/i  )
-                       {
-                               $type = "pending";
-                       }
-                        if ( $sector[1] =~ /d_uncorrect/i  )
-                        {
-                                $type = "uncorrect";
-                        }
-                       foreach ( $sector[9] )
-                       {
-                               my $count = $_;
-                               my $l = chr(ord('a') + $drive - 1);
-                               $message = "Drive $drive (sd$l) has $count $type sectors";
-
-                               if ( ( $type =~ /reallocated/i && $count > $realloc ) && ( $type =~ /pending/i && $count > $pend ) && ( $type =~ /uncorrect/i && $count > $uncorrect  ) )
-                               {
-                                       $crit = 1;
-                                       push(@out,$message);
-                                       push(@failedout,$drive);
-                               }
-                               else
-                               {
-                                       if ( $type =~ /reallocated/i && $count > $realloc )
-                                       {
-                                               $crit = 1;
-                                               push(@out,$message);
-                                               push(@failedout,$drive);
-                                       }
-                                       if ( $type =~ /pending/i && $count > $pend )
-                                       {
-                                               $crit = 1;
-                                               push(@out,$message);
-                                               push(@failedout,$drive);
-                                       }
-                                       if ( $type =~ /uncorrect/i && $count > $uncorrect )
-                                       {
-                                               $crit = 1;
-                                               push(@out,$message);
-                                               push(@failedout,$drive);
-                                       }
-                               }
-                       }
-               }
-       }
-}
-
-# software raid!
-if (-e "/proc/mdstat") 
-{
-       open(R,"/proc/mdstat");
-       while (<R>)
-       {
-               if (/^(md\d+) : (\w+)/)
-               {
-                       $mdadm = $mdadm + 1;
-               }
-       }
-       if ( $mdadm gt 0 )
-       {
-        open(BLOCK,"cat /proc/partitions | grep -w sd[a-z] |");
-               while (<BLOCK>)
-               {
-                       my @output = split;
-                       my $blockdevice = $output[3];
-                       foreach ( $blockdevice )
-                       {
-                               $drives++;
-                               smartctl("$smartctl","mdadm",$blockdevice,"none");
-                       }
-               }
-       }
-}
-
-#areca hardware raid
-if ( $pci =~ /areca/i)
-{
-       my $firmware = `sudo /usr/sbin/cli64 sys info | grep -i firm | awk '{print \$5}' | cut -d'-' -f1`;
-       chomp $firmware;
-
-       if ( $firmware < 2011 )
-       {
-               $crit = 1;
-               $message = "Controller needs newer firmware for S.M.A.R.T. support";
-               push(@out,$message);
-       }
-
-        open (SG, '/proc/scsi/sg/devices');
-       my $sgindex = 0;
-       while (<SG>) {
-               my ($host, $chan, $id, $lun, $type, $opens, $depth, $busy, $online) = split();
-               if ($type == 3) {
-                       last;
-               }
-               $sgindex++;
-       }
-       my $scsidev = "/dev/sg$sgindex";
-        open(CLI,"sudo /usr/sbin/cli64 disk info | grep -vi Modelname | grep -v ====== | grep -vi GuiErr |");
-       while (<CLI>)
-       {
-               $drives++;
-               if ( $_ =~ /^\ \ [0-9]+/ )
-               {
-                       my @info = split(/\s+/,$_);
-                       foreach ($info[1])
-                       {
-                               my $drive = $_;
-                               my $status = $info[$#info];
-                if ( $status =~ /Failed/ || $status =~ /N\.A\./ ) {
-                    push(@out, "Drive $drive $status");
-                   push(@failedout,$drive);
-                } else {
-                    smartctl("$smartctl","areca",$drive,$scsidev);
-                }
-                       }
-               }
-       }
-}
-
-# assume JBOD/direct access if not areca or hw raid
-if ( $mdadm == 0 && $pci !~ /areca/i )
-{
-       open(BLOCK,"cat /proc/partitions | grep -w sd[a-z] |");
-       while (<BLOCK>)
-       {
-               my @output = split;
-               my $blockdevice = $output[3];
-               foreach ( $blockdevice )
-               {
-                       $drives++;
-                       smartctl("$smartctl","none",$blockdevice,"none");
-               }
-       }
-}
-
-# show results
-my $result = 0;
-$result = 1 if $warn;
-$result = 2 if $crit;
-# print "warn = $warn crit = $crit\n";
-
-my $out = "No real disks found on machine";
-$out = "All $drives drives happy as clams" if $drives;
-
-
-# count unique num failed drives
-my %counts = ();
-for (@failedout) {
-       $counts{$_}++;
-}
-
-my $uniquedrives = 0;
-foreach my $keys (keys %counts) {
-       $uniquedrives++;
-}
-
-# prints multiline output unless -s flag used
-if ($ARGV[0] =~ /-s/) {
-       if (@out)
-       {
-               # this outputs all messages to one line presumably
-               # because nagios < v3.0 couldn't handle multiline output
-               $out = join(';     ', @out);
-       }
-
-       print "$out\n";
-} else {
-       if (@out) {
-               print "$uniquedrives of $drives drives failing/missing |\n";
-               foreach my $line (@out) {
-                       print $line, "\n";
-               }
-       } else {
-               print "$out\n";
-       }
-}
-exit $result;
diff --git a/roles/testnode/files/libexec/smart.sh b/roles/testnode/files/libexec/smart.sh
new file mode 100755 (executable)
index 0000000..486b5cc
--- /dev/null
@@ -0,0 +1,290 @@
+#!/bin/bash
+# Description: Bash script to check drive health using pending, uncorrectable,
+# and reallocated sector count
+#
+# Nagios return codes: 0 = OK; 1 = WARNING; 2 = CRITICAL; 3 = UNKNOWN
+# SMART Attribute Codes:
+#   5 = Reallocated
+#   187 = Reported Uncorrect
+#   197 = Pending
+#   198 = Uncorrectable Sector Count
+#
+# TO-DO: Add support for dynamic SMART attribute lookup.  For example,
+#        187 is reported for Seagate HDD and all SSDs but not Hitachi HDDs.
+#
+# See https://en.wikipedia.org/wiki/S.M.A.R.T.#ATA_S.M.A.R.T._attributes
+
+### Define global variables ###
+# total number of drives (or RAID slots) discovered
+numdrives=0
+# Number of failed, failing, and/or missing drives
+failingdrives=0
+# Fallback message for UNKNOWN return code output
+unknownmsg="Unknown error"
+# Return code for nagios (Default to UNKNOWN)
+rc=3
+# Array of messages indicating drive health.  Output after nagios status.
+declare -a messages
+
+### Functions ###
+main ()
+{
+  preflight
+
+  if [ "$raid" = true ]
+  then
+    areca_smart
+    areca_failed
+  elif [ "$raid" = false ]
+  then
+    normal_smart
+  else
+    echo "ERROR - Could not determine if RAID present"
+    exit 3
+  fi
+
+  ## Return UNKNOWN if no drives found
+  if [ "$numdrives" -eq "0" ]
+  then
+    unknownmsg="No drives found!"
+    rc=3
+  fi
+  
+  ## Return code and service status for nagios
+  if [ "$rc" = 0 ]
+  then
+    echo "OK - All $numdrives drives healthy"
+  elif [ "$rc" = 1 ]
+  then
+    echo "WARNING - $failingdrives of $numdrives drives sick"
+  elif [ "$rc" = 2 ]
+  then
+    echo "CRITICAL - $failingdrives of $numdrives drives need replacing"
+  elif [ "$rc" = 3 ]
+  then
+    echo "UNKNOWN - $unknownmsg"
+  else
+    echo "ERROR - Got no return code"
+  fi
+  
+  ## Iterate through array of messages
+  # Nagios reads and displays the first line of output on the Services page.
+  # All individual messages about failed/failing disk statistics can be viewed
+  # on the individual system's SMART detail page in nagios.
+  for msg in "${messages[@]}"
+  do
+    echo "$msg"
+  done
+  
+  exit $rc
+}
+
+# Pre-flight checks
+preflight ()
+{
+  # Set raid var then check for cli64 command and bail if missing
+  if lspci | grep -qi areca
+  then
+    raid=true
+  else
+    raid=false
+  fi
+  
+  if [ "$raid" = true ] && ! [ -x "$(command -v cli64)" ]
+  then
+    echo "ERROR - cli64 command not found or is not executable"
+    exit 3
+  fi
+  
+  # Check for smartmontools and bail if missing
+  if ! [ -x "$(command -v smartctl)" ]
+  then
+    echo "ERROR - smartctl is not installed or is not executable"
+    echo "yum/apt-get install smartmontools"
+    exit 3
+  fi
+}
+
+# Gather smart data for drives behind Areca RAID controller
+areca_smart ()
+{
+  # Store output of cli64 to reduce repeated executions
+  cli64out=$(sudo cli64 disk info | grep -E "Slot#[[:digit:]]")
+  numdrives=$(echo "$cli64out" | wc -l)
+  # Loop through all disks not marked as 'N.A.' or 'Failed'
+  for slot in $(echo "$cli64out" | grep -v 'N.A.\|Failed' \
+  | grep -o "Slot#[[:digit:]]" | cut -c6-)
+  do
+    failed=false
+    # Determine if disk is JBOD or part of hardware RAID
+    if echo "$cli64out" | grep -E "Slot#$slot" | grep -q 'JBOD'
+    then
+      jbod=true
+    else
+      jbod=false
+    fi
+    output=$(sudo cli64 disk smart drv=$slot \
+    | grep -E "^  "5"|^"197"|^"198"" | awk '{ print $(NF-1) }' | tr '\n' ' ')
+    outputcount=$(echo $output | wc -w)
+    # Only continue if we received 3 SMART data points
+    if [ "$outputcount" = "3" ]
+    then
+      # Only do slot to drive letter matching once per bad JBOD
+      if [[ $output != "0 0 0 " ]] && [ "$jbod" = true ]
+      then
+        dl=$(areca_bay_to_letter $slot)
+      elif [ "$jbod" = false ]
+      then
+        dl="(RAID)"
+      fi
+      read reallocated pending uncorrect <<< $output
+      if [ "$reallocated" != "0" ]
+      then
+        messages+=("Drive $slot $dl has $reallocated reallocated sectors")
+        failed=true
+        # A small number of reallocated sectors is OK
+        if [ "$reallocated" -le 5 ]
+        then
+          rc=1 # Warn if <= 5
+        else
+          rc=2 # Crit if >5
+        fi
+      fi
+      if [ "$pending" != "0" ]
+      then
+        messages+=("Drive $slot $dl has $pending pending sectors")
+        failed=true
+        rc=2
+      fi
+      if [ "$uncorrect" != "0" ]
+      then
+        messages+=("Drive $slot $dl has $uncorrect uncorrect sectors")
+        failed=true
+        rc=2
+      fi
+    else
+      messages+=("Drive $slot returned $outputcount of 3 expected attributes")
+      unknownmsg="SMART data could not be read for one or more drives"
+      rc=3
+    fi
+    # Make sure drives with multiple types of bad sectors only get counted once
+    if [ "$failed" = true ]
+    then
+      let "failingdrives+=1"
+    fi
+  done
+}
+
+# Correlate Areca drive bay to drive letter
+areca_bay_to_letter ()
+{
+  # Get S/N according to RAID controller given argument $1 (slot #)
+  areca_serial=$(sudo cli64 disk info drv=$1 | grep 'Serial Number' \
+  | awk '{ print $NF }')
+  # Loop through and get S/N according to smartctl given drive name
+  for dl in $(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' \
+  | awk '{ print $NF }')
+  do
+    smart_serial=$(sudo smartctl -a /dev/$dl | grep "Serial number" \
+    | awk '{ print $NF }')
+    # If cli64 and smartctl find a S/N match, return drive letter
+    if [ "$areca_serial" = "$smart_serial" ]
+    then
+      echo "($dl)"
+    fi
+  done
+}
+
+# Tally missing and failed drives connected to Areca RAID
+areca_failed ()
+{
+  # Store output of cli64 to reduce repeated executions
+  cli64out=$(sudo cli64 disk info | grep -E "Slot#[[:digit:]]")
+  # Missing (N.A.) drives
+  for drive in $(echo "$cli64out" | grep -E "Slot#[[:digit:]]" \
+  | grep "N.A." | awk '{ print $1 }')
+  do
+    messages+=("Drive $drive is missing")
+    let "failingdrives+=1"
+    rc=2
+  done
+  # Hard failed drives
+  for drive in $(echo "$cli64out" | grep -E "Slot#[[:digit:]]" \
+  | grep 'Failed' | awk '{ print $1 }')
+  do
+    messages+=("Drive $drive failed")
+    let "failingdrives+=1"
+    rc=2
+  done
+}
+
+# Standard SATA/SAS drive smartctl check
+normal_smart ()
+{
+  # The grep regex will include drives named sdaa, for example
+  numdrives=$(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' | wc -l)
+  for l in $(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' \
+  | awk '{ print $NF }')
+  do
+    failed=false
+    output=$(sudo smartctl -a /dev/$l | grep -E "^  "5"|^"197"|^"198"" \
+    | awk '{ print $NF }' | tr '\n' ' ')
+    outputcount=$(echo $output | wc -w)
+    # Check if drive is SSD and set var accordingly
+    if sudo smartctl -i /dev/$l | grep -q 'Solid State Device'; then
+      is_ssd=true
+    else
+      is_ssd=false
+    fi
+    # Only continue if we received 3 SMART data points and drive is not SSD
+    if [ "$outputcount" = "3" ] && [ "$is_ssd" = false ]
+    then
+      read reallocated pending uncorrect <<< $output
+      if [ "$reallocated" != "0" ]
+      then
+        messages+=("Drive $l has $reallocated reallocated sectors")
+        failed=true
+        # A small number of reallocated sectors is OK
+        if [ "$reallocated" -le 5 ]
+        then
+          rc=1 # Warn if <= 5
+        else
+          rc=2 # Crit if >5
+        fi
+      fi
+      if [ "$pending" != "0" ]
+      then
+        messages+=("Drive $l has $pending pending sectors")
+        failed=true
+        rc=2
+      fi
+      if [ "$uncorrect" != "0" ]
+      then
+        messages+=("Drive $l has $uncorrect uncorrect sectors")
+        failed=true
+        rc=2
+      fi
+    elif [ "$outputcount" != "3" ] && [ "$is_ssd" = false ]
+    then
+      messages+=("Drive $l returned $outputcount of 3 expected attributes")
+      unknownmsg="SMART data could not be read for one or more drives"
+      rc=3
+    # Set no return code and assume any SSD is healthy for now
+    elif [ "$is_ssd" = true ]
+    then
+      messages+=("Drive $l is an SSD.  Not yet supported.")
+      rc=0
+    else
+      messages+=("Error processing data for drive $l")
+      rc=3
+    fi
+    # Make sure drives with multiple types of bad sectors only get counted once
+    if [ "$failed" = true ]
+    then
+      let "failingdrives+=1"
+    fi
+  done
+}
+
+## Call main() function
+main
index d48e2295dd68023de6ad5882d794603bb920b88c..2d06a177ee2526f15aa607c918141f5424636106 100644 (file)
@@ -28,6 +28,6 @@
     group: root
     mode: 0755
   with_items:
-    - smart.pl
+    - smart.sh
     - raid.pl
     - diskusage.pl
index db2448a62df5555771ec6a23d888c397544af882..84435c8992c80a6768231d36e6cf49ccab41a2c8 100644 (file)
@@ -22,7 +22,7 @@ command[check_zombie_procs]={{ nagios_plugins_directory }}/check_procs --warning
 command[check_total_procs]={{ nagios_plugins_directory }}/check_procs --warning=300 --critical=500
 command[check_raid]=/usr/libexec/raid.pl
 command[check_disks]=/usr/libexec/diskusage.pl 90 95
-command[check_smart]=/usr/libexec/smart.pl
+command[check_smart]=/usr/libexec/smart.sh
 
 include=/etc/nagios/nrpe_local.cfg