From: David Galloway Date: Tue, 31 May 2016 22:44:47 +0000 (-0400) Subject: testnode: Replace old smart.pl with rewritten smart.sh script X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=06f0eec09cbe114ac7b7add90459107d88c9bf54;p=ceph-cm-ansible.git testnode: Replace old smart.pl with rewritten smart.sh script Fixes: http://tracker.ceph.com/issues/14682 Signed-off-by: David Galloway --- diff --git a/roles/testnode/files/libexec/smart.pl b/roles/testnode/files/libexec/smart.pl deleted file mode 100644 index eba57dc..0000000 --- a/roles/testnode/files/libexec/smart.pl +++ /dev/null @@ -1,257 +0,0 @@ -#!/usr/bin/perl - -# {{ ansible_managed }} - -use strict; - -my $warn; -my $crit; -my $out; - -my @out; # array of output messages -my @failedout; # array of failed drive numbers -my $drives; -my $pci; -my $type; -my $mdadm; -my $fullcommand; -my $message; - -my $hostname = `uname -n`; -chomp $hostname; -my $pci = `lspci | /bin/grep -i raid | /bin/grep -v PATA | /usr/bin/head -1`; - -my $smartctl = "/usr/sbin/smartctl"; - -our $realloc = '50'; -our $pend = '1'; -our $uncorrect = '1'; - -if ( $hostname =~ /mira/i ) -{ - $realloc = '200'; - $pend = '1'; - $uncorrect = '1'; -} - -unless ( -x $smartctl ) -{ - $crit = 1; - push(@out,"smartmontools package missing or broken"); -} - - -sub smartctl -{ - my $command=$_[0]; - my $raidtype=$_[1]; - my $drive=$_[2]; - my $scsidev=$_[3]; - - if ( $raidtype =~ /areca/i ) - { - $fullcommand = "sudo $command -a -d areca,$drive $scsidev |"; - } - if ( $raidtype =~ /mdadm/i ) - { - $fullcommand = "sudo $command -a -d ata /dev/$drive|"; - } - if ( $raidtype =~ /none/i ) - { - $fullcommand = "sudo $command -a -d sat /dev/$drive|"; - } - - open(SMART,$fullcommand); - while () - { - if ( $_ =~ /FAILING_NOW/ ) - { - my @fail = split; - $message = "Drive $drive is S.M.A.R.T. failing for $fail[1]"; - $crit = 1; - push(@out,$message); - push(@failedout,$drive); - } - if (( $_ =~ /_sector/i ) || ( $_ =~ /d_uncorrect/i )) - { - my @sector = split; - if ( $sector[1] =~ /reallocated/i ) - { - $type = "reallocated"; - } - if ( $sector[1] =~ /pending/i ) - { - $type = "pending"; - } - if ( $sector[1] =~ /d_uncorrect/i ) - { - $type = "uncorrect"; - } - foreach ( $sector[9] ) - { - my $count = $_; - my $l = chr(ord('a') + $drive - 1); - $message = "Drive $drive (sd$l) has $count $type sectors"; - - if ( ( $type =~ /reallocated/i && $count > $realloc ) && ( $type =~ /pending/i && $count > $pend ) && ( $type =~ /uncorrect/i && $count > $uncorrect ) ) - { - $crit = 1; - push(@out,$message); - push(@failedout,$drive); - } - else - { - if ( $type =~ /reallocated/i && $count > $realloc ) - { - $crit = 1; - push(@out,$message); - push(@failedout,$drive); - } - if ( $type =~ /pending/i && $count > $pend ) - { - $crit = 1; - push(@out,$message); - push(@failedout,$drive); - } - if ( $type =~ /uncorrect/i && $count > $uncorrect ) - { - $crit = 1; - push(@out,$message); - push(@failedout,$drive); - } - } - } - } - } -} - -# software raid! -if (-e "/proc/mdstat") -{ - open(R,"/proc/mdstat"); - while () - { - if (/^(md\d+) : (\w+)/) - { - $mdadm = $mdadm + 1; - } - } - if ( $mdadm gt 0 ) - { - open(BLOCK,"cat /proc/partitions | grep -w sd[a-z] |"); - while () - { - my @output = split; - my $blockdevice = $output[3]; - foreach ( $blockdevice ) - { - $drives++; - smartctl("$smartctl","mdadm",$blockdevice,"none"); - } - } - } -} - -#areca hardware raid -if ( $pci =~ /areca/i) -{ - my $firmware = `sudo /usr/sbin/cli64 sys info | grep -i firm | awk '{print \$5}' | cut -d'-' -f1`; - chomp $firmware; - - if ( $firmware < 2011 ) - { - $crit = 1; - $message = "Controller needs newer firmware for S.M.A.R.T. support"; - push(@out,$message); - } - - open (SG, '/proc/scsi/sg/devices'); - my $sgindex = 0; - while () { - my ($host, $chan, $id, $lun, $type, $opens, $depth, $busy, $online) = split(); - if ($type == 3) { - last; - } - $sgindex++; - } - my $scsidev = "/dev/sg$sgindex"; - open(CLI,"sudo /usr/sbin/cli64 disk info | grep -vi Modelname | grep -v ====== | grep -vi GuiErr |"); - while () - { - $drives++; - if ( $_ =~ /^\ \ [0-9]+/ ) - { - my @info = split(/\s+/,$_); - foreach ($info[1]) - { - my $drive = $_; - my $status = $info[$#info]; - if ( $status =~ /Failed/ || $status =~ /N\.A\./ ) { - push(@out, "Drive $drive $status"); - push(@failedout,$drive); - } else { - smartctl("$smartctl","areca",$drive,$scsidev); - } - } - } - } -} - -# assume JBOD/direct access if not areca or hw raid -if ( $mdadm == 0 && $pci !~ /areca/i ) -{ - open(BLOCK,"cat /proc/partitions | grep -w sd[a-z] |"); - while () - { - my @output = split; - my $blockdevice = $output[3]; - foreach ( $blockdevice ) - { - $drives++; - smartctl("$smartctl","none",$blockdevice,"none"); - } - } -} - -# show results -my $result = 0; -$result = 1 if $warn; -$result = 2 if $crit; -# print "warn = $warn crit = $crit\n"; - -my $out = "No real disks found on machine"; -$out = "All $drives drives happy as clams" if $drives; - - -# count unique num failed drives -my %counts = (); -for (@failedout) { - $counts{$_}++; -} - -my $uniquedrives = 0; -foreach my $keys (keys %counts) { - $uniquedrives++; -} - -# prints multiline output unless -s flag used -if ($ARGV[0] =~ /-s/) { - if (@out) - { - # this outputs all messages to one line presumably - # because nagios < v3.0 couldn't handle multiline output - $out = join('; ', @out); - } - - print "$out\n"; -} else { - if (@out) { - print "$uniquedrives of $drives drives failing/missing |\n"; - foreach my $line (@out) { - print $line, "\n"; - } - } else { - print "$out\n"; - } -} -exit $result; diff --git a/roles/testnode/files/libexec/smart.sh b/roles/testnode/files/libexec/smart.sh new file mode 100755 index 0000000..486b5cc --- /dev/null +++ b/roles/testnode/files/libexec/smart.sh @@ -0,0 +1,290 @@ +#!/bin/bash +# Description: Bash script to check drive health using pending, uncorrectable, +# and reallocated sector count +# +# Nagios return codes: 0 = OK; 1 = WARNING; 2 = CRITICAL; 3 = UNKNOWN +# SMART Attribute Codes: +# 5 = Reallocated +# 187 = Reported Uncorrect +# 197 = Pending +# 198 = Uncorrectable Sector Count +# +# TO-DO: Add support for dynamic SMART attribute lookup. For example, +# 187 is reported for Seagate HDD and all SSDs but not Hitachi HDDs. +# +# See https://en.wikipedia.org/wiki/S.M.A.R.T.#ATA_S.M.A.R.T._attributes + +### Define global variables ### +# total number of drives (or RAID slots) discovered +numdrives=0 +# Number of failed, failing, and/or missing drives +failingdrives=0 +# Fallback message for UNKNOWN return code output +unknownmsg="Unknown error" +# Return code for nagios (Default to UNKNOWN) +rc=3 +# Array of messages indicating drive health. Output after nagios status. +declare -a messages + +### Functions ### +main () +{ + preflight + + if [ "$raid" = true ] + then + areca_smart + areca_failed + elif [ "$raid" = false ] + then + normal_smart + else + echo "ERROR - Could not determine if RAID present" + exit 3 + fi + + ## Return UNKNOWN if no drives found + if [ "$numdrives" -eq "0" ] + then + unknownmsg="No drives found!" + rc=3 + fi + + ## Return code and service status for nagios + if [ "$rc" = 0 ] + then + echo "OK - All $numdrives drives healthy" + elif [ "$rc" = 1 ] + then + echo "WARNING - $failingdrives of $numdrives drives sick" + elif [ "$rc" = 2 ] + then + echo "CRITICAL - $failingdrives of $numdrives drives need replacing" + elif [ "$rc" = 3 ] + then + echo "UNKNOWN - $unknownmsg" + else + echo "ERROR - Got no return code" + fi + + ## Iterate through array of messages + # Nagios reads and displays the first line of output on the Services page. + # All individual messages about failed/failing disk statistics can be viewed + # on the individual system's SMART detail page in nagios. + for msg in "${messages[@]}" + do + echo "$msg" + done + + exit $rc +} + +# Pre-flight checks +preflight () +{ + # Set raid var then check for cli64 command and bail if missing + if lspci | grep -qi areca + then + raid=true + else + raid=false + fi + + if [ "$raid" = true ] && ! [ -x "$(command -v cli64)" ] + then + echo "ERROR - cli64 command not found or is not executable" + exit 3 + fi + + # Check for smartmontools and bail if missing + if ! [ -x "$(command -v smartctl)" ] + then + echo "ERROR - smartctl is not installed or is not executable" + echo "yum/apt-get install smartmontools" + exit 3 + fi +} + +# Gather smart data for drives behind Areca RAID controller +areca_smart () +{ + # Store output of cli64 to reduce repeated executions + cli64out=$(sudo cli64 disk info | grep -E "Slot#[[:digit:]]") + numdrives=$(echo "$cli64out" | wc -l) + # Loop through all disks not marked as 'N.A.' or 'Failed' + for slot in $(echo "$cli64out" | grep -v 'N.A.\|Failed' \ + | grep -o "Slot#[[:digit:]]" | cut -c6-) + do + failed=false + # Determine if disk is JBOD or part of hardware RAID + if echo "$cli64out" | grep -E "Slot#$slot" | grep -q 'JBOD' + then + jbod=true + else + jbod=false + fi + output=$(sudo cli64 disk smart drv=$slot \ + | grep -E "^ "5"|^"197"|^"198"" | awk '{ print $(NF-1) }' | tr '\n' ' ') + outputcount=$(echo $output | wc -w) + # Only continue if we received 3 SMART data points + if [ "$outputcount" = "3" ] + then + # Only do slot to drive letter matching once per bad JBOD + if [[ $output != "0 0 0 " ]] && [ "$jbod" = true ] + then + dl=$(areca_bay_to_letter $slot) + elif [ "$jbod" = false ] + then + dl="(RAID)" + fi + read reallocated pending uncorrect <<< $output + if [ "$reallocated" != "0" ] + then + messages+=("Drive $slot $dl has $reallocated reallocated sectors") + failed=true + # A small number of reallocated sectors is OK + if [ "$reallocated" -le 5 ] + then + rc=1 # Warn if <= 5 + else + rc=2 # Crit if >5 + fi + fi + if [ "$pending" != "0" ] + then + messages+=("Drive $slot $dl has $pending pending sectors") + failed=true + rc=2 + fi + if [ "$uncorrect" != "0" ] + then + messages+=("Drive $slot $dl has $uncorrect uncorrect sectors") + failed=true + rc=2 + fi + else + messages+=("Drive $slot returned $outputcount of 3 expected attributes") + unknownmsg="SMART data could not be read for one or more drives" + rc=3 + fi + # Make sure drives with multiple types of bad sectors only get counted once + if [ "$failed" = true ] + then + let "failingdrives+=1" + fi + done +} + +# Correlate Areca drive bay to drive letter +areca_bay_to_letter () +{ + # Get S/N according to RAID controller given argument $1 (slot #) + areca_serial=$(sudo cli64 disk info drv=$1 | grep 'Serial Number' \ + | awk '{ print $NF }') + # Loop through and get S/N according to smartctl given drive name + for dl in $(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' \ + | awk '{ print $NF }') + do + smart_serial=$(sudo smartctl -a /dev/$dl | grep "Serial number" \ + | awk '{ print $NF }') + # If cli64 and smartctl find a S/N match, return drive letter + if [ "$areca_serial" = "$smart_serial" ] + then + echo "($dl)" + fi + done +} + +# Tally missing and failed drives connected to Areca RAID +areca_failed () +{ + # Store output of cli64 to reduce repeated executions + cli64out=$(sudo cli64 disk info | grep -E "Slot#[[:digit:]]") + # Missing (N.A.) drives + for drive in $(echo "$cli64out" | grep -E "Slot#[[:digit:]]" \ + | grep "N.A." | awk '{ print $1 }') + do + messages+=("Drive $drive is missing") + let "failingdrives+=1" + rc=2 + done + # Hard failed drives + for drive in $(echo "$cli64out" | grep -E "Slot#[[:digit:]]" \ + | grep 'Failed' | awk '{ print $1 }') + do + messages+=("Drive $drive failed") + let "failingdrives+=1" + rc=2 + done +} + +# Standard SATA/SAS drive smartctl check +normal_smart () +{ + # The grep regex will include drives named sdaa, for example + numdrives=$(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' | wc -l) + for l in $(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' \ + | awk '{ print $NF }') + do + failed=false + output=$(sudo smartctl -a /dev/$l | grep -E "^ "5"|^"197"|^"198"" \ + | awk '{ print $NF }' | tr '\n' ' ') + outputcount=$(echo $output | wc -w) + # Check if drive is SSD and set var accordingly + if sudo smartctl -i /dev/$l | grep -q 'Solid State Device'; then + is_ssd=true + else + is_ssd=false + fi + # Only continue if we received 3 SMART data points and drive is not SSD + if [ "$outputcount" = "3" ] && [ "$is_ssd" = false ] + then + read reallocated pending uncorrect <<< $output + if [ "$reallocated" != "0" ] + then + messages+=("Drive $l has $reallocated reallocated sectors") + failed=true + # A small number of reallocated sectors is OK + if [ "$reallocated" -le 5 ] + then + rc=1 # Warn if <= 5 + else + rc=2 # Crit if >5 + fi + fi + if [ "$pending" != "0" ] + then + messages+=("Drive $l has $pending pending sectors") + failed=true + rc=2 + fi + if [ "$uncorrect" != "0" ] + then + messages+=("Drive $l has $uncorrect uncorrect sectors") + failed=true + rc=2 + fi + elif [ "$outputcount" != "3" ] && [ "$is_ssd" = false ] + then + messages+=("Drive $l returned $outputcount of 3 expected attributes") + unknownmsg="SMART data could not be read for one or more drives" + rc=3 + # Set no return code and assume any SSD is healthy for now + elif [ "$is_ssd" = true ] + then + messages+=("Drive $l is an SSD. Not yet supported.") + rc=0 + else + messages+=("Error processing data for drive $l") + rc=3 + fi + # Make sure drives with multiple types of bad sectors only get counted once + if [ "$failed" = true ] + then + let "failingdrives+=1" + fi + done +} + +## Call main() function +main diff --git a/roles/testnode/tasks/disk_monitoring.yml b/roles/testnode/tasks/disk_monitoring.yml index d48e229..2d06a17 100644 --- a/roles/testnode/tasks/disk_monitoring.yml +++ b/roles/testnode/tasks/disk_monitoring.yml @@ -28,6 +28,6 @@ group: root mode: 0755 with_items: - - smart.pl + - smart.sh - raid.pl - diskusage.pl diff --git a/roles/testnode/templates/nagios/nrpe.cfg b/roles/testnode/templates/nagios/nrpe.cfg index db2448a..84435c8 100644 --- a/roles/testnode/templates/nagios/nrpe.cfg +++ b/roles/testnode/templates/nagios/nrpe.cfg @@ -22,7 +22,7 @@ command[check_zombie_procs]={{ nagios_plugins_directory }}/check_procs --warning command[check_total_procs]={{ nagios_plugins_directory }}/check_procs --warning=300 --critical=500 command[check_raid]=/usr/libexec/raid.pl command[check_disks]=/usr/libexec/diskusage.pl 90 95 -command[check_smart]=/usr/libexec/smart.pl +command[check_smart]=/usr/libexec/smart.sh include=/etc/nagios/nrpe_local.cfg