From: David Galloway Date: Tue, 3 Apr 2018 18:42:43 +0000 (-0400) Subject: common: Fixes to smart.sh to support multiple attributes X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=2c7ff1670779697d582d56db0cdf993cef4ccc51;p=ceph-cm-ansible.git common: Fixes to smart.sh to support multiple attributes Signed-off-by: David Galloway --- diff --git a/roles/common/files/libexec/smart.sh b/roles/common/files/libexec/smart.sh index 84c8319..60db5b9 100755 --- a/roles/common/files/libexec/smart.sh +++ b/roles/common/files/libexec/smart.sh @@ -3,14 +3,6 @@ # and reallocated sector count # # Nagios return codes: 0 = OK; 1 = WARNING; 2 = CRITICAL; 3 = UNKNOWN -# SMART Attribute Codes: -# 5 = Reallocated -# 187 = Reported Uncorrect -# 197 = Pending -# 198 = Uncorrectable Sector Count -# -# TO-DO: Add support for dynamic SMART attribute lookup. For example, -# 187 is reported for Seagate HDD and all SSDs but not Hitachi HDDs. # # See https://en.wikipedia.org/wiki/S.M.A.R.T.#ATA_S.M.A.R.T._attributes @@ -128,11 +120,11 @@ areca_smart () { # Store output of cli64 to reduce repeated executions cli64out=$(sudo cli64 disk info | grep -E "Slot#[[:digit:]]") - numdrives=$(echo "$cli64out" | wc -l) # Loop through all disks not marked as 'N.A.' or 'Failed' for slot in $(echo "$cli64out" | grep -v 'N.A.\|Failed' \ | grep -o "Slot#[[:digit:]]" | cut -c6-) do + let "numdrives+=1" failed=false # Determine if disk is JBOD or part of hardware RAID if echo "$cli64out" | grep -E "Slot#$slot" | grep -q 'JBOD' @@ -241,62 +233,107 @@ areca_failed () normal_smart () { # The grep regex will include drives named sdaa, for example - numdrives=$(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' | wc -l) for l in $(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' \ | awk '{ print $NF }') do + let "numdrives+=1" failed=false - output=$(sudo smartctl -a /dev/$l | grep -E "^ "5"|^"197"|^"198"" \ - | awk '{ print $NF }' | tr '\n' ' ') - outputcount=$(echo $output | wc -w) - # Check if drive is SSD and set var accordingly + # The general consensus online is that some SMART attributes are less + # worrisome when it comes to SSDs (e.g., Reallocated_Sector_Ct) if sudo smartctl -i /dev/$l | grep -q 'Solid State Device'; then is_ssd=true else is_ssd=false fi - # Only continue if we received 3 SMART data points and drive is not SSD - if [ "$outputcount" = "3" ] && [ "$is_ssd" = false ] - then - read reallocated pending uncorrect <<< $output - if [ "$reallocated" != "0" ] - then + output=$(sudo smartctl -f hex -A /dev/$l | grep '^0') + # This block is mainly for the SAS drives in the reesi since they + # don't report regular SMART attributes + if [ $? != 0 ]; then + if output=$(sudo smartctl -l error /dev/$l | grep '^read:\|^write:'); then + uncorrect_read=$(echo "$output" | grep '^read:' | awk '{print $NF}') + uncorrect_write=$(echo "$output" | grep '^write:' | awk '{print $NF}') + if [ "$uncorrect_read" != "0" ]; then + messages+=("Drive $l reports $uncorrect_read uncorrected read errors") + failed=true + rc=2 + fi + if [ "$uncorrect_write" != "0" ]; then + messages+=("Drive $l reports $uncorrect_write uncorrected write errors") + failed=true + rc=2 + fi + else + messages+=("No SMART data found for drive $l") + failed=true + rc=3 + fi + fi + # 0x05 (5) Reallocated_Sector_Ct + if echo "$output" | grep -q '^0x05'; then + reallocated=$(echo "$output" | grep '^0x05' | awk '{print $NF}') + if [ "$reallocated" != "0" ] && [ $is_ssd = false ]; then messages+=("Drive $l has $reallocated reallocated sectors") failed=true # A small number of reallocated sectors is OK # Don't set rc to WARN if we were already CRIT from previous drive - if [ "$reallocated" -le 5 ] && [ "$rc" != 2 ] + if [ $reallocated -le 5 ] && [ "$rc" -lt 2 ] then rc=1 # Warn if <= 5 else rc=2 # Crit if >5 fi fi - if [ "$pending" != "0" ] - then + fi + # 0xbb (187) Reported_Uncorrect + if echo "$output" | grep -q '^0xbb'; then + uncorrect=$(echo "$output" | grep '^0xbb' | awk '{print $NF}') + if [ "$uncorrect" != "0" ]; then + messages+=("Drive $l has $uncorrect reported uncorrect sectors") + failed=true + rc=2 + fi + fi + # 0xc4 (196) Reallocated_Event_Count + if echo "$output" | grep -q '^0xc4'; then + reallocatedevents=$(echo "$output" | grep '^0xc4' | awk '{print $NF}') + if [ "$reallocatedevents" != "0" ]; then + messages+=("Drive $l has $reallocatedevents reallocated events") + failed=true + rc=2 + fi + fi + # 0xc5 (197) Current_Pending_Sector + if echo "$output" | grep -q '^0xc5'; then + pending=$(echo "$output" | grep '^0xc5' | awk '{print $NF}') + if [ "$pending" != "0" ]; then messages+=("Drive $l has $pending pending sectors") failed=true rc=2 fi - if [ "$uncorrect" != "0" ] - then + fi + # 0xc6 (198) Offline_Uncorrectable + if echo "$output" | grep -q '^0xc6'; then + uncorrect=$(echo "$output" | grep '^0xc6' | awk '{print $NF}') + if [ "$uncorrect" != "0" ]; then messages+=("Drive $l has $uncorrect uncorrect sectors") failed=true rc=2 fi - elif [ "$outputcount" != "3" ] && [ "$is_ssd" = false ] - then - messages+=("Drive $l returned $outputcount of 3 expected attributes") - unknownmsg="SMART data could not be read for one or more drives" - rc=3 - # Set no return code and assume any SSD is healthy for now - elif [ "$is_ssd" = true ] - then - messages+=("Drive $l is an SSD. Not yet supported.") - rc=0 - else - messages+=("Error processing data for drive $l") - rc=3 + fi + # 0xe9 (233) Media_Wearout_Indicator + if echo -e "$output" | grep -q '^0xe9'; then + wearout=$(echo "$output" | grep '^0xe9' | awk '{print $NF}') + if [ "$wearout" == "1" ]; then + messages+=("Drive $l has exhausted its Media_Wearout_Indicator") + failed=true + # Don't set rc to WARN if we were already CRIT from previous drive + if [ "$rc" != 2 ] + then + rc=1 + else + rc=2 + fi + fi fi # Make sure drives with multiple types of bad sectors only get counted once if [ "$failed" = true ] @@ -313,6 +350,7 @@ nvme_smart () do # Include NVMe devices in overall drive count let "numdrives+=1" + failed=false # Clear output variable from any previous disk checks output="" output=$(sudo $nvmecli smart-log $nvmedisk | \