From: David Galloway Date: Wed, 25 Nov 2020 15:23:00 +0000 (-0500) Subject: common: smart.sh - print serial numbers of bad drives X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=72e424645df4453a2dcb7f8f34174c67a86ba56b;p=ceph-cm-ansible.git common: smart.sh - print serial numbers of bad drives Signed-off-by: David Galloway --- diff --git a/roles/common/files/libexec/smart.sh b/roles/common/files/libexec/smart.sh index 2287a1d..09a3275 100755 --- a/roles/common/files/libexec/smart.sh +++ b/roles/common/files/libexec/smart.sh @@ -264,7 +264,7 @@ normal_smart () fi # The SSDs in the bruuni just straight up say failed with no additional detail elif sudo smartctl -a /dev/$l | grep -q "FAILED!"; then - messages+=("Drive $l has completely failed") + messages+=("Drive $l ($(get_serial $l)) has completely failed") failed=true rc=2 else @@ -277,7 +277,7 @@ normal_smart () if echo "$output" | grep -q '^0x05'; then reallocated=$(echo "$output" | grep '^0x05' | awk '{print $NF}') if [ "$reallocated" != "0" ] && [ $is_ssd = false ]; then - messages+=("Drive $l has $reallocated reallocated sectors") + messages+=("Drive $l ($(get_serial $l)) has $reallocated reallocated sectors") failed=true # A small number of reallocated sectors is OK # Don't set rc to WARN if we were already CRIT from previous drive @@ -293,7 +293,7 @@ normal_smart () if echo "$output" | grep -q '^0xbb'; then uncorrect=$(echo "$output" | grep '^0xbb' | awk '{print $NF}') if [ "$uncorrect" != "0" ]; then - messages+=("Drive $l has $uncorrect reported uncorrect sectors") + messages+=("Drive $l ($(get_serial $l)) had $uncorrect reported uncorrect sectors") failed=true rc=2 fi @@ -302,7 +302,7 @@ normal_smart () if echo "$output" | grep -q '^0xc4'; then reallocatedevents=$(echo "$output" | grep '^0xc4' | awk '{print $NF}') if [ "$reallocatedevents" != "0" ]; then - messages+=("Drive $l has $reallocatedevents reallocated events") + messages+=("Drive $l ($(get_serial $l)) has $reallocatedevents reallocated events") failed=true rc=2 fi @@ -311,7 +311,7 @@ normal_smart () if echo "$output" | grep -q '^0xc5'; then pending=$(echo "$output" | grep '^0xc5' | awk '{print $NF}') if [ "$pending" != "0" ]; then - messages+=("Drive $l has $pending pending sectors") + messages+=("Drive $l ($(get_serial $l)) has $pending pending sectors") failed=true rc=2 fi @@ -320,7 +320,7 @@ normal_smart () if echo "$output" | grep -q '^0xc6'; then uncorrect=$(echo "$output" | grep '^0xc6' | awk '{print $NF}') if [ "$uncorrect" != "0" ]; then - messages+=("Drive $l has $uncorrect uncorrect sectors") + messages+=("Drive $l ($(get_serial $l)) has $uncorrect uncorrect sectors") failed=true rc=2 fi @@ -329,7 +329,7 @@ normal_smart () if echo -e "$output" | grep -q '^0xe9'; then wearout=$(echo "$output" | grep '^0xe9' | awk '{print $NF}') if [ "$wearout" == "1" ]; then - messages+=("Drive $l has exhausted its Media_Wearout_Indicator") + messages+=("Drive $l ($(get_serial $l)) has exhausted its Media_Wearout_Indicator") failed=true # Don't set rc to WARN if we were already CRIT from previous drive if [ "$rc" != 2 ] @@ -420,5 +420,14 @@ nvme_smart () done } +get_serial() { + serial=$(sudo smartctl -i /dev/$1 | grep "Serial Number:" | awk '{ print $3 }') + if [ "$serial" == "" ]; then + echo "S/N unknown" + else + echo $serial + fi +} + ## Call main() function main