--- /dev/null
+#!/usr/bin/perl
+
+# {{ ansible_managed }}
+
+#******************************************************************************************
+#
+# NRPE DISK USAGE PLUGIN
+#
+# Program: Disk Usage plugin written to be used with Netsaint and NRPE
+# License: GPL
+# Copyright (c) 2000 Jeremy Hanmer (jeremy@newdream.net)
+#
+# Last Modified: 10/23/00
+#
+# Information: Basically, I wrote this because I had to deal with large numbers of
+# machines with a wide range of disk configurations, and with dynamically mounted
+# partitions. The basic check_disk plugin relied on a static configuration file which
+# doesn't lend itself to being used in a heterogeneous environment (especially when
+# you can't guarantee that the devices listed in the configuration file will be mounted).
+#
+# Bugs: Currently, this plugin only works on EXT2 partitions (although it's easy to change).
+#
+# Command Line: diskusage.pl <warning percentage> <critical percentage>
+#
+# Tested Systems: Mandrake 7.1/Intel, Debian 2.2/Intel, Debian 2.1/Intel
+#
+# License Information:
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+#*******************************************************************************************
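+
+# Example invocation (output illustrative; the nrpe.cfg template in this role
+# calls this script as "diskusage.pl 90 95"):
+#
+#   $ ./diskusage.pl 90 95
+#   /var is at 97%
+#
+# Output (and exit code 2) is only produced when a partition crosses the
+# critical threshold; otherwise the script prints "Disks are OK now" and
+# exits 0, even if the warning threshold was crossed.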
+
+
+use strict;
+
+my $wrn = shift @ARGV;
+my $crt = shift @ARGV;
+my $output;
+my $count;
+my %type;
+my $result = 0;
+my $warn = 0;
+my $crit = 0;
+my @parts;
+my $hostname = `hostname`;
+chomp $hostname;
+@parts = `mount | grep -vi fuse`;
+
+#if ( $hostname eq 'zartan' ) {
+# @parts = `mount`;
+#}
+#else {
+# @parts = `mount -t ext2,reiserfs`;
+#}
+for (@parts) {
+ my ($dev,$on,$mount,$tp,$type,$options) = split(/\s+/,$_);
+ next if ($type eq 'nfs' && !($hostname eq 'zartan'));
+ next if ($type eq 'proc' || $type eq 'devpts');
+ my @df= `df -k $mount`;
+ my @df_inode = `df -i $mount`;
+# print "$dev $mount $type\n";
+ shift @df;
+ shift @df_inode;
+ for(@df) {
+ my ($dev1,$blocks,$used,$free,$pc,$mount) = split(/\s+/,$_);
+ my ($percent,$blah) = split(/\%/,$pc);
+ if ( ($percent >= $wrn ) && (!($percent >= $crt) || ($mount =~ m/\/mnt\//)) ) {
+ $output .= "$mount is at $pc ";
+ $warn = 1;
+ }
+ if ( ($percent >= $crt ) && !($mount =~ m/\/mnt\//) ){
+ $output = "" unless $crit eq '1';
+ $output .= "$mount is at $pc ";
+ $crit = 1;
+ }
+ }
+ for(@df_inode) {
+ my ($dev1,$inodes,$used,$free,$pc,$mount) = split(/\s+/,$_);
+ my ($percent,$blah) = split(/\%/,$pc);
+ if ( ($percent >= $wrn ) && (!($percent >= $crt) ) ) {
+ $output .= "$mount is at $pc inode usage ";
+ $warn = 1;
+ }
+ if ( ($percent >= $crt ) && !($mount =~ m/\/mnt\//) ){
+ $output = "" unless $crit eq '1';
+ $output .= "$mount is at $pc inode usage ";
+ $crit = 1;
+ }
+ }
+ }
+
+
+#if ( ($warn eq '1') && !($crit eq '1') ) {
+# print "$output\n";
+# $result = 1;
+# }
+if ( $crit eq '1' ) {
+ print "$output\n";
+ $result = 2;
+}
+
+else {
+ print "Disks are OK now\n";
+}
+
+
+#if ( !( $crit eq '1' ) && !( $warn eq '1' ) ) {
+# print "Disks are ok now\n";
+#}
+#print "$result\n";
+exit $result;
--- /dev/null
+#!/usr/bin/perl
+
+# {{ ansible_managed }}
+
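+# Summary (derived from the checks below): this plugin inspects whichever RAID
+# stacks are present (Linux md via /proc/mdstat, Mylex via /proc/rd, ICP
+# Vortex/GDT via /proc/scsi/gdth, LSI MegaRAID via megacli, 3ware via tw_cli,
+# Areca via cli64, LSI SAS via mpt-status) and exits with the usual NRPE
+# codes: 0 = OK, 1 = WARNING (e.g. rebuild/resync in progress), 2 = CRITICAL.
+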
+use strict;
+
+my $warn;
+my $crit;
+
+my @out;
+my $devices;
+my $pci;
+my $scsi;
+
+$pci = `/usr/bin/lspci | /bin/grep -i raid | /bin/grep -v PATA | /usr/bin/head -2`;
+$scsi = `/usr/bin/lspci | /bin/grep -i scsi | /bin/grep -v PATA | /usr/bin/head -1`;
+
+# software raid!
+if (-e "/proc/mdstat") {
+ # check software raid!
+# open(R,"/tmp/mdstat");
+ open(R,"/proc/mdstat");
+ while (<R>) {
+ if (/^(md\d+) : (\w+)/) {
+ my $dev = $1;
+ my $status = $2;
+ my $rest = <R>;
+ $devices++;
+
+ my ($disks,$states) = $rest =~ /(\[.*\]) (\[.*\])/;
+       my $mout = '';
+       $mout .= "$dev is $status $disks $states" if $states =~ /_/;
+
+ # recovery?
+ my $next = <R>; # possibly recovery?
+ if ($next =~ / recovery = /) {
+ my ($progress,$per) = $next =~ /(\[.*\])\s+recovery =\s+(\S+%)/;
+ $mout .= " recovery $per";
+ my $next = <R>;
+ if (my ($finish,$speed) = $next =~ /finish=(.*)min speed=(.*)\/sec/) {
+ $mout .= " finish $finish min";
+ }
+ $warn = 1;
+ } elsif ($next =~ / resync = /) {
+ my ($progress,$per) = $next =~ /(\[.*\])\s+resync =\s+(\S+%)/;
+ $mout .= " resync $per";
+ if (my ($finish,$speed) = $next =~ /finish=(.*)min speed=(.*)\/sec/) {
+ $mout .= " finish $finish min";
+ }
+ $warn = 1;
+ } elsif ($states =~ /_/) { # not all U
+ $crit = 1;
+ }
+
+ push( @out, $mout ) if $mout;
+ }
+ }
+}
+
+
+# mylex raid!
+if ($pci =~ /Mylex/i) {
+#if (1) {
+ my $s = `cat /proc/rd/status`;
+ chomp($s);
+ unless ($s =~ /OK/) {
+ my @myinfo;
+ for my $ctl (`ls -d /proc/rd/c*`) {
+# for my $ctl ('/proc/rd/c0') {
+ chomp $ctl;
+ my %bad;
+ my ($c) = $ctl =~ /\/(c\d)$/;
+      open(S,"$ctl/current_status") || print "can't open $ctl/current_status\n";
+# open(S,"/tmp/mylex.bad");
+ my $lastdevice;
+ while (<S>) {
+ # disk status
+ if (/^ (\d:\d) Vendor/) {
+ $lastdevice = $1;
+ }
+ if (/ Disk Status: (\S+),/) {
+ if ($1 ne 'Online') {
+ push( @myinfo, "$c disk $lastdevice $1");
+ }
+ }
+
+ # logical drives
+ if (/ (\/dev\/rd\/\S+): (\S+), (\w+),/) {
+ my $dev = $1;
+ my $type = $2;
+ my $status = $3;
+ $devices++;
+ $bad{$dev} = 1;
+ if ($status ne 'Online') {
+ push( @myinfo, "$dev ($type) $status");
+ }
+ }
+
+ # rebuild?
+ if (/ Rebuild in Progress: .* \((\S+)\) (\d+%) completed/) {
+ push( @myinfo, "$1 rebuild $2 complete" );
+ delete $bad{$1};
+ }
+ }
+ if (keys %bad) {
+ $crit = 1; # at least 1 is failed and !recovering
+ } else {
+ $warn = 1; # all are recovering
+ }
+ }
+
+ push( @out, "Mylex $s: " . join(', ',@myinfo)) if @myinfo;
+ }
+}
+
+
+# icp vortex raid!
+if ( $pci =~ /intel/i) {
+ opendir(D,"/proc/scsi/gdth");
+ my @dev = readdir(D);
+ closedir D;
+ my @vortex;
+ for my $dev (@dev) {
+ next if $dev =~ /^\./;
+ my $read = `cat /proc/scsi/gdth/$dev`;
+ # my $read = `cat /tmp/asdf9.warn`;
+ my $cur; # Logical | Physical | Host | Array
+ my @myinfo;
+# print "dev $dev\n";
+ for $_ (split(/\n/,$read)) {
+ chomp;
+ if (/^\w/) {
+ # new section
+ ($cur) = /^(\w+)/;
+# print "cur = $cur\n";
+ next;
+ }
+ if ($cur eq 'Logical') {
+ my ($num,$status) = /Number:\s+(\d+)\s+Status:\s+(\w+)/;
+ next unless $status;
+ if ($status ne 'ok') {
+ $warn = 1;
+ #push( @myinfo, "Logical #$num $status" );
+ unshift( @myinfo, "Logical #$num $status" );
+ }
+ }
+ if ($cur eq 'Array') {
+ my ($num,$status) = /Number:\s+(\d+)\s+Status:\s+(\w+)/;
+ next unless $status;
+ if ($status ne 'ready') {
+ $warn = 1;
+ #push( @myinfo, "Array #$num $status" );
+ unshift( @myinfo, "Array #$num $status" );
+ }
+ }
+ if ($cur eq 'Host') {
+ if (/Number/) {
+ $devices++;
+ }
+ }
+ if ($cur eq 'Controller') {
+ # push( @myinfo, $_ );
+ unshift( @myinfo, $_ );
+ }
+ }
+
+ if (@myinfo) {
+ # push( @vortex, "dev $dev: " . join(', ', @myinfo) );
+ # unshift( @vortex, "dev $dev: " . join(', ', @myinfo) );
+ push( @vortex, "dev $dev: " . join(', ', $myinfo[0], $myinfo[1], $myinfo[2], $myinfo[3], $myinfo[4] ) );
+ # $warn = 1;
+ }
+ }
+
+ if (@vortex) {
+ # push( @out, 'Vortex: ' . join('. ', @vortex) );
+ push( @out, 'Vortex: ' . join('. ', @vortex) );
+ }
+}
+# SAS megaraid
+if ( $pci =~ /LSI\ Logic/i) {
+ my $read = `/usr/bin/sudo /usr/sbin/megacli -LDInfo -lall -a0`;
+ for $_ (split(/\n/,$read)) {
+ chomp;
+        # The line we care about is "State: Optimal"; anything else means problems
+ if ($_ =~/^State\s*\:\s*(.*)/m) {
+ $devices++;
+ #/^State\?:\s?(\w+)/;
+ my $state = $1;
+ next unless $state;
+ if ($state ne 'Optimal') {
+ my $rebuild = `/usr/bin/sudo /usr/sbin/megacli -PDList -a0 | /bin/grep -i firmware`;
+ if ( $rebuild =~ /Rebuild/i) {
+ my $enclosure = `/usr/bin/sudo /usr/sbin/megacli -PDList -a0 | /bin/grep -B15 Rebuild | /bin/grep -e Enclosure -e Slot | /usr/bin/cut -d':' -f2 | /usr/bin/awk '{printf \$1\":\"}' | /usr/bin/awk -F ":" '{printf \$1":"\$2}'`;
+ #my $rebuildstatus = `/usr/bin/sudo /usr/sbin/megacli -PDRbld -ShowProg -PhysDrv\[$enclosure\] -a0 | /bin/grep -i rebuild`;
+ my $rebuildstatus = `/usr/bin/sudo /usr/sbin/megacli -PDRbld -ShowProg -PhysDrv\[$enclosure\] -a0 | /bin/egrep -i \'\(rebuild\|not found\)\'`;
+ if ($rebuildstatus =~ /not found/m) {
+ # check by device id instead of enclosure id if we get a not found error above
+ $enclosure = `/usr/bin/sudo /usr/sbin/megacli -PDList -a0 | /bin/grep -B15 Rebuild | /bin/grep -e Enclosure -e Slot | /bin/grep -v position | /usr/bin/cut -d':' -f2 | /usr/bin/awk '{printf \$1\":\"}' | /usr/bin/awk -F ":" '{printf \$1":"\$2}'`;
+ $rebuildstatus = `/usr/bin/sudo /usr/sbin/megacli -PDRbld -ShowProg -PhysDrv\[$enclosure\] -a0 | /bin/grep -i rebuild`;
+ }
+ for $_ ($rebuildstatus) {
+ $crit = 1;
+ push(@out,$_);
+ }
+ } else {
+ $crit = 1;
+ my $virtual=`/usr/bin/sudo /usr/sbin/megacli -LDInfo -lall -a0 | grep -i failed -B6 | grep -i virtual | cut -d'(' -f1`;
+ push(@out, $virtual, $_);
+ }
+ }
+ }
+        # This should catch the syntax or permission errors megacli spits out
+ if (/ERROR/i) {
+ $crit = 1;
+ push(@out, $_);
+ foreach my $k (@out)
+ {
+                print "$k\n";
+ }
+ }
+ }
+}
+
+# 3ware
+if ( $pci =~ /3ware/i) {
+ open(CLI,"/usr/bin/sudo /usr/sbin/tw_cli show|");
+ #my $read = `/usr/sbin/megacli -LDInfo -l0 -a0`;
+
+ $devices++;
+ my @controllers;
+ while (<CLI>) {
+ if ( $_ =~ /^c[0-9]/ ) {
+ my ($c) = split(/\s+/,$_);
+ push(@controllers,$c);
+ }
+ }
+ close(CLI);
+
+ foreach my $cont (@controllers) {
+ open(CLI,"/usr/bin/sudo /usr/sbin/tw_cli /$cont show|");
+ while (<CLI>) {
+ if ( $_ =~ /^u[0-9]+/ ) {
+ my @info = split(/\s+/,$_);
+ if ( $info[2] ne 'OK' ) {
+ if ( $info[2] =~ /REBUILDING/i) {
+ my $rebuildstatus = `/usr/bin/sudo /usr/sbin/tw_cli /$cont/$info[0] show | /bin/grep REBUILD | /bin/grep -v RAID-10`;
+ for $_ ($rebuildstatus) {
+ $crit = 1;
+ push(@out,$_);
+ }
+ } else {
+ $crit = 1;
+ push(@out,$_);
+ }
+ }
+ }
+ if ( $_ =~ /^p[0-9]+/ ) {
+ my @info = split(/\s+/,$_);
+ if ( $info[1] ne 'OK' ) {
+ $crit = 1;
+ push(@out,$_);
+ }
+ }
+ }
+ }
+}
+
+#Areca
+
+if ( $pci =~ /areca/i) {
+ open(CLI,"sudo /usr/sbin/cli64 vsf info|");
+ while (<CLI>) {
+ if ( $_ =~ /^\ \ [0-9]+/ ) {
+ $devices++;
+ my @info = split(/\s+/,$_);
+ if ( $_ !~ /Normal/i) {
+ $crit = 1;
+ push(@out,$_);
+ }
+ }
+ }
+ }
+
+if ( $scsi =~ /LSI Logic/i) {
+ open(CLI,"sudo /usr/sbin/mpt-status | /usr/bin/head -1 |");
+ $devices++;
+ while (<CLI>) {
+ if ( $_ =~ /^ioc/ ) {
+ my @info = split(/\s+/,$_);
+ if ( $info[10] ne 'OPTIMAL,' ) {
+ $crit = 1;
+ push(@out,$_);
+ }
+ }
+ }
+ }
+
+# show results
+my $result = 0;
+$result = 1 if $warn;
+$result = 2 if $crit;
+# print "warn = $warn crit = $crit\n";
+my $out = "No raid devices found $pci";
+$out = "All $devices raid devices happy as clams" if $devices;
+if (@out) {
+ $out = join('; ', @out);
+}
+
+print "$out\n";
+exit $result;
--- /dev/null
+#!/bin/bash
+# Description: Bash script to check drive health using pending, uncorrectable,
+# and reallocated sector count
+#
+# Nagios return codes: 0 = OK; 1 = WARNING; 2 = CRITICAL; 3 = UNKNOWN
+# SMART Attribute Codes:
+# 5 = Reallocated
+# 187 = Reported Uncorrect
+# 197 = Pending
+# 198 = Uncorrectable Sector Count
+#
+# TO-DO: Add support for dynamic SMART attribute lookup. For example,
+# 187 is reported for Seagate HDD and all SSDs but not Hitachi HDDs.
+#
+# See https://en.wikipedia.org/wiki/S.M.A.R.T.#ATA_S.M.A.R.T._attributes
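+#
+# For reference, the smartctl attribute-table lines this script parses look
+# roughly like this (values illustrative):
+#   ID# ATTRIBUTE_NAME          ... RAW_VALUE
+#     5 Reallocated_Sector_Ct   ... 0
+#   197 Current_Pending_Sector  ... 0
+#   198 Offline_Uncorrectable   ... 0
+# normal_smart() greps for attribute IDs 5/197/198 and reads the RAW_VALUE
+# column (the last field) with awk.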
+
+### Define global variables ###
+# total number of drives (or RAID slots) discovered
+numdrives=0
+# Number of failed, failing, and/or missing drives
+failingdrives=0
+# Fallback message for UNKNOWN return code output
+unknownmsg="Unknown error"
+# Return code for nagios (Default to SUCCESS)
+rc=0
+# Array of messages indicating drive health. Output after nagios status.
+declare -a messages
+
+### Functions ###
+main ()
+{
+ preflight
+
+ if [ "$raid" = true ]
+ then
+ areca_smart
+ areca_failed
+ elif [ "$raid" = false ]
+ then
+ normal_smart
+ else
+ echo "ERROR - Could not determine if RAID present"
+ exit 3
+ fi
+
+ ## Return UNKNOWN if no drives found
+ if [ "$numdrives" -eq "0" ]
+ then
+ unknownmsg="No drives found!"
+ rc=3
+ fi
+
+ ## Return code and service status for nagios
+ if [ "$rc" = 0 ]
+ then
+ echo "OK - All $numdrives drives healthy"
+ elif [ "$rc" = 1 ]
+ then
+ echo "WARNING - $failingdrives of $numdrives drives sick"
+ elif [ "$rc" = 2 ]
+ then
+ echo "CRITICAL - $failingdrives of $numdrives drives need replacing"
+ elif [ "$rc" = 3 ]
+ then
+ echo "UNKNOWN - $unknownmsg"
+ else
+ echo "ERROR - Got no return code"
+ fi
+
+ ## Iterate through array of messages
+ # Nagios reads and displays the first line of output on the Services page.
+ # All individual messages about failed/failing disk statistics can be viewed
+ # on the individual system's SMART detail page in nagios.
+ for msg in "${messages[@]}"
+ do
+ echo "$msg"
+ done
+
+ exit $rc
+}
+
+# Pre-flight checks
+preflight ()
+{
+ # Set raid var then check for cli64 command and bail if missing
+ if lspci | grep -qi areca
+ then
+ raid=true
+ else
+ raid=false
+ fi
+
+ if [ "$raid" = true ] && ! [ -x "$(command -v cli64)" ]
+ then
+ echo "ERROR - cli64 command not found or is not executable"
+ exit 3
+ fi
+
+ # Check for smartmontools and bail if missing
+ if ! [ -x "$(command -v smartctl)" ]
+ then
+ echo "ERROR - smartctl is not installed or is not executable"
+ echo "yum/apt-get install smartmontools"
+ exit 3
+ fi
+}
+
+# Gather smart data for drives behind Areca RAID controller
+areca_smart ()
+{
+ # Store output of cli64 to reduce repeated executions
+ cli64out=$(sudo cli64 disk info | grep -E "Slot#[[:digit:]]")
+ numdrives=$(echo "$cli64out" | wc -l)
+ # Loop through all disks not marked as 'N.A.' or 'Failed'
+ for slot in $(echo "$cli64out" | grep -v 'N.A.\|Failed' \
+ | grep -o "Slot#[[:digit:]]" | cut -c6-)
+ do
+ failed=false
+ # Determine if disk is JBOD or part of hardware RAID
+ if echo "$cli64out" | grep -E "Slot#$slot" | grep -q 'JBOD'
+ then
+ jbod=true
+ else
+ jbod=false
+ fi
+ output=$(sudo cli64 disk smart drv=$slot \
+ | grep -E "^ "5"|^"197"|^"198"" | awk '{ print $(NF-1) }' | tr '\n' ' ')
+ outputcount=$(echo $output | wc -w)
+ # Only continue if we received 3 SMART data points
+ if [ "$outputcount" = "3" ]
+ then
+ # Only do slot to drive letter matching once per bad JBOD
+ if [[ $output != "0 0 0 " ]] && [ "$jbod" = true ]
+ then
+ dl=$(areca_bay_to_letter $slot)
+ elif [ "$jbod" = false ]
+ then
+ dl="(RAID)"
+ fi
+ read reallocated pending uncorrect <<< $output
+ if [ "$reallocated" != "0" ]
+ then
+ messages+=("Drive $slot $dl has $reallocated reallocated sectors")
+ failed=true
+ # A small number of reallocated sectors is OK
+ if [ "$reallocated" -le 5 ]
+ then
+ rc=1 # Warn if <= 5
+ else
+ rc=2 # Crit if >5
+ fi
+ fi
+ if [ "$pending" != "0" ]
+ then
+ messages+=("Drive $slot $dl has $pending pending sectors")
+ failed=true
+ rc=2
+ fi
+ if [ "$uncorrect" != "0" ]
+ then
+        messages+=("Drive $slot $dl has $uncorrect uncorrectable sectors")
+ failed=true
+ rc=2
+ fi
+ else
+ messages+=("Drive $slot returned $outputcount of 3 expected attributes")
+ unknownmsg="SMART data could not be read for one or more drives"
+ rc=3
+ fi
+ # Make sure drives with multiple types of bad sectors only get counted once
+ if [ "$failed" = true ]
+ then
+ let "failingdrives+=1"
+ fi
+ done
+}
+
+# Correlate Areca drive bay to drive letter
+areca_bay_to_letter ()
+{
+ # Get S/N according to RAID controller given argument $1 (slot #)
+ areca_serial=$(sudo cli64 disk info drv=$1 | grep 'Serial Number' \
+ | awk '{ print $NF }')
+ # Loop through and get S/N according to smartctl given drive name
+ for dl in $(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' \
+ | awk '{ print $NF }')
+ do
+    smart_serial=$(sudo smartctl -a /dev/$dl | grep -i "Serial Number" \
+ | awk '{ print $NF }')
+ # If cli64 and smartctl find a S/N match, return drive letter
+ if [ "$areca_serial" = "$smart_serial" ]
+ then
+ echo "($dl)"
+ fi
+ done
+}
+
+# Tally missing and failed drives connected to Areca RAID
+areca_failed ()
+{
+ # Store output of cli64 to reduce repeated executions
+ cli64out=$(sudo cli64 disk info | grep -E "Slot#[[:digit:]]")
+ # Missing (N.A.) drives
+ for drive in $(echo "$cli64out" | grep -E "Slot#[[:digit:]]" \
+ | grep "N.A." | awk '{ print $1 }')
+ do
+ messages+=("Drive $drive is missing")
+ let "failingdrives+=1"
+ rc=2
+ done
+ # Hard failed drives
+ for drive in $(echo "$cli64out" | grep -E "Slot#[[:digit:]]" \
+ | grep 'Failed' | awk '{ print $1 }')
+ do
+ messages+=("Drive $drive failed")
+ let "failingdrives+=1"
+ rc=2
+ done
+}
+
+# Standard SATA/SAS drive smartctl check
+normal_smart ()
+{
+ # The grep regex will include drives named sdaa, for example
+ numdrives=$(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' | wc -l)
+ for l in $(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' \
+ | awk '{ print $NF }')
+ do
+ failed=false
+    output=$(sudo smartctl -a /dev/$l | grep -E "^[[:space:]]*(5|197|198)[[:space:]]" \
+      | awk '{ print $NF }' | tr '\n' ' ')
+ outputcount=$(echo $output | wc -w)
+ # Check if drive is SSD and set var accordingly
+ if sudo smartctl -i /dev/$l | grep -q 'Solid State Device'; then
+ is_ssd=true
+ else
+ is_ssd=false
+ fi
+ # Only continue if we received 3 SMART data points and drive is not SSD
+ if [ "$outputcount" = "3" ] && [ "$is_ssd" = false ]
+ then
+ read reallocated pending uncorrect <<< $output
+ if [ "$reallocated" != "0" ]
+ then
+ messages+=("Drive $l has $reallocated reallocated sectors")
+ failed=true
+ # A small number of reallocated sectors is OK
+ if [ "$reallocated" -le 5 ]
+ then
+ rc=1 # Warn if <= 5
+ else
+ rc=2 # Crit if >5
+ fi
+ fi
+ if [ "$pending" != "0" ]
+ then
+ messages+=("Drive $l has $pending pending sectors")
+ failed=true
+ rc=2
+ fi
+ if [ "$uncorrect" != "0" ]
+ then
+        messages+=("Drive $l has $uncorrect uncorrectable sectors")
+ failed=true
+ rc=2
+ fi
+ elif [ "$outputcount" != "3" ] && [ "$is_ssd" = false ]
+ then
+ messages+=("Drive $l returned $outputcount of 3 expected attributes")
+ unknownmsg="SMART data could not be read for one or more drives"
+ rc=3
+ # Set no return code and assume any SSD is healthy for now
+ elif [ "$is_ssd" = true ]
+ then
+ messages+=("Drive $l is an SSD. Not yet supported.")
+ rc=0
+ else
+ messages+=("Error processing data for drive $l")
+ rc=3
+ fi
+ # Make sure drives with multiple types of bad sectors only get counted once
+ if [ "$failed" = true ]
+ then
+ let "failingdrives+=1"
+ fi
+ done
+}
+
+## Call main() function
+main
--- /dev/null
+module nrpe 1.0;
+
+require {
+ type fsadm_exec_t;
+ type nrpe_t;
+ type hwdata_t;
+ class file { read getattr open };
+}
+
+#============= nrpe_t ==============
+allow nrpe_t fsadm_exec_t:file getattr;
+allow nrpe_t hwdata_t:file { read getattr open };
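+
+# Build and load steps (mirroring the nrpe-selinux.yml tasks in this role):
+#   checkmodule -M -m -o nrpe.mod nrpe.te
+#   semodule_package -o nrpe.pp -m nrpe.mod
+#   semodule -i nrpe.pp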
--- /dev/null
+---
+- name: restart nagios-nrpe-server
+ service:
+ name: "{{ nrpe_service_name }}"
+ state: restarted
--- /dev/null
+---
+# We use these scripts to check to see if any of our test nodes have bad disks
+
+- name: Upload megacli and cli64 for raid monitoring to /usr/sbin/.
+ copy:
+ src: "../files/sbin/{{ item }}"
+ dest: "/usr/sbin/{{ item }}"
+ owner: root
+ group: root
+ mode: 0755
+ with_items:
+ - megacli
+ - cli64
+
+- name: Create /usr/libexec.
+ file:
+ path: /usr/libexec
+ owner: root
+ group: root
+ mode: 0755
+ state: directory
+
+- name: Upload custom netsaint scripts for raid/disk/smart monitoring to /usr/libexec/.
+ copy:
+ src: "../files/libexec/{{ item }}"
+ dest: "/usr/libexec/{{ item }}"
+ owner: root
+ group: root
+ mode: 0755
+ with_items:
+ - smart.sh
+ - raid.pl
+ - diskusage.pl
- include: kerberos.yml
tags:
- kerberos
+
+# upload custom disk monitoring scripts
+- include: disk_monitoring.yml
+ tags:
+ - monitoring-scripts
+
+# configure nagios
+- include: nagios.yml
+ tags:
+ - nagios
+
+- name: Get SELinux status
+ command: getenforce
+ register: selinux_status
+ when: ansible_pkg_mgr == "yum"
+ tags:
+ - nagios
+
+# configure selinux for nagios
+- include: nrpe-selinux.yml
+  when: selinux_status.stdout is defined and selinux_status.stdout != "Disabled"
+ tags:
+ - nagios
--- /dev/null
+---
+- name: Upload nagios sudoers.d for raid utilities.
+ template:
+ src: nagios/90-nagios
+ dest: /etc/sudoers.d/90-nagios
+ owner: root
+ group: root
+ mode: 0440
+ validate: visudo -cf %s
+
+- name: Configure nagios nrpe settings (Ubuntu)
+ lineinfile:
+ dest: /etc/default/{{ nrpe_service_name }}
+ regexp: "^DAEMON_OPTS"
+ line: "DAEMON_OPTS=\"--no-ssl\""
+ when: ansible_pkg_mgr == "apt"
+
+- name: Configure nagios nrpe settings (RHEL/CentOS)
+ lineinfile:
+ dest: /etc/sysconfig/{{ nrpe_service_name }}
+ regexp: "^NRPE_SSL_OPT"
+ line: "NRPE_SSL_OPT=\"-n\""
+ when: ansible_pkg_mgr == "yum"
+
+- name: Check firewalld status
+ command: systemctl status firewalld
+ register: firewalld
+ ignore_errors: true
+ no_log: true
+ when: ansible_pkg_mgr == "yum"
+
+- name: Open nrpe port if firewalld enabled
+ firewalld:
+ port: 5666/tcp
+ state: enabled
+ permanent: yes
+ immediate: yes
+ when: ansible_pkg_mgr == "yum" and (firewalld is defined and firewalld.stdout.find('running') != -1)
+
+- name: Upload nagios nrpe config.
+ template:
+ src: nagios/nrpe.cfg
+ dest: /etc/nagios/nrpe.cfg
+ owner: root
+ group: root
+ mode: 0644
+ notify:
+ - restart nagios-nrpe-server
+
+- name: Make sure nagios nrpe service is running.
+ service:
+ name: "{{ nrpe_service_name }}"
+ enabled: yes
+ state: started
--- /dev/null
+---
+- name: nrpe - Install semanage python bindings
+ yum:
+ pkg: libsemanage-python
+ state: installed
+
+- name: nrpe - Install SELinux tools
+ yum:
+ pkg: policycoreutils-python
+ state: installed
+
+- name: nrpe - Ensure SELinux policy is up to date
+ yum:
+ pkg: selinux-policy-targeted
+ state: latest
+
+- name: nrpe - Set SELinux boolean nagios_run_sudo true
+ seboolean:
+ name: nagios_run_sudo
+ state: yes
+ persistent: yes
+
+- name: nrpe - Remove SELinux policy package
+ command: semodule -r nrpe
+ failed_when: false
+
+- name: nrpe - Copy SELinux type enforcement file
+ copy:
+ src: nagios/nrpe.te
+ dest: /tmp/nrpe.te
+
+- name: nrpe - Compile SELinux module file
+ command: checkmodule -M -m -o /tmp/nrpe.mod /tmp/nrpe.te
+
+- name: nrpe - Build SELinux policy package
+ command: semodule_package -o /tmp/nrpe.pp -m /tmp/nrpe.mod
+
+- name: nrpe - Load SELinux policy package
+ command: semodule -i /tmp/nrpe.pp
+
+- name: nrpe - Remove temporary files
+  file:
+    path: "{{ item }}"
+    state: absent
+  with_items:
+    - /tmp/nrpe.te
+    - /tmp/nrpe.mod
+    - /tmp/nrpe.pp
--- /dev/null
+## {{ ansible_managed }}
+{{ nrpe_user }} ALL=NOPASSWD: /usr/sbin/megacli, /usr/sbin/cli64, /usr/sbin/smartctl
--- /dev/null
+# {{ ansible_managed }}
+log_facility=daemon
+pid_file=/var/run/nagios/nrpe.pid
+server_port=5666
+nrpe_user={{ nrpe_user }}
+nrpe_group={{ nrpe_group }}
+
+# These should eventually be in a secrets group_var
+# 172. address is sepia nagios server
+# 10. address is octo nagios server
+allowed_hosts=127.0.0.1,172.21.0.33,10.8.0.8
+dont_blame_nrpe=0
+debug=0
+command_timeout=60
+connection_timeout=300
+
+command[check_users]={{ nagios_plugins_directory }}/check_users --warning=5 --critical=10
+command[check_load]={{ nagios_plugins_directory }}/check_load --percpu --warning=1.5,1.4,1.3 --critical=2.0,1.9,1.8
+command[check_hda1]={{ nagios_plugins_directory }}/check_disk --warning=20% --critical=10% --partition=/dev/hda1
+command[check_root]={{ nagios_plugins_directory }}/check_disk --warning=10% --critical=5% --units=GB --path=/
+command[check_zombie_procs]={{ nagios_plugins_directory }}/check_procs --warning=5 --critical=10 --state=Z
+command[check_total_procs]={{ nagios_plugins_directory }}/check_procs --warning=300 --critical=500
+command[check_raid]=/usr/libexec/raid.pl
+command[check_disks]=/usr/libexec/diskusage.pl 90 95
+command[check_smart]=/usr/libexec/smart.sh
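+# Each command[NAME] defined above is what the Nagios server invokes remotely,
+# e.g. (hostname illustrative): check_nrpe -H testnode01 -c check_raid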
+
+include=/etc/nagios/nrpe_local.cfg
+
+include_dir=/etc/nagios/nrpe.d/
--- /dev/null
+---
+nrpe_service_name: nagios-nrpe-server
+nrpe_user: nagios
+nrpe_group: nagios
+nagios_plugins_directory: /usr/lib/nagios/plugins
--- /dev/null
+---
+nrpe_service_name: nrpe
+nrpe_user: nrpe
+nrpe_group: nrpe
+nagios_plugins_directory: /usr/lib64/nagios/plugins
+++ /dev/null
-#!/usr/bin/perl
-
-# {{ ansible_managed }}
-
-#******************************************************************************************
-#
-# NRPE DISK USAGE PLUGIN
-#
-# Program: Disk Usage plugin written to be used with Netsaint and NRPE
-# License: GPL
-# Copyright (c) 2000 Jeremy Hanmer (jeremy@newdream.net)
-#
-# Last Modified: 10/23/00
-#
-# Information: Basically, I wrote this because I had to deal with large numbers of
-# machines with a wide range of disk configurations, and with dynamically mounted
-# partitions. The basic check_disk plugin relied on a static configuration file which
-# doesn't lend itself to being used in a heterogeneous environnment (especially when
-# you can't guarantee that the devices listed in the configuration file will be mounted).
-#
-# Bugs: Currently, this plugin only works on EXT2 partitions (although it's easy to change).
-#
-# Command Line: diskusage.pl <warning percentage> <critical percentage>
-#
-# Tested Systems: Mandrake 7.1/Intel, Debian 2.2/Intel, Debian 2.1/Intel
-#
-# License Information:
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-#*******************************************************************************************
-
-
-use strict;
-
-my $wrn = shift @ARGV;
-my $crt = shift @ARGV;
-my $output;
-my $count;
-my %type;
-my $result = 0;
-my $warn = 0;
-my $crit = 0;
-my @parts;
-my $hostname = `hostname`;
-chomp $hostname;
-@parts = `mount | grep -vi fuse`;
-
-#if ( $hostname eq 'zartan' ) {
-# @parts = `mount`;
-#}
-#else {
-# @parts = `mount -t ext2,reiserfs`;
-#}
-for (@parts) {
- my ($dev,$on,$mount,$tp,$type,$options) = split(/\s+/,$_);
- next if ($type eq 'nfs' && !($hostname eq 'zartan'));
- next if ($type eq 'proc' || $type eq 'devpts');
- my @df= `df -k $mount`;
- my @df_inode = `df -i $mount`;
-# print "$dev $mount $type\n";
- shift @df;
- shift @df_inode;
- for(@df) {
- my ($dev1,$blocks,$used,$free,$pc,$mount) = split(/\s+/,$_);
- my ($percent,$blah) = split(/\%/,$pc);
- if ( ($percent >= $wrn ) && (!($percent >= $crt) || ($mount =~ m/\/mnt\//)) ) {
- $output .= "$mount is at $pc ";
- $warn = 1;
- }
- if ( ($percent >= $crt ) && !($mount =~ m/\/mnt\//) ){
- $output = "" unless $crit eq '1';
- $output .= "$mount is at $pc ";
- $crit = 1;
- }
- }
- for(@df_inode) {
- my ($dev1,$inodes,$used,$free,$pc,$mount) = split(/\s+/,$_);
- my ($percent,$blah) = split(/\%/,$pc);
- if ( ($percent >= $wrn ) && (!($percent >= $crt) ) ) {
- $output .= "$mount is at $pc inode usage ";
- $warn = 1;
- }
- if ( ($percent >= $crt ) && !($mount =~ m/\/mnt\//) ){
- $output = "" unless $crit eq '1';
- $output .= "$mount is at $pc inode usage ";
- $crit = 1;
- }
- }
- }
-
-
-#if ( ($warn eq '1') && !($crit eq '1') ) {
-# print "$output\n";
-# $result = 1;
-# }
-if ( $crit eq '1' ) {
- print "$output\n";
- $result = 2;
-}
-
-else {
- print "Disks are OK now\n";
-}
-
-
-#if ( !( $crit eq '1' ) && !( $warn eq '1' ) ) {
-# print "Disks are ok now\n";
-#}
-#print "$result\n";
-exit $result;
+++ /dev/null
-#!/usr/bin/perl
-
-# {{ ansible_managed }}
-
-use strict;
-
-my $warn;
-my $crit;
-my $out;
-
-my @out;
-my $devices;
-my $pci;
-my $scsi;
-my $derp;
-
-$pci = `/usr/bin/lspci | /bin/grep -i raid | /bin/grep -v PATA | /usr/bin/head -2`;
-$scsi = `/usr/bin/lspci | /bin/grep -i scsi | /bin/grep -v PATA | /usr/bin/head -1`;
-
-# software raid!
-if (-e "/proc/mdstat") {
- # check software raid!
-# open(R,"/tmp/mdstat");
- open(R,"/proc/mdstat");
- while (<R>) {
- if (/^(md\d+) : (\w+)/) {
- my $dev = $1;
- my $status = $2;
- my $rest = <R>;
- $devices++;
-
- my ($disks,$states) = $rest =~ /(\[.*\]) (\[.*\])/;
- my $mout .= "$dev is $status $disks $states" if $states =~ /_/;
-
- # recovery?
- my $next = <R>; # possibly recovery?
- if ($next =~ / recovery = /) {
- my ($progress,$per) = $next =~ /(\[.*\])\s+recovery =\s+(\S+%)/;
- $mout .= " recovery $per";
- my $next = <R>;
- if (my ($finish,$speed) = $next =~ /finish=(.*)min speed=(.*)\/sec/) {
- $mout .= " finish $finish min";
- }
- $warn = 1;
- } elsif ($next =~ / resync = /) {
- my ($progress,$per) = $next =~ /(\[.*\])\s+resync =\s+(\S+%)/;
- $mout .= " resync $per";
- if (my ($finish,$speed) = $next =~ /finish=(.*)min speed=(.*)\/sec/) {
- $mout .= " finish $finish min";
- }
- $warn = 1;
- } elsif ($states =~ /_/) { # not all U
- $crit = 1;
- }
-
- push( @out, $mout ) if $mout;
- }
- }
-}
-
-
-# mylex raid!
-if ($pci =~ /Mylex/i) {
-#if (1) {
- my $s = `cat /proc/rd/status`;
- chomp($s);
- unless ($s =~ /OK/) {
- my @myinfo;
- for my $ctl (`ls -d /proc/rd/c*`) {
-# for my $ctl ('/proc/rd/c0') {
- chomp $ctl;
- my %bad;
- my ($c) = $ctl =~ /\/(c\d)$/;
- open(S,"$ctl/current_status") || print "can't open $ctl/current_status\n";;
-# open(S,"/tmp/mylex.bad");
- my $lastdevice;
- while (<S>) {
- # disk status
- if (/^ (\d:\d) Vendor/) {
- $lastdevice = $1;
- }
- if (/ Disk Status: (\S+),/) {
- if ($1 ne 'Online') {
- push( @myinfo, "$c disk $lastdevice $1");
- }
- }
-
- # logical drives
- if (/ (\/dev\/rd\/\S+): (\S+), (\w+),/) {
- my $dev = $1;
- my $type = $2;
- my $status = $3;
- $devices++;
- $bad{$dev} = 1;
- if ($status ne 'Online') {
- push( @myinfo, "$dev ($type) $status");
- }
- }
-
- # rebuild?
- if (/ Rebuild in Progress: .* \((\S+)\) (\d+%) completed/) {
- push( @myinfo, "$1 rebuild $2 complete" );
- delete $bad{$1};
- }
- }
- if (keys %bad) {
- $crit = 1; # at least 1 is failed and !recovering
- } else {
- $warn = 1; # all are recovering
- }
- }
-
- push( @out, "Mylex $s: " . join(', ',@myinfo)) if @myinfo;
- }
-}
-
-
-# icp vortex raid!
-if ( $pci =~ /intel/i) {
- opendir(D,"/proc/scsi/gdth");
- my @dev = readdir(D);
- closedir D;
- my @vortex;
- for my $dev (@dev) {
- next if $dev =~ /^\./;
- my $read = `cat /proc/scsi/gdth/$dev`;
- # my $read = `cat /tmp/asdf9.warn`;
- my $cur; # Logical | Physical | Host | Array
- my @myinfo;
-# print "dev $dev\n";
- for $_ (split(/\n/,$read)) {
- chomp;
- if (/^\w/) {
- # new section
- ($cur) = /^(\w+)/;
-# print "cur = $cur\n";
- next;
- }
- if ($cur eq 'Logical') {
- my ($num,$status) = /Number:\s+(\d+)\s+Status:\s+(\w+)/;
- next unless $status;
- if ($status ne 'ok') {
- $warn = 1;
- #push( @myinfo, "Logical #$num $status" );
- unshift( @myinfo, "Logical #$num $status" );
- }
- }
- if ($cur eq 'Array') {
- my ($num,$status) = /Number:\s+(\d+)\s+Status:\s+(\w+)/;
- next unless $status;
- if ($status ne 'ready') {
- $warn = 1;
- #push( @myinfo, "Array #$num $status" );
- unshift( @myinfo, "Array #$num $status" );
- }
- }
- if ($cur eq 'Host') {
- if (/Number/) {
- $devices++;
- }
- }
- if ($cur eq 'Controller') {
- # push( @myinfo, $_ );
- unshift( @myinfo, $_ );
- }
- }
-
- if (@myinfo) {
- # push( @vortex, "dev $dev: " . join(', ', @myinfo) );
- # unshift( @vortex, "dev $dev: " . join(', ', @myinfo) );
- push( @vortex, "dev $dev: " . join(', ', $myinfo[0], $myinfo[1], $myinfo[2], $myinfo[3], $myinfo[4] ) );
- # $warn = 1;
- }
- }
-
- if (@vortex) {
- # push( @out, 'Vortex: ' . join('. ', @vortex) );
- push( @out, 'Vortex: ' . join('. ', @vortex) );
- }
-}
-# SAS megaraid
-if ( $pci =~ /LSI\ Logic/i) {
- my $read = `/usr/bin/sudo /usr/sbin/megacli -LDInfo -lall -a0`;
- for $_ (split(/\n/,$read)) {
- chomp;
- # The line we care about is State: Optimal, if we don't have that, we've problems
- if ($_ =~/^State\s*\:\s*(.*)/m) {
- $devices++;
- #/^State\?:\s?(\w+)/;
- my $state = $1;
- next unless $state;
- if ($state ne 'Optimal') {
- my $rebuild = `/usr/bin/sudo /usr/sbin/megacli -PDList -a0 | /bin/grep -i firmware`;
- if ( $rebuild =~ /Rebuild/i) {
- my $enclosure = `/usr/bin/sudo /usr/sbin/megacli -PDList -a0 | /bin/grep -B15 Rebuild | /bin/grep -e Enclosure -e Slot | /usr/bin/cut -d':' -f2 | /usr/bin/awk '{printf \$1\":\"}' | /usr/bin/awk -F ":" '{printf \$1":"\$2}'`;
- #my $rebuildstatus = `/usr/bin/sudo /usr/sbin/megacli -PDRbld -ShowProg -PhysDrv\[$enclosure\] -a0 | /bin/grep -i rebuild`;
- my $rebuildstatus = `/usr/bin/sudo /usr/sbin/megacli -PDRbld -ShowProg -PhysDrv\[$enclosure\] -a0 | /bin/egrep -i \'\(rebuild\|not found\)\'`;
- if ($rebuildstatus =~ /not found/m) {
- # check by device id instead of enclosure id if we get a not found error above
- $enclosure = `/usr/bin/sudo /usr/sbin/megacli -PDList -a0 | /bin/grep -B15 Rebuild | /bin/grep -e Enclosure -e Slot | /bin/grep -v position | /usr/bin/cut -d':' -f2 | /usr/bin/awk '{printf \$1\":\"}' | /usr/bin/awk -F ":" '{printf \$1":"\$2}'`;
- $rebuildstatus = `/usr/bin/sudo /usr/sbin/megacli -PDRbld -ShowProg -PhysDrv\[$enclosure\] -a0 | /bin/grep -i rebuild`;
- }
- for $_ ($rebuildstatus) {
- $crit = 1;
- push(@out,$_);
- }
- } else {
- $crit = 1;
- my $virtual=`/usr/bin/sudo /usr/sbin/megacli -LDInfo -lall -a0 | grep -i failed -B6 | grep -i virtual | cut -d'(' -f1`;
- push(@out, $virtual, $_);
- }
- }
- }
- # Should to catch the syntax or permissions errors this thing spits out
- if (/ERROR/i) {
- $crit = 1;
- push(@out, $_);
- foreach my $k (@out)
- {
- print $_;
- }
- }
- }
-}
-
-# e3ware
-if ( $pci =~ /3ware/i) {
- open(CLI,"/usr/bin/sudo /usr/sbin/tw_cli show|");
- #my $read = `/usr/sbin/megacli -LDInfo -l0 -a0`;
-
- $devices++;
- my @controllers;
- while (<CLI>) {
- if ( $_ =~ /^c[0-9]/ ) {
- my ($c) = split(/\s+/,$_);
- push(@controllers,$c);
- }
- }
- close(CLI);
-
- foreach my $cont (@controllers) {
- open(CLI,"/usr/bin/sudo /usr/sbin/tw_cli /$cont show|");
- while (<CLI>) {
- if ( $_ =~ /^u[0-9]+/ ) {
- my @info = split(/\s+/,$_);
- if ( $info[2] ne 'OK' ) {
- if ( $info[2] =~ /REBUILDING/i) {
- my $rebuildstatus = `/usr/bin/sudo /usr/sbin/tw_cli /$cont/$info[0] show | /bin/grep REBUILD | /bin/grep -v RAID-10`;
- for $_ ($rebuildstatus) {
- $crit = 1;
- push(@out,$_);
- }
- } else {
- $crit = 1;
- push(@out,$_);
- }
- }
- }
- if ( $_ =~ /^p[0-9]+/ ) {
- my @info = split(/\s+/,$_);
- if ( $info[1] ne 'OK' ) {
- $crit = 1;
- push(@out,$_);
- }
- }
- }
- }
-}
-
-#Areca
-
-if ( $pci =~ /areca/i) {
- open(CLI,"sudo /usr/sbin/cli64 vsf info|");
- while (<CLI>) {
- if ( $_ =~ /^\ \ [0-9]+/ ) {
- $devices++;
- my @info = split(/\s+/,$_);
- if ( $_ !~ /Normal/i) {
- $crit = 1;
- push(@out,$_);
- }
- }
- }
- }
-
-if ( $scsi =~ /LSI Logic/i) {
- open(CLI,"sudo /usr/sbin/mpt-status | /usr/bin/head -1 |");
- $devices++;
- while (<CLI>) {
- if ( $_ =~ /^ioc/ ) {
- my @info = split(/\s+/,$_);
- if ( $info[10] ne 'OPTIMAL,' ) {
- $crit = 1;
- push(@out,$_);
- }
- }
- }
- }
-
-# show results
-my $result = 0;
-$result = 1 if $warn;
-$result = 2 if $crit;
-# print "warn = $warn crit = $crit\n";
-print $derp;
-my $out = "No raid devices found $pci";
-$out = "All $devices raid devices happy as clams" if $devices;
-if (@out) {
- $out = join('; ', @out);
-}
-
-print "$out\n";
-exit $result;
+++ /dev/null
-#!/bin/bash
-# Description: Bash script to check drive health using pending, uncorrectable,
-# and reallocated sector count
-#
-# Nagios return codes: 0 = OK; 1 = WARNING; 2 = CRITICAL; 3 = UNKNOWN
-# SMART Attribute Codes:
-# 5 = Reallocated
-# 187 = Reported Uncorrect
-# 197 = Pending
-# 198 = Uncorrectable Sector Count
-#
-# TO-DO: Add support for dynamic SMART attribute lookup. For example,
-# 187 is reported for Seagate HDD and all SSDs but not Hitachi HDDs.
-#
-# See https://en.wikipedia.org/wiki/S.M.A.R.T.#ATA_S.M.A.R.T._attributes
-
-### Define global variables ###
-# total number of drives (or RAID slots) discovered
-numdrives=0
-# Number of failed, failing, and/or missing drives
-failingdrives=0
-# Fallback message for UNKNOWN return code output
-unknownmsg="Unknown error"
-# Return code for nagios (Default to SUCCESS)
-rc=0
-# Array of messages indicating drive health. Output after nagios status.
-declare -a messages
-
-### Functions ###
-main ()
-{
- preflight
-
- if [ "$raid" = true ]
- then
- areca_smart
- areca_failed
- elif [ "$raid" = false ]
- then
- normal_smart
- else
- echo "ERROR - Could not determine if RAID present"
- exit 3
- fi
-
- ## Return UNKNOWN if no drives found
- if [ "$numdrives" -eq "0" ]
- then
- unknownmsg="No drives found!"
- rc=3
- fi
-
- ## Return code and service status for nagios
- if [ "$rc" = 0 ]
- then
- echo "OK - All $numdrives drives healthy"
- elif [ "$rc" = 1 ]
- then
- echo "WARNING - $failingdrives of $numdrives drives sick"
- elif [ "$rc" = 2 ]
- then
- echo "CRITICAL - $failingdrives of $numdrives drives need replacing"
- elif [ "$rc" = 3 ]
- then
- echo "UNKNOWN - $unknownmsg"
- else
- echo "ERROR - Got no return code"
- fi
-
- ## Iterate through array of messages
- # Nagios reads and displays the first line of output on the Services page.
- # All individual messages about failed/failing disk statistics can be viewed
- # on the individual system's SMART detail page in nagios.
- for msg in "${messages[@]}"
- do
- echo "$msg"
- done
-
- exit $rc
-}
-
-# Pre-flight checks
-preflight ()
-{
- # Set raid var then check for cli64 command and bail if missing
- if lspci | grep -qi areca
- then
- raid=true
- else
- raid=false
- fi
-
- if [ "$raid" = true ] && ! [ -x "$(command -v cli64)" ]
- then
- echo "ERROR - cli64 command not found or is not executable"
- exit 3
- fi
-
- # Check for smartmontools and bail if missing
- if ! [ -x "$(command -v smartctl)" ]
- then
- echo "ERROR - smartctl is not installed or is not executable"
- echo "yum/apt-get install smartmontools"
- exit 3
- fi
-}
-
-# Gather smart data for drives behind Areca RAID controller
-areca_smart ()
-{
- # Store output of cli64 to reduce repeated executions
- cli64out=$(sudo cli64 disk info | grep -E "Slot#[[:digit:]]")
- numdrives=$(echo "$cli64out" | wc -l)
- # Loop through all disks not marked as 'N.A.' or 'Failed'
- for slot in $(echo "$cli64out" | grep -v 'N.A.\|Failed' \
- | grep -o "Slot#[[:digit:]]" | cut -c6-)
- do
- failed=false
- # Determine if disk is JBOD or part of hardware RAID
- if echo "$cli64out" | grep -E "Slot#$slot" | grep -q 'JBOD'
- then
- jbod=true
- else
- jbod=false
- fi
- output=$(sudo cli64 disk smart drv=$slot \
- | grep -E "^ "5"|^"197"|^"198"" | awk '{ print $(NF-1) }' | tr '\n' ' ')
- outputcount=$(echo $output | wc -w)
- # Only continue if we received 3 SMART data points
- if [ "$outputcount" = "3" ]
- then
- # Only do slot to drive letter matching once per bad JBOD
- if [[ $output != "0 0 0 " ]] && [ "$jbod" = true ]
- then
- dl=$(areca_bay_to_letter $slot)
- elif [ "$jbod" = false ]
- then
- dl="(RAID)"
- fi
- read reallocated pending uncorrect <<< $output
- if [ "$reallocated" != "0" ]
- then
- messages+=("Drive $slot $dl has $reallocated reallocated sectors")
- failed=true
- # A small number of reallocated sectors is OK
- if [ "$reallocated" -le 5 ]
- then
- rc=1 # Warn if <= 5
- else
- rc=2 # Crit if >5
- fi
- fi
- if [ "$pending" != "0" ]
- then
- messages+=("Drive $slot $dl has $pending pending sectors")
- failed=true
- rc=2
- fi
- if [ "$uncorrect" != "0" ]
- then
- messages+=("Drive $slot $dl has $uncorrect uncorrect sectors")
- failed=true
- rc=2
- fi
- else
- messages+=("Drive $slot returned $outputcount of 3 expected attributes")
- unknownmsg="SMART data could not be read for one or more drives"
- rc=3
- fi
- # Make sure drives with multiple types of bad sectors only get counted once
- if [ "$failed" = true ]
- then
- let "failingdrives+=1"
- fi
- done
-}
-
-# Correlate Areca drive bay to drive letter
-areca_bay_to_letter ()
-{
- # Get S/N according to RAID controller given argument $1 (slot #)
- areca_serial=$(sudo cli64 disk info drv=$1 | grep 'Serial Number' \
- | awk '{ print $NF }')
- # Loop through and get S/N according to smartctl given drive name
- for dl in $(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' \
- | awk '{ print $NF }')
- do
- smart_serial=$(sudo smartctl -a /dev/$dl | grep "Serial number" \
- | awk '{ print $NF }')
- # If cli64 and smartctl find a S/N match, return drive letter
- if [ "$areca_serial" = "$smart_serial" ]
- then
- echo "($dl)"
- fi
- done
-}
-
-# Tally missing and failed drives connected to Areca RAID
-areca_failed ()
-{
- # Store output of cli64 to reduce repeated executions
- cli64out=$(sudo cli64 disk info | grep -E "Slot#[[:digit:]]")
- # Missing (N.A.) drives
- for drive in $(echo "$cli64out" | grep -E "Slot#[[:digit:]]" \
- | grep "N.A." | awk '{ print $1 }')
- do
- messages+=("Drive $drive is missing")
- let "failingdrives+=1"
- rc=2
- done
- # Hard failed drives
- for drive in $(echo "$cli64out" | grep -E "Slot#[[:digit:]]" \
- | grep 'Failed' | awk '{ print $1 }')
- do
- messages+=("Drive $drive failed")
- let "failingdrives+=1"
- rc=2
- done
-}
-
-# Standard SATA/SAS drive smartctl check
-normal_smart ()
-{
- # The grep regex will include drives named sdaa, for example
- numdrives=$(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' | wc -l)
- for l in $(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' \
- | awk '{ print $NF }')
- do
- failed=false
- output=$(sudo smartctl -a /dev/$l | grep -E "^ "5"|^"197"|^"198"" \
- | awk '{ print $NF }' | tr '\n' ' ')
- outputcount=$(echo $output | wc -w)
- # Check if drive is SSD and set var accordingly
- if sudo smartctl -i /dev/$l | grep -q 'Solid State Device'; then
- is_ssd=true
- else
- is_ssd=false
- fi
- # Only continue if we received 3 SMART data points and drive is not SSD
- if [ "$outputcount" = "3" ] && [ "$is_ssd" = false ]
- then
- read reallocated pending uncorrect <<< $output
- if [ "$reallocated" != "0" ]
- then
- messages+=("Drive $l has $reallocated reallocated sectors")
- failed=true
- # A small number of reallocated sectors is OK
- if [ "$reallocated" -le 5 ]
- then
- rc=1 # Warn if <= 5
- else
- rc=2 # Crit if >5
- fi
- fi
- if [ "$pending" != "0" ]
- then
- messages+=("Drive $l has $pending pending sectors")
- failed=true
- rc=2
- fi
- if [ "$uncorrect" != "0" ]
- then
- messages+=("Drive $l has $uncorrect uncorrect sectors")
- failed=true
- rc=2
- fi
- elif [ "$outputcount" != "3" ] && [ "$is_ssd" = false ]
- then
- messages+=("Drive $l returned $outputcount of 3 expected attributes")
- unknownmsg="SMART data could not be read for one or more drives"
- rc=3
- # Set no return code and assume any SSD is healthy for now
- elif [ "$is_ssd" = true ]
- then
- messages+=("Drive $l is an SSD. Not yet supported.")
- rc=0
- else
- messages+=("Error processing data for drive $l")
- rc=3
- fi
- # Make sure drives with multiple types of bad sectors only get counted once
- if [ "$failed" = true ]
- then
- let "failingdrives+=1"
- fi
- done
-}
-
-## Call main() function
-main
+++ /dev/null
-module nrpe 1.0;
-
-require {
- type fsadm_exec_t;
- type nrpe_t;
- type hwdata_t;
- class file { read getattr open };
-}
-
-#============= nrpe_t ==============
-allow nrpe_t fsadm_exec_t:file getattr;
-allow nrpe_t hwdata_t:file { read getattr open };
service:
name: cron
state: restarted
-
-- name: restart nagios-nrpe-server
- service:
- name: "{{ nrpe_service_name }}"
- state: restarted
+++ /dev/null
----
-# We use these scripts to check to see if any of our test nodes have bad disks
-
-- name: Upload megacli and cli64 for raid monitoring and smart.pl to /usr/sbin/.
- copy:
- src: "../files/sbin/{{ item }}"
- dest: "/usr/sbin/{{ item }}"
- owner: root
- group: root
- mode: 0755
- with_items:
- - megacli
- - cli64
-
-- name: Create /usr/libexec.
- file:
- path: /usr/libexec
- owner: root
- group: root
- mode: 0755
- state: directory
-
-- name: Upload custom netsaint scripts for raid/disk/smart/monitoring to /usr/libexec/.
- copy:
- src: "../files/libexec/{{ item }}"
- dest: "/usr/libexec/{{ item }}"
- owner: root
- group: root
- mode: 0755
- with_items:
- - smart.sh
- - raid.pl
- - diskusage.pl
tags:
- cpan
-# upload custom disk monitoring scripts
-- include: disk_monitoring.yml
- tags:
- - monitoring-scripts
-
-# configure nagios
-- include: nagios.yml
- tags:
- - nagios
-
-- name: Get SELinux status
- command: getenforce
- register: selinux_status
- when: ansible_pkg_mgr == "yum"
- tags:
- - nagios
-
-# configure selinux for nagios
-- include: nrpe-selinux.yml
- when: selinux_status is defined and selinux_status.stdout != "Disabled"
- tags:
- - nagios
-
# configure ntp
- include: ntp.yml
tags:
+++ /dev/null
----
-- name: Upload nagios sudoers.d for raid utilities.
- template:
- src: nagios/90-nagios
- dest: /etc/sudoers.d/90-nagios
- owner: root
- group: root
- mode: 0440
- validate: visudo -cf %s
-
-- name: Configure nagios nrpe settings (Ubuntu)
- lineinfile:
- dest: /etc/default/{{ nrpe_service_name }}
- regexp: "^DAEMON_OPTS"
- line: "DAEMON_OPTS=\"--no-ssl\""
- when: ansible_pkg_mgr == "apt"
-
-- name: Configure nagios nrpe settings (RHEL/CentOS)
- lineinfile:
- dest: /etc/sysconfig/{{ nrpe_service_name }}
- regexp: "^NRPE_SSL_OPT"
- line: "NRPE_SSL_OPT=\"-n\""
- when: ansible_pkg_mgr == "yum"
-
-- name: Check firewalld status
- command: systemctl status firewalld
- register: firewalld
- ignore_errors: true
- no_log: true
- when: ansible_pkg_mgr == "yum"
-
-- name: Open nrpe port if firewalld enabled
- firewalld:
- port: 5666/tcp
- state: enabled
- permanent: yes
- immediate: yes
- when: ansible_pkg_mgr == "yum" and (firewalld is defined and firewalld.stdout.find('running') != -1)
-
-- name: Upload nagios nrpe config.
- template:
- src: nagios/nrpe.cfg
- dest: /etc/nagios/nrpe.cfg
- owner: root
- group: root
- mode: 0644
- notify:
- - restart nagios-nrpe-server
-
-- name: Make sure nagios nrpe service is running.
- service:
- name: "{{ nrpe_service_name }}"
- enabled: yes
- state: started
+++ /dev/null
----
-- name: nrpe - Install semanage python bindings
- yum:
- pkg: libsemanage-python
- state: installed
-
-- name: nrpe - Install SELinux tools
- yum:
- pkg: policycoreutils-python
- state: installed
-
-- name: nrpe - Ensure SELinux policy is up to date
- yum:
- pkg: selinux-policy-targeted
- state: latest
-
-- name: nrpe - Set SELinux boolean nagios_run_sudo true
- seboolean:
- name: nagios_run_sudo
- state: yes
- persistent: yes
-
-- name: nrpe - Remove SELinux policy package
- command: semodule -r nrpe
- failed_when: false
-
-- name: nrpe - Copy SELinux type enforcement file
- copy:
- src: nagios/nrpe.te
- dest: /tmp/nrpe.te
-
-- name: nrpe - Compile SELinux module file
- command: checkmodule -M -m -o /tmp/nrpe.mod /tmp/nrpe.te
-
-- name: nrpe - Build SELinux policy package
- command: semodule_package -o /tmp/nrpe.pp -m /tmp/nrpe.mod
-
-- name: nrpe - Load SELinux policy package
- command: semodule -i /tmp/nrpe.pp
-
-- name: nrpe - Remove temporary files
- file:
- path: /tmp/nrpe.*
- state: absent
+++ /dev/null
-## {{ ansible_managed }}
-{{ nrpe_user }} ALL=NOPASSWD: /usr/sbin/megacli, /usr/sbin/cli64, /usr/sbin/smartctl, /usr/sbin/smartctl
+++ /dev/null
-# {{ ansible_managed }}
-log_facility=daemon
-pid_file=/var/run/nagios/nrpe.pid
-server_port=5666
-nrpe_user={{ nrpe_user }}
-nrpe_group={{ nrpe_group }}
-
-# These should eventually be in a secrets group_var
-# 172. address is sepia nagios server
-# 10. address is octo nagios server
-allowed_hosts=127.0.0.1,172.21.0.33,10.8.0.8
-dont_blame_nrpe=0
-debug=0
-command_timeout=60
-connection_timeout=300
-
-command[check_users]={{ nagios_plugins_directory }}/check_users --warning=5 --critical=10
-command[check_load]={{ nagios_plugins_directory }}/check_load --percpu --warning=1.5,1.4,1.3 --critical=2.0,1.9,1.8
-command[check_hda1]={{ nagios_plugins_directory }}/check_disk --warning=20% --critical=10% --partition=/dev/hda1
-command[check_root]={{ nagios_plugins_directory }}/check_disk --warning=10% --critical=5% --units=GB --path=/
-command[check_zombie_procs]={{ nagios_plugins_directory }}/check_procs --warning=5 --critical=10 --state=Z
-command[check_total_procs]={{ nagios_plugins_directory }}/check_procs --warning=300 --critical=500
-command[check_raid]=/usr/libexec/raid.pl
-command[check_disks]=/usr/libexec/diskusage.pl 90 95
-command[check_smart]=/usr/libexec/smart.sh
-
-include=/etc/nagios/nrpe_local.cfg
-
-include_dir=/etc/nagios/nrpe.d/
ntp_service_name: ntp
ssh_service_name: ssh
nfs_service: nfs-kernel-server
-nrpe_service_name: nagios-nrpe-server
-nrpe_user: nagios
-nrpe_group: nagios
-nagios_plugins_directory: /usr/lib/nagios/plugins
ceph_packages_to_remove:
- ceph
# for java bindings, hadoop, etc.
- java-1.7.0-openjdk-devel
- junit4
- # for disk/etc monitoring
- - smartmontools
# for nfs
- nfs-utils
- python-virtualenv
# for setting BIOS settings
- smbios-utils
- # for nagios monitoring
- - nrpe
- - nagios-plugins-all
# for java bindings, hadoop, etc.
- java-1.6.0-openjdk-devel
- junit4
- # for disk/etc monitoring
- - smartmontools
# for nfs
- nfs-utils
# for xfstests
- bonnie++
# for json_xs to investigate JSON by hand
- perl-JSON-XS
- # for nagios monitoring
- - nrpe
- - nagios-plugins-all
- default-jdk
- junit4
###
- # for disk/etc monitoring
- - smartmontools
- - nagios-nrpe-server
- ###
# for samba testing
- cifs-utils
###
- default-jdk
- junit4
###
- # for disk/etc monitoring
- - smartmontools
- - nagios-nrpe-server
- ###
# for samba testing
- cifs-utils
###
# for java bindings, hadoop, etc.
- java-1.8.0-openjdk-devel
- junit
- # for disk/etc monitoring
- - nrpe
- - nagios-plugins-all
- - smartmontools
# for nfs
- nfs-utils
# python-pip is installed via roles/testnode/tasks/pip.yml on other rpm-based distros
# for java bindings, hadoop, etc.
- java-1.6.0-openjdk-devel
- junit4
- # for disk/etc monitoring
- - smartmontools
# for nfs
- nfs-utils
- python-virtualenv
# for setting BIOS settings
- smbios-utils
- # for nagios monitoring
- - nrpe
- - nagios-plugins-all
nfs_service: nfs
- perl-XML-Twig
- java-1.6.0-openjdk-devel
- junit4
- - smartmontools
- nfs-utils
# for xfstests
- ncurses-devel
- perl-JSON-XS
- leveldb
- xmlstarlet
- # for nagios monitoring
- - nrpe
- - nagios-plugins-all
nfs_service: nfs-server
- tgt
- open-iscsi
###
- # for disk/etc monitoring
- - smartmontools
- - nagios-nrpe-server
- ###
# for samba testing
- cifs-utils
# for Static IP
---
ntp_service_name: ntpd
ssh_service_name: sshd
-nrpe_service_name: nrpe
-nrpe_user: nrpe
-nrpe_group: nrpe
-nagios_plugins_directory: /usr/lib64/nagios/plugins
# ceph packages that we ensure do not exist
ceph_packages_to_remove: