From: David Galloway Date: Mon, 11 Jul 2016 22:43:38 +0000 (-0400) Subject: Move NRPE setup to common role X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=ae480540cfeb4bac66a9cfe53dec365fd753a8f7;p=ceph-cm-ansible.git Move NRPE setup to common role Signed-off-by: David Galloway --- diff --git a/roles/common/files/libexec/diskusage.pl b/roles/common/files/libexec/diskusage.pl new file mode 100644 index 0000000..49200da --- /dev/null +++ b/roles/common/files/libexec/diskusage.pl @@ -0,0 +1,123 @@ +#!/usr/bin/perl + +# {{ ansible_managed }} + +#****************************************************************************************** +# +# NRPE DISK USAGE PLUGIN +# +# Program: Disk Usage plugin written to be used with Netsaint and NRPE +# License: GPL +# Copyright (c) 2000 Jeremy Hanmer (jeremy@newdream.net) +# +# Last Modified: 10/23/00 +# +# Information: Basically, I wrote this because I had to deal with large numbers of +# machines with a wide range of disk configurations, and with dynamically mounted +# partitions. The basic check_disk plugin relied on a static configuration file which +# doesn't lend itself to being used in a heterogeneous environnment (especially when +# you can't guarantee that the devices listed in the configuration file will be mounted). +# +# Bugs: Currently, this plugin only works on EXT2 partitions (although it's easy to change). +# +# Command Line: diskusage.pl +# +# Tested Systems: Mandrake 7.1/Intel, Debian 2.2/Intel, Debian 2.1/Intel +# +# License Information: +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +# +#******************************************************************************************* + + +use strict; + +my $wrn = shift @ARGV; +my $crt = shift @ARGV; +my $output; +my $count; +my %type; +my $result = 0; +my $warn = 0; +my $crit = 0; +my @parts; +my $hostname = `hostname`; +chomp $hostname; +@parts = `mount | grep -vi fuse`; + +#if ( $hostname eq 'zartan' ) { +# @parts = `mount`; +#} +#else { +# @parts = `mount -t ext2,reiserfs`; +#} +for (@parts) { + my ($dev,$on,$mount,$tp,$type,$options) = split(/\s+/,$_); + next if ($type eq 'nfs' && !($hostname eq 'zartan')); + next if ($type eq 'proc' || $type eq 'devpts'); + my @df= `df -k $mount`; + my @df_inode = `df -i $mount`; +# print "$dev $mount $type\n"; + shift @df; + shift @df_inode; + for(@df) { + my ($dev1,$blocks,$used,$free,$pc,$mount) = split(/\s+/,$_); + my ($percent,$blah) = split(/\%/,$pc); + if ( ($percent >= $wrn ) && (!($percent >= $crt) || ($mount =~ m/\/mnt\//)) ) { + $output .= "$mount is at $pc "; + $warn = 1; + } + if ( ($percent >= $crt ) && !($mount =~ m/\/mnt\//) ){ + $output = "" unless $crit eq '1'; + $output .= "$mount is at $pc "; + $crit = 1; + } + } + for(@df_inode) { + my ($dev1,$inodes,$used,$free,$pc,$mount) = split(/\s+/,$_); + my ($percent,$blah) = split(/\%/,$pc); + if ( ($percent >= $wrn ) && (!($percent >= $crt) ) ) { + $output .= "$mount is at $pc inode usage "; + $warn = 1; + } + if ( ($percent >= $crt ) && !($mount =~ m/\/mnt\//) ){ + $output = "" unless $crit eq '1'; + $output .= "$mount is at $pc inode usage "; + $crit = 1; + } + } + } + + +#if ( ($warn eq '1') && !($crit eq '1') ) { +# print "$output\n"; +# $result = 1; +# } +if ( $crit eq '1' ) { + print "$output\n"; + $result = 2; +} + +else { + print "Disks are OK now\n"; +} + + +#if ( !( $crit eq '1' ) && !( $warn eq '1' ) ) { +# print "Disks are ok now\n"; +#} +#print "$result\n"; +exit $result; diff --git a/roles/common/files/libexec/raid.pl b/roles/common/files/libexec/raid.pl new file mode 100755 index 0000000..f65eedd --- /dev/null +++ b/roles/common/files/libexec/raid.pl @@ -0,0 +1,313 @@ +#!/usr/bin/perl + +# {{ ansible_managed }} + +use strict; + +my $warn; +my $crit; +my $out; + +my @out; +my $devices; +my $pci; +my $scsi; +my $derp; + +$pci = `/usr/bin/lspci | /bin/grep -i raid | /bin/grep -v PATA | /usr/bin/head -2`; +$scsi = `/usr/bin/lspci | /bin/grep -i scsi | /bin/grep -v PATA | /usr/bin/head -1`; + +# software raid! +if (-e "/proc/mdstat") { + # check software raid! +# open(R,"/tmp/mdstat"); + open(R,"/proc/mdstat"); + while () { + if (/^(md\d+) : (\w+)/) { + my $dev = $1; + my $status = $2; + my $rest = ; + $devices++; + + my ($disks,$states) = $rest =~ /(\[.*\]) (\[.*\])/; + my $mout .= "$dev is $status $disks $states" if $states =~ /_/; + + # recovery? + my $next = ; # possibly recovery? + if ($next =~ / recovery = /) { + my ($progress,$per) = $next =~ /(\[.*\])\s+recovery =\s+(\S+%)/; + $mout .= " recovery $per"; + my $next = ; + if (my ($finish,$speed) = $next =~ /finish=(.*)min speed=(.*)\/sec/) { + $mout .= " finish $finish min"; + } + $warn = 1; + } elsif ($next =~ / resync = /) { + my ($progress,$per) = $next =~ /(\[.*\])\s+resync =\s+(\S+%)/; + $mout .= " resync $per"; + if (my ($finish,$speed) = $next =~ /finish=(.*)min speed=(.*)\/sec/) { + $mout .= " finish $finish min"; + } + $warn = 1; + } elsif ($states =~ /_/) { # not all U + $crit = 1; + } + + push( @out, $mout ) if $mout; + } + } +} + + +# mylex raid! +if ($pci =~ /Mylex/i) { +#if (1) { + my $s = `cat /proc/rd/status`; + chomp($s); + unless ($s =~ /OK/) { + my @myinfo; + for my $ctl (`ls -d /proc/rd/c*`) { +# for my $ctl ('/proc/rd/c0') { + chomp $ctl; + my %bad; + my ($c) = $ctl =~ /\/(c\d)$/; + open(S,"$ctl/current_status") || print "can't open $ctl/current_status\n";; +# open(S,"/tmp/mylex.bad"); + my $lastdevice; + while () { + # disk status + if (/^ (\d:\d) Vendor/) { + $lastdevice = $1; + } + if (/ Disk Status: (\S+),/) { + if ($1 ne 'Online') { + push( @myinfo, "$c disk $lastdevice $1"); + } + } + + # logical drives + if (/ (\/dev\/rd\/\S+): (\S+), (\w+),/) { + my $dev = $1; + my $type = $2; + my $status = $3; + $devices++; + $bad{$dev} = 1; + if ($status ne 'Online') { + push( @myinfo, "$dev ($type) $status"); + } + } + + # rebuild? + if (/ Rebuild in Progress: .* \((\S+)\) (\d+%) completed/) { + push( @myinfo, "$1 rebuild $2 complete" ); + delete $bad{$1}; + } + } + if (keys %bad) { + $crit = 1; # at least 1 is failed and !recovering + } else { + $warn = 1; # all are recovering + } + } + + push( @out, "Mylex $s: " . join(', ',@myinfo)) if @myinfo; + } +} + + +# icp vortex raid! +if ( $pci =~ /intel/i) { + opendir(D,"/proc/scsi/gdth"); + my @dev = readdir(D); + closedir D; + my @vortex; + for my $dev (@dev) { + next if $dev =~ /^\./; + my $read = `cat /proc/scsi/gdth/$dev`; + # my $read = `cat /tmp/asdf9.warn`; + my $cur; # Logical | Physical | Host | Array + my @myinfo; +# print "dev $dev\n"; + for $_ (split(/\n/,$read)) { + chomp; + if (/^\w/) { + # new section + ($cur) = /^(\w+)/; +# print "cur = $cur\n"; + next; + } + if ($cur eq 'Logical') { + my ($num,$status) = /Number:\s+(\d+)\s+Status:\s+(\w+)/; + next unless $status; + if ($status ne 'ok') { + $warn = 1; + #push( @myinfo, "Logical #$num $status" ); + unshift( @myinfo, "Logical #$num $status" ); + } + } + if ($cur eq 'Array') { + my ($num,$status) = /Number:\s+(\d+)\s+Status:\s+(\w+)/; + next unless $status; + if ($status ne 'ready') { + $warn = 1; + #push( @myinfo, "Array #$num $status" ); + unshift( @myinfo, "Array #$num $status" ); + } + } + if ($cur eq 'Host') { + if (/Number/) { + $devices++; + } + } + if ($cur eq 'Controller') { + # push( @myinfo, $_ ); + unshift( @myinfo, $_ ); + } + } + + if (@myinfo) { + # push( @vortex, "dev $dev: " . join(', ', @myinfo) ); + # unshift( @vortex, "dev $dev: " . join(', ', @myinfo) ); + push( @vortex, "dev $dev: " . join(', ', $myinfo[0], $myinfo[1], $myinfo[2], $myinfo[3], $myinfo[4] ) ); + # $warn = 1; + } + } + + if (@vortex) { + # push( @out, 'Vortex: ' . join('. ', @vortex) ); + push( @out, 'Vortex: ' . join('. ', @vortex) ); + } +} +# SAS megaraid +if ( $pci =~ /LSI\ Logic/i) { + my $read = `/usr/bin/sudo /usr/sbin/megacli -LDInfo -lall -a0`; + for $_ (split(/\n/,$read)) { + chomp; + # The line we care about is State: Optimal, if we don't have that, we've problems + if ($_ =~/^State\s*\:\s*(.*)/m) { + $devices++; + #/^State\?:\s?(\w+)/; + my $state = $1; + next unless $state; + if ($state ne 'Optimal') { + my $rebuild = `/usr/bin/sudo /usr/sbin/megacli -PDList -a0 | /bin/grep -i firmware`; + if ( $rebuild =~ /Rebuild/i) { + my $enclosure = `/usr/bin/sudo /usr/sbin/megacli -PDList -a0 | /bin/grep -B15 Rebuild | /bin/grep -e Enclosure -e Slot | /usr/bin/cut -d':' -f2 | /usr/bin/awk '{printf \$1\":\"}' | /usr/bin/awk -F ":" '{printf \$1":"\$2}'`; + #my $rebuildstatus = `/usr/bin/sudo /usr/sbin/megacli -PDRbld -ShowProg -PhysDrv\[$enclosure\] -a0 | /bin/grep -i rebuild`; + my $rebuildstatus = `/usr/bin/sudo /usr/sbin/megacli -PDRbld -ShowProg -PhysDrv\[$enclosure\] -a0 | /bin/egrep -i \'\(rebuild\|not found\)\'`; + if ($rebuildstatus =~ /not found/m) { + # check by device id instead of enclosure id if we get a not found error above + $enclosure = `/usr/bin/sudo /usr/sbin/megacli -PDList -a0 | /bin/grep -B15 Rebuild | /bin/grep -e Enclosure -e Slot | /bin/grep -v position | /usr/bin/cut -d':' -f2 | /usr/bin/awk '{printf \$1\":\"}' | /usr/bin/awk -F ":" '{printf \$1":"\$2}'`; + $rebuildstatus = `/usr/bin/sudo /usr/sbin/megacli -PDRbld -ShowProg -PhysDrv\[$enclosure\] -a0 | /bin/grep -i rebuild`; + } + for $_ ($rebuildstatus) { + $crit = 1; + push(@out,$_); + } + } else { + $crit = 1; + my $virtual=`/usr/bin/sudo /usr/sbin/megacli -LDInfo -lall -a0 | grep -i failed -B6 | grep -i virtual | cut -d'(' -f1`; + push(@out, $virtual, $_); + } + } + } + # Should to catch the syntax or permissions errors this thing spits out + if (/ERROR/i) { + $crit = 1; + push(@out, $_); + foreach my $k (@out) + { + print $_; + } + } + } +} + +# e3ware +if ( $pci =~ /3ware/i) { + open(CLI,"/usr/bin/sudo /usr/sbin/tw_cli show|"); + #my $read = `/usr/sbin/megacli -LDInfo -l0 -a0`; + + $devices++; + my @controllers; + while () { + if ( $_ =~ /^c[0-9]/ ) { + my ($c) = split(/\s+/,$_); + push(@controllers,$c); + } + } + close(CLI); + + foreach my $cont (@controllers) { + open(CLI,"/usr/bin/sudo /usr/sbin/tw_cli /$cont show|"); + while () { + if ( $_ =~ /^u[0-9]+/ ) { + my @info = split(/\s+/,$_); + if ( $info[2] ne 'OK' ) { + if ( $info[2] =~ /REBUILDING/i) { + my $rebuildstatus = `/usr/bin/sudo /usr/sbin/tw_cli /$cont/$info[0] show | /bin/grep REBUILD | /bin/grep -v RAID-10`; + for $_ ($rebuildstatus) { + $crit = 1; + push(@out,$_); + } + } else { + $crit = 1; + push(@out,$_); + } + } + } + if ( $_ =~ /^p[0-9]+/ ) { + my @info = split(/\s+/,$_); + if ( $info[1] ne 'OK' ) { + $crit = 1; + push(@out,$_); + } + } + } + } +} + +#Areca + +if ( $pci =~ /areca/i) { + open(CLI,"sudo /usr/sbin/cli64 vsf info|"); + while () { + if ( $_ =~ /^\ \ [0-9]+/ ) { + $devices++; + my @info = split(/\s+/,$_); + if ( $_ !~ /Normal/i) { + $crit = 1; + push(@out,$_); + } + } + } + } + +if ( $scsi =~ /LSI Logic/i) { + open(CLI,"sudo /usr/sbin/mpt-status | /usr/bin/head -1 |"); + $devices++; + while () { + if ( $_ =~ /^ioc/ ) { + my @info = split(/\s+/,$_); + if ( $info[10] ne 'OPTIMAL,' ) { + $crit = 1; + push(@out,$_); + } + } + } + } + +# show results +my $result = 0; +$result = 1 if $warn; +$result = 2 if $crit; +# print "warn = $warn crit = $crit\n"; +print $derp; +my $out = "No raid devices found $pci"; +$out = "All $devices raid devices happy as clams" if $devices; +if (@out) { + $out = join('; ', @out); +} + +print "$out\n"; +exit $result; diff --git a/roles/common/files/libexec/smart.sh b/roles/common/files/libexec/smart.sh new file mode 100755 index 0000000..2f71a60 --- /dev/null +++ b/roles/common/files/libexec/smart.sh @@ -0,0 +1,290 @@ +#!/bin/bash +# Description: Bash script to check drive health using pending, uncorrectable, +# and reallocated sector count +# +# Nagios return codes: 0 = OK; 1 = WARNING; 2 = CRITICAL; 3 = UNKNOWN +# SMART Attribute Codes: +# 5 = Reallocated +# 187 = Reported Uncorrect +# 197 = Pending +# 198 = Uncorrectable Sector Count +# +# TO-DO: Add support for dynamic SMART attribute lookup. For example, +# 187 is reported for Seagate HDD and all SSDs but not Hitachi HDDs. +# +# See https://en.wikipedia.org/wiki/S.M.A.R.T.#ATA_S.M.A.R.T._attributes + +### Define global variables ### +# total number of drives (or RAID slots) discovered +numdrives=0 +# Number of failed, failing, and/or missing drives +failingdrives=0 +# Fallback message for UNKNOWN return code output +unknownmsg="Unknown error" +# Return code for nagios (Default to SUCCESS) +rc=0 +# Array of messages indicating drive health. Output after nagios status. +declare -a messages + +### Functions ### +main () +{ + preflight + + if [ "$raid" = true ] + then + areca_smart + areca_failed + elif [ "$raid" = false ] + then + normal_smart + else + echo "ERROR - Could not determine if RAID present" + exit 3 + fi + + ## Return UNKNOWN if no drives found + if [ "$numdrives" -eq "0" ] + then + unknownmsg="No drives found!" + rc=3 + fi + + ## Return code and service status for nagios + if [ "$rc" = 0 ] + then + echo "OK - All $numdrives drives healthy" + elif [ "$rc" = 1 ] + then + echo "WARNING - $failingdrives of $numdrives drives sick" + elif [ "$rc" = 2 ] + then + echo "CRITICAL - $failingdrives of $numdrives drives need replacing" + elif [ "$rc" = 3 ] + then + echo "UNKNOWN - $unknownmsg" + else + echo "ERROR - Got no return code" + fi + + ## Iterate through array of messages + # Nagios reads and displays the first line of output on the Services page. + # All individual messages about failed/failing disk statistics can be viewed + # on the individual system's SMART detail page in nagios. + for msg in "${messages[@]}" + do + echo "$msg" + done + + exit $rc +} + +# Pre-flight checks +preflight () +{ + # Set raid var then check for cli64 command and bail if missing + if lspci | grep -qi areca + then + raid=true + else + raid=false + fi + + if [ "$raid" = true ] && ! [ -x "$(command -v cli64)" ] + then + echo "ERROR - cli64 command not found or is not executable" + exit 3 + fi + + # Check for smartmontools and bail if missing + if ! [ -x "$(command -v smartctl)" ] + then + echo "ERROR - smartctl is not installed or is not executable" + echo "yum/apt-get install smartmontools" + exit 3 + fi +} + +# Gather smart data for drives behind Areca RAID controller +areca_smart () +{ + # Store output of cli64 to reduce repeated executions + cli64out=$(sudo cli64 disk info | grep -E "Slot#[[:digit:]]") + numdrives=$(echo "$cli64out" | wc -l) + # Loop through all disks not marked as 'N.A.' or 'Failed' + for slot in $(echo "$cli64out" | grep -v 'N.A.\|Failed' \ + | grep -o "Slot#[[:digit:]]" | cut -c6-) + do + failed=false + # Determine if disk is JBOD or part of hardware RAID + if echo "$cli64out" | grep -E "Slot#$slot" | grep -q 'JBOD' + then + jbod=true + else + jbod=false + fi + output=$(sudo cli64 disk smart drv=$slot \ + | grep -E "^ "5"|^"197"|^"198"" | awk '{ print $(NF-1) }' | tr '\n' ' ') + outputcount=$(echo $output | wc -w) + # Only continue if we received 3 SMART data points + if [ "$outputcount" = "3" ] + then + # Only do slot to drive letter matching once per bad JBOD + if [[ $output != "0 0 0 " ]] && [ "$jbod" = true ] + then + dl=$(areca_bay_to_letter $slot) + elif [ "$jbod" = false ] + then + dl="(RAID)" + fi + read reallocated pending uncorrect <<< $output + if [ "$reallocated" != "0" ] + then + messages+=("Drive $slot $dl has $reallocated reallocated sectors") + failed=true + # A small number of reallocated sectors is OK + if [ "$reallocated" -le 5 ] + then + rc=1 # Warn if <= 5 + else + rc=2 # Crit if >5 + fi + fi + if [ "$pending" != "0" ] + then + messages+=("Drive $slot $dl has $pending pending sectors") + failed=true + rc=2 + fi + if [ "$uncorrect" != "0" ] + then + messages+=("Drive $slot $dl has $uncorrect uncorrect sectors") + failed=true + rc=2 + fi + else + messages+=("Drive $slot returned $outputcount of 3 expected attributes") + unknownmsg="SMART data could not be read for one or more drives" + rc=3 + fi + # Make sure drives with multiple types of bad sectors only get counted once + if [ "$failed" = true ] + then + let "failingdrives+=1" + fi + done +} + +# Correlate Areca drive bay to drive letter +areca_bay_to_letter () +{ + # Get S/N according to RAID controller given argument $1 (slot #) + areca_serial=$(sudo cli64 disk info drv=$1 | grep 'Serial Number' \ + | awk '{ print $NF }') + # Loop through and get S/N according to smartctl given drive name + for dl in $(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' \ + | awk '{ print $NF }') + do + smart_serial=$(sudo smartctl -a /dev/$dl | grep "Serial number" \ + | awk '{ print $NF }') + # If cli64 and smartctl find a S/N match, return drive letter + if [ "$areca_serial" = "$smart_serial" ] + then + echo "($dl)" + fi + done +} + +# Tally missing and failed drives connected to Areca RAID +areca_failed () +{ + # Store output of cli64 to reduce repeated executions + cli64out=$(sudo cli64 disk info | grep -E "Slot#[[:digit:]]") + # Missing (N.A.) drives + for drive in $(echo "$cli64out" | grep -E "Slot#[[:digit:]]" \ + | grep "N.A." | awk '{ print $1 }') + do + messages+=("Drive $drive is missing") + let "failingdrives+=1" + rc=2 + done + # Hard failed drives + for drive in $(echo "$cli64out" | grep -E "Slot#[[:digit:]]" \ + | grep 'Failed' | awk '{ print $1 }') + do + messages+=("Drive $drive failed") + let "failingdrives+=1" + rc=2 + done +} + +# Standard SATA/SAS drive smartctl check +normal_smart () +{ + # The grep regex will include drives named sdaa, for example + numdrives=$(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' | wc -l) + for l in $(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' \ + | awk '{ print $NF }') + do + failed=false + output=$(sudo smartctl -a /dev/$l | grep -E "^ "5"|^"197"|^"198"" \ + | awk '{ print $NF }' | tr '\n' ' ') + outputcount=$(echo $output | wc -w) + # Check if drive is SSD and set var accordingly + if sudo smartctl -i /dev/$l | grep -q 'Solid State Device'; then + is_ssd=true + else + is_ssd=false + fi + # Only continue if we received 3 SMART data points and drive is not SSD + if [ "$outputcount" = "3" ] && [ "$is_ssd" = false ] + then + read reallocated pending uncorrect <<< $output + if [ "$reallocated" != "0" ] + then + messages+=("Drive $l has $reallocated reallocated sectors") + failed=true + # A small number of reallocated sectors is OK + if [ "$reallocated" -le 5 ] + then + rc=1 # Warn if <= 5 + else + rc=2 # Crit if >5 + fi + fi + if [ "$pending" != "0" ] + then + messages+=("Drive $l has $pending pending sectors") + failed=true + rc=2 + fi + if [ "$uncorrect" != "0" ] + then + messages+=("Drive $l has $uncorrect uncorrect sectors") + failed=true + rc=2 + fi + elif [ "$outputcount" != "3" ] && [ "$is_ssd" = false ] + then + messages+=("Drive $l returned $outputcount of 3 expected attributes") + unknownmsg="SMART data could not be read for one or more drives" + rc=3 + # Set no return code and assume any SSD is healthy for now + elif [ "$is_ssd" = true ] + then + messages+=("Drive $l is an SSD. Not yet supported.") + rc=0 + else + messages+=("Error processing data for drive $l") + rc=3 + fi + # Make sure drives with multiple types of bad sectors only get counted once + if [ "$failed" = true ] + then + let "failingdrives+=1" + fi + done +} + +## Call main() function +main diff --git a/roles/common/files/nagios/nrpe.te b/roles/common/files/nagios/nrpe.te new file mode 100644 index 0000000..c7bc886 --- /dev/null +++ b/roles/common/files/nagios/nrpe.te @@ -0,0 +1,12 @@ +module nrpe 1.0; + +require { + type fsadm_exec_t; + type nrpe_t; + type hwdata_t; + class file { read getattr open }; +} + +#============= nrpe_t ============== +allow nrpe_t fsadm_exec_t:file getattr; +allow nrpe_t hwdata_t:file { read getattr open }; diff --git a/roles/common/files/sbin/cli64 b/roles/common/files/sbin/cli64 new file mode 100644 index 0000000..7ef82de Binary files /dev/null and b/roles/common/files/sbin/cli64 differ diff --git a/roles/common/files/sbin/megacli b/roles/common/files/sbin/megacli new file mode 100755 index 0000000..50bf00b Binary files /dev/null and b/roles/common/files/sbin/megacli differ diff --git a/roles/common/handlers/main.yml b/roles/common/handlers/main.yml new file mode 100644 index 0000000..e2563ef --- /dev/null +++ b/roles/common/handlers/main.yml @@ -0,0 +1,5 @@ +--- +- name: restart nagios-nrpe-server + service: + name: "{{ nrpe_service_name }}" + state: restarted diff --git a/roles/common/tasks/disk_monitoring.yml b/roles/common/tasks/disk_monitoring.yml new file mode 100644 index 0000000..2d06a17 --- /dev/null +++ b/roles/common/tasks/disk_monitoring.yml @@ -0,0 +1,33 @@ +--- +# We use these scripts to check to see if any of our test nodes have bad disks + +- name: Upload megacli and cli64 for raid monitoring and smart.pl to /usr/sbin/. + copy: + src: "../files/sbin/{{ item }}" + dest: "/usr/sbin/{{ item }}" + owner: root + group: root + mode: 0755 + with_items: + - megacli + - cli64 + +- name: Create /usr/libexec. + file: + path: /usr/libexec + owner: root + group: root + mode: 0755 + state: directory + +- name: Upload custom netsaint scripts for raid/disk/smart/monitoring to /usr/libexec/. + copy: + src: "../files/libexec/{{ item }}" + dest: "/usr/libexec/{{ item }}" + owner: root + group: root + mode: 0755 + with_items: + - smart.sh + - raid.pl + - diskusage.pl diff --git a/roles/common/tasks/main.yml b/roles/common/tasks/main.yml index bf065e3..204cc68 100644 --- a/roles/common/tasks/main.yml +++ b/roles/common/tasks/main.yml @@ -26,3 +26,26 @@ - include: kerberos.yml tags: - kerberos + +# upload custom disk monitoring scripts +- include: disk_monitoring.yml + tags: + - monitoring-scripts + +# configure nagios +- include: nagios.yml + tags: + - nagios + +- name: Get SELinux status + command: getenforce + register: selinux_status + when: ansible_pkg_mgr == "yum" + tags: + - nagios + +# configure selinux for nagios +- include: nrpe-selinux.yml + when: selinux_status is defined and selinux_status.stdout != "Disabled" + tags: + - nagios diff --git a/roles/common/tasks/nagios.yml b/roles/common/tasks/nagios.yml new file mode 100644 index 0000000..259a229 --- /dev/null +++ b/roles/common/tasks/nagios.yml @@ -0,0 +1,54 @@ +--- +- name: Upload nagios sudoers.d for raid utilities. + template: + src: nagios/90-nagios + dest: /etc/sudoers.d/90-nagios + owner: root + group: root + mode: 0440 + validate: visudo -cf %s + +- name: Configure nagios nrpe settings (Ubuntu) + lineinfile: + dest: /etc/default/{{ nrpe_service_name }} + regexp: "^DAEMON_OPTS" + line: "DAEMON_OPTS=\"--no-ssl\"" + when: ansible_pkg_mgr == "apt" + +- name: Configure nagios nrpe settings (RHEL/CentOS) + lineinfile: + dest: /etc/sysconfig/{{ nrpe_service_name }} + regexp: "^NRPE_SSL_OPT" + line: "NRPE_SSL_OPT=\"-n\"" + when: ansible_pkg_mgr == "yum" + +- name: Check firewalld status + command: systemctl status firewalld + register: firewalld + ignore_errors: true + no_log: true + when: ansible_pkg_mgr == "yum" + +- name: Open nrpe port if firewalld enabled + firewalld: + port: 5666/tcp + state: enabled + permanent: yes + immediate: yes + when: ansible_pkg_mgr == "yum" and (firewalld is defined and firewalld.stdout.find('running') != -1) + +- name: Upload nagios nrpe config. + template: + src: nagios/nrpe.cfg + dest: /etc/nagios/nrpe.cfg + owner: root + group: root + mode: 0644 + notify: + - restart nagios-nrpe-server + +- name: Make sure nagios nrpe service is running. + service: + name: "{{ nrpe_service_name }}" + enabled: yes + state: started diff --git a/roles/common/tasks/nrpe-selinux.yml b/roles/common/tasks/nrpe-selinux.yml new file mode 100644 index 0000000..877aa2e --- /dev/null +++ b/roles/common/tasks/nrpe-selinux.yml @@ -0,0 +1,44 @@ +--- +- name: nrpe - Install semanage python bindings + yum: + pkg: libsemanage-python + state: installed + +- name: nrpe - Install SELinux tools + yum: + pkg: policycoreutils-python + state: installed + +- name: nrpe - Ensure SELinux policy is up to date + yum: + pkg: selinux-policy-targeted + state: latest + +- name: nrpe - Set SELinux boolean nagios_run_sudo true + seboolean: + name: nagios_run_sudo + state: yes + persistent: yes + +- name: nrpe - Remove SELinux policy package + command: semodule -r nrpe + failed_when: false + +- name: nrpe - Copy SELinux type enforcement file + copy: + src: nagios/nrpe.te + dest: /tmp/nrpe.te + +- name: nrpe - Compile SELinux module file + command: checkmodule -M -m -o /tmp/nrpe.mod /tmp/nrpe.te + +- name: nrpe - Build SELinux policy package + command: semodule_package -o /tmp/nrpe.pp -m /tmp/nrpe.mod + +- name: nrpe - Load SELinux policy package + command: semodule -i /tmp/nrpe.pp + +- name: nrpe - Remove temporary files + file: + path: /tmp/nrpe.* + state: absent diff --git a/roles/common/templates/nagios/90-nagios b/roles/common/templates/nagios/90-nagios new file mode 100644 index 0000000..34326fb --- /dev/null +++ b/roles/common/templates/nagios/90-nagios @@ -0,0 +1,2 @@ +## {{ ansible_managed }} +{{ nrpe_user }} ALL=NOPASSWD: /usr/sbin/megacli, /usr/sbin/cli64, /usr/sbin/smartctl, /usr/sbin/smartctl diff --git a/roles/common/templates/nagios/nrpe.cfg b/roles/common/templates/nagios/nrpe.cfg new file mode 100644 index 0000000..84435c8 --- /dev/null +++ b/roles/common/templates/nagios/nrpe.cfg @@ -0,0 +1,29 @@ +# {{ ansible_managed }} +log_facility=daemon +pid_file=/var/run/nagios/nrpe.pid +server_port=5666 +nrpe_user={{ nrpe_user }} +nrpe_group={{ nrpe_group }} + +# These should eventually be in a secrets group_var +# 172. address is sepia nagios server +# 10. address is octo nagios server +allowed_hosts=127.0.0.1,172.21.0.33,10.8.0.8 +dont_blame_nrpe=0 +debug=0 +command_timeout=60 +connection_timeout=300 + +command[check_users]={{ nagios_plugins_directory }}/check_users --warning=5 --critical=10 +command[check_load]={{ nagios_plugins_directory }}/check_load --percpu --warning=1.5,1.4,1.3 --critical=2.0,1.9,1.8 +command[check_hda1]={{ nagios_plugins_directory }}/check_disk --warning=20% --critical=10% --partition=/dev/hda1 +command[check_root]={{ nagios_plugins_directory }}/check_disk --warning=10% --critical=5% --units=GB --path=/ +command[check_zombie_procs]={{ nagios_plugins_directory }}/check_procs --warning=5 --critical=10 --state=Z +command[check_total_procs]={{ nagios_plugins_directory }}/check_procs --warning=300 --critical=500 +command[check_raid]=/usr/libexec/raid.pl +command[check_disks]=/usr/libexec/diskusage.pl 90 95 +command[check_smart]=/usr/libexec/smart.sh + +include=/etc/nagios/nrpe_local.cfg + +include_dir=/etc/nagios/nrpe.d/ diff --git a/roles/common/vars/apt_systems.yml b/roles/common/vars/apt_systems.yml new file mode 100644 index 0000000..066314d --- /dev/null +++ b/roles/common/vars/apt_systems.yml @@ -0,0 +1,5 @@ +--- +nrpe_service_name: nagios-nrpe-server +nrpe_user: nagios +nrpe_group: nagios +nagios_plugins_directory: /usr/lib/nagios/plugins diff --git a/roles/common/vars/yum_systems.yml b/roles/common/vars/yum_systems.yml new file mode 100644 index 0000000..d7b4ed2 --- /dev/null +++ b/roles/common/vars/yum_systems.yml @@ -0,0 +1,5 @@ +--- +nrpe_service_name: nrpe +nrpe_user: nrpe +nrpe_group: nrpe +nagios_plugins_directory: /usr/lib64/nagios/plugins diff --git a/roles/testnode/files/libexec/diskusage.pl b/roles/testnode/files/libexec/diskusage.pl deleted file mode 100644 index 49200da..0000000 --- a/roles/testnode/files/libexec/diskusage.pl +++ /dev/null @@ -1,123 +0,0 @@ -#!/usr/bin/perl - -# {{ ansible_managed }} - -#****************************************************************************************** -# -# NRPE DISK USAGE PLUGIN -# -# Program: Disk Usage plugin written to be used with Netsaint and NRPE -# License: GPL -# Copyright (c) 2000 Jeremy Hanmer (jeremy@newdream.net) -# -# Last Modified: 10/23/00 -# -# Information: Basically, I wrote this because I had to deal with large numbers of -# machines with a wide range of disk configurations, and with dynamically mounted -# partitions. The basic check_disk plugin relied on a static configuration file which -# doesn't lend itself to being used in a heterogeneous environnment (especially when -# you can't guarantee that the devices listed in the configuration file will be mounted). -# -# Bugs: Currently, this plugin only works on EXT2 partitions (although it's easy to change). -# -# Command Line: diskusage.pl -# -# Tested Systems: Mandrake 7.1/Intel, Debian 2.2/Intel, Debian 2.1/Intel -# -# License Information: -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -# -#******************************************************************************************* - - -use strict; - -my $wrn = shift @ARGV; -my $crt = shift @ARGV; -my $output; -my $count; -my %type; -my $result = 0; -my $warn = 0; -my $crit = 0; -my @parts; -my $hostname = `hostname`; -chomp $hostname; -@parts = `mount | grep -vi fuse`; - -#if ( $hostname eq 'zartan' ) { -# @parts = `mount`; -#} -#else { -# @parts = `mount -t ext2,reiserfs`; -#} -for (@parts) { - my ($dev,$on,$mount,$tp,$type,$options) = split(/\s+/,$_); - next if ($type eq 'nfs' && !($hostname eq 'zartan')); - next if ($type eq 'proc' || $type eq 'devpts'); - my @df= `df -k $mount`; - my @df_inode = `df -i $mount`; -# print "$dev $mount $type\n"; - shift @df; - shift @df_inode; - for(@df) { - my ($dev1,$blocks,$used,$free,$pc,$mount) = split(/\s+/,$_); - my ($percent,$blah) = split(/\%/,$pc); - if ( ($percent >= $wrn ) && (!($percent >= $crt) || ($mount =~ m/\/mnt\//)) ) { - $output .= "$mount is at $pc "; - $warn = 1; - } - if ( ($percent >= $crt ) && !($mount =~ m/\/mnt\//) ){ - $output = "" unless $crit eq '1'; - $output .= "$mount is at $pc "; - $crit = 1; - } - } - for(@df_inode) { - my ($dev1,$inodes,$used,$free,$pc,$mount) = split(/\s+/,$_); - my ($percent,$blah) = split(/\%/,$pc); - if ( ($percent >= $wrn ) && (!($percent >= $crt) ) ) { - $output .= "$mount is at $pc inode usage "; - $warn = 1; - } - if ( ($percent >= $crt ) && !($mount =~ m/\/mnt\//) ){ - $output = "" unless $crit eq '1'; - $output .= "$mount is at $pc inode usage "; - $crit = 1; - } - } - } - - -#if ( ($warn eq '1') && !($crit eq '1') ) { -# print "$output\n"; -# $result = 1; -# } -if ( $crit eq '1' ) { - print "$output\n"; - $result = 2; -} - -else { - print "Disks are OK now\n"; -} - - -#if ( !( $crit eq '1' ) && !( $warn eq '1' ) ) { -# print "Disks are ok now\n"; -#} -#print "$result\n"; -exit $result; diff --git a/roles/testnode/files/libexec/raid.pl b/roles/testnode/files/libexec/raid.pl deleted file mode 100755 index f65eedd..0000000 --- a/roles/testnode/files/libexec/raid.pl +++ /dev/null @@ -1,313 +0,0 @@ -#!/usr/bin/perl - -# {{ ansible_managed }} - -use strict; - -my $warn; -my $crit; -my $out; - -my @out; -my $devices; -my $pci; -my $scsi; -my $derp; - -$pci = `/usr/bin/lspci | /bin/grep -i raid | /bin/grep -v PATA | /usr/bin/head -2`; -$scsi = `/usr/bin/lspci | /bin/grep -i scsi | /bin/grep -v PATA | /usr/bin/head -1`; - -# software raid! -if (-e "/proc/mdstat") { - # check software raid! -# open(R,"/tmp/mdstat"); - open(R,"/proc/mdstat"); - while () { - if (/^(md\d+) : (\w+)/) { - my $dev = $1; - my $status = $2; - my $rest = ; - $devices++; - - my ($disks,$states) = $rest =~ /(\[.*\]) (\[.*\])/; - my $mout .= "$dev is $status $disks $states" if $states =~ /_/; - - # recovery? - my $next = ; # possibly recovery? - if ($next =~ / recovery = /) { - my ($progress,$per) = $next =~ /(\[.*\])\s+recovery =\s+(\S+%)/; - $mout .= " recovery $per"; - my $next = ; - if (my ($finish,$speed) = $next =~ /finish=(.*)min speed=(.*)\/sec/) { - $mout .= " finish $finish min"; - } - $warn = 1; - } elsif ($next =~ / resync = /) { - my ($progress,$per) = $next =~ /(\[.*\])\s+resync =\s+(\S+%)/; - $mout .= " resync $per"; - if (my ($finish,$speed) = $next =~ /finish=(.*)min speed=(.*)\/sec/) { - $mout .= " finish $finish min"; - } - $warn = 1; - } elsif ($states =~ /_/) { # not all U - $crit = 1; - } - - push( @out, $mout ) if $mout; - } - } -} - - -# mylex raid! -if ($pci =~ /Mylex/i) { -#if (1) { - my $s = `cat /proc/rd/status`; - chomp($s); - unless ($s =~ /OK/) { - my @myinfo; - for my $ctl (`ls -d /proc/rd/c*`) { -# for my $ctl ('/proc/rd/c0') { - chomp $ctl; - my %bad; - my ($c) = $ctl =~ /\/(c\d)$/; - open(S,"$ctl/current_status") || print "can't open $ctl/current_status\n";; -# open(S,"/tmp/mylex.bad"); - my $lastdevice; - while () { - # disk status - if (/^ (\d:\d) Vendor/) { - $lastdevice = $1; - } - if (/ Disk Status: (\S+),/) { - if ($1 ne 'Online') { - push( @myinfo, "$c disk $lastdevice $1"); - } - } - - # logical drives - if (/ (\/dev\/rd\/\S+): (\S+), (\w+),/) { - my $dev = $1; - my $type = $2; - my $status = $3; - $devices++; - $bad{$dev} = 1; - if ($status ne 'Online') { - push( @myinfo, "$dev ($type) $status"); - } - } - - # rebuild? - if (/ Rebuild in Progress: .* \((\S+)\) (\d+%) completed/) { - push( @myinfo, "$1 rebuild $2 complete" ); - delete $bad{$1}; - } - } - if (keys %bad) { - $crit = 1; # at least 1 is failed and !recovering - } else { - $warn = 1; # all are recovering - } - } - - push( @out, "Mylex $s: " . join(', ',@myinfo)) if @myinfo; - } -} - - -# icp vortex raid! -if ( $pci =~ /intel/i) { - opendir(D,"/proc/scsi/gdth"); - my @dev = readdir(D); - closedir D; - my @vortex; - for my $dev (@dev) { - next if $dev =~ /^\./; - my $read = `cat /proc/scsi/gdth/$dev`; - # my $read = `cat /tmp/asdf9.warn`; - my $cur; # Logical | Physical | Host | Array - my @myinfo; -# print "dev $dev\n"; - for $_ (split(/\n/,$read)) { - chomp; - if (/^\w/) { - # new section - ($cur) = /^(\w+)/; -# print "cur = $cur\n"; - next; - } - if ($cur eq 'Logical') { - my ($num,$status) = /Number:\s+(\d+)\s+Status:\s+(\w+)/; - next unless $status; - if ($status ne 'ok') { - $warn = 1; - #push( @myinfo, "Logical #$num $status" ); - unshift( @myinfo, "Logical #$num $status" ); - } - } - if ($cur eq 'Array') { - my ($num,$status) = /Number:\s+(\d+)\s+Status:\s+(\w+)/; - next unless $status; - if ($status ne 'ready') { - $warn = 1; - #push( @myinfo, "Array #$num $status" ); - unshift( @myinfo, "Array #$num $status" ); - } - } - if ($cur eq 'Host') { - if (/Number/) { - $devices++; - } - } - if ($cur eq 'Controller') { - # push( @myinfo, $_ ); - unshift( @myinfo, $_ ); - } - } - - if (@myinfo) { - # push( @vortex, "dev $dev: " . join(', ', @myinfo) ); - # unshift( @vortex, "dev $dev: " . join(', ', @myinfo) ); - push( @vortex, "dev $dev: " . join(', ', $myinfo[0], $myinfo[1], $myinfo[2], $myinfo[3], $myinfo[4] ) ); - # $warn = 1; - } - } - - if (@vortex) { - # push( @out, 'Vortex: ' . join('. ', @vortex) ); - push( @out, 'Vortex: ' . join('. ', @vortex) ); - } -} -# SAS megaraid -if ( $pci =~ /LSI\ Logic/i) { - my $read = `/usr/bin/sudo /usr/sbin/megacli -LDInfo -lall -a0`; - for $_ (split(/\n/,$read)) { - chomp; - # The line we care about is State: Optimal, if we don't have that, we've problems - if ($_ =~/^State\s*\:\s*(.*)/m) { - $devices++; - #/^State\?:\s?(\w+)/; - my $state = $1; - next unless $state; - if ($state ne 'Optimal') { - my $rebuild = `/usr/bin/sudo /usr/sbin/megacli -PDList -a0 | /bin/grep -i firmware`; - if ( $rebuild =~ /Rebuild/i) { - my $enclosure = `/usr/bin/sudo /usr/sbin/megacli -PDList -a0 | /bin/grep -B15 Rebuild | /bin/grep -e Enclosure -e Slot | /usr/bin/cut -d':' -f2 | /usr/bin/awk '{printf \$1\":\"}' | /usr/bin/awk -F ":" '{printf \$1":"\$2}'`; - #my $rebuildstatus = `/usr/bin/sudo /usr/sbin/megacli -PDRbld -ShowProg -PhysDrv\[$enclosure\] -a0 | /bin/grep -i rebuild`; - my $rebuildstatus = `/usr/bin/sudo /usr/sbin/megacli -PDRbld -ShowProg -PhysDrv\[$enclosure\] -a0 | /bin/egrep -i \'\(rebuild\|not found\)\'`; - if ($rebuildstatus =~ /not found/m) { - # check by device id instead of enclosure id if we get a not found error above - $enclosure = `/usr/bin/sudo /usr/sbin/megacli -PDList -a0 | /bin/grep -B15 Rebuild | /bin/grep -e Enclosure -e Slot | /bin/grep -v position | /usr/bin/cut -d':' -f2 | /usr/bin/awk '{printf \$1\":\"}' | /usr/bin/awk -F ":" '{printf \$1":"\$2}'`; - $rebuildstatus = `/usr/bin/sudo /usr/sbin/megacli -PDRbld -ShowProg -PhysDrv\[$enclosure\] -a0 | /bin/grep -i rebuild`; - } - for $_ ($rebuildstatus) { - $crit = 1; - push(@out,$_); - } - } else { - $crit = 1; - my $virtual=`/usr/bin/sudo /usr/sbin/megacli -LDInfo -lall -a0 | grep -i failed -B6 | grep -i virtual | cut -d'(' -f1`; - push(@out, $virtual, $_); - } - } - } - # Should to catch the syntax or permissions errors this thing spits out - if (/ERROR/i) { - $crit = 1; - push(@out, $_); - foreach my $k (@out) - { - print $_; - } - } - } -} - -# e3ware -if ( $pci =~ /3ware/i) { - open(CLI,"/usr/bin/sudo /usr/sbin/tw_cli show|"); - #my $read = `/usr/sbin/megacli -LDInfo -l0 -a0`; - - $devices++; - my @controllers; - while () { - if ( $_ =~ /^c[0-9]/ ) { - my ($c) = split(/\s+/,$_); - push(@controllers,$c); - } - } - close(CLI); - - foreach my $cont (@controllers) { - open(CLI,"/usr/bin/sudo /usr/sbin/tw_cli /$cont show|"); - while () { - if ( $_ =~ /^u[0-9]+/ ) { - my @info = split(/\s+/,$_); - if ( $info[2] ne 'OK' ) { - if ( $info[2] =~ /REBUILDING/i) { - my $rebuildstatus = `/usr/bin/sudo /usr/sbin/tw_cli /$cont/$info[0] show | /bin/grep REBUILD | /bin/grep -v RAID-10`; - for $_ ($rebuildstatus) { - $crit = 1; - push(@out,$_); - } - } else { - $crit = 1; - push(@out,$_); - } - } - } - if ( $_ =~ /^p[0-9]+/ ) { - my @info = split(/\s+/,$_); - if ( $info[1] ne 'OK' ) { - $crit = 1; - push(@out,$_); - } - } - } - } -} - -#Areca - -if ( $pci =~ /areca/i) { - open(CLI,"sudo /usr/sbin/cli64 vsf info|"); - while () { - if ( $_ =~ /^\ \ [0-9]+/ ) { - $devices++; - my @info = split(/\s+/,$_); - if ( $_ !~ /Normal/i) { - $crit = 1; - push(@out,$_); - } - } - } - } - -if ( $scsi =~ /LSI Logic/i) { - open(CLI,"sudo /usr/sbin/mpt-status | /usr/bin/head -1 |"); - $devices++; - while () { - if ( $_ =~ /^ioc/ ) { - my @info = split(/\s+/,$_); - if ( $info[10] ne 'OPTIMAL,' ) { - $crit = 1; - push(@out,$_); - } - } - } - } - -# show results -my $result = 0; -$result = 1 if $warn; -$result = 2 if $crit; -# print "warn = $warn crit = $crit\n"; -print $derp; -my $out = "No raid devices found $pci"; -$out = "All $devices raid devices happy as clams" if $devices; -if (@out) { - $out = join('; ', @out); -} - -print "$out\n"; -exit $result; diff --git a/roles/testnode/files/libexec/smart.sh b/roles/testnode/files/libexec/smart.sh deleted file mode 100755 index 2f71a60..0000000 --- a/roles/testnode/files/libexec/smart.sh +++ /dev/null @@ -1,290 +0,0 @@ -#!/bin/bash -# Description: Bash script to check drive health using pending, uncorrectable, -# and reallocated sector count -# -# Nagios return codes: 0 = OK; 1 = WARNING; 2 = CRITICAL; 3 = UNKNOWN -# SMART Attribute Codes: -# 5 = Reallocated -# 187 = Reported Uncorrect -# 197 = Pending -# 198 = Uncorrectable Sector Count -# -# TO-DO: Add support for dynamic SMART attribute lookup. For example, -# 187 is reported for Seagate HDD and all SSDs but not Hitachi HDDs. -# -# See https://en.wikipedia.org/wiki/S.M.A.R.T.#ATA_S.M.A.R.T._attributes - -### Define global variables ### -# total number of drives (or RAID slots) discovered -numdrives=0 -# Number of failed, failing, and/or missing drives -failingdrives=0 -# Fallback message for UNKNOWN return code output -unknownmsg="Unknown error" -# Return code for nagios (Default to SUCCESS) -rc=0 -# Array of messages indicating drive health. Output after nagios status. -declare -a messages - -### Functions ### -main () -{ - preflight - - if [ "$raid" = true ] - then - areca_smart - areca_failed - elif [ "$raid" = false ] - then - normal_smart - else - echo "ERROR - Could not determine if RAID present" - exit 3 - fi - - ## Return UNKNOWN if no drives found - if [ "$numdrives" -eq "0" ] - then - unknownmsg="No drives found!" - rc=3 - fi - - ## Return code and service status for nagios - if [ "$rc" = 0 ] - then - echo "OK - All $numdrives drives healthy" - elif [ "$rc" = 1 ] - then - echo "WARNING - $failingdrives of $numdrives drives sick" - elif [ "$rc" = 2 ] - then - echo "CRITICAL - $failingdrives of $numdrives drives need replacing" - elif [ "$rc" = 3 ] - then - echo "UNKNOWN - $unknownmsg" - else - echo "ERROR - Got no return code" - fi - - ## Iterate through array of messages - # Nagios reads and displays the first line of output on the Services page. - # All individual messages about failed/failing disk statistics can be viewed - # on the individual system's SMART detail page in nagios. - for msg in "${messages[@]}" - do - echo "$msg" - done - - exit $rc -} - -# Pre-flight checks -preflight () -{ - # Set raid var then check for cli64 command and bail if missing - if lspci | grep -qi areca - then - raid=true - else - raid=false - fi - - if [ "$raid" = true ] && ! [ -x "$(command -v cli64)" ] - then - echo "ERROR - cli64 command not found or is not executable" - exit 3 - fi - - # Check for smartmontools and bail if missing - if ! [ -x "$(command -v smartctl)" ] - then - echo "ERROR - smartctl is not installed or is not executable" - echo "yum/apt-get install smartmontools" - exit 3 - fi -} - -# Gather smart data for drives behind Areca RAID controller -areca_smart () -{ - # Store output of cli64 to reduce repeated executions - cli64out=$(sudo cli64 disk info | grep -E "Slot#[[:digit:]]") - numdrives=$(echo "$cli64out" | wc -l) - # Loop through all disks not marked as 'N.A.' or 'Failed' - for slot in $(echo "$cli64out" | grep -v 'N.A.\|Failed' \ - | grep -o "Slot#[[:digit:]]" | cut -c6-) - do - failed=false - # Determine if disk is JBOD or part of hardware RAID - if echo "$cli64out" | grep -E "Slot#$slot" | grep -q 'JBOD' - then - jbod=true - else - jbod=false - fi - output=$(sudo cli64 disk smart drv=$slot \ - | grep -E "^ "5"|^"197"|^"198"" | awk '{ print $(NF-1) }' | tr '\n' ' ') - outputcount=$(echo $output | wc -w) - # Only continue if we received 3 SMART data points - if [ "$outputcount" = "3" ] - then - # Only do slot to drive letter matching once per bad JBOD - if [[ $output != "0 0 0 " ]] && [ "$jbod" = true ] - then - dl=$(areca_bay_to_letter $slot) - elif [ "$jbod" = false ] - then - dl="(RAID)" - fi - read reallocated pending uncorrect <<< $output - if [ "$reallocated" != "0" ] - then - messages+=("Drive $slot $dl has $reallocated reallocated sectors") - failed=true - # A small number of reallocated sectors is OK - if [ "$reallocated" -le 5 ] - then - rc=1 # Warn if <= 5 - else - rc=2 # Crit if >5 - fi - fi - if [ "$pending" != "0" ] - then - messages+=("Drive $slot $dl has $pending pending sectors") - failed=true - rc=2 - fi - if [ "$uncorrect" != "0" ] - then - messages+=("Drive $slot $dl has $uncorrect uncorrect sectors") - failed=true - rc=2 - fi - else - messages+=("Drive $slot returned $outputcount of 3 expected attributes") - unknownmsg="SMART data could not be read for one or more drives" - rc=3 - fi - # Make sure drives with multiple types of bad sectors only get counted once - if [ "$failed" = true ] - then - let "failingdrives+=1" - fi - done -} - -# Correlate Areca drive bay to drive letter -areca_bay_to_letter () -{ - # Get S/N according to RAID controller given argument $1 (slot #) - areca_serial=$(sudo cli64 disk info drv=$1 | grep 'Serial Number' \ - | awk '{ print $NF }') - # Loop through and get S/N according to smartctl given drive name - for dl in $(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' \ - | awk '{ print $NF }') - do - smart_serial=$(sudo smartctl -a /dev/$dl | grep "Serial number" \ - | awk '{ print $NF }') - # If cli64 and smartctl find a S/N match, return drive letter - if [ "$areca_serial" = "$smart_serial" ] - then - echo "($dl)" - fi - done -} - -# Tally missing and failed drives connected to Areca RAID -areca_failed () -{ - # Store output of cli64 to reduce repeated executions - cli64out=$(sudo cli64 disk info | grep -E "Slot#[[:digit:]]") - # Missing (N.A.) drives - for drive in $(echo "$cli64out" | grep -E "Slot#[[:digit:]]" \ - | grep "N.A." | awk '{ print $1 }') - do - messages+=("Drive $drive is missing") - let "failingdrives+=1" - rc=2 - done - # Hard failed drives - for drive in $(echo "$cli64out" | grep -E "Slot#[[:digit:]]" \ - | grep 'Failed' | awk '{ print $1 }') - do - messages+=("Drive $drive failed") - let "failingdrives+=1" - rc=2 - done -} - -# Standard SATA/SAS drive smartctl check -normal_smart () -{ - # The grep regex will include drives named sdaa, for example - numdrives=$(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' | wc -l) - for l in $(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' \ - | awk '{ print $NF }') - do - failed=false - output=$(sudo smartctl -a /dev/$l | grep -E "^ "5"|^"197"|^"198"" \ - | awk '{ print $NF }' | tr '\n' ' ') - outputcount=$(echo $output | wc -w) - # Check if drive is SSD and set var accordingly - if sudo smartctl -i /dev/$l | grep -q 'Solid State Device'; then - is_ssd=true - else - is_ssd=false - fi - # Only continue if we received 3 SMART data points and drive is not SSD - if [ "$outputcount" = "3" ] && [ "$is_ssd" = false ] - then - read reallocated pending uncorrect <<< $output - if [ "$reallocated" != "0" ] - then - messages+=("Drive $l has $reallocated reallocated sectors") - failed=true - # A small number of reallocated sectors is OK - if [ "$reallocated" -le 5 ] - then - rc=1 # Warn if <= 5 - else - rc=2 # Crit if >5 - fi - fi - if [ "$pending" != "0" ] - then - messages+=("Drive $l has $pending pending sectors") - failed=true - rc=2 - fi - if [ "$uncorrect" != "0" ] - then - messages+=("Drive $l has $uncorrect uncorrect sectors") - failed=true - rc=2 - fi - elif [ "$outputcount" != "3" ] && [ "$is_ssd" = false ] - then - messages+=("Drive $l returned $outputcount of 3 expected attributes") - unknownmsg="SMART data could not be read for one or more drives" - rc=3 - # Set no return code and assume any SSD is healthy for now - elif [ "$is_ssd" = true ] - then - messages+=("Drive $l is an SSD. Not yet supported.") - rc=0 - else - messages+=("Error processing data for drive $l") - rc=3 - fi - # Make sure drives with multiple types of bad sectors only get counted once - if [ "$failed" = true ] - then - let "failingdrives+=1" - fi - done -} - -## Call main() function -main diff --git a/roles/testnode/files/nagios/nrpe.te b/roles/testnode/files/nagios/nrpe.te deleted file mode 100644 index c7bc886..0000000 --- a/roles/testnode/files/nagios/nrpe.te +++ /dev/null @@ -1,12 +0,0 @@ -module nrpe 1.0; - -require { - type fsadm_exec_t; - type nrpe_t; - type hwdata_t; - class file { read getattr open }; -} - -#============= nrpe_t ============== -allow nrpe_t fsadm_exec_t:file getattr; -allow nrpe_t hwdata_t:file { read getattr open }; diff --git a/roles/testnode/files/sbin/cli64 b/roles/testnode/files/sbin/cli64 deleted file mode 100644 index 7ef82de..0000000 Binary files a/roles/testnode/files/sbin/cli64 and /dev/null differ diff --git a/roles/testnode/files/sbin/megacli b/roles/testnode/files/sbin/megacli deleted file mode 100755 index 50bf00b..0000000 Binary files a/roles/testnode/files/sbin/megacli and /dev/null differ diff --git a/roles/testnode/handlers/main.yml b/roles/testnode/handlers/main.yml index a87e910..e3a24c6 100644 --- a/roles/testnode/handlers/main.yml +++ b/roles/testnode/handlers/main.yml @@ -25,8 +25,3 @@ service: name: cron state: restarted - -- name: restart nagios-nrpe-server - service: - name: "{{ nrpe_service_name }}" - state: restarted diff --git a/roles/testnode/tasks/disk_monitoring.yml b/roles/testnode/tasks/disk_monitoring.yml deleted file mode 100644 index 2d06a17..0000000 --- a/roles/testnode/tasks/disk_monitoring.yml +++ /dev/null @@ -1,33 +0,0 @@ ---- -# We use these scripts to check to see if any of our test nodes have bad disks - -- name: Upload megacli and cli64 for raid monitoring and smart.pl to /usr/sbin/. - copy: - src: "../files/sbin/{{ item }}" - dest: "/usr/sbin/{{ item }}" - owner: root - group: root - mode: 0755 - with_items: - - megacli - - cli64 - -- name: Create /usr/libexec. - file: - path: /usr/libexec - owner: root - group: root - mode: 0755 - state: directory - -- name: Upload custom netsaint scripts for raid/disk/smart/monitoring to /usr/libexec/. - copy: - src: "../files/libexec/{{ item }}" - dest: "/usr/libexec/{{ item }}" - owner: root - group: root - mode: 0755 - with_items: - - smart.sh - - raid.pl - - diskusage.pl diff --git a/roles/testnode/tasks/main.yml b/roles/testnode/tasks/main.yml index e04368e..033d386 100644 --- a/roles/testnode/tasks/main.yml +++ b/roles/testnode/tasks/main.yml @@ -72,29 +72,6 @@ tags: - cpan -# upload custom disk monitoring scripts -- include: disk_monitoring.yml - tags: - - monitoring-scripts - -# configure nagios -- include: nagios.yml - tags: - - nagios - -- name: Get SELinux status - command: getenforce - register: selinux_status - when: ansible_pkg_mgr == "yum" - tags: - - nagios - -# configure selinux for nagios -- include: nrpe-selinux.yml - when: selinux_status is defined and selinux_status.stdout != "Disabled" - tags: - - nagios - # configure ntp - include: ntp.yml tags: diff --git a/roles/testnode/tasks/nagios.yml b/roles/testnode/tasks/nagios.yml deleted file mode 100644 index 259a229..0000000 --- a/roles/testnode/tasks/nagios.yml +++ /dev/null @@ -1,54 +0,0 @@ ---- -- name: Upload nagios sudoers.d for raid utilities. - template: - src: nagios/90-nagios - dest: /etc/sudoers.d/90-nagios - owner: root - group: root - mode: 0440 - validate: visudo -cf %s - -- name: Configure nagios nrpe settings (Ubuntu) - lineinfile: - dest: /etc/default/{{ nrpe_service_name }} - regexp: "^DAEMON_OPTS" - line: "DAEMON_OPTS=\"--no-ssl\"" - when: ansible_pkg_mgr == "apt" - -- name: Configure nagios nrpe settings (RHEL/CentOS) - lineinfile: - dest: /etc/sysconfig/{{ nrpe_service_name }} - regexp: "^NRPE_SSL_OPT" - line: "NRPE_SSL_OPT=\"-n\"" - when: ansible_pkg_mgr == "yum" - -- name: Check firewalld status - command: systemctl status firewalld - register: firewalld - ignore_errors: true - no_log: true - when: ansible_pkg_mgr == "yum" - -- name: Open nrpe port if firewalld enabled - firewalld: - port: 5666/tcp - state: enabled - permanent: yes - immediate: yes - when: ansible_pkg_mgr == "yum" and (firewalld is defined and firewalld.stdout.find('running') != -1) - -- name: Upload nagios nrpe config. - template: - src: nagios/nrpe.cfg - dest: /etc/nagios/nrpe.cfg - owner: root - group: root - mode: 0644 - notify: - - restart nagios-nrpe-server - -- name: Make sure nagios nrpe service is running. - service: - name: "{{ nrpe_service_name }}" - enabled: yes - state: started diff --git a/roles/testnode/tasks/nrpe-selinux.yml b/roles/testnode/tasks/nrpe-selinux.yml deleted file mode 100644 index 877aa2e..0000000 --- a/roles/testnode/tasks/nrpe-selinux.yml +++ /dev/null @@ -1,44 +0,0 @@ ---- -- name: nrpe - Install semanage python bindings - yum: - pkg: libsemanage-python - state: installed - -- name: nrpe - Install SELinux tools - yum: - pkg: policycoreutils-python - state: installed - -- name: nrpe - Ensure SELinux policy is up to date - yum: - pkg: selinux-policy-targeted - state: latest - -- name: nrpe - Set SELinux boolean nagios_run_sudo true - seboolean: - name: nagios_run_sudo - state: yes - persistent: yes - -- name: nrpe - Remove SELinux policy package - command: semodule -r nrpe - failed_when: false - -- name: nrpe - Copy SELinux type enforcement file - copy: - src: nagios/nrpe.te - dest: /tmp/nrpe.te - -- name: nrpe - Compile SELinux module file - command: checkmodule -M -m -o /tmp/nrpe.mod /tmp/nrpe.te - -- name: nrpe - Build SELinux policy package - command: semodule_package -o /tmp/nrpe.pp -m /tmp/nrpe.mod - -- name: nrpe - Load SELinux policy package - command: semodule -i /tmp/nrpe.pp - -- name: nrpe - Remove temporary files - file: - path: /tmp/nrpe.* - state: absent diff --git a/roles/testnode/templates/nagios/90-nagios b/roles/testnode/templates/nagios/90-nagios deleted file mode 100644 index 34326fb..0000000 --- a/roles/testnode/templates/nagios/90-nagios +++ /dev/null @@ -1,2 +0,0 @@ -## {{ ansible_managed }} -{{ nrpe_user }} ALL=NOPASSWD: /usr/sbin/megacli, /usr/sbin/cli64, /usr/sbin/smartctl, /usr/sbin/smartctl diff --git a/roles/testnode/templates/nagios/nrpe.cfg b/roles/testnode/templates/nagios/nrpe.cfg deleted file mode 100644 index 84435c8..0000000 --- a/roles/testnode/templates/nagios/nrpe.cfg +++ /dev/null @@ -1,29 +0,0 @@ -# {{ ansible_managed }} -log_facility=daemon -pid_file=/var/run/nagios/nrpe.pid -server_port=5666 -nrpe_user={{ nrpe_user }} -nrpe_group={{ nrpe_group }} - -# These should eventually be in a secrets group_var -# 172. address is sepia nagios server -# 10. address is octo nagios server -allowed_hosts=127.0.0.1,172.21.0.33,10.8.0.8 -dont_blame_nrpe=0 -debug=0 -command_timeout=60 -connection_timeout=300 - -command[check_users]={{ nagios_plugins_directory }}/check_users --warning=5 --critical=10 -command[check_load]={{ nagios_plugins_directory }}/check_load --percpu --warning=1.5,1.4,1.3 --critical=2.0,1.9,1.8 -command[check_hda1]={{ nagios_plugins_directory }}/check_disk --warning=20% --critical=10% --partition=/dev/hda1 -command[check_root]={{ nagios_plugins_directory }}/check_disk --warning=10% --critical=5% --units=GB --path=/ -command[check_zombie_procs]={{ nagios_plugins_directory }}/check_procs --warning=5 --critical=10 --state=Z -command[check_total_procs]={{ nagios_plugins_directory }}/check_procs --warning=300 --critical=500 -command[check_raid]=/usr/libexec/raid.pl -command[check_disks]=/usr/libexec/diskusage.pl 90 95 -command[check_smart]=/usr/libexec/smart.sh - -include=/etc/nagios/nrpe_local.cfg - -include_dir=/etc/nagios/nrpe.d/ diff --git a/roles/testnode/vars/apt_systems.yml b/roles/testnode/vars/apt_systems.yml index 15a1225..d03a888 100644 --- a/roles/testnode/vars/apt_systems.yml +++ b/roles/testnode/vars/apt_systems.yml @@ -2,10 +2,6 @@ ntp_service_name: ntp ssh_service_name: ssh nfs_service: nfs-kernel-server -nrpe_service_name: nagios-nrpe-server -nrpe_user: nagios -nrpe_group: nagios -nagios_plugins_directory: /usr/lib/nagios/plugins ceph_packages_to_remove: - ceph diff --git a/roles/testnode/vars/centos_6.yml b/roles/testnode/vars/centos_6.yml index 596eaac..a94e7af 100644 --- a/roles/testnode/vars/centos_6.yml +++ b/roles/testnode/vars/centos_6.yml @@ -99,8 +99,6 @@ packages: # for java bindings, hadoop, etc. - java-1.7.0-openjdk-devel - junit4 - # for disk/etc monitoring - - smartmontools # for nfs - nfs-utils @@ -121,6 +119,3 @@ epel_packages: - python-virtualenv # for setting BIOS settings - smbios-utils - # for nagios monitoring - - nrpe - - nagios-plugins-all diff --git a/roles/testnode/vars/centos_7.yml b/roles/testnode/vars/centos_7.yml index 9f169a9..8e33ab9 100644 --- a/roles/testnode/vars/centos_7.yml +++ b/roles/testnode/vars/centos_7.yml @@ -83,8 +83,6 @@ packages: # for java bindings, hadoop, etc. - java-1.6.0-openjdk-devel - junit4 - # for disk/etc monitoring - - smartmontools # for nfs - nfs-utils # for xfstests @@ -107,6 +105,3 @@ epel_packages: - bonnie++ # for json_xs to investigate JSON by hand - perl-JSON-XS - # for nagios monitoring - - nrpe - - nagios-plugins-all diff --git a/roles/testnode/vars/debian_7.yml b/roles/testnode/vars/debian_7.yml index 568dd42..a881517 100644 --- a/roles/testnode/vars/debian_7.yml +++ b/roles/testnode/vars/debian_7.yml @@ -84,10 +84,6 @@ packages: - default-jdk - junit4 ### - # for disk/etc monitoring - - smartmontools - - nagios-nrpe-server - ### # for samba testing - cifs-utils ### diff --git a/roles/testnode/vars/debian_8.yml b/roles/testnode/vars/debian_8.yml index 16edf7f..9038c28 100644 --- a/roles/testnode/vars/debian_8.yml +++ b/roles/testnode/vars/debian_8.yml @@ -77,10 +77,6 @@ packages: - default-jdk - junit4 ### - # for disk/etc monitoring - - smartmontools - - nagios-nrpe-server - ### # for samba testing - cifs-utils ### diff --git a/roles/testnode/vars/fedora_22.yml b/roles/testnode/vars/fedora_22.yml index 2c911b6..31170f4 100644 --- a/roles/testnode/vars/fedora_22.yml +++ b/roles/testnode/vars/fedora_22.yml @@ -68,10 +68,6 @@ packages: # for java bindings, hadoop, etc. - java-1.8.0-openjdk-devel - junit - # for disk/etc monitoring - - nrpe - - nagios-plugins-all - - smartmontools # for nfs - nfs-utils # python-pip is installed via roles/testnode/tasks/pip.yml on other rpm-based distros diff --git a/roles/testnode/vars/redhat_6.yml b/roles/testnode/vars/redhat_6.yml index e274d36..422b8eb 100644 --- a/roles/testnode/vars/redhat_6.yml +++ b/roles/testnode/vars/redhat_6.yml @@ -86,8 +86,6 @@ packages: # for java bindings, hadoop, etc. - java-1.6.0-openjdk-devel - junit4 - # for disk/etc monitoring - - smartmontools # for nfs - nfs-utils @@ -108,8 +106,5 @@ epel_packages: - python-virtualenv # for setting BIOS settings - smbios-utils - # for nagios monitoring - - nrpe - - nagios-plugins-all nfs_service: nfs diff --git a/roles/testnode/vars/redhat_7.yml b/roles/testnode/vars/redhat_7.yml index a7bd3c7..1cb04f6 100644 --- a/roles/testnode/vars/redhat_7.yml +++ b/roles/testnode/vars/redhat_7.yml @@ -67,7 +67,6 @@ packages: - perl-XML-Twig - java-1.6.0-openjdk-devel - junit4 - - smartmontools - nfs-utils # for xfstests - ncurses-devel @@ -86,8 +85,5 @@ epel_packages: - perl-JSON-XS - leveldb - xmlstarlet - # for nagios monitoring - - nrpe - - nagios-plugins-all nfs_service: nfs-server diff --git a/roles/testnode/vars/ubuntu.yml b/roles/testnode/vars/ubuntu.yml index 52d3478..31cb4b8 100644 --- a/roles/testnode/vars/ubuntu.yml +++ b/roles/testnode/vars/ubuntu.yml @@ -83,10 +83,6 @@ common_packages: - tgt - open-iscsi ### - # for disk/etc monitoring - - smartmontools - - nagios-nrpe-server - ### # for samba testing - cifs-utils # for Static IP diff --git a/roles/testnode/vars/yum_systems.yml b/roles/testnode/vars/yum_systems.yml index e6652a6..433bdf5 100644 --- a/roles/testnode/vars/yum_systems.yml +++ b/roles/testnode/vars/yum_systems.yml @@ -1,10 +1,6 @@ --- ntp_service_name: ntpd ssh_service_name: sshd -nrpe_service_name: nrpe -nrpe_user: nrpe -nrpe_group: nrpe -nagios_plugins_directory: /usr/lib64/nagios/plugins # ceph packages that we ensure do not exist ceph_packages_to_remove: