]> git.apps.os.sepia.ceph.com Git - ceph-cm-ansible.git/commitdiff
Move NRPE setup to common role
authorDavid Galloway <dgallowa@redhat.com>
Mon, 11 Jul 2016 22:43:38 +0000 (18:43 -0400)
committerDavid Galloway <dgallowa@redhat.com>
Wed, 13 Jul 2016 03:39:23 +0000 (23:39 -0400)
Signed-off-by: David Galloway <dgallowa@redhat.com>
38 files changed:
roles/common/files/libexec/diskusage.pl [new file with mode: 0644]
roles/common/files/libexec/raid.pl [new file with mode: 0755]
roles/common/files/libexec/smart.sh [new file with mode: 0755]
roles/common/files/nagios/nrpe.te [new file with mode: 0644]
roles/common/files/sbin/cli64 [new file with mode: 0644]
roles/common/files/sbin/megacli [new file with mode: 0755]
roles/common/handlers/main.yml [new file with mode: 0644]
roles/common/tasks/disk_monitoring.yml [new file with mode: 0644]
roles/common/tasks/main.yml
roles/common/tasks/nagios.yml [new file with mode: 0644]
roles/common/tasks/nrpe-selinux.yml [new file with mode: 0644]
roles/common/templates/nagios/90-nagios [new file with mode: 0644]
roles/common/templates/nagios/nrpe.cfg [new file with mode: 0644]
roles/common/vars/apt_systems.yml [new file with mode: 0644]
roles/common/vars/yum_systems.yml [new file with mode: 0644]
roles/testnode/files/libexec/diskusage.pl [deleted file]
roles/testnode/files/libexec/raid.pl [deleted file]
roles/testnode/files/libexec/smart.sh [deleted file]
roles/testnode/files/nagios/nrpe.te [deleted file]
roles/testnode/files/sbin/cli64 [deleted file]
roles/testnode/files/sbin/megacli [deleted file]
roles/testnode/handlers/main.yml
roles/testnode/tasks/disk_monitoring.yml [deleted file]
roles/testnode/tasks/main.yml
roles/testnode/tasks/nagios.yml [deleted file]
roles/testnode/tasks/nrpe-selinux.yml [deleted file]
roles/testnode/templates/nagios/90-nagios [deleted file]
roles/testnode/templates/nagios/nrpe.cfg [deleted file]
roles/testnode/vars/apt_systems.yml
roles/testnode/vars/centos_6.yml
roles/testnode/vars/centos_7.yml
roles/testnode/vars/debian_7.yml
roles/testnode/vars/debian_8.yml
roles/testnode/vars/fedora_22.yml
roles/testnode/vars/redhat_6.yml
roles/testnode/vars/redhat_7.yml
roles/testnode/vars/ubuntu.yml
roles/testnode/vars/yum_systems.yml

diff --git a/roles/common/files/libexec/diskusage.pl b/roles/common/files/libexec/diskusage.pl
new file mode 100644 (file)
index 0000000..49200da
--- /dev/null
@@ -0,0 +1,123 @@
+#!/usr/bin/perl
+
+# {{ ansible_managed }}
+
+#******************************************************************************************
+#
+# NRPE DISK USAGE PLUGIN
+#
+# Program: Disk Usage plugin written to be used with Netsaint and NRPE
+# License: GPL
+# Copyright (c) 2000 Jeremy Hanmer (jeremy@newdream.net)
+#
+# Last Modified: 10/23/00
+# 
+# Information:  Basically, I wrote this because I had to deal with large numbers of 
+# machines with a wide range of disk configurations, and with dynamically mounted 
+# partitions.  The basic check_disk plugin relied on a static configuration file which
+# doesn't lend itself to being used in a heterogeneous environment (especially when
+# you can't guarantee that the devices listed in the configuration file will be mounted).
+#
+# Bugs:  Currently, this plugin only works on EXT2 partitions (although it's easy to change).
+#
+# Command Line: diskusage.pl <warning percentage> <critical percentage>
+#
+# Tested Systems:  Mandrake 7.1/Intel, Debian 2.2/Intel, Debian 2.1/Intel
+#
+# License Information:
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+#*******************************************************************************************
+
+
+use strict;
+
+my $wrn = shift @ARGV;
+my $crt = shift @ARGV;
+my $output;
+my $count;
+my %type;
+my $result = 0;
+my $warn = 0;
+my $crit = 0;
+my @parts;
+my $hostname = `hostname`;
+chomp $hostname;
+@parts = `mount | grep -vi fuse`;
+
+#if ( $hostname eq 'zartan' ) {
+#      @parts = `mount`;
+#}
+#else {
+#      @parts = `mount -t ext2,reiserfs`;
+#}
+# Run df for one mount point and return its data rows (header removed).
+# The list form of open() bypasses the shell, so the mount point is handed to
+# df verbatim even if it contains spaces or shell metacharacters.  -P keeps
+# every filesystem on a single line even when the device name is long.
+my $run_df = sub {
+    my @cmd = ('df', @_);
+    open(my $fh, '-|', @cmd) or return;
+    my @rows = <$fh>;
+    close($fh);
+    shift @rows;    # drop the header line
+    return @rows;
+};
+
+# Walk every mounted filesystem (FUSE mounts were filtered out when @parts was
+# built) and compare block and inode usage against the warning ($wrn) and
+# critical ($crt) percentages.  Findings are appended to $output and recorded
+# in $warn / $crit.
+for my $entry (@parts) {
+    # `mount` prints "<device> on <mount point> type <fstype> (<options>)".
+    # Match that shape instead of splitting on whitespace so a mount point
+    # containing spaces is not cut short.  Lines of any other shape are skipped.
+    my ($mount, $type) = $entry =~ /^.*? on (.*) type (\S+) \(/
+        or next;
+    next if ($type eq 'nfs' && $hostname ne 'zartan');
+    next if ($type eq 'proc' || $type eq 'devpts');
+
+    # Block usage.
+    for my $row ($run_df->('-kP', $mount)) {
+        chomp $row;
+        # A field limit of 6 keeps a "Mounted on" value with spaces in one piece.
+        my (undef, undef, undef, undef, $pc, $mnt) = split(/\s+/, $row, 6);
+        next unless defined $mnt;
+        my ($percent) = split(/\%/, $pc);
+        # Filesystems under /mnt/ are never critical; they stay at warning level.
+        if ( ($percent >= $wrn ) && (!($percent >= $crt) || ($mnt =~ m/\/mnt\//)) ) {
+            $output .= "$mnt is at $pc    ";
+            $warn = 1;
+        }
+        if ( ($percent >= $crt ) && !($mnt =~ m/\/mnt\//) ) {
+            # The first critical finding discards the warning text collected so far.
+            $output = "" unless $crit eq '1';
+            $output .= "$mnt is at $pc    ";
+            $crit = 1;
+        }
+    }
+
+    # Inode usage.
+    for my $row ($run_df->('-iP', $mount)) {
+        chomp $row;
+        my (undef, undef, undef, undef, $pc, $mnt) = split(/\s+/, $row, 6);
+        next unless defined $mnt;
+        my ($percent) = split(/\%/, $pc);
+        if ( ($percent >= $wrn ) && (!($percent >= $crt) ) ) {
+            $output .= "$mnt is at $pc inode usage    ";
+            $warn = 1;
+        }
+        if ( ($percent >= $crt ) && !($mnt =~ m/\/mnt\//) ) {
+            $output = "" unless $crit eq '1';
+            $output .= "$mnt is at $pc inode usage    ";
+            $crit = 1;
+        }
+    }
+}
+
+
+#if ( ($warn eq '1') && !($crit eq '1') )  {
+#      print "$output\n";
+#      $result = 1;
+#      }
+if ( $crit eq '1' ) {
+       print "$output\n";
+       $result = 2;
+}
+
+else {
+       print "Disks are OK now\n";
+}
+
+
+#if ( !( $crit eq '1' ) && !( $warn eq '1' ) ) {
+#      print "Disks are ok now\n";
+#}
+#print "$result\n";
+exit $result; 
diff --git a/roles/common/files/libexec/raid.pl b/roles/common/files/libexec/raid.pl
new file mode 100755 (executable)
index 0000000..f65eedd
--- /dev/null
@@ -0,0 +1,313 @@
+#!/usr/bin/perl
+
+# {{ ansible_managed }}
+
+use strict;
+
+my $warn;
+my $crit;
+my $out;
+
+my @out;
+my $devices;
+my $pci;
+my $scsi;
+my $derp;
+
+$pci = `/usr/bin/lspci | /bin/grep -i raid | /bin/grep -v PATA | /usr/bin/head -2`;
+$scsi = `/usr/bin/lspci | /bin/grep -i scsi | /bin/grep -v PATA | /usr/bin/head -1`;
+
+# Software RAID (Linux md): scan /proc/mdstat for degraded or rebuilding arrays.
+# Sets $warn while an array is recovering/resyncing and $crit when an array is
+# degraded with no rebuild running; a description is pushed onto @out.
+if (-e "/proc/mdstat") {
+    if (open(my $mdstat, '<', "/proc/mdstat")) {
+        while (<$mdstat>) {
+            # Array header, e.g. "md0 : active raid1 sdb1[1] sda1[0]".
+            next unless /^(md\d+) : (\w+)/;
+            my $dev    = $1;
+            my $status = $2;
+            $devices++;
+
+            # The next line carries the member summary, e.g. "... [2/1] [U_]";
+            # an underscore marks a missing member.
+            my $rest = <$mdstat>;
+            my ($disks,$states) = $rest =~ /(\[.*\]) (\[.*\])/;
+
+            # Declare first, then assign conditionally: "my $x ... if COND"
+            # leaves $x with unspecified contents when COND is false.
+            my $mout = '';
+            $mout = "$dev is $status $disks $states" if $states =~ /_/;
+
+            # Optional progress line for a rebuild or resync.
+            my $next = <$mdstat>;
+            if ($next =~ / recovery = /) {
+                my ($per) = $next =~ /\[.*\]\s+recovery =\s+(\S+%)/;
+                $mout .= " recovery $per";
+                # The time estimate normally sits on the progress line itself
+                # (as the resync branch below already assumes); only fall back
+                # to the following line when it is not there.
+                my ($finish) = $next =~ /finish=(.*)min speed=.*\/sec/;
+                if (!defined $finish) {
+                    my $after = <$mdstat>;
+                    ($finish) = $after =~ /finish=(.*)min speed=.*\/sec/;
+                }
+                $mout .= " finish $finish min" if defined $finish;
+                $warn = 1;
+            } elsif ($next =~ / resync = /) {
+                my ($per) = $next =~ /\[.*\]\s+resync =\s+(\S+%)/;
+                $mout .= " resync $per";
+                if (my ($finish) = $next =~ /finish=(.*)min speed=.*\/sec/) {
+                    $mout .= " finish $finish min";
+                }
+                $warn = 1;
+            } elsif ($states =~ /_/) {  # not all U and nothing rebuilding
+                $crit = 1;
+            }
+
+            push( @out, $mout ) if $mout;
+        }
+        close($mdstat);
+    }
+}
+
+
+# mylex raid!
+if ($pci =~ /Mylex/i) {
+#if (1) {
+    my $s = `cat /proc/rd/status`;
+    chomp($s);
+    unless ($s =~ /OK/) {
+       my @myinfo;
+       for my $ctl (`ls -d /proc/rd/c*`) {
+#      for my $ctl ('/proc/rd/c0') {
+           chomp $ctl;
+           my %bad;
+           my ($c) = $ctl =~ /\/(c\d)$/;
+           open(S,"$ctl/current_status") || print "can't open $ctl/current_status\n";;
+#          open(S,"/tmp/mylex.bad");
+           my $lastdevice;
+           while (<S>) {
+               # disk status
+               if (/^    (\d:\d)  Vendor/) {
+                   $lastdevice = $1;
+               }
+               if (/ Disk Status: (\S+),/) {
+                   if ($1 ne 'Online') {
+                       push( @myinfo, "$c disk $lastdevice $1");
+                   }
+               }
+
+               # logical drives
+               if (/    (\/dev\/rd\/\S+): (\S+), (\w+),/) {
+                   my $dev = $1;
+                   my $type = $2;
+                   my $status = $3;
+                   $devices++;
+                   $bad{$dev} = 1;
+                   if ($status ne 'Online') {
+                       push( @myinfo, "$dev ($type) $status");
+                   }
+               }
+
+               # rebuild?
+               if (/  Rebuild in Progress: .* \((\S+)\) (\d+%) completed/) {
+                   push( @myinfo, "$1 rebuild $2 complete" );
+                   delete $bad{$1};
+               }
+           }
+           if (keys %bad) {
+               $crit = 1;  # at least 1 is failed and !recovering
+           } else {
+               $warn = 1;   # all are recovering
+           }
+       }
+
+       push( @out, "Mylex $s: " . join(', ',@myinfo)) if @myinfo;
+    }
+}
+
+
+# icp vortex raid!
+if ( $pci =~ /intel/i) {
+    opendir(D,"/proc/scsi/gdth");
+    my @dev = readdir(D);
+    closedir D;
+    my @vortex;
+    for my $dev (@dev) {
+       next if $dev =~ /^\./;
+       my $read = `cat /proc/scsi/gdth/$dev`;
+       # my $read = `cat /tmp/asdf9.warn`;
+       my $cur;   # Logical | Physical | Host | Array
+       my @myinfo;
+#      print "dev $dev\n";
+       for $_ (split(/\n/,$read)) {
+           chomp;
+           if (/^\w/) {
+               # new section
+               ($cur) = /^(\w+)/;
+#              print "cur = $cur\n";
+               next;
+           }
+           if ($cur eq 'Logical') {
+               my ($num,$status) = /Number:\s+(\d+)\s+Status:\s+(\w+)/;
+               next unless $status;
+               if ($status ne 'ok') {
+                   $warn = 1;
+                   #push( @myinfo, "Logical #$num $status" );
+                   unshift( @myinfo, "Logical #$num $status" );
+               }
+           }
+           if ($cur eq 'Array') {
+               my ($num,$status) = /Number:\s+(\d+)\s+Status:\s+(\w+)/;
+               next unless $status;
+               if ($status ne 'ready') {
+                   $warn = 1;
+                   #push( @myinfo, "Array #$num $status" );
+                   unshift( @myinfo, "Array #$num $status" );
+               }
+           }
+           if ($cur eq 'Host') {
+               if (/Number/) {
+                   $devices++;
+               }
+           }
+           if ($cur eq 'Controller') {
+               # push( @myinfo, $_ );
+               unshift( @myinfo, $_ );
+           }
+       }
+       
+       if (@myinfo) {
+           # push( @vortex, "dev $dev: " . join(', ', @myinfo) );
+           # unshift( @vortex, "dev $dev: " . join(', ', @myinfo) );
+           push( @vortex, "dev $dev: " . join(', ', $myinfo[0], $myinfo[1], $myinfo[2], $myinfo[3], $myinfo[4] ) );
+           # $warn = 1;
+       }
+    }
+
+    if (@vortex) {
+       # push( @out, 'Vortex: ' . join('.   ', @vortex) );
+       push( @out, 'Vortex: ' . join('.   ', @vortex) );
+    }
+}
+# SAS MegaRAID (LSI Logic): query logical drive state through megacli.
+# Any logical drive whose state is not Optimal, and any ERROR line printed by
+# megacli, marks the check critical and is added to @out.
+if ( $pci =~ /LSI\ Logic/i) {
+    my $read = `/usr/bin/sudo /usr/sbin/megacli -LDInfo -lall -a0`;
+    for $_ (split(/\n/,$read)) {
+        chomp;
+        # The line we care about is "State: Optimal"; anything else is a problem.
+        if ($_ =~/^State\s*\:\s*(.*)/m) {
+            $devices++;
+            my $state = $1;
+            next unless $state;
+            if ($state ne 'Optimal') {
+                my $rebuild = `/usr/bin/sudo /usr/sbin/megacli -PDList -a0 | /bin/grep -i firmware`;
+                if ( $rebuild =~ /Rebuild/i) {
+                    # Locate the rebuilding drive as "<enclosure>:<slot>" and ask for its progress.
+                    my $enclosure = `/usr/bin/sudo /usr/sbin/megacli -PDList -a0 | /bin/grep -B15 Rebuild | /bin/grep -e Enclosure -e Slot | /usr/bin/cut -d':' -f2 | /usr/bin/awk '{printf \$1\":\"}' | /usr/bin/awk -F ":" '{printf \$1":"\$2}'`;
+                    my $rebuildstatus = `/usr/bin/sudo /usr/sbin/megacli -PDRbld -ShowProg -PhysDrv\[$enclosure\] -a0 | /bin/egrep -i \'\(rebuild\|not found\)\'`;
+                    if ($rebuildstatus =~ /not found/m) {
+                        # check by device id instead of enclosure id if we get a not found error above
+                        $enclosure = `/usr/bin/sudo /usr/sbin/megacli -PDList -a0 | /bin/grep -B15 Rebuild | /bin/grep -e Enclosure -e Slot | /bin/grep -v position | /usr/bin/cut -d':' -f2 | /usr/bin/awk '{printf \$1\":\"}' | /usr/bin/awk -F ":" '{printf \$1":"\$2}'`;
+                        $rebuildstatus = `/usr/bin/sudo /usr/sbin/megacli -PDRbld -ShowProg -PhysDrv\[$enclosure\] -a0 | /bin/grep -i rebuild`;
+                    }
+                    $crit = 1;
+                    push(@out,$rebuildstatus);
+                } else {
+                    $crit = 1;
+                    my $virtual=`/usr/bin/sudo /usr/sbin/megacli -LDInfo -lall -a0 | grep -i failed -B6 | grep -i virtual | cut -d'(' -f1`;
+                    push(@out, $virtual, $_);
+                }
+            }
+        }
+        # Catch the syntax or permission errors megacli can emit.  The message is
+        # reported once through @out at the end of the script; it is not printed
+        # here, so the status line stays the first line of output.
+        if (/ERROR/i) {
+            $crit = 1;
+            push(@out, $_);
+        }
+    }
+}
+
+# 3ware
+if ( $pci =~ /3ware/i) {
+       open(CLI,"/usr/bin/sudo /usr/sbin/tw_cli show|");
+       #my $read = `/usr/sbin/megacli -LDInfo -l0 -a0`;
+
+       $devices++;
+       my @controllers;
+       while (<CLI>) {
+               if ( $_ =~ /^c[0-9]/ ) {
+                       my ($c) = split(/\s+/,$_);
+                       push(@controllers,$c);
+               }
+       }
+       close(CLI);
+
+       foreach my $cont (@controllers) {
+               open(CLI,"/usr/bin/sudo /usr/sbin/tw_cli /$cont show|");
+               while (<CLI>) {
+                       if ( $_ =~ /^u[0-9]+/ ) {
+                               my @info = split(/\s+/,$_);
+                               if ( $info[2] ne 'OK' ) {
+                                       if ( $info[2] =~ /REBUILDING/i) {
+                                               my $rebuildstatus = `/usr/bin/sudo /usr/sbin/tw_cli /$cont/$info[0] show | /bin/grep REBUILD | /bin/grep -v RAID-10`;
+                                                       for $_ ($rebuildstatus) {
+                                                       $crit = 1;
+                                                       push(@out,$_);
+                                                       }
+                                       } else {
+                                       $crit = 1;
+                                       push(@out,$_);
+                                       }
+                               }
+                       }
+                       if ( $_ =~ /^p[0-9]+/ ) {
+                               my @info = split(/\s+/,$_);
+                               if ( $info[1] ne 'OK' ) {
+                                       $crit = 1;
+                                       push(@out,$_);
+                               }
+                       }
+               }
+       }       
+}
+
+# Areca: every volume set listed by "cli64 vsf info" must be in state Normal.
+
+if ( $pci =~ /areca/i) {
+    open(CLI,"sudo /usr/sbin/cli64 vsf info|");
+    while (<CLI>) {
+        # Volume set rows start with two spaces followed by a number.
+        if ( $_ =~ /^\ \ [0-9]+/ ) {
+            $devices++;
+            if ( $_ !~ /Normal/i) {
+                $crit = 1;
+                push(@out,$_);
+            }
+        }
+    }
+    # Close the pipe so the child process is reaped before the script goes on.
+    close(CLI);
+}
+
+# LSI Logic SCSI (mpt): inspect the first line printed by mpt-status.
+if ( $scsi =~ /LSI Logic/i) {
+    open(CLI,"sudo /usr/sbin/mpt-status | /usr/bin/head -1 |");
+    $devices++;
+    while (<CLI>) {
+        if ( $_ =~ /^ioc/ ) {
+            my @info = split(/\s+/,$_);
+            # The 11th whitespace-separated field is compared against
+            # "OPTIMAL," (trailing comma included); anything else is critical.
+            if ( $info[10] ne 'OPTIMAL,' ) {
+                $crit = 1;
+                push(@out,$_);
+            }
+        }
+    }
+    # Close the pipe so the child processes are reaped before reporting.
+    close(CLI);
+}
+
+# Report: the exit status is 0 when nothing was flagged, 1 when only $warn was
+# set and 2 when $crit was set.
+my $result = 0;
+$result = 1 if $warn;
+$result = 2 if $crit;
+
+# Start from a summary line; any collected problem messages replace it.
+# ($out is already declared at the top of the script, so it is assigned here
+# rather than declared a second time.)
+$out = "No raid devices found $pci";
+$out = "All $devices raid devices happy as clams" if $devices;
+if (@out) {
+    $out = join(';     ', @out);
+}
+
+print "$out\n";
+exit $result;
diff --git a/roles/common/files/libexec/smart.sh b/roles/common/files/libexec/smart.sh
new file mode 100755 (executable)
index 0000000..2f71a60
--- /dev/null
@@ -0,0 +1,290 @@
+#!/bin/bash
+# Description: Bash script to check drive health using pending, uncorrectable,
+# and reallocated sector count
+#
+# Nagios return codes: 0 = OK; 1 = WARNING; 2 = CRITICAL; 3 = UNKNOWN
+# SMART Attribute Codes:
+#   5 = Reallocated
+#   187 = Reported Uncorrect
+#   197 = Pending
+#   198 = Uncorrectable Sector Count
+#
+# TO-DO: Add support for dynamic SMART attribute lookup.  For example,
+#        187 is reported for Seagate HDD and all SSDs but not Hitachi HDDs.
+#
+# See https://en.wikipedia.org/wiki/S.M.A.R.T.#ATA_S.M.A.R.T._attributes
+
+### Define global variables ###
+# total number of drives (or RAID slots) discovered
+numdrives=0
+# Number of failed, failing, and/or missing drives
+failingdrives=0
+# Fallback message for UNKNOWN return code output
+unknownmsg="Unknown error"
+# Return code for nagios (Default to SUCCESS)
+rc=0
+# Array of messages indicating drive health.  Output after nagios status.
+declare -a messages
+
+### Functions ###
+main ()
+{
+  preflight
+
+  # Choose the collection strategy from what preflight detected
+  case "$raid" in
+    true)
+      areca_smart
+      areca_failed
+      ;;
+    false)
+      normal_smart
+      ;;
+    *)
+      echo "ERROR - Could not determine if RAID present"
+      exit 3
+      ;;
+  esac
+
+  # Finding no drives at all is reported as UNKNOWN
+  if [ "$numdrives" -eq "0" ]; then
+    unknownmsg="No drives found!"
+    rc=3
+  fi
+
+  # First line of output: the service status summary for nagios
+  case "$rc" in
+    0) echo "OK - All $numdrives drives healthy" ;;
+    1) echo "WARNING - $failingdrives of $numdrives drives sick" ;;
+    2) echo "CRITICAL - $failingdrives of $numdrives drives need replacing" ;;
+    3) echo "UNKNOWN - $unknownmsg" ;;
+    *) echo "ERROR - Got no return code" ;;
+  esac
+
+  # Remaining lines: one per recorded drive message.  Nagios shows only the
+  # first line on the Services page; these details appear on the individual
+  # system's SMART detail page.
+  for msg in "${messages[@]}"; do
+    echo "$msg"
+  done
+
+  exit $rc
+}
+
+# Pre-flight checks: detect the controller type and verify required tools
+preflight ()
+{
+  # An Areca controller on the PCI bus selects the RAID code path
+  raid=false
+  if lspci | grep -qi areca; then
+    raid=true
+  fi
+
+  # The RAID path needs the Areca CLI; bail if it is missing
+  if [ "$raid" = true ]; then
+    if ! [ -x "$(command -v cli64)" ]; then
+      echo "ERROR - cli64 command not found or is not executable"
+      exit 3
+    fi
+  fi
+
+  # Both paths need smartmontools; bail if it is missing
+  if ! [ -x "$(command -v smartctl)" ]; then
+    echo "ERROR - smartctl is not installed or is not executable"
+    echo "yum/apt-get install smartmontools"
+    exit 3
+  fi
+}
+
+# Gather smart data for drives behind Areca RAID controller
+areca_smart ()
+{
+  # Store output of cli64 to reduce repeated executions
+  cli64out=$(sudo cli64 disk info | grep -E "Slot#[[:digit:]]")
+  numdrives=$(echo "$cli64out" | wc -l)
+  # Loop through all disks not marked as 'N.A.' or 'Failed'
+  for slot in $(echo "$cli64out" | grep -v 'N.A.\|Failed' \
+  | grep -o "Slot#[[:digit:]]" | cut -c6-)
+  do
+    failed=false
+    # Determine if disk is JBOD or part of hardware RAID
+    if echo "$cli64out" | grep -E "Slot#$slot" | grep -q 'JBOD'
+    then
+      jbod=true
+    else
+      jbod=false
+    fi
+    output=$(sudo cli64 disk smart drv=$slot \
+    | grep -E "^  "5"|^"197"|^"198"" | awk '{ print $(NF-1) }' | tr '\n' ' ')
+    outputcount=$(echo $output | wc -w)
+    # Only continue if we received 3 SMART data points
+    if [ "$outputcount" = "3" ]
+    then
+      # Only do slot to drive letter matching once per bad JBOD
+      if [[ $output != "0 0 0 " ]] && [ "$jbod" = true ]
+      then
+        dl=$(areca_bay_to_letter $slot)
+      elif [ "$jbod" = false ]
+      then
+        dl="(RAID)"
+      fi
+      read reallocated pending uncorrect <<< $output
+      if [ "$reallocated" != "0" ]
+      then
+        messages+=("Drive $slot $dl has $reallocated reallocated sectors")
+        failed=true
+        # A small number of reallocated sectors is OK
+        if [ "$reallocated" -le 5 ]
+        then
+          rc=1 # Warn if <= 5
+        else
+          rc=2 # Crit if >5
+        fi
+      fi
+      if [ "$pending" != "0" ]
+      then
+        messages+=("Drive $slot $dl has $pending pending sectors")
+        failed=true
+        rc=2
+      fi
+      if [ "$uncorrect" != "0" ]
+      then
+        messages+=("Drive $slot $dl has $uncorrect uncorrect sectors")
+        failed=true
+        rc=2
+      fi
+    else
+      messages+=("Drive $slot returned $outputcount of 3 expected attributes")
+      unknownmsg="SMART data could not be read for one or more drives"
+      rc=3
+    fi
+    # Make sure drives with multiple types of bad sectors only get counted once
+    if [ "$failed" = true ]
+    then
+      let "failingdrives+=1"
+    fi
+  done
+}
+
+# Correlate Areca drive bay to drive letter
+areca_bay_to_letter ()
+{
+  # Get S/N according to RAID controller given argument $1 (slot #)
+  areca_serial=$(sudo cli64 disk info drv=$1 | grep 'Serial Number' \
+  | awk '{ print $NF }')
+  # Loop through and get S/N according to smartctl given drive name
+  for dl in $(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' \
+  | awk '{ print $NF }')
+  do
+    smart_serial=$(sudo smartctl -a /dev/$dl | grep "Serial number" \
+    | awk '{ print $NF }')
+    # If cli64 and smartctl find a S/N match, return drive letter
+    if [ "$areca_serial" = "$smart_serial" ]
+    then
+      echo "($dl)"
+    fi
+  done
+}
+
+# Count drives the Areca controller reports as missing or failed
+areca_failed ()
+{
+  local absent dead
+
+  # Cache the slot listing so cli64 only runs once
+  cli64out=$(sudo cli64 disk info | grep -E "Slot#[[:digit:]]")
+
+  # First column of every slot row marked N.A. (no drive present)
+  absent=$(echo "$cli64out" | grep -E "Slot#[[:digit:]]" \
+  | grep "N.A." | awk '{ print $1 }')
+  for drive in $absent; do
+    messages+=("Drive $drive is missing")
+    failingdrives=$((failingdrives + 1))
+    rc=2
+  done
+
+  # First column of every slot row marked Failed
+  dead=$(echo "$cli64out" | grep -E "Slot#[[:digit:]]" \
+  | grep 'Failed' | awk '{ print $1 }')
+  for drive in $dead; do
+    messages+=("Drive $drive failed")
+    failingdrives=$((failingdrives + 1))
+    rc=2
+  done
+}
+
+# Standard SATA/SAS drive smartctl check
+normal_smart ()
+{
+  # The grep regex will include drives named sdaa, for example
+  numdrives=$(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' | wc -l)
+  for l in $(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' \
+  | awk '{ print $NF }')
+  do
+    failed=false
+    output=$(sudo smartctl -a /dev/$l | grep -E "^  "5"|^"197"|^"198"" \
+    | awk '{ print $NF }' | tr '\n' ' ')
+    outputcount=$(echo $output | wc -w)
+    # Check if drive is SSD and set var accordingly
+    if sudo smartctl -i /dev/$l | grep -q 'Solid State Device'; then
+      is_ssd=true
+    else
+      is_ssd=false
+    fi
+    # Only continue if we received 3 SMART data points and drive is not SSD
+    if [ "$outputcount" = "3" ] && [ "$is_ssd" = false ]
+    then
+      read reallocated pending uncorrect <<< $output
+      if [ "$reallocated" != "0" ]
+      then
+        messages+=("Drive $l has $reallocated reallocated sectors")
+        failed=true
+        # A small number of reallocated sectors is OK
+        if [ "$reallocated" -le 5 ]
+        then
+          rc=1 # Warn if <= 5
+        else
+          rc=2 # Crit if >5
+        fi
+      fi
+      if [ "$pending" != "0" ]
+      then
+        messages+=("Drive $l has $pending pending sectors")
+        failed=true
+        rc=2
+      fi
+      if [ "$uncorrect" != "0" ]
+      then
+        messages+=("Drive $l has $uncorrect uncorrect sectors")
+        failed=true
+        rc=2
+      fi
+    elif [ "$outputcount" != "3" ] && [ "$is_ssd" = false ]
+    then
+      messages+=("Drive $l returned $outputcount of 3 expected attributes")
+      unknownmsg="SMART data could not be read for one or more drives"
+      rc=3
+    # Set no return code and assume any SSD is healthy for now
+    elif [ "$is_ssd" = true ]
+    then
+      messages+=("Drive $l is an SSD.  Not yet supported.")
+      rc=0
+    else
+      messages+=("Error processing data for drive $l")
+      rc=3
+    fi
+    # Make sure drives with multiple types of bad sectors only get counted once
+    if [ "$failed" = true ]
+    then
+      let "failingdrives+=1"
+    fi
+  done
+}
+
+## Call main() function
+main
diff --git a/roles/common/files/nagios/nrpe.te b/roles/common/files/nagios/nrpe.te
new file mode 100644 (file)
index 0000000..c7bc886
--- /dev/null
@@ -0,0 +1,12 @@
+module nrpe 1.0;
+
+require {
+       type fsadm_exec_t;
+       type nrpe_t;
+       type hwdata_t;
+       class file { read getattr open };
+}
+
+#============= nrpe_t ==============
+allow nrpe_t fsadm_exec_t:file getattr;
+allow nrpe_t hwdata_t:file { read getattr open };
diff --git a/roles/common/files/sbin/cli64 b/roles/common/files/sbin/cli64
new file mode 100644 (file)
index 0000000..7ef82de
Binary files /dev/null and b/roles/common/files/sbin/cli64 differ
diff --git a/roles/common/files/sbin/megacli b/roles/common/files/sbin/megacli
new file mode 100755 (executable)
index 0000000..50bf00b
Binary files /dev/null and b/roles/common/files/sbin/megacli differ
diff --git a/roles/common/handlers/main.yml b/roles/common/handlers/main.yml
new file mode 100644 (file)
index 0000000..e2563ef
--- /dev/null
@@ -0,0 +1,5 @@
+---
+- name: restart nagios-nrpe-server
+  service:
+    name: "{{ nrpe_service_name }}"
+    state: restarted
diff --git a/roles/common/tasks/disk_monitoring.yml b/roles/common/tasks/disk_monitoring.yml
new file mode 100644 (file)
index 0000000..2d06a17
--- /dev/null
@@ -0,0 +1,33 @@
+---
+# We use these scripts to check to see if any of our test nodes have bad disks
+
+# Only the two RAID CLI binaries are installed here; the monitoring scripts
+# themselves are uploaded to /usr/libexec by a later task in this file.
+- name: Upload megacli and cli64 for raid monitoring to /usr/sbin/.
+  copy:
+    src: "../files/sbin/{{ item }}"
+    dest: "/usr/sbin/{{ item }}"
+    owner: root
+    group: root
+    mode: 0755
+  with_items:
+    - megacli
+    - cli64
+
+- name: Create /usr/libexec.
+  file:
+    path: /usr/libexec
+    owner: root
+    group: root
+    mode: 0755
+    state: directory
+
+- name: Upload custom netsaint scripts for raid/disk/smart/monitoring to /usr/libexec/.
+  copy:
+    src: "../files/libexec/{{ item }}"
+    dest: "/usr/libexec/{{ item }}"
+    owner: root
+    group: root
+    mode: 0755
+  with_items:
+    - smart.sh
+    - raid.pl
+    - diskusage.pl
index bf065e3a706d05bfa2bafa99ddea1ccd6b450497..204cc68b9b5a633b5c7b56e3e87ec56447a3cee5 100644 (file)
 - include: kerberos.yml
   tags:
     - kerberos
+
+# upload custom disk monitoring scripts
+- include: disk_monitoring.yml
+  tags:
+    - monitoring-scripts
+
+# configure nagios
+- include: nagios.yml
+  tags:
+    - nagios
+
+- name: Get SELinux status
+  command: getenforce
+  register: selinux_status
+  when: ansible_pkg_mgr == "yum"
+  tags:
+    - nagios
+
+# configure selinux for nagios
+# The getenforce task above only runs on yum-based hosts.  Elsewhere it is
+# skipped and selinux_status is registered without a stdout key, so test the
+# package manager first to avoid dereferencing the missing key.
+- include: nrpe-selinux.yml
+  when: ansible_pkg_mgr == "yum" and selinux_status is defined and selinux_status.stdout != "Disabled"
+  tags:
+    - nagios
diff --git a/roles/common/tasks/nagios.yml b/roles/common/tasks/nagios.yml
new file mode 100644 (file)
index 0000000..259a229
--- /dev/null
@@ -0,0 +1,54 @@
+---
+- name: Upload nagios sudoers.d for raid utilities.
+  template:
+    src: nagios/90-nagios
+    dest: /etc/sudoers.d/90-nagios
+    owner: root
+    group: root
+    mode: 0440
+    validate: visudo -cf %s
+
+- name: Configure nagios nrpe settings (Ubuntu)
+  lineinfile:
+    dest: /etc/default/{{ nrpe_service_name }}
+    regexp: "^DAEMON_OPTS"
+    line: "DAEMON_OPTS=\"--no-ssl\""
+  when: ansible_pkg_mgr == "apt"
+
+- name: Configure nagios nrpe settings (RHEL/CentOS)
+  lineinfile:
+    dest: /etc/sysconfig/{{ nrpe_service_name }}
+    regexp: "^NRPE_SSL_OPT"
+    line: "NRPE_SSL_OPT=\"-n\""
+  when: ansible_pkg_mgr == "yum"
+
+- name: Check firewalld status
+  command: systemctl status firewalld
+  register: firewalld
+  ignore_errors: true
+  no_log: true
+  when: ansible_pkg_mgr == "yum"
+
+- name: Open nrpe port if firewalld enabled
+  firewalld:
+    port: 5666/tcp
+    state: enabled
+    permanent: yes
+    immediate: yes
+  when: ansible_pkg_mgr == "yum" and (firewalld is defined and firewalld.stdout.find('running') != -1)
+
+- name: Upload nagios nrpe config.
+  template:
+    src: nagios/nrpe.cfg 
+    dest: /etc/nagios/nrpe.cfg
+    owner: root
+    group: root
+    mode: 0644
+  notify:
+    - restart nagios-nrpe-server
+
+- name: Make sure nagios nrpe service is running.
+  service:
+    name: "{{ nrpe_service_name }}"
+    enabled: yes
+    state: started
diff --git a/roles/common/tasks/nrpe-selinux.yml b/roles/common/tasks/nrpe-selinux.yml
new file mode 100644 (file)
index 0000000..877aa2e
--- /dev/null
@@ -0,0 +1,44 @@
+---
+- name: nrpe - Install semanage python bindings
+  yum:
+    pkg: libsemanage-python
+    state: installed
+
+- name: nrpe - Install SELinux tools
+  yum:
+    pkg: policycoreutils-python
+    state: installed
+
+- name: nrpe - Ensure SELinux policy is up to date
+  yum:
+    pkg: selinux-policy-targeted
+    state: latest
+
+- name: nrpe - Set SELinux boolean nagios_run_sudo true
+  seboolean:
+    name: nagios_run_sudo
+    state: yes
+    persistent: yes
+
+- name: nrpe - Remove SELinux policy package
+  command: semodule -r nrpe
+  failed_when: false
+
+- name: nrpe - Copy SELinux type enforcement file
+  copy:
+    src: nagios/nrpe.te
+    dest: /tmp/nrpe.te
+
+- name: nrpe - Compile SELinux module file
+  command: checkmodule -M -m -o /tmp/nrpe.mod /tmp/nrpe.te
+
+- name: nrpe - Build SELinux policy package
+  command: semodule_package -o /tmp/nrpe.pp -m /tmp/nrpe.mod
+
+- name: nrpe - Load SELinux policy package
+  command: semodule -i /tmp/nrpe.pp
+
+# The file module does not expand globs; remove each build artifact by name.
+- name: nrpe - Remove temporary files
+  file: path=/tmp/nrpe.{{ item }} state=absent
+  with_items: [te, mod, pp]
diff --git a/roles/common/templates/nagios/90-nagios b/roles/common/templates/nagios/90-nagios
new file mode 100644 (file)
index 0000000..34326fb
--- /dev/null
@@ -0,0 +1,2 @@
+## {{ ansible_managed }}
+{{ nrpe_user }} ALL=NOPASSWD: /usr/sbin/megacli, /usr/sbin/cli64, /usr/sbin/smartctl
diff --git a/roles/common/templates/nagios/nrpe.cfg b/roles/common/templates/nagios/nrpe.cfg
new file mode 100644 (file)
index 0000000..84435c8
--- /dev/null
@@ -0,0 +1,29 @@
+# {{ ansible_managed }}
+log_facility=daemon
+pid_file=/var/run/nagios/nrpe.pid
+server_port=5666
+nrpe_user={{ nrpe_user }}
+nrpe_group={{ nrpe_group }}
+
+# These should eventually be in a secrets group_var
+# 172. address is sepia nagios server
+# 10. address is octo nagios server
+allowed_hosts=127.0.0.1,172.21.0.33,10.8.0.8
+dont_blame_nrpe=0
+debug=0
+command_timeout=60
+connection_timeout=300
+
+command[check_users]={{ nagios_plugins_directory }}/check_users --warning=5 --critical=10
+command[check_load]={{ nagios_plugins_directory }}/check_load --percpu --warning=1.5,1.4,1.3 --critical=2.0,1.9,1.8
+command[check_hda1]={{ nagios_plugins_directory }}/check_disk --warning=20% --critical=10% --partition=/dev/hda1
+command[check_root]={{ nagios_plugins_directory }}/check_disk --warning=10% --critical=5% --units=GB --path=/
+command[check_zombie_procs]={{ nagios_plugins_directory }}/check_procs --warning=5 --critical=10 --state=Z
+command[check_total_procs]={{ nagios_plugins_directory }}/check_procs --warning=300 --critical=500
+command[check_raid]=/usr/libexec/raid.pl
+command[check_disks]=/usr/libexec/diskusage.pl 90 95
+command[check_smart]=/usr/libexec/smart.sh
+
+include=/etc/nagios/nrpe_local.cfg
+
+include_dir=/etc/nagios/nrpe.d/
diff --git a/roles/common/vars/apt_systems.yml b/roles/common/vars/apt_systems.yml
new file mode 100644 (file)
index 0000000..066314d
--- /dev/null
@@ -0,0 +1,5 @@
+---
+nrpe_service_name: nagios-nrpe-server
+nrpe_user: nagios
+nrpe_group: nagios
+nagios_plugins_directory: /usr/lib/nagios/plugins
diff --git a/roles/common/vars/yum_systems.yml b/roles/common/vars/yum_systems.yml
new file mode 100644 (file)
index 0000000..d7b4ed2
--- /dev/null
@@ -0,0 +1,5 @@
+---
+nrpe_service_name: nrpe
+nrpe_user: nrpe
+nrpe_group: nrpe
+nagios_plugins_directory: /usr/lib64/nagios/plugins
diff --git a/roles/testnode/files/libexec/diskusage.pl b/roles/testnode/files/libexec/diskusage.pl
deleted file mode 100644 (file)
index 49200da..0000000
+++ /dev/null
@@ -1,123 +0,0 @@
-#!/usr/bin/perl
-
-# {{ ansible_managed }}
-
-#******************************************************************************************
-#
-# NRPE DISK USAGE PLUGIN
-#
-# Program: Disk Usage plugin written to be used with Netsaint and NRPE
-# License: GPL
-# Copyright (c) 2000 Jeremy Hanmer (jeremy@newdream.net)
-#
-# Last Modified: 10/23/00
-# 
-# Information:  Basically, I wrote this because I had to deal with large numbers of 
-# machines with a wide range of disk configurations, and with dynamically mounted 
-# partitions.  The basic check_disk plugin relied on a static configuration file which
-# doesn't lend itself to being used in a heterogeneous environnment (especially when
-# you can't guarantee that the devices listed in the configuration file will be mounted).
-#
-# Bugs:  Currently, this plugin only works on EXT2 partitions (although it's easy to change).
-#
-# Command Line: diskusage.pl <warning percentage> <critical percentage>
-#
-# Tested Systems:  Mandrake 7.1/Intel, Debian 2.2/Intel, Debian 2.1/Intel
-#
-# License Information:
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-#*******************************************************************************************
-
-
-use strict;
-
-my $wrn = shift @ARGV;
-my $crt = shift @ARGV;
-my $output;
-my $count;
-my %type;
-my $result = 0;
-my $warn = 0;
-my $crit = 0;
-my @parts;
-my $hostname = `hostname`;
-chomp $hostname;
-@parts = `mount | grep -vi fuse`;
-
-#if ( $hostname eq 'zartan' ) {
-#      @parts = `mount`;
-#}
-#else {
-#      @parts = `mount -t ext2,reiserfs`;
-#}
-for (@parts) {
-       my ($dev,$on,$mount,$tp,$type,$options) = split(/\s+/,$_);
-               next if ($type eq 'nfs' && !($hostname eq 'zartan'));
-               next if ($type eq 'proc' || $type eq 'devpts');
-               my @df= `df -k $mount`;
-               my @df_inode = `df -i $mount`;
-#              print "$dev $mount $type\n";
-               shift @df;
-               shift @df_inode;
-               for(@df) {
-                       my ($dev1,$blocks,$used,$free,$pc,$mount) = split(/\s+/,$_);
-                       my ($percent,$blah) = split(/\%/,$pc);
-                       if ( ($percent >= $wrn ) && (!($percent >= $crt) || ($mount =~ m/\/mnt\//)) ) {
-                               $output .= "$mount is at $pc    ";
-                               $warn = 1;
-                       }
-                       if ( ($percent >= $crt ) && !($mount =~ m/\/mnt\//) ){
-                               $output = "" unless $crit eq '1';
-                               $output .= "$mount is at $pc    ";
-                               $crit = 1;
-                       }
-               }
-               for(@df_inode) {
-                       my ($dev1,$inodes,$used,$free,$pc,$mount) = split(/\s+/,$_);
-                       my ($percent,$blah) = split(/\%/,$pc);
-                       if ( ($percent >= $wrn ) && (!($percent >= $crt) ) ) {
-                               $output .= "$mount is at $pc inode usage    ";
-                               $warn = 1;
-                       }
-                       if ( ($percent >= $crt ) && !($mount =~ m/\/mnt\//) ){
-                               $output = "" unless $crit eq '1';
-                               $output .= "$mount is at $pc inode usage    ";
-                               $crit = 1;
-                       }
-               }
-       }
-
-
-#if ( ($warn eq '1') && !($crit eq '1') )  {
-#      print "$output\n";
-#      $result = 1;
-#      }
-if ( $crit eq '1' ) {
-       print "$output\n";
-       $result = 2;
-}
-
-else {
-       print "Disks are OK now\n";
-}
-
-
-#if ( !( $crit eq '1' ) && !( $warn eq '1' ) ) {
-#      print "Disks are ok now\n";
-#}
-#print "$result\n";
-exit $result; 
diff --git a/roles/testnode/files/libexec/raid.pl b/roles/testnode/files/libexec/raid.pl
deleted file mode 100755 (executable)
index f65eedd..0000000
+++ /dev/null
@@ -1,313 +0,0 @@
-#!/usr/bin/perl
-
-# {{ ansible_managed }}
-
-use strict;
-
-my $warn;
-my $crit;
-my $out;
-
-my @out;
-my $devices;
-my $pci;
-my $scsi;
-my $derp;
-
-$pci = `/usr/bin/lspci | /bin/grep -i raid | /bin/grep -v PATA | /usr/bin/head -2`;
-$scsi = `/usr/bin/lspci | /bin/grep -i scsi | /bin/grep -v PATA | /usr/bin/head -1`;
-
-# software raid!
-if (-e "/proc/mdstat") {
-    # check software raid!
-#    open(R,"/tmp/mdstat");
-    open(R,"/proc/mdstat");
-    while (<R>) {
-               if (/^(md\d+) : (\w+)/) {
-                       my $dev = $1;
-                       my $status = $2;
-                       my $rest = <R>;
-                       $devices++;
-                       
-                       my ($disks,$states) = $rest =~ /(\[.*\]) (\[.*\])/;
-                       my $mout .= "$dev is $status $disks $states" if $states =~ /_/;
-                       
-                       # recovery?
-                       my $next = <R>;  # possibly recovery?
-                       if ($next =~ / recovery = /) {
-                               my ($progress,$per) = $next =~ /(\[.*\])\s+recovery =\s+(\S+%)/;
-                               $mout .= " recovery $per";
-                               my $next = <R>;
-                               if (my ($finish,$speed) = $next =~ /finish=(.*)min speed=(.*)\/sec/) {
-                                       $mout .= " finish $finish min";
-                               }
-                               $warn = 1;
-            } elsif ($next =~ / resync = /) {
-                my ($progress,$per) = $next =~ /(\[.*\])\s+resync =\s+(\S+%)/;
-                $mout .= " resync $per";
-                if (my ($finish,$speed) = $next =~ /finish=(.*)min speed=(.*)\/sec/) {
-                    $mout .= " finish $finish min";
-                }
-                $warn = 1;
-                       } elsif ($states =~ /_/) {  # not all U
-                               $crit = 1;
-                       }
-                       
-                       push( @out, $mout ) if $mout;
-               }
-    }
-}
-
-
-# mylex raid!
-if ($pci =~ /Mylex/i) {
-#if (1) {
-    my $s = `cat /proc/rd/status`;
-    chomp($s);
-    unless ($s =~ /OK/) {
-       my @myinfo;
-       for my $ctl (`ls -d /proc/rd/c*`) {
-#      for my $ctl ('/proc/rd/c0') {
-           chomp $ctl;
-           my %bad;
-           my ($c) = $ctl =~ /\/(c\d)$/;
-           open(S,"$ctl/current_status") || print "can't open $ctl/current_status\n";;
-#          open(S,"/tmp/mylex.bad");
-           my $lastdevice;
-           while (<S>) {
-               # disk status
-               if (/^    (\d:\d)  Vendor/) {
-                   $lastdevice = $1;
-               }
-               if (/ Disk Status: (\S+),/) {
-                   if ($1 ne 'Online') {
-                       push( @myinfo, "$c disk $lastdevice $1");
-                   }
-               }
-
-               # logical drives
-               if (/    (\/dev\/rd\/\S+): (\S+), (\w+),/) {
-                   my $dev = $1;
-                   my $type = $2;
-                   my $status = $3;
-                   $devices++;
-                   $bad{$dev} = 1;
-                   if ($status ne 'Online') {
-                       push( @myinfo, "$dev ($type) $status");
-                   }
-               }
-
-               # rebuild?
-               if (/  Rebuild in Progress: .* \((\S+)\) (\d+%) completed/) {
-                   push( @myinfo, "$1 rebuild $2 complete" );
-                   delete $bad{$1};
-               }
-           }
-           if (keys %bad) {
-               $crit = 1;  # at least 1 is failed and !recovering
-           } else {
-               $warn = 1;   # all are recovering
-           }
-       }
-
-       push( @out, "Mylex $s: " . join(', ',@myinfo)) if @myinfo;
-    }
-}
-
-
-# icp vortex raid!
-if ( $pci =~ /intel/i) {
-    opendir(D,"/proc/scsi/gdth");
-    my @dev = readdir(D);
-    closedir D;
-    my @vortex;
-    for my $dev (@dev) {
-       next if $dev =~ /^\./;
-       my $read = `cat /proc/scsi/gdth/$dev`;
-       # my $read = `cat /tmp/asdf9.warn`;
-       my $cur;   # Logical | Physical | Host | Array
-       my @myinfo;
-#      print "dev $dev\n";
-       for $_ (split(/\n/,$read)) {
-           chomp;
-           if (/^\w/) {
-               # new section
-               ($cur) = /^(\w+)/;
-#              print "cur = $cur\n";
-               next;
-           }
-           if ($cur eq 'Logical') {
-               my ($num,$status) = /Number:\s+(\d+)\s+Status:\s+(\w+)/;
-               next unless $status;
-               if ($status ne 'ok') {
-                   $warn = 1;
-                   #push( @myinfo, "Logical #$num $status" );
-                   unshift( @myinfo, "Logical #$num $status" );
-               }
-           }
-           if ($cur eq 'Array') {
-               my ($num,$status) = /Number:\s+(\d+)\s+Status:\s+(\w+)/;
-               next unless $status;
-               if ($status ne 'ready') {
-                   $warn = 1;
-                   #push( @myinfo, "Array #$num $status" );
-                   unshift( @myinfo, "Array #$num $status" );
-               }
-           }
-           if ($cur eq 'Host') {
-               if (/Number/) {
-                   $devices++;
-               }
-           }
-           if ($cur eq 'Controller') {
-               # push( @myinfo, $_ );
-               unshift( @myinfo, $_ );
-           }
-       }
-       
-       if (@myinfo) {
-           # push( @vortex, "dev $dev: " . join(', ', @myinfo) );
-           # unshift( @vortex, "dev $dev: " . join(', ', @myinfo) );
-           push( @vortex, "dev $dev: " . join(', ', $myinfo[0], $myinfo[1], $myinfo[2], $myinfo[3], $myinfo[4] ) );
-           # $warn = 1;
-       }
-    }
-
-    if (@vortex) {
-       # push( @out, 'Vortex: ' . join('.   ', @vortex) );
-       push( @out, 'Vortex: ' . join('.   ', @vortex) );
-    }
-}
-# SAS megaraid
-if ( $pci =~ /LSI\ Logic/i) {
-    my $read = `/usr/bin/sudo /usr/sbin/megacli -LDInfo -lall -a0`;
-    for $_ (split(/\n/,$read)) {
-       chomp;
-       # The line we care about is State: Optimal, if we don't have that, we've problems
-       if ($_ =~/^State\s*\:\s*(.*)/m) {
-            $devices++;
-           #/^State\?:\s?(\w+)/;
-           my $state = $1;
-           next unless $state;
-           if ($state ne 'Optimal') {
-               my $rebuild = `/usr/bin/sudo /usr/sbin/megacli -PDList -a0 | /bin/grep -i firmware`;
-                       if ( $rebuild =~ /Rebuild/i) {
-                               my $enclosure = `/usr/bin/sudo /usr/sbin/megacli -PDList -a0 | /bin/grep -B15 Rebuild | /bin/grep -e Enclosure -e Slot | /usr/bin/cut -d':' -f2 | /usr/bin/awk '{printf \$1\":\"}' | /usr/bin/awk -F ":" '{printf \$1":"\$2}'`;
-                               #my $rebuildstatus = `/usr/bin/sudo /usr/sbin/megacli -PDRbld -ShowProg -PhysDrv\[$enclosure\] -a0 | /bin/grep -i rebuild`;
-                               my $rebuildstatus = `/usr/bin/sudo /usr/sbin/megacli -PDRbld -ShowProg -PhysDrv\[$enclosure\] -a0 | /bin/egrep -i \'\(rebuild\|not found\)\'`;
-                               if ($rebuildstatus =~ /not found/m) {
-                                  # check by device id instead of enclosure id if we get a not found error above
-                                  $enclosure = `/usr/bin/sudo /usr/sbin/megacli -PDList -a0 | /bin/grep -B15 Rebuild | /bin/grep -e Enclosure -e Slot | /bin/grep -v position | /usr/bin/cut -d':' -f2 | /usr/bin/awk '{printf \$1\":\"}' | /usr/bin/awk -F ":" '{printf \$1":"\$2}'`;
-                                  $rebuildstatus = `/usr/bin/sudo /usr/sbin/megacli -PDRbld -ShowProg -PhysDrv\[$enclosure\] -a0 | /bin/grep -i rebuild`;
-                               }
-                                       for $_ ($rebuildstatus) {
-                                       $crit = 1;
-                                       push(@out,$_);
-                                       }
-                       } else {
-               $crit = 1;
-                my $virtual=`/usr/bin/sudo /usr/sbin/megacli -LDInfo -lall -a0 | grep -i failed -B6 | grep -i virtual | cut -d'(' -f1`;
-               push(@out, $virtual, $_);
-               }
-           }
-       }       
-        # Should to catch the syntax or permissions errors this thing spits out
-       if (/ERROR/i) {
-           $crit = 1;
-           push(@out, $_);
-       foreach my $k (@out)
-       {
-               print $_;
-       }
-       }
-    }
-}
-
-# e3ware
-if ( $pci =~ /3ware/i) {
-       open(CLI,"/usr/bin/sudo /usr/sbin/tw_cli show|");
-       #my $read = `/usr/sbin/megacli -LDInfo -l0 -a0`;
-
-       $devices++;
-       my @controllers;
-       while (<CLI>) {
-               if ( $_ =~ /^c[0-9]/ ) {
-                       my ($c) = split(/\s+/,$_);
-                       push(@controllers,$c);
-               }
-       }
-       close(CLI);
-
-       foreach my $cont (@controllers) {
-               open(CLI,"/usr/bin/sudo /usr/sbin/tw_cli /$cont show|");
-               while (<CLI>) {
-                       if ( $_ =~ /^u[0-9]+/ ) {
-                               my @info = split(/\s+/,$_);
-                               if ( $info[2] ne 'OK' ) {
-                                       if ( $info[2] =~ /REBUILDING/i) {
-                                               my $rebuildstatus = `/usr/bin/sudo /usr/sbin/tw_cli /$cont/$info[0] show | /bin/grep REBUILD | /bin/grep -v RAID-10`;
-                                                       for $_ ($rebuildstatus) {
-                                                       $crit = 1;
-                                                       push(@out,$_);
-                                                       }
-                                       } else {
-                                       $crit = 1;
-                                       push(@out,$_);
-                                       }
-                               }
-                       }
-                       if ( $_ =~ /^p[0-9]+/ ) {
-                               my @info = split(/\s+/,$_);
-                               if ( $info[1] ne 'OK' ) {
-                                       $crit = 1;
-                                       push(@out,$_);
-                               }
-                       }
-               }
-       }       
-}
-
-#Areca
-
-if ( $pci =~ /areca/i) {
-                open(CLI,"sudo /usr/sbin/cli64 vsf info|");
-                while (<CLI>) {
-                        if ( $_ =~ /^\ \ [0-9]+/ ) {
-                               $devices++;
-                                my @info = split(/\s+/,$_);
-                               if ( $_ !~ /Normal/i) {
-                                        $crit = 1;
-                                        push(@out,$_);
-                                }
-                        }
-                }
-        }
-
-if ( $scsi =~ /LSI Logic/i) {
-                open(CLI,"sudo /usr/sbin/mpt-status | /usr/bin/head -1 |");
-                $devices++;
-                while (<CLI>) {
-                        if ( $_ =~ /^ioc/ ) {
-                                my @info = split(/\s+/,$_);
-                                if ( $info[10] ne 'OPTIMAL,' ) {
-                                        $crit = 1;
-                                        push(@out,$_);
-                                }
-                        }
-                }
-        }
-
-# show results
-my $result = 0;
-$result = 1 if $warn;
-$result = 2 if $crit;
-# print "warn = $warn crit = $crit\n";
-print $derp;
-my $out = "No raid devices found $pci";
-$out = "All $devices raid devices happy as clams" if $devices;
-if (@out) {
-    $out = join(';     ', @out);  
-}
-
-print "$out\n";
-exit $result;
diff --git a/roles/testnode/files/libexec/smart.sh b/roles/testnode/files/libexec/smart.sh
deleted file mode 100755 (executable)
index 2f71a60..0000000
+++ /dev/null
@@ -1,290 +0,0 @@
-#!/bin/bash
-# Description: Bash script to check drive health using pending, uncorrectable,
-# and reallocated sector count
-#
-# Nagios return codes: 0 = OK; 1 = WARNING; 2 = CRITICAL; 3 = UNKNOWN
-# SMART Attribute Codes:
-#   5 = Reallocated
-#   187 = Reported Uncorrect
-#   197 = Pending
-#   198 = Uncorrectable Sector Count
-#
-# TO-DO: Add support for dynamic SMART attribute lookup.  For example,
-#        187 is reported for Seagate HDD and all SSDs but not Hitachi HDDs.
-#
-# See https://en.wikipedia.org/wiki/S.M.A.R.T.#ATA_S.M.A.R.T._attributes
-
-### Define global variables ###
-# total number of drives (or RAID slots) discovered
-numdrives=0
-# Number of failed, failing, and/or missing drives
-failingdrives=0
-# Fallback message for UNKNOWN return code output
-unknownmsg="Unknown error"
-# Return code for nagios (Default to SUCCESS)
-rc=0
-# Array of messages indicating drive health.  Output after nagios status.
-declare -a messages
-
-### Functions ###
-main ()
-{
-  preflight
-
-  if [ "$raid" = true ]
-  then
-    areca_smart
-    areca_failed
-  elif [ "$raid" = false ]
-  then
-    normal_smart
-  else
-    echo "ERROR - Could not determine if RAID present"
-    exit 3
-  fi
-
-  ## Return UNKNOWN if no drives found
-  if [ "$numdrives" -eq "0" ]
-  then
-    unknownmsg="No drives found!"
-    rc=3
-  fi
-  
-  ## Return code and service status for nagios
-  if [ "$rc" = 0 ]
-  then
-    echo "OK - All $numdrives drives healthy"
-  elif [ "$rc" = 1 ]
-  then
-    echo "WARNING - $failingdrives of $numdrives drives sick"
-  elif [ "$rc" = 2 ]
-  then
-    echo "CRITICAL - $failingdrives of $numdrives drives need replacing"
-  elif [ "$rc" = 3 ]
-  then
-    echo "UNKNOWN - $unknownmsg"
-  else
-    echo "ERROR - Got no return code"
-  fi
-  
-  ## Iterate through array of messages
-  # Nagios reads and displays the first line of output on the Services page.
-  # All individual messages about failed/failing disk statistics can be viewed
-  # on the individual system's SMART detail page in nagios.
-  for msg in "${messages[@]}"
-  do
-    echo "$msg"
-  done
-  
-  exit $rc
-}
-
-# Pre-flight checks
-preflight ()
-{
-  # Set raid var then check for cli64 command and bail if missing
-  if lspci | grep -qi areca
-  then
-    raid=true
-  else
-    raid=false
-  fi
-  
-  if [ "$raid" = true ] && ! [ -x "$(command -v cli64)" ]
-  then
-    echo "ERROR - cli64 command not found or is not executable"
-    exit 3
-  fi
-  
-  # Check for smartmontools and bail if missing
-  if ! [ -x "$(command -v smartctl)" ]
-  then
-    echo "ERROR - smartctl is not installed or is not executable"
-    echo "yum/apt-get install smartmontools"
-    exit 3
-  fi
-}
-
-# Gather smart data for drives behind Areca RAID controller
-areca_smart ()
-{
-  # Store output of cli64 to reduce repeated executions
-  cli64out=$(sudo cli64 disk info | grep -E "Slot#[[:digit:]]")
-  numdrives=$(echo "$cli64out" | wc -l)
-  # Loop through all disks not marked as 'N.A.' or 'Failed'
-  for slot in $(echo "$cli64out" | grep -v 'N.A.\|Failed' \
-  | grep -o "Slot#[[:digit:]]" | cut -c6-)
-  do
-    failed=false
-    # Determine if disk is JBOD or part of hardware RAID
-    if echo "$cli64out" | grep -E "Slot#$slot" | grep -q 'JBOD'
-    then
-      jbod=true
-    else
-      jbod=false
-    fi
-    output=$(sudo cli64 disk smart drv=$slot \
-    | grep -E "^  "5"|^"197"|^"198"" | awk '{ print $(NF-1) }' | tr '\n' ' ')
-    outputcount=$(echo $output | wc -w)
-    # Only continue if we received 3 SMART data points
-    if [ "$outputcount" = "3" ]
-    then
-      # Only do slot to drive letter matching once per bad JBOD
-      if [[ $output != "0 0 0 " ]] && [ "$jbod" = true ]
-      then
-        dl=$(areca_bay_to_letter $slot)
-      elif [ "$jbod" = false ]
-      then
-        dl="(RAID)"
-      fi
-      read reallocated pending uncorrect <<< $output
-      if [ "$reallocated" != "0" ]
-      then
-        messages+=("Drive $slot $dl has $reallocated reallocated sectors")
-        failed=true
-        # A small number of reallocated sectors is OK
-        if [ "$reallocated" -le 5 ]
-        then
-          rc=1 # Warn if <= 5
-        else
-          rc=2 # Crit if >5
-        fi
-      fi
-      if [ "$pending" != "0" ]
-      then
-        messages+=("Drive $slot $dl has $pending pending sectors")
-        failed=true
-        rc=2
-      fi
-      if [ "$uncorrect" != "0" ]
-      then
-        messages+=("Drive $slot $dl has $uncorrect uncorrect sectors")
-        failed=true
-        rc=2
-      fi
-    else
-      messages+=("Drive $slot returned $outputcount of 3 expected attributes")
-      unknownmsg="SMART data could not be read for one or more drives"
-      rc=3
-    fi
-    # Make sure drives with multiple types of bad sectors only get counted once
-    if [ "$failed" = true ]
-    then
-      let "failingdrives+=1"
-    fi
-  done
-}
-
-# Correlate Areca drive bay to drive letter
-areca_bay_to_letter ()
-{
-  # Get S/N according to RAID controller given argument $1 (slot #)
-  areca_serial=$(sudo cli64 disk info drv=$1 | grep 'Serial Number' \
-  | awk '{ print $NF }')
-  # Loop through and get S/N according to smartctl given drive name
-  for dl in $(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' \
-  | awk '{ print $NF }')
-  do
-    smart_serial=$(sudo smartctl -a /dev/$dl | grep "Serial number" \
-    | awk '{ print $NF }')
-    # If cli64 and smartctl find a S/N match, return drive letter
-    if [ "$areca_serial" = "$smart_serial" ]
-    then
-      echo "($dl)"
-    fi
-  done
-}
-
-# Tally missing and failed drives connected to Areca RAID
-areca_failed ()
-{
-  # Store output of cli64 to reduce repeated executions
-  cli64out=$(sudo cli64 disk info | grep -E "Slot#[[:digit:]]")
-  # Missing (N.A.) drives
-  for drive in $(echo "$cli64out" | grep -E "Slot#[[:digit:]]" \
-  | grep "N.A." | awk '{ print $1 }')
-  do
-    messages+=("Drive $drive is missing")
-    let "failingdrives+=1"
-    rc=2
-  done
-  # Hard failed drives
-  for drive in $(echo "$cli64out" | grep -E "Slot#[[:digit:]]" \
-  | grep 'Failed' | awk '{ print $1 }')
-  do
-    messages+=("Drive $drive failed")
-    let "failingdrives+=1"
-    rc=2
-  done
-}
-
-# Standard SATA/SAS drive smartctl check
-normal_smart ()
-{
-  # The grep regex will include drives named sdaa, for example
-  numdrives=$(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' | wc -l)
-  for l in $(cat /proc/partitions | grep -w 'sd[a-z]\|sd[a-z]\{2\}' \
-  | awk '{ print $NF }')
-  do
-    failed=false
-    output=$(sudo smartctl -a /dev/$l | grep -E "^  "5"|^"197"|^"198"" \
-    | awk '{ print $NF }' | tr '\n' ' ')
-    outputcount=$(echo $output | wc -w)
-    # Check if drive is SSD and set var accordingly
-    if sudo smartctl -i /dev/$l | grep -q 'Solid State Device'; then
-      is_ssd=true
-    else
-      is_ssd=false
-    fi
-    # Only continue if we received 3 SMART data points and drive is not SSD
-    if [ "$outputcount" = "3" ] && [ "$is_ssd" = false ]
-    then
-      read reallocated pending uncorrect <<< $output
-      if [ "$reallocated" != "0" ]
-      then
-        messages+=("Drive $l has $reallocated reallocated sectors")
-        failed=true
-        # A small number of reallocated sectors is OK
-        if [ "$reallocated" -le 5 ]
-        then
-          rc=1 # Warn if <= 5
-        else
-          rc=2 # Crit if >5
-        fi
-      fi
-      if [ "$pending" != "0" ]
-      then
-        messages+=("Drive $l has $pending pending sectors")
-        failed=true
-        rc=2
-      fi
-      if [ "$uncorrect" != "0" ]
-      then
-        messages+=("Drive $l has $uncorrect uncorrect sectors")
-        failed=true
-        rc=2
-      fi
-    elif [ "$outputcount" != "3" ] && [ "$is_ssd" = false ]
-    then
-      messages+=("Drive $l returned $outputcount of 3 expected attributes")
-      unknownmsg="SMART data could not be read for one or more drives"
-      rc=3
-    # Set no return code and assume any SSD is healthy for now
-    elif [ "$is_ssd" = true ]
-    then
-      messages+=("Drive $l is an SSD.  Not yet supported.")
-      rc=0
-    else
-      messages+=("Error processing data for drive $l")
-      rc=3
-    fi
-    # Make sure drives with multiple types of bad sectors only get counted once
-    if [ "$failed" = true ]
-    then
-      let "failingdrives+=1"
-    fi
-  done
-}
-
-## Call main() function
-main
diff --git a/roles/testnode/files/nagios/nrpe.te b/roles/testnode/files/nagios/nrpe.te
deleted file mode 100644 (file)
index c7bc886..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-module nrpe 1.0;
-
-require {
-       type fsadm_exec_t;
-       type nrpe_t;
-       type hwdata_t;
-       class file { read getattr open };
-}
-
-#============= nrpe_t ==============
-allow nrpe_t fsadm_exec_t:file getattr;
-allow nrpe_t hwdata_t:file { read getattr open };
diff --git a/roles/testnode/files/sbin/cli64 b/roles/testnode/files/sbin/cli64
deleted file mode 100644 (file)
index 7ef82de..0000000
Binary files a/roles/testnode/files/sbin/cli64 and /dev/null differ
diff --git a/roles/testnode/files/sbin/megacli b/roles/testnode/files/sbin/megacli
deleted file mode 100755 (executable)
index 50bf00b..0000000
Binary files a/roles/testnode/files/sbin/megacli and /dev/null differ
index a87e9106f39ae69a118ab795aa9b587d6dbe6628..e3a24c6d301b1a2c2dec2936fe81f8e17e2dba46 100644 (file)
@@ -25,8 +25,3 @@
   service:
     name: cron
     state: restarted
-
-- name: restart nagios-nrpe-server
-  service:
-    name: "{{ nrpe_service_name }}"
-    state: restarted
diff --git a/roles/testnode/tasks/disk_monitoring.yml b/roles/testnode/tasks/disk_monitoring.yml
deleted file mode 100644 (file)
index 2d06a17..0000000
+++ /dev/null
@@ -1,33 +0,0 @@
----
-# We use these scripts to check to see if any of our test nodes have bad disks
-
-- name: Upload megacli and cli64 for raid monitoring and smart.pl to /usr/sbin/.
-  copy:
-    src: "../files/sbin/{{ item }}"
-    dest: "/usr/sbin/{{ item }}"
-    owner: root
-    group: root
-    mode: 0755
-  with_items:
-    - megacli
-    - cli64
-
-- name: Create /usr/libexec.
-  file:
-    path: /usr/libexec
-    owner: root
-    group: root
-    mode: 0755
-    state: directory
-
-- name: Upload custom netsaint scripts for raid/disk/smart/monitoring to /usr/libexec/.
-  copy:
-    src: "../files/libexec/{{ item }}"
-    dest: "/usr/libexec/{{ item }}"
-    owner: root
-    group: root
-    mode: 0755
-  with_items:
-    - smart.sh
-    - raid.pl
-    - diskusage.pl
index e04368eb0080aa492aa66c00387706243300a75e..033d386763d4c76a7bb2e593480e33da31c8b5d5 100644 (file)
   tags:
     - cpan
 
-# upload custom disk monitoring scripts
-- include: disk_monitoring.yml
-  tags:
-    - monitoring-scripts
-
-# configure nagios
-- include: nagios.yml
-  tags:
-    - nagios
-
-- name: Get SELinux status
-  command: getenforce
-  register: selinux_status
-  when: ansible_pkg_mgr == "yum"
-  tags:
-    - nagios
-
-# configure selinux for nagios
-- include: nrpe-selinux.yml
-  when: selinux_status is defined and selinux_status.stdout != "Disabled"
-  tags:
-    - nagios
-
 # configure ntp
 - include: ntp.yml
   tags:
diff --git a/roles/testnode/tasks/nagios.yml b/roles/testnode/tasks/nagios.yml
deleted file mode 100644 (file)
index 259a229..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
----
-- name: Upload nagios sudoers.d for raid utilities.
-  template:
-    src: nagios/90-nagios
-    dest: /etc/sudoers.d/90-nagios
-    owner: root
-    group: root
-    mode: 0440
-    validate: visudo -cf %s
-
-- name: Configure nagios nrpe settings (Ubuntu)
-  lineinfile:
-    dest: /etc/default/{{ nrpe_service_name }}
-    regexp: "^DAEMON_OPTS"
-    line: "DAEMON_OPTS=\"--no-ssl\""
-  when: ansible_pkg_mgr == "apt"
-
-- name: Configure nagios nrpe settings (RHEL/CentOS)
-  lineinfile:
-    dest: /etc/sysconfig/{{ nrpe_service_name }}
-    regexp: "^NRPE_SSL_OPT"
-    line: "NRPE_SSL_OPT=\"-n\""
-  when: ansible_pkg_mgr == "yum"
-
-- name: Check firewalld status
-  command: systemctl status firewalld
-  register: firewalld
-  ignore_errors: true
-  no_log: true
-  when: ansible_pkg_mgr == "yum"
-
-- name: Open nrpe port if firewalld enabled
-  firewalld:
-    port: 5666/tcp
-    state: enabled
-    permanent: yes
-    immediate: yes
-  when: ansible_pkg_mgr == "yum" and (firewalld is defined and firewalld.stdout.find('running') != -1)
-
-- name: Upload nagios nrpe config.
-  template:
-    src: nagios/nrpe.cfg 
-    dest: /etc/nagios/nrpe.cfg
-    owner: root
-    group: root
-    mode: 0644
-  notify:
-    - restart nagios-nrpe-server
-
-- name: Make sure nagios nrpe service is running.
-  service:
-    name: "{{ nrpe_service_name }}"
-    enabled: yes
-    state: started
diff --git a/roles/testnode/tasks/nrpe-selinux.yml b/roles/testnode/tasks/nrpe-selinux.yml
deleted file mode 100644 (file)
index 877aa2e..0000000
+++ /dev/null
@@ -1,44 +0,0 @@
----
-- name: nrpe - Install semanage python bindings
-  yum:
-    pkg: libsemanage-python
-    state: installed
-
-- name: nrpe - Install SELinux tools
-  yum:
-    pkg: policycoreutils-python
-    state: installed
-
-- name: nrpe - Ensure SELinux policy is up to date
-  yum:
-    pkg: selinux-policy-targeted
-    state: latest
-
-- name: nrpe - Set SELinux boolean nagios_run_sudo true
-  seboolean:
-    name: nagios_run_sudo
-    state: yes
-    persistent: yes
-
-- name: nrpe - Remove SELinux policy package
-  command: semodule -r nrpe
-  failed_when: false
-
-- name: nrpe - Copy SELinux type enforcement file
-  copy:
-    src: nagios/nrpe.te
-    dest: /tmp/nrpe.te
-
-- name: nrpe - Compile SELinux module file
-  command: checkmodule -M -m -o /tmp/nrpe.mod /tmp/nrpe.te
-
-- name: nrpe - Build SELinux policy package
-  command: semodule_package -o /tmp/nrpe.pp -m /tmp/nrpe.mod
-
-- name: nrpe - Load SELinux policy package
-  command: semodule -i /tmp/nrpe.pp
-
-- name: nrpe - Remove temporary files
-  file:
-    path: /tmp/nrpe.*
-    state: absent
diff --git a/roles/testnode/templates/nagios/90-nagios b/roles/testnode/templates/nagios/90-nagios
deleted file mode 100644 (file)
index 34326fb..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-## {{ ansible_managed }}
-{{ nrpe_user }} ALL=NOPASSWD: /usr/sbin/megacli, /usr/sbin/cli64, /usr/sbin/smartctl, /usr/sbin/smartctl
diff --git a/roles/testnode/templates/nagios/nrpe.cfg b/roles/testnode/templates/nagios/nrpe.cfg
deleted file mode 100644 (file)
index 84435c8..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-# {{ ansible_managed }}
-log_facility=daemon
-pid_file=/var/run/nagios/nrpe.pid
-server_port=5666
-nrpe_user={{ nrpe_user }}
-nrpe_group={{ nrpe_group }}
-
-# These should eventually be in a secrets group_var
-# 172. address is sepia nagios server
-# 10. address is octo nagios server
-allowed_hosts=127.0.0.1,172.21.0.33,10.8.0.8
-dont_blame_nrpe=0
-debug=0
-command_timeout=60
-connection_timeout=300
-
-command[check_users]={{ nagios_plugins_directory }}/check_users --warning=5 --critical=10
-command[check_load]={{ nagios_plugins_directory }}/check_load --percpu --warning=1.5,1.4,1.3 --critical=2.0,1.9,1.8
-command[check_hda1]={{ nagios_plugins_directory }}/check_disk --warning=20% --critical=10% --partition=/dev/hda1
-command[check_root]={{ nagios_plugins_directory }}/check_disk --warning=10% --critical=5% --units=GB --path=/
-command[check_zombie_procs]={{ nagios_plugins_directory }}/check_procs --warning=5 --critical=10 --state=Z
-command[check_total_procs]={{ nagios_plugins_directory }}/check_procs --warning=300 --critical=500
-command[check_raid]=/usr/libexec/raid.pl
-command[check_disks]=/usr/libexec/diskusage.pl 90 95
-command[check_smart]=/usr/libexec/smart.sh
-
-include=/etc/nagios/nrpe_local.cfg
-
-include_dir=/etc/nagios/nrpe.d/
index 15a12252629ddfd84b692208b6a4922e0d7dcdbf..d03a88869f2077e05c87f858bf5a9dcc56a76eb9 100644 (file)
@@ -2,10 +2,6 @@
 ntp_service_name: ntp
 ssh_service_name: ssh
 nfs_service: nfs-kernel-server
-nrpe_service_name: nagios-nrpe-server
-nrpe_user: nagios
-nrpe_group: nagios
-nagios_plugins_directory: /usr/lib/nagios/plugins
 
 ceph_packages_to_remove:
   - ceph
index 596eaac9640dcd0a42c4d6e26e26c57b9fd3fd29..a94e7af13c5c2888ce5169c2f12d08554b5368a3 100644 (file)
@@ -99,8 +99,6 @@ packages:
   # for java bindings, hadoop, etc.
   - java-1.7.0-openjdk-devel
   - junit4
-  # for disk/etc monitoring
-  - smartmontools
   # for nfs
   - nfs-utils
 
@@ -121,6 +119,3 @@ epel_packages:
   - python-virtualenv
   # for setting BIOS settings
   - smbios-utils
-  # for nagios monitoring
-  - nrpe
-  - nagios-plugins-all
index 9f169a96e3c56b70abd20b4b2ef6e90dfc63e1a1..8e33ab99afda69d90b75b22facfb3c2786bfc5cf 100644 (file)
@@ -83,8 +83,6 @@ packages:
   # for java bindings, hadoop, etc.
   - java-1.6.0-openjdk-devel
   - junit4
-  # for disk/etc monitoring
-  - smartmontools
   # for nfs
   - nfs-utils
   # for xfstests
@@ -107,6 +105,3 @@ epel_packages:
   - bonnie++
   # for json_xs to investigate JSON by hand
   - perl-JSON-XS
-  # for nagios monitoring
-  - nrpe
-  - nagios-plugins-all
index 568dd42326f4e40af4a183f9498708e59656353d..a881517432b1e3e5928194d5a346e64a926eecf4 100644 (file)
@@ -84,10 +84,6 @@ packages:
   - default-jdk
   - junit4
   ###
-  # for disk/etc monitoring
-  - smartmontools
-  - nagios-nrpe-server
-  ###
   # for samba testing
   - cifs-utils
   ###
index 16edf7f450e7a9ce3bd16dee5a782485ec4710e4..9038c2823785807399672f7d105fcea71bf62239 100644 (file)
@@ -77,10 +77,6 @@ packages:
   - default-jdk
   - junit4
   ###
-  # for disk/etc monitoring
-  - smartmontools
-  - nagios-nrpe-server
-  ###
   # for samba testing
   - cifs-utils
   ###
index 2c911b6fc7e303d9419b88ea46d176aab975669c..31170f422eef0c7f6c3f23220e200e11455b1ede 100644 (file)
@@ -68,10 +68,6 @@ packages:
   # for java bindings, hadoop, etc.
   - java-1.8.0-openjdk-devel
   - junit
-  # for disk/etc monitoring
-  - nrpe
-  - nagios-plugins-all
-  - smartmontools
   # for nfs
   - nfs-utils
   # python-pip is installed via roles/testnode/tasks/pip.yml on other rpm-based distros
index e274d36cf6135849be748f2a06b58da286d8303b..422b8eb388c0e61962fecccff7ead7f491745342 100644 (file)
@@ -86,8 +86,6 @@ packages:
   # for java bindings, hadoop, etc.
   - java-1.6.0-openjdk-devel
   - junit4
-  # for disk/etc monitoring
-  - smartmontools
   # for nfs
   - nfs-utils
 
@@ -108,8 +106,5 @@ epel_packages:
   - python-virtualenv
   # for setting BIOS settings
   - smbios-utils
-  # for nagios monitoring
-  - nrpe
-  - nagios-plugins-all
 
 nfs_service: nfs
index a7bd3c7dd08d1247f74427e867aea9f42eefbba5..1cb04f6197d2f3fd79589f4fa039fb1cb26b8e4f 100644 (file)
@@ -67,7 +67,6 @@ packages:
   - perl-XML-Twig
   - java-1.6.0-openjdk-devel
   - junit4
-  - smartmontools
   - nfs-utils
   # for xfstests
   - ncurses-devel
@@ -86,8 +85,5 @@ epel_packages:
   - perl-JSON-XS
   - leveldb
   - xmlstarlet
-  # for nagios monitoring
-  - nrpe
-  - nagios-plugins-all
 
 nfs_service: nfs-server
index 52d34784e52c9ce3bc63d965bec3200984d5000d..31cb4b824732b968d6eaa904937dd8d221e26538 100644 (file)
@@ -83,10 +83,6 @@ common_packages:
   - tgt
   - open-iscsi
   ###
-  # for disk/etc monitoring
-  - smartmontools
-  - nagios-nrpe-server
-  ###
   # for samba testing
   - cifs-utils
   # for Static IP
index e6652a60251c8849814e1e27248df0103b00dd44..433bdf53fff63705ac438d75bd66126c7e21b96a 100644 (file)
@@ -1,10 +1,6 @@
 ---
 ntp_service_name: ntpd
 ssh_service_name: sshd
-nrpe_service_name: nrpe
-nrpe_user: nrpe
-nrpe_group: nrpe
-nagios_plugins_directory: /usr/lib64/nagios/plugins
 
 # ceph packages that we ensure do not exist
 ceph_packages_to_remove: