IT & + si affinités

Encore un blog de sysadmin … mais pas uniquement ;-)

Script Nagios de monitoring SNMP de cartes MegaRaid

Pour continuer sur le monitoring RAID des cartes LSI MegaRaid, voici un autre script, plus ancien, qui vérifie l’état du RAID par requêtes SNMP et renvoie un code de retour et un message utilisables par Nagios.

 

#!/bin/bash

##################################################################
# Creation: @Markhor75
# Last Modification: 2011/11/28
# This script check snmp status of LSI RAID cards
##################################################################

# Script name as shown in the usage text (kept exported, as before).
name=$(basename -- "$0")
export name

# Print the usage text on stdout.
print_usage() {
   echo "Usage:"
   echo "    $name -h                      Show this help."
   echo "    $name ip address or name"
   echo "    exemple : $name server.lan"
}

case "${1-}" in
   "-h")
   print_usage
   exit 0
  ;;
   "")
   # No target given: this is a usage error. Exit with 3 (the code this
   # plugin uses for "unknown") instead of 0, which Nagios would read as OK.
   print_usage
   exit 3
  ;;
esac

# Target host (name or address) queried over SNMP.
ip=$1
media_error=0
degraded_drives=0
# Worst status seen so far. Encoding used by the rest of the script:
# 0 = OK, 1 = unknown, 2 = warning, 3 = critical.
general_statut=0
detail=""

# Running total of the disks expected across all arrays.
nb_total_expected_disk=0
# Physically present disks.
nb_disk=$(snmpget -v2c -O v -c public "$ip" LSI-MegaRAID-SAS-MIB::pdDiskPresentCount.0 | cut -d ' ' -f2)
# Number of arrays.
nb_array=$(snmpget -v2c -O v -c public "$ip" LSI-MegaRAID-SAS-MIB::vdPresentCount.0 | cut -d ' ' -f2)
# Number of spare disks.
# NOTE(review): this one reads field 9 while the other counters read field 2 - confirm against real agent output.
nb_spare=$(snmpget -v2c -O v -c public "$ip" LSI-MegaRAID-SAS-MIB::spareDevNumber.0 | cut -d ' ' -f9)

# Without numeric counts (agent unreachable, wrong community, missing MIB)
# every later loop would be skipped and the script would end up reporting OK.
# Stop here with the "unknown" exit code instead.
if ! [[ $nb_disk =~ ^[0-9]+$ && $nb_array =~ ^[0-9]+$ ]]; then
        echo "UNKNOWN: no usable SNMP answer from $ip"
        exit 3
fi

# The spare count is only decorative in the final message: fall back to 0.
[[ $nb_spare =~ ^[0-9]+$ ]] || nb_spare=0

#######################################
# Query one array over SNMP and fold the answer into the global result.
# Globals:
#   ip                      (read)    SNMP agent to query
#   general_statut          (updated) worst status so far:
#                                     0 OK, 1 unknown, 2 warning, 3 critical
#   nb_total_expected_disk  (updated) incremented by the array's drive count
#   raid_type, size         (updated) "/"-prefixed lists, one entry per array
# Arguments:
#   $1 - index of the array to inspect
#######################################
function check_array {
        local idx=$1
        local state drives level raw_size

        state=$(snmpget -v2c -O v -c public "$ip" "LSI-MegaRAID-SAS-MIB::state.$idx" | cut -d ' ' -f2)

        # The status only ever escalates: a later healthy array never lowers it.
        case "$state" in
           3)                                   #OK
                ;;
           0|1)                                 #WARNING
                if (( general_statut < 2 )); then
                        general_statut=2
                fi
                ;;
           2)                                   #CRITICAL
                if (( general_statut < 3 )); then
                        general_statut=3
                fi
                ;;
           *)                                   #UNKNOWN (includes an empty answer)
                if (( general_statut < 1 )); then
                        general_statut=1
                fi
                ;;
        esac

        # A missing or non-numeric answer counts as 0 drives rather than
        # breaking the arithmetic below.
        drives=$(snmpget -v2c -O v -c public "$ip" "LSI-MegaRAID-SAS-MIB::arrayNumDrives.$idx" | cut -d ' ' -f2)
        [[ $drives =~ ^[0-9]+$ ]] || drives=0
        nb_total_expected_disk=$(( nb_total_expected_disk + 10#$drives ))

        level=$(snmpget -v2c -O v -c public "$ip" "LSI-MegaRAID-SAS-MIB::prl.$idx" | cut -d ' ' -f2)
        raid_type="${raid_type}/${level}"

        # The reported size is divided by 1024 for display; "?" marks an
        # unreadable value.
        raw_size=$(snmpget -v2c -O v -c public "$ip" "LSI-MegaRAID-SAS-MIB::size.$idx" | cut -d ' ' -f2)
        if [[ $raw_size =~ ^[0-9]+$ ]]; then
                size="${size}/$(( 10#$raw_size / 1024 ))"
        else
                size="${size}/?"
        fi
}

# Inspect every array. check_array fetches the array state itself, so no
# separate state query is issued here.
array_count=${nb_array:-0}
# An empty or non-numeric count means "nothing to walk".
[[ $array_count =~ ^[0-9]+$ ]] || array_count=0

for (( j = 0; j < array_count; j++ )); do
        check_array "$j"
done

# Per-disk pass over every present disk.
#  - When the array pass already found a problem, look at each disk's state
#    and collect the progress of any running operation into $detail.
#  - Otherwise add up the media/other error counters of all disks; a non-zero
#    total turns the result into a warning.
# The array-level status is captured once, before the loop: testing
# general_statut directly would stop the error counting at the first disk
# that has errors, making the total depend on disk order.
array_statut=${general_statut:-0}
disk_count=${nb_disk:-0}
[[ $disk_count =~ ^[0-9]+$ ]] || disk_count=0

for (( i = 0; i < disk_count; i++ )); do
        if (( array_statut != 0 )); then
                # Get the disk state and (if needed) the running operation.
                disk_state[$i]=$(snmpget -v2c -O v -c public "$ip" "LSI-MegaRAID-SAS-MIB::pdState.$i" | cut -d ' ' -f2)

                case "${disk_state[$i]}" in
                        0|2|24)         #OK
                                ;;
                        1|16|17|20)     # states for which the operation progress is reported
                                detail=$detail$(snmpget -v2c -O v -c public "$ip" "LSI-MegaRAID-SAS-MIB::operationProgress.$i" | cut -d ' ' -f2-)
                                ;;
                esac
        else
                # Get minor errors; unreadable counters are ignored.
                for err_oid in mediaErrCount otherErrCount; do
                        err_value=$(snmpget -v2c -O v -c public "$ip" "LSI-MegaRAID-SAS-MIB::${err_oid}.$i" | cut -d ' ' -f2)
                        if [[ $err_value =~ ^[0-9]+$ ]]; then
                                media_error=$(( media_error + 10#$err_value ))
                        fi
                done

                # Any such error changes the global state.
                if (( media_error != 0 )); then
                        general_statut="2"
                        detail='(Errors: '$media_error')'
                fi
        fi
done

# Remove the surplus leading slash from the per-array lists.
raid_type=${raid_type#/}
size=${size#/}

#optimal_drives=$(( nb_total_expected_disk - nb_disk ))

# Summary: RAID levels, sizes, present/expected disk counts.
detail_config="R:$raid_type S:$size Go Optimal Drives $nb_disk/$nb_total_expected_disk"
# Mention spare disks only when the count is a positive number; an empty or
# non-numeric value is simply skipped instead of making the test fail.
if [[ ${nb_spare-} =~ ^[0-9]+$ ]] && (( 10#$nb_spare > 0 )); then
        detail_config="${detail_config}(+${nb_spare})"
fi
detail_config="${detail_config} ${detail}"

# Emit the one-line message and the exit code for Nagios.
# general_statut encoding: 0 OK, 2 warning, 3 critical, anything else unknown.

# Collapse runs of blanks and drop the trailing one, as the former unquoted
# echo did, but without exposing the text to glob expansion.
read -r -a summary_words <<<"$detail_config"
summary="${summary_words[*]}"

case "$general_statut" in
        "0")
        echo "OK: $summary"
        exit 0
        ;;

        "2")
        echo "WARNING: $summary"
        exit 1
        ;;

        "3")
        echo "CRITICAL: $summary"
        exit 2
        ;;

        *)
        # Exit code 3 is the "unknown" state, so label the message
        # accordingly (it used to say WARNING).
        echo "UNKNOWN: Unknown state"
        exit 3
        ;;
esac

Script de monitoring de cartes MegaRaid

Ci-dessous, un petit script personnel qui surveille l’état de votre RAID (une seule grappe gérée ici). Il utilise la commande MegaCli, qui permet de gérer en ligne de commande les cartes MegaRaid de LSI.

Ce script interroge la carte et, en cas de statut dégradé, recherche quel est le disque fautif et envoie un e-mail.

Il est possible de demander une reconstruction du raid avec l’option « run ».

#!/bin/bash

# Check a MegaRAID array with MegaCli and e-mail a report about every physical
# drive that is not online (a single array is handled).
# use "./thisscript" to only check raid state (report send by email)
# use "./thisscript run" to start raid rebuild if needed
action="off"
megacli="/opt/MegaRAID/MegaCli/MegaCli"
mail_dest="monitoring@mycompany.tld"

if [ "$#" = "1" ] && [ "$1" = "run" ]; then
        action="on"
fi

# Read the configuration once; everything below is parsed from this output.
cfg=$("$megacli" -CfgDsply -aALL)

# Get general RAID status: keep only the value that follows "State ... :".
# NOTE(review): a healthy array is expected to report "Optimal" here; "Online"
# (the value originally tested) is accepted too - confirm on the target
# controller. Any other value, or no State line at all, triggers the scan.
raid_states=$(grep '^State' <<<"$cfg" | cut -d ':' -f 2- | tr -d ' \t\r')
healthy="yes"
while IFS= read -r one_state; do
        case "$one_state" in
                Optimal|Online) ;;
                *) healthy="no" ;;
        esac
done <<<"$raid_states"

if [ "$healthy" != "yes" ]; then

        # Get infos from raid config.
        enclosure=$(grep '^Enclosure' <<<"$cfg" | head -n 1 | cut -d ' ' -f 4)
        nb=$(grep 'Number of PDs:' <<<"$cfg" | head -n 1 | cut -d ' ' -f 4)

        if ! [[ $nb =~ ^[0-9]+$ ]]; then
                echo "Cannot read the number of physical drives from $megacli" >&2
                exit 1
        fi

        # Search the disk problem.
        # presumably drive slots are numbered 0..nb-1 - verify on the enclosure
        for (( c = 0; c < nb; c++ )); do
                # Quoted so the shell never treats [E:S] as a glob pattern.
                drive="[$enclosure:$c]"

                state=$("$megacli" -pdInfo "-PhysDrv$drive" -aALL | grep "Firmware state:" | cut -d ' ' -f 3)
                # Tolerate a trailing comma after the first word of the state.
                state=${state%,}

                case "$state" in
                        "Online")
                                ;;

                        "Rebuild")
                                # Keep the progress text in a variable instead
                                # of a fixed file name under /tmp.
                                progress=$("$megacli" -PDRbld -ShowProg -PhysDrv "$drive" -aALL | grep Rebuild)
                                avancee=$(printf '%s\n' "$progress" | head -n 1 | cut -d ' ' -f 11)
                                printf '%s\n' "$progress" |
                                        mail -s "Rebuild status disk slot $c on $HOSTNAME $avancee" "$mail_dest"
                                ;;

                        "Offline")
                                "$megacli" -pdInfo "-PhysDrv$drive" -aALL |
                                        mail -s "Error : Offline disk slot $c on $HOSTNAME" "$mail_dest"
                                # Rebuilding raid on drive or not.
                                if [ "$action" = "on" ]; then
                                        "$megacli" -PDRbld -Start -PhysDrv "$drive" -aALL |
                                                mail -s "Starting rebuild disk slot $c on $HOSTNAME" "$mail_dest"
                                fi
                                ;;

                        *)
                                echo "Reported state : $state" |
                                        mail -s "Unknown state disk slot $c on $HOSTNAME" "$mail_dest"
                                ;;
                esac
        done
fi