Implemented check script
This commit is contained in:
		
							parent
							
								
									ba70f3b737
								
							
						
					
					
						commit
						39214f2907
					
				
					 1 changed files with 193 additions and 36 deletions
				
			
		
							
								
								
									
										229
									
								
								check_mdstat.sh
									
									
									
									
									
								
							
							
						
						
									
										229
									
								
								check_mdstat.sh
									
									
									
									
									
								
							| 
						 | 
					@ -1,47 +1,204 @@
 | 
				
			||||||
#!/bin/bash
 | 
					#!/bin/bash
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
# Created by Sebastian Grewe, Jammicron Technology
 | 
					# Created by Sebastian Grewe, Jammicron Technology
 | 
				
			||||||
# Changes By Jasem Elayeb on 02.03.2016
 | 
					# Changes by Jasem Elayeb on 02.03.2016
 | 
				
			||||||
 | 
					# Changes by Jonny007-MKD on 06.02.2020
 | 
				
			||||||
# JE: add Physical Disks Name RAID_DISKS
 | 
					# JE: add Physical Disks Name RAID_DISKS
 | 
				
			||||||
# JE: add Physical Disks Status DISKS_STATUS
 | 
					# JE: add Physical Disks Status DISKS_STATUS
 | 
				
			||||||
# JE: add Array Names RAID_ARRAY
 | 
					# JE: add Array Names RAID_ARRAY
 | 
				
			||||||
 | 
					# J007: Only check a single array
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Get count of raid arrays
 | 
					 | 
				
			||||||
RAID_DEVICES=`grep ^md -c /proc/mdstat`
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Get count of degraded arrays
 | 
					if [ $# -lt 1 -o -z "$1" ]; then
 | 
				
			||||||
RAID_STATUS=`grep "\[.*_.*\]" /proc/mdstat -c`
 | 
						echo "ERROR: pass raid name as argument"
 | 
				
			||||||
 | 
						exit 1;
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					while [[ $# -gt 0 ]]; do
 | 
				
			||||||
 | 
						case $1 in
 | 
				
			||||||
 | 
							-r|--raid)	TARGET_RAID_NAME="$2";;
 | 
				
			||||||
 | 
							-?|--help)
 | 
				
			||||||
 | 
								echo "Check /proc/mdstat. Arguments:"
 | 
				
			||||||
 | 
								echo "--raid NAME: Raid name, e.g. md0"
 | 
				
			||||||
 | 
								;;
 | 
				
			||||||
 | 
						esac
 | 
				
			||||||
 | 
						shift
 | 
				
			||||||
 | 
					done
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Is an array currently recovering, get percentage of recovery
 | 
					if [ -z "$TARGET_RAID_NAME" ]; then
 | 
				
			||||||
RAID_RECOVER=`grep recovery /proc/mdstat | awk '{print $4}'`
 | 
						echo "ERROR: pass --raid as argument"
 | 
				
			||||||
 | 
						exit 255
 | 
				
			||||||
# Is an array currently resyncing, get percentage of resync
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
RAID_RESYNC=`grep resync /proc/mdstat | awk '{print $4}'`
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
RAID_ARRAY=`awk '/md[1-9]/{for (i=1;i<=NF;++i) if ($i~/md[1-2]/)print $i}' /proc/mdstat |xargs`
 | 
					 | 
				
			||||||
RAID_DISKS=`awk '/sd[a-z]/{for (i=1;i<=NF;++i) if ($i~/sd[a-z]/)print $i}' /proc/mdstat |xargs`
 | 
					 | 
				
			||||||
DISKS_STATUS=`grep algorithm  /proc/mdstat|awk '{print $12}'`
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
# Check raid status
 | 
					 | 
				
			||||||
# RAID recovers --> Warning
 | 
					 | 
				
			||||||
if [[ $RAID_RECOVER ]]; then
 | 
					 | 
				
			||||||
        STATUS="WARNING - Checked $RAID_DEVICES arrays $RAID_ARRAY, recovering : $RAID_RECOVER"
 | 
					 | 
				
			||||||
        EXIT=1
 | 
					 | 
				
			||||||
# RAID resync --> Warning
 | 
					 | 
				
			||||||
elif [[ $RAID_RESYNC ]]; then
 | 
					 | 
				
			||||||
        STATUS="WARNING - Checked $RAID_DEVICES arrays $RAID_ARRAY., resyncing : $RAID_RESYNC"
 | 
					 | 
				
			||||||
        EXIT=1
 | 
					 | 
				
			||||||
# RAID ok
 | 
					 | 
				
			||||||
elif [[ $RAID_STATUS  == "0" ]]; then
 | 
					 | 
				
			||||||
        STATUS="OK - Checked $RAID_DEVICES arrays $RAID_ARRAY."
 | 
					 | 
				
			||||||
        EXIT=0
 | 
					 | 
				
			||||||
# All else critical, better save than sorry
 | 
					 | 
				
			||||||
else
 | 
					 | 
				
			||||||
        STATUS="CRITICAL - Checked $RAID_DEVICES arrays $RAID_ARRAY, $RAID_STATUS have FAILED"
 | 
					 | 
				
			||||||
        EXIT=2
 | 
					 | 
				
			||||||
fi
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Status and quit
 | 
					function log {
 | 
				
			||||||
echo -e "$STATUS \n Physical Disks: $RAID_DISKS Disks Status: $DISKS_STATUS "
 | 
						local x=
 | 
				
			||||||
exit $EXIT
 | 
						#echo "  > $@"
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					CURRENT_RAID=	# string
 | 
				
			||||||
 | 
					function setCurrentRaid {
 | 
				
			||||||
 | 
						# This function is called for each line in $1
 | 
				
			||||||
 | 
						# If the line does not start with a whitespace, it denotes a new RAID device and CURRENT_RAID is set
 | 
				
			||||||
 | 
						# Also, the state and the devices are parsed 
 | 
				
			||||||
 | 
						local line="$1"
 | 
				
			||||||
 | 
						if [[ $line =~ ^[^\s].+:.+ ]]; then
 | 
				
			||||||
 | 
							local x1="${line%:*}" # Remove : suffix
 | 
				
			||||||
 | 
							local x2="${x1% *}"   # Trim right
 | 
				
			||||||
 | 
							CURRENT_RAID="$x2"
 | 
				
			||||||
 | 
							log "SET CURRENT RAID"
 | 
				
			||||||
 | 
						fi
 | 
				
			||||||
 | 
						log "CURRENT RAID: $CURRENT_RAID"
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					STATE=		# "active", "started", "inactive"
 | 
				
			||||||
 | 
					LEVEL=		# int
 | 
				
			||||||
 | 
					DEVICES=	# strings separated by whitespace
 | 
				
			||||||
 | 
					function setStateAndDevicesAndLevel {
 | 
				
			||||||
 | 
						# This functions is called for the line that starts a new RAID device by setCurrentRaid, with the complete line in $1
 | 
				
			||||||
 | 
						# It parses the STATE, the LEVEL and the DEVICES
 | 
				
			||||||
 | 
						local line="$1"
 | 
				
			||||||
 | 
						local x1="${line#*: }" 				# Remove : prefix
 | 
				
			||||||
 | 
						STATE="${x1%% *}" 					# Only the first word
 | 
				
			||||||
 | 
						local x2="${x1:$((${#STATE}+1))}"	# Remove the first word
 | 
				
			||||||
 | 
						LEVEL="${x2%% *}" 					# Only the first word
 | 
				
			||||||
 | 
						local x3="${x2:$((${#LEVEL}+1))}"	# Remove the first word
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						LEVEL="${LEVEL:4}"					# Remove the raid prefix
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						for dev1 in $x3; do
 | 
				
			||||||
 | 
							local dev2="${dev1%[*}"
 | 
				
			||||||
 | 
							if [ -n "$DEVICES" ]; then DEVICES="$DEVICES "; fi
 | 
				
			||||||
 | 
							DEVICES="${DEVICES}$dev2"
 | 
				
			||||||
 | 
						done
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						log "STATE = $STATE"
 | 
				
			||||||
 | 
						log "LEVEL = $LEVEL"
 | 
				
			||||||
 | 
						log "DEVICES = $DEVICES"
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					SIZE_IN_BLOCKS="!"	# int
 | 
				
			||||||
 | 
					NUM_DEVICES=0		# int with number of devices in raid
 | 
				
			||||||
 | 
					BAD_DEVICES=0		# int with number of "_" in "[UUU_UU]"
 | 
				
			||||||
 | 
					function parseConfigStatusLine {
 | 
				
			||||||
 | 
						# This function is called for 1st line after the raid definition line
 | 
				
			||||||
 | 
						# It parses the SIZE_IN_BLOCKS
 | 
				
			||||||
 | 
						local line="$1"
 | 
				
			||||||
 | 
						line="${line:6}"			# trim left
 | 
				
			||||||
 | 
						SIZE_IN_BLOCKS=${line%% *}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						local lastWord=${line##* }	# Get the last word
 | 
				
			||||||
 | 
						lastWord=${lastWord:1:-1}	# Remove first and last char
 | 
				
			||||||
 | 
						NUM_DEVICES=${#lastWord}	# str length = num devices
 | 
				
			||||||
 | 
						lastWord=${lastWord//U/}	# Remove all U, so only _ remain
 | 
				
			||||||
 | 
						BAD_DEVICES=${#lastWord}	# str length = num bad devices
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						log "SIZE_IN_BLOCKS = $SIZE_IN_BLOCKS"
 | 
				
			||||||
 | 
						log "NUM_DEVICES = $NUM_DEVICES"
 | 
				
			||||||
 | 
						log "BAD_DEVICES = $BAD_DEVICES"
 | 
				
			||||||
 | 
						# TODO
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					function parseBitmapOrRecoveryLine {
 | 
				
			||||||
 | 
						# This function is called for the 2nd line after the raid definition line
 | 
				
			||||||
 | 
						# It checks whether it contains a bitmap or a recovery line
 | 
				
			||||||
 | 
						if [[ "$1" == *bitmap* ]]; then
 | 
				
			||||||
 | 
							parseBitmapLine "$1"
 | 
				
			||||||
 | 
						fi
 | 
				
			||||||
 | 
						if [[ "$1" == *recovery* ]]; then
 | 
				
			||||||
 | 
							parseRecoveryLine "$1"
 | 
				
			||||||
 | 
						fi
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					function parseBitmapLine {
 | 
				
			||||||
 | 
						local x
 | 
				
			||||||
 | 
						# TODO
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					RECOVERY_PROGRESS=	# float in percent
 | 
				
			||||||
 | 
					RECOVERY_INFO=		# finish and speed
 | 
				
			||||||
 | 
					function parseRecoveryLine {
 | 
				
			||||||
 | 
						# This function is called for the 2nd or 3rd lineafter the raid definition line
 | 
				
			||||||
 | 
						# It sets the RECOVERY_PROGRESS
 | 
				
			||||||
 | 
						local line="$1"
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						local x1="${line##*recovery = }"
 | 
				
			||||||
 | 
						local x2="${x1%%%*}"
 | 
				
			||||||
 | 
						RECOVERY_PROGRESS="$x2"
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						local x3="${line##*finish=}"
 | 
				
			||||||
 | 
						RECOVERY_INFO="finish=$x3"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						log "RECOVERY_PROGESS = $RECOVERY_PROGRESS"
 | 
				
			||||||
 | 
						log "RECOVERY_INFO = $RECOVERY_INFO"
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					function printOutputAndExit {
 | 
				
			||||||
 | 
						# STATE, LEVEL, DEVICES, SIZE_IN_BLOCKS, NUM_DEVICES, BAD_DEVICES, RECOVERY_PROGRESS, RECOVERY_INFO
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						local result="OK"
 | 
				
			||||||
 | 
						local info=""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if [ -n "$RECOVERY_PROGRESS" ]; then
 | 
				
			||||||
 | 
							result="WARNING"
 | 
				
			||||||
 | 
							if [ -n "$info" ]; then info="$info. "; fi
 | 
				
			||||||
 | 
							info="${info}Recovering: $RECOVERY_PROGRESS $RECOVERY_INFO"
 | 
				
			||||||
 | 
						fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if [ "$STATE" != "active" -a "$STATE" != "started" ]; then
 | 
				
			||||||
 | 
							result="CRITICAL"
 | 
				
			||||||
 | 
							if [ -n "$info" ]; then info="$info. "; fi
 | 
				
			||||||
 | 
							info="${info}State is $STATE"
 | 
				
			||||||
 | 
						fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if [ $BAD_DEVICES -gt 0 ]; then
 | 
				
			||||||
 | 
							result="CRITICAL"
 | 
				
			||||||
 | 
							if [ -n "$info" ]; then info="$info. "; fi
 | 
				
			||||||
 | 
							info="${info}Missing $NUM_DEVICES devices"
 | 
				
			||||||
 | 
						fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						echo -n "$result"
 | 
				
			||||||
 | 
						if [ -n "$info" ]; then
 | 
				
			||||||
 | 
							echo -n ": $info"
 | 
				
			||||||
 | 
						fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						echo -n " |"
 | 
				
			||||||
 | 
						echo -n " 'raid level'=$LEVEL"
 | 
				
			||||||
 | 
						echo -n " 'size in blocks'=$SIZE_IN_BLOCKS"
 | 
				
			||||||
 | 
						echo -n " 'num devices'=$NUM_DEVICES"
 | 
				
			||||||
 | 
						echo -n " 'num bad devices'=$BAD_DEVICES;;1;0;$NUM_DEVICES"
 | 
				
			||||||
 | 
						if [ -n "$RECOVERY_PROGRESS" ]; then
 | 
				
			||||||
 | 
							echo -n " 'recovery progress'=$RECOVERY_PROGRESS%;0.1;;0;100"
 | 
				
			||||||
 | 
						fi
 | 
				
			||||||
 | 
						echo
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						case $result in
 | 
				
			||||||
 | 
							OK)       exit 0;;
 | 
				
			||||||
 | 
							WARNING)  exit 1;;
 | 
				
			||||||
 | 
							CRITICAL) exit 2;;
 | 
				
			||||||
 | 
						esac
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					HAD_TARGET_RAID=0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					while IFS= read -r line; do
 | 
				
			||||||
 | 
					    #echo "  : $line"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						setCurrentRaid "$line"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if [ "$CURRENT_RAID" == "$TARGET_RAID_NAME" ]; then
 | 
				
			||||||
 | 
							case $HAD_TARGET_RAID in
 | 
				
			||||||
 | 
								0) setStateAndDevicesAndLevel "$line";;
 | 
				
			||||||
 | 
								1) parseConfigStatusLine "$line";;
 | 
				
			||||||
 | 
								2) parseBitmapOrRecoveryLine "$line";;
 | 
				
			||||||
 | 
								3) parseRecoveryLine "$line";;
 | 
				
			||||||
 | 
							esac
 | 
				
			||||||
 | 
							HAD_TARGET_RAID=$(($HAD_TARGET_RAID+1))
 | 
				
			||||||
 | 
						elif [ $HAD_TARGET_RAID -gt 0 ]; then
 | 
				
			||||||
 | 
							printOutputAndExit
 | 
				
			||||||
 | 
						fi
 | 
				
			||||||
 | 
					done < /proc/mdstat
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					echo "CRITICAL: RAID $TARGET_RAID_NAME not found in mdstat"
 | 
				
			||||||
 | 
					exit 2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue