diff --git a/check_mdstat.sh b/check_mdstat.sh index 597661a..467d661 100755 --- a/check_mdstat.sh +++ b/check_mdstat.sh @@ -46,6 +46,7 @@ function logLine { echo " : $@" fi } +NL=$'\n' CURRENT_RAID= # string function setCurrentRaid { @@ -64,7 +65,8 @@ function setCurrentRaid { STATE= # "active", "started", "inactive" LEVEL= # int -DEVICES= # strings separated by whitespace +declare -A DEVICES # dict of strings. key = index, value = dev name +declare -A FAILED_DEVICES function setStateAndDevicesAndLevel { # This functions is called for the line that starts a new RAID device by setCurrentRaid, with the complete line in $1 # It parses the STATE, the LEVEL and the DEVICES @@ -78,19 +80,26 @@ function setStateAndDevicesAndLevel { LEVEL="${LEVEL:4}" # Remove the raid prefix for dev1 in $x3; do - local dev2="${dev1%[*}" - if [ -n "$DEVICES" ]; then DEVICES="$DEVICES "; fi - DEVICES="${DEVICES}$dev2" + local devName="${dev1%[*}" # device name + local dev2="${dev1#*[}" # all after device name + local devIndex="${dev2%]*}" # device index + local dev3="${dev2#*]}" # all after device index + local devFailed=false + DEVICES[$devIndex]="$devName" + FAILED_DEVICES[$devIndex]=false + if [ -n "$dev3" ]; then + FAILED_DEVICES[$devIndex]=true + fi done log "STATE = $STATE" log "LEVEL = $LEVEL" - log "DEVICES = $DEVICES" + log "DEVICES = ${DEVICES[@]}" } -SIZE_IN_BLOCKS="!" # int -NUM_DEVICES=0 # int with number of devices in raid -BAD_DEVICES=0 # int with number of "_" in "[UUU_UU]" +SIZE_IN_BLOCKS="!" # int +NUM_DEVICES=0 # int with total number of devices in raid +NUM_ACTIVE_DEVICES=0 # int with number of active devices in raid function parseConfigStatusLine { # This function is called for 1st line after the raid definition line # It parses the SIZE_IN_BLOCKS @@ -98,16 +107,17 @@ function parseConfigStatusLine { line=$(echo $line) # trim left SIZE_IN_BLOCKS=${line%% *} - local lastWord=${line##* } # Get the last word - lastWord=${lastWord:1:-1} # Remove first and last char - NUM_DEVICES=${#lastWord} # str length = num devices - lastWord=${lastWord//U/} # Remove all U, so only _ remain - BAD_DEVICES=${#lastWord} # str length = num bad devices + local x1=${line##* } # Get the last word [UU_] + local x2=${line% *} # Remove the last word + local x3=${x2##* } # Get the last word [3/2] + x3=${x3:1:-1} + NUM_DEVICES=${x3%/*} + NUM_ACTIVE_DEVICES=${x3#*/} log "SIZE_IN_BLOCKS = $SIZE_IN_BLOCKS" log "NUM_DEVICES = $NUM_DEVICES" - log "BAD_DEVICES = $BAD_DEVICES" + log "NUM_ACTIVE_DEVICES = $NUM_ACTIVE_DEVICES" # TODO } @@ -150,6 +160,14 @@ function printOutputAndExit { local result="OK" local info="" + local numFailedDevices=0 + local numSpareDevices=0 + + if [ $NUM_ACTIVE_DEVICES -lt $NUM_DEVICES ]; then + result="CRITICAL" + if [ -n "$info" ]; then info="$info. "; fi + info="${info}Missing $(($NUM_DEVICES - $NUM_ACTIVE_DEVICES)) of $NUM_DEVICES devices" + fi if [ -n "$RECOVERY_PROGRESS" ]; then result="WARNING" @@ -157,18 +175,26 @@ function printOutputAndExit { info="${info}Recovering: progress=$RECOVERY_PROGRESS% $RECOVERY_INFO" fi + local devIndices="$(printf "%s\n" "${!DEVICES[@]}" | sort -n)" + for devIndex in $devIndices; do + if [ -z "${DEVICES[$devIndex]}" ]; then + ((numSpareDevies++)) + continue + fi + if ${FAILED_DEVICES[$devIndex]}; then + result="CRITICAL" + ((numFailedDevices++)) + if [ -n "$info" ]; then info="$info. "; fi + info="${info}Device ${DEVICES[$devIndex]} failed" + fi + done + if [ "$STATE" != "active" -a "$STATE" != "started" ]; then result="CRITICAL" if [ -n "$info" ]; then info="$info. "; fi info="${info}State is $STATE" fi - if [ $BAD_DEVICES -gt 0 ]; then - result="CRITICAL" - if [ -n "$info" ]; then info="$info. "; fi - info="${info}Missing $BAD_DEVICES of $NUM_DEVICES devices" - fi - echo -n "$result" if [ -n "$info" ]; then echo -n ": $info" @@ -178,10 +204,23 @@ function printOutputAndExit { echo -n " 'raid level'=$LEVEL" echo -n " 'size in blocks'=$SIZE_IN_BLOCKS" echo -n " 'num devices'=$NUM_DEVICES" - echo -n " 'num bad devices'=$BAD_DEVICES;;1;0;$NUM_DEVICES" + echo -n " 'num failed devices'=$numFailedDevices;;1;0;$NUM_DEVICES" + echo -n " 'num spare devices'=$numSpareDevices;;;0;$NUM_DEVICES" if [ -n "$RECOVERY_PROGRESS" ]; then echo -n " 'recovery progress'=$RECOVERY_PROGRESS%;0;;0;100" fi + for devIndex in $devIndices; do + if [ -z "${DEVICES[$devIndex]}" ]; then + echo -n " 'device $devIndex: spare'=$devIndex" + continue + fi + if ${FAILED_DEVICES[$devIndex]}; then + echo -n " 'device $devIndex: ${DEVICES[$devIndex]} failed'=$devIndex;;$devIndex'" + continue + fi + echo -n " 'device $devIndex: ${DEVICES[$devIndex]}'=$devIndex" + done + echo case $result in @@ -205,7 +244,7 @@ while IFS= read -r line; do 1) parseConfigStatusLine "$line";; *) parseBitmapOrRecoveryLine "$line";; esac - HAD_TARGET_RAID=$(($HAD_TARGET_RAID+1)) + ((HAD_TARGET_RAID++)) elif [ $HAD_TARGET_RAID -gt 0 ]; then printOutputAndExit fi diff --git a/test/1missing.output b/test/1missing.output index 808f869..282cb3a 100644 --- a/test/1missing.output +++ b/test/1missing.output @@ -1 +1 @@ -CRITICAL: Missing 1 of 4 devices | 'raid level'=5 'size in blocks'=1465151808 'num devices'=4 'num bad devices'=1;;1;0;4 +CRITICAL: Missing 1 of 4 devices | 'raid level'=5 'size in blocks'=1465151808 'num devices'=4 'num failed devices'=0;;1;0;4 'num spare devices'=0;;;0;4 'device 0: sda1'=0 'device 1: sdb1'=1 'device 2: sdd1'=2 diff --git a/test/OK1.output b/test/OK1.output index 4b618cd..8c89504 100644 --- a/test/OK1.output +++ b/test/OK1.output @@ -1 +1 @@ -OK | 'raid level'=1 'size in blocks'=976430080 'num devices'=2 'num bad devices'=0;;1;0;2 +OK | 'raid level'=1 'size in blocks'=976430080 'num devices'=2 'num failed devices'=0;;1;0;2 'num spare devices'=0;;;0;2 'device 0: sda1'=0 'device 1: sdb1'=1 diff --git a/test/OK2.output b/test/OK2.output index ebe7842..f033258 100644 --- a/test/OK2.output +++ b/test/OK2.output @@ -1 +1 @@ -OK | 'raid level'=5 'size in blocks'=1250241792 'num devices'=5 'num bad devices'=0;;1;0;5 +OK | 'raid level'=5 'size in blocks'=1250241792 'num devices'=5 'num failed devices'=0;;1;0;5 'num spare devices'=0;;;0;5 'device 0: sde1'=0 'device 1: sdc1'=1 'device 2: sdd1'=2 'device 4: sdf1'=4 'device 5: sdb1'=5 diff --git a/test/OK3.output b/test/OK3.output index f4b32ff..8607af1 100644 --- a/test/OK3.output +++ b/test/OK3.output @@ -1 +1 @@ -OK | 'raid level'=5 'size in blocks'=1318680576 'num devices'=10 'num bad devices'=0;;1;0;10 +OK | 'raid level'=5 'size in blocks'=1318680576 'num devices'=10 'num failed devices'=0;;1;0;10 'num spare devices'=0;;;0;10 'device 0: sdc1'=0 'device 1: sdd1'=1 'device 2: sde1'=2 'device 3: sdf1'=3 'device 4: sdg1'=4 'device 5: sdh1'=5 'device 6: sdi1'=6 'device 7: sdj1'=7 'device 8: sdk1'=8 'device 9: sdl1'=9 diff --git a/test/bitmap.output b/test/bitmap.output index 3c4e3a2..83c86f1 100644 --- a/test/bitmap.output +++ b/test/bitmap.output @@ -1 +1 @@ -OK | 'raid level'=6 'size in blocks'=1225557760 'num devices'=7 'num bad devices'=0;;1;0;7 +OK | 'raid level'=6 'size in blocks'=1225557760 'num devices'=7 'num failed devices'=0;;1;0;7 'num spare devices'=0;;;0;7 'device 0: sdf1'=0 'device 1: sde1'=1 'device 2: sdd1'=2 'device 3: sdc1'=3 'device 4: sdb1'=4 'device 5: sda1'=5 'device 6: hdb1'=6 diff --git a/test/recovery.output b/test/recovery.output index ac8b429..44c3596 100644 --- a/test/recovery.output +++ b/test/recovery.output @@ -1 +1 @@ -WARNING: Recovering: progress=12.6% finish=127.5min speed=33440K/sec | 'raid level'=5 'size in blocks'=1464725760 'num devices'=6 'num bad devices'=0;;1;0;6 'recovery progress'=12.6%;0;;0;100 +WARNING: Missing 1 of 6 devices. Recovering: progress=12.6% finish=127.5min speed=33440K/sec | 'raid level'=5 'size in blocks'=1464725760 'num devices'=6 'num failed devices'=0;;1;0;6 'num spare devices'=0;;;0;6 'recovery progress'=12.6%;0;;0;100 'device 0: sdc1'=0 'device 1: sdd1'=1 'device 2: sde1'=2 'device 3: sdf1'=3 'device 4: sdg1'=4 'device 6: sdh1'=6