Rewritten to use /sys/block/$md/md/

This commit is contained in:
Gitea 2020-11-29 11:31:21 +01:00
parent 83f1367297
commit 78d9357c02
40 changed files with 134 additions and 227 deletions

View File

@ -1,18 +1,17 @@
# check_mdstat.sh
This script reads from `/proc/mdstat` to get the status of a RAID device.
This script reads from `/sys/block/$md/md/` to get the status of a RAID device.
Unlike [check_md_raid](https://exchange.icinga.com/exchange/check_md_raid) and [check_md_raid updated](https://exchange.icinga.com/jasem/check_md_raid%20updated/), not all RAIDs are checked simultaneously, but only one specific RAID. This makes it possible to query some "performance" data and to add more granular checks later.
Unlike [check_md_raid](https://exchange.icinga.com/exchange/check_md_raid) and [check_md_raid updated](https://exchange.icinga.com/jasem/check_md_raid%20updated/), the data is not taken from /proc/mdstat, which is less machine-readable. Also, not all RAIDs are checked simultaneously, but only one specific RAID. This makes it possible to query some "performance" data and to add more granular checks later.
## Arguments
* --raid | -r: Name of the RAID, e.g. `md0`
* --input | -i: Input file. Defaults to /proc/mdstat
* --input | -i: Input base directory. Defaults to /sys/block/
* --debug | -d: More debugging output, cannot be used with icinga2
## Results
* CRITICAL: if any device failed
* CRITICAL: if the state of the RAID is not active/started
* CRITICAL: if the number of active devices is smaller than the number of configured devices.
* WARNING: when recovering/resyncing
* CRITICAL: if the number of active devices is smaller than the number of configured devices.
## Known issues
Probably I did not get things right with spare devices.

View File

@ -14,7 +14,7 @@ if [ $# -lt 1 -o -z "$1" ]; then
exit 1;
fi
INPUT="/proc/mdstat"
INPUT="/sys/block"
DEBUG=false
while [[ $# -gt 0 ]]; do
case $1 in
@ -35,133 +35,92 @@ if [ -z "$TARGET_RAID_NAME" ]; then
echo "ERROR: pass --raid as argument"
exit 255
fi
# Resolve the "md" directory of the requested RAID below the input base
# directory and report CRITICAL (exit code 2) when it does not exist.
INPUT="$INPUT/$TARGET_RAID_NAME/md"
# The path is quoted so that a base directory containing spaces or glob
# characters is tested as one path instead of making the test itself fail.
if [[ ! -d "$INPUT" ]]; then
  echo "CRITICAL: RAID $TARGET_RAID_NAME not found in $INPUT"
  exit 2
fi
# Print a debug message, prefixed with " > ", to stdout.
# Nothing is printed unless the command stored in DEBUG succeeds
# (DEBUG is expected to hold "true" or "false").
log() {
  $DEBUG || return 0
  echo " > $@"
}
# Print a debug message, prefixed with " : ", to stdout.
# Nothing is printed unless the command stored in DEBUG succeeds
# (DEBUG is expected to hold "true" or "false").
logLine() {
  $DEBUG || return 0
  echo " : $@"
}
NL=$'\n'
log "Reading from $INPUT"
CURRENT_RAID= # string
# Track which RAID device the given input line belongs to.
#
# A line that does not start with whitespace and contains a colon starts a
# new device section; the text before the last colon, with its last
# space-separated part removed, becomes CURRENT_RAID. Any other line leaves
# CURRENT_RAID untouched.
#
# Globals:   CURRENT_RAID (written)
# Arguments: $1 - one input line
function setCurrentRaid {
  local line="$1"
  # "\s" is not a whitespace class inside a POSIX bracket expression, so
  # [:space:] is used; otherwise indented lines that contain a colon
  # (such as the "bitmap:" line) would be mistaken for a new device.
  if [[ $line =~ ^[^[:space:]].+:.+ ]]; then
    local beforeColon="${line%:*}"       # Remove : suffix
    CURRENT_RAID="${beforeColon% *}"     # Trim right
    log "SET CURRENT RAID"
  fi
  log "CURRENT RAID: $CURRENT_RAID"
}
STATE= # string
LEVEL= # int
SIZE_IN_BLOCKS="!" # int
CHUNK_SIZE_IN_BYTES= # size in bytes for chunks
NUM_DEVICES=0 # int with total number of devices in raid
NUM_ACTIVE_DEVICES=0 # int with number of active devices in raid
NUM_SPARE_DEVICES=0 # int with number of spare devices in raid
RECOVERY_PROGRESS= # float in percent
RECOVERY_SPEED= # sync speed in K/s
declare -A DEVICE_STATE # dict of strings. key = dev, value = state
declare -A FAULTY_DEVICES # dict with only faulty devices. key = dev, value = true
declare -A TO_REPLACE_DEVICES # dict of devices in raid that shall be replaced. key = dev, value = true
STATE= # "active", "started", "inactive"
LEVEL= # int
declare -A DEVICES # dict of strings. key = index, value = dev name
declare -A FAILED_DEVICES
function setStateAndDevicesAndLevel {
# This functions is called for the line that starts a new RAID device by setCurrentRaid, with the complete line in $1
# It parses the STATE, the LEVEL and the DEVICES
local line="$1"
local x1="${line#*: }" # Remove : prefix
STATE="${x1%% *}" # Only the first word
local x2="${x1:$((${#STATE}+1))}" # Remove the first word
LEVEL="${x2%% *}" # Only the first word
local x3="${x2:$((${#LEVEL}+1))}" # Remove the first word
function getDataFromSys {
# https://mjmwired.net/kernel/Documentation/md.txt
LEVEL="${LEVEL:4}" # Remove the raid prefix
LEVEL=$(<$INPUT/level)
NUM_DEVICES=$(<$INPUT/raid_disks)
CHUNK_SIZE_IN_BYTES=$(<$INPUT/chunk_size)
SIZE_IN_BLOCKS=$(<$INPUT/component_size)
STATE=$(<$INPUT/array_state)
RECOVERY_PROGRESS=$(<$INPUT/sync_completed)
RECOVERY_SPEED=$(<$INPUT/sync_speed)
for dev1 in $x3; do
local devName="${dev1%[*}" # device name
local dev2="${dev1#*[}" # all after device name
local devIndex="${dev2%]*}" # device index
local dev3="${dev2#*]}" # all after device index
local devFailed=false
DEVICES[$devIndex]="$devName"
FAILED_DEVICES[$devIndex]=false
if [ -n "$dev3" ]; then
FAILED_DEVICES[$devIndex]=true
fi
log "LEVEL = $LEVEL"
log "STATE = $STATE"
log "CHUNK_SIZE_IN_BYTES = $CHUNK_SIZE_IN_BYTES"
log "SIZE_IN_BLOCKS = $SIZE_IN_BLOCKS"
log "RECOVERY_PROGRESS = $RECOVERY_PROGRESS"
log "RECOVERY_SPEED = $RECOVERY_SPEED"
for dev in `find $INPUT -type d -name 'dev-*' -printf "%f\n"`; do
local deviceName=${dev:4}
local slot=$(<$INPUT/$dev/slot)
local state=$(<$INPUT/$dev/state)
log "$dev [$slot] is '$state'"
DEVICE_STATE[$deviceName]=$state
case "$state" in
*in_sync*) ((NUM_ACTIVE_DEVICES++));;
*writemostly*) ((NUM_ACTIVE_DEVICES++));;
*faulty*) FAULTY_DEVICES[$deviceName]=true;;
*blocked*) FAULTY_DEVICES[$deviceName]=true;;
*write-error*) TO_REPLACE_DEVICES[$deviceName]=true;;
*want_replacement*) TO_REPLACE_DEVICES[$deviceName]=true;;
*replacement*) ((NUM_SPARE_DEVICES++));;
*spare*) ((NUM_SPARE_DEVICES++));;
*) FAULTY_DEVICES[$deviceName]=true;;
esac
done
log "STATE = $STATE"
log "LEVEL = $LEVEL"
log "DEVICES = ${DEVICES[@]}"
}
log "NUM_DEVICES = $NUM_DEVICES"
log "NUM_ACTIVE_DEVICES = $NUM_ACTIVE_DEVICES"
log "NUM_SPARE_DEVICES = $NUM_SPARE_DEVICES"
log "FAULTY_DEVICES = ${FAULTY_DEVICES[*]}"
log "TO_REPLACE_DEVICES = ${TO_REPLACE_DEVICES[*]}"
SIZE_IN_BLOCKS="!" # int
NUM_DEVICES=0 # int with total number of devices in raid
NUM_ACTIVE_DEVICES=0 # int with number of active devices in raid
# Parse the first line after the RAID definition line, for example
#   "1465151808 blocks level 5, 64k chunk, algorithm 2 [4/3] [UUU_]"
#
# Globals written:
#   SIZE_IN_BLOCKS     - first word of the line
#   NUM_DEVICES        - left part of the second-to-last word "[n/m]"
#   NUM_ACTIVE_DEVICES - right part of the second-to-last word "[n/m]"
# Arguments: $1 - the raw line (leading whitespace allowed)
function parseConfigStatusLine {
  local line="$1"
  local -a words
  # Split with read instead of an unquoted `echo $line`: the words "[4/3]"
  # and "[UUU_]" are glob patterns and would otherwise be expanded against
  # files in the current working directory.
  IFS=$' \t\n' read -r -a words <<< "$line"
  local count=${#words[@]}
  SIZE_IN_BLOCKS="${words[0]}"
  if (( count >= 2 )); then
    local ratio="${words[count-2]}"   # e.g. [3/2]; the last word is [UU_]
    ratio="${ratio#\[}"
    ratio="${ratio%\]}"
    NUM_DEVICES="${ratio%/*}"
    NUM_ACTIVE_DEVICES="${ratio#*/}"
  fi
  log "SIZE_IN_BLOCKS = $SIZE_IN_BLOCKS"
  log "NUM_DEVICES = $NUM_DEVICES"
  log "NUM_ACTIVE_DEVICES = $NUM_ACTIVE_DEVICES"
  # TODO
}
function parseBitmapOrRecoveryLine {
# This function is called for the 2nd line after the raid definition line
# It checks whether it contains a bitmap or a recovery line
if [[ "$1" == *bitmap* ]]; then
parseBitmapLine "$1"
if [ "$RECOVERY_PROGRESS" != "none" ]; then
RECOVERY_NUM=${RECOVERY_PROGRESS% /*}
log "'$RECOVERY_NUM'"
RECOVERY_DEN=${RECOVERY_PROGRESS#*/ }
log "'$RECOVERY_DEN'"
RECOVERY_PERCENT=$(($RECOVERY_NUM*100/$RECOVERY_DEN))
BLOCK_SIZE=$(<$(dirname $INPUT)/queue/hw_sector_size)
RECOVERY_REMAINING_S=$(( ($RECOVERY_DEN-$RECOVERY_NUM)*$BLOCK_SIZE/1024/$RECOVERY_SPEED))
RECOVERY_REMAINING_MIN=$(($RECOVERY_REMAINING_S/60))
fi
if [[ "$1" == *recovery* ]]; then
parseRecoveryLine "$1"
fi
}
# Placeholder for parsing a "bitmap" status line.
# Nothing is extracted yet: the line passed in $1 is ignored and no global
# variable is modified.
function parseBitmapLine {
# Unused local, kept until the parsing is implemented.
local x
# TODO
}
RECOVERY_PROGRESS= # float in percent
RECOVERY_INFO= # finish and speed
# Parse a recovery status line, for example
#   "[==>....] recovery = 12.6% (37043392/292945152) finish=127.5min speed=33440K/sec"
#
# Globals written:
#   RECOVERY_PROGRESS - text between "recovery = " and the first "%"
#   RECOVERY_INFO     - "finish=" followed by everything after the last "finish="
# Arguments: $1 - the raw line
function parseRecoveryLine {
  local line="$1"
  local afterLabel="${line##*recovery = }"
  RECOVERY_PROGRESS="${afterLabel%%%*}"
  RECOVERY_INFO="finish=${line##*finish=}"
  # The debug label used to be misspelled "RECOVERY_PROGESS".
  log "RECOVERY_PROGRESS = $RECOVERY_PROGRESS"
  log "RECOVERY_INFO = $RECOVERY_INFO"
}
function printOutputAndExit {
# STATE, LEVEL, DEVICES, SIZE_IN_BLOCKS, NUM_DEVICES, BAD_DEVICES, RECOVERY_PROGRESS, RECOVERY_INFO
local result="OK"
local result=""
local info=""
local numFailedDevices=0
local numSpareDevices=0
if [ $NUM_ACTIVE_DEVICES -lt $NUM_DEVICES ]; then
result="CRITICAL"
@ -169,56 +128,54 @@ function printOutputAndExit {
info="${info}Missing $(($NUM_DEVICES - $NUM_ACTIVE_DEVICES)) of $NUM_DEVICES devices"
fi
if [ -n "$RECOVERY_PROGRESS" ]; then
result="WARNING"
if [ -n "$info" ]; then info="$info. "; fi
info="${info}Recovering: progress=$RECOVERY_PROGRESS% $RECOVERY_INFO"
fi
local devIndices="$(printf "%s\n" "${!DEVICES[@]}" | sort -n)"
for devIndex in $devIndices; do
if [ -z "${DEVICES[$devIndex]}" ]; then
((numSpareDevies++))
continue
fi
if ${FAILED_DEVICES[$devIndex]}; then
result="CRITICAL"
((numFailedDevices++))
if [ -n "$info" ]; then info="$info. "; fi
info="${info}Device ${DEVICES[$devIndex]} failed"
fi
done
if [ "$STATE" != "active" -a "$STATE" != "started" ]; then
if [ ${#FAULTY_DEVICES[@]} -gt 0 ]; then
result="CRITICAL"
if [ -n "$info" ]; then info="$info. "; fi
info="${info}State is $STATE"
info="${info}These devices failed: ${!FAULTY_DEVICES[@]}"
fi
if [ ${#TO_REPLACE_DEVICES[@]} -gt 0 ]; then
if [ -z "$result" ]; then result="WARNING"; fi
if [ -n "$info" ]; then info="$info. "; fi
info="${info}These devices should be replaced: ${!TO_REPLACE_DEVICES[@]}"
fi
if [ "$RECOVERY_PROGRESS" != "none" ]; then
if [ -z "$result" ]; then result="WARNING"; fi
if [ -n "$info" ]; then info="$info. "; fi
info="${info}Recovering: $RECOVERY_PERCENT%, remaining ${RECOVERY_REMAINING_MIN}min"
fi
if [ -z "$result" ]; then result="OK"; fi
echo -n "$result"
if [ -n "$info" ]; then
echo -n ": $info"
fi
echo -n " |"
echo -n " 'raid level'=$LEVEL"
echo -n " 'raid level: $LEVEL'=0"
echo -n " 'size in blocks'=$SIZE_IN_BLOCKS"
echo -n " 'num devices'=$NUM_DEVICES"
echo -n " 'num failed devices'=$numFailedDevices;;1;0;$NUM_DEVICES"
echo -n " 'num spare devices'=$numSpareDevices;;;0;$NUM_DEVICES"
if [ -n "$RECOVERY_PROGRESS" ]; then
echo -n " 'recovery progress'=$RECOVERY_PROGRESS%;0;;0;100"
echo -n " 'num active devices'=$NUM_ACTIVE_DEVICES;;;0;$NUM_DEVICES"
echo -n " 'num failed devices'=${#FAILED_DEVICES[@]};;1;0;$NUM_DEVICES"
echo -n " 'num spare devices'=$NUM_SPARE_DEVICES;;;0"
if [ "$RECOVERY_PROGRESS" != "none" ]; then
if [ -z "$result" ]; then result="WARNING"; fi
if [ -n "$info" ]; then info="$info. "; fi
echo -n " 'recovery progress'=$RECOVERY_NUM;0;;0;$RECOVERY_DEN"
echo -n " 'recovery speed [1/s]'=${RECOVERY_SPEED}K"
echo -n " 'recovery remaining'=${RECOVERY_REMAINING_S}s"
fi
for devIndex in $devIndices; do
if [ -z "${DEVICES[$devIndex]}" ]; then
echo -n " 'device $devIndex: spare'=$devIndex"
continue
for dev in ${!DEVICE_STATE[@]}; do
echo -n " 'dev $dev: ${DEVICE_STATE[$dev]}"
if [ -n "${FAULTY_DEVICE[$dev]}" ]; then
echo -n " faulty"
fi
if ${FAILED_DEVICES[$devIndex]}; then
echo -n " 'device $devIndex: ${DEVICES[$devIndex]} failed'=$devIndex;;$devIndex'"
continue
if [ -n "${TO_REPLACE_DEVICE[$dev]}" ]; then
echo -n " wants replacement"
fi
echo -n " 'device $devIndex: ${DEVICES[$devIndex]}'=$devIndex"
echo -n "'=0"
done
echo
@ -231,25 +188,6 @@ function printOutputAndExit {
}
HAD_TARGET_RAID=0
while IFS= read -r line; do
logLine "$line"
setCurrentRaid "$line"
if [ "$CURRENT_RAID" == "$TARGET_RAID_NAME" ]; then
case $HAD_TARGET_RAID in
0) setStateAndDevicesAndLevel "$line";;
1) parseConfigStatusLine "$line";;
*) parseBitmapOrRecoveryLine "$line";;
esac
((HAD_TARGET_RAID++))
elif [ $HAD_TARGET_RAID -gt 0 ]; then
printOutputAndExit
fi
done < $INPUT
echo "CRITICAL: RAID $TARGET_RAID_NAME not found in $(basename $INPUT)"
exit 2
getDataFromSys
printOutputAndExit

View File

@ -1,4 +0,0 @@
Personalities : [raid6] [raid5] [raid4]
md0 : active raid5 sda1[0] sdd1[2] sdb1[1]
1465151808 blocks level 5, 64k chunk, algorithm 2 [4/3] [UUU_]
unused devices: <none>

View File

@ -1 +0,0 @@
CRITICAL: Missing 1 of 4 devices | 'raid level'=5 'size in blocks'=1465151808 'num devices'=4 'num failed devices'=0;;1;0;4 'num spare devices'=0;;;0;4 'device 0: sda1'=0 'device 1: sdb1'=1 'device 2: sdd1'=2

View File

@ -1,6 +0,0 @@
Personalities : [raid1]
md0 : active raid1 sdb1[1] sda1[0]
976430080 blocks super 1.2 [2/2] [UU]
bitmap: 0/8 pages [0KB], 65536KB chunk
unused devices: <none>

View File

@ -1 +0,0 @@
OK | 'raid level'=1 'size in blocks'=976430080 'num devices'=2 'num failed devices'=0;;1;0;2 'num spare devices'=0;;;0;2 'device 0: sda1'=0 'device 1: sdb1'=1

View File

@ -1,6 +0,0 @@
Personalities : [raid1] [raid6] [raid5] [raid4]
md0 : active raid5 sde1[0] sdf1[4] sdb1[5] sdd1[2] sdc1[1]
1250241792 blocks super 1.2 level 5, 64k chunk, algorithm 2 [5/5] [UUUUU]
bitmap: 0/10 pages [0KB], 16384KB chunk
unused devices: <none>

View File

@ -1 +0,0 @@
OK | 'raid level'=5 'size in blocks'=1250241792 'num devices'=5 'num failed devices'=0;;1;0;5 'num spare devices'=0;;;0;5 'device 0: sde1'=0 'device 1: sdc1'=1 'device 2: sdd1'=2 'device 4: sdf1'=4 'device 5: sdb1'=5

View File

@ -1,14 +0,0 @@
Personalities : [raid1] [raid6] [raid5] [raid4]
md1 : active raid1 sdb2[1] sda2[0]
136448 blocks [2/2] [UU]
md2 : active raid1 sdb3[1] sda3[0]
129596288 blocks [2/2] [UU]
md0 : active raid5 sdl1[9] sdk1[8] sdj1[7] sdi1[6] sdh1[5] sdg1[4] sdf1[3] sde1[2] sdd1[1] sdc1[0]
1318680576 blocks level 5, 1024k chunk, algorithm 2 [10/10] [UUUUUUUUUU]
md3 : active raid1 sdb1[1] sda1[0]
16787776 blocks [2/2] [UU]
unused devices: <none>

View File

@ -1 +0,0 @@
OK | 'raid level'=5 'size in blocks'=1318680576 'num devices'=10 'num failed devices'=0;;1;0;10 'num spare devices'=0;;;0;10 'device 0: sdc1'=0 'device 1: sdd1'=1 'device 2: sde1'=2 'device 3: sdf1'=3 'device 4: sdg1'=4 'device 5: sdh1'=5 'device 6: sdi1'=6 'device 7: sdj1'=7 'device 8: sdk1'=8 'device 9: sdl1'=9

View File

@ -1,7 +0,0 @@
Personalities : [linear] [raid0] [raid1] [raid5] [raid4] [raid6]
md0 : active raid6 sdf1[0] sde1[1] sdd1[2] sdc1[3] sdb1[4] sda1[5] hdb1[6]
1225557760 blocks level 6, 256k chunk, algorithm 2 [7/7] [UUUUUUU]
bitmap: 0/234 pages [0KB], 512KB chunk
unused devices: <none>

View File

@ -1 +0,0 @@
OK | 'raid level'=6 'size in blocks'=1225557760 'num devices'=7 'num failed devices'=0;;1;0;7 'num spare devices'=0;;;0;7 'device 0: sdf1'=0 'device 1: sde1'=1 'device 2: sdd1'=2 'device 3: sdc1'=3 'device 4: sdb1'=4 'device 5: sda1'=5 'device 6: hdb1'=6

View File

@ -1,5 +0,0 @@
Personalities : [raid1] [raid6] [raid5] [raid4]
md1 : active raid1 sdb2[1] sda2[0]
136448 blocks [2/2] [UU]
unused devices: <none>

View File

@ -1 +1 @@
CRITICAL: RAID md0 not found in notExisting.input
CRITICAL: RAID md0 not found in ./notExisting.input/md0/md

View File

@ -0,0 +1 @@
default

View File

@ -0,0 +1 @@
clean

View File

@ -0,0 +1 @@
0

View File

@ -0,0 +1 @@
976431104

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
2

View File

@ -0,0 +1 @@
in_sync

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
in_sync

View File

@ -0,0 +1 @@
0

View File

@ -0,0 +1 @@
spare

View File

@ -0,0 +1 @@
recovery

View File

@ -0,0 +1 @@
raid1

View File

@ -0,0 +1 @@
20

View File

@ -0,0 +1 @@
3

View File

@ -0,0 +1 @@
dev-sdd1

View File

@ -0,0 +1 @@
dev-sdc1

View File

@ -0,0 +1 @@
dev-sda1

View File

@ -0,0 +1 @@
recover

View File

@ -0,0 +1 @@
1728697856 / 1952862208

View File

@ -0,0 +1 @@
17170

View File

@ -0,0 +1 @@
512

1
test/recovering.output Normal file
View File

@ -0,0 +1 @@
CRITICAL: Missing 1 of 3 devices. Recovering: 88%, remaining 108min | 'raid level: raid1'=0 'size in blocks'=976431104 'num devices'=3 'num active devices'=2;;;0;3 'num failed devices'=0;;1;0;3 'num spare devices'=1;;;0 'recovery progress'=1728697856;0;;0;1952862208 'recovery speed [1/s]'=17170K 'recovery remaining'=6527s 'dev sdd1: spare'=0 'dev sdc1: in_sync'=0 'dev sda1: in_sync'=0

View File

@ -1,6 +0,0 @@
Personalities : [raid1] [raid6] [raid5] [raid4]
md0 : active raid5 sdh1[6] sdg1[4] sdf1[3] sde1[2] sdd1[1] sdc1[0]
1464725760 blocks level 5, 64k chunk, algorithm 2 [6/5] [UUUUUU]
[==>..................] recovery = 12.6% (37043392/292945152) finish=127.5min speed=33440K/sec
unused devices: <none>

View File

@ -1 +0,0 @@
WARNING: Missing 1 of 6 devices. Recovering: progress=12.6% finish=127.5min speed=33440K/sec | 'raid level'=5 'size in blocks'=1464725760 'num devices'=6 'num failed devices'=0;;1;0;6 'num spare devices'=0;;;0;6 'recovery progress'=12.6%;0;;0;100 'device 0: sdc1'=0 'device 1: sdd1'=1 'device 2: sde1'=2 'device 3: sdf1'=3 'device 4: sdg1'=4 'device 6: sdh1'=6

View File

@ -1,9 +1,10 @@
dir="`dirname $0`"
ERROR=false
for f in `ls $dir/*.input`; do
IFS=$'\n'
for f in `find $dir -type d -name '*.input'`; do
RESULT="`$dir/../check_mdstat.sh --raid md0 --input $f`"
EXPECTED="`cat $dir/$(basename $f .input).output`"
EXPECTED="$(<$dir/$(basename $f .input).output)"
if [ "$RESULT" != "$EXPECTED" ]; then
echo "Error for test $(basename $f .input):"
echo " Expected"