#!/bin/bash
#
# Created by Sebastian Grewe, Jammicron Technology
# Changes by Jasem Elayeb on 02.03.2016
# Changes by Jonny007-MKD on 06.02.2020
# JE: add Physical Disks Name RAID_DISKS
# JE: add Physical Disks Status DISKS_STATUS
# JE: add Array Names RAID_ARRAY
# J007: Only check a single array

# Bail out early when the script is called without any argument or with an
# empty first argument; at least "--raid NAME" has to be passed.
if [[ $# -lt 1 || -z "$1" ]]; then
    echo "ERROR: pass raid name as argument"
    exit 1
fi

# Parse the command line into the globals INPUT, DEBUG and TARGET_RAID_NAME.
#
# Arguments:
#   -r|--raid NAME   name of the md array to check, e.g. md0
#   -i|--input DIR   directory containing the block devices (default: /sys/block)
#   -d|--debug       enable the debug output of "log"
#   -?|--help        print the usage text and exit with status 0
# Arguments that match none of the options are skipped silently.
function parseArgs {
    INPUT="/sys/block"
    DEBUG=false

    while [[ $# -gt 0 ]]; do
        case "$1" in
            # Options with a value consume one extra positional parameter.
            -r|--raid) TARGET_RAID_NAME="$2"; shift;;
            -i|--input) INPUT="$2"; shift;;
            -d|--debug) DEBUG=true;;
            # "?" is a glob here: every other two-character option (e.g. -h)
            # ends up in the help as well.
            -?|--help)
                echo "Check the state of a single md RAID via sysfs. Arguments:"
                echo "--raid NAME: Raid name, e.g. md0"
                echo "--input DIR: Read from this sysfs directory. Default: /sys/block"
                echo "--debug: Print debug output"
                exit 0
                ;;
        esac
        shift
    done
}

parseArgs "$@"

# The name of the array is mandatory; without it there is nothing to check.
if [[ -z "$TARGET_RAID_NAME" ]]; then
    echo "ERROR: pass --raid as argument"
    exit 255
fi

# From here on INPUT points at the md directory of the selected array.
INPUT="$INPUT/$TARGET_RAID_NAME/md"

# Quoted so that a path containing whitespace is tested as a single word.
if [[ ! -d "$INPUT" ]]; then
    echo "CRITICAL: RAID $TARGET_RAID_NAME not found in $INPUT"
    exit 2
fi

# Print all arguments as one debug line, prefixed with " > ", on stdout.
# Nothing is printed unless the global DEBUG is exactly "true".
# DEBUG is compared as a string instead of being executed as a command, so an
# empty or unexpected value can neither enable the output nor run anything.
function log {
    if [[ "$DEBUG" == true ]]; then
        # "$*" joins the arguments into a single word for the one %s.
        printf ' > %s\n' "$*"
    fi
}

NL=$'\n'

log "Reading from $INPUT"

# State of the array; filled by getDataFromSys, reported by printOutputAndExit.
STATE=                    # string, content of array_state
LEVEL=                    # raid level as read from sysfs (a name such as "raid1", per md.txt)
SIZE_IN_BLOCKS="!"        # int
CHUNK_SIZE_IN_BYTES=      # size in bytes for chunks
NUM_DEVICES=0             # int with total number of devices in raid
NUM_ACTIVE_DEVICES=0      # int with number of active devices in raid
NUM_SPARE_DEVICES=0       # int with number of spare devices in raid
SYNC_ACTION=              # "idle", "check" or "recover"
SYNC_PROGRESS=            # raw content of sync_completed ("done / total")
SYNC_SPEED=               # sync speed in K/s
declare -A DEVICE_STATE         # dict of strings. key = dev, value = state
declare -A FAULTY_DEVICES       # dict with only faulty devices. key = dev, value = true
declare -A TO_REPLACE_DEVICES   # dict of devices in raid that shall be replaced. key = dev, value = true

# Read the state of the array from the sysfs md directory "$INPUT" into globals.
# File formats: https://mjmwired.net/kernel/Documentation/md.txt
#
# Globals read:    INPUT
# Globals written: LEVEL, NUM_DEVICES, CHUNK_SIZE_IN_BYTES, SIZE_IN_BLOCKS, STATE,
#                  SYNC_ACTION, SYNC_PROGRESS, SYNC_SPEED, DEVICE_STATE,
#                  FAULTY_DEVICES, TO_REPLACE_DEVICES, NUM_ACTIVE_DEVICES,
#                  NUM_SPARE_DEVICES and, unless SYNC_ACTION is "idle", SYNC_NUM,
#                  SYNC_DEN, SYNC_PERCENT, BLOCK_SIZE, SYNC_REMAINING_S and
#                  SYNC_REMAINING_MIN
# The counters and the associative arrays are only added to, never reset; the
# caller has to initialise them (declare -A for the three dicts).
function getDataFromSys {
    LEVEL=$(<"$INPUT/level")
    NUM_DEVICES=$(<"$INPUT/raid_disks")
    CHUNK_SIZE_IN_BYTES=$(<"$INPUT/chunk_size")
    SIZE_IN_BLOCKS=$(<"$INPUT/component_size")
    STATE=$(<"$INPUT/array_state")
    SYNC_ACTION=$(<"$INPUT/sync_action")
    if [[ "$SYNC_ACTION" != "idle" ]]; then
        SYNC_PROGRESS=$(<"$INPUT/sync_completed")
        SYNC_SPEED=$(<"$INPUT/sync_speed")
    fi

    log "LEVEL = $LEVEL"
    log "STATE = $STATE"
    log "CHUNK_SIZE_IN_BYTES = $CHUNK_SIZE_IN_BYTES"
    log "SIZE_IN_BLOCKS = $SIZE_IN_BLOCKS"
    log "SYNC_ACTION = $SYNC_ACTION"
    log "SYNC_PROGRESS = $SYNC_PROGRESS"
    log "SYNC_SPEED = $SYNC_SPEED"

    local devDir dev deviceName slot state flag
    local isActive isFaulty wantsReplacement isSpare
    local -a flags

    # The member devices are the dev-* directories directly below $INPUT.
    for devDir in "$INPUT"/dev-*; do
        # Also skips the literal pattern when no device directory exists.
        [[ -d "$devDir" ]] || continue

        dev=${devDir##*/}
        deviceName=${dev:4}     # strip the "dev-" prefix
        slot=$(<"$devDir/slot")
        state=$(<"$devDir/state")
        log "$dev [$slot] is '$state'"
        DEVICE_STATE[$deviceName]=$state

        # "state" is a comma separated list of flags, so every flag has to be
        # looked at; a device can e.g. be "in_sync,want_replacement".
        isActive=0
        isFaulty=0
        wantsReplacement=0
        isSpare=0
        IFS=',' read -r -a flags <<< "$state"
        for flag in "${flags[@]}"; do
            flag=${flag//[[:space:]]/}
            case "$flag" in
                in_sync|writemostly|write_mostly) isActive=1;;
                faulty|blocked) isFaulty=1;;
                write_error|write-error|want_replacement) wantsReplacement=1;;
                replacement|spare) isSpare=1;;
            esac
        done

        # Every device is counted at most once per category.
        if (( isActive )); then
            NUM_ACTIVE_DEVICES=$((NUM_ACTIVE_DEVICES + 1))
        fi
        if (( isSpare )); then
            NUM_SPARE_DEVICES=$((NUM_SPARE_DEVICES + 1))
        fi
        if (( wantsReplacement )); then
            TO_REPLACE_DEVICES[$deviceName]=true
        fi
        # A device without any known flag is treated as faulty, too.
        if (( isFaulty )) || (( !isActive && !isSpare && !wantsReplacement )); then
            FAULTY_DEVICES[$deviceName]=true
        fi
    done

    log "NUM_DEVICES = $NUM_DEVICES"
    log "NUM_ACTIVE_DEVICES = $NUM_ACTIVE_DEVICES"
    log "NUM_SPARE_DEVICES = $NUM_SPARE_DEVICES"
    log "FAULTY_DEVICES = ${FAULTY_DEVICES[*]}"
    log "TO_REPLACE_DEVICES = ${TO_REPLACE_DEVICES[*]}"

    if [[ "$SYNC_ACTION" != "idle" ]]; then
        # Defaults for the case that sysfs reports no usable numbers (e.g.
        # "none"); this also keeps the divisions below away from zero.
        SYNC_NUM=0
        SYNC_DEN=0
        SYNC_PERCENT=0
        SYNC_REMAINING_S=0
        SYNC_REMAINING_MIN=0

        # sync_completed has the form "<done> / <total>".
        local progressPattern='^([0-9]+) / ([0-9]+)$'
        if [[ "$SYNC_PROGRESS" =~ $progressPattern ]]; then
            SYNC_NUM=${BASH_REMATCH[1]}
            SYNC_DEN=${BASH_REMATCH[2]}
        fi
        log "'$SYNC_NUM'"
        log "'$SYNC_DEN'"
        if (( SYNC_DEN > 0 )); then
            SYNC_PERCENT=$((SYNC_NUM * 100 / SYNC_DEN))
        fi

        # NOTE(review): the progress counters are converted to bytes with the
        # hardware sector size of the array device - confirm that this is the
        # unit the kernel uses for sync_completed.
        local sectorSizeFile
        sectorSizeFile="$(dirname -- "$INPUT")/queue/hw_sector_size"
        BLOCK_SIZE=
        if [[ -r "$sectorSizeFile" ]]; then
            BLOCK_SIZE=$(<"$sectorSizeFile")
        fi

        # A non-numeric speed is reported as 0.
        if [[ ! "$SYNC_SPEED" =~ ^[0-9]+$ ]]; then
            SYNC_SPEED=0
        fi
        if [[ "$BLOCK_SIZE" =~ ^[0-9]+$ ]] && (( SYNC_SPEED > 0 )); then
            SYNC_REMAINING_S=$(( (SYNC_DEN - SYNC_NUM) * BLOCK_SIZE / 1024 / SYNC_SPEED ))
            SYNC_REMAINING_MIN=$((SYNC_REMAINING_S / 60))
        fi
    fi
}

# Print one status line "<RESULT>[: <info>] | <performance data>" on stdout and
# exit with the matching status: OK = 0, WARNING = 1, CRITICAL = 2.
#
# CRITICAL: fewer active devices than NUM_DEVICES, or any faulty device.
# WARNING:  devices that should be replaced, or a running recovery.
# A running check only adds an info text and does not change the result.
#
# Globals read: NUM_DEVICES, NUM_ACTIVE_DEVICES, NUM_SPARE_DEVICES, LEVEL,
#               SIZE_IN_BLOCKS, DEVICE_STATE, FAULTY_DEVICES, TO_REPLACE_DEVICES,
#               SYNC_ACTION and, unless SYNC_ACTION is "idle", SYNC_NUM,
#               SYNC_DEN, SYNC_PERCENT, SYNC_SPEED, SYNC_REMAINING_S,
#               SYNC_REMAINING_MIN
function printOutputAndExit {
    local result=""
    local -a messages=()

    if [ "$NUM_ACTIVE_DEVICES" -lt "$NUM_DEVICES" ]; then
        result="CRITICAL"
        messages+=("Missing $((NUM_DEVICES - NUM_ACTIVE_DEVICES)) of $NUM_DEVICES devices")
    fi

    if [ "${#FAULTY_DEVICES[@]}" -gt 0 ]; then
        result="CRITICAL"
        messages+=("These devices failed: ${!FAULTY_DEVICES[*]}")
    fi

    # From here on an already determined (worse) result is never downgraded.
    if [ "${#TO_REPLACE_DEVICES[@]}" -gt 0 ]; then
        result=${result:-WARNING}
        messages+=("These devices should be replaced: ${!TO_REPLACE_DEVICES[*]}")
    fi

    case "$SYNC_ACTION" in
        recover)
            result=${result:-WARNING}
            messages+=("Recovering: $SYNC_PERCENT%, remaining ${SYNC_REMAINING_MIN}min")
            ;;
        check)
            messages+=("Checking: $SYNC_PERCENT%, remaining ${SYNC_REMAINING_MIN}min")
            ;;
    esac

    result=${result:-OK}

    # Join the individual messages with ". ".
    local info="" message
    for message in "${messages[@]}"; do
        info+="${info:+. }${message}"
    done

    local output="$result"
    if [ -n "$info" ]; then
        output+=": $info"
    fi

    # Everything after the pipe is performance data in the form 'label'=value.
    output+=" |"
    output+=" 'raid level: $LEVEL'=0"
    output+=" 'size in blocks'=$SIZE_IN_BLOCKS"
    output+=" 'num devices'=$NUM_DEVICES"
    output+=" 'num active devices'=$NUM_ACTIVE_DEVICES;;;0;$NUM_DEVICES"
    output+=" 'num failed devices'=${#FAULTY_DEVICES[@]};;1;0;$NUM_DEVICES"
    output+=" 'num spare devices'=$NUM_SPARE_DEVICES;;;0"
    if [ "$SYNC_ACTION" != "idle" ]; then
        output+=" 'sync progress'=$SYNC_NUM;0;;0;$SYNC_DEN"
        output+=" 'sync speed [1/s]'=${SYNC_SPEED}K"
        output+=" 'sync remaining'=${SYNC_REMAINING_S}s"
    fi

    # One entry per device; the state (plus remarks) is part of the label.
    local dev label
    for dev in "${!DEVICE_STATE[@]}"; do
        label="dev $dev: ${DEVICE_STATE[$dev]}"
        if [ -n "${FAULTY_DEVICES[$dev]}" ]; then
            label+=" faulty"
        fi
        if [ -n "${TO_REPLACE_DEVICES[$dev]}" ]; then
            label+=" wants replacement"
        fi
        output+=" '$label'=0"
    done

    printf '%s\n' "$output"

    case "$result" in
        OK) exit 0;;
        WARNING) exit 1;;
        CRITICAL) exit 2;;
        # Not reachable with the results assigned above; kept as a safety net.
        *) exit 3;;
    esac
}

# Collect the state of the array first, then report it; printOutputAndExit
# ends the script with the plugin exit status.
getDataFromSys
printOutputAndExit