#!/bin/bash
#
# Monitoring check for a single Linux software RAID (md) array.
# Reads the array's sysfs attributes from <input>/<raid>/md (default input
# directory: /sys/block) and prints one status line in the
# "STATUS: text | perfdata" format.
#
# Usage: <script> --raid NAME [--input DIR] [--debug]
# Exit codes: 0 = OK, 1 = WARNING (also: no arguments given), 2 = CRITICAL,
#             255 = --raid missing
#
# Created by Sebastian Grewe, Jammicron Technology
# Changes by Jasem Elayeb on 02.03.2016
# Changes by Jonny007-MKD on 06.02.2020
# JE: add Physical Disks Name RAID_DISKS
# JE: add Physical Disks Status DISKS_STATUS
# JE: add Array Names RAID_ARRAY
# J007: Only check a single array

if [[ $# -lt 1 || -z "$1" ]]; then
  echo "ERROR: pass raid name as argument"
  exit 1
fi

INPUT="/sys/block"
DEBUG=false

while [[ $# -gt 0 ]]; do
  case "$1" in
    -r|--raid)
      TARGET_RAID_NAME="$2"
      shift
      ;;
    -i|--input)
      INPUT="$2"
      shift
      ;;
    -d|--debug)
      DEBUG=true
      ;;
    # The question mark must be escaped: a bare "-?" is a glob that matches
    # every two-character option.
    -h|-\?|--help)
      echo "Check a software RAID via sysfs. Arguments:"
      echo "--raid NAME: Raid name, e.g. md0"
      echo "--input DIR: Read from this directory. Default: /sys/block"
      echo "--debug: Print the values that were read"
      exit 0
      ;;
  esac
  shift
done

if [[ -z "$TARGET_RAID_NAME" ]]; then
  echo "ERROR: pass --raid as argument"
  exit 255
fi

INPUT="$INPUT/$TARGET_RAID_NAME/md"
if [[ ! -d "$INPUT" ]]; then
  echo "CRITICAL: RAID $TARGET_RAID_NAME not found in $INPUT"
  exit 2
fi

# Print a debug message (all arguments joined by spaces) when --debug was given.
function log {
  if [[ "$DEBUG" == true ]]; then
    echo " > $*"
  fi
}

NL=$'\n'

log "Reading from $INPUT"

STATE=                  # content of array_state
LEVEL=                  # content of level
SIZE_IN_BLOCKS="!"      # content of component_size
CHUNK_SIZE_IN_BYTES=    # size in bytes for chunks
NUM_DEVICES=0           # int with total number of devices in raid
NUM_ACTIVE_DEVICES=0    # int with number of active devices in raid
NUM_SPARE_DEVICES=0     # int with number of spare devices in raid
SYNC_ACTION=            # e.g. "idle", "check" or "recover"
SYNC_PROGRESS=          # raw content of sync_completed ("done / total")
SYNC_SPEED=             # sync speed in K/s
SYNC_NUM=               # completed part of SYNC_PROGRESS; empty if unknown
SYNC_DEN=               # total part of SYNC_PROGRESS; empty if unknown
SYNC_PERCENT=           # int in percent; empty if unknown
SYNC_REMAINING_S=       # estimated remaining seconds; empty if unknown
SYNC_REMAINING_MIN=     # estimated remaining minutes; empty if unknown
declare -A DEVICE_STATE         # dict of strings. key = dev, value = state
declare -A FAULTY_DEVICES       # dict with only faulty devices. key = dev, value = true
declare -A TO_REPLACE_DEVICES   # dict of devices in raid that shall be replaced. key = dev, value = true

# Read the array and member-device attributes below $INPUT into the global
# variables declared above.
# Globals written: LEVEL, NUM_DEVICES, CHUNK_SIZE_IN_BYTES, SIZE_IN_BLOCKS,
#   STATE, SYNC_*, BLOCK_SIZE, NUM_ACTIVE_DEVICES, NUM_SPARE_DEVICES,
#   DEVICE_STATE, FAULTY_DEVICES, TO_REPLACE_DEVICES
function getDataFromSys {
  # https://mjmwired.net/kernel/Documentation/md.txt
  LEVEL=$(<"$INPUT/level")
  NUM_DEVICES=$(<"$INPUT/raid_disks")
  # NOTE(review): during a reshape the kernel appears to report "N (old)";
  # keep only the first number so the arithmetic below stays valid.
  NUM_DEVICES=${NUM_DEVICES%% *}
  CHUNK_SIZE_IN_BYTES=$(<"$INPUT/chunk_size")
  SIZE_IN_BLOCKS=$(<"$INPUT/component_size")
  STATE=$(<"$INPUT/array_state")

  # sync_action only exists for levels with redundancy; treat a missing file
  # as "nothing is syncing".
  if [[ -r "$INPUT/sync_action" ]]; then
    SYNC_ACTION=$(<"$INPUT/sync_action")
  else
    SYNC_ACTION="idle"
  fi
  if [[ "$SYNC_ACTION" != "idle" ]]; then
    SYNC_PROGRESS=$(<"$INPUT/sync_completed")
    SYNC_SPEED=$(<"$INPUT/sync_speed")
  fi

  log "LEVEL = $LEVEL"
  log "STATE = $STATE"
  log "CHUNK_SIZE_IN_BYTES = $CHUNK_SIZE_IN_BYTES"
  log "SIZE_IN_BLOCKS = $SIZE_IN_BLOCKS"
  log "SYNC_ACTION = $SYNC_ACTION"
  log "SYNC_PROGRESS = $SYNC_PROGRESS"
  log "SYNC_SPEED = $SYNC_SPEED"

  local devDir dev deviceName slot state stateWithoutWant recognized
  for devDir in "$INPUT"/dev-*/; do
    # Without a match the literal pattern is returned; skip it.
    [[ -d "$devDir" ]] || continue
    dev=${devDir%/}
    dev=${dev##*/}
    deviceName=${dev:4}   # strip the "dev-" prefix
    slot=$(<"$INPUT/$dev/slot")
    state=$(<"$INPUT/$dev/state")
    log "$dev [$slot] is '$state'"

    DEVICE_STATE[$deviceName]=$state

    # The state is a comma separated list of flags, so every flag is tested
    # on its own instead of stopping at the first match.
    recognized=false
    if [[ "$state" == *in_sync* || "$state" == *writemostly* ]]; then
      NUM_ACTIVE_DEVICES=$((NUM_ACTIVE_DEVICES + 1))
      recognized=true
    fi
    if [[ "$state" == *faulty* || "$state" == *blocked* ]]; then
      FAULTY_DEVICES[$deviceName]=true
      recognized=true
    fi
    # The kernel spells the flag "write_error"; the dashed form is kept for
    # compatibility with the previous pattern.
    if [[ "$state" == *write_error* || "$state" == *write-error* \
       || "$state" == *want_replacement* ]]; then
      TO_REPLACE_DEVICES[$deviceName]=true
      recognized=true
    fi
    # "want_replacement" must not be mistaken for the "replacement" flag.
    stateWithoutWant=${state//want_replacement/}
    if [[ "$stateWithoutWant" == *replacement* || "$state" == *spare* ]]; then
      NUM_SPARE_DEVICES=$((NUM_SPARE_DEVICES + 1))
      recognized=true
    fi
    # Anything that matched none of the known flags is reported as faulty.
    if [[ "$recognized" == false ]]; then
      FAULTY_DEVICES[$deviceName]=true
    fi
  done

  log "NUM_DEVICES = $NUM_DEVICES"
  log "NUM_ACTIVE_DEVICES = $NUM_ACTIVE_DEVICES"
  log "NUM_SPARE_DEVICES = $NUM_SPARE_DEVICES"
  log "FAULTY_DEVICES = ${!FAULTY_DEVICES[*]}"
  log "TO_REPLACE_DEVICES = ${!TO_REPLACE_DEVICES[*]}"

  # sync_completed is "<done> / <total>" while a sync runs, but may also be
  # a word such as "none"; only compute progress when both numbers exist.
  local progressPattern='^([0-9]+) / ([0-9]+)$'
  if [[ "$SYNC_ACTION" != "idle" && "$SYNC_PROGRESS" =~ $progressPattern ]] \
     && (( BASH_REMATCH[2] > 0 )); then
    SYNC_NUM=${BASH_REMATCH[1]}
    log "'$SYNC_NUM'"
    SYNC_DEN=${BASH_REMATCH[2]}
    log "'$SYNC_DEN'"
    SYNC_PERCENT=$((SYNC_NUM * 100 / SYNC_DEN))

    # The sector size file can be absent when --input points elsewhere;
    # fall back to 512 in that case.
    local sectorSizeFile
    sectorSizeFile="$(dirname "$INPUT")/queue/hw_sector_size"
    BLOCK_SIZE=512
    if [[ -r "$sectorSizeFile" ]]; then
      BLOCK_SIZE=$(<"$sectorSizeFile")
    fi
    if [[ ! "$BLOCK_SIZE" =~ ^[0-9]+$ ]]; then
      BLOCK_SIZE=512
    fi

    # sync_speed may be non-numeric or 0; avoid a division by zero.
    if [[ "$SYNC_SPEED" =~ ^[0-9]+$ ]] && (( SYNC_SPEED > 0 )); then
      SYNC_REMAINING_S=$(( (SYNC_DEN - SYNC_NUM) * BLOCK_SIZE / 1024 / SYNC_SPEED ))
      SYNC_REMAINING_MIN=$((SYNC_REMAINING_S / 60))
    fi
  fi
}

# Build the status line from the globals filled by getDataFromSys, print it
# together with the performance data and exit with the matching status
# (OK = 0, WARNING = 1, CRITICAL = 2).
function printOutputAndExit {
  local result=""
  local info=""

  if (( NUM_ACTIVE_DEVICES < NUM_DEVICES )); then
    result="CRITICAL"
    info="${info:+$info. }Missing $((NUM_DEVICES - NUM_ACTIVE_DEVICES)) of $NUM_DEVICES devices"
  fi
  if (( ${#FAULTY_DEVICES[@]} > 0 )); then
    result="CRITICAL"
    info="${info:+$info. }These devices failed: ${!FAULTY_DEVICES[*]}"
  fi
  if (( ${#TO_REPLACE_DEVICES[@]} > 0 )); then
    result=${result:-WARNING}
    info="${info:+$info. }These devices should be replaced: ${!TO_REPLACE_DEVICES[*]}"
  fi

  # Progress text shared by the "recover" and "check" messages; parts that
  # could not be determined are left out.
  local syncDetails=""
  if [[ -n "$SYNC_PERCENT" ]]; then
    syncDetails=": $SYNC_PERCENT%"
    if [[ -n "$SYNC_REMAINING_MIN" ]]; then
      syncDetails+=", remaining ${SYNC_REMAINING_MIN}min"
    fi
  fi
  if [[ "$SYNC_ACTION" == "recover" ]]; then
    result=${result:-WARNING}
    info="${info:+$info. }Recovering$syncDetails"
  fi
  if [[ "$SYNC_ACTION" == "check" ]]; then
    info="${info:+$info. }Checking$syncDetails"
  fi
  result=${result:-OK}

  local -a perfData=(
    "'raid level: $LEVEL'=0"
    "'size in blocks'=$SIZE_IN_BLOCKS"
    "'num devices'=$NUM_DEVICES"
    "'num active devices'=$NUM_ACTIVE_DEVICES;;;0;$NUM_DEVICES"
    "'num failed devices'=${#FAULTY_DEVICES[@]};;1;0;$NUM_DEVICES"
    "'num spare devices'=$NUM_SPARE_DEVICES;;;0"
  )
  if [[ "$SYNC_ACTION" != "idle" ]]; then
    if [[ -n "$SYNC_NUM" ]]; then
      perfData+=("'sync progress'=$SYNC_NUM;0;;0;$SYNC_DEN")
    fi
    if [[ "$SYNC_SPEED" =~ ^[0-9]+$ ]]; then
      perfData+=("'sync speed [1/s]'=${SYNC_SPEED}K")
    fi
    if [[ -n "$SYNC_REMAINING_S" ]]; then
      perfData+=("'sync remaining'=${SYNC_REMAINING_S}s")
    fi
  fi

  local dev label
  for dev in "${!DEVICE_STATE[@]}"; do
    label="dev $dev: ${DEVICE_STATE[$dev]}"
    if [[ -n "${FAULTY_DEVICES[$dev]}" ]]; then
      label+=" faulty"
    fi
    if [[ -n "${TO_REPLACE_DEVICES[$dev]}" ]]; then
      label+=" wants replacement"
    fi
    perfData+=("'$label'=0")
  done

  printf '%s' "$result"
  if [[ -n "$info" ]]; then
    printf ': %s' "$info"
  fi
  printf ' |'
  printf ' %s' "${perfData[@]}"
  printf '\n'

  case "$result" in
    OK) exit 0 ;;
    WARNING) exit 1 ;;
    CRITICAL) exit 2 ;;
  esac
}

getDataFromSys
printOutputAndExit