Monitoring Solaris Volume Manager With a cron Job
How to Automate Checking for Errors in Volumes
- To automatically check your Solaris Volume Manager configuration for errors, create a script
that the cron utility can periodically run.
The following example shows a script that you can adapt and modify for
your needs.
Note - This script serves as a starting point for automating error checking for
Solaris Volume Manager. You probably need to modify this script for your own
configuration.
#!/bin/ksh
#
#ident "@(#)metacheck.sh 1.3 96/06/21 SMI"
# ident='%Z%%M% %I% %E% SMI'
#
# Copyright (c) 1999 by Sun Microsystems, Inc.
#
# metacheck
#
# Check on the status of the metadevice configuration. If there is a problem
# return a non zero exit code. Depending on options, send email notification.
#
# -h
# help
# -s setname
# Specify the set to check. By default, the 'local' set will be checked.
# -m recipient [recipient...]
# Send email notification to the specified recipients. This
# must be the last argument. The notification shows up as a short
# email message with a subject of
# "Solaris Volume Manager Problem: metacheck.who.nodename.setname"
# which summarizes the problem(s) and tells how to obtain detailed
# information. The "setname" is from the -s option, "who" is from
# the -w option, and "nodename" is reported by uname(1).
# Email notification is further affected by the following options:
# -f to suppress additional messages after a problem
# has been found.
# -d to control the suppression.
# -w to identify who generated the email.
# -t to force email even when there is no problem.
# -w who
# indicate who is running the command. By default, this is the
# user-name as reported by id(1M). This is used when sending
# email notification (-m).
# -f
# Enable filtering. Filtering applies to email notification (-m).
# Filtering requires root permission. When sending email notification
# the file /etc/lvm/metacheck.setname.pending is used to
# control the filter. The following matrix specifies the behavior
# of the filter:
#
#            problem_found    file_exists
#            yes              no       Create file, send notification
#            yes              yes      Resend notification if the current date
#                                      (as specified by -d datefmt) is
#                                      different than the file date.
#            no               yes      Delete file, send notification
#                                      that the problem is resolved.
#            no               no       Send notification if -t specified.
#
# -d datefmt
# Specify the format of the date for filtering (-f). This option
# controls how often re-notification via email occurs. If the
# current date according to the specified format (strftime(3C)) is
# identical to the date contained in the
# /etc/lvm/metacheck.setname.pending file then the message is
# suppressed. The default date format is "%D", which will send one
# re-notification per day.
# -t
# Test mode. Enable email generation even when there is no problem.
# Used for end-to-end verification of the mechanism and email addresses.
#
#
# These options are designed to allow integration of metacheck
# into crontab. For example, a root crontab entry of:
#
# 0,15,30,45 * * * * /usr/sbin/metacheck -f -w SVMcron \
# -d '\%D \%H' -m [email protected] [email protected]
#
# would check for problems every 15 minutes, and generate an email to
# [email protected] (and send to an email pager service) every hour when
# there is a problem. Note the \ prior to the '%' characters for a
# crontab entry. Bounced email would come back to root@nodename.
# The subject line for email generated by the above line would be
# Solaris Volume Manager Problem: metacheck.SVMcron.nodename.local
#
# display a debug line to controlling terminal (works in pipes)
decho()
{
if [ "$debug" = "yes" ] ; then
echo "DEBUG: $*" < /dev/null > /dev/tty 2>&1
fi
}
# if string $1 is in $2-* then return $1, else return ""
strstr()
{
typeset look="$1"
typeset ret=""
shift
# decho "strstr LOOK .$look. FIRST .$1."
while [ $# -ne 0 ] ; do
if [ "$look" = "$1" ] ; then
ret="$look"
fi
shift
done
echo "$ret"
}
# if string $1 is in $2-* then delete it. return result
strdstr()
{
typeset look="$1"
typeset ret=""
shift
# decho "strdstr LOOK .$look. FIRST .$1."
while [ $# -ne 0 ] ; do
if [ "$look" != "$1" ] ; then
ret="$ret $1"
fi
shift
done
echo "$ret"
}
merge_continued_lines()
{
awk -e '\
BEGIN { line = "";} \
$NF == "\\" { \
$NF = ""; \
line = line $0; \
next; \
} \
$NF != "\\" { \
if ( line != "" ) { \
print line $0; \
line = ""; \
} else { \
print $0; \
} \
}'
}
# trim out stuff not associated with metadevices
find_meta_devices()
{
typeset devices=""
# decho "find_meta_devices .$*."
while [ $# -ne 0 ] ; do
case $1 in
d+([0-9]) ) # metadevice name
devices="$devices $1"
;;
esac
shift
done
echo "$devices"
}
# return the list of top level metadevices
toplevel()
{
typeset comp_meta_devices=""
typeset top_meta_devices=""
typeset devices=""
typeset device=""
typeset comp=""
metastat$setarg -p | merge_continued_lines | while read line ; do
echo "$line"
devices=`find_meta_devices $line`
set -- $devices
if [ $# -ne 0 ] ; then
device=$1
shift
# check to see if device already refered to as component
comp=`strstr $device $comp_meta_devices`
if [ -z $comp ] ; then
top_meta_devices="$top_meta_devices $device"
fi
# add components to component list, remove from top list
while [ $# -ne 0 ] ; do
comp=$1
comp_meta_devices="$comp_meta_devices $comp"
top_meta_devices=`strdstr $comp $top_meta_devices`
shift
done
fi
done > /dev/null 2>&1
echo $top_meta_devices
}
#
# - MAIN
#
METAPATH=/usr/sbin
PATH=/usr/bin:$METAPATH
USAGE="usage: metacheck [-s setname] [-h] [[-t] [-f [-d datefmt]] \
[-w who] -m recipient [recipient...]]"
datefmt="%D"
debug="no"
filter="no"
mflag="no"
set="local"
setarg=""
testarg="no"
who=`id | sed -e 's/^uid=[0-9][0-9]*(//' -e 's/).*//'`
while getopts d:Dfms:tw: flag
do
case $flag in
d) datefmt=$OPTARG;
;;
D) debug="yes"
;;
f) filter="yes"
;;
m) mflag="yes"
;;
s) set=$OPTARG;
if [ "$set" != "local" ] ; then
setarg=" -s $set";
fi
;;
t) testarg="yes";
;;
w) who=$OPTARG;
;;
\?) echo $USAGE
exit 1
;;
esac
done
# if mflag is specified then everything else is part of the recipient list
shift `expr $OPTIND - 1`
if [ $mflag = "no" ] ; then
if [ $# -ne 0 ] ; then
echo $USAGE
exit 1
fi
else
if [ $# -eq 0 ] ; then
echo $USAGE
exit 1
fi
fi
recipients="$*"
curdate_filter=`date +$datefmt`
curdate=`date`
node=`uname -n`
# establish files
msg_f=/tmp/metacheck.msg.$$
msgs_f=/tmp/metacheck.msgs.$$
metastat_f=/tmp/metacheck.metastat.$$
metadb_f=/tmp/metacheck.metadb.$$
metahs_f=/tmp/metacheck.metahs.$$
pending_f=/etc/lvm/metacheck.$set.pending
files="$metastat_f $metadb_f $metahs_f $msg_f $msgs_f"
rm -f $files > /dev/null 2>&1
trap "rm -f $files > /dev/null 2>&1; exit 1" 1 2 3 15
# Check to see if metadb is capable of running
have_metadb="yes"
metadb$setarg > $metadb_f 2>&1
if [ $? -ne 0 ] ; then
have_metadb="no"
fi
grep "there are no existing databases" < $metadb_f > /dev/null 2>&1
if [ $? -eq 0 ] ; then
have_metadb="no"
fi
grep "/dev/md/admin" < $metadb_f > /dev/null 2>&1
if [ $? -eq 0 ] ; then
have_metadb="no"
fi
# check for problems accessing metadbs
retval=0
if [ "$have_metadb" = "no" ] ; then
retval=1
echo "metacheck: metadb problem, can't run '$METAPATH/metadb$setarg'" \
>> $msgs_f
else
# snapshot the state
metadb$setarg 2>&1 | sed -e '1d' | merge_continued_lines > $metadb_f
metastat$setarg 2>&1 | merge_continued_lines > $metastat_f
metahs$setarg -i 2>&1 | merge_continued_lines > $metahs_f
#
# Check replicas for problems, capital letters in the flags
# indicate an error, fields are separated by tabs.
#
problem=`awk < $metadb_f -F\t '{if ($1 ~ /[A-Z]/) print $1;}'`
if [ -n "$problem" ] ; then
retval=`expr $retval + 64`
echo "\
metacheck: metadb problem, for more detail run:\n\t$METAPATH/metadb$setarg -i" \
>> $msgs_f
fi
#
# Check the metadevice state
#
problem=`awk < $metastat_f -e \
'/State:/ {if ($2 != "Okay" && $2 != "Resyncing") print $0;}'`
if [ -n "$problem" ] ; then
retval=`expr $retval + 128`
echo "\
metacheck: metadevice problem, for more detail run:" \
>> $msgs_f
# refine the message to toplevel metadevices that have a problem
top=`toplevel`
set -- $top
while [ $# -ne 0 ] ; do
device=$1
problem=`metastat $device | awk -e \
'/State:/ {if ($2 != "Okay" && $2 != "Resyncing") print $0;}'`
if [ -n "$problem" ] ; then
echo "\t$METAPATH/metastat$setarg $device" >> $msgs_f
# find out what is mounted on the device
mp=`mount|awk -e '/\/dev\/md\/dsk\/'$device'[ \t]/{print $1;}'`
if [ -n "$mp" ] ; then
echo "\t\t$mp mounted on $device" >> $msgs_f
fi
fi
shift
done
fi
#
# Check the hotspares to see if any have been used.
#
problem=""
grep "no hotspare pools found" < $metahs_f > /dev/null 2>&1
if [ $? -ne 0 ] ; then
problem=`awk < $metahs_f -e \
'/blocks/ { if ( $2 != "Available" ) print $0;}'`
fi
if [ -n "$problem" ] ; then
retval=`expr $retval + 256`
echo "\
metacheck: hot spare in use, for more detail run:\n\t$METAPATH/metahs$setarg -i" \
>> $msgs_f
fi
fi
# If any errors occurred, then mail the report
if [ $retval -ne 0 ] ; then
if [ -n "$recipients" ] ; then
re=""
if [ -f $pending_f ] && [ "$filter" = "yes" ] ; then
re="Re: "
# we have a pending notification, check date to see if we resend
penddate_filter=`cat $pending_f | head -1`
if [ "$curdate_filter" != "$penddate_filter" ] ; then
rm -f $pending_f > /dev/null 2>&1
else
if [ "$debug" = "yes" ] ; then
echo "metacheck: email problem notification still pending"
cat $pending_f
fi
fi
fi
if [ ! -f $pending_f ] ; then
if [ "$filter" = "yes" ] ; then
echo "$curdate_filter\n\tDate:$curdate\n\tTo:$recipients" \
> $pending_f
fi
echo "\
Solaris Volume Manager: $node: metacheck$setarg: Report: $curdate" >> $msg_f
echo "\
--------------------------------------------------------------" >> $msg_f
cat $msg_f $msgs_f | mailx -s \
"${re}Solaris Volume Manager Problem: metacheck.$who.$set.$node" $recipients
fi
else
cat $msgs_f
fi
else
# no problems detected,
if [ -n "$recipients" ] ; then
# default is to not send any mail, or print anything.
echo "\
Solaris Volume Manager: $node: metacheck$setarg: Report: $curdate" >> $msg_f
echo "\
--------------------------------------------------------------" >> $msg_f
if [ -f $pending_f ] && [ "$filter" = "yes" ] ; then
# pending filter exists, remove it and send OK
rm -f $pending_f > /dev/null 2>&1
echo "Problem resolved" >> $msg_f
cat $msg_f | mailx -s \
"Re: Solaris Volume Manager Problem: metacheck.$who.$node.$set" $recipients
elif [ "$testarg" = "yes" ] ; then
# for testing, send mail every time even though there is no problem
echo "Messaging test, no problems detected" >> $msg_f
cat $msg_f | mailx -s \
"Solaris Volume Manager Problem: metacheck.$who.$node.$set" $recipients
fi
else
echo "metacheck: Okay"
fi
fi
rm -f $files > /dev/null 2>&1
exit $retval
For information on invoking scripts by using the cron utility, see the cron(1M) man page.
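For example, you could install the script and schedule it from the root crontab, following the crontab example in the script's header comments. The sketch below is only an illustration: the installation path /usr/sbin/metacheck matches the path used in the header comment, the file name metacheck.sh and the recipient root are placeholders, and you should substitute your own location, schedule, and notification addresses.
# Install the script and verify the notification path end to end with the
# -t (test) option before scheduling it. The -f filtering option requires
# root permission, so run these commands as root.
cp metacheck.sh /usr/sbin/metacheck
chmod +x /usr/sbin/metacheck
/usr/sbin/metacheck -t -m root
# Root crontab entry (added with crontab -e as root): check for problems
# every 15 minutes with filtering enabled.
0,15,30,45 * * * * /usr/sbin/metacheck -f -w SVMcron -m root
With this entry, a detected problem generates mail with the subject Solaris Volume Manager Problem: metacheck.SVMcron.nodename.local, and, because the default -d date format is "%D", re-notification of an unresolved problem is sent at most once per day.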