Script: Bash: AIX: Daily Check Script

Summary: A script for the daily checks on AIX servers (error log, application logs, Oracle alert log and file processing).
Date: 27 December 2010
Refactor: 21 February 2025: Checked links and formatting.

#!/bin/bash
########################################################################################################################
# Author : Sjoerd Hooft
# Date Initial Version: 27 Dec 2010
# Comments: sjoerd_@_warmetal_nl
#
# Description:
# This is a sample script to perform the daily checks on AIX servers.
#
# Recommendations:
# The script is designed for a 120 column terminal.
# The running user must be able to do a passwordless sudo to root.
#
# Changes:
# Please comment on your changes to the script (your name and email address, line number, description):
########################################################################################################################
 
# Script Variables
# HOSTNAME_SHORT selects the host specific checks; AUTOMATIC is switched to 1 by the "auto" argument.
HOSTNAME_SHORT=$(hostname -s)
AUTOMATIC=0
# "$0" is quoted so the script also works when it is started from a path containing whitespace
BASEDIR=$(dirname "$0")
LOGFILE="$BASEDIR/dc.log"
WHATAMI=$(basename "$0")
DATE=$(date +%Y%m%d)
TOMAIL=sjoerd_@_warmetal_nl
# tput fails when TERM is not set (for example when started from cron);
# its error message is discarded and the bold markers simply stay empty in that case
BOLD=$(tput bold 2>/dev/null)
BOLDOFF=$(tput sgr0 2>/dev/null)
 
# Directories
# Locations of the log and data files that the checks in this script look at.
APPDIR=/var/log/APP                    # named by checkLog-abs
WASDIR=/opt/WAS_Profiles/AppSrv/logs   # websphere logs, read by checkLog-was
FILE3DIR=/var/data/FILE3               # listed by checkFiles-host3
FILE1DIR=/var/data/FILE1/log
JMSDIR=/var/data/app/jms_errors        # JMS/MQ error files, see checkFiles-host1 and removeJms
TOMCATDIR=/var/log/app                 # tomcat logs, searched by checkLog-tomcat
 
# Oracle Variables
# ORACLE_BASE and ORACLE_HOME are exported to child processes.
# The two SIDs stay plain shell variables; hostSpecific exports one of them as ORACLE_SID.
ORACLE_BASE=/opt/oracle
ORACLE_HOME=$ORACLE_BASE/product/10.2
ORACLE_SID_DB1=db1
ORACLE_SID_DB2=db2
export ORACLE_BASE ORACLE_HOME
 
# Function to pause the script
# Gives the operator time to evaluate the output of the previous check:
# waits for ENTER and clears the screen afterwards.
# Does nothing when AUTOMATIC is anything other than "0".
scriptContinue () {
   [[ "$AUTOMATIC" == "0" ]] || return 0
   printf '%s\n' "Press ENTER to continue"
   read CONTINUE
   clear
}
 
# Function that will list the AIX internal errors
# Prints a heading, the output of errpt and a closing blank line.
checkErrors () {
   printf '%s\n' "$BOLD Listing the Error Logging Facility: $BOLDOFF"
   errpt
   printf '\n'
}
 
# Function that will clear all AIX internal errors
# Announces the action and then runs "errclear 0" through sudo.
clearErrors () {
   printf '%s\n' "Clearing Errors"
   sudo errclear 0
}
 
# Function that will let the operator view the AIX internal errors in detail
# The output of "errpt -a" is paged through less.
viewErrors () {
   printf '%s\n' "Viewing Errors"
   errpt -a | less
}
 
# Function that will remove all files from the protected directory that holds JMS/MQ errors
# The operator gets the chance to abort with CTRL+C before anything is removed.
# Globals: JMSDIR (read)
# Returns: 1 when JMSDIR is empty, 0 otherwise
removeJms () {
   echo "Are you sure you want to remove the JMS error files from $JMSDIR? "
   echo "If you hesitate, press CTRL+C to exit the script. "
   scriptContinue
   # Safety net: with an empty JMSDIR the glob below would expand to /* and be removed as root
   if [ -z "$JMSDIR" ]; then
      echo "JMSDIR is not set, refusing to remove anything" >&2
      return 1
   fi
   # Expand the glob once, quoted, so names containing whitespace stay intact.
   # "set --" only replaces the positional parameters of this function.
   set -- "$JMSDIR"/*
   if [ -e "$1" ] || [ -L "$1" ]; then
      echo "Removing these files: "
      echo "$@"
      sudo rm "$@"
   else
      # An unmatched glob stays literal; do not hand that literal pattern to rm
      echo "No files found in $JMSDIR"
   fi
   echo
   echo "Done"
   echo
}
 
# Function that will check the last 4 logfiles of 3 different applications
# This is possible with nested for loops since the files are named similarly
# Known errors are being skipped
# It will show only the last 10 entries per logfile
# Globals: APPDIR, BOLD, BOLDOFF (read)
checkLog-abs () {
   local application logfile
   echo "$BOLD Checking abs-logs in $APPDIR $BOLDOFF "
   echo "Note: we check the last 4 logfiles and skip any known error, and limit the amount of lines to 10."
   for application in appserver1 appserver2 appserver3; do
      for logfile in app.log.4 app.log.3 app.log.2 app.log; do
         echo "Checking $BOLD $application-$logfile $BOLDOFF "
         # The logfiles are read from APPDIR, the directory announced above;
         # the formerly used ABSDIR is never assigned in this script.
         grep ERROR "$APPDIR/$application-$logfile" |
            grep -v -e 'LDAP: error code 32 - No Such Object' \
                    -e 'doRefreshProposalsResponse didn.d send the email caught - ignoring' \
                    -e 'Error getting active tan: No TAN available for user' \
                    -e 'CORBA OBJECT_NOT_EXIST' |
            tail -10
         echo
      done
      # scriptContinue already clears the screen for an interactive operator; no extra
      # clear here, so that automatic runs do not write control codes into the logfile
      scriptContinue
   done
   echo
}
 
# Function to check the SystemOut.log from the websphere applications
# Known errors are being skipped
# It will show only the last 10 entries per logfile
# Globals: WASDIR, BOLD, BOLDOFF (read)
checkLog-was () {
   local server
   echo "$BOLD Checking websphere logs in $WASDIR $BOLDOFF "
   for server in server1 server2 server3 server4; do
      echo "Checking $BOLD ${server}_Server/SystemOut.log $BOLDOFF "
      # The path is quoted so a WASDIR containing whitespace keeps working;
      # grep reads the file directly and all known errors are dropped by one grep -v
      grep -i error "$WASDIR/${server}_Server/SystemOut.log" |
         grep -v -e 'oracle.jdbc.driver.DatabaseError.throwSqlException' \
                 -e 'The Network Adapter could not establish the connectionDSRA0010E' \
                 -e 'Error creating XA Connection and Resource com.ibm.ws.exception.WsException: DSRA8100E' \
                 -e 'Error creating XA Connection and Resource java.security.PrivilegedActionException:' |
         tail -10
      echo
      scriptContinue
   done
   echo
}
 
# Function to check whether files have been processed.
# NOTE(review): processed files presumably get a different extension - confirm with the application owner.
# Globals: FILE3DIR, HOSTNAME_SHORT, BOLD, BOLDOFF (read)
checkFiles-host3 () {
   echo "$BOLD Checking the process on $HOSTNAME_SHORT $BOLDOFF "
   echo "There should be no files ending on .txt older than one hour:"
   echo "Last 10 files ending on .txt in $FILE3DIR:"
   # First listing: only names that END in .txt (directory quoted to survive whitespace)
   ls -ltr "$FILE3DIR" | grep '\.txt$' | tail -10
   echo
   echo "$BOLD Checking the process on $HOSTNAME_SHORT $BOLDOFF "
   echo "There should be recent (last 24 hours) files:"
   echo "Last 10 files in $FILE3DIR:"
   # Second listing: every name that CONTAINS .txt, so renamed files are shown as well
   ls -ltr "$FILE3DIR" | grep '\.txt' | tail -10
   echo
   scriptContinue
}
 
# Function that will check whether error files exist
# It will allow the operator, after examining the size, to delete them
# Continue works only in this menu structure because this is the last check for this host
# Globals: JMSDIR, AUTOMATIC, HOSTNAME_SHORT, BOLD, BOLDOFF (read), JMSACTION (written)
checkFiles-host1 () {
   echo "$BOLD Checking MQ process error files on $HOSTNAME_SHORT $BOLDOFF "
   echo "Checking for jms (MQ) errors in $JMSDIR, there should be no files in this directory:"
   ls -ltr "$JMSDIR"
   if [ "$AUTOMATIC" == "0" ]; then
      # The long listing normally starts with a "total" line, so more than one
      # line of output means that the directory contains entries
      JMSACTION=$(ls -ltr "$JMSDIR" | wc -l)
      if [ "$JMSACTION" -gt 1 ]; then
         echo
         echo "${BOLD}There are files in this directory!$BOLDOFF If all files are really small ( < 100 bytes ) you can delete them. "
         echo "   Would you like to do that right now?"
         echo
         echo "remove            - remove all files in $JMSDIR"
         echo "continue          - continue with dailycheck"
         echo
         menuChoice
      fi
      scriptContinue
   else
      echo "AUTOMATIC mode is on. If there are any files run the script manually on $HOSTNAME_SHORT "
   fi
}
 
# Function to check the Oracle alert log in the bdump directory for errors
# It is meant to show each line containing ORA-, as well as the 2 lines before and after
# Only the last 10 lines of that filtered output are shown per logfile
# Globals: ORACLE_SID (read; exported by hostSpecific before this function is called),
#          ORALOGDIR (written; assigned without "local", so it remains set afterwards)
checkLog-ora () {
   ORALOGDIR="/var/log/oracle/10.2/${ORACLE_SID}/bdump"
   echo "$BOLD Checking the Oracle logfile $ORALOGDIR/alert_$ORACLE_SID.log $BOLDOFF "
   echo "The last 10 ORA- messages are displayed, including the 2 lines before and the two lines after "
   # The alert log is read through sudo (presumably it is not readable for the operator - confirm).
   # NOTE(review): the sed program below is intended to print the ORA- lines with their
   # surrounding context as described above; its exact output was not verified here.
   # tail -10 limits the result to the last 10 LINES of the sed output, not 10 messages.
   sudo cat $ORALOGDIR/alert_$ORACLE_SID.log | sed -e '
      1{$!N;$d;}
      $!N;/ORA-/!D
      $!N;$d;N;p
      g;$!N;$d;N;D
      '| tail -10
   echo
   # Pause for the operator (no-op in automatic mode)
   scriptContinue
}
 
# Function to check tomcat application servers for errors
# It will evaluate the application* and framework* logfiles that were modified recently
# and print every line containing ERROR
# Globals: TOMCATDIR, HOSTNAME_SHORT, BOLD, BOLDOFF (read)
# NOTE(review): the messages name $TOMCATDIR, but the search starts in $TOMCATDIR/app - confirm the layout.
# NOTE(review): "-mtime -3" selects files modified less than 3 days ago, the messages say four days - confirm.
# NOTE(review): the file names written by -print pass through "grep ERROR" as well, so they are
#               only shown when the name itself contains ERROR.
checkLog-tomcat () {
   echo "$BOLD Checking the tomcat application server logs on $HOSTNAME_SHORT $BOLDOFF "
   echo "$BOLD Checking Tomcat logfiles: $BOLDOFF"
   echo "Checking the last four days of $TOMCATDIR/application.log files"
   # The start directory is quoted so a TOMCATDIR containing whitespace keeps working
   find "$TOMCATDIR/app/." -type f -name 'application*' -mtime -3 -print -exec cat {} \; | grep ERROR
   echo
   echo "Checking the last four days of $TOMCATDIR/framework.log files"
   find "$TOMCATDIR/app/." -type f -name 'framework*' -mtime -3 -print -exec cat {} \; | grep ERROR
   echo
   scriptContinue
}
 
# Function to expand the options handling AIX system errors
# Redraws the menu header, lists the errors again and prints the two extra
# menu options; the choice itself is read by the caller.
actionErrors () {
   menuStart
   checkErrors
   printf '%s\n' \
      "Note: The system clears all hardware errors automatically after 90 days, and all other errors after 30 days." \
      "" \
      "clearerrors       - clear all errors now" \
      "viewerrors        - review errors in less"
}
 
# Function to specify which host the script runs on
# Declare host specific variables
# Set the actions to be taken
# Globals: HOSTNAME_SHORT, AUTOMATIC, ORACLE_SID_DB1, ORACLE_SID_DB2 (read), ORACLE_SID (exported)
# Hosts other than host1, host2 and host3 have no checks defined and are skipped silently.
hostSpecific () {
   # Only wipe the screen for an operator; in automatic mode stdout is the logfile
   # and clear would write terminal control output (or an error) into it
   if [ "$AUTOMATIC" == "0" ]; then
      clear
   fi
   case "$HOSTNAME_SHORT" in
      host1)
         checkLog-abs
         checkLog-was
         checkFiles-host1
         ;;
      host2)
         export ORACLE_SID=$ORACLE_SID_DB2
         checkLog-ora
         ;;
      host3)
         export ORACLE_SID=$ORACLE_SID_DB1
         checkLog-ora
         checkLog-tomcat
         checkFiles-host3
         ;;
   esac
}
 
# Function to clear the screen and give the idea of a pretty script
# Prints the two banner lines of the menu followed by a blank line.
menuStart () {
   clear
   printf '%s\n' \
      "########################################################################################################################" \
      "################################################### Daily Check Menu ###################################################" \
      ""
}
 
# Function to show the operator the default menu options
# Prints the standard choices (errors, host, auto, exit) framed by blank lines.
menuEnd () {
   printf '%s\n' \
      "" \
      "errors            - take further actions regarding errors" \
      "host              - start host specific checks" \
      "auto              - restarts the script and runs it automatically, after which the logfile is mailed to $TOMAIL " \
      "                  - this also works from the commandline: $WHATAMI auto " \
      "" \
      "exit              - exit" \
      ""
}
 
# Function to read the menu option from the operator
# This menu is used for all required menus in the script
# Globals: MENUCHOICE (written), BASEDIR, WHATAMI (read)
# An empty answer, or end of input, selects the default "exit".
menuChoice () {
   echo "Enter menu choice: [exit]"
   # -r keeps backslashes in the answer literal
   read -r MENUCHOICE
   MENUCHOICE=${MENUCHOICE:-exit}

   case "$MENUCHOICE" in

   errors )
      # Show the extra error options, then ask again
      actionErrors
      menuChoice
   ;;

   host )
      hostSpecific
   ;;

   clearerrors )
      clearErrors
   ;;

   viewerrors )
      viewErrors
   ;;

   auto )
      # Quoted so the script is still found when its location contains whitespace
      "$BASEDIR/$WHATAMI" auto
      exit
   ;;

   exit )
      exit
   ;;

   remove )
      removeJms
   ;;

   continue )
      echo
   ;;

   * )
      # Unknown answer: report it and ask again
      echo "Wrong Input"
      menuChoice
   ;;

   esac
}
 
# Function to mail the log when the script has run automatically
# Globals: LOGFILE, WHATAMI, HOSTNAME_SHORT, DATE, TOMAIL (read)
# Returns: the status of mail, or non-zero without sending anything when LOGFILE cannot be read
mailLog () {
   # LOGFILE is quoted because it is derived from the script location, which may contain whitespace.
   # TOMAIL is left unquoted on purpose so it can hold several space separated recipients.
   mail -s "Report $WHATAMI on $HOSTNAME_SHORT of $DATE" $TOMAIL < "$LOGFILE"
}
 
# Determine whether the script should run automatically (first argument is "auto")
# Set the automatic variables to send the output to a logfile instead of a screen
# and make the logfile readable by removing bold text markers
# It also makes sure the logfile gets mailed, after which the script exits
if [ "$1" == "auto" ]; then
   AUTOMATIC=1
   BOLD=
   BOLDOFF=
   # From here on stdout and stderr go to the logfile; the target is quoted because
   # an unquoted name containing whitespace makes bash fail with "ambiguous redirect"
   exec > "$LOGFILE" 2>&1
   checkErrors
   hostSpecific
   mailLog
   exit
fi
 
# Actual script:
# Endless loop: redraw the menu, list the errors, show the default options
# and process the operator's choice. The loop only ends when one of the
# menu actions exits the script.
while true; do
   menuStart
   checkErrors
   menuEnd
   menuChoice
done