#!/bin/sh
# $Id: check_slony_lag.sh 299 2008-04-08 14:19:47Z wmoran $

# nagios plugin that checks whether the slave nodes in a slony cluster
# are being updated from the master
#
# possible exit statuses:
#  0 = OK
#  1 = WARNING, one or more slaves are falling behind
#  2 = Error, one or more slave nodes are not sync'ing with the master
#
# script requires five parameters:
# CLUSTERNAME - name of slon cluster to be checked
# DBNAME - name of master database
# DBHOST - host name of master database
# LAGLIMIT - number of backlogged events that results in an error
# LAGWARN - number of backlogged events that results in a warning
#
# It also depends on PGPORT being set to the appropriate port
#
# Author: Bill Moran <wmoran@collaborativefusion.com> Collaborative Fusion Inc.
# Based on the script by John Sidney-Woollett

# check parameters are valid
# Exactly five positional parameters are required; anything else is reported
# on stdout and the plugin exits with its error status (2).
if [ $# -ne 5 ]
then
   echo "Invalid parameters need CLUSTERNAME DBNAME DBHOST LAGLIMIT LAGWARN"
   exit 2
fi

# assign parameters
CLUSTERNAME=$1
DBNAME=$2
DBHOST=$3
LAGLIMIT=$4
LAGWARN=$5

# LAGLIMIT and LAGWARN are spliced verbatim into the SQL text of the lag
# queries, so insist on plain non-negative integers before going any further.
for threshold in "$LAGLIMIT" "$LAGWARN"
do
   case $threshold in
      ''|*[!0-9]*)
         echo "Invalid parameters LAGLIMIT and LAGWARN must be non-negative integers"
         exit 2
         ;;
   esac
done

# CLUSTERNAME is spliced into a double-quoted SQL identifier; an embedded
# double quote would end that identifier early and corrupt the statement.
case $CLUSTERNAME in
   *\"*)
      echo "Invalid parameters CLUSTERNAME must not contain a double quote"
      exit 2
      ;;
esac

# First pass: compare the number of active receivers (ttlcount) with the
# number of those whose st_lag_num_events is below LAGWARN (okcount).
# The query returns one row reading either
#   OK - <okcount> nodes in sync
# or
#   WARNING - <ttlcount-okcount> of <ttlcount> nodes lagging
SQL="select case
   when ttlcount = okcount then 'OK - '||okcount||' nodes in sync'
   else 'WARNING - '||ttlcount-okcount||' of '||ttlcount||' nodes lagging'
end as syncstatus
from (
-- determine total active receivers
select (select count(distinct sub_receiver)
     from \"_$CLUSTERNAME\".sl_subscribe
     where sub_active = true) as ttlcount,
(
  select count(*) from (
   select st_received, st_lag_num_events
   from \"_$CLUSTERNAME\".sl_status
   where st_received in (
     select distinct sub_receiver
     from \"_$CLUSTERNAME\".sl_subscribe
     where sub_active = true
   )
) as t1
where st_lag_num_events < $LAGWARN) as okcount
) as t2"

# Host and database name are quoted so they always reach psql as exactly one
# argument each.  If psql itself reports failure, discard whatever it printed
# so the emptiness check below treats the run as a failed query.
CHECK1=$(psql -c "$SQL" --tuples-only -U pgsql -h "$DBHOST" "$DBNAME") || CHECK1=

if [ -z "$CHECK1" ]
then
   echo "ERROR querying $DBNAME"
   exit 2
fi

# Extract the first word of the first query's result as the status keyword.
# $CHECK1 is deliberately left unquoted in the echo calls: word splitting
# collapses any leading/trailing whitespace in the captured psql output.
STATUS=$(echo $CHECK1 | awk '{print $1}')
# $STATUS is quoted so that an empty value is compared as an empty string
# instead of producing a malformed test expression.
if [ "$STATUS" = "OK" ]
then
   # every active receiver is below LAGWARN: report it and finish with OK
   echo $CHECK1
   exit 0
fi

# See if we're in the ERROR range or the WARNING range
# Second pass: same shape as the first query, but okcount now counts the
# active receivers whose st_lag_num_events is below LAGLIMIT.  The single
# result row reads either
#   OK - <okcount> nodes in sync
# or
#   ERROR - <ttlcount-okcount> of <ttlcount> nodes not in sync
SQL="select case
   when ttlcount = okcount then 'OK - '||okcount||' nodes in sync'
   else 'ERROR - '||ttlcount-okcount||' of '||ttlcount||' nodes not in sync'
end as syncstatus
from (
-- determine total active receivers
select (select count(distinct sub_receiver)
     from \"_$CLUSTERNAME\".sl_subscribe
     where sub_active = true) as ttlcount,
(
  select count(*) from (
   select st_received, st_lag_num_events
   from \"_$CLUSTERNAME\".sl_status
   where st_received in (
     select distinct sub_receiver
     from \"_$CLUSTERNAME\".sl_subscribe
     where sub_active = true
   )
) as t1
where st_lag_num_events < $LAGLIMIT) as okcount
) as t2"

# Host and database name are quoted so they always reach psql as exactly one
# argument each.  If psql itself reports failure, discard whatever it printed
# so the emptiness check below treats the run as a failed query.
CHECK2=$(psql -c "$SQL" --tuples-only -U pgsql -h "$DBHOST" "$DBNAME") || CHECK2=

if [ -z "$CHECK2" ]
then
   echo "ERROR querying $DBNAME"
   exit 2
fi

# and check the return status
# The first word of the second query's result is the status keyword.
# $CHECK1/$CHECK2 are deliberately left unquoted in the echo calls: word
# splitting collapses any leading/trailing whitespace in the psql output.
STATUS=$(echo $CHECK2 | awk '{print $1}')
# $STATUS is quoted so that an empty value is compared as an empty string
# instead of producing a malformed test expression.
if [ "$STATUS" = "ERROR" ]
then
   # at least one receiver has reached LAGLIMIT: report the ERROR line
   echo $CHECK2
   exit 2
else
   # nothing has reached LAGLIMIT, but the first query was not OK:
   # report its line and finish with the WARNING status
   echo $CHECK1
   exit 1
fi


