2017-09-20 16:56:49 +08:00
#!/bin/bash
# Watchdog bootstrap: installs signal handlers, waits for MySQL and Redis to
# answer, and chooses the Redis command line used by the rest of the script.

# INT/TERM trigger a normal exit; on exit, signal the whole process group.
trap "exit" INT TERM
trap "kill 0" EXIT

# Prepare
BACKGROUND_TASKS=()
echo "Waiting for containers to settle..."
sleep 30

if [[ "${USE_WATCHDOG}" =~ ^([nN][oO]|[nN])+$ ]]; then
  echo -e "$(date) - USE_WATCHDOG=n, skipping watchdog..."
  # Idle, then replace this process with a fresh run of this script.
  sleep 365d
  # Quoted so a script path containing whitespace is not split.
  exec "$(readlink -f "$0")"
fi

# Checks pipe their corresponding container name in this pipe
if [[ ! -p /tmp/com_pipe ]]; then
  mkfifo /tmp/com_pipe
fi

# Wait for containers
# Credentials are quoted so whitespace or glob characters in them survive.
while ! mysqladmin status --socket=/var/run/mysqld/mysqld.sock -u"${DBUSER}" -p"${DBPASS}" --silent; do
  echo "Waiting for SQL..."
  sleep 2
done

# Do not attempt to write to slave
# REDIS_CMDLINE is deliberately a plain string that is word-split when used.
if [[ -n ${REDIS_SLAVEOF_IP} ]]; then
  REDIS_CMDLINE="redis-cli -h ${REDIS_SLAVEOF_IP} -p ${REDIS_SLAVEOF_PORT}"
else
  REDIS_CMDLINE="redis-cli -h redis -p 6379"
fi

until [[ "$(${REDIS_CMDLINE} PING)" == "PONG" ]]; do
  echo "Waiting for Redis..."
  sleep 2
done

${REDIS_CMDLINE} DEL F2B_RES > /dev/null
2019-06-10 16:57:38 +08:00
2017-09-20 16:56:49 +08:00
# Common functions
2019-12-02 21:23:54 +08:00
# Print this host's public IPv6 address as reported by a lookup service.
# Makes up to 10 attempts, each against a randomly chosen service, pausing
# one second after every attempt except the first.
# Outputs: the address on stdout, or an empty line if no attempt succeeded.
get_ipv6() {
  local -a lookup_hosts=("ip6.korves.net" "ip6.mailcow.email")
  # Basic-regex filter: only lines that look like an IPv6 address pass.
  local addr_re='^\([0-9a-fA-F]\{0,4\}:\)\{1,7\}[0-9a-fA-F]\{0,4\}$'
  local found=
  local attempt
  for (( attempt = 0; attempt < 10; attempt++ )); do
    found=$(curl --connect-timeout 3 -m 10 -L6s ${lookup_hosts[RANDOM % ${#lookup_hosts[@]}]} | grep "${addr_re}")
    (( attempt > 0 )) && sleep 1
    [[ -n ${found} ]] && break
  done
  echo ${found}
}
2019-06-10 16:57:38 +08:00
# Store in the array named by $1 every element of the array named by $2 that
# is not present in the array named by $3 (result is sorted).
# Arguments: $1 - name of the result array (assigned in the caller's scope)
#            $2 - name of the array to take elements from
#            $3 - name of the array of elements to exclude
# Based on https://stackoverflow.com/questions/2312762, Alex Offshore
array_diff() {
  # Indirect expansion replaces eval; the prefixed names keep these locals
  # from shadowing the caller's arrays.
  local __ad_left_ref="${2}[@]"
  local __ad_right_ref="${3}[@]"
  local -a __ad_left=("${!__ad_left_ref}")
  local -a __ad_right=("${!__ad_right_ref}")
  # printf emits nothing for an empty array (guarded), so an empty input no
  # longer produces a spurious empty result element; it also copes with
  # elements such as "-n" that echo would treat as options.
  mapfile -t "${1}" < <(
    comm -23 \
      <([[ ${#__ad_left[@]} -gt 0 ]] && printf '%s\n' "${__ad_left[@]}" | sort) \
      <([[ ${#__ad_right[@]} -gt 0 ]] && printf '%s\n' "${__ad_right[@]}" | sort)
  )
}
2017-09-20 16:56:49 +08:00
# Record and log the health level of a service.
# Arguments: $1 - service name
#            $2 - total health points
#            $3 - current health points (negative values are treated as 0)
#            $4 - health trend (optional, defaults to 0)
# Globals:   writes SERVICE, TOTAL, CURRENT, DIFF, PERCENT; reads REDIS_CMDLINE
# Returns:   10 when the current health is 0 (dead service), otherwise 0.
#            Nothing is logged when $2 or $3 is missing, $2 is not positive,
#            or $3 exceeds $2.
progress() {
  SERVICE=${1}
  TOTAL=${2}
  CURRENT=${3}
  DIFF=${4}
  [[ -z ${DIFF} ]] && DIFF=0
  [[ -z ${TOTAL} || -z ${CURRENT} ]] && return 0
  # Guard the divisions below against a zero (or negative) total.
  [[ ${TOTAL} -gt 0 ]] || return 0
  [[ ${CURRENT} -gt ${TOTAL} ]] && return 0
  [[ ${CURRENT} -lt 0 ]] && CURRENT=0
  # Integer percentage, rounded to nearest: the first term adds 1 when the
  # fractional part is at least one half.
  PERCENT=$(( 200 * CURRENT / TOTAL % 2 + 100 * CURRENT / TOTAL ))
  ${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"service\":\"${SERVICE}\",\"lvl\":\"${PERCENT}\",\"hpnow\":\"${CURRENT}\",\"hptotal\":\"${TOTAL}\",\"hpdiff\":\"${DIFF}\"}" > /dev/null
  log_msg "${SERVICE} health level: ${PERCENT}% (${CURRENT}/${TOTAL}), health trend: ${DIFF}" no_redis
  # Return 10 to indicate a dead service
  if [[ ${CURRENT} -le 0 ]]; then
    return 10
  fi
  # Explicit success; previously the failed test above leaked status 1.
  return 0
}
2017-11-14 17:44:00 +08:00
# Print a timestamped message and, unless told otherwise, push it to Redis.
# Arguments: $1 - message text
#            $2 - pass "no_redis" to skip the WATCHDOG_LOG entry
# Globals:   reads REDIS_CMDLINE
# Outputs:   "<date> <message>" on stdout
log_msg() {
  if [[ ${2} != "no_redis" ]]; then
    # Characters that would break the hand-built JSON are replaced by spaces.
    ${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"message\":\"$(printf '%s' "${1}" | \
      tr '\r\n%&;$"_[]{}-' ' ')\"}" > /dev/null
  fi
  # Quoted printf: the message is printed verbatim instead of being
  # word-split and glob-expanded by an unquoted echo.
  printf '%s %s\n' "$(date)" "${1}"
}
2017-10-06 05:38:33 +08:00
# Send a notification mail to every address in WATCHDOG_NOTIFY_EMAIL.
# Arguments: $1 - service name (required); used in the subject, and
#                 "/tmp/$1" is passed as the body if that file exists
#            $2 - optional message; replaces the default body text
# Globals:   reads MAILCOW_HOSTNAME; rewrites WATCHDOG_NOTIFY_EMAIL without
#            surrounding double quotes; writes BODY, SUBJECT, MAIL_RCPTS,
#            RCPT_DOMAIN, rcpt
# Returns:   1 when $1 is empty
function mail_error() {
  [[ -z ${1} ]] && return 1
  # If exists, body will be the content of "/tmp/${1}", even if ${2} is set
  if [[ -z ${2} ]]; then
    BODY="Service was restarted on $(date), please check your mailcow installation."
  else
    BODY="$(date) - ${2}"
  fi
  WATCHDOG_NOTIFY_EMAIL=$(echo "${WATCHDOG_NOTIFY_EMAIL}" | sed 's/"//;s|"$||')
  # Some exceptions for subject and body formats
  if [[ ${1} == "fail2ban" ]]; then
    SUBJECT="${BODY}"
    BODY="Please see netfilter-mailcow for more details and triggered rules."
  else
    SUBJECT="Watchdog ALERT: ${1}"
  fi
  if [ -f "/tmp/${1}" ]; then
    BODY="/tmp/${1}"
  fi
  IFS=',' read -r -a MAIL_RCPTS <<< "${WATCHDOG_NOTIFY_EMAIL}"
  for rcpt in "${MAIL_RCPTS[@]}"; do
    # Trim surrounding whitespace so "a@x, b@y" style lists work.
    rcpt="${rcpt#"${rcpt%%[![:space:]]*}"}"
    rcpt="${rcpt%"${rcpt##*[![:space:]]}"}"
    [[ -z ${rcpt} ]] && continue
    RCPT_DOMAIN=${rcpt##*@}
    # Latest smtp-cli looks up mx via dns, so no --server is passed.
    if timeout 10s ./smtp-cli --missing-modules-ok \
      --charset=UTF-8 \
      --subject="${SUBJECT}" \
      --body-plain="${BODY}" \
      --add-header="X-Priority: 1" \
      --to="${rcpt}" \
      --from="watchdog@${MAILCOW_HOSTNAME}" \
      --hello-host="${MAILCOW_HOSTNAME}" \
      --ipv4; then
      log_msg "Sent notification email to ${rcpt}"
    else
      # Covers both an smtp-cli error and the 10 second timeout.
      log_msg "Failed to send notification email to ${rcpt}"
    fi
  done
}
2017-09-20 18:27:24 +08:00
# Print the IPv4 address of a container, trying up to five times.
# Arguments: $1 - container / compose service name
# Globals:   reads IP_BY_DOCKER_API (0: resolve via dig, otherwise query the
#            dockerapi service) and IPV4_NETWORK; writes CONTAINER_ID,
#            CONTAINER_IPS, CONTAINER_IP, LOOP_C
# Outputs:   the address on stdout, or 240.0.0.0 when none was found
get_container_ip() {
  # ${1} is container
  local ipv4_re='^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$'
  CONTAINER_ID=()
  CONTAINER_IPS=()
  CONTAINER_IP=
  LOOP_C=1
  until [[ ${CONTAINER_IP} =~ ${ipv4_re} ]] || [[ ${LOOP_C} -gt 5 ]]; do
    if [ "${IP_BY_DOCKER_API}" -eq 0 ]; then
      CONTAINER_IP=$(dig a "${1}" +short)
    else
      sleep 0.5
      # get long container id for exact match
      CONTAINER_ID=($(curl --silent --insecure https://dockerapi/containers/json | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], id: .Id}" | jq -rc "select( .name | tostring == \"${1}\") | .id"))
      # returned id can have multiple elements (if scaled), shuffle for random test
      CONTAINER_ID=($(printf "%s\n" "${CONTAINER_ID[@]}" | shuf))
      if [[ -n ${CONTAINER_ID} ]]; then
        for matched_container in "${CONTAINER_ID[@]}"; do
          CONTAINER_IPS=($(curl --silent --insecure "https://dockerapi/containers/${matched_container}/json" | jq -r '.NetworkSettings.Networks[].IPAddress'))
          for ip_match in "${CONTAINER_IPS[@]}"; do
            # nothing can match if one of these vars is empty
            [[ -z ${ip_match} ]] && continue
            [[ -z ${IPV4_NETWORK} ]] && continue
            # only return ips that are part of our network; -F matches the
            # network literally, so its dots are not regex wildcards
            if grep -qF -- "${IPV4_NETWORK}" <<< "${ip_match}"; then
              CONTAINER_IP=${ip_match}
              break
            fi
          done
          [[ -n ${CONTAINER_IP} ]] && break
        done
      fi
    fi
    LOOP_C=$((LOOP_C + 1))
  done
  # Decide on the result itself, not on LOOP_C: an address found on the
  # fifth try leaves LOOP_C at 6 and must still be reported.
  if [[ ${CONTAINER_IP} =~ ${ipv4_re} ]]; then
    echo "${CONTAINER_IP}"
  else
    echo 240.0.0.0
  fi
}
2019-12-03 01:04:56 +08:00
# One-time check: if the first three groups of IPV6_NETWORK show up in the
# local interface listing but no public IPv6 address can be determined,
# send a notification mail.
if ip a s | grep -qi "$(echo ${IPV6_NETWORK} | cut -d: -f1-3)" && [[ -z "$(get_ipv6)" ]]; then
  mail_error "ipv6-config" "enable_ipv6 is true in docker-compose.yml, but an IPv6 link could not be established. Please verify your IPv6 connection."
fi
2020-01-26 01:26:56 +08:00
# Ask checks.mailcow.email for the state of this installation over IPv4 and
# IPv6 until the error count reaches EXTERNAL_CHECKS_THRESHOLD.
# Globals: reads DBUSER, DBPASS, DBNAME, EXTERNAL_CHECKS_THRESHOLD; writes
#          err_count, diff_c, THRESHOLD, GUID, err_c_cur, CHECK_REPONSE,
#          CHECK_REPONSE6; writes the report to /tmp/external_checks
# Returns: 1 once the threshold is reached
external_checks() {
  local v4_critical
  err_count=0
  diff_c=0
  THRESHOLD=${EXTERNAL_CHECKS_THRESHOLD}
  GUID=$(mysql -u"${DBUSER}" -p"${DBPASS}" "${DBNAME}" -e "SELECT version FROM versions WHERE application = 'GUID'" -BN)
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: err_count must be read when USR1 arrives, not when the
  # trap is installed (where it is always 0).
  trap '[ ${err_count} -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    err_c_cur=${err_count}
    v4_critical=0
    CHECK_REPONSE="$(curl --connect-timeout 3 -m 10 -4 -s https://checks.mailcow.email -X POST -dguid="${GUID}" 2> /dev/null)"
    if [[ -n "${CHECK_REPONSE}" ]] && [[ "$(jq -r .response <<< "${CHECK_REPONSE}")" == "critical" ]]; then
      jq -r .out <<< "${CHECK_REPONSE}" > /tmp/external_checks
      err_count=$(( err_count + 1 ))
      v4_critical=1
    fi
    CHECK_REPONSE6="$(curl --connect-timeout 3 -m 10 -6 -s https://checks.mailcow.email -X POST -dguid="${GUID}" 2> /dev/null)"
    if [[ -n "${CHECK_REPONSE6}" ]] && [[ "$(jq -r .response <<< "${CHECK_REPONSE6}")" == "critical" ]]; then
      # Report the IPv6 result itself; keep the IPv4 report of this round
      # if there is one.
      if [[ ${v4_critical} -eq 1 ]]; then
        jq -r .out <<< "${CHECK_REPONSE6}" >> /tmp/external_checks
      else
        jq -r .out <<< "${CHECK_REPONSE6}" > /tmp/external_checks
      fi
      err_count=$(( err_count + 1 ))
    fi
    # A round without new errors recovers one point (never below zero).
    if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
      err_count=$(( err_count - 1 ))
      diff_c=1
    fi
    [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
    progress "External checks" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 60
    else
      diff_c=0
      sleep $(( ( RANDOM % 20 ) + 120 ))
    fi
  done
  return 1
}
2017-09-20 16:56:49 +08:00
# Probe nginx-mailcow over HTTP on port 8081 until the error count reaches
# NGINX_THRESHOLD. The exit status of the nagios plugin is added to the
# error count; its output is kept in /tmp/nginx-mailcow (last 50 lines).
# Globals: reads NGINX_THRESHOLD; writes err_count, diff_c, THRESHOLD,
#          host_ip, err_c_cur
# Returns: 1 once the threshold is reached
nginx_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${NGINX_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: err_count must be read when USR1 arrives, not when the
  # trap is installed (where it is always 0).
  trap '[ ${err_count} -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/nginx-mailcow; echo "$(tail -50 /tmp/nginx-mailcow)" > /tmp/nginx-mailcow
    host_ip=$(get_container_ip nginx-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_http -4 -H "${host_ip}" -u / -p 8081 2>> /tmp/nginx-mailcow 1>&2; err_count=$(( err_count + $? ))
    # A round without new errors recovers one point (never below zero).
    if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
      err_count=$(( err_count - 1 ))
      diff_c=1
    fi
    [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
    progress "Nginx" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
2018-10-14 06:21:31 +08:00
# Probe unbound-mailcow with a DNS lookup and a DNSSEC test until the error
# count reaches UNBOUND_THRESHOLD. Output is kept in /tmp/unbound-mailcow
# (last 50 lines).
# Globals: reads UNBOUND_THRESHOLD; writes err_count, diff_c, THRESHOLD,
#          host_ip, err_c_cur, DNSSEC
# Returns: 1 once the threshold is reached
unbound_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${UNBOUND_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: err_count must be read when USR1 arrives, not when the
  # trap is installed (where it is always 0).
  trap '[ ${err_count} -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/unbound-mailcow; echo "$(tail -50 /tmp/unbound-mailcow)" > /tmp/unbound-mailcow
    host_ip=$(get_container_ip unbound-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_dns -s "${host_ip}" -H stackoverflow.com 2>> /tmp/unbound-mailcow 1>&2; err_count=$(( err_count + $? ))
    # The "ad" flag in the answer is taken as proof of DNSSEC validation.
    DNSSEC=$(dig com +dnssec | grep -E 'flags:.+ad')
    if [[ -z ${DNSSEC} ]]; then
      echo "DNSSEC failure" 2>> /tmp/unbound-mailcow 1>&2
      err_count=$(( err_count + 1 ))
    else
      echo "DNSSEC check succeeded" 2>> /tmp/unbound-mailcow 1>&2
    fi
    # A round without new errors recovers one point (never below zero).
    if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
      err_count=$(( err_count - 1 ))
      diff_c=1
    fi
    [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
    progress "Unbound" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
2019-09-12 14:29:15 +08:00
# A check for the local redis container: sends PING to redis-mailcow:6379
# and expects PONG, until the error count reaches REDIS_THRESHOLD. Output is
# kept in /tmp/redis-mailcow (last 50 lines).
# Globals: reads REDIS_THRESHOLD; writes err_count, diff_c, THRESHOLD,
#          host_ip, err_c_cur
# Returns: 1 once the threshold is reached
redis_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${REDIS_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: err_count must be read when USR1 arrives, not when the
  # trap is installed (where it is always 0).
  trap '[ ${err_count} -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/redis-mailcow; echo "$(tail -50 /tmp/redis-mailcow)" > /tmp/redis-mailcow
    # NOTE(review): host_ip is looked up but the probe below targets the
    # redis-mailcow host name - confirm which one is intended.
    host_ip=$(get_container_ip redis-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_tcp -4 -H redis-mailcow -p 6379 -E -s "PING\n" -q "QUIT" -e "PONG" 2>> /tmp/redis-mailcow 1>&2; err_count=$(( err_count + $? ))
    # A round without new errors recovers one point (never below zero).
    if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
      err_count=$(( err_count - 1 ))
      diff_c=1
    fi
    [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
    progress "Redis" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
2017-09-20 16:56:49 +08:00
# Probe MySQL/MariaDB through its local socket (connection test plus a
# query) until the error count reaches MYSQL_THRESHOLD. Output is kept in
# /tmp/mysql-mailcow (last 50 lines).
# Globals: reads MYSQL_THRESHOLD, DBUSER, DBPASS, DBNAME; writes err_count,
#          diff_c, THRESHOLD, err_c_cur
# Returns: 1 once the threshold is reached
mysql_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${MYSQL_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: err_count must be read when USR1 arrives, not when the
  # trap is installed (where it is always 0).
  trap '[ ${err_count} -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/mysql-mailcow; echo "$(tail -50 /tmp/mysql-mailcow)" > /tmp/mysql-mailcow
    err_c_cur=${err_count}
    # Credentials are quoted so whitespace or glob characters survive.
    /usr/lib/nagios/plugins/check_mysql -s /var/run/mysqld/mysqld.sock -u "${DBUSER}" -p "${DBPASS}" -d "${DBNAME}" 2>> /tmp/mysql-mailcow 1>&2; err_count=$(( err_count + $? ))
    /usr/lib/nagios/plugins/check_mysql_query -s /var/run/mysqld/mysqld.sock -u "${DBUSER}" -p "${DBPASS}" -d "${DBNAME}" -q "SELECT COUNT(*) FROM information_schema.tables" 2>> /tmp/mysql-mailcow 1>&2; err_count=$(( err_count + $? ))
    # A round without new errors recovers one point (never below zero).
    if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
      err_count=$(( err_count - 1 ))
      diff_c=1
    fi
    [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
    progress "MySQL/MariaDB" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
2020-04-06 17:23:20 +08:00
# Run the slave-status plugin against the local MySQL/MariaDB socket until
# the error count reaches MYSQL_REPLICATION_THRESHOLD. Output is kept in
# /tmp/mysql_repl_checks (last 50 lines).
# Globals: reads MYSQL_REPLICATION_THRESHOLD, DBROOT; writes err_count,
#          diff_c, THRESHOLD, err_c_cur
# Returns: 1 once the threshold is reached
mysql_repl_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${MYSQL_REPLICATION_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: err_count must be read when USR1 arrives, not when the
  # trap is installed (where it is always 0).
  trap '[ ${err_count} -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/mysql_repl_checks; echo "$(tail -50 /tmp/mysql_repl_checks)" > /tmp/mysql_repl_checks
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_mysql_slavestatus.sh -S /var/run/mysqld/mysqld.sock -u root -p "${DBROOT}" 2>> /tmp/mysql_repl_checks 1>&2; err_count=$(( err_count + $? ))
    # A round without new errors recovers one point (never below zero).
    if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
      err_count=$(( err_count - 1 ))
      diff_c=1
    fi
    [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
    progress "MySQL/MariaDB replication" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 60
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
2017-09-20 16:56:49 +08:00
# Probe sogo-mailcow over HTTP on port 20000, expecting "SOGo.MainUI" in
# the response, until the error count reaches SOGO_THRESHOLD. Output is kept
# in /tmp/sogo-mailcow (last 50 lines).
# Globals: reads SOGO_THRESHOLD; writes err_count, diff_c, THRESHOLD,
#          host_ip, err_c_cur
# Returns: 1 once the threshold is reached
sogo_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${SOGO_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: err_count must be read when USR1 arrives, not when the
  # trap is installed (where it is always 0).
  trap '[ ${err_count} -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/sogo-mailcow; echo "$(tail -50 /tmp/sogo-mailcow)" > /tmp/sogo-mailcow
    host_ip=$(get_container_ip sogo-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_http -4 -H "${host_ip}" -u /SOGo.index/ -p 20000 -R "SOGo\.MainUI" 2>> /tmp/sogo-mailcow 1>&2; err_count=$(( err_count + $? ))
    # A round without new errors recovers one point (never below zero).
    if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
      err_count=$(( err_count - 1 ))
      diff_c=1
    fi
    [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
    progress "SOGo" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
# Probe postfix-mailcow on port 589 (a test submission and a STARTTLS
# check) until the error count reaches POSTFIX_THRESHOLD. Output is kept in
# /tmp/postfix-mailcow (last 50 lines).
# Globals: reads POSTFIX_THRESHOLD; writes err_count, diff_c, THRESHOLD,
#          host_ip, err_c_cur
# Returns: 1 once the threshold is reached
postfix_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${POSTFIX_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: err_count must be read when USR1 arrives, not when the
  # trap is installed (where it is always 0).
  trap '[ ${err_count} -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/postfix-mailcow; echo "$(tail -50 /tmp/postfix-mailcow)" > /tmp/postfix-mailcow
    host_ip=$(get_container_ip postfix-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_smtp -4 -H "${host_ip}" -p 589 -f "watchdog@invalid" -C "RCPT TO:watchdog@localhost" -C DATA -C . -R 250 2>> /tmp/postfix-mailcow 1>&2; err_count=$(( err_count + $? ))
    /usr/lib/nagios/plugins/check_smtp -4 -H "${host_ip}" -p 589 -S 2>> /tmp/postfix-mailcow 1>&2; err_count=$(( err_count + $? ))
    # A round without new errors recovers one point (never below zero).
    if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
      err_count=$(( err_count - 1 ))
      diff_c=1
    fi
    [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
    progress "Postfix" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
2018-10-14 06:21:31 +08:00
# Probe clamd-mailcow with the nagios clamd plugin until the error count
# reaches CLAMD_THRESHOLD. Output is kept in /tmp/clamd-mailcow (last 50
# lines).
# Globals: reads CLAMD_THRESHOLD; writes err_count, diff_c, THRESHOLD,
#          host_ip, err_c_cur
# Returns: 1 once the threshold is reached
clamd_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${CLAMD_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: err_count must be read when USR1 arrives, not when the
  # trap is installed (where it is always 0).
  trap '[ ${err_count} -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/clamd-mailcow; echo "$(tail -50 /tmp/clamd-mailcow)" > /tmp/clamd-mailcow
    host_ip=$(get_container_ip clamd-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_clamd -4 -H "${host_ip}" 2>> /tmp/clamd-mailcow 1>&2; err_count=$(( err_count + $? ))
    # A round without new errors recovers one point (never below zero).
    if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
      err_count=$(( err_count - 1 ))
      diff_c=1
    fi
    [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
    progress "Clamd" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 120 ) + 20 ))
    fi
  done
  return 1
}
2017-09-20 16:56:49 +08:00
# Probe dovecot-mailcow on ports 24, 993, 143, 10001 and 4190 until the
# error count reaches DOVECOT_THRESHOLD. Output is kept in
# /tmp/dovecot-mailcow (last 50 lines).
# Globals: reads DOVECOT_THRESHOLD; writes err_count, diff_c, THRESHOLD,
#          host_ip, err_c_cur
# Returns: 1 once the threshold is reached
dovecot_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${DOVECOT_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: err_count must be read when USR1 arrives, not when the
  # trap is installed (where it is always 0).
  trap '[ ${err_count} -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/dovecot-mailcow; echo "$(tail -50 /tmp/dovecot-mailcow)" > /tmp/dovecot-mailcow
    host_ip=$(get_container_ip dovecot-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_smtp -4 -H "${host_ip}" -p 24 -f "watchdog@invalid" -C "RCPT TO:<watchdog@invalid>" -L -R "User doesn't exist" 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( err_count + $? ))
    /usr/lib/nagios/plugins/check_imap -4 -H "${host_ip}" -p 993 -S -e "OK " 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( err_count + $? ))
    /usr/lib/nagios/plugins/check_imap -4 -H "${host_ip}" -p 143 -e "OK " 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( err_count + $? ))
    /usr/lib/nagios/plugins/check_tcp -4 -H "${host_ip}" -p 10001 -e "VERSION" 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( err_count + $? ))
    /usr/lib/nagios/plugins/check_tcp -4 -H "${host_ip}" -p 4190 -e "Dovecot ready" 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( err_count + $? ))
    # A round without new errors recovers one point (never below zero).
    if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
      err_count=$(( err_count - 1 ))
      diff_c=1
    fi
    [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
    progress "Dovecot" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
2020-04-14 18:48:57 +08:00
# Watch the DOVECOT_REPL_HEALTH key in Redis; every round in which it is
# not "1" counts as an error, until DOVECOT_REPL_THRESHOLD is reached.
# Globals: reads DOVECOT_REPL_THRESHOLD; writes err_count, diff_c,
#          THRESHOLD, err_c_cur, D_REPL_STATUS
# Returns: 1 once the threshold is reached
dovecot_repl_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${DOVECOT_REPL_THRESHOLD}
  # --raw, as in the loop below ("-r" is redis-cli's repeat-count option).
  D_REPL_STATUS=$(redis-cli --raw -h redis GET DOVECOT_REPL_HEALTH)
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: err_count must be read when USR1 arrives, not when the
  # trap is installed (where it is always 0).
  trap '[ ${err_count} -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    err_c_cur=${err_count}
    D_REPL_STATUS=$(redis-cli --raw -h redis GET DOVECOT_REPL_HEALTH)
    if [[ "${D_REPL_STATUS}" != "1" ]]; then
      err_count=$(( err_count + 1 ))
    fi
    # A round without new errors recovers one point (never below zero).
    if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
      err_count=$(( err_count - 1 ))
      diff_c=1
    fi
    [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
    progress "Dovecot replication" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
2017-09-20 16:56:49 +08:00
# Probe php-fpm-mailcow on TCP ports 9001 and 9002 until the error count
# reaches PHPFPM_THRESHOLD. Output is kept in /tmp/php-fpm-mailcow (last 50
# lines).
# Globals: reads PHPFPM_THRESHOLD; writes err_count, diff_c, THRESHOLD,
#          host_ip, err_c_cur
# Returns: 1 once the threshold is reached
phpfpm_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${PHPFPM_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: err_count must be read when USR1 arrives, not when the
  # trap is installed (where it is always 0).
  trap '[ ${err_count} -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/php-fpm-mailcow; echo "$(tail -50 /tmp/php-fpm-mailcow)" > /tmp/php-fpm-mailcow
    host_ip=$(get_container_ip php-fpm-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_tcp -H "${host_ip}" -p 9001 2>> /tmp/php-fpm-mailcow 1>&2; err_count=$(( err_count + $? ))
    /usr/lib/nagios/plugins/check_tcp -H "${host_ip}" -p 9002 2>> /tmp/php-fpm-mailcow 1>&2; err_count=$(( err_count + $? ))
    # A round without new errors recovers one point (never below zero).
    if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
      err_count=$(( err_count - 1 ))
      diff_c=1
    fi
    [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
    progress "PHP-FPM" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
2018-12-16 04:21:22 +08:00
# Watch the newest RL_LOG entry in Redis; every round in which its qid
# changed counts as an error and refreshes the report in /tmp/ratelimit,
# until RATELIMIT_THRESHOLD is reached.
# Globals: reads RATELIMIT_THRESHOLD; writes err_count, diff_c, THRESHOLD,
#          err_c_cur, RL_LOG_STATUS, RL_LOG_STATUS_PREV
# Returns: 1 once the threshold is reached
ratelimit_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${RATELIMIT_THRESHOLD}
  RL_LOG_STATUS=$(redis-cli -h redis LRANGE RL_LOG 0 0 | jq .qid)
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: err_count must be read when USR1 arrives, not when the
  # trap is installed (where it is always 0).
  trap '[ ${err_count} -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    err_c_cur=${err_count}
    RL_LOG_STATUS_PREV=${RL_LOG_STATUS}
    RL_LOG_STATUS=$(redis-cli -h redis LRANGE RL_LOG 0 0 | jq .qid)
    # Right-hand side quoted: compare as a string, not as a glob pattern.
    if [[ "${RL_LOG_STATUS_PREV}" != "${RL_LOG_STATUS}" ]]; then
      err_count=$(( err_count + 1 ))
      echo 'Last 10 applied ratelimits (may overlap with previous reports).' > /tmp/ratelimit
      echo 'Full ratelimit buckets can be emptied by deleting the ratelimit hash from within mailcow UI (see /debug -> Protocols -> Ratelimit):' >> /tmp/ratelimit
      echo >> /tmp/ratelimit
      # LRANGE bounds are inclusive: 0..9 yields the ten newest entries.
      redis-cli --raw -h redis LRANGE RL_LOG 0 9 | jq . >> /tmp/ratelimit
    fi
    # A round without new errors recovers one point (never below zero).
    if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
      err_count=$(( err_count - 1 ))
      diff_c=1
    fi
    [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
    progress "Ratelimit" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
2020-05-31 17:39:20 +08:00
# Count the files in Postfix's deferred queue; every round in which the
# count is at or above MAILQ_CRIT counts as an error, until MAILQ_THRESHOLD
# is reached. A status line per round is kept in /tmp/mail_queue_status
# (last 50 lines).
# Globals: reads MAILQ_THRESHOLD, MAILQ_CRIT; writes err_count, diff_c,
#          THRESHOLD, err_c_cur, MAILQ_LOG_STATUS
# Returns: 1 once the threshold is reached
mailq_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${MAILQ_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: err_count must be read when USR1 arrives, not when the
  # trap is installed (where it is always 0).
  trap '[ ${err_count} -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/mail_queue_status; echo "$(tail -50 /tmp/mail_queue_status)" > /tmp/mail_queue_status
    MAILQ_LOG_STATUS=$(find /var/spool/postfix/deferred -type f | wc -l)
    # Logged once per round, whether or not the limit is exceeded.
    echo "Mail queue contains ${MAILQ_LOG_STATUS} items (critical limit is ${MAILQ_CRIT}) at $(date)" >> /tmp/mail_queue_status
    err_c_cur=${err_count}
    if [ "${MAILQ_LOG_STATUS}" -ge "${MAILQ_CRIT}" ]; then
      err_count=$(( err_count + 1 ))
    fi
    # A round without new errors recovers one point (never below zero).
    if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
      err_count=$(( err_count - 1 ))
      diff_c=1
    fi
    [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
    progress "Mail queue" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 60
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
2019-06-10 16:57:38 +08:00
# Watch the keys of the F2B_ACTIVE_BANS hash in Redis; every round in which
# new keys appeared counts as an error and stores them in the F2B_RES key,
# until FAIL2BAN_THRESHOLD is reached.
# Globals: reads FAIL2BAN_THRESHOLD, REDIS_CMDLINE; writes err_count,
#          diff_c, THRESHOLD, err_c_cur, F2B_LOG_STATUS,
#          F2B_LOG_STATUS_PREV, F2B_RES
# Returns: 1 once the threshold is reached
fail2ban_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${FAIL2BAN_THRESHOLD}
  F2B_LOG_STATUS=($(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS))
  F2B_RES=
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: err_count must be read when USR1 arrives, not when the
  # trap is installed (where it is always 0).
  trap '[ ${err_count} -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    err_c_cur=${err_count}
    F2B_LOG_STATUS_PREV=("${F2B_LOG_STATUS[@]}")
    F2B_LOG_STATUS=($(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS))
    # F2B_RES receives the keys present now but not in the previous round.
    array_diff F2B_RES F2B_LOG_STATUS F2B_LOG_STATUS_PREV
    if [[ -n "${F2B_RES}" ]]; then
      err_count=$(( err_count + 1 ))
      echo -n "${F2B_RES[@]}" | tr -cd "[a-fA-F0-9.:/] " | timeout 3s ${REDIS_CMDLINE} -x SET F2B_RES > /dev/null
      if [ $? -ne 0 ]; then
        # No -x here: DEL takes no value, and -x would make redis-cli read
        # one from stdin.
        ${REDIS_CMDLINE} DEL F2B_RES > /dev/null
      fi
    fi
    # A round without new errors recovers one point (never below zero).
    if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
      err_count=$(( err_count - 1 ))
      diff_c=1
    fi
    [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
    progress "Fail2ban" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
2019-03-28 06:15:04 +08:00
# Watch the ACME_FAIL_TIME key in Redis; every round in which its value
# changed counts as an error, until ACME_THRESHOLD is reached.
# Globals: reads ACME_THRESHOLD, REDIS_CMDLINE; writes err_count, diff_c,
#          THRESHOLD, err_c_cur, ACME_LOG_STATUS, ACME_LOG_STATUS_PREV,
#          ACME_LC
# Returns: 1 once the threshold is reached
acme_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${ACME_THRESHOLD}
  ACME_LOG_STATUS=$(redis-cli -h redis GET ACME_FAIL_TIME)
  if [[ -z "${ACME_LOG_STATUS}" ]]; then
    ${REDIS_CMDLINE} SET ACME_FAIL_TIME 0
    ACME_LOG_STATUS=0
  fi
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: err_count must be read when USR1 arrives, not when the
  # trap is installed (where it is always 0).
  trap '[ ${err_count} -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    err_c_cur=${err_count}
    ACME_LOG_STATUS_PREV=${ACME_LOG_STATUS}
    # Clear the value first; otherwise the retry loop below is skipped
    # (the old value is never empty) and the key is never read again.
    ACME_LOG_STATUS=
    ACME_LC=0
    until [[ -n ${ACME_LOG_STATUS} ]] || [ "${ACME_LC}" -ge 3 ]; do
      ACME_LOG_STATUS=$(redis-cli -h redis GET ACME_FAIL_TIME 2> /dev/null)
      # Pause only before another attempt is needed.
      [[ -z ${ACME_LOG_STATUS} ]] && sleep 3
      ACME_LC=$((ACME_LC+1))
    done
    # No answer after three attempts: keep the previous value rather than
    # counting an unreadable key as a change.
    [[ -z ${ACME_LOG_STATUS} ]] && ACME_LOG_STATUS=${ACME_LOG_STATUS_PREV}
    if [[ "${ACME_LOG_STATUS_PREV}" != "${ACME_LOG_STATUS}" ]]; then
      err_count=$(( err_count + 1 ))
    fi
    # A round without new errors recovers one point (never below zero).
    if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
      err_count=$(( err_count - 1 ))
      diff_c=1
    fi
    [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
    progress "ACME" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
2018-12-27 15:20:49 +08:00
# Compare container start times from the dockerapi service: a round counts
# as an error when the ipv6nat-mailcow container did not start at least 30
# seconds after the most recently started other container. Runs until
# IPV6NAT_THRESHOLD is reached; rounds are skipped (no error) when no
# ipv6nat-mailcow container is listed.
# Globals: reads IPV6NAT_THRESHOLD; writes err_count, diff_c, THRESHOLD,
#          err_c_cur, CONTAINERS, IPV6NAT_CONTAINER_ID, LATEST_STARTED,
#          LATEST_IPV6NAT, DIFFERENCE_START_TIME
# Returns: 1 once the threshold is reached
ipv6nat_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${IPV6NAT_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: err_count must be read when USR1 arrives, not when the
  # trap is installed (where it is always 0).
  trap '[ ${err_count} -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    err_c_cur=${err_count}
    CONTAINERS=$(curl --silent --insecure https://dockerapi/containers/json)
    # The JSON is fed to jq via quoted here-strings so it is neither
    # word-split nor glob-expanded on the way.
    IPV6NAT_CONTAINER_ID=$(jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], id: .Id}" <<< "${CONTAINERS}" | jq -rc "select( .name | tostring | contains(\"ipv6nat-mailcow\")) | .id")
    if [[ -n ${IPV6NAT_CONTAINER_ID} ]]; then
      LATEST_STARTED="$(jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], StartedAt: .State.StartedAt}" <<< "${CONTAINERS}" | jq -rc "select( .name | tostring | contains(\"ipv6nat-mailcow\") | not)" | jq -rc .StartedAt | xargs -n1 date +%s -d | sort | tail -n1)"
      LATEST_IPV6NAT="$(jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], StartedAt: .State.StartedAt}" <<< "${CONTAINERS}" | jq -rc "select( .name | tostring | contains(\"ipv6nat-mailcow\"))" | jq -rc .StartedAt | xargs -n1 date +%s -d | sort | tail -n1)"
      # expr prints nothing when a timestamp is missing; the empty result
      # compares as 0 below and therefore counts as an error.
      DIFFERENCE_START_TIME=$(expr "${LATEST_IPV6NAT}" - "${LATEST_STARTED}" 2>/dev/null)
      if [[ "${DIFFERENCE_START_TIME}" -lt 30 ]]; then
        err_count=$(( err_count + 1 ))
      fi
    fi
    # A round without new errors recovers one point (never below zero).
    if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
      err_count=$(( err_count - 1 ))
      diff_c=1
    fi
    [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
    progress "IPv6 NAT" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 30
    else
      diff_c=0
      sleep 300
    fi
  done
  return 1
}
2019-03-07 07:08:45 +08:00
2017-09-22 01:30:03 +08:00
#######################################
# Probe Rspamd by scanning a minimal test message over its unix socket.
#
# Each round sends a message from watchdog@localhost to null@localhost to
# http://rspamd/scan and expects .default.required_score to be "9999". Any
# other value counts as an error. A round without a new error lowers the
# error count by one, never below zero. Check results are appended to
# /tmp/rspamd-mailcow, which is trimmed to its last 50 lines every round.
#
# Globals:
#   RSPAMD_THRESHOLD (read)
#   err_count, diff_c, err_c_cur, THRESHOLD, host_ip, SCORE (written)
# Returns:
#   1 once err_count has reached THRESHOLD.
#######################################
rspamd_checks() {
  local progress_rc
  err_count=0
  diff_c=0
  THRESHOLD=${RSPAMD_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # The handler must be single-quoted: with double quotes the expression was
  # expanded while installing the trap (err_count=0), which froze it to
  # "[ 0 -gt 1 ] && err_count=-2" and the counter was never lowered.
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    # Keep only the last 50 lines of the per-check log
    touch /tmp/rspamd-mailcow; echo "$(tail -50 /tmp/rspamd-mailcow)" > /tmp/rspamd-mailcow
    # NOTE(review): host_ip is not used by the socket-based check below; the
    # lookup is kept in case get_container_ip has side effects - confirm.
    host_ip=$(get_container_ip rspamd-mailcow)
    err_c_cur=${err_count}
    # Headers, an empty separator line, then a one-word body.
    # curl is called by absolute path; the former relative "usr/bin/curl"
    # only resolved while the working directory was /.
    SCORE=$(printf '%s\n' \
        'To: null@localhost' \
        'From: watchdog@localhost' \
        '' \
        'Empty' \
        '' \
      | /usr/bin/curl -s --data-binary @- --unix-socket /var/lib/rspamd/rspamd.sock http://rspamd/scan \
      | jq -rc .default.required_score)
    if [[ ${SCORE} != "9999" ]]; then
      echo "Rspamd settings check failed" >> /tmp/rspamd-mailcow 2>&1
      err_count=$(( err_count + 1 ))
    else
      echo "Rspamd settings check succeeded" >> /tmp/rspamd-mailcow 2>&1
    fi
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # No new error in this round: decay the counter, but not below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "Rspamd" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    progress_rc=$?
    diff_c=0
    # NOTE(review): progress() is defined elsewhere; its exit status 10
    # selects the short re-check interval - confirm the meaning there.
    if [[ ${progress_rc} == 10 ]]; then
      sleep 1
    else
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
2019-07-08 03:59:20 +08:00
#######################################
# Probe the olefy-mailcow container with a TCP "PING" on port 10055.
#
# Each round runs the nagios check_tcp plugin against the container IP and
# adds the plugin's exit status to the error count. A round without a new
# error lowers the error count by one, never below zero. Plugin output is
# appended to /tmp/olefy-mailcow, which is trimmed to its last 50 lines
# every round.
#
# Globals:
#   OLEFY_THRESHOLD (read)
#   err_count, diff_c, err_c_cur, THRESHOLD, host_ip (written)
# Returns:
#   1 once err_count has reached THRESHOLD.
#######################################
olefy_checks() {
  local progress_rc
  err_count=0
  diff_c=0
  THRESHOLD=${OLEFY_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # The handler must be single-quoted: with double quotes the expression was
  # expanded while installing the trap (err_count=0), which froze it to
  # "[ 0 -gt 1 ] && err_count=-2" and the counter was never lowered.
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    # Keep only the last 50 lines of the per-check log
    touch /tmp/olefy-mailcow; echo "$(tail -50 /tmp/olefy-mailcow)" > /tmp/olefy-mailcow
    host_ip=$(get_container_ip olefy-mailcow)
    err_c_cur=${err_count}
    # The plugin's exit status is added to the counter as-is, so the
    # arithmetic must directly follow the plugin call.
    /usr/lib/nagios/plugins/check_tcp -4 -H "${host_ip}" -p 10055 -s "PING\n" >> /tmp/olefy-mailcow 2>&1
    err_count=$(( err_count + $? ))
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # No new error in this round: decay the counter, but not below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "Olefy" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    progress_rc=$?
    diff_c=0
    # NOTE(review): progress() is defined elsewhere; its exit status 10
    # selects the short re-check interval - confirm the meaning there.
    if [[ ${progress_rc} == 10 ]]; then
      sleep 1
    else
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
2019-04-19 04:09:26 +08:00
# Announce the watchdog start by mail, but only when a notification
# address has been configured.
[[ -z ${WATCHDOG_NOTIFY_EMAIL} ]] \
  || mail_error "watchdog-mailcow" "Watchdog started monitoring mailcow."
2019-04-19 04:09:26 +08:00
2017-09-20 16:56:49 +08:00
# Create watchdog agents

#######################################
# Start one watchdog agent in the background.
#
# The agent runs the given check function in an endless loop. Whenever the
# check function returns non-zero, the message is logged and the token is
# written to /tmp/com_pipe.
#
# Globals:
#   PID (written: PID of the spawned agent)
#   BACKGROUND_TASKS (appended: PID of the spawned agent)
# Arguments:
#   $1 - name of the check function to run
#   $2 - message passed to log_msg when the check function fails
#   $3 - token written to /tmp/com_pipe when the check function fails
# Outputs:
#   "Spawned <check function> with PID <pid>" on stdout
#######################################
spawn_watchdog_agent() {
  local check_fn=$1
  local limit_msg=$2
  local pipe_token=$3
  (
    while true; do
      if ! "${check_fn}"; then
        log_msg "${limit_msg}"
        echo "${pipe_token}" > /tmp/com_pipe
      fi
    done
  ) &
  PID=$!
  echo "Spawned ${check_fn} with PID ${PID}"
  BACKGROUND_TASKS+=("${PID}")
}

spawn_watchdog_agent nginx_checks "Nginx hit error limit" nginx-mailcow

# Optional agents, enabled with y/yes
if [[ ${WATCHDOG_EXTERNAL_CHECKS} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
  spawn_watchdog_agent external_checks "External checks hit error limit" external_checks
fi

if [[ ${WATCHDOG_MYSQL_REPLICATION_CHECKS} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
  spawn_watchdog_agent mysql_repl_checks "MySQL replication check hit error limit" mysql_repl_checks
fi

spawn_watchdog_agent mysql_checks "MySQL hit error limit" mysql-mailcow
spawn_watchdog_agent redis_checks "Local Redis hit error limit" redis-mailcow
spawn_watchdog_agent phpfpm_checks "PHP-FPM hit error limit" php-fpm-mailcow

# SOGo is only watched when SKIP_SOGO is n/no
if [[ "${SKIP_SOGO}" =~ ^([nN][oO]|[nN])+$ ]]; then
  spawn_watchdog_agent sogo_checks "SOGo hit error limit" sogo-mailcow
fi

# Defaulting to 0 avoids a test syntax error when CHECK_UNBOUND is unset
if [ "${CHECK_UNBOUND:-0}" -eq 1 ]; then
  spawn_watchdog_agent unbound_checks "Unbound hit error limit" unbound-mailcow
fi

# Clamd is only watched when SKIP_CLAMD is n/no
if [[ "${SKIP_CLAMD}" =~ ^([nN][oO]|[nN])+$ ]]; then
  spawn_watchdog_agent clamd_checks "Clamd hit error limit" clamd-mailcow
fi

spawn_watchdog_agent postfix_checks "Postfix hit error limit" postfix-mailcow
spawn_watchdog_agent mailq_checks "Mail queue hit error limit" mail_queue_status
spawn_watchdog_agent dovecot_checks "Dovecot hit error limit" dovecot-mailcow
# This message used to be identical to the dovecot_checks one, which made
# the two agents indistinguishable in the log.
spawn_watchdog_agent dovecot_repl_checks "Dovecot replication hit error limit" dovecot_repl_checks
spawn_watchdog_agent rspamd_checks "Rspamd hit error limit" rspamd-mailcow
spawn_watchdog_agent ratelimit_checks "Ratelimit hit error limit" ratelimit
spawn_watchdog_agent fail2ban_checks "Fail2ban hit error limit" fail2ban
spawn_watchdog_agent olefy_checks "Olefy hit error limit" olefy-mailcow
spawn_watchdog_agent acme_checks "ACME client hit error limit" acme-mailcow
spawn_watchdog_agent ipv6nat_checks "IPv6 NAT warning: ipv6nat-mailcow container was not started at least 30s after siblings (not an error)" ipv6nat-mailcow
2018-12-27 15:20:49 +08:00
2017-09-20 16:56:49 +08:00
# Monitor watchdog agents, stop script when agents fails and wait for respawn by Docker (restart:always:n)
# Walks over the recorded agent PIDs forever, pausing 10 seconds after each
# one; a PID that no longer accepts signal 0 makes the script send TERM to
# PID 1.
(
  while :; do
    for worker_pid in "${BACKGROUND_TASKS[@]}"; do
      kill -0 "${worker_pid}" >&2 || {
        log_msg "Worker ${worker_pid} died, stopping watchdog and waiting for respawn..."
        kill -TERM 1
      }
      sleep 10
    done
  done
) &
2017-10-27 17:22:39 +08:00
# Monitor dockerapi
# While dockerapi:443 is unreachable all agents are suspended (STOP); once it
# answers again they are resumed (CONT) and sent USR1.
(
  while :; do
    # Idle for as long as the API port answers
    until ! nc -z dockerapi 443; do
      sleep 3
    done
    log_msg "Cannot find dockerapi-mailcow, waiting to recover..."
    kill -STOP "${BACKGROUND_TASKS[@]}"
    # Poll until the API port answers again
    while ! nc -z dockerapi 443; do
      sleep 3
    done
    kill -CONT "${BACKGROUND_TASKS[@]}"
    kill -USR1 "${BACKGROUND_TASKS[@]}"
  done
) &
2019-06-11 04:40:21 +08:00
# Actions when threshold limit is reached
# Reads one token per iteration from /tmp/com_pipe and dispatches on it:
# named checks only log and (optionally) send mail, while any other token
# containing "-mailcow" is treated as a service whose container may be
# restarted through dockerapi.
while :; do
  CONTAINER_ID=
  HAS_INITDB=
  read com_pipe_answer < /tmp/com_pipe
  # Print the log file kept under the same name, if it has content
  [ -s "/tmp/${com_pipe_answer}" ] && cat "/tmp/${com_pipe_answer}"
  case "${com_pipe_answer}" in
    ratelimit)
      log_msg "At least one ratelimit was applied"
      if [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]]; then
        mail_error "${com_pipe_answer}"
      fi
      ;;
    mail_queue_status)
      log_msg "Mail queue status is critical"
      if [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]]; then
        mail_error "${com_pipe_answer}"
      fi
      ;;
    external_checks)
      log_msg "Your mailcow is an open relay!"
      if [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]]; then
        mail_error "${com_pipe_answer}" "Please stop mailcow now and check your network configuration!"
      fi
      ;;
    mysql_repl_checks)
      log_msg "MySQL replication is not working properly"
      if [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]]; then
        mail_error "${com_pipe_answer}"
      fi
      ;;
    dovecot_repl_checks)
      log_msg "Dovecot replication is not working properly" "Please check doveadm replicator status"
      if [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]]; then
        mail_error "${com_pipe_answer}"
      fi
      ;;
    acme-mailcow)
      log_msg "acme-mailcow did not complete successfully"
      if [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]]; then
        mail_error "${com_pipe_answer}" "Please check acme-mailcow for further information."
      fi
      ;;
    fail2ban)
      # Word splitting is intended here: one array element per returned word
      F2B_RES=($(timeout 4s ${REDIS_CMDLINE} --raw GET F2B_RES 2> /dev/null))
      if [[ -n "${F2B_RES[0]}" ]]; then
        ${REDIS_CMDLINE} DEL F2B_RES > /dev/null
        host=
        for host in "${F2B_RES[@]}"; do
          log_msg "Banned ${host}"
          rm /tmp/fail2ban 2> /dev/null
          timeout 2s whois "${host}" > /tmp/fail2ban
          if [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]]; then
            if [[ ${WATCHDOG_NOTIFY_BAN} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
              mail_error "${com_pipe_answer}" "IP ban: ${host}"
            fi
          fi
        done
      fi
      ;;
    ?*-mailcow*)
      # Same match as the unanchored regex .+-mailcow
      # Suspend all agents while the affected container is handled
      kill -STOP "${BACKGROUND_TASKS[@]}"
      sleep 10
      CONTAINER_ID=$(curl --silent --insecure https://dockerapi/containers/json \
        | jq -r '.[] | {name: .Config.Labels["com.docker.compose.service"], id: .Id}' \
        | jq -rc "select( .name | tostring | contains(\"${com_pipe_answer}\")) | .id")
      if [[ -n ${CONTAINER_ID} ]]; then
        if [[ "${com_pipe_answer}" == "php-fpm-mailcow" ]]; then
          # Non-empty when the container's process list contains init_db
          HAS_INITDB=$(curl --silent --insecure -XPOST https://dockerapi/containers/${CONTAINER_ID}/top \
            | jq '.msg.Processes[] | contains(["php -c /usr/local/etc/php -f /web/inc/init_db.inc.php"])' \
            | grep true)
        fi
        # Seconds since the container was started
        S_RUNNING=$(( $(date +%s) - $(curl --silent --insecure https://dockerapi/containers/${CONTAINER_ID}/json | jq .State.StartedAt | xargs -n1 date +%s -d) ))
        if [ "${S_RUNNING}" -lt 360 ]; then
          log_msg "Container is running for less than 360 seconds, skipping action..."
        elif [[ -n ${HAS_INITDB} ]]; then
          log_msg "Database is being initialized by php-fpm-mailcow, not restarting but delaying checks for a minute..."
          sleep 60
        else
          log_msg "Sending restart command to ${CONTAINER_ID}..."
          curl --silent --insecure -XPOST https://dockerapi/containers/${CONTAINER_ID}/restart
          # No mail is sent for ipv6nat-mailcow restarts
          if [[ ${com_pipe_answer} != "ipv6nat-mailcow" && -n ${WATCHDOG_NOTIFY_EMAIL} ]]; then
            mail_error "${com_pipe_answer}"
          fi
          log_msg "Wait for restarted container to settle and continue watching..."
          sleep 35
        fi
      fi
      # Resume the agents, then send them USR1
      kill -CONT "${BACKGROUND_TASKS[@]}"
      sleep 1
      kill -USR1 "${BACKGROUND_TASKS[@]}"
      ;;
  esac
done