[Watchdog] Remove some check_ping checks

[Watchdog] Add ClamAV check (if SKIP_CLAMD=n)
[Watchdog] Add Unbound check
[Watchdog] Do not use Docker API by default to determine IP of containers (see "IP_BY_DOCKER_API")
[Watchdog] Minor changes
master
André 2018-10-14 00:21:31 +02:00
parent a9eddae686
commit 2da228a21e
1 changed files with 134 additions and 31 deletions

View File

@ -43,6 +43,7 @@ log_msg() {
function mail_error() { function mail_error() {
[[ -z ${1} ]] && return 1 [[ -z ${1} ]] && return 1
[[ -z ${2} ]] && return 2 [[ -z ${2} ]] && return 2
[[ -z ${3} ]] && BODY="Service was restarted, please check your mailcow installation." || BODY="${3}"
RCPT_DOMAIN=$(echo ${1} | awk -F @ {'print $NF'}) RCPT_DOMAIN=$(echo ${1} | awk -F @ {'print $NF'})
RCPT_MX=$(dig +short ${RCPT_DOMAIN} mx | sort -n | awk '{print $2; exit}') RCPT_MX=$(dig +short ${RCPT_DOMAIN} mx | sort -n | awk '{print $2; exit}')
if [[ -z ${RCPT_MX} ]]; then if [[ -z ${RCPT_MX} ]]; then
@ -50,8 +51,8 @@ function mail_error() {
return 1 return 1
fi fi
./smtp-cli --missing-modules-ok \ ./smtp-cli --missing-modules-ok \
--subject="Watchdog: ${2} service hit the error rate limit" \ --subject="Watchdog: ${2} hit the error rate limit" \
--body-plain="Service was restarted, please check your mailcow installation." \ --body-plain="${BODY}" \
--to=${1} \ --to=${1} \
--from="watchdog@${MAILCOW_HOSTNAME}" \ --from="watchdog@${MAILCOW_HOSTNAME}" \
--server="${RCPT_MX}" \ --server="${RCPT_MX}" \
@ -66,6 +67,9 @@ get_container_ip() {
CONTAINER_IP= CONTAINER_IP=
LOOP_C=1 LOOP_C=1
until [[ ${CONTAINER_IP} =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]] || [[ ${LOOP_C} -gt 5 ]]; do until [[ ${CONTAINER_IP} =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]] || [[ ${LOOP_C} -gt 5 ]]; do
if [ ${IP_BY_DOCKER_API} -eq 0 ]; then
CONTAINER_IP=$(dig a "${1}" +short)
else
sleep 0.5 sleep 0.5
# get long container id for exact match # get long container id for exact match
CONTAINER_ID=($(curl --silent --insecure https://dockerapi/containers/json | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], id: .Id}" | jq -rc "select( .name | tostring == \"${1}\") | .id")) CONTAINER_ID=($(curl --silent --insecure https://dockerapi/containers/json | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], id: .Id}" | jq -rc "select( .name | tostring == \"${1}\") | .id"))
@ -89,6 +93,7 @@ get_container_ip() {
[[ ! -z ${CONTAINER_IP} ]] && break [[ ! -z ${CONTAINER_IP} ]] && break
done done
fi fi
fi
LOOP_C=$((LOOP_C + 1)) LOOP_C=$((LOOP_C + 1))
done done
[[ ${LOOP_C} -gt 5 ]] && echo 240.0.0.0 || echo ${CONTAINER_IP} [[ ${LOOP_C} -gt 5 ]] && echo 240.0.0.0 || echo ${CONTAINER_IP}
@ -103,7 +108,6 @@ nginx_checks() {
while [ ${err_count} -lt ${THRESHOLD} ]; do while [ ${err_count} -lt ${THRESHOLD} ]; do
host_ip=$(get_container_ip nginx-mailcow) host_ip=$(get_container_ip nginx-mailcow)
err_c_cur=${err_count} err_c_cur=${err_count}
/usr/lib/nagios/plugins/check_ping -4 -H ${host_ip} -w 2000,10% -c 4000,100% -p2 1>&2; err_count=$(( ${err_count} + $? ))
/usr/lib/nagios/plugins/check_http -4 -H ${host_ip} -u / -p 8081 1>&2; err_count=$(( ${err_count} + $? )) /usr/lib/nagios/plugins/check_http -4 -H ${host_ip} -u / -p 8081 1>&2; err_count=$(( ${err_count} + $? ))
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1 [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} )) [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
@ -114,6 +118,32 @@ nginx_checks() {
return 1 return 1
} }
unbound_checks() {
err_count=0
diff_c=0
THRESHOLD=8
# Reduce error count by 2 after restarting an unhealthy container
trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
while [ ${err_count} -lt ${THRESHOLD} ]; do
host_ip=$(get_container_ip unbound-mailcow)
err_c_cur=${err_count}
/usr/lib/nagios/plugins/check_dns -s ${host_ip} -H google.com 1>&2; err_count=$(( ${err_count} + $? ))
DNSSEC=$(dig com +dnssec | egrep 'flags:.+ad')
if [[ -z ${DNSSEC} ]]; then
echo "DNSSEC failure" 1>&2
err_count=$(( ${err_count} + 1))
else
echo "DNSSEC check succeeded" 1>&2
fi
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
progress "Unbound" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
diff_c=0
sleep $(( ( RANDOM % 30 ) + 10 ))
done
return 1
}
mysql_checks() { mysql_checks() {
err_count=0 err_count=0
diff_c=0 diff_c=0
@ -137,7 +167,7 @@ mysql_checks() {
sogo_checks() { sogo_checks() {
err_count=0 err_count=0
diff_c=0 diff_c=0
THRESHOLD=20 THRESHOLD=10
# Reduce error count by 2 after restarting an unhealthy container # Reduce error count by 2 after restarting an unhealthy container
trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1 trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
while [ ${err_count} -lt ${THRESHOLD} ]; do while [ ${err_count} -lt ${THRESHOLD} ]; do
@ -156,7 +186,7 @@ sogo_checks() {
postfix_checks() { postfix_checks() {
err_count=0 err_count=0
diff_c=0 diff_c=0
THRESHOLD=16 THRESHOLD=8
# Reduce error count by 2 after restarting an unhealthy container # Reduce error count by 2 after restarting an unhealthy container
trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1 trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
while [ ${err_count} -lt ${THRESHOLD} ]; do while [ ${err_count} -lt ${THRESHOLD} ]; do
@ -173,10 +203,29 @@ postfix_checks() {
return 1 return 1
} }
clamd_checks() {
err_count=0
diff_c=0
THRESHOLD=6
# Reduce error count by 2 after restarting an unhealthy container
trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
while [ ${err_count} -lt ${THRESHOLD} ]; do
host_ip=$(get_container_ip clamd-mailcow)
err_c_cur=${err_count}
/usr/lib/nagios/plugins/check_clamd -4 -H ${host_ip} 1>&2; err_count=$(( ${err_count} + $? ))
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
progress "Clamd" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
diff_c=0
sleep $(( ( RANDOM % 30 ) + 10 ))
done
return 1
}
dovecot_checks() { dovecot_checks() {
err_count=0 err_count=0
diff_c=0 diff_c=0
THRESHOLD=24 THRESHOLD=20
# Reduce error count by 2 after restarting an unhealthy container # Reduce error count by 2 after restarting an unhealthy container
trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1 trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
while [ ${err_count} -lt ${THRESHOLD} ]; do while [ ${err_count} -lt ${THRESHOLD} ]; do
@ -199,7 +248,7 @@ dovecot_checks() {
phpfpm_checks() { phpfpm_checks() {
err_count=0 err_count=0
diff_c=0 diff_c=0
THRESHOLD=10 THRESHOLD=5
# Reduce error count by 2 after restarting an unhealthy container # Reduce error count by 2 after restarting an unhealthy container
trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1 trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
while [ ${err_count} -lt ${THRESHOLD} ]; do while [ ${err_count} -lt ${THRESHOLD} ]; do
@ -207,7 +256,6 @@ phpfpm_checks() {
err_c_cur=${err_count} err_c_cur=${err_count}
nc -z ${host_ip} 9001 ; err_count=$(( ${err_count} + ($? * 2))) nc -z ${host_ip} 9001 ; err_count=$(( ${err_count} + ($? * 2)))
nc -z ${host_ip} 9002 ; err_count=$(( ${err_count} + ($? * 2))) nc -z ${host_ip} 9002 ; err_count=$(( ${err_count} + ($? * 2)))
/usr/lib/nagios/plugins/check_ping -4 -H ${host_ip} -w 2000,10% -c 4000,100% -p2 1>&2; err_count=$(( ${err_count} + $? ))
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1 [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} )) [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
progress "PHP-FPM" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c} progress "PHP-FPM" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
@ -217,10 +265,30 @@ phpfpm_checks() {
return 1 return 1
} }
cert_checks() {
err_count=0
diff_c=0
THRESHOLD=1
# Reduce error count by 2 after restarting an unhealthy container
trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
while [ ${err_count} -lt ${THRESHOLD} ]; do
host_ip=$(get_container_ip nginx-mailcow)
err_c_cur=${err_count}
/usr/lib/nagios/plugins/check_http -H ${host_ip} -p ${HTTPS_PORT} -C 15 1>&2; err_count=$(( ${err_count} + $? ))
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
progress "TLS certificate" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
diff_c=0
# Sleep 1 day, fixme: 1 day lag
sleep 86400
done
return 1
}
rspamd_checks() { rspamd_checks() {
err_count=0 err_count=0
diff_c=0 diff_c=0
THRESHOLD=10 THRESHOLD=5
# Reduce error count by 2 after restarting an unhealthy container # Reduce error count by 2 after restarting an unhealthy container
trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1 trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
while [ ${err_count} -lt ${THRESHOLD} ]; do while [ ${err_count} -lt ${THRESHOLD} ]; do
@ -238,7 +306,6 @@ Empty
else else
echo "Rspamd settings check succeeded" 1>&2 echo "Rspamd settings check succeeded" 1>&2
fi fi
/usr/lib/nagios/plugins/check_ping -4 -H ${host_ip} -w 2000,10% -c 4000,100% -p2 1>&2; err_count=$(( ${err_count} + $? ))
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1 [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} )) [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
progress "Rspamd" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c} progress "Rspamd" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
@ -293,6 +360,42 @@ done
) & ) &
BACKGROUND_TASKS+=($!) BACKGROUND_TASKS+=($!)
if [ ${CHECK_UNBOUND} -eq 1 ]; then
(
while true; do
if ! unbound_checks; then
log_msg "Unbound hit error limit"
[[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${WATCHDOG_NOTIFY_EMAIL}" "unbound-mailcow"
echo unbound-mailcow > /tmp/com_pipe
fi
done
) &
BACKGROUND_TASKS+=($!)
fi
if [[ "${SKIP_CLAMD}" =~ ^([nN][oO]|[nN])+$ ]]; then
(
while true; do
if ! clamd_checks; then
log_msg "Clamd hit error limit"
[[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${WATCHDOG_NOTIFY_EMAIL}" "clamd-mailcow"
echo clamd-mailcow > /tmp/com_pipe
fi
done
) &
BACKGROUND_TASKS+=($!)
fi
(
while true; do
if ! cert_checks; then
log_msg "TLS certificate hit error limit"
[[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${WATCHDOG_NOTIFY_EMAIL}" "TLS check" "TLS certificate expires soon!"
fi
done
) &
BACKGROUND_TASKS+=($!)
( (
while true; do while true; do
if ! postfix_checks; then if ! postfix_checks; then