From 063337b58dd0428e3b9b687b20fffd2f737d5248 Mon Sep 17 00:00:00 2001
From: andryyy
Date: Sun, 31 May 2020 11:39:20 +0200
Subject: [PATCH] [Watchdog] Watch mail queue (added inexpensive check via "find" instead of adding an API endpoint to dockerapi-mailcow)

---
 data/Dockerfiles/watchdog/watchdog.sh | 45 +++++++++++++++++++++++++++
 docker-compose.yml                    |  5 ++-
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/data/Dockerfiles/watchdog/watchdog.sh b/data/Dockerfiles/watchdog/watchdog.sh
index acad52df..8d33365e 100755
--- a/data/Dockerfiles/watchdog/watchdog.sh
+++ b/data/Dockerfiles/watchdog/watchdog.sh
@@ -88,6 +88,7 @@ log_msg() {
 
 function mail_error() {
   [[ -z ${1} ]] && return 1
+  # If exists, body will be the content of "/tmp/${1}", even if ${2} is set
   [[ -z ${2} ]] && BODY="Service was restarted on $(date), please check your mailcow installation." || BODY="$(date) - ${2}"
   WATCHDOG_NOTIFY_EMAIL=$(echo "${WATCHDOG_NOTIFY_EMAIL}" | sed 's/"//;s|"$||')
   # Some exceptions for subject and body formats
@@ -524,6 +525,35 @@ ratelimit_checks() {
   return 1
 }
 
+mailq_checks() {
+  err_count=0
+  diff_c=0
+  THRESHOLD=${MAILQ_THRESHOLD}
+  # Reduce error count by 2 after restarting an unhealthy container (single-quoted so err_count is read when USR1 fires, not when the trap is set)
+  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
+  while [ ${err_count} -lt ${THRESHOLD} ]; do
+    touch /tmp/mail_queue_status; echo "$(tail -50 /tmp/mail_queue_status)" > /tmp/mail_queue_status
+    MAILQ_LOG_STATUS=$(find /var/spool/postfix/deferred -type f | wc -l)
+    echo "Mail queue contains ${MAILQ_LOG_STATUS} items (critical limit is ${MAILQ_CRIT}) at $(date)" >> /tmp/mail_queue_status
+    err_c_cur=${err_count}
+    if [ ${MAILQ_LOG_STATUS} -ge ${MAILQ_CRIT} ]; then
+      err_count=$(( ${err_count} + 1 ))
+      echo "Mail queue contains ${MAILQ_LOG_STATUS} items (critical limit is ${MAILQ_CRIT}) at $(date)" >> /tmp/mail_queue_status
+    fi
+    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
+    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
+    progress "Mail queue" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
+    if [[ $? == 10 ]]; then
+      diff_c=0
+      sleep 60
+    else
+      diff_c=0
+      sleep $(( ( RANDOM % 60 ) + 20 ))
+    fi
+  done
+  return 1
+}
+
 fail2ban_checks() {
   err_count=0
   diff_c=0
@@ -825,6 +855,18 @@ PID=$!
 echo "Spawned postfix_checks with PID ${PID}"
 BACKGROUND_TASKS+=(${PID})
 
+(
+while true; do
+  if ! mailq_checks; then
+    log_msg "Mail queue hit error limit"
+    echo mail_queue_status > /tmp/com_pipe
+  fi
+done
+) &
+PID=$!
+echo "Spawned mailq_checks with PID ${PID}"
+BACKGROUND_TASKS+=(${PID})
+
 (
 while true; do
   if ! dovecot_checks; then
@@ -961,6 +1003,9 @@ while true; do
   if [[ ${com_pipe_answer} == "ratelimit" ]]; then
     log_msg "At least one ratelimit was applied"
     [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}"
+  elif [[ ${com_pipe_answer} == "mail_queue_status" ]]; then
+    log_msg "Mail queue status is critical"
+    [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}"
   elif [[ ${com_pipe_answer} == "external_checks" ]]; then
     log_msg "Your mailcow is an open relay!"
     [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please stop mailcow now and check your network configuration!"
diff --git a/docker-compose.yml b/docker-compose.yml
index f9cdc2c7..efd11d30 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -396,7 +396,7 @@ services:
         - /lib/modules:/lib/modules:ro
 
     watchdog-mailcow:
-      image: mailcow/watchdog:1.77
+      image: mailcow/watchdog:1.78
       # Debug
       #command: /watchdog.sh
       dns:
@@ -404,6 +404,7 @@ services:
       volumes:
         - rspamd-vol-1:/var/lib/rspamd
         - mysql-socket-vol-1:/var/run/mysqld/
+        - postfix-vol-1:/var/spool/postfix
         - ./data/assets/ssl:/etc/ssl/mail/:ro
       restart: always
       environment:
@@ -447,6 +448,8 @@ services:
         - IPV6NAT_THRESHOLD=1
         - RSPAMD_THRESHOLD=5
         - OLEFY_THRESHOLD=5
+        - MAILQ_THRESHOLD=3
+        - MAILQ_CRIT=30
       networks:
         mailcow-network:
           aliases: