From 2bd1b1c9609b32e5427338b62eacc2382f893142 Mon Sep 17 00:00:00 2001 From: andryyy Date: Sat, 27 Oct 2018 23:44:06 +0200 Subject: [PATCH] [Watchdog] Skip container restart if running for less than 120 seconds --- data/Dockerfiles/watchdog/Dockerfile | 1 + data/Dockerfiles/watchdog/watchdog.sh | 80 +++++++++++++++++++++------ 2 files changed, 63 insertions(+), 18 deletions(-) diff --git a/data/Dockerfiles/watchdog/Dockerfile b/data/Dockerfiles/watchdog/Dockerfile index 810b2af2..c4427c59 100644 --- a/data/Dockerfiles/watchdog/Dockerfile +++ b/data/Dockerfiles/watchdog/Dockerfile @@ -9,6 +9,7 @@ RUN apk add --update \ nagios-plugins-ping \ curl \ bash \ + coreutils \ jq \ fcgi \ nagios-plugins-mysql \ diff --git a/data/Dockerfiles/watchdog/watchdog.sh b/data/Dockerfiles/watchdog/watchdog.sh index 7f8e0ad4..0b903354 100755 --- a/data/Dockerfiles/watchdog/watchdog.sh +++ b/data/Dockerfiles/watchdog/watchdog.sh @@ -30,6 +30,8 @@ progress() { PERCENT=$(( 200 * ${CURRENT} / ${TOTAL} % 2 + 100 * ${CURRENT} / ${TOTAL} )) redis-cli -h redis LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"service\":\"${SERVICE}\",\"lvl\":\"${PERCENT}\",\"hpnow\":\"${CURRENT}\",\"hptotal\":\"${TOTAL}\",\"hpdiff\":\"${DIFF}\"}" > /dev/null log_msg "${SERVICE} health level: ${PERCENT}% (${CURRENT}/${TOTAL}), health trend: ${DIFF}" no_redis + # Return 10 to indicate a dead service + [ ${CURRENT} -le 0 ] && return 10 } log_msg() { @@ -120,8 +122,13 @@ nginx_checks() { [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1 [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} )) progress "Nginx" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c} - diff_c=0 - sleep $(( ( RANDOM % 30 ) + 10 )) + if [[ $? == 10 ]]; then + diff_c=0 + sleep 1 + else + diff_c=0 + sleep $(( ( RANDOM % 30 ) + 10 )) + fi done return 1 } @@ -147,8 +154,13 @@ unbound_checks() { [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1 [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} )) progress "Unbound" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c} - diff_c=0 - sleep $(( ( RANDOM % 30 ) + 10 )) + if [[ $? == 10 ]]; then + diff_c=0 + sleep 1 + else + diff_c=0 + sleep $(( ( RANDOM % 30 ) + 10 )) + fi done return 1 } @@ -168,8 +180,13 @@ mysql_checks() { [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1 [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} )) progress "MySQL/MariaDB" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c} - diff_c=0 - sleep $(( ( RANDOM % 30 ) + 10 )) + if [[ $? == 10 ]]; then + diff_c=0 + sleep 1 + else + diff_c=0 + sleep $(( ( RANDOM % 30 ) + 10 )) + fi done return 1 } @@ -188,8 +205,13 @@ sogo_checks() { [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1 [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} )) progress "SOGo" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c} - diff_c=0 - sleep $(( ( RANDOM % 30 ) + 10 )) + if [[ $? == 10 ]]; then + diff_c=0 + sleep 1 + else + diff_c=0 + sleep $(( ( RANDOM % 30 ) + 10 )) + fi done return 1 } @@ -209,8 +231,13 @@ postfix_checks() { [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1 [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} )) progress "Postfix" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c} - diff_c=0 - sleep $(( ( RANDOM % 30 ) + 10 )) + if [[ $? == 10 ]]; then + diff_c=0 + sleep 1 + else + diff_c=0 + sleep $(( ( RANDOM % 30 ) + 10 )) + fi done return 1 } @@ -229,9 +256,13 @@ clamd_checks() { [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1 [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} )) progress "Clamd" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c} - diff_c=0 - # Don't check Clamd too often - sleep 1800 + if [[ $? == 10 ]]; then + diff_c=0 + sleep 1 + else + diff_c=0 + sleep $(( ( RANDOM % 30 ) + 10 )) + fi done return 1 } @@ -254,8 +285,13 @@ dovecot_checks() { [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1 [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} )) progress "Dovecot" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c} - diff_c=0 - sleep $(( ( RANDOM % 30 ) + 10 )) + if [[ $? == 10 ]]; then + diff_c=0 + sleep 1 + else + diff_c=0 + sleep $(( ( RANDOM % 30 ) + 10 )) + fi done return 1 } @@ -275,8 +311,13 @@ phpfpm_checks() { [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1 [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} )) progress "PHP-FPM" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c} - diff_c=0 - sleep $(( ( RANDOM % 30 ) + 10 )) + if [[ $? == 10 ]]; then + diff_c=0 + sleep 1 + else + diff_c=0 + sleep $(( ( RANDOM % 30 ) + 10 )) + fi done return 1 } @@ -449,7 +490,10 @@ while true; do if [[ "${com_pipe_answer}" == "php-fpm-mailcow" ]]; then HAS_INITDB=$(curl --silent --insecure -XPOST https://dockerapi/containers/${CONTAINER_ID}/top | jq '.msg.Processes[] | contains(["php -c /usr/local/etc/php -f /web/inc/init_db.inc.php"])' | grep true) fi - if [[ ! -z ${HAS_INITDB} ]]; then + S_RUNNING=$(($(date +%s) - $(curl --silent --insecure https://dockerapi/containers/${CONTAINER_ID}/json | jq .State.StartedAt | xargs -n1 date +%s -d))) + if [ ${S_RUNNING} -lt 120 ]; then + log_msg "Container is running for less than 120 seconds, skipping action..." + elif [[ ! -z ${HAS_INITDB} ]]; then log_msg "Database is being initialized by php-fpm-mailcow, not restarting but delaying checks for a minute..." sleep 60 else