[Watchdog] Skip container restart if running for less than 120 seconds
parent
a332f0dcc6
commit
2bd1b1c960
|
@ -9,6 +9,7 @@ RUN apk add --update \
|
||||||
nagios-plugins-ping \
|
nagios-plugins-ping \
|
||||||
curl \
|
curl \
|
||||||
bash \
|
bash \
|
||||||
|
coreutils \
|
||||||
jq \
|
jq \
|
||||||
fcgi \
|
fcgi \
|
||||||
nagios-plugins-mysql \
|
nagios-plugins-mysql \
|
||||||
|
|
|
@ -30,6 +30,8 @@ progress() {
|
||||||
PERCENT=$(( 200 * ${CURRENT} / ${TOTAL} % 2 + 100 * ${CURRENT} / ${TOTAL} ))
|
PERCENT=$(( 200 * ${CURRENT} / ${TOTAL} % 2 + 100 * ${CURRENT} / ${TOTAL} ))
|
||||||
redis-cli -h redis LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"service\":\"${SERVICE}\",\"lvl\":\"${PERCENT}\",\"hpnow\":\"${CURRENT}\",\"hptotal\":\"${TOTAL}\",\"hpdiff\":\"${DIFF}\"}" > /dev/null
|
redis-cli -h redis LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"service\":\"${SERVICE}\",\"lvl\":\"${PERCENT}\",\"hpnow\":\"${CURRENT}\",\"hptotal\":\"${TOTAL}\",\"hpdiff\":\"${DIFF}\"}" > /dev/null
|
||||||
log_msg "${SERVICE} health level: ${PERCENT}% (${CURRENT}/${TOTAL}), health trend: ${DIFF}" no_redis
|
log_msg "${SERVICE} health level: ${PERCENT}% (${CURRENT}/${TOTAL}), health trend: ${DIFF}" no_redis
|
||||||
|
# Return 10 to indicate a dead service
|
||||||
|
[ ${CURRENT} -le 0 ] && return 10
|
||||||
}
|
}
|
||||||
|
|
||||||
log_msg() {
|
log_msg() {
|
||||||
|
@ -120,8 +122,13 @@ nginx_checks() {
|
||||||
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
|
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
|
||||||
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
|
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
|
||||||
progress "Nginx" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
|
progress "Nginx" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
|
||||||
diff_c=0
|
if [[ $? == 10 ]]; then
|
||||||
sleep $(( ( RANDOM % 30 ) + 10 ))
|
diff_c=0
|
||||||
|
sleep 1
|
||||||
|
else
|
||||||
|
diff_c=0
|
||||||
|
sleep $(( ( RANDOM % 30 ) + 10 ))
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
@ -147,8 +154,13 @@ unbound_checks() {
|
||||||
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
|
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
|
||||||
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
|
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
|
||||||
progress "Unbound" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
|
progress "Unbound" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
|
||||||
diff_c=0
|
if [[ $? == 10 ]]; then
|
||||||
sleep $(( ( RANDOM % 30 ) + 10 ))
|
diff_c=0
|
||||||
|
sleep 1
|
||||||
|
else
|
||||||
|
diff_c=0
|
||||||
|
sleep $(( ( RANDOM % 30 ) + 10 ))
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
@ -168,8 +180,13 @@ mysql_checks() {
|
||||||
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
|
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
|
||||||
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
|
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
|
||||||
progress "MySQL/MariaDB" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
|
progress "MySQL/MariaDB" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
|
||||||
diff_c=0
|
if [[ $? == 10 ]]; then
|
||||||
sleep $(( ( RANDOM % 30 ) + 10 ))
|
diff_c=0
|
||||||
|
sleep 1
|
||||||
|
else
|
||||||
|
diff_c=0
|
||||||
|
sleep $(( ( RANDOM % 30 ) + 10 ))
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
@ -188,8 +205,13 @@ sogo_checks() {
|
||||||
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
|
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
|
||||||
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
|
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
|
||||||
progress "SOGo" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
|
progress "SOGo" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
|
||||||
diff_c=0
|
if [[ $? == 10 ]]; then
|
||||||
sleep $(( ( RANDOM % 30 ) + 10 ))
|
diff_c=0
|
||||||
|
sleep 1
|
||||||
|
else
|
||||||
|
diff_c=0
|
||||||
|
sleep $(( ( RANDOM % 30 ) + 10 ))
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
@ -209,8 +231,13 @@ postfix_checks() {
|
||||||
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
|
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
|
||||||
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
|
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
|
||||||
progress "Postfix" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
|
progress "Postfix" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
|
||||||
diff_c=0
|
if [[ $? == 10 ]]; then
|
||||||
sleep $(( ( RANDOM % 30 ) + 10 ))
|
diff_c=0
|
||||||
|
sleep 1
|
||||||
|
else
|
||||||
|
diff_c=0
|
||||||
|
sleep $(( ( RANDOM % 30 ) + 10 ))
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
@ -229,9 +256,13 @@ clamd_checks() {
|
||||||
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
|
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
|
||||||
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
|
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
|
||||||
progress "Clamd" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
|
progress "Clamd" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
|
||||||
diff_c=0
|
if [[ $? == 10 ]]; then
|
||||||
# Don't check Clamd too often
|
diff_c=0
|
||||||
sleep 1800
|
sleep 1
|
||||||
|
else
|
||||||
|
diff_c=0
|
||||||
|
sleep $(( ( RANDOM % 30 ) + 10 ))
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
@ -254,8 +285,13 @@ dovecot_checks() {
|
||||||
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
|
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
|
||||||
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
|
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
|
||||||
progress "Dovecot" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
|
progress "Dovecot" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
|
||||||
diff_c=0
|
if [[ $? == 10 ]]; then
|
||||||
sleep $(( ( RANDOM % 30 ) + 10 ))
|
diff_c=0
|
||||||
|
sleep 1
|
||||||
|
else
|
||||||
|
diff_c=0
|
||||||
|
sleep $(( ( RANDOM % 30 ) + 10 ))
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
@ -275,8 +311,13 @@ phpfpm_checks() {
|
||||||
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
|
[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
|
||||||
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
|
[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
|
||||||
progress "PHP-FPM" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
|
progress "PHP-FPM" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
|
||||||
diff_c=0
|
if [[ $? == 10 ]]; then
|
||||||
sleep $(( ( RANDOM % 30 ) + 10 ))
|
diff_c=0
|
||||||
|
sleep 1
|
||||||
|
else
|
||||||
|
diff_c=0
|
||||||
|
sleep $(( ( RANDOM % 30 ) + 10 ))
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
@ -449,7 +490,10 @@ while true; do
|
||||||
if [[ "${com_pipe_answer}" == "php-fpm-mailcow" ]]; then
|
if [[ "${com_pipe_answer}" == "php-fpm-mailcow" ]]; then
|
||||||
HAS_INITDB=$(curl --silent --insecure -XPOST https://dockerapi/containers/${CONTAINER_ID}/top | jq '.msg.Processes[] | contains(["php -c /usr/local/etc/php -f /web/inc/init_db.inc.php"])' | grep true)
|
HAS_INITDB=$(curl --silent --insecure -XPOST https://dockerapi/containers/${CONTAINER_ID}/top | jq '.msg.Processes[] | contains(["php -c /usr/local/etc/php -f /web/inc/init_db.inc.php"])' | grep true)
|
||||||
fi
|
fi
|
||||||
if [[ ! -z ${HAS_INITDB} ]]; then
|
S_RUNNING=$(($(date +%s) - $(curl --silent --insecure https://dockerapi/containers/${CONTAINER_ID}/json | jq .State.StartedAt | xargs -n1 date +%s -d)))
|
||||||
|
if [ ${S_RUNNING} -lt 120 ]; then
|
||||||
|
log_msg "Container is running for less than 120 seconds, skipping action..."
|
||||||
|
elif [[ ! -z ${HAS_INITDB} ]]; then
|
||||||
log_msg "Database is being initialized by php-fpm-mailcow, not restarting but delaying checks for a minute..."
|
log_msg "Database is being initialized by php-fpm-mailcow, not restarting but delaying checks for a minute..."
|
||||||
sleep 60
|
sleep 60
|
||||||
else
|
else
|
||||||
|
|
Loading…
Reference in New Issue