#!/usr/bin/env bash
# =============================================================================
# docker-health-watchdog.sh — restart containers stuck in unhealthy state
#
# Systemd timer invokes this every 10 minutes.
# A container is only restarted after 3 consecutive failing health checks
# (i.e. the last 3 entries in .State.Health.Log all have ExitCode != 0).
# This gives a 30-minute grace window before action is taken — avoids
# restarting containers that are transiently unhealthy during a deploy.
#
# NOTE(review): Docker appends to .State.Health.Log at each container's own
# HEALTHCHECK interval, not on this timer's schedule, so the effective grace
# window presumably depends on the container's interval/retries settings
# rather than being a fixed 30 minutes — confirm.
#
# Optional Telegram notification: TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID are
# read from ${HERMES_HOME:-/root/.hermes}/.env; if either is missing, no
# notification is sent.
#
# Log: /var/log/docker-watchdog.log
# =============================================================================
set -Eeuo pipefail

readonly LOG=/var/log/docker-watchdog.log
readonly TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"
# Number of most-recent health checks that must all have failed before a
# container is restarted.
readonly REQUIRED_FAILURES=3

#######################################
# Write a UTC-timestamped message to stdout and append it to the log file.
# Globals:   LOG (read)
# Arguments: $* - message text
# Outputs:   the formatted line on stdout (so do not call this from a
#            function whose stdout is captured as a return value)
# Returns:   always 0; logging is best-effort and must never abort the run
#######################################
log() {
  printf '[%s] %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$*" \
    | tee -a "$LOG" 2>/dev/null || true
}

#######################################
# Print the value assigned to a key in a dotenv-style file.
# Only lines that *start* with KEY= (optionally preceded by whitespace and/or
# "export") are considered, so commented-out lines and keys that merely end
# with the same name are ignored. The last matching line wins. Surrounding
# single or double quotes are removed; an unquoted value ends at the first
# whitespace character.
# Arguments: $1 - key name
#            $2 - path to the file
# Outputs:   the value (no trailing newline), or nothing if not found
# Returns:   always 0 (a missing/unreadable file yields empty output)
#######################################
env_value() {
  local key=$1 file=$2
  local line value=''
  [[ -r "$file" ]] || return 0
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Drop leading whitespace, then an optional "export" prefix.
    line=${line#"${line%%[![:space:]]*}"}
    if [[ "$line" == export[[:space:]]* ]]; then
      line=${line#export}
      line=${line#"${line%%[![:space:]]*}"}
    fi
    [[ "$line" == "$key="* ]] || continue
    value=${line#"$key="}
    case "$value" in
      \"*\"*)
        value=${value#\"}
        value=${value%%\"*}
        ;;
      \'*\'*)
        value=${value#\'}
        value=${value%%\'*}
        ;;
      *)
        # Also strips a trailing CR and inline "# comment" text.
        value=${value%%[[:space:]]*}
        ;;
    esac
  done < "$file"
  printf '%s' "$value"
}

#######################################
# Send a Telegram message if bot credentials are configured.
# Globals:   TOKEN_FILE (read)
# Arguments: $1 - message text
# Returns:   always 0; notification is best-effort
#######################################
notify_telegram() {
  local msg=$1
  local token chat_id
  token=$(env_value TELEGRAM_BOT_TOKEN "$TOKEN_FILE")
  chat_id=$(env_value TELEGRAM_CHAT_ID "$TOKEN_FILE")
  if [[ -z "$token" || -z "$chat_id" ]]; then
    return 0
  fi
  # --data-urlencode keeps '&', '+', '%' etc. in the message intact (plain -d
  # sends the text unencoded). --max-time stops a hung request from stalling
  # the watchdog. Failures are deliberately ignored.
  curl -sf --max-time 15 \
    "https://api.telegram.org/bot${token}/sendMessage" \
    --data-urlencode "chat_id=${chat_id}" \
    --data-urlencode "text=${msg}" >/dev/null 2>&1 || true
}

#######################################
# Check that the docker CLI exists and the daemon answers.
# Returns: 0 if usable, non-zero otherwise
#######################################
docker_available() {
  command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1
}

#######################################
# Count failures among the most recent health-check exit codes.
# Globals:   REQUIRED_FAILURES (read)
# Arguments: health-check exit codes, oldest first (may be none)
# Outputs:   number of non-zero codes among the last REQUIRED_FAILURES
#######################################
count_recent_failures() {
  local code failures=0
  if (( $# > REQUIRED_FAILURES )); then
    shift $(( $# - REQUIRED_FAILURES ))
  fi
  for code in "$@"; do
    if [[ "$code" != 0 ]]; then
      failures=$(( failures + 1 ))
    fi
  done
  printf '%d\n' "$failures"
}

#######################################
# Report how many of a container's latest health checks failed.
# Arguments: $1 - container name
# Outputs:   exactly one integer; 0 when the container cannot be inspected
#            (e.g. it disappeared since `docker ps`) or the output is not a
#            list of integers
#######################################
container_recent_failures() {
  local container=$1 raw
  local -a codes=()
  if ! raw=$(docker inspect \
    --format '{{range .State.Health.Log}}{{.ExitCode}} {{end}}' \
    "$container" 2>/dev/null); then
    printf '0\n'
    return 0
  fi
  # Anything other than integers and whitespace means unexpected output.
  if [[ "$raw" == *[!0-9[:space:]-]* ]]; then
    printf '0\n'
    return 0
  fi
  IFS=$' \t\n' read -r -a codes <<<"$raw" || true
  # The ${arr[@]+…} form avoids an "unbound variable" error for an empty
  # array under `set -u` on older bash versions.
  count_recent_failures ${codes[@]+"${codes[@]}"}
}

#######################################
# Restart a container, log the outcome and send a notification.
# Arguments: $1 - container name
# Returns:   always 0; a failed restart is logged and reported, not fatal
#######################################
restart_container() {
  local container=$1 output
  log "Auto-restarting $container (unhealthy ${REQUIRED_FAILURES}/${REQUIRED_FAILURES} consecutive checks)"
  if output=$(docker restart "$container" 2>&1); then
    # Record the first line of docker's output as-is.
    printf '%s\n' "${output%%$'\n'*}" | tee -a "$LOG" 2>/dev/null || true
    notify_telegram "🔄 docker-watchdog restarted $container (${REQUIRED_FAILURES} consecutive unhealthy health checks) — $(hostname)"
  else
    # Do not claim a restart that did not happen.
    log "Restart of $container failed: ${output%%$'\n'*}"
    notify_telegram "⚠️ docker-watchdog failed to restart $container — $(hostname)"
  fi
}

#######################################
# Entry point: find unhealthy containers and restart those whose most recent
# REQUIRED_FAILURES health checks all failed.
# Returns: 0 (also when docker is unavailable or nothing is unhealthy)
#######################################
main() {
  if ! docker_available; then
    log "Docker not available — skipping watchdog run"
    return 0
  fi

  local -a unhealthy=()
  mapfile -t unhealthy < <(
    docker ps --filter health=unhealthy --format '{{.Names}}' 2>/dev/null || true
  )
  if (( ${#unhealthy[@]} == 0 )); then
    return 0
  fi

  log "Unhealthy containers detected: ${unhealthy[*]}"

  local container failures
  for container in "${unhealthy[@]}"; do
    [[ -n "$container" ]] || continue
    failures=$(container_recent_failures "$container")
    if (( failures >= REQUIRED_FAILURES )); then
      restart_container "$container"
    else
      log "$container is unhealthy but only $failures/${REQUIRED_FAILURES} consecutive failures — waiting"
    fi
  done
}

main "$@"