bytelyst-devops-tools/scripts/VMs/HostingerVM/docker-health-watchdog.sh

#!/usr/bin/env bash
# =============================================================================
# docker-health-watchdog.sh — restart containers stuck in unhealthy state
#
# Systemd timer invokes this every 10 minutes.
# A container is only restarted after 3 consecutive failing health checks
# (i.e. the last 3 entries in .State.Health.Log all have ExitCode != 0).
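# Health.Log entries are recorded by Docker at each container's own
# HEALTHCHECK interval (not once per 10-minute watchdog run), so the grace
# window before action is taken is about 3 health-check intervals — this
# avoids restarting containers that are transiently unhealthy during a deploy.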
#
# Log: /var/log/docker-watchdog.log
# =============================================================================
# Strict mode: abort on errors (including inside pipelines), treat unset
# variables as errors, and let ERR traps propagate into functions.
set -o errexit -o errtrace -o nounset -o pipefail

# File every log() line is appended to.
LOG='/var/log/docker-watchdog.log'
# Env file that notify_telegram reads the Telegram credentials from;
# HERMES_HOME overrides the default directory.
TOKEN_FILE="${HERMES_HOME:-/root/.hermes}"/.env

#######################################
# Write a UTC-timestamped message to stdout and append it to the log file.
# Globals:   LOG (read) - path of the log file
# Arguments: $* - message words, joined with single spaces
# Returns:   0 always; an unwritable log file is silently tolerated
#######################################
log() {
  printf '[%s] %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$*" \
    | tee -a "$LOG" 2>/dev/null \
    || true
}

#######################################
# Send a best-effort Telegram notification.
# Globals:   TOKEN_FILE (read) - dotenv-style file providing
#            TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID
# Arguments: $1 - message text
# Returns:   0 always; missing credentials or a failed/slow request are
#            ignored so that alerting problems never abort the watchdog
#######################################
notify_telegram() {
  local msg="$1"
  local token chat_id quote

  # Take only the first active assignment of each key (optionally prefixed
  # with `export`). Commented-out lines are skipped, and a key that appears
  # more than once can no longer yield a multi-line value.
  token=$(grep -m1 -oP '^\s*(?:export\s+)?TELEGRAM_BOT_TOKEN=\K\S+' "$TOKEN_FILE" 2>/dev/null || true)
  chat_id=$(grep -m1 -oP '^\s*(?:export\s+)?TELEGRAM_CHAT_ID=\K\S+' "$TOKEN_FILE" 2>/dev/null || true)

  # dotenv values are often wrapped in quotes; drop one surrounding pair.
  for quote in '"' "'"; do
    token=${token#"$quote"}
    token=${token%"$quote"}
    chat_id=${chat_id#"$quote"}
    chat_id=${chat_id%"$quote"}
  done

  if [[ -z "$token" || -z "$chat_id" ]]; then
    return 0
  fi

  # --data-urlencode keeps characters such as '&', '+' and '%' in the message
  # from corrupting the form body; --max-time stops a stalled API call from
  # hanging the timer-driven run.
  curl -sf --max-time 10 -X POST "https://api.telegram.org/bot${token}/sendMessage" \
    --data-urlencode "chat_id=${chat_id}" \
    --data-urlencode "text=${msg}" > /dev/null 2>&1 || true
}

# Bail out quietly (exit 0) when the docker CLI is missing or the daemon does
# not answer `docker info`; the skipped run is recorded in the log.
if ! { command -v docker && docker info; } >/dev/null 2>&1; then
  log "Docker not available — skipping watchdog run"
  exit 0
fi

# Names of the containers `docker ps` currently reports as unhealthy, one
# array element per line of output. A failing `docker ps` is treated the same
# as "none found".
readarray -t unhealthy < <(
  docker ps --filter health=unhealthy --format '{{.Names}}' 2>/dev/null || true
)

# Nothing unhealthy: finish without writing anything to the log.
[[ ${#unhealthy[@]} -gt 0 ]] || exit 0

#######################################
# Print how many of a container's last 3 recorded health-check results
# failed (ExitCode != 0).
# Arguments: $1 - container name
# Outputs:   a single integer 0..3 on stdout
# Returns:   0 always. If `docker inspect` fails (e.g. the container was
#            removed after `docker ps` listed it) the count is 0, so the
#            caller simply waits instead of acting on bad data.
#######################################
count_recent_failures() {
  local container="$1"
  local codes code failed=0

  # Let docker render one ExitCode per line; no external JSON parser needed.
  if ! codes=$(docker inspect \
      --format '{{range .State.Health.Log}}{{println .ExitCode}}{{end}}' \
      "$container" 2>/dev/null); then
    echo 0
    return 0
  fi

  # Keep only integer lines (drops the blank trailer), then the newest three.
  while IFS= read -r code; do
    if [[ "$code" != 0 ]]; then
      failed=$((failed + 1))
    fi
  done < <(grep -E '^-?[0-9]+$' <<<"$codes" | tail -n 3)

  echo "$failed"
}

#######################################
# Restart one container and report the real outcome.
# Arguments: $1 - container name
# Outputs:   progress and result via log(); a Telegram message via
#            notify_telegram() for both success and failure
# Returns:   0 always, so one failed restart does not stop the remaining
#            containers from being processed
#######################################
restart_container() {
  local container="$1"
  local output

  log "Auto-restarting $container (unhealthy 3/3 consecutive checks)"
  if output=$(docker restart "$container" 2>&1); then
    # Only the first line of docker's output is kept, as a timestamped entry.
    if [[ -n "$output" ]]; then
      log "${output%%$'\n'*}"
    fi
    notify_telegram "🔄 docker-watchdog restarted $container (3 consecutive unhealthy health checks) — $(hostname)"
  else
    log "ERROR: docker restart $container failed: ${output%%$'\n'*}"
    notify_telegram "⚠️ docker-watchdog FAILED to restart $container (3 consecutive unhealthy health checks) — $(hostname)"
  fi
  return 0
}

log "Unhealthy containers detected: ${unhealthy[*]}"

for container in "${unhealthy[@]}"; do
  # Count how many of the last 3 health check log entries failed (ExitCode != 0)
  failures=$(count_recent_failures "$container")

  if [[ "$failures" -eq 3 ]]; then
    restart_container "$container"
  else
    log "$container is unhealthy but only $failures/3 consecutive failures — waiting"
  fi
done