Phase 1.2 — CPU steal time metric in vm-health-check.sh:
- Samples /proc/stat twice, 1s apart, for an accurate current steal %
- Thresholds: >5% WARN, >15% CRIT (currently 0.8% on this host)
- Inserted before the memory check so steal is visible alongside load

Phase 1.4 — Swap pressure indicator:
- Reads SwapCached from /proc/meminfo as a secondary metric
- Raises SWAP_USED_WARN_GB 1→1.5 to reduce noise (current usage 0.6G)
- New WARN path: SwapCached > 200MB signals recent pressure even when
  current swap usage looks ok (catches post-spike state)

Phase 2.1 — Docker health-check watchdog:
- docker-health-watchdog.sh: checks unhealthy containers every 10 min,
  restarts only after 3 consecutive failing health checks (30min grace)
- docker-health-watchdog.service + .timer: enabled, fires every 10 min
- Sends a Telegram notification on each auto-restart
- Rollback: systemctl disable docker-health-watchdog.timer

Phase 2.2 — already complete: sync_hermes_persistent_backup.py handles
divergence gracefully with a rebase/reset-hard fallback; running successfully.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
64 lines
2.4 KiB
Bash
Executable File
64 lines
2.4 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# =============================================================================
# docker-health-watchdog.sh — restart containers stuck in unhealthy state
#
# Systemd timer invokes this every 10 minutes.
# A container is only restarted after 3 consecutive failing health checks
# (i.e. the last 3 entries in .State.Health.Log all have ExitCode != 0).
# NOTE: those log entries are written at the container's own HEALTHCHECK
# interval, not once per timer run, so the grace period before a restart is
# roughly 3 x that interval plus the wait for the next timer tick — it is not
# a fixed 30 minutes. The intent is to avoid restarting containers that are
# only transiently unhealthy during a deploy.
#
# Log: /var/log/docker-watchdog.log
# =============================================================================
|
|
# Strict mode: abort on unhandled errors (-e), on unset variables (-u) and on
# a failure in any pipeline stage (pipefail); -E lets ERR traps fire inside
# functions and subshells as well.
set -Eeuo pipefail

# File every watchdog log line is appended to.
readonly LOG=/var/log/docker-watchdog.log
# dotenv-style file the Telegram credentials are read from; its directory can
# be relocated by exporting HERMES_HOME.
readonly TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"
|
|
|
|
# log MESSAGE...
# Writes one line "[<UTC ISO-8601 timestamp>] MESSAGE..." to stdout and appends
# the same line to $LOG. Logging is best-effort: tee's error output is
# discarded and the function always returns 0, so an unwritable log file never
# aborts the caller.
log() {
  local stamp
  stamp=$(date -u '+%Y-%m-%dT%H:%M:%SZ') || true
  printf '[%s] %s\n' "$stamp" "$*" | tee -a "$LOG" 2>/dev/null || true
}
|
|
|
|
# _watchdog_env_value KEY FILE
# Prints the value assigned to KEY in a dotenv-style FILE, or nothing when the
# file is unreadable or the key is absent. Only real assignments are matched
# (optional leading whitespace and "export "), so commented-out lines and
# longer keys that merely end in KEY are ignored. If KEY is assigned more than
# once the last assignment wins. One pair of surrounding quotes is stripped.
_watchdog_env_value() {
  local key=$1 file=$2
  local line value=''
  local re="^[[:space:]]*(export[[:space:]]+)?${key}=([^[:space:]]+)"

  [[ -r "$file" ]] || return 0
  # "|| [[ -n $line ]]" keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" =~ $re ]]; then
      value=${BASH_REMATCH[2]}
    fi
  done < "$file"

  value=${value#\"}
  value=${value%\"}
  value=${value#\'}
  value=${value%\'}
  printf '%s' "$value"
}

# notify_telegram MESSAGE
# Sends MESSAGE through the Telegram Bot API using TELEGRAM_BOT_TOKEN and
# TELEGRAM_CHAT_ID read from $TOKEN_FILE. Best-effort by design: it returns 0
# without sending when the message or either credential is missing, and a
# failed or timed-out request is ignored so notification problems never abort
# the watchdog.
notify_telegram() {
  local msg=${1:-}
  local token chat_id

  [[ -n "$msg" ]] || return 0

  token=$(_watchdog_env_value TELEGRAM_BOT_TOKEN "$TOKEN_FILE") || true
  chat_id=$(_watchdog_env_value TELEGRAM_CHAT_ID "$TOKEN_FILE") || true
  if [[ -z "$token" || -z "$chat_id" ]]; then
    return 0
  fi

  # --data-urlencode keeps '&', '+', '%' and a leading '@' in the message
  # literal; the timeouts stop a stalled connection from hanging the run.
  curl -sf --connect-timeout 5 --max-time 15 \
    -X POST "https://api.telegram.org/bot${token}/sendMessage" \
    --data-urlencode "chat_id=${chat_id}" \
    --data-urlencode "text=${msg}" > /dev/null 2>&1 || true
}
|
|
|
|
if ! command -v docker &>/dev/null || ! docker info &>/dev/null 2>&1; then
|
|
log "Docker not available — skipping watchdog run"
|
|
exit 0
|
|
fi
|
|
|
|
mapfile -t unhealthy < <(docker ps --filter health=unhealthy --format '{{.Names}}' 2>/dev/null || true)
|
|
|
|
if (( ${#unhealthy[@]} == 0 )); then
|
|
exit 0
|
|
fi
|
|
|
|
log "Unhealthy containers detected: ${unhealthy[*]}"
|
|
|
|
for container in "${unhealthy[@]}"; do
|
|
# Count how many of the last 3 health check log entries failed (ExitCode != 0)
|
|
failures=$(docker inspect "$container" 2>/dev/null | python3 -c "
|
|
import json, sys
|
|
data = json.load(sys.stdin)
|
|
if not data:
|
|
print(0); exit()
|
|
log = data[0].get('State', {}).get('Health', {}).get('Log', [])
|
|
recent = log[-3:] if len(log) >= 3 else log
|
|
print(sum(1 for e in recent if e.get('ExitCode', 0) != 0))
|
|
" 2>/dev/null || echo 0)
|
|
|
|
if [[ "$failures" -eq 3 ]]; then
|
|
log "Auto-restarting $container (unhealthy 3/3 consecutive checks)"
|
|
docker restart "$container" 2>&1 | head -1 | tee -a "$LOG" || true
|
|
notify_telegram "🔄 docker-watchdog restarted $container (3 consecutive unhealthy health checks) — $(hostname)"
|
|
else
|
|
log "$container is unhealthy but only $failures/3 consecutive failures — waiting"
|
|
fi
|
|
done
|