From d9618ba7b0111927357f920107e2c6820b18c581 Mon Sep 17 00:00:00 2001
From: Hermes VM
Date: Wed, 27 May 2026 21:31:09 +0000
Subject: [PATCH] =?UTF-8?q?feat(vm):=20Phases=201.2,=201.4,=202.1=20?=
 =?UTF-8?q?=E2=80=94=20steal=20time,=20swap=20pressure,=20health=20watchdo?=
 =?UTF-8?q?g?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 1.2 — CPU steal time metric in vm-health-check.sh:
- Samples /proc/stat twice 1s apart for accurate current steal %
- Thresholds: ≥5% WARN, ≥15% CRIT (currently 0.8% on this host)
- Inserted before the memory check so steal is visible alongside load

Phase 1.4 — Swap pressure indicator:
- Reads SwapCached from /proc/meminfo as secondary metric
- Raises SWAP_USED_WARN_GB 1→1.5 to reduce noise (current usage 0.6G)
- New WARN path: SwapCached ≥ 200MB signals recent pressure even when
  current swap usage looks ok (catches post-spike state)

Phase 2.1 — Docker health-check watchdog:
- docker-health-watchdog.sh: checks unhealthy containers every 10 min,
  restarts only after 3 consecutive failing health checks (30min grace)
- docker-health-watchdog.service + .timer: enabled, fires every 10 min
- Sends Telegram notification on each auto-restart
- Rollback: systemctl disable --now docker-health-watchdog.timer

Phase 2.2 already complete: sync_hermes_persistent_backup.py handles
divergence gracefully with rebase/reset-hard fallback; running successfully.
Co-Authored-By: Claude Sonnet 4.6
---
 .../VMs/HostingerVM/docker-health-watchdog.sh | 84 +++++++++++++++++++++
 scripts/VMs/HostingerVM/vm-health-check.sh    | 64 ++++++++++++++++---
 systemd/docker-health-watchdog.service        | 12 ++++
 systemd/docker-health-watchdog.timer          | 11 ++++
 4 files changed, 162 insertions(+), 9 deletions(-)
 create mode 100755 scripts/VMs/HostingerVM/docker-health-watchdog.sh
 create mode 100644 systemd/docker-health-watchdog.service
 create mode 100644 systemd/docker-health-watchdog.timer

diff --git a/scripts/VMs/HostingerVM/docker-health-watchdog.sh b/scripts/VMs/HostingerVM/docker-health-watchdog.sh
new file mode 100755
index 0000000..1de08f2
--- /dev/null
+++ b/scripts/VMs/HostingerVM/docker-health-watchdog.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+# =============================================================================
+# docker-health-watchdog.sh — restart containers stuck in unhealthy state
+#
+# Systemd timer invokes this every 10 minutes.
+# A container is only restarted once it has been seen unhealthy on
+# REQUIRED_RUNS (3) consecutive watchdog runs. This gives a grace window of
+# roughly 20-30 minutes before action is taken — avoids restarting containers
+# that are transiently unhealthy during a deploy.
+#
+# Why count runs instead of reading .State.Health.Log: Docker only reports a
+# container as "unhealthy" after `retries` (default 3) consecutive failed
+# probes, so the last 3 log entries of an unhealthy container have normally
+# all failed already and checking them adds no grace time.
+#
+# State: /var/lib/docker-watchdog/<container>.count
+# Log:   /var/log/docker-watchdog.log
+# =============================================================================
+set -Eeuo pipefail
+
+LOG=/var/log/docker-watchdog.log
+STATE_DIR=/var/lib/docker-watchdog
+REQUIRED_RUNS=3
+TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"
+
+log() { echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*" | tee -a "$LOG" 2>/dev/null || true; }
+
+# Send $1 to Telegram. Best-effort: missing credentials or a failed API call
+# never fail the watchdog run.
+notify_telegram() {
+  local msg="$1"
+  local token chat_id
+  token=$(grep -oP '(?<=TELEGRAM_BOT_TOKEN=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
+  chat_id=$(grep -oP '(?<=TELEGRAM_CHAT_ID=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
+  [[ -z "$token" || -z "$chat_id" ]] && return
+  # --data-urlencode keeps '&', '+' and '%' in the text from corrupting the
+  # form body; --max-time keeps a hung API call from blocking the oneshot unit.
+  curl -sf --max-time 15 -X POST "https://api.telegram.org/bot${token}/sendMessage" \
+    --data-urlencode "chat_id=${chat_id}" \
+    --data-urlencode "text=${msg}" > /dev/null 2>&1 || true
+}
+
+if ! command -v docker &>/dev/null || ! docker info &>/dev/null; then
+  log "Docker not available — skipping watchdog run"
+  exit 0
+fi
+
+mkdir -p "$STATE_DIR"
+
+mapfile -t unhealthy < <(docker ps --filter health=unhealthy --format '{{.Names}}' 2>/dev/null || true)
+
+# Load the counters written by the previous run, then delete them. Only
+# containers that are still unhealthy get a counter written back below, so a
+# container that recovered (or was removed) starts from zero again.
+declare -A prev_runs=()
+for state_file in "$STATE_DIR"/*.count; do
+  [[ -e "$state_file" ]] || continue # glob matched nothing
+  name=${state_file##*/}
+  name=${name%.count}
+  count=$(<"$state_file") || count=0
+  [[ "$count" =~ ^[0-9]+$ ]] || count=0 # ignore a corrupt state file
+  prev_runs[$name]=$(( 10#$count ))
+  rm -f -- "$state_file"
+done
+
+if (( ${#unhealthy[@]} == 0 )); then
+  exit 0
+fi
+
+log "Unhealthy containers detected: ${unhealthy[*]}"
+
+for container in "${unhealthy[@]}"; do
+  runs=$(( ${prev_runs[$container]:-0} + 1 ))
+
+  if (( runs >= REQUIRED_RUNS )); then
+    log "Auto-restarting $container (unhealthy on $runs consecutive watchdog runs)"
+    docker restart "$container" 2>&1 | head -1 | tee -a "$LOG" || true
+    notify_telegram "🔄 docker-watchdog restarted $container (unhealthy on $runs consecutive watchdog runs) — $(hostname)"
+  else
+    # Remember this observation for the next run.
+    printf '%s\n' "$runs" > "$STATE_DIR/$container.count"
+    log "$container is unhealthy ($runs/$REQUIRED_RUNS consecutive watchdog runs) — waiting"
+  fi
+done
diff --git
a/scripts/VMs/HostingerVM/vm-health-check.sh b/scripts/VMs/HostingerVM/vm-health-check.sh
index 87a8d34..6acfe10 100755
--- a/scripts/VMs/HostingerVM/vm-health-check.sh
+++ b/scripts/VMs/HostingerVM/vm-health-check.sh
@@ -30,8 +30,11 @@ LOAD_WARN=4.0 # absolute (not per-CPU)
 LOAD_CRIT=8.0
 RAM_FREE_WARN_GB=3 # GB available
 RAM_FREE_CRIT_GB=1
-SWAP_USED_WARN_GB=1
+SWAP_USED_WARN_GB=1.5
 SWAP_USED_CRIT_GB=3
+SWAP_CACHED_WARN_MB=200 # early-warning: recent swap pressure even if current usage looks ok
+STEAL_WARN=5 # % steal time
+STEAL_CRIT=15
 CONTAINER_RESTART_WARN=10
 CONTAINER_RESTART_CRIT=50
 BUILD_CACHE_WARN_GB=5
@@ -161,24 +164,66 @@ check_memory() {
     fi
 }
 
+check_steal() {
+    header "CPU STEAL"
+    # Requires two /proc/stat samples 1s apart — single sample gives lifetime average, not current.
+    # Total is user..steal ($2-$9) only: guest time is already counted inside user.
+    # printf "%.0f" never falls back to exponent notation for large jiffy sums.
+    local s1 s2 steal_pct steal_int
+    s1=$(awk '/^cpu /{printf "%.0f %.0f\n", $9, $2+$3+$4+$5+$6+$7+$8+$9}' /proc/stat)
+    sleep 1
+    s2=$(awk '/^cpu /{printf "%.0f %.0f\n", $9, $2+$3+$4+$5+$6+$7+$8+$9}' /proc/stat)
+    steal_pct=$(awk -v s1="$s1" -v s2="$s2" 'BEGIN{
+        split(s1,a," "); split(s2,b," ")
+        delta_steal=b[1]-a[1]; delta_total=b[2]-a[2]
+        if (delta_total <= 0) { printf "0.0"; exit }
+        printf "%.1f", (delta_steal/delta_total)*100
+    }')
+    steal_int=$(awk -v v="$steal_pct" 'BEGIN{printf "%d", v}')
+
+    if (( steal_int >= STEAL_CRIT )); then record steal CRIT "${steal_pct}%" "CPU steal ${steal_pct}% — CRITICAL (host is overcommitted)"
+    elif (( steal_int >= STEAL_WARN )); then record steal WARN "${steal_pct}%" "CPU steal ${steal_pct}% — WARNING (host contention; degrades LLM inference)"
+    else record steal OK "${steal_pct}%" "CPU steal OK (${steal_pct}%)"
+    fi
+}
+
 check_swap() {
     header "SWAP"
-    local swap_total_kb swap_used_kb
+    local swap_total_kb swap_free_kb swap_cached_kb
     swap_total_kb=$(awk '/^SwapTotal/ {print $2}' /proc/meminfo)
-    swap_used_kb=$(awk '/^SwapFree/ {print $2}' /proc/meminfo)
-    swap_used_kb=$(( swap_total_kb - swap_used_kb ))
-    local swap_total_gb swap_used_gb
+    swap_free_kb=$(awk '/^SwapFree/ {print $2}' /proc/meminfo)
+    swap_cached_kb=$(awk '/^SwapCached/ {print $2}' /proc/meminfo)
+    local swap_used_kb
+    swap_used_kb=$(( swap_total_kb - swap_free_kb ))
+    local swap_total_gb
     swap_total_gb=$(( swap_total_kb / 1024 / 1024 ))
-    swap_used_gb=$(( swap_used_kb / 1024 / 1024 ))
+    local swap_cached_mb
+    swap_cached_mb=$(( swap_cached_kb / 1024 ))
 
     if (( swap_total_kb == 0 )); then
         record swap CRIT "no swap" "NO SWAP configured — CRITICAL (add swapfile!)"
         return
     fi
 
-    if (( swap_used_gb >= SWAP_USED_CRIT_GB )); then record swap CRIT "${swap_used_gb}G used" "Swap ${swap_used_gb}G used — CRITICAL"
-    elif (( swap_used_gb >= SWAP_USED_WARN_GB )); then record swap WARN "${swap_used_gb}G used" "Swap ${swap_used_gb}G used — WARNING"
-    else record swap OK "${swap_used_gb}G / ${swap_total_gb}G" "Swap OK (${swap_used_gb}G used)"
+    # Compare used GB using awk to handle the fractional threshold (1.5)
+    local used_gb_10x warn_10x crit_10x
+    used_gb_10x=$(awk -v kb="$swap_used_kb" 'BEGIN{printf "%d", (kb/1024/1024)*10}')
+    warn_10x=$(awk -v t="$SWAP_USED_WARN_GB" 'BEGIN{printf "%d", t*10}')
+    crit_10x=$(awk -v t="$SWAP_USED_CRIT_GB" 'BEGIN{printf "%d", t*10}')
+    local swap_used_display
+    swap_used_display=$(awk -v kb="$swap_used_kb" 'BEGIN{printf "%.1fG", kb/1024/1024}')
+
+    if (( used_gb_10x >= crit_10x )); then
+        record swap CRIT "${swap_used_display} used" "Swap ${swap_used_display} used — CRITICAL"
+    elif (( used_gb_10x >= warn_10x )); then
+        record swap WARN "${swap_used_display} used" "Swap ${swap_used_display} used — WARNING (>${SWAP_USED_WARN_GB}G)"
+    elif (( swap_cached_mb >= SWAP_CACHED_WARN_MB )); then
+        # SwapCached = pages read back in from swap that still have a copy on the swap
+        # device — a sign of recent memory pressure even though current usage looks ok.
+        record swap WARN "${swap_used_display} used, ${swap_cached_mb}MB cached" \
+            "Swap pressure indicator: SwapCached ${swap_cached_mb}MB — recent memory pressure (threshold ${SWAP_CACHED_WARN_MB}MB)"
+    else
+        record swap OK "${swap_used_display} / ${swap_total_gb}G" "Swap OK (${swap_used_display} used, ${swap_cached_mb}MB cached)"
     fi
 }
 
@@ -310,6 +355,7 @@ fi
 
 check_disk
 check_load
+check_steal
 check_memory
 check_swap
 check_docker_containers
diff --git a/systemd/docker-health-watchdog.service b/systemd/docker-health-watchdog.service
new file mode 100644
index 0000000..f46f24e
--- /dev/null
+++ b/systemd/docker-health-watchdog.service
@@ -0,0 +1,12 @@
+[Unit]
+Description=Restart Docker containers stuck in unhealthy state
+Documentation=file:///usr/local/bin/docker-health-watchdog.sh
+# Ordering only, no Requires=: a timer tick must not start a stopped Docker.
+After=docker.service
+
+[Service]
+Type=oneshot
+User=root
+Group=root
+Environment="HERMES_HOME=/root/.hermes"
+ExecStart=/usr/local/bin/docker-health-watchdog.sh
diff --git a/systemd/docker-health-watchdog.timer b/systemd/docker-health-watchdog.timer
new file mode 100644
index 0000000..c45aded
--- /dev/null
+++ b/systemd/docker-health-watchdog.timer
@@ -0,0 +1,11 @@
+[Unit]
+Description=Run Docker health watchdog every 10 minutes
+After=docker.service
+
+[Timer]
+OnBootSec=5min
+OnUnitActiveSec=10min
+AccuracySec=30s
+
+[Install]
+WantedBy=timers.target