From d9618ba7b0111927357f920107e2c6820b18c581 Mon Sep 17 00:00:00 2001
From: Hermes VM <root@srv1491630>
Date: Wed, 27 May 2026 21:31:09 +0000
Subject: [PATCH] =?UTF-8?q?feat(vm):=20Phases=201.2,=201.4,=202.1=20?=
 =?UTF-8?q?=E2=80=94=20steal=20time,=20swap=20pressure,=20health=20watchdo?=
 =?UTF-8?q?g?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 1.2 — CPU steal time metric in vm-health-check.sh:
- Samples /proc/stat twice 1s apart for accurate current steal %
- Thresholds: ≥5% WARN, ≥15% CRIT (currently 0.8% on this host)
- Inserts before memory check so steal is visible alongside load

Phase 1.4 — Swap pressure indicator:
- Reads SwapCached from /proc/meminfo as secondary metric
- Raises SWAP_USED_WARN_GB 1→1.5 to reduce noise (current usage 0.6G)
- New WARN path: SwapCached ≥ 200MB signals recent pressure even when
  current swap usage looks ok (catches post-spike state)

Phase 2.1 — Docker health-check watchdog:
- docker-health-watchdog.sh: checks unhealthy containers every 10 min,
  restarts only after 3 consecutive failing health checks (30min grace)
- docker-health-watchdog.service + .timer: enabled, fires every 10 min
- Sends Telegram notification on each auto-restart
- Rollback: systemctl disable docker-health-watchdog.timer

Phase 2.2 already complete: sync_hermes_persistent_backup.py handles
divergence gracefully with rebase/reset-hard fallback; running successfully.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../VMs/HostingerVM/docker-health-watchdog.sh | 63 ++++++++++++++++++
 scripts/VMs/HostingerVM/vm-health-check.sh    | 64 ++++++++++++++++---
 systemd/docker-health-watchdog.service        | 12 ++++
 systemd/docker-health-watchdog.timer          | 11 ++++
 4 files changed, 141 insertions(+), 9 deletions(-)
 create mode 100755 scripts/VMs/HostingerVM/docker-health-watchdog.sh
 create mode 100644 systemd/docker-health-watchdog.service
 create mode 100644 systemd/docker-health-watchdog.timer

diff --git a/scripts/VMs/HostingerVM/docker-health-watchdog.sh b/scripts/VMs/HostingerVM/docker-health-watchdog.sh
new file mode 100755
index 0000000..1de08f2
--- /dev/null
+++ b/scripts/VMs/HostingerVM/docker-health-watchdog.sh
@@ -0,0 +1,116 @@
+#!/usr/bin/env bash
+# =============================================================================
+# docker-health-watchdog.sh — restart containers stuck in unhealthy state
+#
+# Systemd timer invokes this every 10 minutes.
+# A container is only restarted once it has been reported unhealthy on 3
+# consecutive watchdog runs. Strike counters persist between runs in
+# STATE_DIR and are dropped as soon as a container stops being unhealthy, so
+# a container gets roughly 20-30 minutes to recover before action is taken —
+# avoids restarting containers that are transiently unhealthy during a deploy.
+#
+# The strike counter replaces an earlier test on the last 3 entries of
+# .State.Health.Log. Those entries are written at the container's own
+# HEALTHCHECK interval and Docker only reports "unhealthy" after
+# --health-retries consecutive failures (default 3), so that test was already
+# true on the first run and gave no grace window.
+#
+# Env: HERMES_HOME         directory holding .env with the Telegram credentials
+#      WATCHDOG_STATE_DIR  strike-counter directory (default below)
+# Log: /var/log/docker-watchdog.log
+# =============================================================================
+set -Eeuo pipefail
+
+LOG=/var/log/docker-watchdog.log
+TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"
+STATE_DIR="${WATCHDOG_STATE_DIR:-/run/docker-health-watchdog}"
+readonly REQUIRED_STRIKES=3
+
+# Timestamped message to stdout and, best effort, appended to $LOG.
+log() { echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*" | tee -a "$LOG" 2>/dev/null || true; }
+
+# Send $1 to Telegram. Best effort: skipped without credentials, never fatal.
+notify_telegram() {
+  local msg="$1"
+  local token chat_id
+  token=$(grep -m1 -oP '(?<=TELEGRAM_BOT_TOKEN=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
+  chat_id=$(grep -m1 -oP '(?<=TELEGRAM_CHAT_ID=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
+  if [[ -z "$token" || -z "$chat_id" ]]; then
+    return 0
+  fi
+  # --data-urlencode keeps spaces, '&' and non-ASCII text intact; --max-time
+  # stops a stalled connection from blocking the run.
+  curl -sf --max-time 15 -X POST "https://api.telegram.org/bot${token}/sendMessage" \
+    --data-urlencode "chat_id=${chat_id}" \
+    --data-urlencode "text=${msg}" > /dev/null 2>&1 || true
+}
+
+# Print the strike-counter path for container $1 (name made filename-safe).
+strike_path() { printf '%s/%s.strikes' "$STATE_DIR" "${1//[^A-Za-z0-9_.-]/_}"; }
+
+if ! command -v docker &>/dev/null || ! docker info &>/dev/null; then
+  log "Docker not available — skipping watchdog run"
+  exit 0
+fi
+
+if ! mkdir -p "$STATE_DIR"; then
+  log "Cannot create state directory $STATE_DIR — aborting"
+  exit 1
+fi
+
+# A failed query must not be mistaken for "everything recovered".
+if ! names=$(docker ps --filter health=unhealthy --format '{{.Names}}' 2>/dev/null); then
+  log "docker ps failed — strike counters left untouched"
+  exit 0
+fi
+
+unhealthy=()
+declare -A active=()
+while IFS= read -r container; do
+  if [[ -n "$container" ]]; then
+    unhealthy+=("$container")
+    active["$(strike_path "$container")"]=1
+  fi
+done <<<"$names"
+
+# Recovered (or removed) containers lose their strikes: the streak must be consecutive.
+shopt -s nullglob
+for strike_file in "$STATE_DIR"/*.strikes; do
+  if [[ -z "${active[$strike_file]:-}" ]]; then
+    rm -f -- "$strike_file"
+  fi
+done
+
+if (( ${#unhealthy[@]} == 0 )); then
+  exit 0
+fi
+
+log "Unhealthy containers detected: ${unhealthy[*]}"
+
+for container in "${unhealthy[@]}"; do
+  strike_file=$(strike_path "$container")
+  strikes=0
+  if [[ -r "$strike_file" ]]; then
+    read -r strikes < "$strike_file" || true
+  fi
+  if [[ ! "$strikes" =~ ^[0-9]+$ ]]; then
+    strikes=0   # unreadable or corrupt counter: start over
+  fi
+  strikes=$(( 10#$strikes + 1 ))
+
+  if (( strikes < REQUIRED_STRIKES )); then
+    printf '%s\n' "$strikes" > "$strike_file"
+    log "$container is unhealthy but only $strikes/$REQUIRED_STRIKES consecutive failures — waiting"
+    continue
+  fi
+
+  log "Auto-restarting $container (unhealthy $REQUIRED_STRIKES/$REQUIRED_STRIKES consecutive checks)"
+  # Reset either way so the container gets a fresh grace window after the attempt.
+  rm -f -- "$strike_file"
+  if restart_output=$(docker restart "$container" 2>&1); then
+    notify_telegram "🔄 docker-watchdog restarted $container (3 consecutive unhealthy health checks) — $(hostname)"
+  else
+    log "Restart of $container failed: ${restart_output%%$'\n'*}"
+    notify_telegram "docker-watchdog FAILED to restart $container — $(hostname)"
+  fi
+done
diff --git a/scripts/VMs/HostingerVM/vm-health-check.sh b/scripts/VMs/HostingerVM/vm-health-check.sh
index 87a8d34..6acfe10 100755
--- a/scripts/VMs/HostingerVM/vm-health-check.sh
+++ b/scripts/VMs/HostingerVM/vm-health-check.sh
@@ -30,8 +30,11 @@ LOAD_WARN=4.0          # absolute (not per-CPU)
 LOAD_CRIT=8.0
 RAM_FREE_WARN_GB=3     # GB available
 RAM_FREE_CRIT_GB=1
-SWAP_USED_WARN_GB=1
+SWAP_USED_WARN_GB=1.5     # GB; fractional, so check_swap compares it in tenths of a GB (bash (( )) is integer-only)
 SWAP_USED_CRIT_GB=3
+SWAP_CACHED_WARN_MB=200   # early-warning: recent swap pressure even if current usage looks ok
+STEAL_WARN=5              # % steal time; check_steal compares the whole-number steal % with >=
+STEAL_CRIT=15             # % steal time
 CONTAINER_RESTART_WARN=10
 CONTAINER_RESTART_CRIT=50
 BUILD_CACHE_WARN_GB=5
@@ -161,24 +164,66 @@ check_memory() {
   fi
 }
 
+check_steal() {
+  header "CPU STEAL"
+  # Steal % over a 1s window; one /proc/stat sample alone is only the since-boot average.
+  # First line: cpu user nice system idle iowait irq softirq steal guest guest_nice.
+  # guest time is already included in user (proc(5)), so the total stops at steal.
+  local u1 n1 s1 i1 w1 q1 sq1 st1 u2 n2 s2 i2 w2 q2 sq2 st2
+  read -r _ u1 n1 s1 i1 w1 q1 sq1 st1 _ < /proc/stat
+  sleep 1
+  read -r _ u2 n2 s2 i2 w2 q2 sq2 st2 _ < /proc/stat
+  local delta_steal=$(( st2 - st1 ))
+  local delta_total=$(( (u2+n2+s2+i2+w2+q2+sq2+st2) - (u1+n1+s1+i1+w1+q1+sq1+st1) ))
+  # Tenths of a percent, rounded to nearest; stays 0 if no ticks elapsed (no divide-by-zero).
+  local steal_x10=0
+  if (( delta_total > 0 && delta_steal > 0 )); then steal_x10=$(( (delta_steal * 1000 + delta_total / 2) / delta_total )); fi
+  local steal_int=$(( steal_x10 / 10 ))
+  local steal_pct="${steal_int}.$(( steal_x10 % 10 ))"
+
+  if   (( steal_int >= STEAL_CRIT )); then record steal CRIT "${steal_pct}%" "CPU steal ${steal_pct}% — CRITICAL (host is overcommitted)"
+  elif (( steal_int >= STEAL_WARN )); then record steal WARN "${steal_pct}%" "CPU steal ${steal_pct}% — WARNING (host contention; degrades LLM inference)"
+  else                                     record steal OK   "${steal_pct}%" "CPU steal OK (${steal_pct}%)"
+  fi
+}
+
 check_swap() {
   header "SWAP"
-  local swap_total_kb swap_used_kb
+  local swap_total_kb swap_free_kb swap_cached_kb  # kB, as reported by /proc/meminfo
   swap_total_kb=$(awk '/^SwapTotal/ {print $2}' /proc/meminfo)
-  swap_used_kb=$(awk '/^SwapFree/ {print $2}' /proc/meminfo)
-  swap_used_kb=$(( swap_total_kb - swap_used_kb ))
-  local swap_total_gb swap_used_gb
+  swap_free_kb=$(awk '/^SwapFree/ {print $2}' /proc/meminfo)
+  swap_cached_kb=$(awk '/^SwapCached/ {print $2}' /proc/meminfo)
+  local swap_used_kb
+  swap_used_kb=$(( swap_total_kb - swap_free_kb ))
+  local swap_total_gb
   swap_total_gb=$(( swap_total_kb / 1024 / 1024 ))
-  swap_used_gb=$(( swap_used_kb / 1024 / 1024 ))
+  local swap_cached_mb
+  swap_cached_mb=$(( swap_cached_kb / 1024 ))
 
   if (( swap_total_kb == 0 )); then
     record swap CRIT "no swap" "NO SWAP configured — CRITICAL (add swapfile!)"
     return
   fi
 
-  if   (( swap_used_gb >= SWAP_USED_CRIT_GB )); then record swap CRIT "${swap_used_gb}G used" "Swap ${swap_used_gb}G used — CRITICAL"
-  elif (( swap_used_gb >= SWAP_USED_WARN_GB )); then record swap WARN "${swap_used_gb}G used" "Swap ${swap_used_gb}G used — WARNING"
-  else                                               record swap OK   "${swap_used_gb}G / ${swap_total_gb}G" "Swap OK (${swap_used_gb}G used)"
+  # Bash (( )) is integer-only: scale usage and thresholds to tenths of a GB via awk so a fractional threshold (1.5) can be compared.
+  local used_gb_10x warn_10x crit_10x
+  used_gb_10x=$(awk -v kb="$swap_used_kb" 'BEGIN{printf "%d", (kb/1024/1024)*10}')
+  warn_10x=$(awk -v t="$SWAP_USED_WARN_GB" 'BEGIN{printf "%d", t*10}')
+  crit_10x=$(awk -v t="$SWAP_USED_CRIT_GB" 'BEGIN{printf "%d", t*10}')
+  local swap_used_display
+  swap_used_display=$(awk -v kb="$swap_used_kb" 'BEGIN{printf "%.1fG", kb/1024/1024}')
+
+  if   (( used_gb_10x >= crit_10x )); then
+    record swap CRIT "${swap_used_display} used" "Swap ${swap_used_display} used — CRITICAL"
+  elif (( used_gb_10x >= warn_10x )); then
+    record swap WARN "${swap_used_display} used" "Swap ${swap_used_display} used — WARNING (>${SWAP_USED_WARN_GB}G)"
+  elif (( swap_cached_mb >= SWAP_CACHED_WARN_MB )); then
+    # SwapCached counts pages that were swapped out and read back in while their copy
+    # still sits in swap — a sign of recent memory pressure even though usage looks ok.
+    record swap WARN "${swap_used_display} used, ${swap_cached_mb}MB cached" \
+      "Swap pressure indicator: SwapCached ${swap_cached_mb}MB — recent memory pressure (threshold ${SWAP_CACHED_WARN_MB}MB)"
+  else
+    record swap OK "${swap_used_display} / ${swap_total_gb}G" "Swap OK (${swap_used_display} used, ${swap_cached_mb}MB cached)"
   fi
 }
 
@@ -310,6 +355,7 @@ fi
 
 check_disk
 check_load
+check_steal   # blocks for ~1s: samples /proc/stat twice, 1s apart
 check_memory
 check_swap
 check_docker_containers
diff --git a/systemd/docker-health-watchdog.service b/systemd/docker-health-watchdog.service
new file mode 100644
index 0000000..f46f24e
--- /dev/null
+++ b/systemd/docker-health-watchdog.service
@@ -0,0 +1,15 @@
+[Unit]
+Description=Restart Docker containers stuck in unhealthy state
+Documentation=file:///usr/local/bin/docker-health-watchdog.sh
+# Ordering only. Deliberately no Requires=/Wants=: a periodic watchdog must not
+# start Docker when an operator has stopped it; the script exits 0 in that case.
+After=docker.service
+
+[Service]
+Type=oneshot
+User=root
+Group=root
+Environment="HERMES_HOME=/root/.hermes"
+# oneshot units have no start timeout by default; bound a hung `docker restart`.
+TimeoutStartSec=5min
+ExecStart=/usr/local/bin/docker-health-watchdog.sh
diff --git a/systemd/docker-health-watchdog.timer b/systemd/docker-health-watchdog.timer
new file mode 100644
index 0000000..c45aded
--- /dev/null
+++ b/systemd/docker-health-watchdog.timer
@@ -0,0 +1,11 @@
+[Unit]
+Description=Run Docker health watchdog every 10 minutes
+After=docker.service
+# First run 5 min after boot; later runs 10 min after the service was last activated.
+[Timer]
+OnBootSec=5min
+OnUnitActiveSec=10min
+AccuracySec=30s
+# AccuracySec=30s: the timer may fire up to 30s late so wake-ups can be coalesced.
+[Install]
+WantedBy=timers.target