feat(vm): Phases 1.2, 1.4, 2.1 — steal time, swap pressure, health watchdog
Phase 1.2 — CPU steal time metric in vm-health-check.sh: - Samples /proc/stat twice 1s apart for accurate current steal % - Thresholds: >5% WARN, >15% CRIT (currently 0.8% on this host) - Inserts before memory check so steal is visible alongside load Phase 1.4 — Swap pressure indicator: - Reads SwapCached from /proc/meminfo as secondary metric - Raises SWAP_USED_WARN_GB 1→1.5 to reduce noise (current usage 0.6G) - New WARN path: SwapCached > 200MB signals recent pressure even when current swap usage looks ok (catches post-spike state) Phase 2.1 — Docker health-check watchdog: - docker-health-watchdog.sh: checks unhealthy containers every 10 min, restarts only after 3 consecutive failing health checks (30min grace) - docker-health-watchdog.service + .timer: enabled, fires every 10 min - Sends Telegram notification on each auto-restart - Rollback: systemctl disable docker-health-watchdog.timer Phase 2.2 already complete: sync_hermes_persistent_backup.py handles diverge gracefully with rebase/reset-hard fallback; running successfully. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
d60c81ebda
commit
d9618ba7b0
63
scripts/VMs/HostingerVM/docker-health-watchdog.sh
Executable file
63
scripts/VMs/HostingerVM/docker-health-watchdog.sh
Executable file
@ -0,0 +1,63 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# =============================================================================
|
||||||
|
# docker-health-watchdog.sh — restart containers stuck in unhealthy state
|
||||||
|
#
|
||||||
|
# Systemd timer invokes this every 10 minutes.
|
||||||
|
# A container is only restarted after 3 consecutive failing health checks
|
||||||
|
# (i.e. the last 3 entries in .State.Health.Log all have ExitCode != 0).
|
||||||
|
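# Health.Log is written at each container's own healthcheck interval, not at
# this timer's 10-minute cadence, so the grace window before action depends on
# that interval (it is not a fixed 30 minutes). The check avoids restarting
# containers that are transiently unhealthy during a deploy.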
|
||||||
|
#
|
||||||
|
# Log: /var/log/docker-watchdog.log
|
||||||
|
# =============================================================================
|
||||||
|
set -Eeuo pipefail
|
||||||
|
|
||||||
|
LOG=/var/log/docker-watchdog.log
|
||||||
|
TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"
|
||||||
|
|
||||||
|
# Write a UTC-timestamped message to stdout and append the same line to $LOG.
# Arguments: $* - message text (joined with spaces)
# Failing to append to the log file is deliberately ignored (best effort), so
# an unwritable $LOG never aborts the caller.
log() {
  printf '[%s] %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$*" \
    | tee -a "$LOG" 2>/dev/null \
    || true
}
|
||||||
|
|
||||||
|
# Print the value assigned to KEY in a dotenv-style FILE (first match only).
# Arguments: $1 - key name, $2 - path to the file
# Outputs:   the value on stdout, or nothing when the key/file is missing
# Only real assignments are honoured (optionally prefixed with "export");
# commented-out lines and keys that merely end in KEY are ignored. The value
# stops at the first whitespace and one pair of surrounding quotes is removed.
_watchdog_env_value() {
  local key=$1 file=$2 line value
  [[ -r "$file" ]] || return 0
  while IFS= read -r line || [[ -n "$line" ]]; do
    line=${line#"${line%%[![:space:]]*}"}          # trim leading whitespace
    if [[ "$line" == export[[:space:]]* ]]; then
      line=${line#export}
      line=${line#"${line%%[![:space:]]*}"}
    fi
    [[ "$line" == "${key}="* ]] || continue
    value=${line#"${key}="}
    value=${value%%[[:space:]]*}                   # also drops a trailing CR
    value=${value#[\"\']}
    value=${value%[\"\']}
    [[ -n "$value" ]] || continue
    printf '%s' "$value"
    return 0
  done < "$file"
  return 0
}

# Send a Telegram message using credentials read from $TOKEN_FILE.
# Globals:   TOKEN_FILE (read)
# Arguments: $1 - message text
# Returns:   always 0; notification is best effort and silently skipped when
#            TELEGRAM_BOT_TOKEN or TELEGRAM_CHAT_ID cannot be found.
notify_telegram() {
  local msg="$1"
  local token chat_id
  token=$(_watchdog_env_value TELEGRAM_BOT_TOKEN "$TOKEN_FILE" 2>/dev/null || true)
  chat_id=$(_watchdog_env_value TELEGRAM_CHAT_ID "$TOKEN_FILE" 2>/dev/null || true)
  if [[ -z "$token" || -z "$chat_id" ]]; then
    return 0
  fi
  # --data-urlencode keeps '&', '+', '%' and non-ASCII text intact in the form
  # body; --max-time stops a hung request from stalling the oneshot unit.
  curl -sf --max-time 15 -X POST "https://api.telegram.org/bot${token}/sendMessage" \
    --data-urlencode "chat_id=${chat_id}" \
    --data-urlencode "text=${msg}" > /dev/null 2>&1 || true
}
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
# Main flow: exit quietly when Docker is unusable or nothing is unhealthy,
# otherwise examine every container Docker currently reports as unhealthy.
# -----------------------------------------------------------------------------
if ! command -v docker &>/dev/null || ! docker info &>/dev/null; then
  log "Docker not available — skipping watchdog run"
  exit 0
fi

mapfile -t unhealthy < <(docker ps --filter health=unhealthy --format '{{.Names}}' 2>/dev/null || true)

if (( ${#unhealthy[@]} == 0 )); then
  exit 0
fi

log "Unhealthy containers detected: ${unhealthy[*]}"

for container in "${unhealthy[@]}"; do
  [[ -n "$container" ]] || continue

  # Exit codes of the recorded health probes, oldest first, space separated.
  # An inspect failure yields an empty list (=> 0 failures => wait).
  # NOTE(review): these entries are produced at the container's healthcheck
  # interval, not once per watchdog run — confirm the intended grace window.
  codes_raw=$(docker inspect \
    --format '{{if .State.Health}}{{range .State.Health.Log}}{{.ExitCode}} {{end}}{{end}}' \
    "$container" 2>/dev/null || true)
  read -r -a exit_codes <<< "$codes_raw" || true

  # Count failures (ExitCode != 0) among the last 3 entries (or fewer if the
  # log is shorter, in which case the count can never reach 3).
  total=${#exit_codes[@]}
  start=$(( total > 3 ? total - 3 : 0 ))
  failures=0
  for (( i = start; i < total; i++ )); do
    if [[ "${exit_codes[i]}" != "0" ]]; then
      failures=$(( failures + 1 ))
    fi
  done

  if (( failures == 3 )); then
    log "Auto-restarting $container (unhealthy 3/3 consecutive checks)"
    # Only announce a restart that actually happened.
    if restart_out=$(docker restart "$container" 2>&1); then
      log "Restarted $container"
      notify_telegram "🔄 docker-watchdog restarted $container (3 consecutive unhealthy health checks) — $(hostname)"
    else
      log "Restart of $container FAILED: ${restart_out%%$'\n'*}"
      notify_telegram "⚠️ docker-watchdog FAILED to restart $container (3 consecutive unhealthy health checks) — $(hostname)"
    fi
  else
    log "$container is unhealthy but only $failures/3 consecutive failures — waiting"
  fi
done
|
||||||
@ -30,8 +30,11 @@ LOAD_WARN=4.0 # absolute (not per-CPU)
|
|||||||
LOAD_CRIT=8.0
|
LOAD_CRIT=8.0
|
||||||
RAM_FREE_WARN_GB=3 # GB available
|
RAM_FREE_WARN_GB=3 # GB available
|
||||||
RAM_FREE_CRIT_GB=1
|
RAM_FREE_CRIT_GB=1
|
||||||
SWAP_USED_WARN_GB=1
|
SWAP_USED_WARN_GB=1.5
|
||||||
SWAP_USED_CRIT_GB=3
|
SWAP_USED_CRIT_GB=3
|
||||||
|
SWAP_CACHED_WARN_MB=200 # early-warning: recent swap pressure even if current usage looks ok
|
||||||
|
STEAL_WARN=5 # % steal time
|
||||||
|
STEAL_CRIT=15
|
||||||
CONTAINER_RESTART_WARN=10
|
CONTAINER_RESTART_WARN=10
|
||||||
CONTAINER_RESTART_CRIT=50
|
CONTAINER_RESTART_CRIT=50
|
||||||
BUILD_CACHE_WARN_GB=5
|
BUILD_CACHE_WARN_GB=5
|
||||||
@ -161,24 +164,66 @@ check_memory() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Print "<steal> <total>" (jiffies) from the aggregate "cpu" line of /proc/stat.
# Returns non-zero when the line cannot be read.
# The total sums user..steal only: guest time is already accounted inside
# user, so adding the guest column would count it twice. Summing in shell
# integer arithmetic avoids awk's floating-point number formatting.
_steal_sample() {
  local label user nice system idle iowait irq softirq steal rest
  read -r label user nice system idle iowait irq softirq steal rest \
    < /proc/stat || return 1
  [[ "$label" == "cpu" ]] || return 1
  printf '%s %s\n' "${steal:-0}" \
    "$(( user + nice + system + idle + ${iowait:-0} + ${irq:-0} + ${softirq:-0} + ${steal:-0} ))"
}

# Report the current CPU steal percentage.
# Globals:   STEAL_WARN, STEAL_CRIT (read) — integer percentages
# Calls:     header, record (defined elsewhere in this script)
# Two samples 1s apart are required: a single /proc/stat sample only gives the
# average since boot, not the current rate.
check_steal() {
  header "CPU STEAL"
  local steal1 total1 steal2 total2
  # An unreadable /proc/stat degrades to "0 0", i.e. a reported 0.0%.
  read -r steal1 total1 <<< "$(_steal_sample 2>/dev/null || echo '0 0')"
  sleep 1
  read -r steal2 total2 <<< "$(_steal_sample 2>/dev/null || echo '0 0')"

  local delta_steal=$(( steal2 - steal1 ))
  local delta_total=$(( total2 - total1 ))

  local steal_pct
  steal_pct=$(awk -v ds="$delta_steal" -v dt="$delta_total" 'BEGIN{
    if (dt <= 0 || ds < 0) { printf "0.0"; exit }
    printf "%.1f", (ds/dt)*100
  }')
  # Thresholds are integers, so compare against the truncated percentage.
  local steal_int=${steal_pct%%.*}

  if (( steal_int >= STEAL_CRIT )); then
    record steal CRIT "${steal_pct}%" "CPU steal ${steal_pct}% — CRITICAL (host is overcommitted)"
  elif (( steal_int >= STEAL_WARN )); then
    record steal WARN "${steal_pct}%" "CPU steal ${steal_pct}% — WARNING (host contention; degrades LLM inference)"
  else
    record steal OK "${steal_pct}%" "CPU steal OK (${steal_pct}%)"
  fi
}
|
||||||
|
|
||||||
# Report swap usage and recent swap pressure.
# Globals:   SWAP_USED_WARN_GB, SWAP_USED_CRIT_GB (read; may be fractional),
#            SWAP_CACHED_WARN_MB (read; integer)
# Calls:     header, record (defined elsewhere in this script)
# Severity order: used >= CRIT, used >= WARN, SwapCached >= threshold (WARN),
# otherwise OK. No swap configured at all is reported as CRIT.
check_swap() {
  header "SWAP"
  local swap_total_kb swap_free_kb swap_cached_kb
  swap_total_kb=$(awk '/^SwapTotal/ {print $2}' /proc/meminfo)
  swap_free_kb=$(awk '/^SwapFree/ {print $2}' /proc/meminfo)
  swap_cached_kb=$(awk '/^SwapCached/ {print $2}' /proc/meminfo)
  # Missing fields count as 0 so the arithmetic below cannot fail on them.
  swap_total_kb=${swap_total_kb:-0}
  swap_free_kb=${swap_free_kb:-0}
  swap_cached_kb=${swap_cached_kb:-0}

  if (( swap_total_kb == 0 )); then
    record swap CRIT "no swap" "NO SWAP configured — CRITICAL (add swapfile!)"
    return
  fi

  local swap_used_kb=$(( swap_total_kb - swap_free_kb ))
  local swap_total_gb=$(( swap_total_kb / 1024 / 1024 ))
  local swap_cached_mb=$(( swap_cached_kb / 1024 ))

  local swap_used_display
  swap_used_display=$(awk -v kb="$swap_used_kb" 'BEGIN{printf "%.1fG", kb/1024/1024}')

  # Compare in awk so fractional thresholds (e.g. 1.5 or 1.25) are honoured
  # exactly instead of being truncated to tenths.
  local level
  level=$(awk -v kb="$swap_used_kb" \
              -v warn="$SWAP_USED_WARN_GB" \
              -v crit="$SWAP_USED_CRIT_GB" 'BEGIN{
    gb = kb/1024/1024
    if (gb >= crit + 0)      print "CRIT"
    else if (gb >= warn + 0) print "WARN"
    else                     print "OK"
  }')

  case "$level" in
    CRIT)
      record swap CRIT "${swap_used_display} used" "Swap ${swap_used_display} used — CRITICAL"
      ;;
    WARN)
      record swap WARN "${swap_used_display} used" "Swap ${swap_used_display} used — WARNING (>${SWAP_USED_WARN_GB}G)"
      ;;
    *)
      if (( swap_cached_mb >= SWAP_CACHED_WARN_MB )); then
        # A large SwapCached value is used here as a sign of recent memory
        # pressure even though current swap usage is below the thresholds.
        record swap WARN "${swap_used_display} used, ${swap_cached_mb}MB cached" \
          "Swap pressure indicator: SwapCached ${swap_cached_mb}MB — recent memory pressure (threshold ${SWAP_CACHED_WARN_MB}MB)"
      else
        record swap OK "${swap_used_display} / ${swap_total_gb}G" "Swap OK (${swap_used_display} used, ${swap_cached_mb}MB cached)"
      fi
      ;;
  esac
}
|
||||||
|
|
||||||
@ -310,6 +355,7 @@ fi
|
|||||||
|
|
||||||
check_disk
|
check_disk
|
||||||
check_load
|
check_load
|
||||||
|
check_steal
|
||||||
check_memory
|
check_memory
|
||||||
check_swap
|
check_swap
|
||||||
check_docker_containers
|
check_docker_containers
|
||||||
|
|||||||
12
systemd/docker-health-watchdog.service
Normal file
12
systemd/docker-health-watchdog.service
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Restart Docker containers stuck in unhealthy state
|
||||||
|
Documentation=file:///usr/local/bin/docker-health-watchdog.sh
|
||||||
|
After=docker.service
|
||||||
|
Requires=docker.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=oneshot
|
||||||
|
User=root
|
||||||
|
Group=root
|
||||||
|
Environment="HERMES_HOME=/root/.hermes"
|
||||||
|
ExecStart=/usr/local/bin/docker-health-watchdog.sh
|
||||||
11
systemd/docker-health-watchdog.timer
Normal file
11
systemd/docker-health-watchdog.timer
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Run Docker health watchdog every 10 minutes
|
||||||
|
After=docker.service
|
||||||
|
|
||||||
|
[Timer]
|
||||||
|
OnBootSec=5min
|
||||||
|
OnUnitActiveSec=10min
|
||||||
|
AccuracySec=30s
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=timers.target
|
||||||
Loading…
Reference in New Issue
Block a user