bytelyst-devops-tools/scripts/VMs/HostingerVM/vm-health-check.sh
Hermes VM d9618ba7b0 feat(vm): Phases 1.2, 1.4, 2.1 — steal time, swap pressure, health watchdog
Phase 1.2 — CPU steal time metric in vm-health-check.sh:
- Samples /proc/stat twice 1s apart for accurate current steal %
- Thresholds: >5% WARN, >15% CRIT (currently 0.8% on this host)
- Inserts before memory check so steal is visible alongside load

Phase 1.4 — Swap pressure indicator:
- Reads SwapCached from /proc/meminfo as secondary metric
- Raises SWAP_USED_WARN_GB 1→1.5 to reduce noise (current usage 0.6G)
- New WARN path: SwapCached > 200MB signals recent pressure even when
  current swap usage looks ok (catches post-spike state)

Phase 2.1 — Docker health-check watchdog:
- docker-health-watchdog.sh: checks unhealthy containers every 10 min,
  restarts only after 3 consecutive failing health checks (30min grace)
- docker-health-watchdog.service + .timer: enabled, fires every 10 min
- Sends Telegram notification on each auto-restart
- Rollback: systemctl disable docker-health-watchdog.timer

Phase 2.2 already complete: sync_hermes_persistent_backup.py handles
diverge gracefully with rebase/reset-hard fallback; running successfully.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 21:31:09 +00:00

431 lines
17 KiB
Bash
Executable File

#!/usr/bin/env bash
# =============================================================================
# vm-health-check.sh — Hostinger VM Health Check (READ-ONLY)
#
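# Checks disk, load, CPU steal, memory, swap, Docker containers and Docker disk
# usage, log sizes, and automation drift (failed systemd units, cron entries
# pointing at missing scripts).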
# Prints a colour-coded report. Exits non-zero if any threshold is exceeded
# so it can drive cron alerts or CI gates.
#
# Usage:
# bash vm-health-check.sh # interactive report
# bash vm-health-check.sh --quiet # only print problems (exit 1 on WARNING, 2 on CRITICAL)
# bash vm-health-check.sh --json # machine-readable JSON output
# bash vm-health-check.sh --notify # send Telegram alert on WARNING/CRITICAL
#
# Exit codes:
# 0 — all green
# 1 — at least one WARNING
# 2 — at least one CRITICAL
# =============================================================================
# Strict mode: exit on unhandled errors (-e), on unset variables (-u) and on a
# failing pipeline stage (pipefail); -E lets ERR traps apply inside functions.
set -Eeuo pipefail
# Directory containing this script; not referenced by the code visible in this file.
# shellcheck disable=SC2034
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Destination of the one-line result summary written by log_to_file.
LOG_FILE="/var/log/vm-health-check.log"
# ── Thresholds ──────────────────────────────────────────────────────────────
# Each check compares its measurement against a WARN and a CRIT limit.
DISK_WARN=55 # % used
DISK_CRIT=70
LOAD_WARN=4.0 # absolute (not per-CPU)
LOAD_CRIT=8.0
RAM_FREE_WARN_GB=3 # GB available
RAM_FREE_CRIT_GB=1
# GB of swap in use (may be fractional; compared after scaling by 10).
SWAP_USED_WARN_GB=1.5
SWAP_USED_CRIT_GB=3
SWAP_CACHED_WARN_MB=200 # early-warning: recent swap pressure even if current usage looks ok
STEAL_WARN=5 # % steal time
STEAL_CRIT=15
# Restart count per container.
CONTAINER_RESTART_WARN=10
CONTAINER_RESTART_CRIT=50
# Docker build cache size in GB.
BUILD_CACHE_WARN_GB=5
BUILD_CACHE_CRIT_GB=20
# Image-size limits in GB; no check visible in this file compares against them yet.
# shellcheck disable=SC2034
DOCKER_IMAGES_WARN_GB=15
# shellcheck disable=SC2034
DOCKER_IMAGES_CRIT_GB=25
# ── Colour codes ────────────────────────────────────────────────────────────
# ANSI escape sequences; NC resets all attributes.
RED=$'\033[0;31m'
YELLOW=$'\033[1;33m'
GREEN=$'\033[0;32m'
CYAN=$'\033[0;36m'
BOLD=$'\033[1m'
NC=$'\033[0m'
# ── Flags ───────────────────────────────────────────────────────────────────
QUIET=false
JSON_MODE=false
NOTIFY=false

# parse_flags ARGS...
# Sets QUIET / JSON_MODE / NOTIFY from the command-line arguments.
# Unrecognised arguments are reported on stderr and otherwise ignored, so a
# typo such as "--notfy" no longer silently turns alerting off. The warning
# goes to stderr so --json output on stdout stays clean.
parse_flags() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --quiet) QUIET=true ;;
      --json) JSON_MODE=true ;;
      --notify) NOTIFY=true ;;
      *) printf 'vm-health-check: ignoring unknown argument: %s\n' "$arg" >&2 ;;
    esac
  done
}
parse_flags "$@"
# ── State tracking ──────────────────────────────────────────────────────────
# Highest severity recorded so far; also used as the script's exit code.
WORST_LEVEL=0 # 0=OK, 1=WARN, 2=CRIT
# One "[LEVEL] message" entry per non-OK check, filled in by record().
ISSUES=()
# Check name → JSON object string, filled in by record() and printed in --json mode.
declare -A JSON_DATA
# ── Helpers ─────────────────────────────────────────────────────────────────
# log_to_file MESSAGE...
# Appends "[<UTC ISO-8601 timestamp>] MESSAGE" to $LOG_FILE on a best-effort
# basis: nothing is written when the file exists but is not writable, or when
# it does not exist and its directory is not writable. Write errors are ignored.
log_to_file() {
  local parent_dir stamp
  parent_dir="$(dirname "$LOG_FILE")"
  if [[ -e "$LOG_FILE" ]]; then
    [[ -w "$LOG_FILE" ]] || return 0
  else
    [[ -w "$parent_dir" ]] || return 0
  fi
  stamp="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
  echo "[${stamp}] $*" >> "$LOG_FILE" 2>/dev/null || true
}
# status_icon LEVEL
# Prints a coloured one-character marker for LEVEL (OK, WARN or CRIT).
# Any other LEVEL prints nothing.
# The glyph keeps the severity readable when colour is stripped (cron mail,
# log files); previously only empty colour codes were emitted.
status_icon() {
  case "$1" in
    OK) printf '%s✓%s\n' "${GREEN}" "${NC}" ;;
    WARN) printf '%s⚠%s\n' "${YELLOW}" "${NC}" ;;
    CRIT) printf '%s✗%s\n' "${RED}" "${NC}" ;;
  esac
}
# level_int LEVEL
# Prints the numeric severity of LEVEL: WARN → 1, CRIT → 2, anything else
# (including OK) → 0.
level_int() {
  local code=0
  if [[ "$1" == "WARN" ]]; then
    code=1
  elif [[ "$1" == "CRIT" ]]; then
    code=2
  fi
  echo "$code"
}
# _json_escape STRING
# Prints STRING with backslash, double quote, newline, carriage return and tab
# escaped so it can be embedded in a JSON string literal.
_json_escape() {
  local s="$1"
  # Replacement text is kept in quoted variables so it is taken literally.
  local bs='\' dq='"'
  s=${s//"$bs"/"$bs$bs"}   # must run first, before other escapes add backslashes
  s=${s//"$dq"/"$bs$dq"}
  s=${s//$'\n'/"${bs}n"}
  s=${s//$'\r'/"${bs}r"}
  s=${s//$'\t'/"${bs}t"}
  printf '%s' "$s"
}

# record NAME LEVEL VALUE MESSAGE
# Registers the outcome of one check:
#   - raises WORST_LEVEL when LEVEL is more severe than anything seen so far,
#   - appends "[LEVEL] MESSAGE" to ISSUES for every non-OK level,
#   - stores a JSON object for NAME in JSON_DATA (VALUE and MESSAGE are
#     JSON-escaped, since they can carry container names, unit names or paths),
#   - prints a report line unless --json is active, or --quiet is active and
#     the level is OK.
record() {
  local name="$1" level="$2" value="$3" message="$4"
  local lvl_int
  lvl_int=$(level_int "$level")
  if (( lvl_int > WORST_LEVEL )); then WORST_LEVEL=$lvl_int; fi
  if [[ "$level" != "OK" ]]; then ISSUES+=("[$level] $message"); fi
  JSON_DATA["$name"]=$(printf '{"level":"%s","value":"%s","message":"%s"}' \
    "$level" "$(_json_escape "$value")" "$(_json_escape "$message")")
  if $QUIET && [[ "$level" == "OK" ]]; then return; fi
  if ! $JSON_MODE; then
    printf " %s %-30s %s\n" "$(status_icon "$level")" "$message" "${CYAN}($value)${NC}"
  fi
}
# header TITLE
# Prints a coloured section banner for TITLE, preceded by a blank line.
# Prints nothing in --json or --quiet mode.
header() {
  if $JSON_MODE || $QUIET; then
    return 0
  fi
  echo -e "\n${BOLD}${CYAN}── $1 ──────────────────────────────────────${NC}"
}
# ── Checks ──────────────────────────────────────────────────────────────────
# check_disk
# Records the usage percentage of the root filesystem against DISK_WARN and
# DISK_CRIT. The OK record also carries the free space in GB.
check_disk() {
  header "DISK"
  local row pct free_gb
  # One df call yields both columns; the last output line is the data row.
  row=$(df / --output=pcent,avail -BG | tail -1)
  read -r pct free_gb <<< "$row"
  pct=${pct%\%}
  free_gb=${free_gb%G}

  local level value message
  if (( pct >= DISK_CRIT )); then
    level="CRIT"
    value="${pct}%"
    message="Disk ${pct}% used — CRITICAL (>${DISK_CRIT}%)"
  elif (( pct >= DISK_WARN )); then
    level="WARN"
    value="${pct}%"
    message="Disk ${pct}% used — WARNING (>${DISK_WARN}%)"
  else
    level="OK"
    value="${pct}% used, ${free_gb}G free"
    message="Disk OK (${pct}%)"
  fi
  record disk "$level" "$value" "$message"
}
# check_load
# Records the 1-minute load average from /proc/loadavg against LOAD_WARN and
# LOAD_CRIT. The thresholds are absolute; the CPU count is only shown in the
# CRITICAL message.
check_load() {
  header "LOAD"
  local load1 load5 _load15
  read -r load1 load5 _load15 _ < /proc/loadavg
  local ncpu
  ncpu=$(nproc)

  # Bash arithmetic is integer-only, so the load and both limits are scaled
  # by 10 in awk (avoids a bc dependency) and compared as integers.
  local scaled cur_x10 warn_x10 crit_x10
  scaled=$(awk -v cur="$load1" -v warn="$LOAD_WARN" -v crit="$LOAD_CRIT" \
    'BEGIN { printf "%d %d %d\n", cur * 10, warn * 10, crit * 10 }')
  read -r cur_x10 warn_x10 crit_x10 <<< "$scaled"

  local level="OK"
  local value="$load1 (1m) / $load5 (5m)"
  local message="Load OK ($load1)"
  if (( cur_x10 >= crit_x10 )); then
    level="CRIT"
    value="$load1"
    message="Load avg $load1 — CRITICAL (>${LOAD_CRIT}, ${ncpu} CPUs)"
  elif (( cur_x10 >= warn_x10 )); then
    level="WARN"
    value="$load1"
    message="Load avg $load1 — WARNING (>${LOAD_WARN})"
  fi
  record load "$level" "$value" "$message"
}
# check_memory
# Records MemAvailable from /proc/meminfo, truncated to whole GB, against
# RAM_FREE_WARN_GB and RAM_FREE_CRIT_GB (alerts when available memory is BELOW
# the limit). The OK record also shows total RAM.
check_memory() {
  header "MEMORY"
  local meminfo avail_kb all_kb
  # Single pass over /proc/meminfo; both values are reported in kB.
  meminfo=$(awk '
    /^MemAvailable/ { avail = $2 }
    /^MemTotal/     { total = $2 }
    END             { print avail, total }' /proc/meminfo)
  read -r avail_kb all_kb <<< "$meminfo"

  local avail_gb=$(( avail_kb / 1024 / 1024 ))
  local all_gb=$(( all_kb / 1024 / 1024 ))

  local level value message
  if (( avail_gb < RAM_FREE_CRIT_GB )); then
    level="CRIT"
    value="${avail_gb}G avail"
    message="RAM available ${avail_gb}G — CRITICAL (<${RAM_FREE_CRIT_GB}G)"
  elif (( avail_gb < RAM_FREE_WARN_GB )); then
    level="WARN"
    value="${avail_gb}G avail"
    message="RAM available ${avail_gb}G — WARNING (<${RAM_FREE_WARN_GB}G)"
  else
    level="OK"
    value="${avail_gb}G / ${all_gb}G"
    message="RAM OK (${avail_gb}G available)"
  fi
  record ram "$level" "$value" "$message"
}
# _cpu_steal_sample
# Prints "<steal> <total>" (cumulative jiffies) taken from the aggregate "cpu"
# line of /proc/stat. Returns 1 when that line cannot be read.
# The total sums user, nice, system, idle, iowait, irq, softirq and steal.
# The guest columns are left out on purpose: Linux already accounts guest time
# inside user/nice, so adding them would count that time twice.
_cpu_steal_sample() {
  [[ -r /proc/stat ]] || return 1
  local label user nice system idle iowait irq softirq steal _rest
  while read -r label user nice system idle iowait irq softirq steal _rest; do
    if [[ "$label" == "cpu" ]]; then
      steal=${steal:-0}
      printf '%d %d\n' "$steal" \
        "$(( user + nice + system + idle + ${iowait:-0} + ${irq:-0} + ${softirq:-0} + steal ))"
      return 0
    fi
  done < /proc/stat
  return 1
}

# check_steal
# Records the current CPU steal percentage against STEAL_WARN and STEAL_CRIT.
# Two /proc/stat samples are taken one second apart, because a single sample
# only gives the average since boot.
# All arithmetic is done with bash integers. Formatting the counters through
# awk can print large values in %.6g notation on some awk builds, which turns
# every one-second delta into 0 once the counters pass 2^31.
check_steal() {
  header "CPU STEAL"
  local first second
  if ! first=$(_cpu_steal_sample); then
    record steal OK "n/a" "CPU steal unavailable (/proc/stat not readable)"
    return
  fi
  sleep 1
  if ! second=$(_cpu_steal_sample); then
    record steal OK "n/a" "CPU steal unavailable (/proc/stat not readable)"
    return
  fi

  local steal_a total_a steal_b total_b
  read -r steal_a total_a <<< "$first"
  read -r steal_b total_b <<< "$second"
  local delta_steal=$(( steal_b - steal_a ))
  local delta_total=$(( total_b - total_a ))
  # A counter that appears to run backwards is treated as "no steal".
  if (( delta_steal < 0 )); then delta_steal=0; fi

  # Percentage in tenths, rounded to nearest; stays 0 when no time elapsed.
  local tenths=0
  if (( delta_total > 0 )); then
    tenths=$(( (delta_steal * 1000 + delta_total / 2) / delta_total ))
  fi
  local steal_int=$(( tenths / 10 ))
  local steal_pct="${steal_int}.$(( tenths % 10 ))"

  if (( steal_int >= STEAL_CRIT )); then
    record steal CRIT "${steal_pct}%" "CPU steal ${steal_pct}% — CRITICAL (host is overcommitted)"
  elif (( steal_int >= STEAL_WARN )); then
    record steal WARN "${steal_pct}%" "CPU steal ${steal_pct}% — WARNING (host contention; degrades LLM inference)"
  else
    record steal OK "${steal_pct}%" "CPU steal OK (${steal_pct}%)"
  fi
}
# check_swap
# Records swap health from /proc/meminfo:
#   - no swap configured at all            → CRIT
#   - used swap >= SWAP_USED_CRIT_GB       → CRIT
#   - used swap >= SWAP_USED_WARN_GB       → WARN
#   - SwapCached >= SWAP_CACHED_WARN_MB    → WARN (secondary indicator)
#   - otherwise                            → OK
check_swap() {
  header "SWAP"
  local meminfo total_kb free_kb cached_kb
  meminfo=$(awk '
    /^SwapTotal/  { total = $2 }
    /^SwapFree/   { free = $2 }
    /^SwapCached/ { cached = $2 }
    END           { print total, free, cached }' /proc/meminfo)
  read -r total_kb free_kb cached_kb <<< "$meminfo"

  local used_kb=$(( total_kb - free_kb ))
  local total_gb=$(( total_kb / 1024 / 1024 ))
  local cached_mb=$(( cached_kb / 1024 ))

  if (( total_kb == 0 )); then
    record swap CRIT "no swap" "NO SWAP configured — CRITICAL (add swapfile!)"
    return
  fi

  # The warn limit can be fractional (1.5), so usage and limits are scaled by
  # 10 in awk and compared as integers; the same call formats the display value.
  local scaled used_x10 warn_x10 crit_x10 used_display
  scaled=$(awk -v kb="$used_kb" -v warn="$SWAP_USED_WARN_GB" -v crit="$SWAP_USED_CRIT_GB" \
    'BEGIN { printf "%d %d %d %.1fG\n", (kb/1024/1024)*10, warn*10, crit*10, kb/1024/1024 }')
  read -r used_x10 warn_x10 crit_x10 used_display <<< "$scaled"

  local level value message
  if (( used_x10 >= crit_x10 )); then
    level="CRIT"
    value="${used_display} used"
    message="Swap ${used_display} used — CRITICAL"
  elif (( used_x10 >= warn_x10 )); then
    level="WARN"
    value="${used_display} used"
    message="Swap ${used_display} used — WARNING (>${SWAP_USED_WARN_GB}G)"
  elif (( cached_mb >= SWAP_CACHED_WARN_MB )); then
    # A large SwapCached is used here as a sign of recent memory pressure even
    # though the amount of swap currently in use is below the limits.
    level="WARN"
    value="${used_display} used, ${cached_mb}MB cached"
    message="Swap pressure indicator: SwapCached ${cached_mb}MB — recent memory pressure (threshold ${SWAP_CACHED_WARN_MB}MB)"
  else
    level="OK"
    value="${used_display} / ${total_gb}G"
    message="Swap OK (${used_display} used, ${cached_mb}MB cached)"
  fi
  record swap "$level" "$value" "$message"
}
# check_docker_containers
# Records three things about Docker:
#   - docker_daemon   : WARN when the docker CLI is missing or the daemon does
#                       not answer "docker info" (the remaining checks are skipped)
#   - container_loops : containers whose restart count reached
#                       CONTAINER_RESTART_WARN / CONTAINER_RESTART_CRIT
#   - container_health: running containers whose healthcheck reports "unhealthy"
check_docker_containers() {
  header "DOCKER CONTAINERS"
  if ! command -v docker &>/dev/null || ! docker info &>/dev/null; then
    record docker_daemon WARN "not running" "Docker daemon is not running"
    return
  fi

  # Crash-looping containers.
  # "docker ps --format" has no RestartCount placeholder, so the count is read
  # from "docker container inspect", where it is a top-level field.
  local container_ids=() looping_warn=() looping_crit=()
  local name restarts
  mapfile -t container_ids < <(docker ps -aq 2>/dev/null || true)
  if (( ${#container_ids[@]} > 0 )); then
    while IFS=$'\t' read -r name restarts; do
      name="${name#/}" # inspect reports names with a leading slash
      [[ -z "$name" ]] && continue
      [[ "$restarts" =~ ^[0-9]+$ ]] || restarts=0
      if (( restarts >= CONTAINER_RESTART_CRIT )); then
        looping_crit+=("$name(${restarts}x)")
      elif (( restarts >= CONTAINER_RESTART_WARN )); then
        looping_warn+=("$name(${restarts}x)")
      fi
    # "|| true": a container removed between ps and inspect must not abort the check.
    done < <(docker container inspect --format $'{{.Name}}\t{{.RestartCount}}' \
      "${container_ids[@]}" 2>/dev/null || true)
  fi
  if (( ${#looping_crit[@]} > 0 )); then
    record container_loops CRIT "${looping_crit[*]}" "Crash-looping containers: ${looping_crit[*]}"
  elif (( ${#looping_warn[@]} > 0 )); then
    record container_loops WARN "${looping_warn[*]}" "Containers restarting: ${looping_warn[*]}"
  else
    record container_loops OK "0 looping" "No containers crash-looping"
  fi

  # Unhealthy containers (running but health check failing)
  local unhealthy
  unhealthy=$(docker ps --filter health=unhealthy --format '{{.Names}}' 2>/dev/null | paste -sd, - || true)
  if [[ -n "$unhealthy" ]]; then
    record container_health WARN "$unhealthy" "Unhealthy containers: $unhealthy"
  else
    record container_health OK "all healthy" "All containers passing healthchecks"
  fi
}
# _docker_size_to_gb_int SIZE
# Converts a Docker human-readable size ("212B", "1.5kB", "830MB", "16.43GB",
# "1.2TB") to whole decimal gigabytes, truncated. Empty or unit-less input is
# treated as bytes.
_docker_size_to_gb_int() {
  awk -v s="$1" 'BEGIN {
    n = s + 0                      # numeric prefix
    unit = s
    sub(/^[0-9.]+/, "", unit)      # what is left is the unit
    if (unit == "PB") n *= 1000000
    else if (unit == "TB") n *= 1000
    else if (unit == "GB") n *= 1
    else if (unit == "MB") n /= 1000
    else if (unit == "kB" || unit == "KB") n /= 1000000
    else n /= 1000000000
    printf "%d\n", n
  }'
}

# check_docker_disk
# Records Docker disk usage; does nothing when Docker is unavailable.
#   - build_cache  : size of the "Build Cache" row of "docker system df"
#                    against BUILD_CACHE_WARN_GB / BUILD_CACHE_CRIT_GB
#   - docker_images: size of the "Images" row (informational, always OK)
# Non-verbose "docker system df" only offers per-row placeholders, so both
# sizes are taken from a single {{.Type}}/{{.Size}} listing.
check_docker_disk() {
  header "DOCKER DISK"
  if ! command -v docker &>/dev/null || ! docker info &>/dev/null; then
    return 0
  fi

  local usage
  usage=$(docker system df --format $'{{.Type}}\t{{.Size}}' 2>/dev/null || true)

  # Build cache
  local cache_size cache_int
  cache_size=$(awk -F'\t' '$1 == "Build Cache" { print $2; exit }' <<< "$usage")
  cache_size="${cache_size:-0GB}"
  cache_int=$(_docker_size_to_gb_int "$cache_size")
  if (( cache_int >= BUILD_CACHE_CRIT_GB )); then
    record build_cache CRIT "${cache_size}" "Docker build cache ${cache_size} — CRITICAL (>${BUILD_CACHE_CRIT_GB}GB, run: docker builder prune -f)"
  elif (( cache_int >= BUILD_CACHE_WARN_GB )); then
    record build_cache WARN "${cache_size}" "Docker build cache ${cache_size} — WARNING (run: docker builder prune -f)"
  else
    record build_cache OK "${cache_size}" "Build cache OK (${cache_size})"
  fi

  # Images total size
  local images_size
  images_size=$(awk -F'\t' '$1 == "Images" { print $2; exit }' <<< "$usage")
  images_size="${images_size:-?}"
  record docker_images OK "$images_size" "Docker images: $images_size"
}
# check_logs
# Records log sizes:
#   - journal: size reported by "journalctl --disk-usage", WARN above 300 MB
#   - syslog : size of /var/log/syslog (0 when absent), WARN above 100 MB
# journalctl prints a human-readable figure such as "48.0M" or "1.2G". It is
# converted to whole megabytes here so that every unit is honoured and the
# value is a plain integer that bash arithmetic accepts.
check_logs() {
  header "LOGS"
  local journal_raw journal_mb
  # First "<number><unit>" token of the sentence; empty when journalctl is
  # missing or prints nothing recognisable.
  journal_raw=$(journalctl --disk-usage 2>/dev/null \
    | grep -oE '[0-9]+(\.[0-9]+)?[[:space:]]?[BKMGTPE](i?B)?\b' | head -1 || true)
  journal_mb=$(awk -v s="$journal_raw" 'BEGIN {
    n = s + 0
    unit = s
    sub(/^[0-9.]+[ \t]*/, "", unit)
    unit = substr(unit, 1, 1)
    if (unit == "B") n /= 1048576
    else if (unit == "K") n /= 1024
    else if (unit == "G") n *= 1024
    else if (unit == "T") n *= 1048576
    else if (unit == "P") n *= 1073741824
    printf "%d\n", n
  }')
  journal_mb="${journal_mb:-0}"

  local syslog_mb=0
  if [[ -f /var/log/syslog ]]; then
    syslog_mb=$(du -sm /var/log/syslog 2>/dev/null | cut -f1 || true)
    # Anything other than a plain integer (e.g. du failed) counts as 0.
    [[ "$syslog_mb" =~ ^[0-9]+$ ]] || syslog_mb=0
  fi

  if (( journal_mb > 300 )); then
    record journal WARN "${journal_mb}MB" "Journal ${journal_mb}MB — WARNING (run: journalctl --vacuum-size=200M)"
  else
    record journal OK "${journal_mb}MB" "Journal OK (${journal_mb}MB)"
  fi
  if (( syslog_mb > 100 )); then
    record syslog WARN "${syslog_mb}MB" "syslog ${syslog_mb}MB — WARNING"
  else
    record syslog OK "${syslog_mb}MB" "syslog OK (${syslog_mb}MB)"
  fi
}
# check_automation_drift
# Records two signs that scheduled automation has drifted from reality:
#   - failed_units      : systemd units listed by "systemctl --failed"
#   - cron_missing_paths: script-like paths referenced by the invoking user's
#                         crontab that do not exist on disk
check_automation_drift() {
  header "AUTOMATION DRIFT"

  local failed_units
  failed_units=$(systemctl --failed --no-legend --plain 2>/dev/null | awk '{print $1}' | paste -sd, - || true)
  if [[ -z "$failed_units" ]]; then
    record failed_units OK "0 failed" "No failed systemd units"
  else
    record failed_units WARN "$failed_units" "Failed systemd units: $failed_units"
  fi

  local missing_paths=()
  local cron_line path clean_path suffix
  while IFS= read -r cron_line; do
    # Skip blank lines, comments and VAR=value environment lines.
    if [[ -z "$cron_line" || "$cron_line" =~ ^[[:space:]]*# ]]; then
      continue
    fi
    if [[ "$cron_line" =~ ^[A-Za-z_][A-Za-z0-9_]*= ]]; then
      continue
    fi
    while IFS= read -r path; do
      # Drop at most one trailing quote / punctuation character of each kind,
      # in this fixed order.
      clean_path="$path"
      for suffix in '"' "'" ',' ';' ')'; do
        clean_path="${clean_path%"$suffix"}"
      done
      case "$clean_path" in
        /var/log/*|/run/*|/proc/*|/sys/*|/dev/*) continue ;;
        *.sh|*.py|*/scripts/*) ;; # script-like: verify it exists
        *) continue ;;
      esac
      [[ -e "$clean_path" ]] || missing_paths+=("$clean_path")
    done < <(grep -oE '/(opt|root|home|usr/local|etc)/[^[:space:]|;&<>]+' <<< "$cron_line" || true)
  done < <(crontab -l 2>/dev/null || true)

  if (( ${#missing_paths[@]} == 0 )); then
    record cron_missing_paths OK "0 missing" "No missing script paths in root crontab"
  else
    record cron_missing_paths WARN "${missing_paths[*]}" "Cron references missing path(s): ${missing_paths[*]}"
  fi
}
# ── Run all checks ───────────────────────────────────────────────────────────
# Report banner (interactive mode only). Hostname and timestamp are separated
# by " — "; they used to be printed fused together.
if ! $JSON_MODE && ! $QUIET; then
  printf '\n%sVM Health Check — %s — %s%s\n' \
    "${BOLD}" "$(hostname)" "$(date -u '+%Y-%m-%d %H:%M UTC')" "${NC}"
fi
# Each check prints its own section and reports through record(), which
# accumulates WORST_LEVEL, ISSUES and JSON_DATA for the summary below.
check_disk
check_load
check_steal
check_memory
check_swap
check_docker_containers
check_docker_disk
check_logs
check_automation_drift
# ── Summary ──────────────────────────────────────────────────────────────────
# Final report: a JSON document in --json mode, otherwise a short coloured
# summary listing every recorded issue.
if $JSON_MODE; then
  if (( WORST_LEVEL == 1 )); then
    _overall='"WARN"'
  elif (( WORST_LEVEL >= 2 )); then
    _overall='"CRIT"'
  else
    _overall='"OK"'
  fi
  printf '%s\n' '{'
  printf ' "timestamp": "%s",\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
  printf ' "hostname": "%s",\n' "$(hostname)"
  printf ' "overall": %s,\n' "${_overall}"
  printf '%s\n' ' "checks": {'
  # Every entry except the last one is followed by a comma.
  check_names=("${!JSON_DATA[@]}")
  remaining=${#check_names[@]}
  for k in "${check_names[@]}"; do
    remaining=$(( remaining - 1 ))
    if (( remaining > 0 )); then comma=","; else comma=""; fi
    printf ' "%s": %s%s\n' "$k" "${JSON_DATA[$k]}" "$comma"
  done
  printf '%s\n' ' }'
  printf '%s\n' '}'
else
  echo ""
  if (( WORST_LEVEL == 0 )); then
    echo -e " ${GREEN}${BOLD}✓ All checks passed${NC}"
  else
    # Warnings and criticals share one layout; only colour and title differ.
    if (( WORST_LEVEL == 1 )); then
      summary_colour="${YELLOW}"
      summary_title="${#ISSUES[@]} warning(s):"
    else
      summary_colour="${RED}"
      summary_title="${#ISSUES[@]} issue(s) — action required:"
    fi
    echo -e " ${summary_colour}${BOLD}${summary_title}${NC}"
    for issue in "${ISSUES[@]}"; do
      echo -e " ${summary_colour}${NC} $issue"
    done
  fi
  echo ""
fi
# ── Telegram notification ─────────────────────────────────────────────────
# _env_file_value KEY FILE
# Prints the value assigned to KEY in a dotenv-style FILE, or nothing.
# Only real assignments count ("KEY=..." or "export KEY=..." at the start of a
# line, after optional indentation). Commented-out lines and keys that merely
# end in KEY are ignored, and the last assignment wins. The value ends at the
# first whitespace, and one pair of surrounding quotes is removed.
_env_file_value() {
  local key="$1" file="$2" line value=""
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line#"${line%%[![:space:]]*}"}" # strip leading whitespace
    line="${line#export }"
    line="${line#"${line%%[![:space:]]*}"}"
    if [[ "$line" == "$key="* ]]; then
      value="${line#*=}"
    fi
  done < "$file"
  value="${value%%[[:space:]]*}"
  if [[ "$value" == \"*\" || "$value" == \'*\' ]]; then
    value="${value:1:${#value}-2}"
  fi
  printf '%s' "$value"
}

# notify_telegram
# Sends the recorded ISSUES to Telegram, using TELEGRAM_BOT_TOKEN and
# TELEGRAM_CHAT_ID from "${HERMES_HOME:-/root/.hermes}/.env". Best effort:
# returns 0 without sending when the file or either value is missing, and
# ignores curl failures.
notify_telegram() {
  local env_file="${HERMES_HOME:-/root/.hermes}/.env"
  local token="" chat_id=""
  if [[ -f "$env_file" ]]; then
    token=$(_env_file_value TELEGRAM_BOT_TOKEN "$env_file")
    chat_id=$(_env_file_value TELEGRAM_CHAT_ID "$env_file")
  fi
  if [[ -z "$token" || -z "$chat_id" ]]; then
    return 0
  fi

  local severity
  if (( WORST_LEVEL >= 2 )); then
    severity="🚨 CRITICAL"
  else
    severity="⚠️ WARNING"
  fi
  local msg
  msg="${severity} — $(hostname) VM health check
$(date -u '+%Y-%m-%d %H:%M UTC')
$(printf '%s\n' "${ISSUES[@]}")"

  # --data-urlencode is required: the message contains "%", ">", "&" and
  # newlines, which plain -d would send unencoded. --max-time keeps an
  # unreachable API from stalling the run.
  curl -sf --max-time 15 -X POST "https://api.telegram.org/bot${token}/sendMessage" \
    --data-urlencode "chat_id=${chat_id}" \
    --data-urlencode "text=${msg}" > /dev/null || true
}

# ── Telegram notification ─────────────────────────────────────────────────
if $NOTIFY && (( WORST_LEVEL > 0 )); then
  notify_telegram
fi
# ── Log result ───────────────────────────────────────────────────────────────
# Write a one-line summary to the log, then exit with the worst severity seen
# (0 = OK, 1 = WARN, 2 = CRIT) so that cron jobs and CI gates can react.
RESULT_STR="OK"
if (( WORST_LEVEL == 1 )); then
  RESULT_STR="WARN"
fi
if (( WORST_LEVEL >= 2 )); then
  RESULT_STR="CRIT"
fi
log_to_file "health-check result=$RESULT_STR issues=${#ISSUES[@]}"
exit "$WORST_LEVEL"