bytelyst-devops-tools/scripts/VMs/HostingerVM/vm-health-check.sh

#!/usr/bin/env bash
# =============================================================================
# vm-health-check.sh — Hostinger VM Health Check (READ-ONLY)
#
# Checks disk, memory, load, swap, and Docker container health.
# Prints a colour-coded report. Exits non-zero if any threshold is exceeded
# so it can drive cron alerts or CI gates.
#
# Usage:
#   bash vm-health-check.sh              # interactive report
#   bash vm-health-check.sh --quiet      # only print problems (exit 1 on WARNING, 2 on CRITICAL)
#   bash vm-health-check.sh --json       # machine-readable JSON output
#   bash vm-health-check.sh --notify     # send Telegram alert on WARNING/CRITICAL
#
# Exit codes:
#   0 — all green
#   1 — at least one WARNING
#   2 — at least one CRITICAL
# =============================================================================
# Strict mode: -e exit on unhandled error, -u error on unset variables,
# -o pipefail fail a pipeline if any stage fails, -E let ERR traps be inherited
# by functions and subshells.
set -Eeuo pipefail

# Directory containing this script. Not referenced by the code visible in this
# file, hence the SC2034 (unused variable) suppression.
# shellcheck disable=SC2034
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Destination of the one-line result record appended by log_to_file().
LOG_FILE="/var/log/vm-health-check.log"

# ── Thresholds ──────────────────────────────────────────────────────────────
# Each check compares its measurement against a *_WARN and a *_CRIT value.
# Disk, load, steal, swap-used, restart-count and build-cache checks trigger
# when the measurement is >= the threshold; the RAM check triggers when the
# available amount is < the threshold.
DISK_WARN=55           # % used
DISK_CRIT=70
LOAD_WARN=4.0          # absolute (not per-CPU)
LOAD_CRIT=8.0
RAM_FREE_WARN_GB=3     # GB available
RAM_FREE_CRIT_GB=1
SWAP_USED_WARN_GB=1.5
SWAP_USED_CRIT_GB=3
SWAP_CACHED_WARN_MB=200   # early-warning: recent swap pressure even if current usage looks ok
STEAL_WARN=5              # % steal time
STEAL_CRIT=15
CONTAINER_RESTART_WARN=10
CONTAINER_RESTART_CRIT=50
BUILD_CACHE_WARN_GB=5
BUILD_CACHE_CRIT_GB=20
# The two image-size thresholds below are declared but not compared against
# anything in this file (the images check always records OK), hence SC2034.
# shellcheck disable=SC2034
DOCKER_IMAGES_WARN_GB=15
# shellcheck disable=SC2034
DOCKER_IMAGES_CRIT_GB=25

# ── Colour codes ────────────────────────────────────────────────────────────
# ANSI escape sequences stored as real ESC bytes via $'...' quoting, so they
# can be emitted with plain printf '%s' as well as echo -e.
RED=$'\033[0;31m'
YELLOW=$'\033[1;33m'
GREEN=$'\033[0;32m'
CYAN=$'\033[0;36m'
BOLD=$'\033[1m'
NC=$'\033[0m'

# ── Flags ───────────────────────────────────────────────────────────────────
# Output-mode switches. Each holds the literal word "true" or "false" so it can
# be executed directly as a command in conditionals (e.g. `if $QUIET; then`).
QUIET=false
JSON_MODE=false
NOTIFY=false

# Scan the positional parameters without consuming them; words that are not a
# recognised flag are ignored.
for arg; do
  if [[ "$arg" == "--quiet" ]]; then
    QUIET=true
  elif [[ "$arg" == "--json" ]]; then
    JSON_MODE=true
  elif [[ "$arg" == "--notify" ]]; then
    NOTIFY=true
  fi
done

# ── State tracking ──────────────────────────────────────────────────────────
# WORST_LEVEL : highest severity recorded so far (0=OK, 1=WARN, 2=CRIT)
# ISSUES      : one "[LEVEL] message" entry per non-OK result
# JSON_DATA   : check name -> JSON object fragment, used by --json output
WORST_LEVEL=0
ISSUES=()
declare -A JSON_DATA

# ── Helpers ─────────────────────────────────────────────────────────────────
#######################################
# Append a UTC-timestamped line to LOG_FILE on a best-effort basis.
# Globals:   LOG_FILE (read)
# Arguments: $* - text of the log entry
# Returns:   0 always; silently does nothing when the log file exists but is
#            not writable, or does not exist and its directory is not writable.
#######################################
log_to_file() {
  local parent_dir stamp
  parent_dir="$(dirname "$LOG_FILE")"

  if [[ -e "$LOG_FILE" ]]; then
    # Existing file: we need write permission on the file itself.
    if [[ ! -w "$LOG_FILE" ]]; then return 0; fi
  else
    # File would have to be created: we need a writable directory.
    if [[ ! -w "$parent_dir" ]]; then return 0; fi
  fi

  stamp="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
  echo "[${stamp}] $*" >> "$LOG_FILE" 2>/dev/null || true
}

#######################################
# Print the coloured glyph for a severity level.
# Globals:   GREEN, YELLOW, RED, NC (read)
# Arguments: $1 - OK | WARN | CRIT
# Outputs:   glyph followed by a newline; nothing for any other level
#######################################
status_icon() {
  local glyph
  case "$1" in
    OK)   glyph="${GREEN}✓" ;;
    WARN) glyph="${YELLOW}⚠" ;;
    CRIT) glyph="${RED}✗" ;;
    *)    return 0 ;;
  esac
  echo -e "${glyph}${NC}"
}

#######################################
# Map a severity name to its numeric rank.
# Arguments: $1 - severity name
# Outputs:   1 for WARN, 2 for CRIT, 0 for OK and for anything else
#######################################
level_int() {
  local rank=0
  if [[ "$1" == "WARN" ]]; then
    rank=1
  elif [[ "$1" == "CRIT" ]]; then
    rank=2
  fi
  echo "$rank"
}

#######################################
# Escape a string for embedding inside a double-quoted JSON string.
# Handles backslash, double quote, newline, carriage return and tab; other
# control characters are passed through unchanged.
# Arguments: $1 - raw text
# Outputs:   escaped text, no trailing newline
#######################################
_json_escape() {
  local raw="$1" escaped="" ch i
  for (( i = 0; i < ${#raw}; i++ )); do
    ch="${raw:i:1}"
    case "$ch" in
      '\')   escaped+='\\' ;;
      '"')   escaped+='\"' ;;
      $'\n') escaped+='\n' ;;
      $'\r') escaped+='\r' ;;
      $'\t') escaped+='\t' ;;
      *)     escaped+="$ch" ;;
    esac
  done
  printf '%s' "$escaped"
}

#######################################
# Register the outcome of one check and, unless suppressed, print its line.
# Globals:   WORST_LEVEL (raised if this result is more severe),
#            ISSUES (appended for non-OK results),
#            JSON_DATA (entry written under NAME),
#            QUIET, JSON_MODE, CYAN, NC (read)
# Arguments: $1 - check name (JSON_DATA key)
#            $2 - level: OK | WARN | CRIT
#            $3 - short measured value
#            $4 - human-readable message
# Outputs:   one report line on stdout, except in JSON mode or for OK results
#            in quiet mode
#######################################
record() {
  # record NAME LEVEL VALUE MESSAGE
  local name="$1" level="$2" value="$3" message="$4"
  local lvl_int
  lvl_int=$(level_int "$level")
  if (( lvl_int > WORST_LEVEL )); then WORST_LEVEL=$lvl_int; fi
  if [[ "$level" != "OK" ]]; then ISSUES+=("[$level] $message"); fi

  # Values and messages can contain quotes or backslashes (container names,
  # unit names, cron paths), so escape them to keep --json output parseable.
  JSON_DATA["$name"]=$(printf '{"level":"%s","value":"%s","message":"%s"}' \
    "$(_json_escape "$level")" "$(_json_escape "$value")" "$(_json_escape "$message")")

  if $QUIET && [[ "$level" == "OK" ]]; then return 0; fi
  if ! $JSON_MODE; then
    printf "  %s  %-30s %s\n" "$(status_icon "$level")" "$message" "${CYAN}($value)${NC}"
  fi
}

#######################################
# Print a section heading for the interactive report.
# Globals:   JSON_MODE, QUIET, BOLD, CYAN, NC (read)
# Arguments: $1 - section title
# Outputs:   blank line plus the heading; nothing in JSON or quiet mode
#######################################
header() {
  if $JSON_MODE || $QUIET; then
    return 0
  fi
  local banner="\n${BOLD}${CYAN}── $1 ──────────────────────────────────────${NC}"
  echo -e "$banner"
}

# ── Checks ──────────────────────────────────────────────────────────────────

#######################################
# Check usage of the root filesystem against DISK_WARN / DISK_CRIT.
# Globals:   DISK_WARN, DISK_CRIT (read)
# Outputs:   one result via record() under the name "disk"
#######################################
check_disk() {
  header "DISK"

  # Keep only the last line of each df report and strip padding and the unit.
  local pct_used gb_free
  pct_used=$(df / --output=pcent | sed -e '$!d' -e 's/[ %]//g')
  gb_free=$(df / --output=avail -BG | sed -e '$!d' -e 's/[ G]//g')

  if (( pct_used >= DISK_CRIT )); then
    record disk CRIT "${pct_used}%" "Disk ${pct_used}% used — CRITICAL (>${DISK_CRIT}%)"
    return
  fi
  if (( pct_used >= DISK_WARN )); then
    record disk WARN "${pct_used}%" "Disk ${pct_used}% used — WARNING (>${DISK_WARN}%)"
    return
  fi
  record disk OK "${pct_used}% used, ${gb_free}G free" "Disk OK (${pct_used}%)"
}

#######################################
# Check the 1-minute load average against LOAD_WARN / LOAD_CRIT.
# Globals:   LOAD_WARN, LOAD_CRIT (read)
# Outputs:   one result via record() under the name "load"
#######################################
check_load() {
  header "LOAD"

  local one_min five_min _rest cpu_count
  read -r one_min five_min _rest < /proc/loadavg
  cpu_count=$(nproc)

  # Bash arithmetic is integer-only, so scale everything by 10 (truncating)
  # and compare the scaled integers; awk does the float multiplication.
  local cur_x10 warn_x10 crit_x10
  cur_x10=$(awk -v v="$one_min" 'BEGIN { printf "%d", v * 10 }')
  warn_x10=$(awk -v v="$LOAD_WARN" 'BEGIN { printf "%d", v * 10 }')
  crit_x10=$(awk -v v="$LOAD_CRIT" 'BEGIN { printf "%d", v * 10 }')

  local level="OK"
  if (( cur_x10 >= crit_x10 )); then
    level="CRIT"
  elif (( cur_x10 >= warn_x10 )); then
    level="WARN"
  fi

  case "$level" in
    CRIT) record load CRIT "$one_min" "Load avg $one_min — CRITICAL (>${LOAD_CRIT}, ${cpu_count} CPUs)" ;;
    WARN) record load WARN "$one_min" "Load avg $one_min — WARNING (>${LOAD_WARN})" ;;
    *)    record load OK   "$one_min (1m) / $five_min (5m)" "Load OK ($one_min)" ;;
  esac
}

#######################################
# Check MemAvailable (in whole GB, rounded down) against RAM_FREE_WARN_GB /
# RAM_FREE_CRIT_GB.
# Globals:   RAM_FREE_WARN_GB, RAM_FREE_CRIT_GB (read)
# Outputs:   one result via record() under the name "ram"
#######################################
check_memory() {
  header "MEMORY"

  # Single pass over /proc/meminfo picking out the two fields we need (kB).
  local field amount _unit
  local avail_kb="" mem_total_kb=""
  while read -r field amount _unit; do
    case "$field" in
      MemAvailable:) avail_kb="$amount" ;;
      MemTotal:)     mem_total_kb="$amount" ;;
    esac
  done < /proc/meminfo

  local avail_gb=$(( avail_kb / 1024 / 1024 ))
  local mem_total_gb=$(( mem_total_kb / 1024 / 1024 ))

  if (( avail_gb < RAM_FREE_CRIT_GB )); then
    record ram CRIT "${avail_gb}G avail" "RAM available ${avail_gb}G — CRITICAL (<${RAM_FREE_CRIT_GB}G)"
    return
  fi
  if (( avail_gb < RAM_FREE_WARN_GB )); then
    record ram WARN "${avail_gb}G avail" "RAM available ${avail_gb}G — WARNING (<${RAM_FREE_WARN_GB}G)"
    return
  fi
  record ram OK "${avail_gb}G / ${mem_total_gb}G" "RAM OK (${avail_gb}G available)"
}

#######################################
# Measure CPU steal time over a one-second window and compare it against
# STEAL_WARN / STEAL_CRIT.
# Globals:   STEAL_WARN, STEAL_CRIT (read)
# Outputs:   one result via record() under the name "steal"
#######################################
check_steal() {
  header "CPU STEAL"

  # The aggregate "cpu" line of /proc/stat holds counters since boot, so the
  # current rate needs the difference between two samples. The awk program
  # prints "<field 9> <sum of fields 2..10>", i.e. steal and a total.
  local sampler='/^cpu /{print $9" "$2+$3+$4+$5+$6+$7+$8+$9+$10}'
  local before after
  before=$(awk "$sampler" /proc/stat)
  sleep 1
  after=$(awk "$sampler" /proc/stat)

  local pct
  pct=$(awk -v first="$before" -v second="$after" 'BEGIN {
    split(first, old, " ")
    split(second, cur, " ")
    d_steal = cur[1] - old[1]
    d_total = cur[2] - old[2]
    if (d_total == 0) { printf "0.0"; exit }
    printf "%.1f", (d_steal / d_total) * 100
  }')

  # Integer part only, for comparison with bash arithmetic.
  local pct_whole
  pct_whole=$(awk -v v="$pct" 'BEGIN { printf "%d", v }')

  local level="OK"
  if (( pct_whole >= STEAL_CRIT )); then
    level="CRIT"
  elif (( pct_whole >= STEAL_WARN )); then
    level="WARN"
  fi

  case "$level" in
    CRIT) record steal CRIT "${pct}%" "CPU steal ${pct}% — CRITICAL (host is overcommitted)" ;;
    WARN) record steal WARN "${pct}%" "CPU steal ${pct}% — WARNING (host contention; degrades LLM inference)" ;;
    *)    record steal OK   "${pct}%" "CPU steal OK (${pct}%)" ;;
  esac
}

#######################################
# Check swap usage against SWAP_USED_WARN_GB / SWAP_USED_CRIT_GB, with
# SwapCached (SWAP_CACHED_WARN_MB) as a secondary warning signal. A system
# with no swap at all is reported as CRIT.
# Globals:   SWAP_USED_WARN_GB, SWAP_USED_CRIT_GB, SWAP_CACHED_WARN_MB (read)
# Outputs:   one result via record() under the name "swap"
#######################################
check_swap() {
  header "SWAP"

  # Collect the three swap counters (kB) in one pass over /proc/meminfo.
  local field amount _unit
  local total_kb="" free_kb="" cached_kb=""
  while read -r field amount _unit; do
    case "$field" in
      SwapTotal:)  total_kb="$amount" ;;
      SwapFree:)   free_kb="$amount" ;;
      SwapCached:) cached_kb="$amount" ;;
    esac
  done < /proc/meminfo

  local used_kb=$(( total_kb - free_kb ))
  local total_gb=$(( total_kb / 1024 / 1024 ))
  local cached_mb=$(( cached_kb / 1024 ))

  if (( total_kb == 0 )); then
    record swap CRIT "no swap" "NO SWAP configured — CRITICAL (add swapfile!)"
    return
  fi

  # The warning threshold is fractional (1.5), so compare tenths of a GB as
  # integers; awk performs the floating-point scaling.
  local used_x10 warn_x10 crit_x10 used_label
  used_x10=$(awk -v kb="$used_kb" 'BEGIN { printf "%d", (kb / 1024 / 1024) * 10 }')
  warn_x10=$(awk -v t="$SWAP_USED_WARN_GB" 'BEGIN { printf "%d", t * 10 }')
  crit_x10=$(awk -v t="$SWAP_USED_CRIT_GB" 'BEGIN { printf "%d", t * 10 }')
  used_label=$(awk -v kb="$used_kb" 'BEGIN { printf "%.1fG", kb / 1024 / 1024 }')

  if (( used_x10 >= crit_x10 )); then
    record swap CRIT "${used_label} used" "Swap ${used_label} used — CRITICAL"
    return
  fi
  if (( used_x10 >= warn_x10 )); then
    record swap WARN "${used_label} used" "Swap ${used_label} used — WARNING (>${SWAP_USED_WARN_GB}G)"
    return
  fi
  if (( cached_mb >= SWAP_CACHED_WARN_MB )); then
    # A large SwapCached value is treated as a sign of recent memory pressure
    # even when current swap usage is below the thresholds.
    record swap WARN "${used_label} used, ${cached_mb}MB cached" \
      "Swap pressure indicator: SwapCached ${cached_mb}MB — recent memory pressure (threshold ${SWAP_CACHED_WARN_MB}MB)"
    return
  fi
  record swap OK "${used_label} / ${total_gb}G" "Swap OK (${used_label} used, ${cached_mb}MB cached)"
}

#######################################
# Check Docker for crash-looping and unhealthy containers.
# Globals:   CONTAINER_RESTART_WARN, CONTAINER_RESTART_CRIT (read)
# Outputs:   via record(): "docker_daemon" (WARN) when Docker is unavailable,
#            otherwise "container_loops" and "container_health"
#######################################
check_docker_containers() {
  header "DOCKER CONTAINERS"

  if ! command -v docker &>/dev/null || ! docker info &>/dev/null; then
    record docker_daemon WARN "not running" "Docker daemon is not running"
    return
  fi

  # Crash-looping containers.
  # `docker ps --format` has no RestartCount placeholder (the template fails
  # and the old code silently saw zero restarts everywhere), so list the
  # container IDs first and read RestartCount from `docker inspect`.
  local container_ids=()
  mapfile -t container_ids < <(docker ps -aq 2>/dev/null || true)

  local looping_warn=() looping_crit=()
  local name restarts
  if (( ${#container_ids[@]} > 0 )); then
    while IFS=$'\t' read -r name restarts; do
      name="${name#/}"                 # inspect reports names as "/name"
      [[ -z "$name" ]] && continue
      # Guard the arithmetic below against empty or non-numeric output.
      [[ "$restarts" =~ ^[0-9]+$ ]] || restarts=0
      if   (( restarts >= CONTAINER_RESTART_CRIT )); then looping_crit+=("$name(${restarts}x)")
      elif (( restarts >= CONTAINER_RESTART_WARN )); then looping_warn+=("$name(${restarts}x)")
      fi
    done < <(docker inspect --format $'{{.Name}}\t{{.RestartCount}}' \
               "${container_ids[@]}" 2>/dev/null || true)
  fi

  if   (( ${#looping_crit[@]} > 0 )); then
    record container_loops CRIT "${looping_crit[*]}" "Crash-looping containers: ${looping_crit[*]}"
  elif (( ${#looping_warn[@]} > 0 )); then
    record container_loops WARN "${looping_warn[*]}" "Containers restarting: ${looping_warn[*]}"
  else
    record container_loops OK "0 looping" "No containers crash-looping"
  fi

  # Unhealthy containers (running but health check failing), comma-joined.
  local unhealthy
  unhealthy=$(docker ps --filter health=unhealthy --format '{{.Names}}' 2>/dev/null | paste -sd, - || true)
  if [[ -n "$unhealthy" ]]; then
    record container_health WARN "$unhealthy" "Unhealthy containers: $unhealthy"
  else
    record container_health OK "all healthy" "All containers passing healthchecks"
  fi
}

#######################################
# Convert a Docker human-readable size ("512MB", "3.2GB", "1.1TB", "0B") to
# gigabytes using decimal (1000-based) units.
# Arguments: $1 - size string
# Outputs:   size in GB with two decimals; "0.00" for unrecognised input
#######################################
_docker_size_to_gb() {
  awk -v size="$1" 'BEGIN {
    num = size;  sub(/[^0-9.].*$/, "", num)
    unit = size; sub(/^[0-9.]+/, "", unit)
    unit = toupper(unit)
    gb = 0
    if      (unit == "TB") gb = num * 1000
    else if (unit == "GB") gb = num
    else if (unit == "MB") gb = num / 1000
    else if (unit == "KB") gb = num / 1000000
    else if (unit == "B")  gb = num / 1000000000
    printf "%.2f", gb
  }'
}

#######################################
# Check Docker build-cache size against BUILD_CACHE_WARN_GB /
# BUILD_CACHE_CRIT_GB and report the total image size (informational only).
# Does nothing when Docker is unavailable.
# Globals:   BUILD_CACHE_WARN_GB, BUILD_CACHE_CRIT_GB (read)
# Outputs:   via record(): "build_cache" and "docker_images"
#######################################
check_docker_disk() {
  header "DOCKER DISK"

  if ! command -v docker &>/dev/null || ! docker info &>/dev/null; then
    return
  fi

  # One "Type<TAB>Size" row per resource class. `docker system df --format`
  # exposes Type/Size, not a BuildCache field, and in the default table the
  # SIZE column is the fourth one — so select rows by Type instead of guessing
  # column positions.
  local df_rows
  df_rows=$(docker system df --format $'{{.Type}}\t{{.Size}}' 2>/dev/null || true)

  # Build cache
  local cache_raw cache_size_gb cache_int
  cache_raw=$(awk -F'\t' '$1 == "Build Cache" { print $2; exit }' <<< "$df_rows")
  cache_size_gb=$(_docker_size_to_gb "${cache_raw:-0B}")
  cache_size_gb="${cache_size_gb:-0.00}"
  cache_int="${cache_size_gb%%.*}"      # whole GB for integer comparison
  cache_int="${cache_int:-0}"
  if   (( cache_int >= BUILD_CACHE_CRIT_GB )); then record build_cache CRIT "${cache_size_gb}GB" "Docker build cache ${cache_size_gb}GB — CRITICAL (>${BUILD_CACHE_CRIT_GB}GB, run: docker builder prune -f)"
  elif (( cache_int >= BUILD_CACHE_WARN_GB )); then record build_cache WARN "${cache_size_gb}GB" "Docker build cache ${cache_size_gb}GB — WARNING (run: docker builder prune -f)"
  else                                              record build_cache OK   "${cache_size_gb}GB" "Build cache OK (${cache_size_gb}GB)"
  fi

  # Images total size (reported as-is; no threshold is applied).
  local images_size
  images_size=$(awk -F'\t' '$1 == "Images" { print $2; exit }' <<< "$df_rows")
  images_size="${images_size:-?}"
  record docker_images OK "$images_size" "Docker images: $images_size"
}

#######################################
# Check the size of the systemd journal (warn above 300 MB) and of
# /var/log/syslog (warn above 100 MB).
# Outputs:   via record(): "journal" and "syslog"
#######################################
check_logs() {
  header "LOGS"

  # `journalctl --disk-usage` prints a sentence containing a size such as
  # "56.0M" or "1.2G" (no space before the unit, unit varies). Extract the
  # first number+unit token and normalise it to whole megabytes so it is safe
  # to use in integer arithmetic.
  local journal_raw journal_mb
  journal_raw=$(journalctl --disk-usage 2>/dev/null \
    | grep -oE '[0-9]+(\.[0-9]+)? ?[BKMGT]' | head -n 1 || true)
  journal_raw="${journal_raw// /}"
  journal_mb=$(awk -v size="${journal_raw:-0B}" 'BEGIN {
    unit = substr(size, length(size), 1)
    num  = substr(size, 1, length(size) - 1) + 0
    if      (unit == "T") mb = num * 1024 * 1024
    else if (unit == "G") mb = num * 1024
    else if (unit == "M") mb = num
    else if (unit == "K") mb = num / 1024
    else                  mb = num / 1024 / 1024
    printf "%d", mb
  }')
  journal_mb="${journal_mb:-0}"

  local syslog_mb=0
  if [[ -f /var/log/syslog ]]; then
    syslog_mb=$(du -sm /var/log/syslog 2>/dev/null | cut -f1 || echo "0")
    # Fall back to 0 if du produced nothing usable.
    [[ "$syslog_mb" =~ ^[0-9]+$ ]] || syslog_mb=0
  fi

  if   (( journal_mb > 300 )); then record journal WARN "${journal_mb}MB" "Journal ${journal_mb}MB — WARNING (run: journalctl --vacuum-size=200M)"
  else                               record journal OK   "${journal_mb}MB" "Journal OK (${journal_mb}MB)"
  fi

  if   (( syslog_mb > 100 )); then record syslog WARN "${syslog_mb}MB" "syslog ${syslog_mb}MB — WARNING"
  else                              record syslog OK   "${syslog_mb}MB" "syslog OK (${syslog_mb}MB)"
  fi
}

#######################################
# Detect automation that has drifted: failed systemd units, and crontab
# entries (of the invoking user) that reference script paths which no longer
# exist on disk.
# Outputs:   via record(): "failed_units" and "cron_missing_paths"
#######################################
check_automation_drift() {
  header "AUTOMATION DRIFT"

  # Comma-separated list of failed unit names (first column of the listing).
  local failed_list
  failed_list=$(systemctl --failed --no-legend --plain 2>/dev/null \
    | awk '{print $1}' | paste -sd, - || true)
  if [[ -z "$failed_list" ]]; then
    record failed_units OK "0 failed" "No failed systemd units"
  else
    record failed_units WARN "$failed_list" "Failed systemd units: $failed_list"
  fi

  local -a absent=()
  local entry candidate trimmed suffix
  while IFS= read -r entry; do
    # Ignore blank lines, comments and VAR=value environment lines.
    if [[ -z "$entry" || "$entry" =~ ^[[:space:]]*# ]]; then continue; fi
    if [[ "$entry" =~ ^[A-Za-z_][A-Za-z0-9_]*= ]]; then continue; fi

    # Every absolute path under the well-known roots mentioned on this line.
    while IFS= read -r candidate; do
      # Peel off (at most once each, in this order) trailing punctuation that
      # the path regex may have swallowed.
      trimmed="$candidate"
      for suffix in '"' "'" ',' ';' ')'; do
        trimmed="${trimmed%"$suffix"}"
      done

      case "$trimmed" in
        /var/log/*|/run/*|/proc/*|/sys/*|/dev/*)
          continue
          ;;
        *.sh|*.py|*/scripts/*)
          # Only script-like paths are verified for existence.
          if [[ ! -e "$trimmed" ]]; then absent+=("$trimmed"); fi
          ;;
      esac
    done < <(grep -oE '/(opt|root|home|usr/local|etc)/[^[:space:]|;&<>]+' <<< "$entry" || true)
  done < <(crontab -l 2>/dev/null || true)

  if (( ${#absent[@]} == 0 )); then
    record cron_missing_paths OK "0 missing" "No missing script paths in root crontab"
  else
    record cron_missing_paths WARN "${absent[*]}" "Cron references missing path(s): ${absent[*]}"
  fi
}

# ── Run all checks ───────────────────────────────────────────────────────────

# Report banner: interactive mode only (suppressed by --json and --quiet).
if ! { $JSON_MODE || $QUIET; }; then
  echo -e "\n${BOLD}VM Health Check — $(hostname) — $(date -u '+%Y-%m-%d %H:%M UTC')${NC}"
fi

# Run every check in report order; each one registers its results via record().
for health_check in \
  check_disk \
  check_load \
  check_steal \
  check_memory \
  check_swap \
  check_docker_containers \
  check_docker_disk \
  check_logs \
  check_automation_drift
do
  "$health_check"
done

# ── Summary ──────────────────────────────────────────────────────────────────

#######################################
# Print the final summary: a JSON document in --json mode, otherwise a
# human-readable verdict followed by the list of issues.
# In --quiet mode nothing is printed when every check passed, so cron jobs
# stay silent unless there is a problem.
# Globals:   JSON_MODE, QUIET, WORST_LEVEL, ISSUES, JSON_DATA,
#            GREEN, YELLOW, RED, BOLD, NC (all read)
# Outputs:   summary on stdout
#######################################
print_summary() {
  if $JSON_MODE; then
    local overall="OK"
    if   (( WORST_LEVEL >= 2 )); then overall="CRIT"
    elif (( WORST_LEVEL == 1 )); then overall="WARN"
    fi

    local -a keys=("${!JSON_DATA[@]}")
    local last_idx=$(( ${#keys[@]} - 1 ))
    local idx sep

    printf '{\n'
    printf '  "timestamp": "%s",\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
    printf '  "hostname": "%s",\n' "$(hostname)"
    printf '  "overall": "%s",\n' "$overall"
    printf '  "checks": {\n'
    for idx in "${!keys[@]}"; do
      # Every entry except the last is followed by a comma.
      if (( idx < last_idx )); then sep=","; else sep=""; fi
      printf '    "%s": %s%s\n' "${keys[idx]}" "${JSON_DATA[${keys[idx]}]}" "$sep"
    done
    printf '  }\n'
    printf '}\n'
    return 0
  fi

  # --quiet promises "only print problems": stay silent when all is green.
  if $QUIET && (( WORST_LEVEL == 0 )); then
    return 0
  fi

  local issue
  printf '\n'
  if (( WORST_LEVEL == 0 )); then
    printf '  %s\n' "${GREEN}${BOLD}✓ All checks passed${NC}"
  elif (( WORST_LEVEL == 1 )); then
    printf '  %s\n' "${YELLOW}${BOLD}⚠ ${#ISSUES[@]} warning(s):${NC}"
    # %s keeps backslashes or format characters in issue text literal.
    for issue in "${ISSUES[@]}"; do printf '    %s %s\n' "${YELLOW}→${NC}" "$issue"; done
  else
    printf '  %s\n' "${RED}${BOLD}✗ ${#ISSUES[@]} issue(s) — action required:${NC}"
    for issue in "${ISSUES[@]}"; do printf '    %s %s\n' "${RED}→${NC}" "$issue"; done
  fi
  printf '\n'
}

print_summary

# ── Telegram notification ─────────────────────────────────────────────────
#######################################
# Send the current issue list to Telegram (best effort).
# Credentials are read from ${HERMES_HOME:-/root/.hermes}/.env, keys
# TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID. Silently does nothing when the
# file or either key is missing; a failed HTTP request is ignored.
# Globals:   HERMES_HOME (optional), WORST_LEVEL, ISSUES (read)
# Returns:   0 always
# NOTE(review): the bot token is part of the request URL and therefore
# visible in the process list while curl runs.
#######################################
send_telegram_alert() {
  local env_file="${HERMES_HOME:-/root/.hermes}/.env"
  if [[ ! -f "$env_file" ]]; then return 0; fi

  # Take the first match only, so a duplicated key cannot yield a
  # multi-line value.
  local token chat_id
  token=$(grep -oP '(?<=TELEGRAM_BOT_TOKEN=)\S+' "$env_file" | head -n 1 || true)
  chat_id=$(grep -oP '(?<=TELEGRAM_CHAT_ID=)\S+' "$env_file" | head -n 1 || true)

  # Tolerate KEY="value" / KEY='value' style entries.
  local quote
  for quote in '"' "'"; do
    token="${token#"$quote"}";     token="${token%"$quote"}"
    chat_id="${chat_id#"$quote"}"; chat_id="${chat_id%"$quote"}"
  done

  if [[ -z "$token" || -z "$chat_id" ]]; then return 0; fi

  local severity="⚠️ WARNING"
  if (( WORST_LEVEL >= 2 )); then severity="🚨 CRITICAL"; fi

  local issue_lines msg
  issue_lines=$(printf '%s\n' "${ISSUES[@]}")
  printf -v msg '%s — %s VM health check\n%s\n\n%s' \
    "$severity" "$(hostname)" "$(date -u '+%Y-%m-%d %H:%M UTC')" "$issue_lines"

  # --data-urlencode is required: issue text contains '%', '&', '>' and
  # newlines, which plain -d would send unencoded and corrupt the form body.
  curl -sf -X POST "https://api.telegram.org/bot${token}/sendMessage" \
    --data-urlencode "chat_id=${chat_id}" \
    --data-urlencode "text=${msg}" > /dev/null || true
}

if $NOTIFY && (( WORST_LEVEL > 0 )); then
  send_telegram_alert
fi

# ── Log result ───────────────────────────────────────────────────────────────
# Translate the numeric worst level into its label for the log record.
RESULT_STR="OK"
if (( WORST_LEVEL == 1 )); then RESULT_STR="WARN"; fi
if (( WORST_LEVEL >= 2 )); then RESULT_STR="CRIT"; fi

log_to_file "health-check result=$RESULT_STR issues=${#ISSUES[@]}"

# The exit status mirrors the worst severity seen: 0 = OK, 1 = WARN, 2 = CRIT.
exit "$WORST_LEVEL"