#!/usr/bin/env bash
# =============================================================================
# vm-health-check.sh — Hostinger VM Health Check (READ-ONLY)
#
# Checks disk, memory, load, swap, and Docker container health.
# Prints a colour-coded report. Exits non-zero if any threshold is exceeded
# so it can drive cron alerts or CI gates.
#
# Usage:
#   bash vm-health-check.sh            # interactive report
#   bash vm-health-check.sh --quiet    # only print problems (non-zero exit if any)
#   bash vm-health-check.sh --json     # machine-readable JSON output
#   bash vm-health-check.sh --notify   # send Telegram alert on WARNING/CRITICAL
#
# Exit codes:
#   0 — all green
#   1 — at least one WARNING
#   2 — at least one CRITICAL
# =============================================================================

set -Eeuo pipefail

# shellcheck disable=SC2034
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LOG_FILE="/var/log/vm-health-check.log"

# ── Thresholds ──────────────────────────────────────────────────────────────
DISK_WARN=55                # % used
DISK_CRIT=70
LOAD_WARN=4.0               # absolute (not per-CPU)
LOAD_CRIT=8.0
RAM_FREE_WARN_GB=3          # GB available
RAM_FREE_CRIT_GB=1
SWAP_USED_WARN_GB=1
SWAP_USED_CRIT_GB=3
CONTAINER_RESTART_WARN=10
CONTAINER_RESTART_CRIT=50
BUILD_CACHE_WARN_GB=5
BUILD_CACHE_CRIT_GB=20
# shellcheck disable=SC2034
DOCKER_IMAGES_WARN_GB=15
# shellcheck disable=SC2034
DOCKER_IMAGES_CRIT_GB=25

# ── Colour codes ────────────────────────────────────────────────────────────
RED=$'\033[0;31m'
YELLOW=$'\033[1;33m'
GREEN=$'\033[0;32m'
CYAN=$'\033[0;36m'
BOLD=$'\033[1m'
NC=$'\033[0m'

# ── Flags ───────────────────────────────────────────────────────────────────
QUIET=false
JSON_MODE=false
NOTIFY=false
# Unrecognised arguments are ignored.
for arg in "$@"; do
  case "$arg" in
    --quiet) QUIET=true ;;
    --json) JSON_MODE=true ;;
    --notify) NOTIFY=true ;;
  esac
done

# ── State tracking ──────────────────────────────────────────────────────────
WORST_LEVEL=0 # 0=OK, 1=WARN, 2=CRIT
ISSUES=()
declare -A JSON_DATA

# ── Helpers ─────────────────────────────────────────────────────────────────

# log_to_file MESSAGE...
# Appends a UTC-timestamped line to $LOG_FILE. Silently does nothing when the
# file (or, if it does not exist yet, its directory) is not writable.
log_to_file() {
  local log_dir
  log_dir="$(dirname "$LOG_FILE")"
  if [[ ( -e "$LOG_FILE" && ! -w "$LOG_FILE" ) || ( ! -e "$LOG_FILE" && ! -w "$log_dir" ) ]]; then
    return
  fi
  echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*" >> "$LOG_FILE" 2>/dev/null || true
}

# json_escape STRING
# Prints STRING with backslash, double quote, newline, carriage return and tab
# escaped so it can be embedded inside a JSON string literal.
json_escape() {
  local s="$1"
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# human_to_mb SIZE [BASE]
# Converts a human-readable size ("48.0M", "1.2GB", "16.43 MB", "0B") to whole
# megabytes (truncated) and prints it. BASE is the multiplier between units
# (default 1024; pass 1000 for decimal units). Unparseable input prints 0.
human_to_mb() {
  awk -v s="${1:-}" -v base="${2:-1024}" 'BEGIN {
    if (match(s, /^[0-9]+(\.[0-9]+)?/) == 0) { print 0; exit }
    num = substr(s, 1, RLENGTH) + 0
    unit = toupper(substr(s, RLENGTH + 1))
    gsub(/[^A-Z]/, "", unit)
    u = substr(unit, 1, 1)
    if (u == "T") mult = base * base
    else if (u == "G") mult = base
    else if (u == "M") mult = 1
    else if (u == "K") mult = 1 / base
    else mult = 1 / (base * base)
    printf "%d\n", num * mult
  }'
}

# env_value FILE KEY
# Prints the first value assigned to KEY in a dotenv-style FILE (optionally
# prefixed with "export"), with one pair of surrounding quotes removed.
# Prints nothing when the key is absent or the file is unreadable.
env_value() {
  local file="$1" key="$2" val
  val=$(grep -oP "^\s*(?:export\s+)?${key}=\K\S+" "$file" 2>/dev/null | head -1 || true)
  val="${val#\"}"
  val="${val%\"}"
  val="${val#\'}"
  val="${val%\'}"
  printf '%s' "$val"
}

# status_icon LEVEL
# Prints the coloured glyph for OK / WARN / CRIT (nothing for other values).
status_icon() {
  case "$1" in
    OK) printf '%s\n' "${GREEN}✓${NC}" ;;
    WARN) printf '%s\n' "${YELLOW}⚠${NC}" ;;
    CRIT) printf '%s\n' "${RED}✗${NC}" ;;
  esac
}

# level_int LEVEL
# Maps OK/WARN/CRIT to 0/1/2; anything else maps to 0.
level_int() {
  case "$1" in
    OK) echo 0 ;;
    WARN) echo 1 ;;
    CRIT) echo 2 ;;
    *) echo 0 ;;
  esac
}

# record NAME LEVEL VALUE MESSAGE
# Registers one check result:
#   - raises WORST_LEVEL if LEVEL is more severe,
#   - appends non-OK results to ISSUES,
#   - stores a JSON fragment under JSON_DATA[NAME],
#   - prints a report line unless in JSON mode (or quiet mode with LEVEL=OK).
record() {
  local name="$1" level="$2" value="$3" message="$4"
  local lvl_int
  lvl_int=$(level_int "$level")
  if (( lvl_int > WORST_LEVEL )); then WORST_LEVEL=$lvl_int; fi
  if [[ "$level" != "OK" ]]; then ISSUES+=("[$level] $message"); fi
  # Escape free-form text so the fragment is always valid JSON.
  JSON_DATA["$name"]=$(printf '{"level":"%s","value":"%s","message":"%s"}' \
    "$level" "$(json_escape "$value")" "$(json_escape "$message")")
  if $QUIET && [[ "$level" == "OK" ]]; then return 0; fi
  if ! $JSON_MODE; then
    printf " %s %-30s %s\n" "$(status_icon "$level")" "$message" "${CYAN}($value)${NC}"
  fi
}

# header TITLE
# Prints a section banner; suppressed in JSON and quiet modes.
header() {
  if $JSON_MODE || $QUIET; then return 0; fi
  printf '\n%s── %s ──────────────────────────────────────%s\n' "${BOLD}${CYAN}" "$1" "${NC}"
}

# ── Checks ──────────────────────────────────────────────────────────────────

# Root filesystem usage against DISK_WARN / DISK_CRIT (percent used).
check_disk() {
  header "DISK"
  local used_pct avail_gb
  used_pct=$(df --output=pcent / | tail -1 | tr -d ' %')
  avail_gb=$(df --output=avail -BG / | tail -1 | tr -d ' G')
  if (( used_pct >= DISK_CRIT )); then
    record disk CRIT "${used_pct}%" "Disk ${used_pct}% used — CRITICAL (>${DISK_CRIT}%)"
  elif (( used_pct >= DISK_WARN )); then
    record disk WARN "${used_pct}%" "Disk ${used_pct}% used — WARNING (>${DISK_WARN}%)"
  else
    record disk OK "${used_pct}% used, ${avail_gb}G free" "Disk OK (${used_pct}%)"
  fi
}

# 1-minute load average against LOAD_WARN / LOAD_CRIT (absolute values).
check_load() {
  header "LOAD"
  local load1 load5 _load15
  read -r load1 load5 _load15 _ < /proc/loadavg
  local ncpu
  ncpu=$(nproc)
  # Compare as integers (scaled by 10) to avoid a bc dependency.
  local load1_int warn_int crit_int
  load1_int=$(awk -v v="$load1" 'BEGIN {printf "%d", v * 10}')
  warn_int=$(awk -v v="$LOAD_WARN" 'BEGIN {printf "%d", v * 10}')
  crit_int=$(awk -v v="$LOAD_CRIT" 'BEGIN {printf "%d", v * 10}')
  if (( load1_int >= crit_int )); then
    record load CRIT "$load1" "Load avg $load1 — CRITICAL (>${LOAD_CRIT}, ${ncpu} CPUs)"
  elif (( load1_int >= warn_int )); then
    record load WARN "$load1" "Load avg $load1 — WARNING (>${LOAD_WARN})"
  else
    record load OK "$load1 (1m) / $load5 (5m)" "Load OK ($load1)"
  fi
}

# MemAvailable (whole GB, truncated) against RAM_FREE_WARN_GB / RAM_FREE_CRIT_GB.
check_memory() {
  header "MEMORY"
  local available_kb total_kb
  available_kb=$(awk '/^MemAvailable/ {print $2}' /proc/meminfo)
  total_kb=$(awk '/^MemTotal/ {print $2}' /proc/meminfo)
  local available_gb total_gb
  available_gb=$(( available_kb / 1024 / 1024 ))
  total_gb=$(( total_kb / 1024 / 1024 ))
  if (( available_gb < RAM_FREE_CRIT_GB )); then
    record ram CRIT "${available_gb}G avail" "RAM available ${available_gb}G — CRITICAL (<${RAM_FREE_CRIT_GB}G)"
  elif (( available_gb < RAM_FREE_WARN_GB )); then
    record ram WARN "${available_gb}G avail" "RAM available ${available_gb}G — WARNING (<${RAM_FREE_WARN_GB}G)"
  else
    record ram OK "${available_gb}G / ${total_gb}G" "RAM OK (${available_gb}G available)"
  fi
}

# Swap in use (SwapTotal - SwapFree, whole GB) against SWAP_USED_*_GB.
# A host with no swap at all is reported as CRITICAL.
check_swap() {
  header "SWAP"
  local swap_total_kb swap_free_kb swap_used_kb
  swap_total_kb=$(awk '/^SwapTotal/ {print $2}' /proc/meminfo)
  swap_free_kb=$(awk '/^SwapFree/ {print $2}' /proc/meminfo)
  swap_used_kb=$(( swap_total_kb - swap_free_kb ))
  local swap_total_gb swap_used_gb
  swap_total_gb=$(( swap_total_kb / 1024 / 1024 ))
  swap_used_gb=$(( swap_used_kb / 1024 / 1024 ))
  if (( swap_total_kb == 0 )); then
    record swap CRIT "no swap" "NO SWAP configured — CRITICAL (add swapfile!)"
    return
  fi
  if (( swap_used_gb >= SWAP_USED_CRIT_GB )); then
    record swap CRIT "${swap_used_gb}G used" "Swap ${swap_used_gb}G used — CRITICAL"
  elif (( swap_used_gb >= SWAP_USED_WARN_GB )); then
    record swap WARN "${swap_used_gb}G used" "Swap ${swap_used_gb}G used — WARNING"
  else
    record swap OK "${swap_used_gb}G / ${swap_total_gb}G" "Swap OK (${swap_used_gb}G used)"
  fi
}

# Crash-looping containers (restart count against CONTAINER_RESTART_*) and
# containers whose Docker healthcheck reports "unhealthy".
check_docker_containers() {
  header "DOCKER CONTAINERS"
  if ! command -v docker &>/dev/null || ! docker info &>/dev/null; then
    record docker_daemon WARN "not running" "Docker daemon is not running"
    return
  fi

  # Crash-looping containers.
  # `docker ps --format` has no RestartCount placeholder, so the count is read
  # from `docker inspect` for every container ID instead.
  local looping_warn=() looping_crit=()
  local -a container_ids=()
  local name restarts
  mapfile -t container_ids < <(docker ps -aq 2>/dev/null || true)
  if (( ${#container_ids[@]} > 0 )); then
    while IFS=$'\t' read -r name restarts; do
      name="${name#/}" # inspect reports names with a leading slash
      [[ -z "$name" ]] && continue
      [[ "$restarts" =~ ^[0-9]+$ ]] || restarts=0
      if (( restarts >= CONTAINER_RESTART_CRIT )); then
        looping_crit+=("$name(${restarts}x)")
      elif (( restarts >= CONTAINER_RESTART_WARN )); then
        looping_warn+=("$name(${restarts}x)")
      fi
    done < <(docker inspect --format $'{{.Name}}\t{{.RestartCount}}' \
      "${container_ids[@]}" 2>/dev/null || true)
  fi
  if (( ${#looping_crit[@]} > 0 )); then
    record container_loops CRIT "${looping_crit[*]}" "Crash-looping containers: ${looping_crit[*]}"
  elif (( ${#looping_warn[@]} > 0 )); then
    record container_loops WARN "${looping_warn[*]}" "Containers restarting: ${looping_warn[*]}"
  else
    record container_loops OK "0 looping" "No containers crash-looping"
  fi

  # Unhealthy containers (running but health check failing).
  local unhealthy
  unhealthy=$(docker ps --filter health=unhealthy --format '{{.Names}}' 2>/dev/null \
    | paste -sd, - || true)
  if [[ -n "$unhealthy" ]]; then
    record container_health WARN "$unhealthy" "Unhealthy containers: $unhealthy"
  else
    record container_health OK "all healthy" "All containers passing healthchecks"
  fi
}

# Docker build-cache size against BUILD_CACHE_*_GB, plus an informational
# line with the total image size. Skipped silently when Docker is unavailable.
check_docker_disk() {
  header "DOCKER DISK"
  if ! command -v docker &>/dev/null || ! docker info &>/dev/null; then
    return 0
  fi

  # One "Type<TAB>Size" row per resource class (Images, Containers,
  # Local Volumes, Build Cache).
  local df_rows
  df_rows=$(docker system df --format $'{{.Type}}\t{{.Size}}' 2>/dev/null || true)

  # Build cache
  local cache_size cache_mb cache_int
  cache_size=$(awk -F'\t' '$1 == "Build Cache" {print $2; exit}' <<< "$df_rows")
  cache_size="${cache_size:-0B}"
  # Docker reports decimal units, hence base 1000.
  cache_mb=$(human_to_mb "$cache_size" 1000)
  cache_int=$(( cache_mb / 1000 ))
  if (( cache_int >= BUILD_CACHE_CRIT_GB )); then
    record build_cache CRIT "${cache_size}" "Docker build cache ${cache_size} — CRITICAL (>${BUILD_CACHE_CRIT_GB}GB, run: docker builder prune -f)"
  elif (( cache_int >= BUILD_CACHE_WARN_GB )); then
    record build_cache WARN "${cache_size}" "Docker build cache ${cache_size} — WARNING (run: docker builder prune -f)"
  else
    record build_cache OK "${cache_size}" "Build cache OK (${cache_size})"
  fi

  # Images total size (informational only).
  local images_size
  images_size=$(awk -F'\t' '$1 == "Images" {print $2; exit}' <<< "$df_rows")
  images_size="${images_size:-?}"
  record docker_images OK "$images_size" "Docker images: $images_size"
}

# systemd journal disk usage (> 300 MB warns) and /var/log/syslog size
# (> 100 MB warns).
check_logs() {
  header "LOGS"
  # journalctl prints e.g. "... take up 48.0M in the file system."; pull the
  # number with its unit and normalise to whole MB so (( )) can compare it.
  local journal_usage journal_mb
  journal_usage=$(journalctl --disk-usage 2>/dev/null \
    | grep -oE '[0-9]+(\.[0-9]+)? ?[BKMGT]' | head -1 || true)
  journal_mb=$(human_to_mb "$journal_usage")

  local syslog_mb=0
  if [[ -f /var/log/syslog ]]; then
    syslog_mb=$(du -sm /var/log/syslog 2>/dev/null | cut -f1 || echo "0")
  fi
  [[ "$syslog_mb" =~ ^[0-9]+$ ]] || syslog_mb=0

  if (( journal_mb > 300 )); then
    record journal WARN "${journal_mb}MB" "Journal ${journal_mb}MB — WARNING (run: journalctl --vacuum-size=200M)"
  else
    record journal OK "${journal_mb}MB" "Journal OK (${journal_mb}MB)"
  fi
  if (( syslog_mb > 100 )); then
    record syslog WARN "${syslog_mb}MB" "syslog ${syslog_mb}MB — WARNING"
  else
    record syslog OK "${syslog_mb}MB" "syslog OK (${syslog_mb}MB)"
  fi
}

# Failed systemd units, and script paths referenced by the invoking user's
# crontab that no longer exist on disk.
check_automation_drift() {
  header "AUTOMATION DRIFT"
  local failed_units
  failed_units=$(systemctl --failed --no-legend --plain 2>/dev/null \
    | awk '{print $1}' | paste -sd, - || true)
  if [[ -n "$failed_units" ]]; then
    record failed_units WARN "$failed_units" "Failed systemd units: $failed_units"
  else
    record failed_units OK "0 failed" "No failed systemd units"
  fi

  local missing_paths=()
  local cron_line path clean_path
  while IFS= read -r cron_line; do
    # Skip blanks, comments and VAR=value environment lines.
    [[ -z "$cron_line" || "$cron_line" =~ ^[[:space:]]*# ]] && continue
    [[ "$cron_line" =~ ^[A-Za-z_][A-Za-z0-9_]*= ]] && continue
    while IFS= read -r path; do
      # Strip one trailing quote / punctuation character picked up by the grep.
      clean_path="${path%\"}"
      clean_path="${clean_path%\'}"
      clean_path="${clean_path%,}"
      clean_path="${clean_path%;}"
      clean_path="${clean_path%)}"
      case "$clean_path" in
        /var/log/*|/run/*|/proc/*|/sys/*|/dev/*) continue ;;
      esac
      # Only things that look like scripts are expected to exist.
      if [[ "$clean_path" == *.sh || "$clean_path" == *.py || "$clean_path" == */scripts/* ]]; then
        [[ -e "$clean_path" ]] || missing_paths+=("$clean_path")
      fi
    done < <(grep -oE '/(opt|root|home|usr/local|etc)/[^[:space:]|;&<>]+' <<< "$cron_line" || true)
  done < <(crontab -l 2>/dev/null || true)

  if (( ${#missing_paths[@]} > 0 )); then
    record cron_missing_paths WARN "${missing_paths[*]}" "Cron references missing path(s): ${missing_paths[*]}"
  else
    record cron_missing_paths OK "0 missing" "No missing script paths in root crontab"
  fi
}

# ── Run all checks ───────────────────────────────────────────────────────────
if ! $JSON_MODE && ! $QUIET; then
  printf '\n%sVM Health Check — %s — %s%s\n' \
    "${BOLD}" "$(hostname)" "$(date -u '+%Y-%m-%d %H:%M UTC')" "${NC}"
fi

check_disk
check_load
check_memory
check_swap
check_docker_containers
check_docker_disk
check_logs
check_automation_drift

# ── Summary ──────────────────────────────────────────────────────────────────
if $JSON_MODE; then
  echo '{'
  echo " \"timestamp\": \"$(date -u '+%Y-%m-%dT%H:%M:%SZ')\","
  echo " \"hostname\": \"$(json_escape "$(hostname)")\","
  if (( WORST_LEVEL >= 2 )); then
    _overall='"CRIT"'
  elif (( WORST_LEVEL == 1 )); then
    _overall='"WARN"'
  else
    _overall='"OK"'
  fi
  echo " \"overall\": ${_overall},"
  echo " \"checks\": {"
  local_keys=("${!JSON_DATA[@]}")
  for i in "${!local_keys[@]}"; do
    k="${local_keys[$i]}"
    # Every entry except the last is followed by a comma.
    if [[ $i -lt $(( ${#local_keys[@]} - 1 )) ]]; then comma=","; else comma=""; fi
    echo " \"$k\": ${JSON_DATA[$k]}$comma"
  done
  echo " }"
  echo '}'
elif $QUIET && (( WORST_LEVEL == 0 )); then
  : # quiet mode prints problems only; stay silent so cron sends no mail
else
  echo ""
  if (( WORST_LEVEL == 0 )); then
    printf ' %s✓ All checks passed%s\n' "${GREEN}${BOLD}" "${NC}"
  elif (( WORST_LEVEL == 1 )); then
    printf ' %s⚠ %s warning(s):%s\n' "${YELLOW}${BOLD}" "${#ISSUES[@]}" "${NC}"
    for issue in "${ISSUES[@]}"; do
      printf ' %s→%s %s\n' "${YELLOW}" "${NC}" "$issue"
    done
  else
    printf ' %s✗ %s issue(s) — action required:%s\n' "${RED}${BOLD}" "${#ISSUES[@]}" "${NC}"
    for issue in "${ISSUES[@]}"; do
      printf ' %s→%s %s\n' "${RED}" "${NC}" "$issue"
    done
  fi
  echo ""
fi

# ── Telegram notification ─────────────────────────────────────────────────
if $NOTIFY && (( WORST_LEVEL > 0 )); then
  TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"
  TELEGRAM_TOKEN=""
  TELEGRAM_CHAT_ID=""
  if [[ -f "$TOKEN_FILE" ]]; then
    TELEGRAM_TOKEN=$(env_value "$TOKEN_FILE" TELEGRAM_BOT_TOKEN)
    TELEGRAM_CHAT_ID=$(env_value "$TOKEN_FILE" TELEGRAM_CHAT_ID)
  fi
  if [[ -n "$TELEGRAM_TOKEN" && -n "$TELEGRAM_CHAT_ID" ]]; then
    if (( WORST_LEVEL >= 2 )); then
      SEVERITY="🚨 CRITICAL"
    else
      SEVERITY="⚠️ WARNING"
    fi
    printf -v MSG '%s — %s VM health check\n%s\n\n%s' \
      "$SEVERITY" "$(hostname)" "$(date -u '+%Y-%m-%d %H:%M UTC')" \
      "$(printf '%s\n' "${ISSUES[@]}")"
    # --data-urlencode: issue text contains '%', '&' and newlines, which a
    # plain -d would send unencoded. Best-effort: a failed send never changes
    # the exit code; --max-time keeps a stalled request from hanging cron.
    curl -sf --max-time 15 -X POST "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
      --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
      --data-urlencode "text=${MSG}" > /dev/null || true
  fi
fi

# ── Log result ───────────────────────────────────────────────────────────────
if (( WORST_LEVEL >= 2 )); then
  RESULT_STR="CRIT"
elif (( WORST_LEVEL == 1 )); then
  RESULT_STR="WARN"
else
  RESULT_STR="OK"
fi
log_to_file "health-check result=$RESULT_STR issues=${#ISSUES[@]}"

exit "$WORST_LEVEL"