#!/usr/bin/env bash
# =============================================================================
# vm-health-check.sh — Hostinger VM Health Check (READ-ONLY)
#
# Checks disk, memory, load, swap, Docker container/disk health, log sizes,
# failed systemd units, and cron entries that reference missing scripts.
# Prints a colour-coded report. Exits non-zero if any threshold is exceeded
# so it can drive cron alerts or CI gates.
#
# Usage:
#   bash vm-health-check.sh            # interactive report
#   bash vm-health-check.sh --quiet    # only print problems (non-zero exit if any)
#   bash vm-health-check.sh --json     # machine-readable JSON output
#   bash vm-health-check.sh --notify   # send Telegram alert on WARNING/CRITICAL
#
# Exit codes:
#   0 — all green
#   1 — at least one WARNING
#   2 — at least one CRITICAL
# =============================================================================
set -Eeuo pipefail

# shellcheck disable=SC2034
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LOG_FILE="/var/log/vm-health-check.log"

# ── Thresholds ──────────────────────────────────────────────────────────────
DISK_WARN=55                # % used
DISK_CRIT=70
LOAD_WARN=4.0               # absolute (not per-CPU)
LOAD_CRIT=8.0
RAM_FREE_WARN_GB=3          # GB available
RAM_FREE_CRIT_GB=1
SWAP_USED_WARN_GB=1
SWAP_USED_CRIT_GB=3
CONTAINER_RESTART_WARN=10
CONTAINER_RESTART_CRIT=50
BUILD_CACHE_WARN_GB=5
BUILD_CACHE_CRIT_GB=20
# shellcheck disable=SC2034
DOCKER_IMAGES_WARN_GB=15
# shellcheck disable=SC2034
DOCKER_IMAGES_CRIT_GB=25

# ── Colour codes ────────────────────────────────────────────────────────────
# $'…' stores the real ESC byte, so these work with printf '%s' as well as
# with echo -e.
RED=$'\033[0;31m'
YELLOW=$'\033[1;33m'
GREEN=$'\033[0;32m'
CYAN=$'\033[0;36m'
BOLD=$'\033[1m'
NC=$'\033[0m'

# ── Flags ───────────────────────────────────────────────────────────────────
QUIET=false
JSON_MODE=false
NOTIFY=false

#######################################
# Translate command-line switches into the global mode flags.
# Globals:   QUIET, JSON_MODE, NOTIFY (written)
# Arguments: the script's positional parameters
# Outputs:   a warning on stderr for every unrecognised argument
# Returns:   0 always — unknown arguments are reported but do not abort,
#            so existing invocations keep working.
#######################################
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --quiet)  QUIET=true ;;
      --json)   JSON_MODE=true ;;
      --notify) NOTIFY=true ;;
      *)
        # A typo such as --quite used to be dropped silently.
        printf 'vm-health-check: ignoring unknown argument: %s\n' "$arg" >&2
        ;;
    esac
  done
  return 0
}

parse_args "$@"

# ── State tracking ──────────────────────────────────────────────────────────
WORST_LEVEL=0   # 0=OK, 1=WARN, 2=CRIT
ISSUES=()
# Initialise explicitly so expansions of the array are safe under `set -u`.
declare -A JSON_DATA=()
# ── Helpers ─────────────────────────────────────────────────────────────────

#######################################
# Append one UTC-timestamped line to LOG_FILE on a best-effort basis.
# Globals:   LOG_FILE (read)
# Arguments: message words; joined with spaces into a single line
# Outputs:   appends "[<ISO-8601 UTC>] <message>" to LOG_FILE
# Returns:   0 always — logging must never fail the health check
#######################################
log_to_file() {
  local log_dir
  log_dir="$(dirname "$LOG_FILE")"

  # Skip quietly when the write cannot succeed: an existing file must be
  # writable, a missing file needs a writable parent directory.
  if [[ -e "$LOG_FILE" ]]; then
    [[ -w "$LOG_FILE" ]] || return 0
  else
    [[ -w "$log_dir" ]] || return 0
  fi

  # The redirect lives inside the group so that a failure to open the file
  # (e.g. it vanished after the check above) is silenced too.
  {
    printf '[%s] %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$*" >> "$LOG_FILE"
  } 2>/dev/null || true
}
#######################################
# Print the coloured glyph for a severity level.
# Globals:   GREEN, YELLOW, RED, NC (read)
# Arguments: $1 - level name: OK, WARN or CRIT
# Outputs:   the glyph followed by a newline; nothing for any other level
# Returns:   0
#######################################
status_icon() {
  local glyph
  case "$1" in
    OK)   glyph="${GREEN}✓${NC}" ;;
    WARN) glyph="${YELLOW}⚠${NC}" ;;
    CRIT) glyph="${RED}✗${NC}" ;;
    *)    return 0 ;;
  esac
  # The colour variables already hold real escape bytes, so no echo -e needed.
  printf '%s\n' "$glyph"
}
#######################################
# Map a severity name to its numeric rank.
# Arguments: $1 - level name
# Outputs:   0 for OK, 1 for WARN, 2 for CRIT; 0 for anything else
#######################################
level_int() {
  case "$1" in
    WARN) echo 1 ;;
    CRIT) echo 2 ;;
    *)    echo 0 ;;   # OK and unrecognised names both rank lowest
  esac
}
#######################################
# Escape a string for embedding inside a double-quoted JSON value.
# Arguments: $1 - raw text
# Outputs:   the text with backslash, double quote, newline, carriage
#            return and tab escaped (no trailing newline)
#######################################
_json_escape() {
  local text=$1
  text=${text//\\/\\\\}      # backslash first, so later escapes are not doubled
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  printf '%s' "$text"
}

#######################################
# Register the outcome of one check and, in report mode, print its line.
# Usage:     record NAME LEVEL VALUE MESSAGE
# Globals:   WORST_LEVEL, ISSUES, JSON_DATA (written)
#            QUIET, JSON_MODE, CYAN, NC (read)
# Arguments: $1 - check key used in the JSON output
#            $2 - OK, WARN or CRIT
#            $3 - short measured value
#            $4 - human-readable message
# Outputs:   one report line on stdout unless --json is active, or the level
#            is OK while --quiet is active
# Returns:   0
#######################################
record() {
  local name="$1" level="$2" value="$3" message="$4"
  local severity
  severity=$(level_int "$level")

  # Track the worst severity seen so far; it becomes the exit code.
  if (( severity > WORST_LEVEL )); then
    WORST_LEVEL=$severity
  fi
  if [[ "$level" != "OK" ]]; then
    ISSUES+=("[$level] $message")
  fi

  # Values and messages may contain quotes or backslashes (container names,
  # paths), so every field is escaped to keep the JSON document valid.
  JSON_DATA["$name"]=$(printf '{"level":"%s","value":"%s","message":"%s"}' \
    "$(_json_escape "$level")" \
    "$(_json_escape "$value")" \
    "$(_json_escape "$message")")

  if [[ "$JSON_MODE" == true ]]; then
    return 0
  fi
  if [[ "$QUIET" == true && "$level" == "OK" ]]; then
    return 0
  fi
  printf " %s %-30s %s\n" "$(status_icon "$level")" "$message" "${CYAN}($value)${NC}"
}
#######################################
# Print a section heading for the interactive report.
# Globals:   JSON_MODE, QUIET, BOLD, CYAN, NC (read)
# Arguments: $1 - section title
# Outputs:   a blank line followed by the heading; nothing in --json or
#            --quiet mode
# Returns:   0
#######################################
header() {
  if [[ "$JSON_MODE" == true || "$QUIET" == true ]]; then
    return 0
  fi
  printf '\n%s── %s ──────────────────────────────────────%s\n' \
    "${BOLD}${CYAN}" "$1" "${NC}"
}
# ── Checks ──────────────────────────────────────────────────────────────────

#######################################
# Check how full the root filesystem is.
# Globals:   DISK_WARN, DISK_CRIT (read)
# Outputs:   one `record` call under the key "disk"
# Returns:   0
#######################################
check_disk() {
  header "DISK"

  local used_pct="" avail_gb=""
  # One df call yields both columns; line 2 is the data row for "/".
  # `|| true` keeps `set -e` from aborting when df prints nothing.
  read -r used_pct avail_gb < <(
    df -BG --output=pcent,avail / 2>/dev/null \
      | awk 'NR == 2 { gsub(/[%G]/, ""); print $1, $2 }'
  ) || true

  # Without a numeric percentage the arithmetic tests below would abort the
  # whole script, so report the problem as a warning instead.
  if [[ ! "$used_pct" =~ ^[0-9]+$ ]]; then
    record disk WARN "unknown" "Disk usage could not be determined"
    return 0
  fi

  if (( used_pct >= DISK_CRIT )); then
    record disk CRIT "${used_pct}%" "Disk ${used_pct}% used — CRITICAL (>${DISK_CRIT}%)"
  elif (( used_pct >= DISK_WARN )); then
    record disk WARN "${used_pct}%" "Disk ${used_pct}% used — WARNING (>${DISK_WARN}%)"
  else
    record disk OK "${used_pct}% used, ${avail_gb}G free" "Disk OK (${used_pct}%)"
  fi
}
#######################################
# Check the 1-minute load average against absolute thresholds.
# Globals:   LOAD_WARN, LOAD_CRIT (read; may be decimal)
# Outputs:   one `record` call under the key "load"
# Returns:   0
#######################################
check_load() {
  header "LOAD"

  if [[ ! -r /proc/loadavg ]]; then
    record load WARN "unknown" "Load average unavailable (/proc/loadavg not readable)"
    return 0
  fi

  local load1="" load5="" _load15=""
  read -r load1 load5 _load15 _ < /proc/loadavg || true

  # The CPU count is informational only (shown in the CRITICAL message).
  local ncpu
  ncpu=$(nproc 2>/dev/null) || ncpu="?"

  # Bash arithmetic is integer-only, so the decimal comparison is done in a
  # single awk call (no bc dependency, no loss of precision from scaling).
  local verdict
  verdict=$(awk -v load="$load1" -v warn="$LOAD_WARN" -v crit="$LOAD_CRIT" 'BEGIN {
    if      (load + 0 >= crit + 0) print "CRIT"
    else if (load + 0 >= warn + 0) print "WARN"
    else                           print "OK"
  }')

  case "$verdict" in
    CRIT) record load CRIT "$load1" "Load avg $load1 — CRITICAL (>${LOAD_CRIT}, ${ncpu} CPUs)" ;;
    WARN) record load WARN "$load1" "Load avg $load1 — WARNING (>${LOAD_WARN})" ;;
    *)    record load OK "$load1 (1m) / $load5 (5m)" "Load OK ($load1)" ;;
  esac
}
#######################################
# Check how much RAM is available, in whole GB (rounded down).
# Globals:   RAM_FREE_WARN_GB, RAM_FREE_CRIT_GB (read)
# Outputs:   one `record` call under the key "ram"
# Returns:   0
#######################################
check_memory() {
  header "MEMORY"

  local available_kb="" total_kb=""
  # Single pass over /proc/meminfo. If the MemAvailable line is absent,
  # approximate it as MemFree + Buffers + Cached rather than reporting 0.
  read -r available_kb total_kb < <(
    awk '
      /^MemTotal:/     { total = $2 }
      /^MemAvailable:/ { avail = $2; have_avail = 1 }
      /^MemFree:/      { free = $2 }
      /^Buffers:/      { buffers = $2 }
      /^Cached:/       { cached = $2 }
      END {
        if (!have_avail) avail = free + buffers + cached
        print avail + 0, total + 0
      }
    ' /proc/meminfo 2>/dev/null
  ) || true
  available_kb=${available_kb:-0}
  total_kb=${total_kb:-0}

  # kB -> GB with integer division; flooring is consistent with the
  # strict "<" comparisons against whole-GB thresholds below.
  local available_gb=$(( available_kb / 1024 / 1024 ))
  local total_gb=$(( total_kb / 1024 / 1024 ))

  if (( available_gb < RAM_FREE_CRIT_GB )); then
    record ram CRIT "${available_gb}G avail" "RAM available ${available_gb}G — CRITICAL (<${RAM_FREE_CRIT_GB}G)"
  elif (( available_gb < RAM_FREE_WARN_GB )); then
    record ram WARN "${available_gb}G avail" "RAM available ${available_gb}G — WARNING (<${RAM_FREE_WARN_GB}G)"
  else
    record ram OK "${available_gb}G / ${total_gb}G" "RAM OK (${available_gb}G available)"
  fi
}
#######################################
# Check that swap exists and how much of it is in use (whole GB).
# Globals:   SWAP_USED_WARN_GB, SWAP_USED_CRIT_GB (read)
# Outputs:   one `record` call under the key "swap"
# Returns:   0
#######################################
check_swap() {
  header "SWAP"

  local swap_total_kb="" swap_free_kb=""
  read -r swap_total_kb swap_free_kb < <(
    awk '
      /^SwapTotal:/ { total = $2 }
      /^SwapFree:/  { free = $2 }
      END           { print total + 0, free + 0 }
    ' /proc/meminfo 2>/dev/null
  ) || true
  swap_total_kb=${swap_total_kb:-0}
  swap_free_kb=${swap_free_kb:-0}

  # A host without any swap is treated as critical by this script.
  if (( swap_total_kb == 0 )); then
    record swap CRIT "no swap" "NO SWAP configured — CRITICAL (add swapfile!)"
    return 0
  fi

  local swap_used_kb=$(( swap_total_kb - swap_free_kb ))
  local swap_total_gb=$(( swap_total_kb / 1024 / 1024 ))
  local swap_used_gb=$(( swap_used_kb / 1024 / 1024 ))

  if (( swap_used_gb >= SWAP_USED_CRIT_GB )); then
    record swap CRIT "${swap_used_gb}G used" "Swap ${swap_used_gb}G used — CRITICAL"
  elif (( swap_used_gb >= SWAP_USED_WARN_GB )); then
    record swap WARN "${swap_used_gb}G used" "Swap ${swap_used_gb}G used — WARNING"
  else
    record swap OK "${swap_used_gb}G / ${swap_total_gb}G" "Swap OK (${swap_used_gb}G used)"
  fi
}
#######################################
# Check Docker containers for crash loops and failing health checks.
# Globals:   CONTAINER_RESTART_WARN, CONTAINER_RESTART_CRIT (read)
# Outputs:   `record` calls under "container_loops" and "container_health",
#            or a single "docker_daemon" warning when Docker is unusable
# Returns:   0
#######################################
check_docker_containers() {
  header "DOCKER CONTAINERS"

  if ! command -v docker &>/dev/null || ! docker info &>/dev/null; then
    record docker_daemon WARN "not running" "Docker daemon is not running"
    return 0
  fi

  # Crash-looping containers.
  # The restart counter is exposed by `docker inspect`; `docker ps --format`
  # has no RestartCount placeholder, so asking it for one only yields a
  # template error (which used to be discarded, hiding every crash loop).
  local -a container_ids=() looping_warn=() looping_crit=()
  local name restarts
  mapfile -t container_ids < <(docker ps -aq 2>/dev/null || true)

  if (( ${#container_ids[@]} > 0 )); then
    while IFS=$'\t' read -r name restarts; do
      name=${name#/}   # inspect reports names with a leading slash
      # Ignore anything that is not a "<name><TAB><integer>" row.
      if [[ -z "$name" || ! "$restarts" =~ ^[0-9]+$ ]]; then
        continue
      fi
      if (( restarts >= CONTAINER_RESTART_CRIT )); then
        looping_crit+=("$name(${restarts}x)")
      elif (( restarts >= CONTAINER_RESTART_WARN )); then
        looping_warn+=("$name(${restarts}x)")
      fi
    done < <(
      docker inspect --format $'{{.Name}}\t{{.RestartCount}}' \
        "${container_ids[@]}" 2>/dev/null || true
    )
  fi

  # Critical offenders take precedence; warnings are listed only when no
  # container has crossed the critical threshold.
  if (( ${#looping_crit[@]} > 0 )); then
    record container_loops CRIT "${looping_crit[*]}" "Crash-looping containers: ${looping_crit[*]}"
  elif (( ${#looping_warn[@]} > 0 )); then
    record container_loops WARN "${looping_warn[*]}" "Containers restarting: ${looping_warn[*]}"
  else
    record container_loops OK "0 looping" "No containers crash-looping"
  fi

  # Unhealthy containers (running but health check failing)
  local unhealthy
  unhealthy=$(docker ps --filter health=unhealthy --format '{{.Names}}' 2>/dev/null \
    | paste -sd, - || true)
  if [[ -n "$unhealthy" ]]; then
    record container_health WARN "$unhealthy" "Unhealthy containers: $unhealthy"
  else
    record container_health OK "all healthy" "All containers passing healthchecks"
  fi
}
#######################################
# Convert a Docker human-readable size ("512MB", "1.043GB", "0B") to GB.
# Docker sizes use decimal units, so 1GB = 1000MB.
# Arguments: $1 - size string as printed by `docker system df`
# Outputs:   "<GB as decimal> <GB rounded down to an integer>"
#            ("0 0" when the unit is not recognised)
#######################################
_docker_size_gb() {
  awk -v size="$1" 'BEGIN {
    gsub(/[[:space:]]/, "", size)     # tolerate a space between number and unit
    value = size + 0
    unit = toupper(size)
    sub(/^[0-9.]+/, "", unit)
    if      (unit == "B")  gb = value / 1e9
    else if (unit == "KB") gb = value / 1e6
    else if (unit == "MB") gb = value / 1e3
    else if (unit == "GB") gb = value
    else if (unit == "TB") gb = value * 1e3
    else                   gb = 0
    printf "%.4g %d\n", gb, gb
  }'
}

#######################################
# Check disk space consumed by the Docker build cache and by images.
# Globals:   BUILD_CACHE_WARN_GB, BUILD_CACHE_CRIT_GB (read)
#            DOCKER_IMAGES_WARN_GB, DOCKER_IMAGES_CRIT_GB (read; 15/25 if unset)
# Outputs:   `record` calls under "build_cache" and "docker_images";
#            nothing when Docker is unavailable
# Returns:   0
#######################################
check_docker_disk() {
  header "DOCKER DISK"

  if ! command -v docker &>/dev/null || ! docker info &>/dev/null; then
    return 0
  fi

  # One `docker system df` call, one row per resource type. Type and Size
  # are requested explicitly so parsing does not depend on table columns
  # (column 3 of the default table is the ACTIVE count, not the size).
  local df_rows row_type row_size
  local cache_size="0B" images_size="?"
  df_rows=$(docker system df --format $'{{.Type}}\t{{.Size}}' 2>/dev/null || true)
  while IFS=$'\t' read -r row_type row_size; do
    case "$row_type" in
      "Build Cache") cache_size=$row_size ;;
      "Images")      images_size=$row_size ;;
    esac
  done <<< "$df_rows"

  # Build cache
  local cache_size_gb="" cache_int=""
  read -r cache_size_gb cache_int < <(_docker_size_gb "$cache_size") || true
  cache_size_gb=${cache_size_gb:-0}
  cache_int=${cache_int:-0}

  if (( cache_int >= BUILD_CACHE_CRIT_GB )); then
    record build_cache CRIT "${cache_size_gb}GB" "Docker build cache ${cache_size_gb}GB — CRITICAL (>${BUILD_CACHE_CRIT_GB}GB, run: docker builder prune -f)"
  elif (( cache_int >= BUILD_CACHE_WARN_GB )); then
    record build_cache WARN "${cache_size_gb}GB" "Docker build cache ${cache_size_gb}GB — WARNING (run: docker builder prune -f)"
  else
    record build_cache OK "${cache_size_gb}GB" "Build cache OK (${cache_size_gb}GB)"
  fi

  # Images total size, compared against the image thresholds declared with
  # the other thresholds at the top of the script.
  local images_warn=${DOCKER_IMAGES_WARN_GB:-15}
  local images_crit=${DOCKER_IMAGES_CRIT_GB:-25}
  local _images_gb="" images_int=""
  if [[ "$images_size" != "?" ]]; then
    read -r _images_gb images_int < <(_docker_size_gb "$images_size") || true
  fi
  images_int=${images_int:-0}

  if (( images_int >= images_crit )); then
    record docker_images CRIT "$images_size" "Docker images $images_size — CRITICAL (>${images_crit}GB, run: docker image prune)"
  elif (( images_int >= images_warn )); then
    record docker_images WARN "$images_size" "Docker images $images_size — WARNING (run: docker image prune)"
  else
    record docker_images OK "$images_size" "Docker images: $images_size"
  fi
}
#######################################
# Check the size of the systemd journal and of /var/log/syslog.
# Outputs:   `record` calls under "journal" (warns above 300 MB) and
#            "syslog" (warns above 100 MB)
# Returns:   0
#######################################
check_logs() {
  header "LOGS"

  # `journalctl --disk-usage` prints a sentence containing a size token such
  # as "104.0M" or "1.2G" (number and unit joined). Pick that token and
  # convert it to whole MB so that Bash integer arithmetic can compare it.
  local usage_line journal_mb
  usage_line=$(journalctl --disk-usage 2>/dev/null || true)
  journal_mb=$(awk '
    {
      for (i = 1; i <= NF; i++) {
        if ($i ~ /^[0-9]+(\.[0-9]+)?[BKMGT]$/) {
          amount = $i + 0
          unit = substr($i, length($i))
          if      (unit == "B") amount /= 1048576
          else if (unit == "K") amount /= 1024
          else if (unit == "G") amount *= 1024
          else if (unit == "T") amount *= 1048576
          printf "%d", amount
          found = 1
          exit
        }
      }
    }
    END { if (!found) printf "0" }
  ' <<< "$usage_line")
  journal_mb=${journal_mb:-0}

  local syslog_mb=0
  if [[ -f /var/log/syslog ]]; then
    syslog_mb=$(du -sm /var/log/syslog 2>/dev/null | cut -f1) || syslog_mb=0
  fi
  # Guard the arithmetic below against empty or unexpected du output.
  if [[ ! "$syslog_mb" =~ ^[0-9]+$ ]]; then
    syslog_mb=0
  fi

  if (( journal_mb > 300 )); then
    record journal WARN "${journal_mb}MB" "Journal ${journal_mb}MB — WARNING (run: journalctl --vacuum-size=200M)"
  else
    record journal OK "${journal_mb}MB" "Journal OK (${journal_mb}MB)"
  fi

  if (( syslog_mb > 100 )); then
    record syslog WARN "${syslog_mb}MB" "syslog ${syslog_mb}MB — WARNING"
  else
    record syslog OK "${syslog_mb}MB" "syslog OK (${syslog_mb}MB)"
  fi
}
#######################################
# Detect automation that has drifted: failed systemd units, and crontab
# entries (of the invoking user) that reference scripts which no longer exist.
# Outputs:   `record` calls under "failed_units" and "cron_missing_paths"
# Returns:   0
#######################################
check_automation_drift() {
  header "AUTOMATION DRIFT"

  local failed_units
  failed_units=$(systemctl --failed --no-legend --plain 2>/dev/null \
    | awk '{print $1}' | paste -sd, - || true)
  if [[ -n "$failed_units" ]]; then
    record failed_units WARN "$failed_units" "Failed systemd units: $failed_units"
  else
    record failed_units OK "0 failed" "No failed systemd units"
  fi

  local -a missing_paths=()
  local -A reported=()
  local cron_line candidate clean_path
  # Regexes are kept in variables so the shell never re-parses their
  # metacharacters inside [[ … =~ … ]].
  local comment_re='^[[:space:]]*#'
  local assignment_re='^[A-Za-z_][A-Za-z0-9_]*='
  local trailing_punct_re='["'\'',;)]$'

  while IFS= read -r cron_line; do
    # Skip blank lines, comments and environment assignments (e.g. MAILTO=…).
    if [[ -z "$cron_line" || "$cron_line" =~ $comment_re ]]; then
      continue
    fi
    if [[ "$cron_line" =~ $assignment_re ]]; then
      continue
    fi

    while IFS= read -r candidate; do
      # Peel off every trailing quote/comma/semicolon/paren, in any order,
      # so that `x.sh")` is reduced to `x.sh` rather than `x.sh"`.
      clean_path=$candidate
      while [[ "$clean_path" =~ $trailing_punct_re ]]; do
        clean_path=${clean_path%?}
      done

      # Paths under these prefixes are never treated as scripts.
      case "$clean_path" in
        /var/log/* | /run/* | /proc/* | /sys/* | /dev/*) continue ;;
      esac

      # Only things that look like scripts are expected to exist.
      if [[ "$clean_path" == *.sh || "$clean_path" == *.py || "$clean_path" == */scripts/* ]]; then
        if [[ ! -e "$clean_path" && -z "${reported[$clean_path]+x}" ]]; then
          reported[$clean_path]=1   # report each missing path once
          missing_paths+=("$clean_path")
        fi
      fi
    done < <(grep -oE '/(opt|root|home|usr/local|etc)/[^[:space:]|;&<>]+' <<< "$cron_line" || true)
  done < <(crontab -l 2>/dev/null || true)

  if (( ${#missing_paths[@]} > 0 )); then
    record cron_missing_paths WARN "${missing_paths[*]}" "Cron references missing path(s): ${missing_paths[*]}"
  else
    record cron_missing_paths OK "0 missing" "No missing script paths in root crontab"
  fi
}
# ── Run all checks ───────────────────────────────────────────────────────────

# The banner is for human readers only: it is suppressed in --json and
# --quiet modes.
if ! $JSON_MODE && ! $QUIET; then
  printf '\n%s\n' "${BOLD}VM Health Check — $(hostname) — $(date -u '+%Y-%m-%d %H:%M UTC')${NC}"
fi

# Each check reports through `record`, which accumulates WORST_LEVEL,
# ISSUES and JSON_DATA for the summary further down.
check_disk
check_load
check_memory
check_swap
check_docker_containers
check_docker_disk
check_logs
check_automation_drift
# ── Summary ──────────────────────────────────────────────────────────────────

#######################################
# Print the final report: a JSON document in --json mode, otherwise a
# coloured text summary listing every non-OK finding.
# Globals:   JSON_MODE, WORST_LEVEL, ISSUES, JSON_DATA (read)
#            GREEN, YELLOW, RED, BOLD, NC (read)
# Outputs:   the summary on stdout
# Returns:   0
#######################################
print_summary() {
  local issue

  if [[ "$JSON_MODE" == true ]]; then
    local overall="OK"
    if (( WORST_LEVEL >= 2 )); then
      overall="CRIT"
    elif (( WORST_LEVEL == 1 )); then
      overall="WARN"
    fi

    # Associative arrays have no defined iteration order; sorting the keys
    # makes the document stable from run to run.
    local -a check_keys=()
    if (( ${#JSON_DATA[@]} > 0 )); then
      mapfile -t check_keys < <(printf '%s\n' "${!JSON_DATA[@]}" | LC_ALL=C sort)
    fi

    printf '{\n'
    printf '  "timestamp": "%s",\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
    printf '  "hostname": "%s",\n' "$(hostname)"
    printf '  "overall": "%s",\n' "$overall"
    printf '  "checks": {\n'
    local idx separator
    local last_idx=$(( ${#check_keys[@]} - 1 ))
    for idx in "${!check_keys[@]}"; do
      # Every entry except the last is followed by a comma.
      separator=","
      if (( idx == last_idx )); then
        separator=""
      fi
      printf '    "%s": %s%s\n' \
        "${check_keys[$idx]}" "${JSON_DATA[${check_keys[$idx]}]}" "$separator"
    done
    printf '  }\n'
    printf '}\n'
    return 0
  fi

  # printf '%s' prints issue text verbatim (echo -e would interpret any
  # backslashes contained in container names or paths).
  printf '\n'
  if (( WORST_LEVEL == 0 )); then
    printf '%s\n' " ${GREEN}${BOLD}✓ All checks passed${NC}"
  elif (( WORST_LEVEL == 1 )); then
    printf '%s\n' " ${YELLOW}${BOLD}⚠ ${#ISSUES[@]} warning(s):${NC}"
    for issue in "${ISSUES[@]}"; do
      printf '%s\n' " ${YELLOW}→${NC} $issue"
    done
  else
    printf '%s\n' " ${RED}${BOLD}✗ ${#ISSUES[@]} issue(s) — action required:${NC}"
    for issue in "${ISSUES[@]}"; do
      printf '%s\n' " ${RED}→${NC} $issue"
    done
  fi
  printf '\n'
}

print_summary
# ── Telegram notification ─────────────────────────────────────────────────

#######################################
# Read the value of KEY=value from a dotenv-style file.
# Arguments: $1 - key name, $2 - file path
# Outputs:   the first matching value, up to the first whitespace and with
#            one pair of surrounding quotes removed; empty when absent
#######################################
_env_value() {
  local key=$1 file=$2 line value=""
  while IFS= read -r line || [[ -n "$line" ]]; do
    line=${line#"${line%%[![:space:]]*}"}   # trim leading whitespace
    line=${line#export }
    if [[ "$line" == "$key="* ]]; then
      value=${line#"$key="}
      value=${value%%[[:space:]]*}          # also drops a trailing CR
      value=${value#[\"\']}
      value=${value%[\"\']}
      break
    fi
  done < "$file"
  printf '%s' "$value"
}

#######################################
# Send the list of issues to Telegram when --notify was given and at least
# one check is WARN or CRIT. Strictly best effort: missing credentials or a
# failed request never change the script's result.
# Globals:   NOTIFY, WORST_LEVEL, ISSUES (read); HERMES_HOME (read, optional)
# Returns:   0
#######################################
send_telegram_alert() {
  if [[ "$NOTIFY" != true ]] || (( WORST_LEVEL == 0 )); then
    return 0
  fi

  local token_file="${HERMES_HOME:-/root/.hermes}/.env"
  local token="" chat_id=""
  if [[ -f "$token_file" ]]; then
    token=$(_env_value TELEGRAM_BOT_TOKEN "$token_file")
    chat_id=$(_env_value TELEGRAM_CHAT_ID "$token_file")
  fi
  if [[ -z "$token" || -z "$chat_id" ]]; then
    return 0
  fi

  local severity="⚠️ WARNING"
  if (( WORST_LEVEL >= 2 )); then
    severity="🚨 CRITICAL"
  fi

  local msg
  msg="$severity — $(hostname) VM health check
$(date -u '+%Y-%m-%d %H:%M UTC')

$(printf '%s\n' "${ISSUES[@]}")"
  # Telegram rejects over-long messages; keep a margin below its limit.
  msg=${msg:0:4000}

  # --data-urlencode is required: the text contains "%", "&", spaces and
  # newlines, which plain -d would send unencoded and thereby corrupt.
  curl -sf -X POST "https://api.telegram.org/bot${token}/sendMessage" \
    --data-urlencode "chat_id=${chat_id}" \
    --data-urlencode "text=${msg}" > /dev/null || true
}

send_telegram_alert
# ── Log result ───────────────────────────────────────────────────────────────

# Translate the numeric severity into the label written to the log file.
if (( WORST_LEVEL >= 2 )); then
  RESULT_STR="CRIT"
elif (( WORST_LEVEL == 1 )); then
  RESULT_STR="WARN"
else
  RESULT_STR="OK"
fi
log_to_file "health-check result=$RESULT_STR issues=${#ISSUES[@]}"

# The exit status mirrors the worst severity: 0 = OK, 1 = WARN, 2 = CRIT.
exit "$WORST_LEVEL"