From 0a2d303f937694872b700f3cea5eb10809f5af09 Mon Sep 17 00:00:00 2001
From: Hermes VM <root@srv1491630>
Date: Wed, 27 May 2026 12:02:19 +0000
Subject: [PATCH] add HostingerVM health-check and cleanup scripts

- vm-health-check.sh: read-only checks for disk, load, RAM, swap,
  Docker containers (crash-loops + healthchecks), build cache, journal.
  Flags: --quiet, --json, --notify (Telegram). Exit 0/1/2 = OK/WARN/CRIT.

- vm-cleanup.sh: safe periodic cleanup.
  Default (weekly): build cache, journal, apt, npm, .next/cache.
  --full (monthly): adds docker system prune, pnpm store, old logs, HOLD cleanup.
  --dry-run, --install-cron, --uninstall-cron.
  Logs to /var/log/vm-cleanup.log.

Related: docs/hostinger-vm-maintenance.md, scripts/VMs/HostingerVM/CRON_SETUP.md

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 docs/hostinger-vm-maintenance.md           | 235 +++++++++++++
 scripts/VMs/HostingerVM/CRON_SETUP.md      | 150 ++++++++
 scripts/VMs/HostingerVM/vm-cleanup.sh      | 377 +++++++++++++++++++++
 scripts/VMs/HostingerVM/vm-health-check.sh | 337 ++++++++++++++++++
 4 files changed, 1099 insertions(+)
 create mode 100644 docs/hostinger-vm-maintenance.md
 create mode 100644 scripts/VMs/HostingerVM/CRON_SETUP.md
 create mode 100755 scripts/VMs/HostingerVM/vm-cleanup.sh
 create mode 100755 scripts/VMs/HostingerVM/vm-health-check.sh

diff --git a/docs/hostinger-vm-maintenance.md b/docs/hostinger-vm-maintenance.md
new file mode 100644
index 0000000..aba0a23
--- /dev/null
+++ b/docs/hostinger-vm-maintenance.md
@@ -0,0 +1,235 @@
+# Hostinger VM — Maintenance & Incident Reference
+
+**VM:** `srv1491630.hstgr.cloud` · root · 4× AMD EPYC · 15 GB RAM · 193 GB disk
+**Key services:** `hermes-gateway`, `ollama`, Docker (~40 containers), `learning_ai_common_plat` stack
+
+---
+
+## Quick-start for day-to-day ops
+
+```bash
+# Check VM health (read-only, safe any time)
+bash scripts/VMs/HostingerVM/vm-health-check.sh
+
+# Weekly safe cleanup
+bash scripts/VMs/HostingerVM/vm-cleanup.sh
+
+# Monthly deeper cleanup
+bash scripts/VMs/HostingerVM/vm-cleanup.sh --full
+
+# Cron setup (run once)
+bash scripts/VMs/HostingerVM/vm-cleanup.sh --install-cron
+```
+
+See [`CRON_SETUP.md`](../scripts/VMs/HostingerVM/CRON_SETUP.md) for full details.
+
+---
+
+## Incident Report — Load Average 1305 (2026-05-26)
+
+### What happened
+
+The VM became completely unresponsive. Load average reached **1305** (normal < 4 on 4 CPUs).
+
+```
+load average: 1305.54, 1339.23, 1302.41
+RAM: 13 / 15 GB used, ZERO swap configured
+```
+
+**Single root cause:** one broken Docker container crash-looped **1,336 times** over ~25 hours.
+
+Container: `learning_ai_common_plat-admin-web-1`
+Error: `Cannot find module '/app/server.js'`
+Restart policy: `unless-stopped` (no backoff limit, retries forever)
+
+Each restart spawned ~3 OS processes:
+- `containerd-shim-runc-v2`
+- veth network interface creation
+- `networkctl` call for the new interface
+
+With 1,336 restarts × ~3 procs = **~4,000 processes** — the kernel scheduler thrashed → load 1305.
+
+### Why the container was broken
+
+The `admin-web` Docker image had no `server.js` because its Next.js build failed silently. Three bugs stacked:
+
+| Bug | File | Detail |
+|-----|------|--------|
+| Missing build secret | `docker-compose.ecosystem.yml` | `admin-web` service was missing `<<: *product-build` anchor, so `GITEA_NPM_TOKEN` was never passed as a BuildKit secret → `pnpm install` of `@bytelyst/*` packages failed |
+| Missing COPY step | `dashboards/admin-web/Dockerfile` | `tsconfig.base.json` (monorepo root) was not copied into the build context → `tsc` couldn't find it → build failed |
+| Wrong pnpm flag | `dashboards/admin-web/Dockerfile` | `--legacy-peer-deps` is an npm flag, not valid in pnpm 10 → install step exited early |
+
+Because the build stage failed, `COPY --from=builder .next/standalone ./` copied nothing, leaving the runner stage with an empty `/app` — no `server.js`.
+
+### Timeline
+
+| Time (UTC) | Event |
+|---|---|
+| 2026-05-26 04:43 | VM booted, Docker started |
+| 2026-05-26 04:56 | `admin-web` first restart (count=1) |
+| 2026-05-26 ~05:00–06:07 | Load climbs steadily, RAM fills |
+| 2026-05-26 ~ongoing | 1,336 restarts over 25 hours |
+| 2026-05-27 06:07 | VM rebooted (load avg recorded: 1305) |
+| 2026-05-27 06:28 | Diagnosis session started (load: 0.55 after reboot) |
+| 2026-05-27 08:20 | All fixes applied, cleanup complete |
+
+### Secondary problems found
+
+| Issue | Detail |
+|---|---|
+| **No swap** | Zero swap configured — OOM kills inevitable under memory pressure |
+| **84 GB Docker build cache** | Never pruned; Next.js/TSC builds accumulate enormous layer cache |
+| **12 GB HOLD node_modules** | Archived projects in `/opt/bytelyst/HOLD` had deps never cleaned up |
+| **~3 GB .next/cache** | Build-time caches in active and HOLD repos |
+| **381 MB uncompressed logs** | `syslog.1`, `kern.log.1` not compressed; no size/retention limits on journal |
+| **No crash-loop detection** | Nothing alerting on containers restarting > N times |
+
+---
+
+## Fixes Applied (2026-05-27)
+
+### 1. Crash loop — stopped
+
+Patched `/var/lib/docker/containers/2219091e.../hostconfig.json` while Docker was stopped:
+```json
+"RestartPolicy": {"Name": "no", "MaximumRetryCount": 0}
+```
+
+Container is now permanently stopped. Admin-web needs a proper rebuild before re-enabling.
+
+### 2. Swap — added
+
+```bash
+fallocate -l 4G /swapfile
+chmod 600 /swapfile
+mkswap /swapfile
+swapon /swapfile
+echo '/swapfile none swap sw 0 0' >> /etc/fstab
+sysctl vm.swappiness=10
+echo 'vm.swappiness=10' >> /etc/sysctl.conf
+```
+
+### 3. Disk — 79 GB reclaimed (70% → 27%)
+
+| Action | Freed |
+|---|---|
+| `docker builder prune -f` | 84 GB |
+| `docker system prune -f` | 107 MB |
+| HOLD node_modules deleted | ~12 GB |
+| HOLD `.next` build caches | ~1.2 GB |
+| Active `.next/cache` dirs | ~2.4 GB |
+| Old Claude CLI versions | ~940 MB |
+| npm cache clean | ~1.8 GB |
+| Journal vacuum | ~220 MB |
+| apt clean | ~280 MB |
+
+### 4. Log management
+
+`/etc/systemd/journald.conf.d/size-limits.conf`:
+```ini
+[Journal]
+SystemMaxUse=200M
+SystemKeepFree=1G
+MaxRetentionSec=7day
+MaxFileSec=1day
+```
+
+`/etc/rsyslog.d/20-ufw-filter.conf`:
+```
+:msg, contains, "[UFW BLOCK]" stop
+```
+
+`/etc/logrotate.d/rsyslog-custom`: daily rotation, 7-day retention, compress-on-rotate.
+
+### 5. Dockerfile fixes (ready, not yet deployed)
+
+`docker-compose.ecosystem.yml` — added `<<: *product-build` to `admin-web` build section
+`dashboards/admin-web/Dockerfile` — added `tsconfig.base.json` to COPY, removed `--legacy-peer-deps`
+
+---
+
+## Deploying admin-web (when ready)
+
+```bash
+cd /opt/bytelyst/learning_ai_common_plat
+GITEA_NPM_TOKEN=$(cat ~/.gitea_npm_token) \
+  docker compose -f docker-compose.ecosystem.yml --env-file .env.ecosystem \
+  build admin-web
+
+# Verify the standalone build was produced:
+docker run --rm --entrypoint ls \
+  learning_ai_common_plat-admin-web:latest /app | grep server.js
+
+# Start it:
+docker compose -f docker-compose.ecosystem.yml --env-file .env.ecosystem \
+  up -d admin-web
+```
+
+The container's restart policy will be set by the compose file (`unless-stopped`). Once the image is healthy, this is safe.
+
+---
+
+## Ongoing health targets
+
+| Metric | Healthy | Warning | Critical |
+|---|---|---|---|
+| Disk usage `/` | < 55% | 55–70% | > 70% |
+| Load average | < 4.0 | 4–8 | > 8 |
+| Available RAM | > 3 GB | 1–3 GB | < 1 GB |
+| Swap used | < 1 GB | 1–3 GB | > 3 GB |
+| Container restart count | < 10 | 10–50 | > 50 |
+| Docker build cache | < 5 GB | 5–20 GB | > 20 GB |
+
+---
+
+## Reference: safe cleanup commands
+
+```bash
+# Always safe (just prunes unreferenced build layers)
+docker builder prune -f
+
+# Safe: removes stopped containers, unused networks, dangling images only
+docker system prune -f
+
+# Safe: removes packages not referenced by any installed node_modules
+pnpm store prune
+
+# Safe: vacuum journal to size limit
+journalctl --vacuum-size=200M
+
+# Safe: clear apt cache
+apt-get clean
+
+# Safe: clear npm cache
+npm cache clean --force
+
+# Careful: removes ALL images not used by any container, running or stopped (rebuilds needed)
+docker image prune -a -f
+```
+
+---
+
+## Crash-loop detection (manual check)
+
+```bash
+# Show containers that have restarted more than 10 times (RestartCount comes from `docker inspect`)
+docker ps -aq | xargs -r docker inspect --format '{{.Name}} {{.RestartCount}}' \
+  | awk '$2 > 10 {print "⚠️ LOOP:", $1, "restarts:", $2}'
+
+# Stream container restart events (replays the last hour, then follows live; Ctrl-C to stop)
+docker events --filter event=restart --since 1h
+```
+
+The `vm-health-check.sh` script runs these checks automatically.
+
+---
+
+## Related scripts
+
+| Script | Purpose |
+|---|---|
+| `scripts/VMs/HostingerVM/vm-health-check.sh` | Daily read-only health check + alerts |
+| `scripts/VMs/HostingerVM/vm-cleanup.sh` | Periodic safe cleanup |
+| `scripts/VMs/HostingerVM/CRON_SETUP.md` | Cron wiring |
+| `scripts/ubuntu-vm-security-update.sh` | Security patching |
+| `scripts/VMs/HostingerVM/login.sh` | SSH into the VM |
diff --git a/scripts/VMs/HostingerVM/CRON_SETUP.md b/scripts/VMs/HostingerVM/CRON_SETUP.md
new file mode 100644
index 0000000..47c4617
--- /dev/null
+++ b/scripts/VMs/HostingerVM/CRON_SETUP.md
@@ -0,0 +1,150 @@
+# Hostinger VM — Cron Setup
+
+Automated maintenance schedule for `srv1491630`.
+Scripts: `vm-health-check.sh` (read-only) + `vm-cleanup.sh` (safe cleanup).
+
+---
+
+## Quick install
+
+SSH into the VM and run:
+
+```bash
+bash /opt/bytelyst/learning_ai_devops_tools/scripts/VMs/HostingerVM/vm-cleanup.sh --install-cron
+```
+
+This installs the full recommended schedule. To remove it:
+
+```bash
+bash /opt/bytelyst/learning_ai_devops_tools/scripts/VMs/HostingerVM/vm-cleanup.sh --uninstall-cron
+```
+
+---
+
+## What gets scheduled
+
+| Schedule | Time (UTC) | Command | What it does |
+|---|---|---|---|
+| Daily | 07:00 | `vm-health-check.sh` | Read-only check; sends Telegram alert on WARNING/CRITICAL |
+| Daily | 03:00 | `vm-cleanup.sh` | Standard cleanup — same steps as the weekly run, including the build cache prune (the script has no build-cache-only mode) |
+| Weekly | Sun 02:00 | `vm-cleanup.sh` | Standard cleanup (see below) |
+| Monthly | 1st 01:00 | `vm-cleanup.sh --full` | Full cleanup (see below) |
+
+---
+
+## What each mode does
+
+### Standard weekly cleanup (`vm-cleanup.sh`)
+
+All steps are labelled **SAFE** — they only remove regenerable caches.
+
+| Step | What's removed | Risk |
+|---|---|---|
+| Docker build cache | Layer cache from `docker build` runs | Zero — rebuilds just take longer next time |
+| Crash loop check | Detection only, no changes | Zero |
+| Journal vacuum | Old journal entries beyond 200MB / 7 days | Zero — logs are already captured in syslog |
+| APT cache | `/var/cache/apt/archives/` | Zero — packages can be re-downloaded |
+| NPM cache | `~/.npm/_cacache/` | Zero — cache is re-populated on next `npm install` |
+| `.next/cache` | Webpack/babel/TSC build cache dirs | Zero — rebuilt automatically on next `next build` |
+
+### Monthly full cleanup (`vm-cleanup.sh --full`)
+
+Adds these **CAREFUL** steps on top of the standard run:
+
+| Step | What's removed | Risk |
+|---|---|---|
+| Docker system prune | Stopped containers, unused networks, dangling images | Low — does NOT remove images used by any container |
+| pnpm store prune | Packages not referenced by any `node_modules` | Low — only removes truly orphaned packages |
+| Old log files | `.gz` log rotations older than 30 days | Low — old compressed logs |
+| HOLD node_modules | `node_modules` in `/opt/bytelyst/HOLD` archived projects | Low — code intact, can reinstall with `pnpm install` |
+
+### Never touched (by design)
+
+- `/opt/bytelyst/*/node_modules` (active repos)
+- `/opt/bytelyst/*/src`, `/app`, `/backend`, `/web` source code
+- `.next/standalone` (production Next.js builds)
+- Docker images used by currently configured containers
+- `/usr/local/lib/hermes-agent/`
+- `/usr/share/ollama/` (models)
+- `/swapfile`
+- Any database volumes
+
+---
+
+## Manual crontab (if you prefer not to use --install-cron)
+
+```
+# Health check daily 07:00 UTC
+0 7 * * * bash /opt/bytelyst/learning_ai_devops_tools/scripts/VMs/HostingerVM/vm-health-check.sh --quiet --notify 2>&1 | logger -t vm-health
+
+# Standard cleanup daily 03:00 UTC (includes the build cache prune)
+0 3 * * * bash /opt/bytelyst/learning_ai_devops_tools/scripts/VMs/HostingerVM/vm-cleanup.sh --quiet 2>&1 | logger -t vm-cleanup
+
+# Standard weekly cleanup Sunday 02:00 UTC
+0 2 * * 0 bash /opt/bytelyst/learning_ai_devops_tools/scripts/VMs/HostingerVM/vm-cleanup.sh --quiet 2>&1 | logger -t vm-cleanup
+
+# Full monthly cleanup 1st of month 01:00 UTC
+0 1 1 * * bash /opt/bytelyst/learning_ai_devops_tools/scripts/VMs/HostingerVM/vm-cleanup.sh --full --quiet 2>&1 | logger -t vm-cleanup
+```
+
+Edit with: `crontab -e`
+
+---
+
+## Monitoring logs
+
+```bash
+# Tail cleanup log
+tail -f /var/log/vm-cleanup.log
+
+# Tail health check log
+tail -f /var/log/vm-health-check.log
+
+# See all cron output via syslog
+grep vm-cleanup /var/log/syslog | tail -20
+grep vm-health /var/log/syslog | tail -20
+```
+
+---
+
+## Telegram alerts
+
+The health check script sends a Telegram message when it detects WARNING or CRITICAL.
+It reads credentials from `$HERMES_HOME/.env` (usually `/root/.hermes/.env`).
+
+Required keys in that file:
+```
+TELEGRAM_BOT_TOKEN=<your-bot-token>
+TELEGRAM_CHAT_ID=<your-chat-id>
+```
+
+Both are already set if Hermes gateway is configured. Test with:
+
+```bash
+bash /opt/bytelyst/learning_ai_devops_tools/scripts/VMs/HostingerVM/vm-health-check.sh --notify
+```
+
+---
+
+## Alert thresholds (from `vm-health-check.sh`)
+
+| Metric | WARNING | CRITICAL |
+|---|---|---|
+| Disk used `%` | > 55% | > 70% |
+| Load average | > 4.0 | > 8.0 |
+| RAM available | < 3 GB | < 1 GB |
+| Swap used | > 1 GB | > 3 GB |
+| Container restarts | > 10 | > 50 |
+| Build cache | > 5 GB | > 20 GB |
+
+Thresholds are constants at the top of each script — easy to adjust.
+
+---
+
+## What this schedule would have caught in the May 2026 incident
+
+If this cron had been running during the May 26 incident:
+
+- **07:00 daily health check** → `container_loops CRIT: admin-web(50x)` → Telegram alert sent within hours of the loop starting
+- **03:00 daily build cache prune** → would have kept build cache under 5 GB instead of growing to 84 GB
+- **Monthly full cleanup** → would have cleared the HOLD node_modules and old logs before they became a storage crisis
diff --git a/scripts/VMs/HostingerVM/vm-cleanup.sh b/scripts/VMs/HostingerVM/vm-cleanup.sh
new file mode 100755
index 0000000..5522b2d
--- /dev/null
+++ b/scripts/VMs/HostingerVM/vm-cleanup.sh
@@ -0,0 +1,377 @@
+#!/usr/bin/env bash
+# =============================================================================
+# vm-cleanup.sh — Hostinger VM Safe Periodic Cleanup
+#
+# Designed to be run manually or via cron. Nothing here prompts: every step
+# runs unattended and removes only regenerable caches, old logs, or archived
+# (HOLD) build artifacts. Use --dry-run to preview what a run would do.
+#
+# Modes:
+#   (default)      Weekly safe cleanup — build cache, apt, npm, journal, .next/cache
+#   --full         Monthly deeper cleanup — adds: docker system prune, pnpm store
+#                  prune, old log files, HOLD node_modules/.next removal
+#   --dry-run      Print what would be done, make no changes
+#   --quiet        Suppress console output (the log file is still written)
+#   --install-cron Install the recommended cron schedule for both scripts
+#   --uninstall-cron  Remove the installed cron jobs
+# Each step is labelled SAFE or CAREFUL in the output so you can audit what
+# ran (SKIP and DRY-RUN mark steps that changed nothing).
+#
+# Logs to: /var/log/vm-cleanup.log
+# =============================================================================
+set -Eeuo pipefail
+
+# ── Config ───────────────────────────────────────────────────────────────────
+LOG_FILE="/var/log/vm-cleanup.log"  # appended to by log(), log_header() and run_cmd()
+SCRIPT_PATH="$(realpath "${BASH_SOURCE[0]}")"  # absolute path; written into the cron lines
+SCRIPT_DIR="$(dirname "$SCRIPT_PATH")"
+HEALTH_CHECK="$SCRIPT_DIR/vm-health-check.sh"  # sibling script, run after a non-quiet cleanup
+
+# Paths that must NEVER be deleted even in --full mode. NOTE: declarative only — no step in this script reads the array.
+# shellcheck disable=SC2034
+PROTECTED_PATHS=(
+  "/opt/bytelyst/learning_ai_common_plat"
+  "/opt/bytelyst/learning_ai_devops_tools"
+  "/usr/local/lib/hermes-agent"
+  "/usr/share/ollama"
+  "/swapfile"
+)
+
+# node_modules dirs in active (non-HOLD) repos to never touch. NOTE: also unread; node_modules removal only searches /opt/bytelyst/HOLD.
+# shellcheck disable=SC2034
+ACTIVE_NODE_MODULES=(
+  "/opt/bytelyst/learning_ai_common_plat/node_modules"
+  "/opt/bytelyst/learning_ai_flowmonk/node_modules"
+  "/opt/bytelyst/learning_ai_clock/node_modules"
+  "/opt/bytelyst/learning_ai_notes/node_modules"
+  "/opt/bytelyst/learning_ai_devops_tools/dashboard/node_modules"
+  "/opt/bytelyst/learning_ai_invt_trdg/node_modules"
+)
+
+# ── Colour codes ─────────────────────────────────────────────────────────────
+RED=$'\033[0;31m'
+YELLOW=$'\033[1;33m'
+GREEN=$'\033[0;32m'
+CYAN=$'\033[0;36m'
+BOLD=$'\033[1m'
+DIM=$'\033[2m'
+NC=$'\033[0m'  # reset ("no colour")
+
+# ── Flags ────────────────────────────────────────────────────────────────────
+FULL_MODE=false
+DRY_RUN=false
+INSTALL_CRON=false
+UNINSTALL_CRON=false
+QUIET=false
+for arg in "$@"; do
+  case "$arg" in
+    --full)            FULL_MODE=true ;;
+    --dry-run)         DRY_RUN=true ;;
+    --install-cron)    INSTALL_CRON=true ;;
+    --uninstall-cron)  UNINSTALL_CRON=true ;;
+    --quiet)           QUIET=true ;;
+    # Reject anything else: a typo such as --dryrun must never fall through to a real run.
+    *) echo "ERROR: unknown option: $arg (valid: --full --dry-run --quiet --install-cron --uninstall-cron)" >&2; exit 2 ;;
+  esac
+done
+
+# ── Helpers ──────────────────────────────────────────────────────────────────
+# log MSG... — append a UTC-timestamped line to $LOG_FILE and echo MSG to the console unless --quiet.
+log() {
+  local stamp; stamp=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
+  echo "[$stamp] $*" >> "$LOG_FILE" 2>/dev/null || true
+  if ! $QUIET; then echo -e "$*"; fi
+}
+
+log_header() {  # log_header TITLE — section banner on the console (unless --quiet) and in $LOG_FILE
+  if ! $QUIET; then echo -e "\n${BOLD}${CYAN}── $1 ──────────────────────────────────────${NC}"; fi
+  echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] === $1 ===" >> "$LOG_FILE" 2>/dev/null || true
+}
+
+log_step() {  # log_step LABEL MSG — one colour-tagged console line; silent under --quiet or for unknown labels
+  $QUIET && return 0
+  case "$1" in
+    SAFE)    echo -e "  ${GREEN}[SAFE]${NC}    $2" ;;
+    CAREFUL) echo -e "  ${YELLOW}[CAREFUL]${NC} $2" ;;
+    SKIP)    echo -e "  ${DIM}[SKIP]${NC}    $2" ;;
+    DRY)     echo -e "  ${CYAN}[DRY-RUN]${NC} $2" ;;
+  esac
+}
+
+# run_cmd LABEL "description" cmd args... — announce a step, then run it best-effort.
+# Command output is appended to $LOG_FILE. Always returns 0: one failing step must not
+# abort the remaining cleanup, but its exit status is logged instead of being discarded.
+run_cmd() {
+  local label="$1" desc="$2" rc=0
+  shift 2
+  log_step "$label" "$desc"
+  if $DRY_RUN; then log_step DRY "would run: $*"; return 0; fi
+  log "[CMD] $*"
+  "$@" >> "$LOG_FILE" 2>&1 || rc=$?
+  if (( rc != 0 )); then log "[WARN] exit $rc from: $*"; fi
+}
+
+# Disk usage of / as one "used avail pcent" line (last df row, runs of spaces squeezed).
+_root_disk_usage() { df -h / --output=used,avail,pcent | tail -1 | tr -s ' '; }
+
+disk_before=""
+# Snapshot usage of / into $disk_before; called once before the first cleanup step.
+record_disk_before() { disk_before=$(_root_disk_usage); }
+
+# Print the before/after snapshots (unless --quiet) and always record both via log().
+report_disk_delta() {
+  local now
+  now=$(_root_disk_usage)
+  $QUIET || echo -e "\n  ${DIM}Before: $disk_before${NC}\n  ${GREEN}After:  $now${NC}"
+  log "[DISK] before=$disk_before after=$now"
+}
+
+# ── Safety guard ─────────────────────────────────────────────────────────────
+# Exit with status 1 and an error on stderr unless `id -u` reports 0.
+require_root() {
+  [[ "$(id -u)" -eq 0 ]] && return 0
+  echo -e "${RED}ERROR: This script must be run as root (use sudo)${NC}" >&2
+  exit 1
+}
+
+# ── Cron install/uninstall ───────────────────────────────────────────────────
+# Install (or refresh) the managed cron block in the invoking user's crontab; safe to re-run.
+do_install_cron() {
+  echo -e "\n${BOLD}Installing cron schedule…${NC}\n"
+  local cron_tag="# bytelyst-vm-maintenance" tmp_cron
+  tmp_cron=$(mktemp)
+
+  # Keep foreign crontab lines; drop everything a previous install wrote (tagged comment
+  # lines and the job lines themselves), otherwise each re-run would duplicate the jobs.
+  crontab -l 2>/dev/null | grep -vF -e "$cron_tag" -e "vm-health-check.sh" -e "vm-cleanup.sh" > "$tmp_cron" || true
+
+  # Every comment line carries $cron_tag so install/uninstall can strip the whole block.
+  cat >> "$tmp_cron" <<EOF
+$cron_tag — DO NOT EDIT this block manually, use --install-cron / --uninstall-cron
+$cron_tag: daily health check at 07:00 UTC (read-only, Telegram alert on WARNING/CRITICAL)
+0 7 * * * bash "$HEALTH_CHECK" --quiet --notify 2>&1 | logger -t vm-health
+$cron_tag: daily standard cleanup at 03:00 UTC (build cache, journal, apt, npm, .next/cache)
+0 3 * * * bash "$SCRIPT_PATH" --quiet 2>&1 | logger -t vm-cleanup
+$cron_tag: weekly standard cleanup (Sunday 02:00 UTC), same steps as the daily run
+0 2 * * 0 bash "$SCRIPT_PATH" --quiet 2>&1 | logger -t vm-cleanup
+$cron_tag: monthly full cleanup (1st of month 01:00 UTC), adds the --full steps
+0 1 1 * * bash "$SCRIPT_PATH" --full --quiet 2>&1 | logger -t vm-cleanup
+EOF
+
+  crontab "$tmp_cron"
+  rm -f "$tmp_cron"
+
+  echo -e "  ${GREEN}✓ Cron jobs installed. Current schedule:${NC}"
+  echo ""
+  crontab -l | grep -A20 "$cron_tag" || true
+  echo ""
+  echo -e "  View logs:  ${CYAN}tail -f $LOG_FILE${NC}"
+  echo -e "  View cron:  ${CYAN}crontab -l${NC}"
+  echo -e "  Remove:     ${CYAN}bash $SCRIPT_PATH --uninstall-cron${NC}"
+}
+
+# Remove every line --install-cron wrote (tag line, job lines, schedule comments); other jobs are kept.
+do_uninstall_cron() {
+  local cron_tag="# bytelyst-vm-maintenance" tmp_cron
+  tmp_cron=$(mktemp)
+  # The last four patterns match the installer's comment lines, which would otherwise be left orphaned.
+  crontab -l 2>/dev/null | grep -v -e "$cron_tag" -e 'vm-health-check\.sh' -e 'vm-cleanup\.sh' \
+    -e '^# Daily health check at 07:00 UTC' -e '^# Daily build cache prune at 03:00 UTC' \
+    -e '^# Weekly cleanup (Sunday 02:00 UTC)' -e '^# Monthly full cleanup (1st of month 01:00 UTC)' > "$tmp_cron" || true
+  crontab "$tmp_cron"; rm -f "$tmp_cron"
+  echo -e "  ${GREEN}✓ Cron jobs removed${NC}"
+}
+
+# ── Cleanup steps ─────────────────────────────────────────────────────────────
+
+# SAFE step: prune the Docker build cache; skipped with a SKIP line when `docker info` fails.
+step_docker_build_cache() {
+  log_header "Docker Build Cache"
+  if ! docker info &>/dev/null; then
+    log_step SKIP "Docker not running — skipping build cache prune"
+    return
+  fi
+  local cache_size  # ask for the Size field by name: column 3 of the default table is the TOTAL entry count
+  cache_size=$(docker system df --format '{{.Type}}={{.Size}}' 2>/dev/null | awk -F= '$1 == "Build Cache" {print $2}' || true)
+  run_cmd SAFE "Prune Docker build cache (currently ${cache_size:-?})" docker builder prune -f
+}
+
+# --full step: run `docker system prune -f` (no -a / --volumes flags are passed).
+# Prints a SKIP line instead when `docker info` fails.
+step_docker_system_prune() {
+  log_header "Docker System Prune (dangling only)"
+  local prune_cmd=(docker system prune -f)
+  if docker info &>/dev/null; then
+    run_cmd SAFE "Remove stopped containers, unused networks, dangling images" "${prune_cmd[@]}"
+  else
+    log_step SKIP "Docker not running"
+  fi
+}
+
+# Detection only: list containers whose Docker RestartCount is >= 20; nothing is modified.
+step_docker_crash_loop_check() {
+  log_header "Crash Loop Check"
+  if ! docker info &>/dev/null; then return; fi
+  # RestartCount is not a `docker ps --format` field (that template fails, so the check could never fire); read it via `docker inspect`.
+  local looping=() name restarts c
+  while IFS=$'\t' read -r name restarts; do
+    name="${name#/}"  # inspect prints container names with a leading slash
+    [[ -z "$name" || ! "$restarts" =~ ^[0-9]+$ ]] && continue
+    if (( restarts >= 20 )); then looping+=("$name(${restarts}x)"); fi
+  done < <(docker ps -aq 2>/dev/null | xargs -r docker inspect --format $'{{.Name}}\t{{.RestartCount}}' 2>/dev/null || true)
+  if (( ${#looping[@]} > 0 )); then
+    echo -e "  ${RED}${BOLD}⚠  CRASH LOOPS DETECTED — manual fix required:${NC}"
+    for c in "${looping[@]}"; do
+      echo -e "  ${RED}→${NC} $c"
+      echo -e "     ${DIM}fix: docker logs ${c%%(*)} | tail -20${NC}"
+      echo -e "     ${DIM}stop loop: docker update --restart=no ${c%%(*)}${NC}"
+    done
+    log "[WARN] crash-looping containers: ${looping[*]}"
+  else
+    log_step SAFE "No crash-looping containers"
+  fi
+}
+
+# Vacuum the systemd journal twice: first by a size cap, then by an age cutoff.
+step_journal() {
+  log_header "Journal Logs"
+  local -a by_size=(journalctl --vacuum-size=200M) by_age=(journalctl --vacuum-time=7d)
+  run_cmd SAFE "Vacuum journal to 200MB" "${by_size[@]}"
+  run_cmd SAFE "Vacuum journal older than 7 days" "${by_age[@]}"
+}
+
+# Run `apt-get clean` as a SAFE step.
+step_apt_cache() {
+  log_header "APT Cache"
+  run_cmd SAFE "Clean apt package cache" apt-get clean
+}
+
+# Force-clean the npm cache; prints only the section header when npm is not installed.
+step_npm_cache() {
+  log_header "NPM Cache"
+  # Guard clause: no npm on PATH → nothing to clean.
+  command -v npm &>/dev/null || return 0
+  run_cmd SAFE "Clean npm cache" npm cache clean --force
+}
+
+# SAFE step: delete every .next/cache directory under /opt/bytelyst (never .next/standalone, the prod build).
+step_next_cache() {
+  log_header ".next/cache Directories"
+  local count=0 cache_dir
+  while IFS= read -r cache_dir; do
+    log_step SAFE "Remove $cache_dir"
+    if ! $DRY_RUN; then rm -rf -- "$cache_dir"; fi
+    count=$(( count + 1 ))  # not (( count++ )): that returns status 1 at count=0 and set -e aborts the run
+  done < <(
+    find /opt/bytelyst -maxdepth 7 -type d -name ".next" 2>/dev/null \
+      | while IFS= read -r d; do
+          [[ -d "$d/cache" ]] && echo "$d/cache"
+        done
+  )
+  if (( count == 0 )); then log_step SKIP "No .next/cache dirs found"; fi
+}
+
+# --full step: run `pnpm store prune`, or report SKIP when pnpm is not on PATH.
+step_pnpm_store() {
+  log_header "PNPM Store"
+  if ! command -v pnpm &>/dev/null; then
+    log_step SKIP "pnpm not found"
+    return 0
+  fi
+  run_cmd SAFE "Prune unreferenced packages from pnpm store" pnpm store prune
+}
+
+# --full step: gzip uncompressed ".1" rotations that have no .gz twin, then delete
+# *.gz files under /var/log that are older than 30 days.
+step_old_logs() {
+  log_header "Old Log Files"
+  local count=0 f old_log
+  for f in /var/log/syslog.1 /var/log/kern.log.1 /var/log/ufw.log.1; do
+    if [[ -f "$f" && ! -f "${f}.gz" ]]; then
+      run_cmd SAFE "Compress $f" gzip -9 "$f"
+      count=$(( count + 1 ))  # not (( count++ )): that returns status 1 at count=0 and set -e aborts the run
+    fi
+  done
+  while IFS= read -r old_log; do
+    run_cmd CAREFUL "Remove old log: $old_log" rm -f -- "$old_log"
+  done < <(find /var/log -name "*.gz" -mtime +30 -type f 2>/dev/null || true)
+  if (( count == 0 )); then log_step SKIP "No uncompressed rotations to compress"; fi
+}
+
+# --full step: delete node_modules and .next directories under /opt/bytelyst/HOLD.
+# Only those two directory names are matched; everything else in HOLD is left in place.
+step_hold_cleanup() {
+  log_header "HOLD Archived Projects"
+  local total_freed=0 found=0 nm size next_dir
+  # -prune stops find descending into a match, so nested node_modules are neither listed nor double-counted.
+  while IFS= read -r nm; do
+    size=$(du -sm -- "$nm" 2>/dev/null | cut -f1 || true)
+    [[ "$size" =~ ^[0-9]+$ ]] || size=0  # du failed or printed nothing: count as 0 instead of breaking the sum
+    run_cmd CAREFUL "Delete archived node_modules: $nm (~${size}MB)" rm -rf -- "$nm"
+    total_freed=$(( total_freed + size ))
+    found=$(( found + 1 ))  # not (( found++ )): that returns status 1 at found=0 and set -e aborts the run
+  done < <(
+    find /opt/bytelyst/HOLD -maxdepth 4 -type d -name "node_modules" -prune -print 2>/dev/null || true
+  )
+  if (( found == 0 )); then
+    log_step SKIP "HOLD node_modules already clean"
+  else
+    log "[INFO] Freed ~${total_freed}MB from HOLD node_modules"
+  fi
+  # .next build artifacts in HOLD
+  while IFS= read -r next_dir; do
+    run_cmd CAREFUL "Delete archived .next: $next_dir" rm -rf -- "$next_dir"
+  done < <(
+    find /opt/bytelyst/HOLD -maxdepth 6 -type d -name ".next" -prune -print 2>/dev/null || true
+  )
+}
+
+# ── Main ─────────────────────────────────────────────────────────────────────
+
+# Handle special modes first; each calls require_root before touching the crontab, then exits
+if $INSTALL_CRON;   then require_root; do_install_cron;   exit 0; fi
+if $UNINSTALL_CRON; then require_root; do_uninstall_cron; exit 0; fi
+
+require_root
+
+if ! $QUIET; then
+  if $FULL_MODE; then MODE="FULL"; else MODE="STANDARD"; fi
+  if $DRY_RUN;  then DRY=" (DRY-RUN)"; else DRY=""; fi
+  echo -e "\n${BOLD}VM Cleanup — $(hostname) — ${MODE}${DRY}${NC}"
+  echo -e "${DIM}$(date -u '+%Y-%m-%d %H:%M UTC')${NC}"
+fi
+
+if $FULL_MODE; then _mode="full"; else _mode="standard"; fi
+log "[START] mode=${_mode} dry=$DRY_RUN"
+record_disk_before
+
+# ── WEEKLY (always run) ──────────────────────────────────────────────────────
+step_docker_build_cache
+step_docker_crash_loop_check
+step_journal
+step_apt_cache
+step_npm_cache
+step_next_cache
+
+# ── MONTHLY (only with --full) ───────────────────────────────────────────────
+if $FULL_MODE; then
+  step_docker_system_prune
+  step_pnpm_store
+  step_old_logs
+  step_hold_cleanup
+fi
+
+# ── Final report ─────────────────────────────────────────────────────────────
+report_disk_delta
+
+if ! $QUIET; then
+  echo -e "\n${GREEN}${BOLD}✓ Cleanup complete${NC}"
+  echo -e "  Log: ${CYAN}tail -50 $LOG_FILE${NC}"
+
+  # Interactive runs only: follow up with the sibling health check when it exists.
+  if [[ -f "$HEALTH_CHECK" ]]; then
+    echo ""
+    echo -e "${DIM}Running health check…${NC}"
+    bash "$HEALTH_CHECK" || true  # a non-zero health-check status must not fail the cleanup
+  fi
+fi
+
+log "[END] cleanup complete"
diff --git a/scripts/VMs/HostingerVM/vm-health-check.sh b/scripts/VMs/HostingerVM/vm-health-check.sh
new file mode 100755
index 0000000..3c86b8a
--- /dev/null
+++ b/scripts/VMs/HostingerVM/vm-health-check.sh
@@ -0,0 +1,337 @@
+#!/usr/bin/env bash
+# =============================================================================
+# vm-health-check.sh — Hostinger VM Health Check (READ-ONLY)
+#
+# Checks disk, memory, load, swap, and Docker container health.
+# Prints a colour-coded report. Exits non-zero if any threshold is exceeded
+# so it can drive cron alerts or CI gates.
+#
+# Usage:
+#   bash vm-health-check.sh              # interactive report
+#   bash vm-health-check.sh --quiet      # hide OK rows and section headers (exit 1/2 on problems)
+#   bash vm-health-check.sh --json       # machine-readable JSON output
+#   bash vm-health-check.sh --notify     # send Telegram alert on WARNING/CRITICAL
+#
+# Exit codes:
+#   0 — all green
+#   1 — at least one WARNING
+#   2 — at least one CRITICAL
+# =============================================================================
+set -Eeuo pipefail  # strict mode: stop on errors, unset variables and failed pipeline stages
+
+# shellcheck disable=SC2034
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+LOG_FILE="/var/log/vm-health-check.log"  # one result line per run is appended by log_to_file
+
+# ── Thresholds ──────────────────────────────────────────────────────────────
+DISK_WARN=55           # % of / used (compared with >=)
+DISK_CRIT=70
+LOAD_WARN=4.0          # absolute 1-minute load average (not per-CPU)
+LOAD_CRIT=8.0
+RAM_FREE_WARN_GB=3     # MemAvailable in whole GB; alert when below
+RAM_FREE_CRIT_GB=1
+SWAP_USED_WARN_GB=1    # swap in use, whole GB
+SWAP_USED_CRIT_GB=3
+CONTAINER_RESTART_WARN=10   # restart count of a single container
+CONTAINER_RESTART_CRIT=50
+BUILD_CACHE_WARN_GB=5       # Docker build cache, whole GB
+BUILD_CACHE_CRIT_GB=20
+# shellcheck disable=SC2034
+DOCKER_IMAGES_WARN_GB=15    # not referenced by any check in this script
+# shellcheck disable=SC2034
+DOCKER_IMAGES_CRIT_GB=25    # not referenced by any check in this script
+
+# ── Colour codes (ANSI escapes, stored as real ESC bytes via $'…') ──────────
+RED=$'\033[0;31m'
+YELLOW=$'\033[1;33m'
+GREEN=$'\033[0;32m'
+CYAN=$'\033[0;36m'
+BOLD=$'\033[1m'
+NC=$'\033[0m'
+
+# ── Flags (the strings true/false, run as commands in tests like `if $QUIET`) ─
+QUIET=false
+JSON_MODE=false
+NOTIFY=false
+
+for arg in "$@"; do  # arguments that match no case arm are silently ignored
+  case "$arg" in
+    --quiet)  QUIET=true ;;
+    --json)   JSON_MODE=true ;;
+    --notify) NOTIFY=true ;;
+  esac
+done
+
+# ── State tracking ──────────────────────────────────────────────────────────
+WORST_LEVEL=0   # 0=OK, 1=WARN, 2=CRIT
+ISSUES=()       # "[LEVEL] message" for every non-OK result, in check order
+declare -A JSON_DATA  # check name → JSON object text, filled by record
+
+# ── Helpers ─────────────────────────────────────────────────────────────────
+log_to_file() {  # best-effort: append one UTC-timestamped line ($*) to $LOG_FILE; never fails, never prints
+  { echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*" >> "$LOG_FILE"; } 2>/dev/null || true  # grouped so a failed open of the log is silenced too
+}
+
+status_icon() {  # print the coloured glyph for level $1 (OK/WARN/CRIT); any other value prints nothing
+  local glyph
+  case "$1" in
+    OK) glyph="${GREEN}✓${NC}" ;; WARN) glyph="${YELLOW}⚠${NC}" ;; CRIT) glyph="${RED}✗${NC}" ;; *) return 0 ;;
+  esac
+  echo -e "$glyph"
+}
+
+level_int() {
+  case "$1" in OK) echo 0 ;; WARN) echo 1 ;; CRIT) echo 2 ;; *) echo 0 ;; esac
+}
+
+record() {
+  # Usage: record NAME LEVEL VALUE MESSAGE
+  # Raises WORST_LEVEL, queues non-OK messages in ISSUES, stores the JSON fragment for
+  # NAME, and prints one report row unless --json (all rows) or --quiet (OK rows) hides it.
+  local name="$1" level="$2" value="$3" message="$4" rank
+  rank=$(level_int "$level")
+  (( rank <= WORST_LEVEL )) || WORST_LEVEL=$rank
+  [[ "$level" == "OK" ]] || ISSUES+=("[$level] $message")
+  JSON_DATA["$name"]="{\"level\":\"${level}\",\"value\":\"${value}\",\"message\":\"${message}\"}"
+
+  if [[ "$level" == "OK" ]] && $QUIET; then return 0; fi
+  if $JSON_MODE; then return 0; fi
+  printf "  %s  %-30s %s\n" \
+    "$(status_icon "$level")" "$message" "${CYAN}($value)${NC}"
+}
+
+header() {
+  $JSON_MODE && return
+  $QUIET && return
+  echo -e "\n${BOLD}${CYAN}── $1 ──────────────────────────────────────${NC}"
+}
+
+# ── Checks ──────────────────────────────────────────────────────────────────
+
+check_disk() {  # grade root-filesystem usage (% used) against DISK_WARN / DISK_CRIT
+  header "DISK"
+  local pct free_gb level="OK"
+  pct=$(df / --output=pcent | tail -1 | tr -d ' %')
+  free_gb=$(df / --output=avail -BG | tail -1 | tr -d ' G')
+  if (( pct >= DISK_CRIT )); then level="CRIT"; elif (( pct >= DISK_WARN )); then level="WARN"; fi
+  case "$level" in
+    CRIT) record disk CRIT "${pct}%" "Disk ${pct}% used — CRITICAL (>${DISK_CRIT}%)" ;;
+    WARN) record disk WARN "${pct}%" "Disk ${pct}% used — WARNING (>${DISK_WARN}%)" ;;
+    *)    record disk OK   "${pct}% used, ${free_gb}G free" "Disk OK (${pct}%)" ;;
+  esac
+}
+
+check_load() {
+  # Grade the 1-minute load average against the absolute LOAD_WARN / LOAD_CRIT thresholds.
+  header "LOAD"
+  local load1 load5 _load15 ncpu level="OK"
+  read -r load1 load5 _load15 _ < /proc/loadavg
+  ncpu=$(nproc)
+  # (( )) is integer-only: scale the load and both thresholds by 10 (truncating)
+  # in one awk call and compare tenths, so bc is not required.
+  local load_x10 warn_x10 crit_x10
+  read -r load_x10 warn_x10 crit_x10 < <(
+    awk -v l="$load1" -v w="$LOAD_WARN" -v c="$LOAD_CRIT" 'BEGIN { printf "%d %d %d\n", l * 10, w * 10, c * 10 }'
+  )
+  if (( load_x10 >= crit_x10 )); then level="CRIT"; elif (( load_x10 >= warn_x10 )); then level="WARN"; fi
+  case "$level" in
+    CRIT) record load CRIT "$load1" "Load avg $load1 — CRITICAL (>${LOAD_CRIT}, ${ncpu} CPUs)" ;;
+    WARN) record load WARN "$load1" "Load avg $load1 — WARNING (>${LOAD_WARN})" ;;
+    *)    record load OK   "$load1 (1m) / $load5 (5m)" "Load OK ($load1)" ;;
+  esac
+}
+
+check_memory() {  # grade MemAvailable (whole GB, rounded down) against RAM_FREE_WARN_GB / RAM_FREE_CRIT_GB
+  header "MEMORY"
+  local avail_kb mem_kb avail_gb mem_gb level="OK"
+  avail_kb=$(awk '/^MemAvailable/ {print $2}' /proc/meminfo)
+  mem_kb=$(awk '/^MemTotal/ {print $2}' /proc/meminfo)
+  avail_gb=$(( avail_kb / 1048576 ))   # kB → GB (1024 * 1024), integer division
+  mem_gb=$(( mem_kb / 1048576 ))
+  if (( avail_gb < RAM_FREE_CRIT_GB )); then level="CRIT"; elif (( avail_gb < RAM_FREE_WARN_GB )); then level="WARN"; fi
+  case "$level" in
+    CRIT) record ram CRIT "${avail_gb}G avail" "RAM available ${avail_gb}G — CRITICAL (<${RAM_FREE_CRIT_GB}G)" ;;
+    WARN) record ram WARN "${avail_gb}G avail" "RAM available ${avail_gb}G — WARNING (<${RAM_FREE_WARN_GB}G)" ;;
+    *)    record ram OK   "${avail_gb}G / ${mem_gb}G" "RAM OK (${avail_gb}G available)" ;;
+  esac
+}
+
+check_swap() {
+  # Grade swap in use (whole GB, rounded down) against SWAP_USED_WARN_GB / SWAP_USED_CRIT_GB;
+  # a host whose SwapTotal is 0 is reported as CRIT straight away.
+  header "SWAP"
+  local total_kb free_kb used_gb total_gb level="OK"
+  total_kb=$(awk '/^SwapTotal/ {print $2}' /proc/meminfo)
+  free_kb=$(awk '/^SwapFree/ {print $2}' /proc/meminfo)
+  if (( total_kb == 0 )); then
+    record swap CRIT "no swap" "NO SWAP configured — CRITICAL (add swapfile!)"
+    return
+  fi
+  used_gb=$(( (total_kb - free_kb) / 1048576 ))   # used = total − free; kB → GB
+  total_gb=$(( total_kb / 1048576 ))
+  if (( used_gb >= SWAP_USED_CRIT_GB )); then level="CRIT"; elif (( used_gb >= SWAP_USED_WARN_GB )); then level="WARN"; fi
+  case "$level" in
+    CRIT) record swap CRIT "${used_gb}G used" "Swap ${used_gb}G used — CRITICAL" ;;
+    WARN) record swap WARN "${used_gb}G used" "Swap ${used_gb}G used — WARNING" ;;
+    *)    record swap OK   "${used_gb}G / ${total_gb}G" "Swap OK (${used_gb}G used)" ;;
+  esac
+}
+
+check_docker_containers() {
+  # Flag containers with a high restart count (crash loops) and containers failing their healthcheck.
+  header "DOCKER CONTAINERS"
+  if ! command -v docker &>/dev/null || ! docker info &>/dev/null; then
+    record docker_daemon WARN "not running" "Docker daemon is not running"
+    return
+  fi
+
+  # `docker ps --format` has no RestartCount placeholder, so read it via `docker inspect` (names there start with "/").
+  local name restarts looping_warn=() looping_crit=()
+  while IFS=$'\t' read -r name restarts; do
+    name="${name#/}"; restarts="${restarts:-0}"
+    [[ -z "$name" ]] && continue
+    if   (( restarts >= CONTAINER_RESTART_CRIT )); then looping_crit+=("$name(${restarts}x)")
+    elif (( restarts >= CONTAINER_RESTART_WARN )); then looping_warn+=("$name(${restarts}x)")
+    fi
+  done < <(docker ps -aq 2>/dev/null | xargs -r docker inspect --format $'{{.Name}}\t{{.RestartCount}}' 2>/dev/null || true)
+
+  if   (( ${#looping_crit[@]} > 0 )); then
+    record container_loops CRIT "${looping_crit[*]}" "Crash-looping containers: ${looping_crit[*]}"
+  elif (( ${#looping_warn[@]} > 0 )); then
+    record container_loops WARN "${looping_warn[*]}" "Containers restarting: ${looping_warn[*]}"
+  else
+    record container_loops OK "0 looping" "No containers crash-looping"
+  fi
+
+  # Unhealthy containers (running but health check failing)
+  local unhealthy
+  unhealthy=$(docker ps --filter health=unhealthy --format '{{.Names}}' 2>/dev/null | paste -sd, || true)
+  if [[ -n "$unhealthy" ]]; then
+    record container_health WARN "$unhealthy" "Unhealthy containers: $unhealthy"
+  else
+    record container_health OK "all healthy" "All containers passing healthchecks"
+  fi
+}
+
+check_docker_disk() {
+  # Grade the Docker build-cache size against BUILD_CACHE_WARN_GB / BUILD_CACHE_CRIT_GB and
+  # report the total image size (informational only). Does nothing if Docker is unusable.
+  header "DOCKER DISK"
+  if ! command -v docker &>/dev/null || ! docker info &>/dev/null; then
+    return 0
+  fi
+
+  # `docker system df --format` offers per-row .Type/.Size (there is no .BuildCache
+  # placeholder), so fetch the rows once and pick out the two sizes by type name.
+  local df_rows cache_size images_size cache_int
+  df_rows=$(docker system df --format $'{{.Type}}\t{{.Size}}' 2>/dev/null || true)
+  cache_size=$(awk -F'\t' '$1 == "Build Cache" {s = $2} END {print (s == "" ? "0B" : s)}' <<<"$df_rows")
+  images_size=$(awk -F'\t' '$1 == "Images" {s = $2} END {print (s == "" ? "?" : s)}' <<<"$df_rows")
+
+  # Thresholds are whole GB: truncate GB values, scale TB up, count B/kB/MB as 0.
+  cache_int=$(awk '{ printf "%d", (($0 ~ /TB$/) ? $0 * 1000 : (($0 ~ /GB$/) ? $0 + 0 : 0)) }' <<<"$cache_size")
+  if   (( cache_int >= BUILD_CACHE_CRIT_GB )); then record build_cache CRIT "$cache_size" "Docker build cache ${cache_size} — CRITICAL (>${BUILD_CACHE_CRIT_GB}GB, run: docker builder prune -f)"
+  elif (( cache_int >= BUILD_CACHE_WARN_GB )); then record build_cache WARN "$cache_size" "Docker build cache ${cache_size} — WARNING (run: docker builder prune -f)"
+  else                                              record build_cache OK   "$cache_size" "Build cache OK (${cache_size})"
+  fi
+
+  record docker_images OK "$images_size" "Docker images: $images_size"
+}
+
+check_logs() {
+  # Warn when the systemd journal exceeds 300 MB or /var/log/syslog exceeds 100 MB.
+  header "LOGS"
+  local journal_mb syslog_mb=0
+  # journalctl reports a size with a unit suffix (e.g. "312.0M", "1.2G"); convert it to whole MB (B/K count as 0).
+  journal_mb=$(journalctl --disk-usage 2>/dev/null | awk 'match($0, /[0-9.]+ ?[BKMGT]/) {
+      n = substr($0, RSTART, RLENGTH - 1); u = substr($0, RSTART + RLENGTH - 1, 1)
+      printf "%d", n * (u == "T" ? 1048576 : (u == "G" ? 1024 : (u == "M" ? 1 : 0))); exit }' || true)
+  journal_mb="${journal_mb:-0}"
+  [[ -f /var/log/syslog ]] && syslog_mb=$(du -sm /var/log/syslog 2>/dev/null | cut -f1 || echo "0")
+  if   (( journal_mb > 300 )); then record journal WARN "${journal_mb}MB" "Journal ${journal_mb}MB — WARNING (run: journalctl --vacuum-size=200M)"
+  else                               record journal OK   "${journal_mb}MB" "Journal OK (${journal_mb}MB)"
+  fi
+  if   (( syslog_mb > 100 )); then record syslog WARN "${syslog_mb}MB" "syslog ${syslog_mb}MB — WARNING"
+  else                              record syslog OK   "${syslog_mb}MB" "syslog OK (${syslog_mb}MB)"
+  fi
+}
+
+# ── Run all checks ───────────────────────────────────────────────────────────
+
+if ! $JSON_MODE && ! $QUIET; then
+  echo -e "\n${BOLD}VM Health Check — $(hostname) — $(date -u '+%Y-%m-%d %H:%M UTC')${NC}"
+fi
+
+check_disk
+check_load
+check_memory
+check_swap
+check_docker_containers
+check_docker_disk
+check_logs
+
+# ── Summary ──────────────────────────────────────────────────────────────────
+# --json emits a single object; otherwise a coloured recap, which --quiet suppresses when all is green.
+if $JSON_MODE; then
+  echo '{'
+  echo "  \"timestamp\": \"$(date -u '+%Y-%m-%dT%H:%M:%SZ')\","
+  echo "  \"hostname\": \"$(hostname)\","
+  if   (( WORST_LEVEL >= 2 )); then _overall='"CRIT"'
+  elif (( WORST_LEVEL == 1 )); then _overall='"WARN"'
+  else                               _overall='"OK"'
+  fi
+  echo "  \"overall\": ${_overall},"
+  echo "  \"checks\": {"
+  local_keys=("${!JSON_DATA[@]}")   # snapshot of the check names; associative-array order is unspecified
+  for i in "${!local_keys[@]}"; do
+    k="${local_keys[$i]}"
+    if [[ $i -lt $(( ${#local_keys[@]} - 1 )) ]]; then comma=","; else comma=""; fi   # no comma after the last entry
+    echo "    \"$k\": ${JSON_DATA[$k]}$comma"
+  done
+  echo "  }"
+  echo '}'
+elif ! $QUIET || (( WORST_LEVEL > 0 )); then   # --quiet with an all-OK result prints nothing at all
+  echo ""
+  if (( WORST_LEVEL == 0 )); then
+    echo -e "  ${GREEN}${BOLD}✓ All checks passed${NC}"
+  elif (( WORST_LEVEL == 1 )); then
+    echo -e "  ${YELLOW}${BOLD}⚠ ${#ISSUES[@]} warning(s):${NC}"
+    for issue in "${ISSUES[@]}"; do echo -e "    ${YELLOW}→${NC} $issue"; done
+  else
+    echo -e "  ${RED}${BOLD}✗ ${#ISSUES[@]} issue(s) — action required:${NC}"
+    for issue in "${ISSUES[@]}"; do echo -e "    ${RED}→${NC} $issue"; done
+  fi
+  echo ""
+fi
+
+# ── Telegram notification ─────────────────────────────────────────────────
+# Sent only with --notify and a non-OK result; silently skipped when the token or chat id
+# cannot be read from $TOKEN_FILE. A failed delivery never changes the exit code.
+if $NOTIFY && (( WORST_LEVEL > 0 )); then
+  TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"
+  TELEGRAM_TOKEN=""
+  TELEGRAM_CHAT_ID=""
+  if [[ -f "$TOKEN_FILE" ]]; then
+    # Anchor at the start of the line and stop at the first hit, so commented-out,
+    # prefixed (FOO_TELEGRAM_…) or duplicate entries cannot end up in the value.
+    TELEGRAM_TOKEN=$(grep -m1 -oP '^(export\s+)?TELEGRAM_BOT_TOKEN=\K\S+' "$TOKEN_FILE" || true)
+    TELEGRAM_CHAT_ID=$(grep -m1 -oP '^(export\s+)?TELEGRAM_CHAT_ID=\K\S+' "$TOKEN_FILE" || true)
+  fi
+  if [[ -n "$TELEGRAM_TOKEN" && -n "$TELEGRAM_CHAT_ID" ]]; then
+    if (( WORST_LEVEL >= 2 )); then SEVERITY="🚨 CRITICAL"; else SEVERITY="⚠️ WARNING"; fi
+    MSG=$(printf '%s — %s VM health check\n%s\n\n' "$SEVERITY" "$(hostname)" "$(date -u '+%Y-%m-%d %H:%M UTC')"; printf '%s\n' "${ISSUES[@]}")
+    # --data-urlencode: the text holds %, newlines and possibly & or +, which plain -d would send unencoded.
+    curl -sf -X POST "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
+      --data-urlencode chat_id="$TELEGRAM_CHAT_ID" \
+      --data-urlencode text="$MSG" > /dev/null || true
+  fi
+fi
+
+# ── Log result ───────────────────────────────────────────────────────────────
+# Append a one-line summary to the log, then exit with the worst level seen (0/1/2).
+case "$WORST_LEVEL" in
+  0) RESULT_STR="OK" ;; 1) RESULT_STR="WARN" ;; *) RESULT_STR="CRIT" ;;
+esac
+log_to_file "health-check result=$RESULT_STR issues=${#ISSUES[@]}"
+
+exit "$WORST_LEVEL"