- Switch backend runner from node:20-alpine to node:20-slim so GNU df flags (--output=pcent/avail) work inside the container - Add volume mounts to docker-compose.yml: scripts (ro), VM logs (rw), docker.sock; set VM_SCRIPTS_PATH + VM_LOG_DIR env vars - Rebuild repository.ts: env-configurable paths, cron history parser, unhealthy-container inspector, Ollama model endpoints - Add routes: GET /api/vm/cron-status, unhealthy containers, Ollama models, container restart, model unload - vm-cleanup.sh: add step_cosmos_pglog, step_docker_aged_images; fix (( count++ )) → count=$(( count + 1 )) for set -e compatibility - Add docs/VM_OBSERVABILITY_ROADMAP.md Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
436 lines
16 KiB
Bash
Executable File
436 lines
16 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# =============================================================================
|
|
# vm-cleanup.sh — Hostinger VM Safe Periodic Cleanup
|
|
#
|
|
# Designed to be run manually or via cron. Every step is printed with a
# SAFE or CAREFUL label. No step prompts for confirmation, so preview with
# --dry-run before running it unattended.
|
|
#
|
|
# Modes:
|
|
# (default) Weekly safe cleanup — build cache, apt, npm, journal, .next/cache
|
|
# --full Monthly deeper cleanup — adds: pnpm store, docker system prune,
|
|
# old log files, Docker image dangling prune
|
|
# --dry-run Print what would be done, make no changes
|
|
# --install-cron Install the recommended cron schedule for both scripts
|
|
# --uninstall-cron Remove the installed cron jobs
# --quiet Suppress most terminal output (the log file is still written)
|
|
#
|
|
# Every step is printed with a SAFE or CAREFUL label (SKIP and DRY-RUN mark
# steps that did not run) so you can audit what ran.
|
|
#
|
|
# Logs to: /var/log/vm-cleanup.log
|
|
# =============================================================================
|
|
# Strict mode: errexit + errtrace + nounset + pipefail (same as -Eeuo pipefail).
set -o errexit -o errtrace -o nounset -o pipefail

# ── Config ───────────────────────────────────────────────────────────────────
# File the logging helpers append to.
LOG_FILE=/var/log/vm-cleanup.log
# Resolved path of this script, and the directory that contains it.
SCRIPT_PATH=$(realpath "${BASH_SOURCE[0]}")
SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
# Companion health-check script, looked up next to this one.
HEALTH_CHECK="${SCRIPT_DIR}/vm-health-check.sh"
|
|
|
|
# Paths that must NEVER be deleted even in --full mode
|
|
# shellcheck disable=SC2034
|
|
# NOTE(review): no step visible in this script reads this array; it documents
# intent only — confirm before relying on it as a guard.
declare -a PROTECTED_PATHS=(
  /opt/bytelyst/learning_ai_common_plat
  /opt/bytelyst/learning_ai_devops_tools
  /usr/local/lib/hermes-agent
  /usr/share/ollama
  /swapfile
)
|
|
|
|
# node_modules dirs in active (non-HOLD) repos to never touch
|
|
# shellcheck disable=SC2034
|
|
# NOTE(review): no step visible in this script reads this array; it documents
# intent only — confirm before relying on it as a guard.
declare -a ACTIVE_NODE_MODULES=(
  /opt/bytelyst/learning_ai_common_plat/node_modules
  /opt/bytelyst/learning_ai_flowmonk/node_modules
  /opt/bytelyst/learning_ai_clock/node_modules
  /opt/bytelyst/learning_ai_notes/node_modules
  /opt/bytelyst/learning_ai_devops_tools/dashboard/node_modules
  /opt/bytelyst/learning_ai_invt_trdg/node_modules
)
|
|
|
|
# ── Colour codes ─────────────────────────────────────────────────────────────
|
|
# ANSI escape sequences used by the output helpers (\e is the same byte as \033).
RED=$'\e[0;31m'
YELLOW=$'\e[1;33m'
GREEN=$'\e[0;32m'
CYAN=$'\e[0;36m'
BOLD=$'\e[1m'
DIM=$'\e[2m'
NC=$'\e[0m'   # reset all attributes
|
|
|
|
# ── Flags ────────────────────────────────────────────────────────────────────
|
|
# Mode switches, all off by default; parse_args flips them from the CLI.
FULL_MODE=false
DRY_RUN=false
INSTALL_CRON=false
UNINSTALL_CRON=false
QUIET=false

#######################################
# Translate command-line options into the global mode switches.
# An unrecognised argument aborts the script instead of being ignored, so a
# mistyped --dry-run can never fall through to a real cleanup run.
# Globals:   FULL_MODE, DRY_RUN, INSTALL_CRON, UNINSTALL_CRON, QUIET (written)
# Arguments: the script's command-line arguments
# Outputs:   error text on stderr for an unknown argument
# Returns:   0 on success; exits with status 2 on an unknown argument
#######################################
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --full) FULL_MODE=true ;;
      --dry-run) DRY_RUN=true ;;
      --install-cron) INSTALL_CRON=true ;;
      --uninstall-cron) UNINSTALL_CRON=true ;;
      --quiet) QUIET=true ;;
      *)
        printf 'ERROR: unknown option: %s\n' "$arg" >&2
        printf 'Valid options: --full --dry-run --quiet --install-cron --uninstall-cron\n' >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
|
|
|
|
# ── Helpers ──────────────────────────────────────────────────────────────────
|
|
#######################################
# Append a UTC-timestamped message to the log file and, unless quiet, echo
# the raw message to stdout.
# Globals:   LOG_FILE (read), QUIET (read)
# Arguments: message words, joined with spaces
# Outputs:   the message on stdout when QUIET is false
# Returns:   0 even when the log file cannot be written (best effort)
#######################################
log() {
  local msg
  msg="[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*"
  # Redirections apply left to right, so the append must sit inside the
  # silenced group; otherwise an unwritable log file prints a shell error.
  { echo "$msg" >> "$LOG_FILE"; } 2>/dev/null || true
  $QUIET || echo -e "$*"
}
|
|
|
|
#######################################
# Print a section banner (unless quiet) and record the section in the log.
# Globals:   LOG_FILE (read), QUIET, BOLD, CYAN, NC (read)
# Arguments: $1 - section title
# Outputs:   the banner on stdout when QUIET is false
# Returns:   0 even when the log file cannot be written (best effort)
#######################################
log_header() {
  $QUIET || echo -e "\n${BOLD}${CYAN}── $1 ──────────────────────────────────────${NC}"
  # Silence stderr around the whole append: redirections apply left to right,
  # so a trailing 2>/dev/null would not hide a failure to open the log file.
  { echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] === $1 ===" >> "$LOG_FILE"; } 2>/dev/null || true
}
|
|
|
|
#######################################
# Print one labelled step line to stdout; nothing is written to the log file.
# Globals:   QUIET, GREEN, YELLOW, DIM, CYAN, NC (read)
# Arguments: $1 - label: SAFE | CAREFUL | SKIP | DRY (anything else prints nothing)
#            $2 - message text
# Outputs:   the coloured line on stdout when QUIET is false
#######################################
log_step() {
  local label="$1" msg="$2"
  local tag
  case "$label" in
    SAFE)    tag="${GREEN}[SAFE]${NC}" ;;
    CAREFUL) tag="${YELLOW}[CAREFUL]${NC}" ;;
    SKIP)    tag="${DIM}[SKIP]${NC}" ;;
    DRY)     tag="${CYAN}[DRY-RUN]${NC}" ;;
    *)       return 0 ;;
  esac
  if ! $QUIET; then
    echo -e " ${tag} $msg"
  fi
}
|
|
|
|
#######################################
# Announce a step and run its command, sending the command's output to the
# log file. In dry-run mode the command is only printed.
# A failing command never aborts the cleanup, but its exit status is now
# recorded instead of being discarded.
# Usage:     run_cmd LABEL "description" cmd args...
# Globals:   DRY_RUN (read), LOG_FILE (appended to)
# Arguments: $1 - label passed to log_step
#            $2 - human-readable description
#            $3... - command and its arguments
# Returns:   0 always (best effort)
#######################################
run_cmd() {
  local label="$1" desc="$2"
  shift 2
  log_step "$label" "$desc"
  if $DRY_RUN; then
    log_step DRY "would run: $*"
    return 0
  fi
  log "[CMD] $*"
  local rc=0
  "$@" >> "$LOG_FILE" 2>&1 || rc=$?
  if (( rc != 0 )); then
    log "[WARN] command exited with status $rc: $*"
  fi
  return 0
}
|
|
|
|
# Root-filesystem usage captured at start-up; read later by report_disk_delta.
disk_before=""

#######################################
# Capture the used/avail/percent columns of `df -h /` into disk_before.
# Globals: disk_before (written)
#######################################
record_disk_before() {
  local snapshot
  snapshot=$(df -h / --output=used,avail,pcent | tail -n 1 | tr -s ' ')
  disk_before=$snapshot
}
|
|
|
|
#######################################
# Show root-filesystem usage before and after the run, and log both values.
# Globals: disk_before (read), QUIET, DIM, GREEN, NC (read)
# Outputs: two lines on stdout when QUIET is false
#######################################
report_disk_delta() {
  local disk_after
  disk_after=$(df -h / --output=used,avail,pcent | tail -n 1 | tr -s ' ')
  if [[ "$QUIET" != true ]]; then
    echo -e "\n ${DIM}Before: $disk_before${NC}"
    echo -e " ${GREEN}After: $disk_after${NC}"
  fi
  log "[DISK] before=$disk_before after=$disk_after"
}
|
|
|
|
# ── Safety guard ─────────────────────────────────────────────────────────────
|
|
#######################################
# Abort the script unless it is running with UID 0.
# Globals: RED, NC (read)
# Outputs: an error message on stderr when not root
# Returns: 0 when root; exits with status 1 otherwise
#######################################
require_root() {
  local uid
  uid=$(id -u)
  if (( uid == 0 )); then
    return 0
  fi
  echo -e "${RED}ERROR: This script must be run as root (use sudo)${NC}" >&2
  exit 1
}
|
|
|
|
# ── Cron install/uninstall ───────────────────────────────────────────────────
|
|
#######################################
# Install the managed cron block into the current user's crontab.
# Safe to run repeatedly: entries from an earlier install (tagged lines and
# the health-check / cleanup job lines) are removed before the block is
# appended, so jobs are never duplicated. Every comment line in the block
# carries the tag, so it can be stripped again by matching the tag alone.
# Globals:   SCRIPT_PATH, SCRIPT_DIR, LOG_FILE (read), BOLD, GREEN, CYAN, RED, NC (read)
# Outputs:   progress and the resulting schedule on stdout
# Returns:   0 on success, 1 if crontab rejects the new table
#######################################
do_install_cron() {
  echo -e "\n${BOLD}Installing cron schedule…${NC}\n"

  local cron_tag="# bytelyst-vm-maintenance"
  local tmp_cron
  tmp_cron=$(mktemp)

  # Start from the existing crontab minus anything a previous install added.
  # The four untagged "# Daily…/Weekly…/Monthly…" patterns match comment lines
  # written by the earlier, untagged block format.
  crontab -l 2>/dev/null \
    | grep -vF \
        -e "$cron_tag" \
        -e "vm-health-check.sh" \
        -e "vm-cleanup.sh" \
        -e "# Daily health check at 07:00 UTC" \
        -e "# Daily build cache prune at 03:00 UTC" \
        -e "# Weekly cleanup (Sunday 02:00 UTC)" \
        -e "# Monthly full cleanup (1st of month 01:00 UTC)" \
    > "$tmp_cron" || true

  cat >> "$tmp_cron" <<EOF

${cron_tag} — DO NOT EDIT this block manually, use --install-cron / --uninstall-cron
${cron_tag}: Daily health check at 07:00 UTC (read-only, sends Telegram alert on WARNING/CRITICAL)
0 7 * * * bash $SCRIPT_DIR/vm-health-check.sh --quiet --notify 2>&1 | logger -t vm-health-check
${cron_tag}: Daily standard cleanup at 03:00 UTC (never removes images)
0 3 * * * bash $SCRIPT_PATH --quiet 2>&1 | logger -t vm-cleanup
${cron_tag}: Weekly cleanup (Sunday 02:00 UTC) — logs, apt, npm, .next/cache, build cache
0 2 * * 0 bash $SCRIPT_PATH --quiet 2>&1 | logger -t vm-cleanup
${cron_tag}: Monthly full cleanup (1st of month 01:00 UTC) — adds pnpm store, docker system prune
0 1 1 * * bash $SCRIPT_PATH --full --quiet 2>&1 | logger -t vm-cleanup
EOF

  if ! crontab "$tmp_cron"; then
    rm -f "$tmp_cron"
    echo -e "${RED}ERROR: crontab rejected the new schedule${NC}" >&2
    return 1
  fi
  rm -f "$tmp_cron"

  echo -e " ${GREEN}✓ Cron jobs installed. Current schedule:${NC}"
  echo ""
  crontab -l | grep -F -e "$cron_tag" -e "vm-health-check.sh" -e "vm-cleanup.sh" || true
  echo ""
  echo -e " View logs: ${CYAN}tail -f $LOG_FILE${NC}"
  echo -e " View cron: ${CYAN}crontab -l${NC}"
  echo -e " Remove: ${CYAN}bash $SCRIPT_PATH --uninstall-cron${NC}"
}
|
|
|
|
#######################################
# Remove the managed cron block from the current user's crontab.
# Strips the tagged lines, the health-check / cleanup job lines, and the four
# descriptive comment lines the installer writes between them, so nothing
# from the block is left behind.
# Globals:   GREEN, RED, NC (read)
# Outputs:   a confirmation on stdout
# Returns:   0 on success, 1 if crontab rejects the new table
#######################################
do_uninstall_cron() {
  local cron_tag="# bytelyst-vm-maintenance"
  local tmp_cron
  tmp_cron=$(mktemp)

  # One pass, fixed-string matching. `|| true` covers both "no crontab yet"
  # and "every line was filtered out" (grep exits 1 when it prints nothing).
  crontab -l 2>/dev/null \
    | grep -vF \
        -e "$cron_tag" \
        -e "vm-health-check.sh" \
        -e "vm-cleanup.sh" \
        -e "# Daily health check at 07:00 UTC" \
        -e "# Daily build cache prune at 03:00 UTC" \
        -e "# Weekly cleanup (Sunday 02:00 UTC)" \
        -e "# Monthly full cleanup (1st of month 01:00 UTC)" \
    > "$tmp_cron" || true

  if ! crontab "$tmp_cron"; then
    rm -f "$tmp_cron"
    echo -e "${RED}ERROR: crontab rejected the updated schedule${NC}" >&2
    return 1
  fi
  rm -f "$tmp_cron"
  echo -e " ${GREEN}✓ Cron jobs removed${NC}"
}
|
|
|
|
# ── Cleanup steps ─────────────────────────────────────────────────────────────
|
|
|
|
#######################################
# Truncate the CosmosDB emulator's embedded-Postgres statement log once it
# reaches 100 MB.
# The emulator logs every SQL statement to /logs/pglog.log inside its overlay
# layer (reported to grow ~275 MB/hr under heavy trading activity). Postgres
# keeps the file descriptor open, so truncating in place does not break it.
# Globals: DRY_RUN (read, via run_cmd)
# Returns: 0; skips quietly when the container or the log file is not found
#######################################
step_cosmos_pglog() {
  log_header "CosmosDB Emulator Postgres Log"
  local container="learning_ai_common_plat-cosmos-emulator-1"

  # Capture the names first: piping `docker ps` straight into `grep -q` under
  # pipefail can look like "no match" when grep exits before docker finishes.
  local running
  running=$(docker ps --format '{{.Names}}' 2>/dev/null || true)
  if ! grep -Fxq -- "$container" <<<"$running"; then
    log_step SKIP "CosmosDB emulator not running"
    return
  fi

  # Locate pglog.log on the host: first via the container's overlay UpperDir,
  # then by scanning the overlayfs rootfs directories.
  local pglog
  pglog=$(docker inspect "$container" 2>/dev/null \
    | python3 -c "
import glob, json, os, sys

d = json.load(sys.stdin)[0]
# GraphDriver/Data may be missing or null depending on the storage backend
# (presumably the case with the layout scanned below — confirm); treat both
# as 'no UpperDir' so the fallback still runs.
data = (d.get('GraphDriver') or {}).get('Data') or {}
upper = data.get('UpperDir', '')
if upper:
    p = os.path.join(upper, 'logs', 'pglog.log')
    if os.path.exists(p):
        print(p)
    sys.exit()
# Fallback: scan rootfs overlayfs dirs and take the first match.
for f in glob.glob('/var/lib/docker/rootfs/overlayfs/*/logs/pglog.log'):
    print(f)
    sys.exit()
" 2>/dev/null || true)
  if [[ -z "$pglog" || ! -f "$pglog" ]]; then
    log_step SKIP "pglog.log not found (overlay path changed?)"
    return
  fi

  local size_mb
  size_mb=$(du -sm "$pglog" 2>/dev/null | cut -f1 || echo 0)
  # Guard the arithmetic below against an empty or malformed du result.
  if [[ ! "$size_mb" =~ ^[0-9]+$ ]]; then size_mb=0; fi
  if (( size_mb < 100 )); then
    log_step SKIP "pglog.log is ${size_mb}MB — no truncation needed (<100 MB)"
    return
  fi
  run_cmd SAFE "Truncate CosmosDB pglog.log (${size_mb}MB → 0)" truncate -s 0 "$pglog"
}
|
|
|
|
#######################################
# Prune the Docker build cache, reporting its current size first.
# Returns: 0; skips when the Docker daemon is not reachable
#######################################
step_docker_build_cache() {
  log_header "Docker Build Cache"
  if ! docker info >/dev/null 2>&1; then
    log_step SKIP "Docker not running — skipping build cache prune"
    return
  fi
  # `docker system df` columns are TYPE TOTAL ACTIVE SIZE RECLAIMABLE, and the
  # "Build Cache" type is two words, so SIZE is field 5 (field 3 is the count).
  local cache_size
  cache_size=$(docker system df 2>/dev/null | awk '/^Build Cache/ {print $5}' || true)
  cache_size=${cache_size:-?}
  run_cmd SAFE "Prune Docker build cache (currently $cache_size)" \
    docker builder prune -f
}
|
|
|
|
#######################################
# Run `docker system prune -f` (without -a or --volumes).
# Per the step description this targets stopped containers, unused networks
# and dangling images; images used by an existing container are not removed.
# Returns: 0; skips when the Docker daemon is not reachable
#######################################
step_docker_system_prune() {
  log_header "Docker System Prune (dangling only)"
  docker info >/dev/null 2>&1 || {
    log_step SKIP "Docker not running"
    return
  }
  run_cmd SAFE "Remove stopped containers, unused networks, dangling images" docker system prune -f
}
|
|
|
|
#######################################
# Prune images that no container references and that were created more than
# 7 days ago (`docker image prune -a --filter until=168h`).
# The `until` filter is based on image creation time, not on when the image
# was last used. Images referenced by any existing container are kept, so
# this mainly clears old versions left behind after a deploy.
# Returns: 0; skips when the Docker daemon is not reachable
#######################################
step_docker_aged_images() {
  log_header "Docker Aged Image Prune (unused >7 days)"
  if ! docker info >/dev/null 2>&1; then
    log_step SKIP "Docker not running"
    return
  fi
  # `docker system df` columns are TYPE TOTAL ACTIVE SIZE RECLAIMABLE, so the
  # reclaimable figure for the Images row is field 5 (field 4 is total size).
  # It covers all unused images, so it is an upper bound for this step.
  local reclaimable
  reclaimable=$(docker system df 2>/dev/null | awk '/^Images/ {print $5}' || true)
  reclaimable=${reclaimable:-?}
  run_cmd SAFE "Prune unused images created >7 days ago (currently $reclaimable reclaimable)" \
    docker image prune -a -f --filter "until=168h"
}
|
|
|
|
#######################################
# Report containers whose restart count is 20 or higher. Read-only: it only
# prints suggested commands and logs a warning.
# The restart count is read with `docker inspect`, because `docker ps
# --format` offers no RestartCount placeholder.
# Globals: RED, BOLD, DIM, NC (read)
# Outputs: a warning block on stdout (not suppressed by QUIET) when loops exist
# Returns: 0; returns early when the Docker daemon is not reachable
#######################################
step_docker_crash_loop_check() {
  log_header "Crash Loop Check"
  if ! docker info >/dev/null 2>&1; then return; fi

  # Collect the IDs of all containers, running or not.
  local -a ids=() looping=()
  local id name restarts c
  while IFS= read -r id; do
    if [[ -n "$id" ]]; then ids+=("$id"); fi
  done < <(docker ps -aq 2>/dev/null || true)

  if (( ${#ids[@]} > 0 )); then
    while IFS=$'\t' read -r name restarts; do
      name="${name#/}"   # inspect reports names with a leading slash
      # Skip blank or non-numeric rows so the arithmetic test cannot fail.
      if [[ -z "$name" || ! "$restarts" =~ ^[0-9]+$ ]]; then continue; fi
      if (( restarts >= 20 )); then looping+=("$name(${restarts}x)"); fi
    done < <(docker inspect --format $'{{.Name}}\t{{.RestartCount}}' "${ids[@]}" 2>/dev/null || true)
  fi

  if (( ${#looping[@]} > 0 )); then
    echo -e " ${RED}${BOLD}⚠ CRASH LOOPS DETECTED — manual fix required:${NC}"
    for c in "${looping[@]}"; do
      echo -e " ${RED}→${NC} $c"
      # ${c%%(*} strips the "(Nx)" suffix to recover the bare container name.
      echo -e " ${DIM}fix: docker logs ${c%%(*} | tail -20${NC}"
      echo -e " ${DIM}stop loop: docker update --restart=no ${c%%(*}${NC}"
    done
    log "[WARN] crash-looping containers: ${looping[*]}"
  else
    log_step SAFE "No crash-looping containers"
  fi
}
|
|
|
|
#######################################
# Shrink the systemd journal: first cap its size, then drop old entries.
#######################################
step_journal() {
  log_header "Journal Logs"
  run_cmd SAFE "Vacuum journal to 200MB" journalctl --vacuum-size=200M
  run_cmd SAFE "Vacuum journal older than 7 days" journalctl --vacuum-time=7d
}
|
|
|
|
#######################################
# Empty the apt package cache via `apt-get clean`.
#######################################
step_apt_cache() {
  log_header "APT Cache"
  local -a clean_cmd=(apt-get clean)
  run_cmd SAFE "Clean apt package cache" "${clean_cmd[@]}"
}
|
|
|
|
#######################################
# Clear the npm cache; does nothing beyond the header when npm is absent.
#######################################
step_npm_cache() {
  log_header "NPM Cache"
  command -v npm &>/dev/null || return 0
  run_cmd SAFE "Clean npm cache" npm cache clean --force
}
|
|
|
|
#######################################
# Delete the cache/ subdirectory of every .next directory under /opt/bytelyst.
# Only <dir>/.next/cache is removed; the rest of .next (for example the
# standalone production build) is left alone.
# Globals: DRY_RUN (read)
#######################################
step_next_cache() {
  log_header ".next/cache Directories"

  # Gather the cache directories first, then act on the list.
  local -a targets=()
  local next_dir target
  while read -r next_dir; do
    if [[ -d "$next_dir/cache" ]]; then
      targets+=("$next_dir/cache")
    fi
  done < <(find /opt/bytelyst -name ".next" -maxdepth 7 -type d 2>/dev/null)

  if (( ${#targets[@]} == 0 )); then
    log_step SKIP "No .next/cache dirs found"
    return
  fi

  for target in "${targets[@]}"; do
    log_step SAFE "Remove $target"
    if ! $DRY_RUN; then rm -rf "$target"; fi
  done
}
|
|
|
|
#######################################
# Prune unreferenced packages from the pnpm store, when pnpm is installed.
#######################################
step_pnpm_store() {
  log_header "PNPM Store"
  if ! command -v pnpm &>/dev/null; then
    log_step SKIP "pnpm not found"
    return
  fi
  run_cmd SAFE "Prune unreferenced packages from pnpm store" pnpm store prune
}
|
|
|
|
#######################################
# Tidy /var/log: gzip a fixed set of uncompressed ".1" rotations, then remove
# *.gz files last modified more than 30 days ago.
#######################################
step_old_logs() {
  log_header "Old Log Files"

  # Rotations that may have been left uncompressed.
  local -a candidates=(/var/log/syslog.1 /var/log/kern.log.1 /var/log/ufw.log.1)
  local compressed=0 rotated stale
  for rotated in "${candidates[@]}"; do
    if [[ ! -f "$rotated" ]]; then continue; fi
    if [[ -f "${rotated}.gz" ]]; then continue; fi   # a compressed copy already exists
    run_cmd SAFE "Compress $rotated" gzip -9 "$rotated"
    compressed=$(( compressed + 1 ))
  done

  # Compressed rotations older than 30 days.
  while IFS= read -r stale; do
    run_cmd CAREFUL "Remove old log: $stale" rm -f "$stale"
  done < <(find /var/log -name "*.gz" -mtime +30 -type f 2>/dev/null || true)

  if (( compressed == 0 )); then
    log_step SKIP "No uncompressed rotations to compress"
  fi
}
|
|
|
|
#######################################
# Remove regenerable artifacts (node_modules and .next) from archived
# projects. The project sources are left in place.
# find prunes at each match, so a node_modules nested inside another
# node_modules is not listed, sized or deleted a second time.
# Globals:   DRY_RUN (read)
# Arguments: $1 - archive root to scan (optional, default /opt/bytelyst/HOLD)
# Returns:   0
#######################################
step_hold_cleanup() {
  local hold_dir="${1:-/opt/bytelyst/HOLD}"
  log_header "HOLD Archived Projects"

  local total_mb=0 found=0
  local nm size next_dir
  while IFS= read -r nm; do
    size=$(du -sm "$nm" 2>/dev/null | cut -f1 || echo "0")
    # Keep the running total safe if du produced nothing usable.
    if [[ ! "$size" =~ ^[0-9]+$ ]]; then size=0; fi
    run_cmd CAREFUL "Delete archived node_modules: $nm (~${size}MB)" rm -rf "$nm"
    total_mb=$(( total_mb + size ))
    found=$(( found + 1 ))
  done < <(
    find "$hold_dir" -maxdepth 4 -type d -name "node_modules" -prune -print 2>/dev/null || true
  )

  if (( found == 0 )); then
    log_step SKIP "HOLD node_modules already clean"
  elif $DRY_RUN; then
    # Nothing was deleted in dry-run mode, so do not claim space was freed.
    log "[INFO] Would free ~${total_mb}MB from HOLD node_modules"
  else
    log "[INFO] Freed ~${total_mb}MB from HOLD node_modules"
  fi

  # .next build artifacts in HOLD
  while IFS= read -r next_dir; do
    run_cmd CAREFUL "Delete archived .next: $next_dir" rm -rf "$next_dir"
  done < <(
    find "$hold_dir" -maxdepth 6 -type d -name ".next" -prune -print 2>/dev/null || true
  )
}
|
|
|
|
# ── Main ─────────────────────────────────────────────────────────────────────
|
|
|
|
# Cron management modes: each requires root, performs its one action, exits.
if $INSTALL_CRON; then
  require_root
  do_install_cron
  exit 0
fi
if $UNINSTALL_CRON; then
  require_root
  do_uninstall_cron
  exit 0
fi

require_root

# Mode name used in the log line below.
_mode="standard"
if $FULL_MODE; then _mode="full"; fi

if ! $QUIET; then
  MODE="STANDARD"
  if $FULL_MODE; then MODE="FULL"; fi
  DRY=""
  if $DRY_RUN; then DRY=" (DRY-RUN)"; fi
  echo -e "\n${BOLD}VM Cleanup — $(hostname) — ${MODE}${DRY}${NC}"
  echo -e "${DIM}$(date -u '+%Y-%m-%d %H:%M UTC')${NC}"
fi

log "[START] mode=${_mode} dry=$DRY_RUN"
record_disk_before

# ── WEEKLY (always run) ──────────────────────────────────────────────────────
for _step in \
  step_cosmos_pglog \
  step_docker_build_cache \
  step_docker_crash_loop_check \
  step_journal \
  step_apt_cache \
  step_npm_cache \
  step_next_cache; do
  "$_step"
done

# ── MONTHLY (only with --full) ───────────────────────────────────────────────
if $FULL_MODE; then
  for _step in \
    step_docker_system_prune \
    step_docker_aged_images \
    step_pnpm_store \
    step_old_logs \
    step_hold_cleanup; do
    "$_step"
  done
fi

# ── Final report ─────────────────────────────────────────────────────────────
report_disk_delta

if ! $QUIET; then
  echo -e "\n${GREEN}${BOLD}✓ Cleanup complete${NC}"
  echo -e " Log: ${CYAN}tail -50 $LOG_FILE${NC}"

  # Follow up with the companion health check when it exists; a failing
  # health check does not change this script's exit status.
  if [[ -f "$HEALTH_CHECK" ]]; then
    echo ""
    echo -e "${DIM}Running health check…${NC}"
    bash "$HEALTH_CHECK" || true
  fi
fi

log "[END] cleanup complete"
|