diff --git a/dashboard/web/src/app/vm/page.tsx b/dashboard/web/src/app/vm/page.tsx
index 3d7f790..94eb80d 100644
--- a/dashboard/web/src/app/vm/page.tsx
+++ b/dashboard/web/src/app/vm/page.tsx
@@ -1085,14 +1085,17 @@ const CHECK_META: Record = {
   syslog: { label: 'Syslog', icon: ScrollText },
   failed_units: { label: 'Systemd Units', icon: Activity },
   cron_missing_paths: { label: 'Cron Paths', icon: Clock },
+  gpu: { label: 'GPU', icon: Zap },
+  image_freshness: { label: 'Image Freshness', icon: Layers },
 };
 
 
 const CHECK_ORDER = [
   'disk', 'load', 'steal', 'ram', 'swap',
   'container_loops', 'container_health', 'docker_daemon',
-  'build_cache', 'docker_images',
+  'build_cache', 'docker_images', 'image_freshness',
   'journal', 'syslog', 'failed_units', 'cron_missing_paths',
+  'gpu',
 ];
 
 // ── Main page ──────────────────────────────────────────────────────────────
diff --git a/scripts/VMs/HostingerVM/chaos-validation.sh b/scripts/VMs/HostingerVM/chaos-validation.sh
new file mode 100755
index 0000000..ff4eee9
--- /dev/null
+++ b/scripts/VMs/HostingerVM/chaos-validation.sh
@@ -0,0 +1,138 @@
+#!/usr/bin/env bash
+# =============================================================================
+# chaos-validation.sh — Verify docker-health-watchdog actually heals containers
+#
+# Once a month, intentionally break a non-critical test container and confirm
+# the watchdog restarts it within the expected window. Reports result to
+# Telegram regardless of outcome (silence = unknown = bad).
+#
+# Test target: chronomind-web (Next.js, idempotent, no side effects on restart)
+# Method: SIGKILL the container's PID 1 from the host → healthcheck fails →
+#         watchdog detects after 3 consecutive failures (~30 min worst case) →
+#         docker restart.
+#
+# Wait window: WATCHDOG_TIMER_FREQ × WATCHDOG_FAILURE_THRESHOLD + buffer = 35 min.
+#
+# Environment: CHAOS_TARGET (container name), CHAOS_WAIT_SECS (recovery window),
+#              HERMES_HOME (directory whose .env holds the Telegram credentials).
+# Exit codes: 0 = recovered, 1 = aborted before the wait started, 2 = not recovered.
+# =============================================================================
+set -Eeuo pipefail
+
+TARGET="${CHAOS_TARGET:-chronomind-web}"
+WAIT_SECS="${CHAOS_WAIT_SECS:-2100}"   # 35 min
+POLL_SECS=30
+
+LOG_FILE="/var/log/chaos-validation.log"
+TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"
+
+# Write a UTC-timestamped line to stderr and append it to LOG_FILE.
+log() { echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*" | tee -a "$LOG_FILE" >&2; }
+
+# Send $1 to Telegram with the credentials found in TOKEN_FILE. Best-effort:
+# missing credentials or a failed send are logged and never abort the script.
+notify() {
+  local msg="$1"
+  local tg_token tg_chat
+  tg_token=""
+  tg_chat=""
+  if [[ -f "$TOKEN_FILE" ]]; then
+    tg_token=$(grep -oP '(?<=TELEGRAM_BOT_TOKEN=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
+    tg_chat=$(grep -oP '(?<=TELEGRAM_CHAT_ID=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
+  fi
+  if [[ -z "$tg_token" || -z "$tg_chat" ]]; then
+    log "No Telegram credentials — notification NOT sent"
+    return 0
+  fi
+  # --data-urlencode: the text is multi-line free text that a plain -d would
+  # post unencoded. --max-time: never let a dead network hang the unit.
+  curl -sf --max-time 20 -X POST "https://api.telegram.org/bot${tg_token}/sendMessage" \
+    -d chat_id="$tg_chat" --data-urlencode "text=$msg" > /dev/null 2>&1 || log "Telegram send failed"
+}
+
+# Succeeds when container $1 exists and Docker reports it running. Asking
+# `docker container inspect` avoids `docker ps | grep -q`, which under `pipefail`
+# can report a false "not running" if grep exits before docker finishes writing.
+is_running() {
+  [[ "$(docker container inspect --format '{{.State.Running}}' "$1" 2>/dev/null || true)" == "true" ]]
+}
+
+# ── Pre-flight ───────────────────────────────────────────────────────────────
+if ! is_running "$TARGET"; then
+  log "ERROR: target container '${TARGET}' not running — aborting"
+  notify "🧪 Chaos test ABORTED — target ${TARGET} not running"
+  exit 1
+fi
+
+ORIG_HEALTH=$(docker inspect --format '{{.State.Health.Status}}' "$TARGET" 2>/dev/null || echo "none")
+if [[ "$ORIG_HEALTH" != "healthy" ]]; then
+  log "ERROR: target ${TARGET} not healthy (${ORIG_HEALTH}) — refusing to chaos-test"
+  notify "🧪 Chaos test ABORTED — target ${TARGET} not healthy: ${ORIG_HEALTH}"
+  exit 1
+fi
+
+ORIG_RESTART_COUNT=$(docker inspect --format '{{.RestartCount}}' "$TARGET" 2>/dev/null || echo "0")
+ORIG_STARTED_AT=$(docker inspect --format '{{.State.StartedAt}}' "$TARGET" 2>/dev/null || echo "")
+
+log "Chaos test starting — target=${TARGET} restart_count=${ORIG_RESTART_COUNT} started=${ORIG_STARTED_AT}"
+
+# ── Inject failure ───────────────────────────────────────────────────────────
+# SIGKILL the container's init process by its host PID. The signal has to come
+# from the host: the kernel discards a SIGKILL sent to PID 1 by a process inside
+# the same PID namespace, so `docker exec … kill -9 1` breaks nothing.
+# Docker auto-restart from compose (`restart: unless-stopped`) may bring the
+# container back first; if it doesn't, watchdog should.
+log "Injecting failure: killing PID 1 of ${TARGET} from the host"
+TARGET_PID=$(docker inspect --format '{{.State.Pid}}' "$TARGET" 2>/dev/null || echo "")
+if [[ ! "$TARGET_PID" =~ ^[1-9][0-9]*$ ]] || ! kill -9 "$TARGET_PID" 2>/dev/null; then
+  log "ERROR: could not kill PID '${TARGET_PID}' of ${TARGET} — aborting"
+  notify "🧪 Chaos test ABORTED — failure injection into ${TARGET} failed"
+  exit 1
+fi
+
+CHAOS_START=$(date +%s)
+
+# ── Wait for recovery ────────────────────────────────────────────────────────
+RECOVERED=0
+RECOVERY_SECS=0
+while (( $(date +%s) - CHAOS_START < WAIT_SECS )); do
+  sleep "$POLL_SECS"
+
+  if ! is_running "$TARGET"; then
+    log "Container ${TARGET} not running yet"
+    continue
+  fi
+
+  current_started=$(docker inspect --format '{{.State.StartedAt}}' "$TARGET" 2>/dev/null || echo "")
+  current_health=$(docker inspect --format '{{.State.Health.Status}}' "$TARGET" 2>/dev/null || echo "none")
+
+  # Recovery = container restarted (new StartedAt) AND now healthy
+  if [[ "$current_started" != "$ORIG_STARTED_AT" && "$current_health" == "healthy" ]]; then
+    RECOVERED=1
+    RECOVERY_SECS=$(( $(date +%s) - CHAOS_START ))
+    log "RECOVERED in ${RECOVERY_SECS}s (new start=${current_started}, health=${current_health})"
+    break
+  fi
+  log "Still recovering: started=${current_started} health=${current_health}"
+done
+
+# ── Report ───────────────────────────────────────────────────────────────────
+if (( RECOVERED == 1 )); then
+  msg="✅ Chaos validation PASS
+Target: ${TARGET}
+Recovered in: ${RECOVERY_SECS}s (window ${WAIT_SECS}s)
+Watchdog working as designed."
+  log "PASS"
+  notify "$msg"
+  exit 0
+else
+  current_state=$(docker inspect --format '{{.State.Status}}/{{.State.Health.Status}}' "$TARGET" 2>/dev/null || echo "missing")
+  msg="🚨 Chaos validation FAIL
+Target: ${TARGET}
+State after ${WAIT_SECS}s: ${current_state}
+docker-health-watchdog did NOT restore the container.
+Investigate: journalctl -u docker-health-watchdog.service --since '1 hour ago'"
+  log "FAIL: state=${current_state}"
+  notify "$msg"
+  exit 2
+fi
diff --git a/scripts/VMs/HostingerVM/vm-health-check.sh b/scripts/VMs/HostingerVM/vm-health-check.sh
index 6acfe10..7f0cc03 100755
--- a/scripts/VMs/HostingerVM/vm-health-check.sh
+++ b/scripts/VMs/HostingerVM/vm-health-check.sh
@@ -347,6 +347,74 @@ check_automation_drift() {
   fi
 }
 
+check_gpu() {
+  # Record GPU availability as the "gpu" check.
+  # Surface GPU readiness so the Ollama panel can warn "GPU: none — slow inference".
+  # OK with the device name, WARN when nvidia-smi is installed but reports no
+  # device, OK "CPU-only" when nvidia-smi is absent.
+  header "GPU"
+  if command -v nvidia-smi >/dev/null 2>&1; then
+    local gpu_name
+    # `|| true`: nvidia-smi exits non-zero when it cannot reach the driver. That
+    # case belongs in the WARN branch; without the guard it would abort the whole
+    # script if errexit/pipefail are enabled (the script's options are not
+    # visible in this hunk).
+    gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1 | tr -d '\n' || true)
+    if [[ -n "$gpu_name" ]]; then
+      record gpu OK "$gpu_name" "GPU available ($gpu_name)"
+    else
+      record gpu WARN "nvidia-smi present but no GPU" "GPU driver loaded but no device"
+    fi
+  else
+    # No GPU → not a failure, but inference will be CPU-bound and slow
+    record gpu OK "CPU-only" "No GPU — Ollama inference will be CPU-bound"
+  fi
+}
+
+check_image_freshness() {
+  # Flag running containers whose image was created more than IMAGE_STALE_DAYS
+  # days ago (default 30; anything but a positive integer falls back to 30).
+  # Helps catch deploy-pipeline drift and forgotten services.
+  # Records WARN with a "name(Nd)" list of stale containers, otherwise OK.
+  header "IMAGE FRESHNESS"
+  local stale_days="${IMAGE_STALE_DAYS:-30}"
+  [[ "$stale_days" =~ ^[1-9][0-9]*$ ]] || stale_days=30
+  local now stale_cutoff
+  now=$(date +%s)
+  stale_cutoff=$(( now - stale_days * 86400 ))
+
+  local stale_list=()
+  local cid name image image_id created created_ts age_days
+  while IFS= read -r cid; do
+    [[ -z "$cid" ]] && continue
+    # One inspect call returns name, image reference and image ID. A container
+    # that disappeared since `docker ps` yields no output and is skipped.
+    read -r name image image_id \
+      < <(docker inspect --format '{{.Name}} {{.Config.Image}} {{.Image}}' "$cid" 2>/dev/null) \
+      || continue
+    name="${name#/}"
+    # Skip known third-party images (no rebuild cadence under our control)
+    case "$image" in
+      gitea/*|grafana/*|prom/*|mcr.microsoft.com/*|axllent/*|caddy:*|traefik:*|valkey/*|gcr.io/cadvisor/*) continue ;;
+    esac
+    created=$(docker image inspect --format '{{.Created}}' "$image_id" 2>/dev/null || true)
+    [[ -z "$created" ]] && continue
+    # ISO 8601 → epoch via GNU date; an unparseable timestamp counts as "now",
+    # i.e. the image is treated as fresh.
+    created_ts=$(date -d "$created" +%s 2>/dev/null || echo "$now")
+    if (( created_ts < stale_cutoff )); then
+      age_days=$(( (now - created_ts) / 86400 ))
+      stale_list+=("$name(${age_days}d)")
+    fi
+  done < <(docker ps --format '{{.ID}}' 2>/dev/null)
+
+  if (( ${#stale_list[@]} > 0 )); then
+    record image_freshness WARN "${stale_list[*]}" "${#stale_list[@]} container(s) on image >${stale_days}d old: ${stale_list[*]}"
+  else
+    record image_freshness OK "all fresh" "All container images rebuilt within ${stale_days}d"
+  fi
+}
+
 
 # ── Run all checks ───────────────────────────────────────────────────────────
 if ! $JSON_MODE && ! $QUIET; then
@@ -362,6 +430,8 @@ check_docker_containers
 check_docker_disk
 check_logs
 check_automation_drift
+check_gpu
+check_image_freshness
 
 # ── Summary ──────────────────────────────────────────────────────────────────
 
diff --git a/scripts/VMs/HostingerVM/vm-io-anomaly-check.sh b/scripts/VMs/HostingerVM/vm-io-anomaly-check.sh
new file mode 100755
index 0000000..e318658
--- /dev/null
+++ b/scripts/VMs/HostingerVM/vm-io-anomaly-check.sh
@@ -0,0 +1,154 @@
+#!/usr/bin/env bash
+# =============================================================================
+# vm-io-anomaly-check.sh — Sustained disk-write I/O anomaly alert
+#
+# Queries Prometheus for the average sda write rate over the past 6 hours.
+# Alerts via Telegram if the rate exceeds the WARN threshold and lists the
+# top-3 containers by CPU (`docker stats`) as a rough proxy for the writers —
+# per-container blkio metrics are not available from cAdvisor in this setup.
+#
+# Phase 0.3 identified invttrdg-backend + trading-backend as the steady-state
+# write source (~6 GB/day). This script catches new spikes above that baseline.
+#
+# Runs every 6 hours via systemd timer.
+#
+# Environment: IO_WARN_GB_PER_HR / IO_CRIT_GB_PER_HR (thresholds, GB/hr),
+#              HERMES_HOME (directory whose .env holds the Telegram credentials).
+# Skipped runs (backend container down, Prometheus unreachable) exit 0.
+# =============================================================================
+set -Eeuo pipefail
+
+BACKEND_CONTAINER="devops-backend"
+PROM="http://learning_ai_common_plat-prometheus-1:9090"
+
+WARN_GB_PER_HR="${IO_WARN_GB_PER_HR:-1.0}"   # baseline ~0.3 GB/hr; alert if 3x sustained
+CRIT_GB_PER_HR="${IO_CRIT_GB_PER_HR:-2.5}"
+
+LOG_FILE="/var/log/vm-io-anomaly.log"
+STATE_FILE="/var/log/vm-io-anomaly-state"
+TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"
+
+# Write a UTC-timestamped line to stderr and append it to LOG_FILE.
+log() { echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*" | tee -a "$LOG_FILE" >&2; }
+
+# Run the instant PromQL query $1 through curl inside BACKEND_CONTAINER and print
+# the first result's value rounded to 3 decimals. Prints exactly one "?" when the
+# query cannot be encoded, the request fails, or the response has no usable result.
+prom_query() {
+  local query="$1" encoded body
+  encoded=$(python3 -c "import urllib.parse,sys; print(urllib.parse.quote(sys.argv[1]))" "$query" 2>/dev/null) \
+    || { echo "?"; return 0; }
+  # Fetch first, parse second. Piping curl straight into the parser under
+  # `pipefail` made a failed request print "?" twice (once from the parser's
+  # fallback, once from `|| echo`), which callers comparing against "?" missed.
+  body=$(docker exec "$BACKEND_CONTAINER" \
+    curl -sf --max-time 10 "${PROM}/api/v1/query?query=${encoded}" 2>/dev/null) \
+    || { echo "?"; return 0; }
+  python3 -c "
+import json,sys
+try:
+    d=json.load(sys.stdin)
+    r=d['data']['result']
+    print(round(float(r[0]['value'][1]),3) if r else '?')
+except Exception:
+    print('?')
+" <<< "$body" 2>/dev/null || echo "?"
+}
+
+# ── Pre-flight ───────────────────────────────────────────────────────────────
+# `docker container inspect` instead of `docker ps | grep -q`: under `pipefail`
+# an early-exiting grep can make a running container look absent.
+if [[ "$(docker container inspect --format '{{.State.Running}}' "$BACKEND_CONTAINER" 2>/dev/null || true)" != "true" ]]; then
+  log "ERROR: ${BACKEND_CONTAINER} not running — skipping"
+  exit 0
+fi
+
+# 6-hour avg sda write rate in GB/hr
+AVG_6H=$(prom_query 'avg_over_time((rate(node_disk_written_bytes_total{device="sda"}[5m]) * 3600 / 1073741824)[6h:5m])')
+# Accept only a plain non-negative decimal: "?", "nan" or anything else awk
+# would coerce to 0 must not be reported as an OK reading.
+if [[ ! "$AVG_6H" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+  log "Could not query Prometheus — skipping"
+  exit 0
+fi
+
+# Numeric comparison via awk (handles fractions)
+LEVEL=$(awk -v v="$AVG_6H" -v w="$WARN_GB_PER_HR" -v c="$CRIT_GB_PER_HR" \
+  'BEGIN{ if (v+0 >= c+0) print "CRIT"; else if (v+0 >= w+0) print "WARN"; else print "OK" }')
+
+log "6h avg write rate = ${AVG_6H} GB/hr → ${LEVEL}"
+
+# ── Compute daily projection ─────────────────────────────────────────────────
+PROJ_DAY=$(awk -v v="$AVG_6H" 'BEGIN{ printf "%.1f", v*24 }')
+
+# ── Identify top writers (cAdvisor doesn't expose per-container blkio in this
+#    setup, but we can at least show top RAM/CPU consumers as a proxy) ────────
+TOP_PROCS=""
+if [[ "$LEVEL" != "OK" ]]; then
+  # `|| true`: this is context only — a failing `docker stats` must not abort
+  # the script (errexit + pipefail) before the alert itself is sent.
+  TOP_PROCS=$(docker stats --no-stream --format '{{.Name}} {{.CPUPerc}} {{.MemUsage}}' 2>/dev/null \
+    | sort -k2 -rh | head -3 \
+    | awk '{printf "  %s — %s CPU, %s\n", $1, $2, $3}' || true)
+fi
+
+# ── Deduplicate: only alert once per LEVEL transition ────────────────────────
+PREV_LEVEL=""
+if [[ -f "$STATE_FILE" ]]; then
+  PREV_LEVEL=$(tr -d '[:space:]' < "$STATE_FILE" 2>/dev/null || echo "")
+fi
+
+# Persist LEVEL as the last handled level. Called only after the transition has
+# been dealt with (alert delivered, written to the log, or not needed), so a
+# failed Telegram send is retried on the next run instead of being lost.
+save_level() { echo "$LEVEL" > "$STATE_FILE"; }
+
+if [[ "$LEVEL" == "$PREV_LEVEL" ]]; then
+  log "Level unchanged (${LEVEL}); no alert"
+  exit 0
+fi
+
+# Only notify on transitions INTO a non-OK level, or recovery to OK
+if [[ "$LEVEL" == "OK" && "$PREV_LEVEL" != "" && "$PREV_LEVEL" != "OK" ]]; then
+  MSG="✅ I/O anomaly cleared — $(hostname)
+sda 6h avg now ${AVG_6H} GB/hr (was ${PREV_LEVEL})"
+elif [[ "$LEVEL" != "OK" ]]; then
+  ICON=$([[ "$LEVEL" == "CRIT" ]] && echo "🚨" || echo "⚠️")
+  MSG="${ICON} I/O anomaly ${LEVEL} — $(hostname)
+sda 6h avg = ${AVG_6H} GB/hr (~${PROJ_DAY} GB/day)
+Threshold: WARN ${WARN_GB_PER_HR} / CRIT ${CRIT_GB_PER_HR}
+
+Top containers (CPU proxy — cAdvisor blkio not available):
+${TOP_PROCS:-  (none)}
+
+Phase 0.3 baseline: invttrdg-backend (~5 GB/day) + trading-backend (~1 GB/day).
+Investigate further: docker stats; iotop -ao -n 5"
+else
+  log "No transition needing alert"
+  save_level
+  exit 0
+fi
+
+# ── Send Telegram ────────────────────────────────────────────────────────────
+TELEGRAM_TOKEN=""
+TELEGRAM_CHAT_ID=""
+if [[ -f "$TOKEN_FILE" ]]; then
+  TELEGRAM_TOKEN=$(grep -oP '(?<=TELEGRAM_BOT_TOKEN=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
+  TELEGRAM_CHAT_ID=$(grep -oP '(?<=TELEGRAM_CHAT_ID=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
+fi
+
+if [[ -n "$TELEGRAM_TOKEN" && -n "$TELEGRAM_CHAT_ID" ]]; then
+  # --data-urlencode: MSG is multi-line and contains "+", which a plain -d
+  # would post unencoded (form decoding reads a bare "+" as a space).
+  if curl -sf --max-time 20 -X POST "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
+      -d chat_id="$TELEGRAM_CHAT_ID" --data-urlencode "text=$MSG" > /dev/null 2>&1; then
+    log "Telegram alert sent (${LEVEL})"
+    save_level
+  else
+    log "ERROR: Telegram send failed — state not updated, will retry next run"
+  fi
+else
+  log "No Telegram credentials — alert NOT sent"
+  echo "$MSG" >> "$LOG_FILE"
+  save_level
+fi
diff --git a/systemd/chaos-validation.service b/systemd/chaos-validation.service
new file mode 100644
index 0000000..9385ec4
--- /dev/null
+++ b/systemd/chaos-validation.service
@@ -0,0 +1,12 @@
+[Unit]
+Description=Monthly chaos validation — break a container, verify watchdog restores it
+After=docker.service docker-health-watchdog.timer
+Requires=docker.service
+
+[Service]
+Type=oneshot
+User=root
+Group=root
+Environment="HERMES_HOME=/root/.hermes"
+ExecStart=/usr/local/bin/chaos-validation.sh
+TimeoutStartSec=2700
diff --git a/systemd/chaos-validation.timer b/systemd/chaos-validation.timer
new file mode 100644
index 0000000..9ec264a
--- /dev/null
+++ b/systemd/chaos-validation.timer
@@ -0,0 +1,11 @@
+[Unit]
+Description=Run chaos validation monthly (1st at 10:00 UTC, after weekly digest)
+After=docker.service
+
+[Timer]
+OnCalendar=*-*-01 10:00 UTC
+AccuracySec=1h
+Persistent=true
+
+[Install]
+WantedBy=timers.target
diff --git a/systemd/vm-io-anomaly-check.service b/systemd/vm-io-anomaly-check.service
new file mode 100644
index 0000000..f3d4ba0
--- /dev/null
+++ b/systemd/vm-io-anomaly-check.service
@@ -0,0 +1,13 @@
+[Unit]
+Description=Check sustained disk I/O anomaly and alert via Telegram
+# After= only orders; Wants= is what actually pulls network-online.target in.
+After=docker.service network-online.target
+Wants=network-online.target
+Requires=docker.service
+
+[Service]
+Type=oneshot
+User=root
+Group=root
+Environment="HERMES_HOME=/root/.hermes"
+ExecStart=/usr/local/bin/vm-io-anomaly-check.sh
diff --git a/systemd/vm-io-anomaly-check.timer b/systemd/vm-io-anomaly-check.timer
new file mode 100644
index 0000000..5baae61
--- /dev/null
+++ b/systemd/vm-io-anomaly-check.timer
@@ -0,0 +1,12 @@
+[Unit]
+Description=Run sustained-I/O anomaly check every 6 hours
+After=docker.service
+
+[Timer]
+OnBootSec=15min
+OnUnitActiveSec=6h
+AccuracySec=10min
+# No Persistent=: it only affects OnCalendar= timers; OnBootSec= covers reboots.
+
+[Install]
+WantedBy=timers.target