vm-health-check.sh:
- check_gpu(): nvidia-smi probe; "CPU-only" is OK on this VM (no GPU).
- check_image_freshness(): flag containers running images >30d old.
  Skips third-party images (gitea, grafana, prom, mcr.microsoft, axllent,
  caddy, traefik, valkey, cadvisor) — they have their own rebuild cadence.
  Currently flags 19 stale product images (~60d old).

chaos-validation.sh:
- Monthly chaos test: kill PID 1 in chronomind-web, wait up to 35 min for
  docker-health-watchdog to detect + restart. Telegram pass/fail.
- Refuses to run if the target is not healthy. The systemd timer fires on the
  1st of the month at 10:00 UTC (after the 08:00 weekly digest).

vm-io-anomaly-check.sh:
- 6h average sda write rate; transition alerts at WARN (1 GB/hr) /
  CRIT (2.5 GB/hr). De-dupes via /var/log/vm-io-anomaly-state so the alert
  fires once per transition, not every 6h. Current baseline: ~1.94 GB/hr
  (orphan-container state-file writes; see Phase 0.3).
- Reports recovery to OK when the rate drops back.

vm/page.tsx:
- gpu + image_freshness added to CHECK_META so they render with the proper
  icon/label and slot into CHECK_ORDER.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
114 lines
5.1 KiB
Bash
Executable File
114 lines
5.1 KiB
Bash
Executable File
#!/usr/bin/env bash
# =============================================================================
# chaos-validation.sh — Verify docker-health-watchdog actually heals containers
#
# Once a month, intentionally break a non-critical test container and confirm
# the watchdog restarts it within the expected window. Reports the result to
# Telegram regardless of outcome (silence = unknown = bad).
#
# Test target: chronomind-web (Next.js, idempotent, no side effects on restart)
# Method: kill PID 1 inside the container → healthcheck fails →
#         watchdog detects after 3 consecutive failures (~30 min worst case) →
#         docker restart.
#
# Wait window: WATCHDOG_TIMER_FREQ × WATCHDOG_FAILURE_THRESHOLD + buffer = 35 min.
#
# Environment overrides:
#   CHAOS_TARGET     container to test            (default: chronomind-web)
#   CHAOS_WAIT_SECS  recovery window in seconds   (default: 2100)
#   HERMES_HOME      directory holding the .env with TELEGRAM_BOT_TOKEN /
#                    TELEGRAM_CHAT_ID             (default: /root/.hermes)
#
# Exit codes: 0 = recovered in time, 1 = aborted before waiting for recovery,
#             2 = container did not recover within the window.
# =============================================================================

# Strict mode: abort on unhandled errors, unset variables and failed pipeline
# stages; -E lets an ERR trap (if one is ever added) fire inside functions too.
set -Eeuo pipefail

# Container to break, and how long to wait for it to come back.
TARGET="${CHAOS_TARGET:-chronomind-web}"
WAIT_SECS="${CHAOS_WAIT_SECS:-2100}" # 35 min
POLL_SECS=30

LOG_FILE="/var/log/chaos-validation.log"
TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"

# WAIT_SECS is used inside (( … )) later on. A non-numeric override would make
# that arithmetic fail *after* the container has already been killed and turn
# the run into an instant false FAIL, so reject it here, before any damage.
if [[ ! "$WAIT_SECS" =~ ^[1-9][0-9]*$ ]]; then
  printf 'chaos-validation: CHAOS_WAIT_SECS must be a positive integer (got %q)\n' \
    "$WAIT_SECS" >&2
  exit 1
fi

readonly TARGET WAIT_SECS POLL_SECS LOG_FILE TOKEN_FILE

#######################################
# Write a UTC-timestamped line to stderr and append it to the log file.
# Logging is best-effort: an unwritable LOG_FILE must not abort the script
# (it runs under `set -e` + `pipefail`, where a failing `tee` would otherwise
# terminate the run, possibly between injecting the failure and reporting).
# Globals:   LOG_FILE (read)
# Arguments: message words, joined with spaces
# Outputs:   "[<timestamp>] <message>" on stderr and appended to LOG_FILE
# Returns:   always 0
#######################################
log() {
  local stamp
  stamp=$(date -u '+%Y-%m-%dT%H:%M:%SZ') || stamp="unknown-time"
  # tee still copies the line to stderr when it cannot open LOG_FILE.
  printf '[%s] %s\n' "$stamp" "$*" | tee -a "$LOG_FILE" >&2 || true
}

#######################################
# Print the value of the last `KEY=value` assignment in a dotenv-style file.
# Only lines that start with the key (optionally preceded by whitespace and
# `export`) count, so commented-out or prefixed keys are ignored. The value
# runs up to the first whitespace; one pair of surrounding quotes is removed.
# Arguments: $1 - key name, $2 - file path
# Outputs:   the value on stdout (no trailing newline)
# Returns:   0 if the key was found, 1 otherwise
#######################################
_env_value() {
  local key="$1" file="$2"
  local line value="" found=1
  local re="^[[:space:]]*(export[[:space:]]+)?${key}=([^[:space:]]+)"

  [[ -r "$file" ]] || return 1
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" =~ $re ]]; then
      value=${BASH_REMATCH[2]}
      found=0
    fi
  done < "$file"
  (( found == 0 )) || return 1

  value=${value#\"}
  value=${value%\"}
  value=${value#\'}
  value=${value%\'}
  printf '%s' "$value"
}

#######################################
# Send a message to Telegram. Best-effort: never aborts the caller.
# Credentials are TELEGRAM_BOT_TOKEN / TELEGRAM_CHAT_ID from TOKEN_FILE.
# Globals:   TOKEN_FILE (read)
# Arguments: $1 - message text (may span several lines)
# Outputs:   nothing on stdout; problems are reported through log()
# Returns:   always 0
#######################################
notify() {
  local msg="$1"
  local tg_token="" tg_chat=""

  tg_token=$(_env_value TELEGRAM_BOT_TOKEN "$TOKEN_FILE") || tg_token=""
  tg_chat=$(_env_value TELEGRAM_CHAT_ID "$TOKEN_FILE") || tg_chat=""

  if [[ -z "$tg_token" || -z "$tg_chat" ]]; then
    # A skipped notification should at least leave a trace in the log.
    log "Telegram credentials not found in ${TOKEN_FILE} — notification skipped" || true
    return 0
  fi

  # --data-urlencode keeps '&', '+', '%' and newlines in the text intact;
  # --max-time stops a hung connection from stalling the script.
  if ! curl -sf --max-time 30 -X POST \
    "https://api.telegram.org/bot${tg_token}/sendMessage" \
    --data-urlencode "chat_id=${tg_chat}" \
    --data-urlencode "text=${msg}" > /dev/null 2>&1; then
    log "Telegram send failed" || true
  fi
  return 0
}

# ── Pre-flight ───────────────────────────────────────────────────────────────
# Only chaos-test a container that is currently running and healthy, and record
# its restart count / start time as the baseline for recovery detection.
#
# The name list is captured first and matched as a literal whole line:
#   * `docker ps | grep -q` under pipefail can fail with SIGPIPE when grep exits
#     early, which would be misread as "not running";
#   * the container name must not be interpreted as a regular expression.
running_names=$(docker ps --format '{{.Names}}') || running_names=""
if ! grep -qxF -- "$TARGET" <<< "$running_names"; then
  log "ERROR: target container '${TARGET}' not running — aborting"
  notify "🧪 Chaos test ABORTED — target ${TARGET} not running"
  exit 1
fi

# A container without a healthcheck makes the template fail; that maps to "none".
ORIG_HEALTH=$(docker inspect --format '{{.State.Health.Status}}' "$TARGET" 2>/dev/null || echo "none")
if [[ "$ORIG_HEALTH" != "healthy" ]]; then
  log "ERROR: target ${TARGET} not healthy (${ORIG_HEALTH}) — refusing to chaos-test"
  notify "🧪 Chaos test ABORTED — target ${TARGET} not healthy: ${ORIG_HEALTH}"
  exit 1
fi

ORIG_RESTART_COUNT=$(docker inspect --format '{{.RestartCount}}' "$TARGET" 2>/dev/null || echo "0")
ORIG_STARTED_AT=$(docker inspect --format '{{.State.StartedAt}}' "$TARGET" 2>/dev/null || echo "")

# Recovery is detected by StartedAt changing; without a baseline every later
# reading would look like a restart, so refuse to continue.
if [[ -z "$ORIG_STARTED_AT" ]]; then
  log "ERROR: could not read StartedAt for ${TARGET} — aborting"
  notify "🧪 Chaos test ABORTED — could not read start time of ${TARGET}"
  exit 1
fi

log "Chaos test starting — target=${TARGET} restart_count=${ORIG_RESTART_COUNT} started=${ORIG_STARTED_AT}"

# ── Inject failure ───────────────────────────────────────────────────────────
# Kill the container's PID 1 so the container exits non-zero. Docker's restart
# policy from compose (`restart: unless-stopped`) may bring it back; if it
# doesn't, the watchdog should.
#
# The signal is sent from the host, addressed to the init process's host PID.
# `docker exec <c> kill -9 1` does not work: the kernel discards SIGKILL sent
# to a PID namespace's init from inside that namespace, so nothing dies and the
# run would end in a false FAIL 35 minutes later.
# NOTE(review): assumes this script runs as root on the Docker host itself
# (not against a remote DOCKER_HOST) — confirm.
target_pid=$(docker inspect --format '{{.State.Pid}}' "$TARGET" 2>/dev/null || echo "")

# State.Pid is 0 for a stopped container; `kill 0` would signal our own process
# group, and PID 1 is the host's init — accept neither.
if [[ ! "$target_pid" =~ ^[1-9][0-9]*$ || "$target_pid" == "1" ]]; then
  log "ERROR: could not resolve host PID of ${TARGET} (got '${target_pid}') — aborting"
  notify "🧪 Chaos test ABORTED — could not resolve host PID of ${TARGET}"
  exit 1
fi

log "Injecting failure: killing PID 1 of ${TARGET} (host pid ${target_pid})"
if ! kill -KILL "$target_pid" 2>/dev/null; then
  # Without a successful injection the wait below would only prove that an
  # untouched container never restarted; report that instead of a false FAIL.
  log "ERROR: failed to send SIGKILL to host pid ${target_pid} — aborting"
  notify "🧪 Chaos test ABORTED — could not kill PID 1 of ${TARGET}"
  exit 1
fi

CHAOS_START=$(date +%s)

# ── Wait for recovery ────────────────────────────────────────────────────────
# Poll every POLL_SECS until the container is running again with a new
# StartedAt and a "healthy" status, or until WAIT_SECS have elapsed since
# CHAOS_START. Sets RECOVERED (0/1) and RECOVERY_SECS for the report step.
RECOVERED=0
RECOVERY_SECS=0
while (( $(date +%s) - CHAOS_START < WAIT_SECS )); do
  sleep "$POLL_SECS"

  # Capture first, then match literally as a whole line: avoids a spurious
  # pipefail/SIGPIPE failure from `docker ps | grep -q` and regex surprises.
  running_names=$(docker ps --format '{{.Names}}') || running_names=""
  if ! grep -qxF -- "$TARGET" <<< "$running_names"; then
    log "Container ${TARGET} not in 'docker ps' yet"
    continue
  fi

  current_started=$(docker inspect --format '{{.State.StartedAt}}' "$TARGET" 2>/dev/null || echo "")
  current_health=$(docker inspect --format '{{.State.Health.Status}}' "$TARGET" 2>/dev/null || echo "none")

  # Recovery = container restarted (new, non-empty StartedAt) AND now healthy.
  # An empty reading means inspect failed and must not count as "restarted".
  if [[ -n "$current_started" \
    && "$current_started" != "$ORIG_STARTED_AT" \
    && "$current_health" == "healthy" ]]; then
    RECOVERED=1
    RECOVERY_SECS=$(( $(date +%s) - CHAOS_START ))
    # NOTE(review): RestartCount is believed to be incremented only by Docker's
    # own restart policy, so comparing it with the baseline hints at who did
    # the restart — confirm before relying on it.
    current_restarts=$(docker inspect --format '{{.RestartCount}}' "$TARGET" 2>/dev/null || echo "?")
    log "RECOVERED in ${RECOVERY_SECS}s (new start=${current_started}, health=${current_health}, restart_count=${current_restarts} was ${ORIG_RESTART_COUNT})"
    break
  fi

  log "Still recovering: started=${current_started} health=${current_health}"
done

# ── Report ───────────────────────────────────────────────────────────────────
# Always log and send a Telegram message. Exit 0 on recovery, 2 on failure.
# `|| true` after notify keeps a notification problem from replacing the
# intended exit code under `set -e`.
if (( RECOVERED == 1 )); then
  msg="✅ Chaos validation PASS
Target: ${TARGET}
Recovered in: ${RECOVERY_SECS}s (window ${WAIT_SECS}s)
Watchdog working as designed."
  log "PASS"
  notify "$msg" || true
  exit 0
fi

# Not recovered: include the container's final state in the alert; "missing"
# means docker inspect itself failed.
current_state=$(docker inspect --format '{{.State.Status}}/{{.State.Health.Status}}' "$TARGET" 2>/dev/null || echo "missing")
msg="🚨 Chaos validation FAIL
Target: ${TARGET}
State after ${WAIT_SECS}s: ${current_state}
docker-health-watchdog did NOT restore the container.
Investigate: journalctl -u docker-health-watchdog.service --since '1 hour ago'"
log "FAIL: state=${current_state}"
notify "$msg" || true
exit 2