#!/usr/bin/env bash
# =============================================================================
# chaos-validation.sh — Verify docker-health-watchdog actually heals containers
#
# Once a month, intentionally break a non-critical test container and confirm
# the watchdog restarts it within the expected window. Reports result to
# Telegram regardless of outcome (silence = unknown = bad).
#
# Test target: chronomind-web (Next.js, idempotent, no side effects on restart)
# Method:      kill PID 1 inside the container → healthcheck fails →
#              watchdog detects after 3 consecutive failures (~30 min worst case) →
#              docker restart.
#
# Wait window: WATCHDOG_TIMER_FREQ × WATCHDOG_FAILURE_THRESHOLD + buffer = 35 min.
#
# Environment:
#   CHAOS_TARGET     container name to break            (default: chronomind-web)
#   CHAOS_WAIT_SECS  recovery window, whole seconds     (default: 2100)
#   HERMES_HOME      directory containing the .env file that holds
#                    TELEGRAM_BOT_TOKEN / TELEGRAM_CHAT_ID (default: /root/.hermes)
#
# Exit codes: 0 = recovered, 1 = aborted before injection, 2 = not recovered.
#
# NOTE(review): per pid_namespaces(7) a SIGKILL sent to PID 1 from *inside* its
# own PID namespace is ignored by the kernel, so `docker exec … kill -9 1` may
# be a no-op. The script therefore tracks whether any disturbance was ever
# observed and reports that case separately instead of blaming the watchdog.
# Confirm the injection method against the real target.
# =============================================================================
set -Eeuo pipefail

readonly TARGET="${CHAOS_TARGET:-chronomind-web}"
readonly WAIT_SECS="${CHAOS_WAIT_SECS:-2100}" # 35 min
readonly POLL_SECS=30
readonly LOG_FILE="/var/log/chaos-validation.log"
readonly TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"

#######################################
# Write a UTC-timestamped line to stderr and append it to LOG_FILE.
# The file append is best-effort: an unwritable log must not abort the run
# (under `set -e` + pipefail a failing `tee` would kill the script before any
# notification is sent).
# Globals:   LOG_FILE (read)
# Arguments: message words
#######################################
log() {
  local line
  line="[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*"
  printf '%s\n' "$line" >&2
  { printf '%s\n' "$line" >>"$LOG_FILE"; } 2>/dev/null || true
}

#######################################
# Send a message to Telegram using credentials read from TOKEN_FILE.
# Never fails the script: a missing credential or a failed send is only logged.
# Globals:   TOKEN_FILE (read)
# Arguments: $1 - message text
#######################################
notify() {
  local msg="$1"
  local tg_token="" tg_chat=""

  if [[ -f "$TOKEN_FILE" ]]; then
    tg_token=$(grep -m1 -oP '(?<=TELEGRAM_BOT_TOKEN=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
    tg_chat=$(grep -m1 -oP '(?<=TELEGRAM_CHAT_ID=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
  fi

  if [[ -z "$tg_token" || -z "$tg_chat" ]]; then
    log "Telegram credentials not found in ${TOKEN_FILE} — notification skipped"
    return 0
  fi

  # --data-urlencode: the text contains spaces, non-ASCII and punctuation that
  # must be percent-encoded in a form body. --max-time: never hang the run.
  curl -sf --max-time 30 -X POST \
    "https://api.telegram.org/bot${tg_token}/sendMessage" \
    --data-urlencode "chat_id=${tg_chat}" \
    --data-urlencode "text=${msg}" >/dev/null 2>&1 ||
    log "Telegram send failed"
}

#######################################
# Succeed when a container with exactly this name is listed by `docker ps`.
# Output is captured first so `grep -q` exiting early cannot SIGPIPE docker
# and trip pipefail; -xF matches the name literally, whole line.
# Arguments: $1 - container name
# Returns:   0 if listed, non-zero otherwise (including docker errors)
#######################################
container_running() {
  local names
  names=$(docker ps --format '{{.Names}}') || return 1
  grep -qxF -- "$1" <<<"$names"
}

#######################################
# Print one `docker inspect` template result for TARGET, or a fallback value
# when inspect fails (missing container, absent Health section, …).
# Globals:   TARGET (read)
# Arguments: $1 - Go template, $2 - fallback text
#######################################
inspect_target() {
  docker inspect --format "$1" "$TARGET" 2>/dev/null || printf '%s\n' "$2"
}

# ── Input validation ─────────────────────────────────────────────────────────
# WAIT_SECS feeds arithmetic evaluation; accept plain non-negative integers only.
if [[ ! "$WAIT_SECS" =~ ^(0|[1-9][0-9]*)$ ]]; then
  log "ERROR: CHAOS_WAIT_SECS must be a non-negative integer (got '${WAIT_SECS}') — aborting"
  notify "🧪 Chaos test ABORTED — invalid CHAOS_WAIT_SECS: ${WAIT_SECS}"
  exit 1
fi

# ── Pre-flight ───────────────────────────────────────────────────────────────
if ! container_running "$TARGET"; then
  log "ERROR: target container '${TARGET}' not running — aborting"
  notify "🧪 Chaos test ABORTED — target ${TARGET} not running"
  exit 1
fi

ORIG_HEALTH=$(inspect_target '{{.State.Health.Status}}' "none")
if [[ "$ORIG_HEALTH" != "healthy" ]]; then
  log "ERROR: target ${TARGET} not healthy (${ORIG_HEALTH}) — refusing to chaos-test"
  notify "🧪 Chaos test ABORTED — target ${TARGET} not healthy: ${ORIG_HEALTH}"
  exit 1
fi

ORIG_RESTART_COUNT=$(inspect_target '{{.RestartCount}}' "0")
ORIG_STARTED_AT=$(inspect_target '{{.State.StartedAt}}' "")

log "Chaos test starting — target=${TARGET} restart_count=${ORIG_RESTART_COUNT} started=${ORIG_STARTED_AT}"

# ── Inject failure ───────────────────────────────────────────────────────────
# Kill PID 1 inside container — process exits non-zero, healthcheck fails on next probe.
# Docker auto-restart from compose (`restart: unless-stopped`) will restart on the
# subsequent fail; if it doesn't, watchdog should.
log "Injecting failure: killing PID 1 inside ${TARGET}"
inject_rc=0
docker exec "$TARGET" kill -9 1 2>/dev/null || inject_rc=$?
# A non-zero status is not necessarily an error (the exec session dies with the
# container if the kill succeeds), so it is recorded rather than acted upon.
log "Injection command exit status: ${inject_rc}"

CHAOS_START=$(date +%s)

# ── Wait for recovery ────────────────────────────────────────────────────────
RECOVERED=0
RECOVERY_SECS=0
FAULT_OBSERVED=0 # becomes 1 once the container is seen missing, restarted or not healthy

while (( $(date +%s) - CHAOS_START < WAIT_SECS )); do
  sleep "$POLL_SECS"

  if ! container_running "$TARGET"; then
    FAULT_OBSERVED=1
    log "Container ${TARGET} not in 'docker ps' yet"
    continue
  fi

  current_started=$(inspect_target '{{.State.StartedAt}}' "")
  current_health=$(inspect_target '{{.State.Health.Status}}' "none")

  if [[ "$current_started" != "$ORIG_STARTED_AT" || "$current_health" != "healthy" ]]; then
    FAULT_OBSERVED=1
  fi

  # Recovery = container restarted (new StartedAt) AND now healthy
  if [[ "$current_started" != "$ORIG_STARTED_AT" && "$current_health" == "healthy" ]]; then
    RECOVERED=1
    RECOVERY_SECS=$(( $(date +%s) - CHAOS_START ))
    log "RECOVERED in ${RECOVERY_SECS}s (new start=${current_started}, health=${current_health})"
    break
  fi

  log "Still recovering: started=${current_started} health=${current_health}"
done

# ── Report ───────────────────────────────────────────────────────────────────
if (( RECOVERED == 1 )); then
  msg="✅ Chaos validation PASS Target: ${TARGET}"
  msg+=" Recovered in: ${RECOVERY_SECS}s (window ${WAIT_SECS}s)"
  msg+=" Watchdog working as designed."
  log "PASS"
  notify "$msg"
  exit 0
fi

current_state=$(inspect_target '{{.State.Status}}/{{.State.Health.Status}}' "missing")

if (( FAULT_OBSERVED == 0 )); then
  # The container stayed healthy with its original start time for the whole
  # window: the injected fault never took effect, so the watchdog was never
  # exercised. Still a failed validation (exit 2), but not the watchdog's fault.
  msg="🚨 Chaos validation INCONCLUSIVE Target: ${TARGET}"
  msg+=" State after ${WAIT_SECS}s: ${current_state}"
  msg+=" Failure injection had no observable effect (injection exit status ${inject_rc});"
  msg+=" the watchdog was not exercised. Check the injection method."
  log "FAIL: injection had no observable effect, state=${current_state}"
else
  msg="🚨 Chaos validation FAIL Target: ${TARGET}"
  msg+=" State after ${WAIT_SECS}s: ${current_state}"
  msg+=" docker-health-watchdog did NOT restore the container."
  msg+=" Investigate: journalctl -u docker-health-watchdog.service --since '1 hour ago'"
  log "FAIL: state=${current_state}"
fi

notify "$msg"
exit 2