bytelyst-devops-tools/scripts/VMs/HostingerVM/chaos-validation.sh

#!/usr/bin/env bash
# =============================================================================
# chaos-validation.sh — Verify docker-health-watchdog actually heals containers
#
# Once a month, intentionally break a non-critical test container and confirm
# the watchdog restarts it within the expected window. Reports result to
# Telegram regardless of outcome (silence = unknown = bad).
#
# Test target: chronomind-web (Next.js, idempotent, no side effects on restart)
# Method:      kill PID 1 inside the container → healthcheck fails →
#              watchdog detects after 3 consecutive failures (~30 min worst case) →
#              docker restart.
#
# Wait window: WATCHDOG_TIMER_FREQ × WATCHDOG_FAILURE_THRESHOLD + buffer = 35 min.
# =============================================================================
# Strict mode: -E makes ERR traps inherit into functions, -e aborts on an
# unhandled failure, -u rejects unset variables, pipefail makes a pipeline fail
# when any stage fails.
set -Eeuo pipefail

# Tunables — all overridable from the environment; defaults match the values
# this script has always used.
TARGET="${CHAOS_TARGET:-chronomind-web}"   # container to break
WAIT_SECS="${CHAOS_WAIT_SECS:-2100}"       # max seconds to wait for recovery (35 min)
POLL_SECS="${CHAOS_POLL_SECS:-30}"         # seconds between recovery checks

LOG_FILE="${CHAOS_LOG_FILE:-/var/log/chaos-validation.log}"
# File holding TELEGRAM_BOT_TOKEN / TELEGRAM_CHAT_ID (read by notify).
TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"

# log MESSAGE...
#   Write a UTC-timestamped line to stderr and append it to LOG_FILE.
#   The file append is best-effort: if LOG_FILE cannot be written, the line
#   still reaches stderr and the function returns 0, so a logging problem can
#   never abort the script (set -e / pipefail) before the Telegram report is sent.
# Globals:   LOG_FILE (read)
# Arguments: words of the message, joined with spaces
# Outputs:   the formatted line on stderr
log() {
  local line
  line="[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*"
  printf '%s\n' "$line" >&2
  # Group + outer redirect so a failed open of LOG_FILE is silenced as well.
  { printf '%s\n' "$line" >>"$LOG_FILE"; } 2>/dev/null || true
}

# notify MESSAGE
#   Send MESSAGE to Telegram through the Bot API sendMessage endpoint.
#   Credentials are read from TOKEN_FILE as KEY=VALUE lines
#   (TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID). Only real assignments count:
#   commented-out lines and keys that merely end in the same name are ignored,
#   an optional leading "export " and surrounding quotes are accepted, and the
#   last assignment wins.
#   Best-effort: missing credentials or a failed request are logged, never fatal.
# Globals:   TOKEN_FILE (read)
# Arguments: $1 - message text (may contain newlines and special characters)
notify() {
  local msg="$1"
  local tg_token="" tg_chat="" line key value

  if [[ -f "$TOKEN_FILE" && -r "$TOKEN_FILE" ]]; then
    while IFS= read -r line || [[ -n "$line" ]]; do
      line="${line#"${line%%[![:space:]]*}"}"   # drop leading whitespace
      line="${line#export }"
      case "$line" in
        TELEGRAM_BOT_TOKEN=*) key="token" ;;
        TELEGRAM_CHAT_ID=*)   key="chat" ;;
        *) continue ;;
      esac
      value="${line#*=}"
      value="${value%%[[:space:]]*}"            # stop at first whitespace (also strips CR)
      if [[ "$value" == \"*\" || "$value" == \'*\' ]]; then
        value="${value:1:${#value}-2}"          # strip one pair of surrounding quotes
      fi
      if [[ "$key" == "token" ]]; then
        tg_token="$value"
      else
        tg_chat="$value"
      fi
    done < "$TOKEN_FILE"
  fi

  if [[ -z "$tg_token" || -z "$tg_chat" ]]; then
    log "Telegram credentials not found in ${TOKEN_FILE} — notification skipped"
    return 0
  fi

  # --data-urlencode so '&', '+', '%', newlines and emoji in the message survive
  # form encoding; --max-time so a hung connection cannot stall the script.
  curl -sf --max-time 20 -X POST "https://api.telegram.org/bot${tg_token}/sendMessage" \
    --data-urlencode "chat_id=${tg_chat}" \
    --data-urlencode "text=${msg}" > /dev/null 2>&1 || log "Telegram send failed"
}

# ── Pre-flight ───────────────────────────────────────────────────────────────
# Validate the timing knobs before anything destructive happens: a non-numeric
# value would otherwise only blow up in the wait loop, after the failure has
# already been injected.
if [[ ! "$WAIT_SECS" =~ ^[1-9][0-9]*$ || ! "$POLL_SECS" =~ ^[1-9][0-9]*$ ]]; then
  log "ERROR: WAIT_SECS='${WAIT_SECS}' and POLL_SECS='${POLL_SECS}' must be positive integers — aborting"
  notify "🧪 Chaos test ABORTED — invalid wait/poll configuration"
  exit 1
fi

# The target must be listed by `docker ps`. The names are captured first and
# matched as a fixed, whole-line string: piping straight into `grep -q` can make
# the pipeline fail under pipefail (SIGPIPE on the producer), and the container
# name must not be interpreted as a regex.
running_names=$(docker ps --format '{{.Names}}' || true)
if ! grep -Fxq -- "$TARGET" <<<"$running_names"; then
  log "ERROR: target container '${TARGET}' not running — aborting"
  notify "🧪 Chaos test ABORTED — target ${TARGET} not running"
  exit 1
fi

# Only break a container that is currently healthy; "none" is the fallback when
# the health status cannot be read.
ORIG_HEALTH=$(docker inspect --format '{{.State.Health.Status}}' "$TARGET" 2>/dev/null || echo "none")
if [[ "$ORIG_HEALTH" != "healthy" ]]; then
  log "ERROR: target ${TARGET} not healthy (${ORIG_HEALTH}) — refusing to chaos-test"
  notify "🧪 Chaos test ABORTED — target ${TARGET} not healthy: ${ORIG_HEALTH}"
  exit 1
fi

# Baseline for recovery detection: a restart shows up as a changed StartedAt.
ORIG_RESTART_COUNT=$(docker inspect --format '{{.RestartCount}}' "$TARGET" 2>/dev/null || echo "0")
ORIG_STARTED_AT=$(docker inspect --format '{{.State.StartedAt}}' "$TARGET" 2>/dev/null || echo "")
if [[ -z "$ORIG_STARTED_AT" ]]; then
  # Without a baseline, any later StartedAt would look like a restart (false PASS).
  log "ERROR: could not read StartedAt for ${TARGET} — aborting"
  notify "🧪 Chaos test ABORTED — could not read start time of ${TARGET}"
  exit 1
fi

log "Chaos test starting — target=${TARGET} restart_count=${ORIG_RESTART_COUNT} started=${ORIG_STARTED_AT}"

# ── Inject failure ───────────────────────────────────────────────────────────
# Kill the container's init process (PID 1 inside the container) from the HOST.
# `docker exec ... kill -9 1` does not work: the kernel ignores SIGKILL sent to a
# PID namespace's init by a process in the same namespace (pid_namespaces(7)), so
# nothing would be injected and the test would report a bogus FAIL. Sent from the
# host (an ancestor namespace), SIGKILL is delivered and the container exits.
# Docker's restart policy (compose `restart: unless-stopped`) is expected to
# restart it; if it doesn't, the watchdog should.
# NOTE(review): assumes this script runs as root on the Docker host itself (local
# daemon), so State.Pid is a PID in this host's namespace — confirm.
log "Injecting failure: killing PID 1 inside ${TARGET}"
TARGET_HOST_PID=$(docker inspect --format '{{.State.Pid}}' "$TARGET" 2>/dev/null || echo "")
if [[ ! "$TARGET_HOST_PID" =~ ^[1-9][0-9]*$ ]] || ! kill -9 "$TARGET_HOST_PID" 2>/dev/null; then
  # Nothing was broken, so waiting for a "recovery" would only blame the watchdog
  # for a failure that never happened. Abort loudly instead.
  log "ERROR: could not kill init process of ${TARGET} (host pid='${TARGET_HOST_PID}') — aborting"
  notify "🧪 Chaos test ABORTED — could not inject failure into ${TARGET}"
  exit 1
fi

CHAOS_START=$(date +%s)

# ── Wait for recovery ────────────────────────────────────────────────────────
# Poll every POLL_SECS until the container has a new StartedAt and reports
# healthy, or until WAIT_SECS have elapsed since CHAOS_START.
RECOVERED=0
RECOVERY_SECS=0
while (( $(date +%s) - CHAOS_START < WAIT_SECS )); do
  sleep "$POLL_SECS"

  # Capture the name list first and match it as a fixed whole-line string:
  # `docker ps | grep -q` can fail under pipefail when grep exits early
  # (SIGPIPE on docker), and the name must not be treated as a regex.
  running_names=$(docker ps --format '{{.Names}}' || true)
  if ! grep -Fxq -- "$TARGET" <<<"$running_names"; then
    log "Container ${TARGET} not in 'docker ps' yet"
    continue
  fi

  current_started=$(docker inspect --format '{{.State.StartedAt}}' "$TARGET" 2>/dev/null || echo "")
  current_health=$(docker inspect --format '{{.State.Health.Status}}' "$TARGET" 2>/dev/null || echo "none")

  # Recovery = container restarted (new StartedAt) AND now healthy
  if [[ "$current_started" != "$ORIG_STARTED_AT" && "$current_health" == "healthy" ]]; then
    RECOVERED=1
    RECOVERY_SECS=$(( $(date +%s) - CHAOS_START ))
    log "RECOVERED in ${RECOVERY_SECS}s (new start=${current_started}, health=${current_health})"
    break
  fi
  log "Still recovering: started=${current_started} health=${current_health}"
done

# ── Report ───────────────────────────────────────────────────────────────────
# Always log and send a Telegram message. Exit codes: 0 = recovered in time,
# 2 = not recovered within the wait window.
if (( RECOVERED != 1 )); then
  # Snapshot "<status>/<health>" for the report; "missing" if inspect fails.
  current_state=$(docker inspect --format '{{.State.Status}}/{{.State.Health.Status}}' "$TARGET" 2>/dev/null || echo "missing")
  printf -v msg '%s\n%s\n%s\n%s\n%s' \
    "🚨 Chaos validation FAIL" \
    "Target: ${TARGET}" \
    "State after ${WAIT_SECS}s: ${current_state}" \
    "docker-health-watchdog did NOT restore the container." \
    "Investigate: journalctl -u docker-health-watchdog.service --since '1 hour ago'"
  log "FAIL: state=${current_state}"
  notify "$msg"
  exit 2
fi

printf -v msg '%s\n%s\n%s\n%s' \
  "✅ Chaos validation PASS" \
  "Target: ${TARGET}" \
  "Recovered in: ${RECOVERY_SECS}s (window ${WAIT_SECS}s)" \
  "Watchdog working as designed."
log "PASS"
notify "$msg"
exit 0