feat(vm): Phase 5 closure — GPU/freshness checks, chaos validation, I/O alert
vm-health-check.sh: - check_gpu(): nvidia-smi probe; "CPU-only" OK on this VM (no GPU) - check_image_freshness(): flag containers running images >30d old. Skips third-party images (gitea, grafana, prom, mcr.microsoft, axllent, caddy, traefik, valkey, cadvisor) — they have their own rebuild cadence. Currently flags 19 stale product images (~60d old). chaos-validation.sh: - Monthly chaos test: kill PID 1 in chronomind-web, wait up to 35 min for docker-health-watchdog to detect + restart. Telegram pass/fail. - Refuses to run if target not healthy. systemd timer fires 1st of month at 10:00 UTC (after 08:00 weekly digest). vm-io-anomaly-check.sh: - 6h avg sda write rate; transition alerts at WARN (1 GB/hr) / CRIT (2.5 GB/hr). De-dupes via /var/log/vm-io-anomaly-state so the alert fires once per transition, not every 6h. Current baseline: ~1.94 GB/hr (orphan-container state-file writes; see Phase 0.3). - Reports recovery to OK when rate drops back. vm/page.tsx: gpu + image_freshness added to CHECK_META so they render with proper icon/label and slot into CHECK_ORDER. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
76ef17f26b
commit
13a105ba23
@ -1085,14 +1085,17 @@ const CHECK_META: Record<string, { label: string; icon: React.ElementType }> = {
|
||||
syslog: { label: 'Syslog', icon: ScrollText },
|
||||
failed_units: { label: 'Systemd Units', icon: Activity },
|
||||
cron_missing_paths: { label: 'Cron Paths', icon: Clock },
|
||||
gpu: { label: 'GPU', icon: Zap },
|
||||
image_freshness: { label: 'Image Freshness', icon: Layers },
|
||||
};
|
||||
|
||||
const CHECK_ORDER = [
|
||||
'disk', 'load', 'steal', 'ram', 'swap',
|
||||
'container_loops', 'container_health', 'docker_daemon',
|
||||
'build_cache', 'docker_images',
|
||||
'build_cache', 'docker_images', 'image_freshness',
|
||||
'journal', 'syslog',
|
||||
'failed_units', 'cron_missing_paths',
|
||||
'gpu',
|
||||
];
|
||||
|
||||
// ── Main page ──────────────────────────────────────────────────────────────
|
||||
|
||||
113
scripts/VMs/HostingerVM/chaos-validation.sh
Executable file
113
scripts/VMs/HostingerVM/chaos-validation.sh
Executable file
@ -0,0 +1,113 @@
|
||||
#!/usr/bin/env bash
|
||||
# =============================================================================
|
||||
# chaos-validation.sh — Verify docker-health-watchdog actually heals containers
|
||||
#
|
||||
# Once a month, intentionally break a non-critical test container and confirm
|
||||
# the watchdog restarts it within the expected window. Reports result to
|
||||
# Telegram regardless of outcome (silence = unknown = bad).
|
||||
#
|
||||
# Test target: chronomind-web (Next.js, idempotent, no side effects on restart)
|
||||
# Method: kill PID 1 inside the container → healthcheck fails →
|
||||
# watchdog detects after 3 consecutive failures (~30 min worst case) →
|
||||
# docker restart.
|
||||
#
|
||||
# Wait window: WATCHDOG_TIMER_FREQ × WATCHDOG_FAILURE_THRESHOLD + buffer = 35 min.
|
||||
# =============================================================================
|
||||
set -Eeuo pipefail
|
||||
|
||||
TARGET="${CHAOS_TARGET:-chronomind-web}"
|
||||
WAIT_SECS="${CHAOS_WAIT_SECS:-2100}" # 35 min
|
||||
POLL_SECS=30
|
||||
|
||||
LOG_FILE="/var/log/chaos-validation.log"
|
||||
TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"
|
||||
|
||||
log() { echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*" | tee -a "$LOG_FILE" >&2; }
|
||||
|
||||
# notify MSG — send MSG to Telegram using the bot credentials in TOKEN_FILE.
#
# Best-effort: this function never aborts the script. A skipped send (missing
# credentials) or a failed send is written to the log, so a silent Telegram
# channel can be diagnosed from the log file afterwards.
notify() {
    local msg="$1"
    local tg_token="" tg_chat=""

    if [[ -f "$TOKEN_FILE" ]]; then
        # -m1: stop at the first matching line so a duplicated key cannot
        # yield a multi-line value that would corrupt the request.
        tg_token=$(grep -m1 -oP '(?<=TELEGRAM_BOT_TOKEN=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
        tg_chat=$(grep -m1 -oP '(?<=TELEGRAM_CHAT_ID=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
    fi

    if [[ -z "$tg_token" || -z "$tg_chat" ]]; then
        log "No Telegram credentials in ${TOKEN_FILE} — notification NOT sent"
        return 0
    fi

    # --data-urlencode (not -d): the message is free text and may contain
    # '&', '+', '%' or '=' which plain -d would pass through un-encoded and
    # the form decoder on the other side would then mangle.
    # --max-time: a hung connection must not stall the script indefinitely.
    curl -sf --max-time 15 -X POST "https://api.telegram.org/bot${tg_token}/sendMessage" \
        --data-urlencode "chat_id=${tg_chat}" \
        --data-urlencode "text=${msg}" > /dev/null 2>&1 || log "Telegram send failed"
}
|
||||
|
||||
# ── Pre-flight ───────────────────────────────────────────────────────────────
# Refuse to inject a failure unless the target is currently running AND
# reporting "healthy"; otherwise the test result would be meaningless.
#
# grep -x -F: compare the whole line literally. Container names may contain
# '.', which would act as a wildcard if the name were used as a regex.
if ! docker ps --format '{{.Names}}' | grep -qxF -- "$TARGET"; then
    log "ERROR: target container '${TARGET}' not running — aborting"
    notify "🧪 Chaos test ABORTED — target ${TARGET} not running"
    exit 1
fi

# "none" is substituted when the inspect call fails (e.g. no healthcheck defined).
ORIG_HEALTH=$(docker inspect --format '{{.State.Health.Status}}' "$TARGET" 2>/dev/null || echo "none")
if [[ "$ORIG_HEALTH" != "healthy" ]]; then
    log "ERROR: target ${TARGET} not healthy (${ORIG_HEALTH}) — refusing to chaos-test"
    notify "🧪 Chaos test ABORTED — target ${TARGET} not healthy: ${ORIG_HEALTH}"
    exit 1
fi

# Baseline values recorded before the failure is injected; StartedAt is what
# the recovery loop later compares against to detect a restart.
ORIG_RESTART_COUNT=$(docker inspect --format '{{.RestartCount}}' "$TARGET" 2>/dev/null || echo "0")
ORIG_STARTED_AT=$(docker inspect --format '{{.State.StartedAt}}' "$TARGET" 2>/dev/null || echo "")

log "Chaos test starting — target=${TARGET} restart_count=${ORIG_RESTART_COUNT} started=${ORIG_STARTED_AT}"
|
||||
|
||||
# ── Inject failure ───────────────────────────────────────────────────────────
# Kill the container's main process (PID 1 inside the container) FROM THE HOST.
#
# Sending SIGKILL with `docker exec <ctr> kill -9 1` does not work: the kernel
# ignores SIGKILL/SIGSTOP sent to a PID namespace's init process by members of
# that same namespace (see pid_namespaces(7)), so the process would survive and
# this test would time out with a false FAIL. A SIGKILL sent from the parent
# (host) namespace to the process's host PID is always delivered.
#
# After the process dies, Docker's restart policy from compose
# (`restart: unless-stopped`) may bring the container back; if it doesn't,
# the watchdog should.
log "Injecting failure: killing PID 1 inside ${TARGET}"
TARGET_HOST_PID=$(docker inspect --format '{{.State.Pid}}' "$TARGET" 2>/dev/null || echo "")

# State.Pid is 0 for a non-running container; only a positive integer is a
# valid kill target. A failed injection must abort loudly, otherwise the wait
# loop below would run the full window against a container that was never broken.
if [[ ! "$TARGET_HOST_PID" =~ ^[1-9][0-9]*$ ]] || ! kill -KILL "$TARGET_HOST_PID" 2>/dev/null; then
    log "ERROR: could not kill main process of ${TARGET} (host pid='${TARGET_HOST_PID}') — aborting"
    notify "🧪 Chaos test ABORTED — could not inject failure into ${TARGET}"
    exit 1
fi

CHAOS_START=$(date +%s)
|
||||
|
||||
# ── Wait for recovery ────────────────────────────────────────────────────────
# Poll every POLL_SECS until WAIT_SECS have elapsed since CHAOS_START.
# Outputs: RECOVERED (0/1) and RECOVERY_SECS (seconds until recovery was seen).
RECOVERED=0
RECOVERY_SECS=0
while (( $(date +%s) - CHAOS_START < WAIT_SECS )); do
    sleep "$POLL_SECS"

    # grep -x -F: literal whole-line match, so a '.' in the container name is
    # not treated as a regex wildcard.
    if ! docker ps --format '{{.Names}}' | grep -qxF -- "$TARGET"; then
        log "Container ${TARGET} not in 'docker ps' yet"
        continue
    fi

    current_started=$(docker inspect --format '{{.State.StartedAt}}' "$TARGET" 2>/dev/null || echo "")
    current_health=$(docker inspect --format '{{.State.Health.Status}}' "$TARGET" 2>/dev/null || echo "none")

    # Recovery = container restarted (new StartedAt) AND now healthy
    if [[ "$current_started" != "$ORIG_STARTED_AT" && "$current_health" == "healthy" ]]; then
        RECOVERED=1
        RECOVERY_SECS=$(( $(date +%s) - CHAOS_START ))
        log "RECOVERED in ${RECOVERY_SECS}s (new start=${current_started}, health=${current_health})"
        break
    fi
    log "Still recovering: started=${current_started} health=${current_health}"
done
|
||||
|
||||
# ── Report ───────────────────────────────────────────────────────────────────
# Always log and notify the outcome, then exit: 0 on PASS, 2 on FAIL.
# The failure path is handled first so the success path reads straight through.
if (( RECOVERED != 1 )); then
    current_state=$(docker inspect --format '{{.State.Status}}/{{.State.Health.Status}}' "$TARGET" 2>/dev/null || echo "missing")
    log "FAIL: state=${current_state}"
    notify "🚨 Chaos validation FAIL
Target: ${TARGET}
State after ${WAIT_SECS}s: ${current_state}
docker-health-watchdog did NOT restore the container.
Investigate: journalctl -u docker-health-watchdog.service --since '1 hour ago'"
    exit 2
fi

log "PASS"
notify "✅ Chaos validation PASS
Target: ${TARGET}
Recovered in: ${RECOVERY_SECS}s (window ${WAIT_SECS}s)
Watchdog working as designed."
exit 0
|
||||
@ -347,6 +347,62 @@ check_automation_drift() {
|
||||
fi
|
||||
}
|
||||
|
||||
# check_gpu — record whether an NVIDIA GPU is usable on this host.
#
# Results (via the file's `record` helper, check key "gpu"):
#   OK   <gpu name>  nvidia-smi is installed and reports a device
#   WARN             nvidia-smi is installed but reports no device
#   OK   "CPU-only"  nvidia-smi is not installed (not treated as a failure)
check_gpu() {
    # Surface GPU readiness so the Ollama panel can warn "GPU: none — slow inference"
    header "GPU"
    if command -v nvidia-smi >/dev/null 2>&1; then
        local gpu_name
        # `|| true`: nvidia-smi exits non-zero when no device is present, which
        # is exactly the WARN case below. NOTE(review): if this script enables
        # errexit/pipefail (not visible in this hunk), an unguarded failure
        # here would abort the whole health check instead of recording WARN.
        gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -n 1 | tr -d '\n') || true
        if [[ -n "$gpu_name" ]]; then
            record gpu OK "$gpu_name" "GPU available ($gpu_name)"
        else
            record gpu WARN "nvidia-smi present but no GPU" "GPU driver loaded but no device"
        fi
    else
        # No GPU → not a failure, but inference will be CPU-bound and slow
        record gpu OK "CPU-only" "No GPU — Ollama inference will be CPU-bound"
    fi
}
|
||||
|
||||
# check_image_freshness — flag running containers whose image was created more
# than `stale_days` days ago. Helps catch deploy-pipeline drift and forgotten
# services.
#
# Results (via the file's `record` helper, check key "image_freshness"):
#   WARN  space-separated list of "name(Nd)" entries for stale containers
#   OK    "all fresh" when no running container is on a stale image
#
# Containers whose image reference matches the third-party patterns below are
# skipped (no rebuild cadence under our control). Containers or images that
# disappear while the loop is running are skipped rather than reported.
check_image_freshness() {
    header "IMAGE FRESHNESS"
    local stale_days=30
    local now stale_cutoff
    now=$(date +%s)
    stale_cutoff=$(( now - stale_days * 86400 ))

    local stale_list=()
    local cid meta name image image_id created created_ts age_days
    while IFS= read -r cid; do
        [[ -z "$cid" ]] && continue

        # One inspect call per container instead of three. None of the three
        # fields can contain whitespace, so a space-separated read is safe.
        meta=$(docker inspect --format '{{.Name}} {{.Config.Image}} {{.Image}}' "$cid" 2>/dev/null) || continue
        read -r name image image_id <<< "$meta"
        name="${name#/}"   # docker reports container names with a leading '/'

        # Skip known third-party images (no rebuild cadence under our control)
        case "$image" in
            gitea/*|grafana/*|prom/*|mcr.microsoft.com/*|axllent/*|caddy:*|traefik:*|valkey/*|gcr.io/cadvisor/*) continue ;;
        esac

        [[ -z "$image_id" ]] && continue
        created=$(docker inspect --format '{{.Created}}' "$image_id" 2>/dev/null) || continue
        [[ -z "$created" ]] && continue

        # ISO timestamp → epoch; an unparsable timestamp falls back to "now",
        # i.e. the image is treated as fresh rather than flagged.
        created_ts=$(date -d "$created" +%s 2>/dev/null || echo "$now")
        if (( created_ts < stale_cutoff )); then
            age_days=$(( (now - created_ts) / 86400 ))
            stale_list+=("$name(${age_days}d)")
        fi
    done < <(docker ps --format '{{.ID}}' 2>/dev/null)

    if (( ${#stale_list[@]} > 0 )); then
        record image_freshness WARN "${stale_list[*]}" "${#stale_list[@]} container(s) on image >${stale_days}d old: ${stale_list[*]}"
    else
        record image_freshness OK "all fresh" "All container images rebuilt within ${stale_days}d"
    fi
}
|
||||
|
||||
# ── Run all checks ───────────────────────────────────────────────────────────
|
||||
|
||||
if ! $JSON_MODE && ! $QUIET; then
|
||||
@ -362,6 +418,8 @@ check_docker_containers
|
||||
check_docker_disk
|
||||
check_logs
|
||||
check_automation_drift
|
||||
check_gpu
|
||||
check_image_freshness
|
||||
|
||||
# ── Summary ──────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
126
scripts/VMs/HostingerVM/vm-io-anomaly-check.sh
Executable file
126
scripts/VMs/HostingerVM/vm-io-anomaly-check.sh
Executable file
@ -0,0 +1,126 @@
|
||||
#!/usr/bin/env bash
|
||||
# =============================================================================
|
||||
# vm-io-anomaly-check.sh — Sustained disk-write I/O anomaly alert
|
||||
#
|
||||
# Queries Prometheus for the average sda write rate over the past 6 hours.
|
||||
# Alerts via Telegram if the rate exceeds the WARN threshold and lists the
# top-3 containers by CPU usage (from `docker stats`) as a rough proxy for the
# likely writers — per-container blkio from cAdvisor is not available here.
|
||||
#
|
||||
# Phase 0.3 identified invttrdg-backend + trading-backend as the steady-state
|
||||
# write source (~6 GB/day). This script catches new spikes above that baseline.
|
||||
#
|
||||
# Runs every 6 hours via systemd timer.
|
||||
# =============================================================================
|
||||
set -Eeuo pipefail
|
||||
|
||||
BACKEND_CONTAINER="devops-backend"
|
||||
PROM="http://learning_ai_common_plat-prometheus-1:9090"
|
||||
|
||||
WARN_GB_PER_HR="${IO_WARN_GB_PER_HR:-1.0}" # baseline ~0.3 GB/hr; alert if 3x sustained
|
||||
CRIT_GB_PER_HR="${IO_CRIT_GB_PER_HR:-2.5}"
|
||||
|
||||
LOG_FILE="/var/log/vm-io-anomaly.log"
|
||||
STATE_FILE="/var/log/vm-io-anomaly-state"
|
||||
TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"
|
||||
|
||||
log() { echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*" | tee -a "$LOG_FILE" >&2; }
|
||||
|
||||
# prom_query QUERY — run an instant PromQL query and print its first result.
#
# The HTTP request is issued with curl from inside BACKEND_CONTAINER against
# the PROM base URL. Prints exactly ONE line on stdout:
#   - the first sample's value rounded to 3 decimals, or
#   - "?" on any failure (encoding error, container/curl failure, malformed
#     JSON, empty result set, or a non-finite value such as NaN/Inf).
# Always returns 0 so callers can use it in $(...) under `set -e`.
#
# Each stage is run separately instead of as one pipeline: under
# `set -o pipefail` a curl failure combined with the parser's own "?" fallback
# would otherwise emit "?" twice, which callers comparing against "?" miss.
prom_query() {
    local query="$1"
    local encoded response

    encoded=$(python3 -c 'import urllib.parse,sys; print(urllib.parse.quote(sys.argv[1]))' "$query" 2>/dev/null) \
        || { echo "?"; return 0; }

    response=$(docker exec "$BACKEND_CONTAINER" \
        curl -sf --max-time 10 "${PROM}/api/v1/query?query=${encoded}" 2>/dev/null) \
        || { echo "?"; return 0; }

    # The parser prints "?" itself for every handled error; the trailing
    # `|| echo` only fires when python3 could not run at all (no output yet).
    python3 -c '
import json, math, sys
try:
    r = json.load(sys.stdin)["data"]["result"]
    v = float(r[0]["value"][1]) if r else None
    print(round(v, 3) if v is not None and math.isfinite(v) else "?")
except Exception:
    print("?")
' <<< "$response" 2>/dev/null || echo "?"
}
|
||||
|
||||
# ── Pre-flight ───────────────────────────────────────────────────────────────
# The Prometheus query is issued from inside BACKEND_CONTAINER, so it must be up.
# grep -x -F: literal whole-line match of the container name (no regex).
if ! docker ps --format '{{.Names}}' 2>/dev/null | grep -qxF -- "$BACKEND_CONTAINER"; then
    log "ERROR: ${BACKEND_CONTAINER} not running — skipping"
    exit 0
fi

# 6-hour avg sda write rate in GB/hr
AVG_6H=$(prom_query 'avg_over_time((rate(node_disk_written_bytes_total{device="sda"}[5m]) * 3600 / 1073741824)[6h:5m])')

# Accept only a plain number. Anything else ("?", empty, multi-line or
# otherwise malformed output) is treated as a failed query: awk would coerce
# such a value to 0 and silently classify it as OK.
NUMERIC_RE='^-?[0-9]+(\.[0-9]+)?([eE][-+]?[0-9]+)?$'
if [[ ! "$AVG_6H" =~ $NUMERIC_RE ]]; then
    log "Could not query Prometheus — skipping"
    exit 0
fi

# Numeric comparison via awk (handles fractions)
LEVEL=$(awk -v v="$AVG_6H" -v w="$WARN_GB_PER_HR" -v c="$CRIT_GB_PER_HR" \
    'BEGIN{ if (v+0 >= c+0) print "CRIT"; else if (v+0 >= w+0) print "WARN"; else print "OK" }')

log "6h avg write rate = ${AVG_6H} GB/hr → ${LEVEL}"

# ── Compute daily projection ─────────────────────────────────────────────────
PROJ_DAY=$(awk -v v="$AVG_6H" 'BEGIN{ printf "%.1f", v*24 }')

# ── Identify top writers (cAdvisor doesn't expose per-container blkio in this
#    setup, but we can at least show top RAM/CPU consumers as a proxy) ────────
TOP_PROCS=""
if [[ "$LEVEL" != "OK" ]]; then
    # `|| true`: this list is context only. Under `set -e -o pipefail` a
    # failing `docker stats` must not abort the script right before the alert
    # it was meant to enrich.
    TOP_PROCS=$(docker stats --no-stream --format '{{.Name}} {{.CPUPerc}} {{.MemUsage}}' 2>/dev/null \
        | sort -k2 -rh | head -3 \
        | awk '{printf " %s — %s CPU, %s\n", $1, $2, $3}') || true
fi
|
||||
|
||||
# ── Deduplicate: only alert once per LEVEL transition ────────────────────────
# STATE_FILE holds the level of the last transition that was handled. It is
# updated only AFTER the alert for a transition has been delivered (or recorded
# in the log when no credentials exist). If the Telegram send fails, the old
# state is kept so the next run sees the same transition and retries instead
# of losing the alert for good.
PREV_LEVEL=""
if [[ -f "$STATE_FILE" ]]; then
    PREV_LEVEL=$(tr -d '[:space:]' < "$STATE_FILE" 2>/dev/null || echo "")
fi

if [[ "$LEVEL" == "$PREV_LEVEL" ]]; then
    log "Level unchanged (${LEVEL}); no alert"
    exit 0
fi

# Only notify on transitions INTO a non-OK level, or recovery to OK
if [[ "$LEVEL" == "OK" && "$PREV_LEVEL" != "" && "$PREV_LEVEL" != "OK" ]]; then
    MSG="✅ I/O anomaly cleared — $(hostname)
sda 6h avg now ${AVG_6H} GB/hr (was ${PREV_LEVEL})"
elif [[ "$LEVEL" != "OK" ]]; then
    ICON=$([[ "$LEVEL" == "CRIT" ]] && echo "🚨" || echo "⚠️")
    MSG="${ICON} I/O anomaly ${LEVEL} — $(hostname)
sda 6h avg = ${AVG_6H} GB/hr (~${PROJ_DAY} GB/day)
Threshold: WARN ${WARN_GB_PER_HR} / CRIT ${CRIT_GB_PER_HR}

Top containers (CPU proxy — cAdvisor blkio not available):
${TOP_PROCS:- (none)}

Phase 0.3 baseline: invttrdg-backend (~5 GB/day) + trading-backend (~1 GB/day).
Investigate further: docker stats; iotop -ao -n 5"
else
    # First run with no previous state and an OK reading: nothing to report,
    # just establish the baseline state.
    echo "$LEVEL" > "$STATE_FILE"
    log "No transition needing alert"
    exit 0
fi

# ── Send Telegram ────────────────────────────────────────────────────────────
TELEGRAM_TOKEN=""
TELEGRAM_CHAT_ID=""
if [[ -f "$TOKEN_FILE" ]]; then
    TELEGRAM_TOKEN=$(grep -oP '(?<=TELEGRAM_BOT_TOKEN=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
    TELEGRAM_CHAT_ID=$(grep -oP '(?<=TELEGRAM_CHAT_ID=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
fi

if [[ -n "$TELEGRAM_TOKEN" && -n "$TELEGRAM_CHAT_ID" ]]; then
    # --data-urlencode (not -d): MSG contains '+', '%' and newlines, which
    # must be percent-encoded to survive form decoding intact.
    # --max-time keeps a hung connection from stalling the timer run.
    if curl -sf --max-time 15 -X POST "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
        --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
        --data-urlencode "text=${MSG}" > /dev/null 2>&1; then
        echo "$LEVEL" > "$STATE_FILE"
        log "Telegram alert sent (${LEVEL})"
    else
        # State intentionally NOT updated: the next run retries this transition.
        log "ERROR: Telegram send failed"
    fi
else
    # No way to deliver: keep the alert text in the log and record the state
    # so the same message is not appended again on every run.
    log "No Telegram credentials — alert NOT sent"
    echo "$MSG" >> "$LOG_FILE"
    echo "$LEVEL" > "$STATE_FILE"
fi
|
||||
12
systemd/chaos-validation.service
Normal file
12
systemd/chaos-validation.service
Normal file
@ -0,0 +1,12 @@
|
||||
[Unit]
|
||||
Description=Monthly chaos validation — break a container, verify watchdog restores it
|
||||
After=docker.service docker-health-watchdog.timer
|
||||
Requires=docker.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
User=root
|
||||
Group=root
|
||||
Environment="HERMES_HOME=/root/.hermes"
|
||||
ExecStart=/usr/local/bin/chaos-validation.sh
|
||||
TimeoutStartSec=2700
|
||||
11
systemd/chaos-validation.timer
Normal file
11
systemd/chaos-validation.timer
Normal file
@ -0,0 +1,11 @@
|
||||
[Unit]
|
||||
Description=Run chaos validation monthly (1st at 10:00 UTC, after weekly digest)
|
||||
After=docker.service
|
||||
|
||||
[Timer]
|
||||
OnCalendar=*-*-01 10:00 UTC
|
||||
AccuracySec=1h
|
||||
Persistent=true
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
11
systemd/vm-io-anomaly-check.service
Normal file
11
systemd/vm-io-anomaly-check.service
Normal file
@ -0,0 +1,11 @@
|
||||
[Unit]
Description=Check sustained disk I/O anomaly and alert via Telegram
# After= only orders this unit relative to network-online.target; Wants= is
# what actually pulls the target into the transaction so the ordering has
# something to wait for.
After=docker.service network-online.target
Wants=network-online.target
Requires=docker.service

[Service]
Type=oneshot
User=root
Group=root
# Directory whose .env file the script reads the Telegram credentials from.
Environment="HERMES_HOME=/root/.hermes"
ExecStart=/usr/local/bin/vm-io-anomaly-check.sh
|
||||
12
systemd/vm-io-anomaly-check.timer
Normal file
12
systemd/vm-io-anomaly-check.timer
Normal file
@ -0,0 +1,12 @@
|
||||
[Unit]
Description=Run sustained-I/O anomaly check every 6 hours
After=docker.service

[Timer]
# Monotonic schedule: first run 15 min after boot, then 6 h after each
# activation of the service.
OnBootSec=15min
OnUnitActiveSec=6h
AccuracySec=10min
# Persistent= is deliberately not set: it only has an effect on OnCalendar=
# timers, and OnBootSec= already guarantees a run shortly after every boot.

[Install]
WantedBy=timers.target
|
||||
Loading…
Reference in New Issue
Block a user