vm-health-check.sh: - check_gpu(): nvidia-smi probe; "CPU-only" OK on this VM (no GPU) - check_image_freshness(): flag containers running images >30d old. Skips third-party images (gitea, grafana, prom, mcr.microsoft, axllent, caddy, traefik, valkey, cadvisor) — they have their own rebuild cadence. Currently flags 19 stale product images (~60d old). chaos-validation.sh: - Monthly chaos test: kill PID 1 in chronomind-web, wait up to 35 min for docker-health-watchdog to detect + restart. Telegram pass/fail. - Refuses to run if target not healthy. systemd timer fires 1st of month at 10:00 UTC (after 08:00 weekly digest). vm-io-anomaly-check.sh: - 6h avg sda write rate; transition alerts at WARN (1 GB/hr) / CRIT (2.5 GB/hr). De-dupes via /var/log/vm-io-anomaly-state so the alert fires once per transition, not every 6h. Current baseline: ~1.94 GB/hr (orphan-container state-file writes; see Phase 0.3). - Reports recovery to OK when rate drops back. vm/page.tsx: gpu + image_freshness added to CHECK_META so they render with proper icon/label and slot into CHECK_ORDER. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
127 lines
5.1 KiB
Bash
Executable File
127 lines
5.1 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# =============================================================================
|
|
# vm-io-anomaly-check.sh — Sustained disk-write I/O anomaly alert
|
|
#
|
|
# Queries Prometheus for the average sda write rate over the past 6 hours.
|
|
# Alerts via Telegram when the rate crosses the WARN/CRIT thresholds (once per
# level transition) and lists the top-3 containers by CPU from `docker stats`
# as a proxy for the heaviest writers (per-container blkio is not available).
|
|
#
|
|
# Phase 0.3 identified invttrdg-backend + trading-backend as the steady-state
|
|
# write source (~6 GB/day). This script catches new spikes above that baseline.
|
|
#
|
|
# Runs every 6 hours via systemd timer.
|
|
# =============================================================================
|
|
# Strict mode: abort on unhandled errors (-e), unset variables (-u) and failed
# pipeline stages (pipefail); -E lets ERR traps propagate into functions.
set -Eeuo pipefail

# Container used as the vantage point for Prometheus queries (`docker exec`).
readonly BACKEND_CONTAINER="devops-backend"
# Base URL of the Prometheus HTTP API, as reachable from inside that container.
readonly PROM="http://learning_ai_common_plat-prometheus-1:9090"

# Alert thresholds in GB/hr for the 6h average; overridable via environment.
readonly WARN_GB_PER_HR="${IO_WARN_GB_PER_HR:-1.0}"   # baseline ~0.3 GB/hr; alert if 3x sustained
readonly CRIT_GB_PER_HR="${IO_CRIT_GB_PER_HR:-2.5}"

# Append-only run log written by log().
readonly LOG_FILE="/var/log/vm-io-anomaly.log"
# Holds the level (OK/WARN/CRIT) from the previous run, used for de-duplication.
readonly STATE_FILE="/var/log/vm-io-anomaly-state"
# dotenv-style file that provides TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID.
readonly TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"
|
|
|
|
#######################################
# Write a UTC-timestamped line to stderr and append it to the run log.
# Globals:   LOG_FILE (read)
# Arguments: $* - message text
# Outputs:   "[YYYY-MM-DDTHH:MM:SSZ] message" on stderr and in $LOG_FILE
# Returns:   always 0
#######################################
log() {
  local line
  line="[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*"
  # Logging is best-effort: when $LOG_FILE cannot be written, tee still copies
  # the line to stderr but exits non-zero; under `set -e` + pipefail that would
  # otherwise abort the whole check before any alert is sent.
  printf '%s\n' "$line" | tee -a "$LOG_FILE" >&2 || true
}
|
|
|
|
#######################################
# Run an instant PromQL query and print the value of its first sample.
# The HTTP request is issued with curl from inside $BACKEND_CONTAINER.
# Globals:   BACKEND_CONTAINER (read), PROM (read)
# Arguments: $1 - PromQL expression (URL-encoded here before sending)
# Outputs:   exactly one line on stdout: the value rounded to 3 decimals,
#            or "?" when the query fails, returns no series, cannot be
#            parsed, or yields a non-finite value (NaN/Inf)
# Returns:   always 0; failure is signalled by the "?" sentinel
#######################################
prom_query() {
  local query="$1"
  local encoded body value

  encoded=$(python3 -c 'import sys, urllib.parse; print(urllib.parse.quote(sys.argv[1]))' \
    "$query" 2>/dev/null) || encoded=""
  if [[ -z "$encoded" ]]; then
    echo "?"
    return 0
  fi

  # Fetch first, parse second. Piping curl straight into the parser printed
  # "?" twice under pipefail (once from the parser on empty input, once from
  # the `||` fallback), which callers comparing against "?" did not recognise.
  if ! body=$(docker exec "$BACKEND_CONTAINER" \
    curl -sf --max-time 10 "${PROM}/api/v1/query?query=${encoded}" 2>/dev/null); then
    echo "?"
    return 0
  fi

  value=$(python3 -c '
import json, math, sys
try:
    result = json.load(sys.stdin)["data"]["result"]
    sample = float(result[0]["value"][1]) if result else None
    if sample is None or not math.isfinite(sample):
        print("?")
    else:
        print(round(sample, 3))
except Exception:
    print("?")
' <<<"$body" 2>/dev/null) || value="?"

  printf '%s\n' "${value:-?}"
}
|
|
|
|
# ── Pre-flight ───────────────────────────────────────────────────────────────
# Prometheus is queried from inside $BACKEND_CONTAINER, so it must be running.
# The name list is captured first and matched as a fixed, whole-line string:
# piping `docker ps` into `grep -q` can fail spuriously under pipefail when
# grep exits early, and the container name should not be treated as a regex.
running_containers=$(docker ps --format '{{.Names}}' 2>/dev/null) || running_containers=""
if ! grep -Fxq -- "$BACKEND_CONTAINER" <<<"$running_containers"; then
  log "ERROR: ${BACKEND_CONTAINER} not running — skipping"
  # Exit 0 on purpose: a missing vantage container is logged, not treated as a failed run.
  exit 0
fi
|
|
|
|
# 6-hour avg sda write rate in GB/hr
# The query takes the 5m write rate (bytes/s), scales it by 3600 / 2^30 to get
# GB per hour, and averages that over a 6h window sampled every 5 minutes.
AVG_6H=$(prom_query 'avg_over_time((rate(node_disk_written_bytes_total{device="sda"}[5m]) * 3600 / 1073741824)[6h:5m])') || AVG_6H="?"

# Accept only a plain decimal number. Anything else (the "?" sentinel, an empty
# or multi-line result, "nan") would be coerced to 0 by awk below and silently
# reported as OK, so it is treated as a failed query instead.
numeric_re='^-?[0-9]+(\.[0-9]+)?([eE][-+]?[0-9]+)?$'
if [[ ! "$AVG_6H" =~ $numeric_re ]]; then
  log "Could not query Prometheus — skipping"
  exit 0
fi

# Numeric comparison via awk (handles fractions)
LEVEL=$(awk -v v="$AVG_6H" -v w="$WARN_GB_PER_HR" -v c="$CRIT_GB_PER_HR" \
  'BEGIN{ if (v+0 >= c+0) print "CRIT"; else if (v+0 >= w+0) print "WARN"; else print "OK" }')

log "6h avg write rate = ${AVG_6H} GB/hr → ${LEVEL}"

# ── Compute daily projection ─────────────────────────────────────────────────
# Extrapolates the hourly average to 24 hours, one decimal place.
PROJ_DAY=$(awk -v v="$AVG_6H" 'BEGIN{ printf "%.1f", v*24 }')
|
|
|
|
# ── Identify top writers (cAdvisor doesn't expose per-container blkio in this
#    setup, but we can at least show top RAM/CPU consumers as a proxy) ────────

#######################################
# Print the three containers with the highest CPU usage.
# Outputs:   up to three lines " <name> — <cpu%> CPU, <mem used>" on stdout;
#            nothing when `docker stats` fails or reports no containers
# Returns:   0 when docker stats is unavailable/empty, else pipeline status
#######################################
top_cpu_containers() {
  local stats
  # Context only: a docker failure must not abort the script before alerting.
  stats=$(docker stats --no-stream --format '{{.Name}} {{.CPUPerc}} {{.MemUsage}}' 2>/dev/null) \
    || return 0
  [[ -n "$stats" ]] || return 0

  # Sort on the CPU column only (descending, numeric). awk limits the output to
  # three lines itself so no upstream stage is cut off by an early-closing
  # reader, which would fail the pipeline under pipefail.
  printf '%s\n' "$stats" \
    | LC_ALL=C sort -k2,2rn \
    | awk 'NR <= 3 {printf " %s — %s CPU, %s\n", $1, $2, $3}'
}

TOP_PROCS=""
if [[ "$LEVEL" != "OK" ]]; then
  TOP_PROCS=$(top_cpu_containers) || TOP_PROCS=""
fi
|
|
|
|
# ── Deduplicate: only alert once per LEVEL transition ────────────────────────
PREV_LEVEL=""
if [[ -f "$STATE_FILE" ]]; then
  PREV_LEVEL=$(tr -d '[:space:]' < "$STATE_FILE" 2>/dev/null || echo "")
fi

# Record the current level as handled. This is called only once a run has
# nothing left to deliver: if the level were saved before a Telegram send that
# then fails, the next run would see "unchanged" and the alert would be lost.
persist_level() {
  printf '%s\n' "$LEVEL" > "$STATE_FILE"
}

if [[ "$LEVEL" == "$PREV_LEVEL" ]]; then
  persist_level
  log "Level unchanged (${LEVEL}); no alert"
  exit 0
fi

# Only notify on transitions INTO a non-OK level, or recovery to OK
if [[ "$LEVEL" != "OK" ]]; then
  if [[ "$LEVEL" == "CRIT" ]]; then
    ICON="🚨"
  else
    ICON="⚠️"
  fi
  MSG="${ICON} I/O anomaly ${LEVEL} — $(hostname)
sda 6h avg = ${AVG_6H} GB/hr (~${PROJ_DAY} GB/day)
Threshold: WARN ${WARN_GB_PER_HR} / CRIT ${CRIT_GB_PER_HR}

Top containers (CPU proxy — cAdvisor blkio not available):
${TOP_PROCS:- (none)}

Phase 0.3 baseline: invttrdg-backend (~5 GB/day) + trading-backend (~1 GB/day).
Investigate further: docker stats; iotop -ao -n 5"
elif [[ -n "$PREV_LEVEL" ]]; then
  # LEVEL is OK and differs from PREV_LEVEL, so the previous level was non-OK.
  MSG="✅ I/O anomaly cleared — $(hostname)
sda 6h avg now ${AVG_6H} GB/hr (was ${PREV_LEVEL})"
else
  # No previous state and the level is OK: nothing to report.
  persist_level
  log "No transition needing alert"
  exit 0
fi

# ── Send Telegram ────────────────────────────────────────────────────────────
TELEGRAM_TOKEN=""
TELEGRAM_CHAT_ID=""
if [[ -f "$TOKEN_FILE" ]]; then
  # `|| true`: a missing key leaves the variable empty instead of tripping set -e.
  TELEGRAM_TOKEN=$(grep -oP '(?<=TELEGRAM_BOT_TOKEN=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
  TELEGRAM_CHAT_ID=$(grep -oP '(?<=TELEGRAM_CHAT_ID=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
fi

if [[ -n "$TELEGRAM_TOKEN" && -n "$TELEGRAM_CHAT_ID" ]]; then
  # --data-urlencode: plain -d sends the text unencoded, so "+", "%" and "&"
  # in the message (e.g. "12.34% CPU") would be mangled by form decoding.
  # --max-time keeps a stalled connection from hanging the run.
  if curl -sf --max-time 20 -X POST "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${MSG}" > /dev/null 2>&1; then
    persist_level
    log "Telegram alert sent (${LEVEL})"
  else
    # State is intentionally left untouched so the next run retries the alert.
    log "ERROR: Telegram send failed"
  fi
else
  log "No Telegram credentials — alert NOT sent"
  # The alert text is kept in the log file as the only record of it.
  printf '%s\n' "$MSG" >> "$LOG_FILE"
  persist_level
fi
|