#!/usr/bin/env bash
# =============================================================================
# vm-io-anomaly-check.sh — Sustained disk-write I/O anomaly alert
#
# Queries Prometheus for the average sda write rate over the past 6 hours and
# classifies it as OK / WARN / CRIT. On a level transition it alerts via
# Telegram, attaching the top-3 containers by CPU (from `docker stats`) as a
# rough proxy, because per-container blkio is not exposed by cAdvisor here.
#
# Phase 0.3 identified invttrdg-backend + trading-backend as the steady-state
# write source (~6 GB/day). This script catches new spikes above that baseline.
#
# Environment:
#   IO_WARN_GB_PER_HR  WARN threshold in GB/hr (default 1.0)
#   IO_CRIT_GB_PER_HR  CRIT threshold in GB/hr (default 2.5)
#   HERMES_HOME        directory whose .env holds TELEGRAM_BOT_TOKEN and
#                      TELEGRAM_CHAT_ID (default /root/.hermes)
#
# Runs every 6 hours via systemd timer.
# =============================================================================
set -Eeuo pipefail

BACKEND_CONTAINER="devops-backend"
PROM="http://learning_ai_common_plat-prometheus-1:9090"
WARN_GB_PER_HR="${IO_WARN_GB_PER_HR:-1.0}" # baseline ~0.3 GB/hr; alert if 3x sustained
CRIT_GB_PER_HR="${IO_CRIT_GB_PER_HR:-2.5}"
LOG_FILE="/var/log/vm-io-anomaly.log"
STATE_FILE="/var/log/vm-io-anomaly-state"
TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"

# 6-hour average sda write rate, converted from bytes/s to GB/hr.
WRITE_RATE_QUERY='avg_over_time((rate(node_disk_written_bytes_total{device="sda"}[5m]) * 3600 / 1073741824)[6h:5m])'

# Writes a UTC-timestamped line to stderr and appends it to LOG_FILE.
# An unwritable log file must not abort the run, hence the `|| true`.
log() {
  printf '[%s] %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$*" \
    | tee -a "$LOG_FILE" >&2 || true
}

# Succeeds when $1 is a plain decimal number (optional sign and exponent).
# Rejects "?", empty strings, multi-line values and nan/inf.
is_number() {
  local re='^-?[0-9]+([.][0-9]+)?([eE][-+]?[0-9]+)?$'
  [[ "$1" =~ $re ]]
}

# Reads a Prometheus instant-query JSON response on stdin and prints the first
# sample value rounded to 3 decimals, or "?" when there is no usable value.
parse_prom_value() {
  python3 -c '
import json, sys
try:
    result = json.load(sys.stdin)["data"]["result"]
    print(round(float(result[0]["value"][1]), 3) if result else "?")
except Exception:
    print("?")
' 2>/dev/null || printf '?\n'
}

# Runs an instant query against Prometheus through the backend container.
# Arguments: $1 - PromQL expression
# Outputs:   exactly one line: the numeric value, or "?" on any failure
# Returns:   always 0
prom_query() {
  local query="$1" response
  # Capture the response before parsing: piping curl straight into the parser
  # under pipefail let a failed curl emit "?" twice (parser + fallback).
  if ! response=$(docker exec "$BACKEND_CONTAINER" \
    curl -sf --max-time 10 -G --data-urlencode "query=${query}" \
    "${PROM}/api/v1/query" 2>/dev/null); then
    printf '?\n'
    return 0
  fi
  parse_prom_value <<<"$response"
}

# Prints OK, WARN or CRIT for a rate against the two thresholds.
# Arguments: $1 - rate, $2 - WARN threshold, $3 - CRIT threshold
# awk does the comparison because the values are fractional.
classify_level() {
  awk -v v="$1" -v w="$2" -v c="$3" 'BEGIN {
    if (v + 0 >= c + 0) print "CRIT"
    else if (v + 0 >= w + 0) print "WARN"
    else print "OK"
  }'
}

# Prints up to three containers with the highest CPU usage, one per line.
# awk consumes the whole stream (instead of `head`) so that no upstream stage
# is killed by SIGPIPE, which would fail the pipeline under pipefail.
top_containers() {
  docker stats --no-stream --format '{{.Name}} {{.CPUPerc}} {{.MemUsage}}' 2>/dev/null \
    | LC_ALL=C sort -k2,2nr \
    | awk 'NR <= 3 { printf " %s — %s CPU, %s\n", $1, $2, $3 }'
}

# Prints the value of KEY from a dotenv-style file (empty if absent).
# Arguments: $1 - key name, $2 - file path
# Only real assignments at the start of a line count (optionally prefixed by
# `export`); commented-out lines and keys that merely end in KEY are ignored.
# Surrounding quotes are dropped and the last assignment wins.
read_env_value() {
  local key="$1" file="$2" line value=""
  local re="^[[:space:]]*(export[[:space:]]+)?${key}=[\"']?([^[:space:]\"']+)"
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" =~ $re ]]; then
      value="${BASH_REMATCH[2]}"
    fi
  done < "$file"
  printf '%s' "$value"
}

# Prints the alert text for a level transition.
# Arguments: $1 - new level, $2 - previous level (may be empty),
#            $3 - 6h average in GB/hr, $4 - projected GB/day,
#            $5 - pre-formatted top-container lines (may be empty)
# Returns:   1, printing nothing, when the transition needs no alert
build_alert_message() {
  local level="$1" prev="$2" avg="$3" proj="$4" top="$5"
  local host icon
  host=$(hostname 2>/dev/null || printf '%s' "${HOSTNAME:-unknown}")

  if [[ "$level" == "OK" ]]; then
    # Recovery is announced only when a previous non-OK level is on record.
    [[ -n "$prev" && "$prev" != "OK" ]] || return 1
    printf '✅ I/O anomaly cleared — %s\n' "$host"
    printf 'sda 6h avg now %s GB/hr (was %s)\n' "$avg" "$prev"
    return 0
  fi

  if [[ "$level" == "CRIT" ]]; then
    icon="🚨"
  else
    icon="⚠️"
  fi
  printf '%s I/O anomaly %s — %s\n' "$icon" "$level" "$host"
  printf 'sda 6h avg = %s GB/hr (~%s GB/day)\n' "$avg" "$proj"
  printf 'Threshold: WARN %s / CRIT %s\n' "$WARN_GB_PER_HR" "$CRIT_GB_PER_HR"
  printf 'Top containers (CPU proxy — cAdvisor blkio not available):\n'
  printf '%s\n' "${top:- (none)}"
  printf 'Phase 0.3 baseline: invttrdg-backend (~5 GB/day) + trading-backend (~1 GB/day).\n'
  printf 'Investigate further: docker stats; iotop -ao -n 5\n'
}

# Posts a message through the Telegram Bot API.
# Arguments: $1 - bot token, $2 - chat id, $3 - message text
# Returns:   curl's exit status
# Fields are URL-encoded so "+", "%", "&" and newlines in the text survive.
# The URL goes in via --config on stdin to keep the token out of argv.
send_telegram() {
  local token="$1" chat_id="$2" text="$3"
  curl -sf --max-time 20 \
    --data-urlencode "chat_id=${chat_id}" \
    --data-urlencode "text=${text}" \
    --config - > /dev/null 2>&1 \
    <<<"url = \"https://api.telegram.org/bot${token}/sendMessage\""
}

# Records the level that has been handled, for de-duplication on the next run.
save_level() {
  printf '%s\n' "$1" > "$STATE_FILE" || log "ERROR: could not write ${STATE_FILE}"
}

main() {
  local names avg level proj prev msg top token="" chat_id=""

  # ── Pre-flight ─────────────────────────────────────────────────────────────
  # Capture first, then match: `docker ps | grep -q` can fail under pipefail
  # when grep exits early. -Fx matches the name literally and in full.
  names=$(docker ps --format '{{.Names}}' 2>/dev/null) || names=""
  if ! grep -Fxq -- "$BACKEND_CONTAINER" <<<"$names"; then
    log "ERROR: ${BACKEND_CONTAINER} not running — skipping"
    return 0
  fi

  # ── Measure and classify ───────────────────────────────────────────────────
  avg=$(prom_query "$WRITE_RATE_QUERY")
  if ! is_number "$avg"; then
    log "Could not query Prometheus — skipping"
    return 0
  fi
  level=$(classify_level "$avg" "$WARN_GB_PER_HR" "$CRIT_GB_PER_HR")
  log "6h avg write rate = ${avg} GB/hr → ${level}"
  proj=$(awk -v v="$avg" 'BEGIN { printf "%.1f", v * 24 }')

  # ── Deduplicate: only alert once per LEVEL transition ──────────────────────
  prev=""
  if [[ -f "$STATE_FILE" ]]; then
    prev=$(tr -d '[:space:]' < "$STATE_FILE" 2>/dev/null || true)
  fi
  if [[ "$level" == "$prev" ]]; then
    log "Level unchanged (${level}); no alert"
    return 0
  fi

  # ── Build the message ──────────────────────────────────────────────────────
  top=""
  if [[ "$level" != "OK" ]]; then
    # Context only; a failure here must not prevent the alert.
    top=$(top_containers || true)
  fi
  if ! msg=$(build_alert_message "$level" "$prev" "$avg" "$proj" "$top"); then
    save_level "$level"
    log "No transition needing alert"
    return 0
  fi

  # ── Send Telegram ──────────────────────────────────────────────────────────
  if [[ -f "$TOKEN_FILE" ]]; then
    token=$(read_env_value TELEGRAM_BOT_TOKEN "$TOKEN_FILE" || true)
    chat_id=$(read_env_value TELEGRAM_CHAT_ID "$TOKEN_FILE" || true)
  fi

  if [[ -z "$token" || -z "$chat_id" ]]; then
    log "No Telegram credentials — alert NOT sent"
    printf '%s\n' "$msg" >> "$LOG_FILE" || true
    save_level "$level"
    return 0
  fi

  if send_telegram "$token" "$chat_id" "$msg"; then
    # Persist only after delivery so a failed send is retried next run.
    save_level "$level"
    log "Telegram alert sent (${level})"
  else
    log "ERROR: Telegram send failed — state kept at '${prev:-none}' so the next run retries"
  fi
}

main "$@"