#!/usr/bin/env bash
# =============================================================================
# vm-oom-watchdog.sh — Detect kernel OOM-kill events and alert via Telegram
#
# Runs hourly via systemd timer. Scans the kernel log (journalctl -k, falling
# back to dmesg) for OOM-killer entries since the last successful run (cursor
# stored in /var/log/vm-oom-cursor). Posts a Telegram alert listing every
# container/process killed since last check.
#
# Originally added to validate Phase 2.3 memory-limit rollout — but kept
# running indefinitely as ongoing OOM detection.
#
# Inputs:
#   ${HERMES_HOME:-/root/.hermes}/.env  TELEGRAM_BOT_TOKEN= / TELEGRAM_CHAT_ID=
# Outputs:
#   /var/log/vm-oom-cursor              epoch seconds of the last completed scan
#   /var/log/vm-oom-watchdog.log        run log; also receives the alert text
#                                       whenever it could not be delivered
# =============================================================================
set -Eeuo pipefail

CURSOR_FILE="/var/log/vm-oom-cursor"
LOG_FILE="/var/log/vm-oom-watchdog.log"
TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"

# ── Helpers ──────────────────────────────────────────────────────────────────

# Append a UTC-timestamped line to LOG_FILE. Best-effort: a logging failure
# must never abort the watchdog.
log() {
  echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*" >> "$LOG_FILE" 2>/dev/null || true
}

# Persist NOW as the new scan cursor. A failure here is deliberately fatal
# (set -e) so the systemd unit is marked failed instead of silently
# re-reporting the same window forever.
advance_cursor() {
  printf '%s\n' "$NOW" > "$CURSOR_FILE"
}

# Print up to 10 distinct "(procname)" entries for the kill records on stdin.
# Prints nothing (and still succeeds) when no record can be parsed, so the
# caller's "(unable to parse)" fallback is reachable under set -e/pipefail.
list_killed_processes() {
  # `|| true`: grep exits 1 on "no match", which pipefail would otherwise turn
  # into a script abort. The sed address range replaces `head -10` so no
  # upstream stage can be killed by SIGPIPE.
  { grep -oE 'Killed process [0-9]+ \([^)]+\)' || true; } \
    | sort -u \
    | sed -n '1,10s/Killed process [0-9]* / /p'
}

# Print the unique Docker container IDs named in cgroup paths on stdin.
# Recognises the legacy "... limit of /docker/<id>" record as well as the
# "oom_memcg=" / "task_memcg=" fields of "oom-kill:" records, with either the
# /docker/<id> or the .../docker-<id>.scope path layout.
extract_container_ids() {
  { grep -oE '(limit of |_memcg=)[^, ]*docker[/-][a-f0-9]+' || true; } \
    | sed -E 's|.*[/-]([a-f0-9]+)$|\1|' \
    | sort -u
}

# Print the value assigned to KEY in a dotenv-style FILE.
# Arguments: $1 - key name, $2 - file path
# Ignores commented-out lines, accepts an optional "export " prefix, strips
# one pair of surrounding quotes and a trailing CR, and lets the last
# assignment win. Prints nothing when the key is absent.
read_env_value() {
  local key=$1 file=$2
  local line value=""
  local assign_re="^[[:space:]]*(export[[:space:]]+)?${key}=(.*)\$"
  local dq_re='^"(.*)"[[:space:]]*$'
  local sq_re="^'(.*)'[[:space:]]*\$"

  while IFS= read -r line || [[ -n "$line" ]]; do
    line=${line%$'\r'}
    [[ "$line" =~ $assign_re ]] || continue
    value=${BASH_REMATCH[2]}
  done < "$file"

  if [[ "$value" =~ $dq_re || "$value" =~ $sq_re ]]; then
    value=${BASH_REMATCH[1]}
  else
    # Unquoted value: stop at the first whitespace (drops inline comments).
    value=${value%%[[:space:]]*}
  fi
  printf '%s' "$value"
}

# ── Read cursor (epoch seconds of last scan) ─────────────────────────────────
NOW=$(date +%s)
LAST=0
if [[ -f "$CURSOR_FILE" ]]; then
  # Keep digits only so a damaged cursor can never reach arithmetic or the
  # journalctl command line as anything but a number.
  LAST=$(tr -dc '0-9' < "$CURSOR_FILE" 2>/dev/null || true)
  # Force base 10: a value with leading zeros would otherwise be read as octal.
  LAST=$(( 10#${LAST:-0} ))
fi

# On first run: only look back 1 hour to avoid flooding from historical noise.
if (( LAST == 0 )); then
  LAST=$(( NOW - 3600 ))
  log "First run — looking back 1 hour"
elif (( LAST > NOW )); then
  # A cursor ahead of the clock (clock step, corrupted file) would make every
  # run skip as "too recent"; fall back to the first-run window instead.
  LAST=$(( NOW - 3600 ))
  log "Cursor is in the future — looking back 1 hour"
fi

LOOKBACK_SECS=$(( NOW - LAST ))
if (( LOOKBACK_SECS < 60 )); then
  log "Cursor too recent (${LOOKBACK_SECS}s); skipping"
  exit 0
fi

# ── Pull kernel log since cursor ─────────────────────────────────────────────
# dmesg timestamps are relative to boot, so journalctl -k is used for
# wall-clock filtering. The cursor is passed as "@<epoch>", which is
# timezone-independent; a formatted UTC string would be parsed by journalctl
# as local time and shift the window on non-UTC hosts.
if command -v journalctl >/dev/null 2>&1; then
  KERNEL_LOG=$(journalctl -k --since "@${LAST}" --no-pager 2>/dev/null || true)
else
  # NOTE: this fallback is not time-filtered — the whole ring buffer is
  # scanned, so an old OOM event is re-reported until it rotates out.
  KERNEL_LOG=$(dmesg -T 2>/dev/null || dmesg 2>/dev/null || true)
fi

# ── Parse OOM events ─────────────────────────────────────────────────────────
# Patterns covered:
#   "Killed process 1234 (procname) total-vm:..."
#   "Memory cgroup out of memory: Killed process 1234 (procname)"
#   "oom-kill:constraint=..."
OOM_LINES=$(grep -iE 'oom-kill|killed process|out of memory.*killed' \
  <<< "$KERNEL_LOG" 2>/dev/null || true)

if [[ -z "$OOM_LINES" ]]; then
  advance_cursor
  log "No OOM events in last ${LOOKBACK_SECS}s"
  exit 0
fi

# Number of kill records (one per killed process). grep -c already prints "0"
# when nothing matches, so only its exit status needs neutralising.
KILLED_COUNT=$(grep -ciE 'killed process' <<< "$OOM_LINES" || true)
PROCESSES=$(list_killed_processes <<< "$OOM_LINES")

log "Detected ${KILLED_COUNT} OOM kill(s) in last ${LOOKBACK_SECS}s"

# ── Map killed processes back to containers when possible ────────────────────
CONTAINER_HITS=""
if command -v docker >/dev/null 2>&1; then
  # Scan the full kernel log: the legacy "limit of /docker/<id>" record is not
  # part of OOM_LINES because it matches none of the patterns above.
  CGROUPS=$(extract_container_ids <<< "$KERNEL_LOG")
  if [[ -n "$CGROUPS" ]]; then
    while IFS= read -r cid; do
      if [[ -z "$cid" ]]; then
        continue
      fi
      # Containers that no longer exist make `docker inspect` fail; they are
      # skipped rather than treated as an error.
      name=$(docker inspect --format '{{.Name}}' "$cid" 2>/dev/null \
        | sed 's|^/||' || true)
      if [[ -n "$name" ]]; then
        CONTAINER_HITS+=" • ${name}"$'\n'
      fi
    done <<< "$CGROUPS"
  fi
fi

# ── Build Telegram message ───────────────────────────────────────────────────
# Line breaks are written as explicit $'\n' so the layout does not depend on
# how this file's own whitespace is preserved.
HOST=$(hostname)
MSG="🚨 OOM Kill Detected — ${HOST}"
MSG+=$'\n\n'"Last ${LOOKBACK_SECS}s: ${KILLED_COUNT} kill(s)"
MSG+=$'\n\n'"Processes:"$'\n'"${PROCESSES:- (unable to parse)}"

if [[ -n "$CONTAINER_HITS" ]]; then
  MSG+=$'\n\nContainers:\n'"${CONTAINER_HITS%$'\n'}"
fi

MSG+=$'\n\nReview: journalctl -k --since "1 hour ago" | grep -i oom'

# ── Send Telegram ────────────────────────────────────────────────────────────
TELEGRAM_TOKEN=""
TELEGRAM_CHAT_ID=""
if [[ -f "$TOKEN_FILE" ]]; then
  TELEGRAM_TOKEN=$(read_env_value TELEGRAM_BOT_TOKEN "$TOKEN_FILE" 2>/dev/null || true)
  TELEGRAM_CHAT_ID=$(read_env_value TELEGRAM_CHAT_ID "$TOKEN_FILE" 2>/dev/null || true)
fi

if [[ -n "$TELEGRAM_TOKEN" && -n "$TELEGRAM_CHAT_ID" ]]; then
  # --data-urlencode: the text contains newlines and may contain '&', '+' or
  # '%' from process names, which plain -d would send unescaped.
  # --max-time: keep an unreachable API from stalling the timer unit.
  if curl -sf --max-time 30 -X POST \
      "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
      --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
      --data-urlencode "text=${MSG}" > /dev/null 2>&1; then
    log "Telegram alert sent"
  else
    log "ERROR: Telegram send failed"
    # The cursor still advances below, so keep the alert text in the log
    # rather than losing the event.
    printf '%s\n' "$MSG" >> "$LOG_FILE" 2>/dev/null || true
  fi
else
  log "No Telegram credentials — alert NOT sent"
  printf '%s\n' "$MSG" >> "$LOG_FILE" 2>/dev/null || true
fi

# ── Advance cursor ───────────────────────────────────────────────────────────
advance_cursor