OOM watchdog: - vm-oom-watchdog.sh — scans journalctl -k since cursor for oom-kill, killed-process, and "out of memory ... killed" entries; maps cgroup hits back to container names via docker inspect; posts a single Telegram alert per scan window (no dedupe needed — cursor advances on every run). Cursor at /var/log/vm-oom-cursor, log at /var/log/vm-oom-watchdog.log. - Systemd: OnBootSec=10min, OnUnitActiveSec=1h, Persistent=true. Orphan containers (no compose file on disk): - trading-backend → docker update --memory=768m (high-I/O bot) - gitea-npm-registry → docker update --memory=512m - orphan-containers.md captures canonical configs for recovery (env, mounts, networks, restart policy, memory limits). Closes Phase 2.3 (post-monitoring) and Phase 3.3 (orphan limits). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
128 lines
5.1 KiB
Bash
Executable File
128 lines
5.1 KiB
Bash
Executable File
#!/usr/bin/env bash
# =============================================================================
# vm-oom-watchdog.sh — Detect kernel OOM-kill events and alert via Telegram
#
# Runs hourly via systemd timer. Scans the kernel log (journalctl -k, falling
# back to dmesg when journalctl is unavailable) for OOM-killer entries since
# the last successful run (cursor stored in /var/log/vm-oom-cursor). Posts a
# Telegram alert listing the containers/processes killed since the last check.
#
# Originally added to validate Phase 2.3 memory-limit rollout — but kept
# running indefinitely as ongoing OOM detection.
# =============================================================================
# Strict mode: -e aborts on unhandled errors, -u rejects unset variables,
# pipefail fails a pipeline when any stage fails, and -E lets the ERR trap
# below also fire inside functions, subshells and command substitutions.
set -Eeuo pipefail

# Without a trap, an abort under `set -e` is silent: the script just exits
# non-zero and nothing records why. Report the failing line and command on
# stderr and, best-effort, in the log file. `log` is defined further down in
# this script; the trap body is only evaluated when the trap actually fires.
trap 'printf "vm-oom-watchdog: aborted at line %s: %s\n" "$LINENO" "$BASH_COMMAND" >&2; log "ERROR: aborted at line ${LINENO}: ${BASH_COMMAND}" 2>/dev/null || true' ERR

# Fixed paths; readonly so they cannot be repointed by accident later on.
readonly CURSOR_FILE="/var/log/vm-oom-cursor"     # epoch seconds of the last scan
readonly LOG_FILE="/var/log/vm-oom-watchdog.log"  # append-only run log
# dotenv-style file searched for TELEGRAM_BOT_TOKEN / TELEGRAM_CHAT_ID.
# HERMES_HOME overrides the default /root/.hermes directory.
readonly TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"

# log MESSAGE...
# Append one line "[<UTC ISO-8601 timestamp>] MESSAGE" to $LOG_FILE.
# Best-effort: a failed write is ignored and the function still returns 0,
# so logging can never abort the script under `set -e`.
log() {
  local stamp
  stamp=$(date -u '+%Y-%m-%dT%H:%M:%SZ') || true
  printf '[%s] %s\n' "$stamp" "$*" >> "$LOG_FILE" 2>/dev/null || true
}

# ── Read cursor (epoch seconds of last scan) ─────────────────────────────────
# NOW is this run's timestamp. LAST is the previous run's timestamp as stored
# in the cursor file, with every non-digit byte stripped before it is used in
# arithmetic; it stays 0 when there is no cursor file.
NOW=$(date +%s)
if [[ -f "$CURSOR_FILE" ]]; then
  LAST=$(tr -dc '0-9' < "$CURSOR_FILE" 2>/dev/null || echo 0)
else
  LAST=0
fi

# On first run: only look back 1 hour to avoid flooding from historical noise.
if (( LAST == 0 )); then
  LAST=$(( NOW - 3600 ))
  log "First run — looking back 1 hour"
fi

# Width of the scan window in seconds. A window shorter than a minute means
# the previous scan only just happened, so this run is skipped.
LOOKBACK_SECS=$(( NOW - LAST ))
(( LOOKBACK_SECS >= 60 )) || {
  log "Cursor too recent (${LOOKBACK_SECS}s); skipping"
  exit 0
}

# ── Pull kernel log since cursor ─────────────────────────────────────────────
# dmesg timestamps are relative to boot, so journalctl -k is preferred: it can
# filter on wall-clock time. The window bounds are passed as "@<epoch>", which
# is unambiguous. A bare "YYYY-MM-DD HH:MM:SS" string is read by journalctl in
# the machine's local timezone, so passing a UTC-formatted string shifted the
# window by the UTC offset on any host not running in UTC. --until pins the
# upper bound to NOW (the value later written to the cursor), so consecutive
# scan windows do not overlap.
# Failures are deliberately swallowed: an unreadable journal is treated the
# same as an empty one.
if command -v journalctl >/dev/null 2>&1; then
  KERNEL_LOG=$(journalctl -k --since "@${LAST}" --until "@${NOW}" --no-pager 2>/dev/null || true)
else
  # Fallback without journalctl: the whole kernel ring buffer is scanned with
  # no time filter, so the same event can be reported again on later runs for
  # as long as it remains in the buffer.
  KERNEL_LOG=$(dmesg -T 2>/dev/null || dmesg 2>/dev/null || true)
fi

# ── Parse OOM events ─────────────────────────────────────────────────────────
# Kernel lines kept as OOM evidence (matched case-insensitively):
#   "Killed process 1234 (procname) total-vm:..."
#   "Memory cgroup out of memory: Killed process 1234 (procname)"
#   "oom-kill:constraint=..."
# A grep miss is not an error here; it simply leaves OOM_LINES empty.
OOM_LINES=$(grep -iE 'oom-kill|killed process|out of memory.*killed' <<< "$KERNEL_LOG" 2>/dev/null || true)

# Quiet window: record the scan time as the new cursor, note it in the log
# and stop before any alerting work.
case "$OOM_LINES" in
  "")
    printf '%s\n' "$NOW" > "$CURSOR_FILE"
    log "No OOM events in last ${LOOKBACK_SECS}s"
    exit 0
    ;;
esac

# ── Summarise kills ──────────────────────────────────────────────────────────

# count_oom_kills: read kernel-log lines on stdin and print how many of them
# contain "killed process" (case-insensitive). This counts log lines, not
# distinct processes.
# Always prints exactly one integer and returns 0. grep -c already prints "0"
# when nothing matches but exits 1; echoing a fallback "0" on top of that
# produced the two-line value "0\n0", so only the exit status is neutralised.
count_oom_kills() {
  grep -ciE 'killed process' || true
}

# list_oom_victims: read kernel-log lines on stdin and print up to 10 distinct
# "Killed process <pid> (<name>)" hits, sorted, reduced to " (<name>)".
# Prints nothing and still returns 0 when no line matches (for example when
# only an "oom-kill:" summary line was captured); under set -e + pipefail a
# failing grep stage would otherwise abort the whole script before any alert
# is sent.
list_oom_victims() {
  grep -oE 'Killed process [0-9]+ \([^)]+\)' \
    | sort -u \
    | head -n 10 \
    | sed 's/Killed process [0-9]* / /' || true
}

KILLED_COUNT=$(count_oom_kills <<< "$OOM_LINES")
PROCESSES=$(list_oom_victims <<< "$OOM_LINES")

log "Detected ${KILLED_COUNT} OOM kill(s) in last ${LOOKBACK_SECS}s"

# ── Map killed processes back to containers when possible ────────────────────

# docker_ids_from_oom_lines: read kernel OOM lines on stdin and print the
# distinct Docker container IDs they reference, one per line. Recognised forms:
#   "... limit of /docker/<id>"                   (format the original pattern targeted)
#   "oom_memcg=/docker/<id>,task_memcg=/docker/<id>"  (fields of the "oom-kill:" summary line)
#   "..._memcg=/<slice>/docker-<id>.scope"        (cgroup path style of the systemd cgroup driver)
# IDs must be 12-64 hex characters. Prints nothing and returns 0 when no ID is
# found: under set -e + pipefail a non-matching grep stage would otherwise
# abort the script before the alert is sent.
docker_ids_from_oom_lines() {
  grep -oE '(limit of |_memcg=)[^,[:space:]]*docker[/-][a-f0-9]{12,64}' \
    | sed -E 's|^.*docker[/-]||' \
    | sort -u || true
}

# CONTAINER_HITS collects one " • <name>" line per container that docker can
# still resolve; it stays empty when docker is missing or nothing resolves.
CONTAINER_HITS=""
if command -v docker >/dev/null 2>&1; then
  CGROUPS=$(docker_ids_from_oom_lines <<< "$OOM_LINES")
  while IFS= read -r cid; do
    # An empty ID list still yields one empty line from the here-string.
    [[ -n "$cid" ]] || continue
    # Lookup failures (e.g. container already removed) just produce no name.
    name=$(docker inspect --format '{{.Name}}' "$cid" 2>/dev/null \
      | sed 's|^/||' || true)
    if [[ -n "$name" ]]; then
      CONTAINER_HITS+=" • ${name}"$'\n'
    fi
  done <<< "$CGROUPS"
fi

# ── Build Telegram message ───────────────────────────────────────────────────
# Layout: headline with the host name, window/kill summary, a blank line, the
# process list (or a placeholder when no process line could be parsed), an
# optional container section, and a closing review hint.
HOST=$(hostname)
printf -v MSG '%s\n%s\n\n%s\n%s' \
  "🚨 OOM Kill Detected — ${HOST}" \
  "Last ${LOOKBACK_SECS}s: ${KILLED_COUNT} kill(s)" \
  "Processes:" \
  "${PROCESSES:- (unable to parse)}"

# The container section appears only when at least one name was resolved; the
# trailing newline of the collected list is dropped.
if [[ -n "$CONTAINER_HITS" ]]; then
  MSG="${MSG}"$'\n\n'"Containers:"$'\n'"${CONTAINER_HITS%$'\n'}"
fi

MSG="${MSG}"$'\n\n''Review: journalctl -k --since "1 hour ago" | grep -i oom'

# ── Send Telegram ────────────────────────────────────────────────────────────

# env_value KEY FILE: print the value assigned to KEY in a dotenv-style FILE.
#   - only real assignments count: KEY= at the start of a line, optionally
#     preceded by whitespace and "export"; commented-out lines and longer
#     variable names that merely end in KEY are ignored
#   - when KEY is assigned more than once, the last assignment wins
#   - the value ends at the first whitespace; a trailing CR and one pair of
#     surrounding single or double quotes are removed
# Prints nothing and returns 0 when FILE is unreadable or KEY is absent.
env_value() {
  local key=$1 file=$2
  awk -v key="$key" -v sq="'" '
    {
      line = $0
      sub(/\r$/, "", line)
      sub(/^[ \t]*(export[ \t]+)?/, "", line)
      if (index(line, key "=") != 1) next
      val = substr(line, length(key) + 2)
      sub(/[ \t].*$/, "", val)
      first = substr(val, 1, 1)
      last = substr(val, length(val), 1)
      if (length(val) >= 2 && first == last && (first == "\"" || first == sq))
        val = substr(val, 2, length(val) - 2)
      found = val
    }
    END { if (found != "") print found }
  ' "$file" 2>/dev/null || true
}

TELEGRAM_TOKEN=""
TELEGRAM_CHAT_ID=""
if [[ -f "$TOKEN_FILE" ]]; then
  TELEGRAM_TOKEN=$(env_value TELEGRAM_BOT_TOKEN "$TOKEN_FILE")
  TELEGRAM_CHAT_ID=$(env_value TELEGRAM_CHAT_ID "$TOKEN_FILE")
fi

if [[ -n "$TELEGRAM_TOKEN" && -n "$TELEGRAM_CHAT_ID" ]]; then
  # --data-urlencode keeps newlines and characters such as & + % in the
  # message (process names are arbitrary text) from corrupting the form body.
  # The timeouts stop a stalled connection from hanging the run indefinitely.
  # NOTE(review): the bot token is part of the URL and therefore visible in
  # the process list while curl runs.
  if curl -sf --connect-timeout 10 --max-time 30 -X POST \
      "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
      --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
      --data-urlencode "text=${MSG}" > /dev/null 2>&1; then
    log "Telegram alert sent"
  else
    log "ERROR: Telegram send failed"
    # The cursor advances below regardless, so this window is not rescanned:
    # keep the alert text in the log so the event is not lost.
    echo "$MSG" >> "$LOG_FILE"
  fi
else
  log "No Telegram credentials — alert NOT sent"
  echo "$MSG" >> "$LOG_FILE"
fi

# ── Advance cursor ───────────────────────────────────────────────────────────
# The next run scans from this run's start time onwards.
echo "$NOW" > "$CURSOR_FILE"