bytelyst-devops-tools/scripts/VMs/HostingerVM/vm-oom-watchdog.sh
Hermes VM 76ef17f26b feat(vm): Phase 2.3 closure — OOM watchdog + orphan-container docs
OOM watchdog:
- vm-oom-watchdog.sh — scans journalctl -k since cursor for oom-kill,
  killed-process, and "out of memory ... killed" entries; maps cgroup
  hits back to container names via docker inspect; posts a single
  Telegram alert per scan window (no dedupe needed — cursor advances
  on every run). Cursor at /var/log/vm-oom-cursor, log at
  /var/log/vm-oom-watchdog.log.
- Systemd: OnBootSec=10min, OnUnitActiveSec=1h, Persistent=true.

Orphan containers (no compose file on disk):
- trading-backend → docker update --memory=768m (high-I/O bot)
- gitea-npm-registry → docker update --memory=512m
- orphan-containers.md captures canonical configs for recovery
  (env, mounts, networks, restart policy, memory limits).

Closes Phase 2.3 (post-monitoring) and Phase 3.3 (orphan limits).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-30 05:26:49 +00:00

128 lines
5.1 KiB
Bash
Executable File

#!/usr/bin/env bash
# =============================================================================
# vm-oom-watchdog.sh — Detect kernel OOM-kill events and alert via Telegram
#
# Runs hourly via systemd timer. Scans the kernel log (journalctl -k, falling
# back to dmesg when journalctl is unavailable) for OOM-killer entries newer
# than the cursor stored in /var/log/vm-oom-cursor. Posts a Telegram alert
# listing the processes/containers killed since that cursor.
#
# Originally added to validate Phase 2.3 memory-limit rollout — but kept
# running indefinitely as ongoing OOM detection.
# =============================================================================
# Strict mode, spelled out: ERR traps are inherited by functions (errtrace),
# unhandled failures abort the run (errexit), unset variables are errors
# (nounset), and a pipeline fails if any of its stages fails (pipefail).
set -o errtrace -o errexit -o nounset -o pipefail

# Fixed state/log locations, plus the dotenv file the Telegram credentials are
# read from (under $HERMES_HOME, defaulting to /root/.hermes).
LOG_FILE="/var/log/vm-oom-watchdog.log"
CURSOR_FILE="/var/log/vm-oom-cursor"
TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"
# log MESSAGE... — append one UTC-timestamped line to $LOG_FILE.
# Best-effort: a failed write is ignored and the function always returns 0.
log() {
  printf '[%s] %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$*" \
    >> "$LOG_FILE" 2>/dev/null || true
}
# ── Read cursor (epoch seconds of last scan) ─────────────────────────────────
# read_cursor_epoch FILE — print the epoch stored in FILE as a base-10 integer.
# Prints 0 when FILE is missing, unreadable, or holds no digits. Leading zeros
# are stripped so a padded value is never parsed as octal by (( )).
read_cursor_epoch() {
  local file=$1 digits=""
  if [[ -f "$file" ]]; then
    digits=$(tr -dc '0-9' < "$file" 2>/dev/null || true)
  fi
  # Remove the run of leading zeros ("0017" -> "17", "000" -> "").
  digits=${digits#"${digits%%[!0]*}"}
  printf '%s\n' "${digits:-0}"
}

NOW=$(date +%s)
LAST=$(read_cursor_epoch "$CURSOR_FILE")

if (( LAST == 0 )); then
  # On first run: only look back 1 hour to avoid flooding from historical noise.
  LAST=$(( NOW - 3600 ))
  log "First run — looking back 1 hour"
elif (( LAST > NOW + 60 )); then
  # A cursor ahead of the clock (e.g. written before a clock correction) would
  # otherwise make every run skip until wall time catches up with it.
  log "Cursor ${LAST} is ahead of the clock — resetting to a 1 hour lookback"
  LAST=$(( NOW - 3600 ))
fi

# Guard against back-to-back invocations: nothing useful fits in under a minute.
LOOKBACK_SECS=$(( NOW - LAST ))
if (( LOOKBACK_SECS < 60 )); then
  log "Cursor too recent (${LOOKBACK_SECS}s); skipping"
  exit 0
fi
# ── Pull kernel log since cursor ─────────────────────────────────────────────
# journalctl -k is preferred because it can filter by wall-clock time, whereas
# the dmesg ring buffer cannot be cut at the cursor here.
# The cursor is passed as "@<epoch>" so the cutoff is timezone-independent: a
# bare "YYYY-MM-DD HH:MM:SS" string is interpreted by journalctl in the host's
# local zone, which shifts the window on any machine not running in UTC.
# A failing journalctl is tolerated (best-effort) and yields an empty log.
if command -v journalctl >/dev/null 2>&1; then
  KERNEL_LOG=$(journalctl -k --since "@${LAST}" --no-pager 2>/dev/null || true)
else
  # NOTE(review): this fallback is not filtered by the cursor, so OOM entries
  # still present in the ring buffer are reported again on every run.
  KERNEL_LOG=$(dmesg -T 2>/dev/null || dmesg 2>/dev/null || true)
fi
# ── Parse OOM events ─────────────────────────────────────────────────────────
# Keep only the kernel lines that belong to an OOM kill. Shapes matched
# (case-insensitive):
#   "oom-kill:constraint=..."
#   "Killed process 1234 (procname) total-vm:..."
#   "Memory cgroup out of memory: Killed process 1234 (procname)"
# No match is the normal case, so grep's non-zero status is swallowed.
OOM_LINES=$(
  grep -iE 'oom-kill|killed process|out of memory.*killed' \
    <<< "$KERNEL_LOG" 2>/dev/null || true
)

# Quiet window: record the scan time, note it in the log, and stop here.
[[ -n "$OOM_LINES" ]] || {
  echo "$NOW" > "$CURSOR_FILE"
  log "No OOM events in last ${LOOKBACK_SECS}s"
  exit 0
}
# ── Summarise kills ──────────────────────────────────────────────────────────
# count_oom_kills — read OOM log lines on stdin and print how many of them are
# "Killed process" lines (case-insensitive); prints 0 when there are none.
count_oom_kills() {
  local n
  # grep -c prints "0" itself on no match but exits 1. Swallow the status
  # rather than echoing a fallback, which would yield a two-line "0\n0".
  n=$(grep -ciE 'killed process' || true)
  printf '%s\n' "${n:-0}"
}

# list_killed_processes — read OOM log lines on stdin and print up to 10
# " (procname)" entries, one per distinct "Killed process <pid> (<name>)"
# match. Prints nothing (and still succeeds) when no line can be parsed.
list_killed_processes() {
  # `|| true`: a no-match grep must not fail the pipeline under pipefail.
  # sed applies the 10-line cap itself and reads all input, so no upstream
  # stage is cut off early the way `head` could.
  { grep -oE 'Killed process [0-9]+ \([^)]+\)' || true; } \
    | sort -u \
    | sed -n '1,10 s/Killed process [0-9]* / /p'
}

KILLED_COUNT=$(count_oom_kills <<< "$OOM_LINES")
PROCESSES=$(list_killed_processes <<< "$OOM_LINES")
log "Detected ${KILLED_COUNT} OOM kill(s) in last ${LOOKBACK_SECS}s"
# ── Map killed processes back to containers when possible ────────────────────
# oom_container_ids — read kernel OOM log lines on stdin and print the distinct
# Docker container IDs found in their cgroup paths, one per line. Prints
# nothing (and still succeeds) when no Docker cgroup is mentioned.
# Recognised shapes:
#   "oom-kill:...,oom_memcg=/docker/<id>,task_memcg=/docker/<id>,..."
#   "...task_memcg=/system.slice/docker-<id>.scope,..."   (systemd cgroup driver)
#   "... as a result of limit of /docker/<id>"            (older kernels)
oom_container_ids() {
  # `|| true`: most OOM lines carry no Docker cgroup, and a no-match grep must
  # not fail the pipeline (and with it the whole script) under pipefail.
  { grep -oE '(limit of |_memcg=)/[^ ,]*docker[/-][a-f0-9]{12,64}' || true; } \
    | sed -E 's|.*[/-]||' \
    | sort -u
}

CONTAINER_HITS=""
if command -v docker >/dev/null 2>&1; then
  CGROUPS=$(oom_container_ids <<< "$OOM_LINES")
  while IFS= read -r cid; do
    [[ -n "$cid" ]] || continue
    # Containers that no longer exist make `docker inspect` fail; skip them.
    name=$(docker inspect --format '{{.Name}}' "$cid" 2>/dev/null) || name=""
    name=${name#/}   # docker reports names with a leading slash
    if [[ -n "$name" ]]; then
      CONTAINER_HITS+="${name}"$'\n'
    fi
  done <<< "$CGROUPS"
fi
# ── Build Telegram message ───────────────────────────────────────────────────
# Layout: headline with the host name, the scan window and kill count, then the
# parsed process list (or a placeholder), an optional container section, and a
# closing hint for manual follow-up.
HOST=$(hostname)
printf -v MSG '%s\n%s\n%s\n%s' \
  "🚨 OOM Kill Detected — ${HOST}" \
  "Last ${LOOKBACK_SECS}s: ${KILLED_COUNT} kill(s)" \
  "Processes:" \
  "${PROCESSES:- (unable to parse)}"

# Container names are accumulated newline-terminated; drop the final newline.
if [[ -n "$CONTAINER_HITS" ]]; then
  MSG="${MSG}"$'\n\n'"Containers:"$'\n'"${CONTAINER_HITS%$'\n'}"
fi

MSG="${MSG}"$'\n\n''Review: journalctl -k --since "1 hour ago" | grep -i oom'
# ── Send Telegram ────────────────────────────────────────────────────────────
# read_env_value KEY FILE — print the value assigned to KEY in a dotenv-style
# FILE; prints nothing when the file or the key is missing.
# Only real assignments count (optionally prefixed with `export`), so
# commented-out lines and keys that merely end in KEY are ignored. The last
# assignment wins. The value ends at the first whitespace (this also drops a
# trailing CR or inline comment), and one pair of surrounding quotes is removed.
read_env_value() {
  local key=$1 file=$2 line="" value
  line=$(grep -E "^[[:space:]]*(export[[:space:]]+)?${key}=" -- "$file" \
    2>/dev/null | tail -n 1) || true
  value=${line#*=}
  value=${value%%[[:space:]]*}
  case $value in
    \"*\" | \'*\') value=${value:1:${#value}-2} ;;
  esac
  printf '%s' "$value"
}

TELEGRAM_TOKEN=""
TELEGRAM_CHAT_ID=""
if [[ -f "$TOKEN_FILE" ]]; then
  TELEGRAM_TOKEN=$(read_env_value TELEGRAM_BOT_TOKEN "$TOKEN_FILE")
  TELEGRAM_CHAT_ID=$(read_env_value TELEGRAM_CHAT_ID "$TOKEN_FILE")
fi

if [[ -n "$TELEGRAM_TOKEN" && -n "$TELEGRAM_CHAT_ID" ]]; then
  # --data-urlencode: the message contains newlines, quotes and arbitrary
  # process names, which must be form-encoded to arrive intact.
  # Timeouts keep a stalled connection from hanging the timer unit.
  if curl -sf --connect-timeout 10 --max-time 30 \
    -X POST "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${MSG}" > /dev/null 2>&1; then
    log "Telegram alert sent"
  else
    # Leave the cursor where it is so the next run rescans this window and
    # retries the alert instead of silently dropping it.
    log "ERROR: Telegram send failed — cursor not advanced, will retry next run"
    exit 1
  fi
else
  # Without credentials the alert text is preserved in the log file instead.
  log "No Telegram credentials — alert NOT sent"
  echo "$MSG" >> "$LOG_FILE"
fi

# ── Advance cursor ───────────────────────────────────────────────────────────
# Reached only once the alert was delivered or written to the log.
echo "$NOW" > "$CURSOR_FILE"