diff --git a/scripts/VMs/HostingerVM/orphan-containers.md b/scripts/VMs/HostingerVM/orphan-containers.md new file mode 100644 index 0000000..ac3e48b --- /dev/null +++ b/scripts/VMs/HostingerVM/orphan-containers.md @@ -0,0 +1,95 @@ +# Orphan Containers — Canonical Configuration + +Two containers on this VM run without an on-disk compose file: + +| Container | Image | Memory Limit | Restart Policy | Source | +|---|---|---|---|---| +| `trading-backend` | `bytelyst-trading-bot-service-trading-backend` | **768 MiB** | always | compose file deleted; image present | +| `gitea-npm-registry` | `gitea/gitea:1.22` | **512 MiB** | unless-stopped | started via `docker run` | + +Both have runtime memory limits applied via `docker update`. Limits persist across +restarts of the same container but are lost on `docker rm`. If recreated, use the +recovery procedures below. + +--- + +## trading-backend + +Compose file referenced by container labels: +`/opt/bytelyst/trading/bytelyst-trading-bot-service/docker-compose.yml` — **missing on disk**. + +Only the `logs/` directory remains; the bind mount references it. 
+ +### Recovery procedure +If the container is removed, recreate the compose file with: + +```yaml +services: + trading-backend: + image: bytelyst-trading-bot-service-trading-backend + container_name: trading-backend + user: node + working_dir: /app + entrypoint: ["docker-entrypoint.sh"] + command: ["node", "dist/index.js"] + env_file: + - .env # holds secrets — restore from backup + environment: + ENABLE_TRADING: "true" + PAPER_TRADING: "true" + EXECUTION_PROVIDER: alpaca + ALPACA_SUBTAG_ENV: paper + SYMBOLS: "BTC/USDT,ETH/USDT,SOL/USDT,DOGE/USDT,PEPE/USDT" + TOTAL_CAPITAL: "1000" + EXECUTION_TIMEFRAME: 15m + POLLING_INTERVAL: "60000" + volumes: + - /opt/bytelyst/trading/bytelyst-trading-bot-service/logs:/app/logs + networks: + - default + restart: always + deploy: + resources: + limits: + memory: 768m + +networks: + default: + name: learning_ai_common_plat_default + external: true +``` + +Secrets are not included above; recovering them requires the full env captured via `docker inspect`. +Check the `.env` backup, or recover it from the hermes drive backup. + +--- + +## gitea-npm-registry + +Started via raw `docker run` rather than compose. It is the internal package registry +service used by ByteLyst monorepos for `@bytelyst/*` package distribution. + +### Recovery procedure + +```bash +docker run -d \ + --name gitea-npm-registry \ + --restart unless-stopped \ + --memory 512m --memory-swap 512m \ + -p 3300:3000 \ + -v gitea-data:/data \ + -e GITEA__server__DOMAIN=gitea.bytelyst.com \ + -e GITEA__server__SSH_DOMAIN=gitea.bytelyst.com \ + -e GITEA__server__ROOT_URL=https://gitea.bytelyst.com/ \ + -e GITEA__server__HTTP_PORT=3000 \ + -e GITEA__packages__ENABLED=true \ + -e GITEA__security__INSTALL_LOCK=true \ + -e INSTALL_LOCK=true \ + --network bridge \ + gitea/gitea:1.22 + +docker network connect learning_ai_common_plat_default gitea-npm-registry +``` + +The `gitea-data` named volume contains all repository + package state; never +delete it without a backup. 
diff --git a/scripts/VMs/HostingerVM/vm-oom-watchdog.sh b/scripts/VMs/HostingerVM/vm-oom-watchdog.sh
new file mode 100755
index 0000000..338418c
--- /dev/null
+++ b/scripts/VMs/HostingerVM/vm-oom-watchdog.sh
@@ -0,0 +1,188 @@
+#!/usr/bin/env bash
+# =============================================================================
+# vm-oom-watchdog.sh — Detect kernel OOM-kill events and alert via Telegram
+#
+# Runs hourly via systemd timer. Scans the kernel log for OOM-killer entries
+# since the last successful run (cursor stored in /var/log/vm-oom-cursor) and
+# posts a Telegram alert listing every container/process killed since then.
+#
+# Originally added to validate Phase 2.3 memory-limit rollout — but kept
+# running indefinitely as ongoing OOM detection.
+#
+# Files:
+#   /var/log/vm-oom-cursor        epoch seconds of the last completed scan
+#   /var/log/vm-oom-watchdog.log  run log; also receives undelivered alerts
+#   ${HERMES_HOME}/.env           TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID
+#
+# Exit status:
+#   0  scan completed (no events, alert delivered, or no credentials set)
+#   1  an alert could not be delivered; the cursor is left untouched so the
+#      next run rescans the same window and retries
+# =============================================================================
+set -Eeuo pipefail
+
+readonly CURSOR_FILE="/var/log/vm-oom-cursor"
+readonly LOG_FILE="/var/log/vm-oom-watchdog.log"
+readonly TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"
+
+# Append a UTC-timestamped line to LOG_FILE. Best-effort: a logging failure
+# must never abort the run, so stderr is silenced before the file is opened.
+log() {
+  printf '[%s] %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$*" \
+    2>/dev/null >> "$LOG_FILE" || true
+}
+
+# Append the alert text verbatim to LOG_FILE so an undelivered alert is kept.
+record_alert() {
+  printf '%s\n' "$MSG" 2>/dev/null >> "$LOG_FILE" || true
+}
+
+# Persist the start time of this run as the cursor for the next scan.
+save_cursor() {
+  printf '%s\n' "$NOW" > "$CURSOR_FILE"
+}
+
+# Print the value of KEY from a KEY=value env file.
+#   $1 - variable name, $2 - file path
+# Only real assignments count (optionally prefixed with "export"), so
+# commented-out lines are ignored. The last assignment wins, surrounding quotes
+# are stripped, and the value ends at the first whitespace.
+read_env_value() {
+  local key=$1 file=$2 value
+  value=$(sed -nE "s/^[[:space:]]*(export[[:space:]]+)?${key}=//p" "$file" \
+    2>/dev/null | tail -n 1) || true
+  value=${value#[\"\']}
+  value=${value%%[[:space:]]*}
+  value=${value%[\"\']}
+  printf '%s' "$value"
+}
+
+# ── Read cursor (epoch seconds of last scan) ─────────────────────────────────
+NOW=$(date +%s)
+LAST=0
+if [[ -f "$CURSOR_FILE" ]]; then
+  LAST=$(tr -dc '0-9' 2>/dev/null < "$CURSOR_FILE" || true)
+fi
+# Force base 10: an empty file or a leading zero must not break the arithmetic.
+LAST=$(( 10#${LAST:-0} ))
+# A cursor ahead of the clock would make every later run skip; start over.
+if (( LAST > NOW )); then
+  log "Cursor ${LAST} is in the future; resetting"
+  LAST=0
+fi
+# On first run: only look back 1 hour to avoid flooding from historical noise.
+if (( LAST == 0 )); then
+  LAST=$(( NOW - 3600 ))
+  log "First run — looking back 1 hour"
+fi
+
+LOOKBACK_SECS=$(( NOW - LAST ))
+if (( LOOKBACK_SECS < 60 )); then
+  log "Cursor too recent (${LOOKBACK_SECS}s); skipping"
+  exit 0
+fi
+
+# ── Pull kernel log since cursor ─────────────────────────────────────────────
+# dmesg timestamps are relative to boot, so journalctl -k is preferred for
+# wall-clock filtering. "@<epoch>" is timezone-independent, unlike a formatted
+# date, which journalctl would interpret in the host's local timezone.
+# NOTE: the dmesg fallback cannot filter by time, so it rescans the whole ring
+# buffer and may report the same event again on later runs.
+if command -v journalctl >/dev/null 2>&1; then
+  KERNEL_LOG=$(journalctl -k --since "@${LAST}" --no-pager 2>/dev/null || true)
+else
+  KERNEL_LOG=$(dmesg -T 2>/dev/null || dmesg 2>/dev/null || true)
+fi
+
+# ── Parse OOM events ─────────────────────────────────────────────────────────
+# Patterns covered:
+#   "Killed process 1234 (procname) total-vm:..."
+#   "Memory cgroup out of memory: Killed process 1234 (procname)"
+#   "oom-kill:constraint=...,oom_memcg=/docker/<id>,task_memcg=..."
+#   "Task in /docker/<id> killed as a result of limit of /docker/<id>"
+OOM_LINES=$(grep -iE \
+  'oom-kill|killed process|out of memory.*killed|as a result of limit of' \
+  <<< "$KERNEL_LOG" || true)
+
+if [[ -z "$OOM_LINES" ]]; then
+  save_cursor
+  log "No OOM events in last ${LOOKBACK_SECS}s"
+  exit 0
+fi
+
+# Number of "Killed process" lines. grep -c prints 0 itself when nothing
+# matches (and exits 1), so the fallback must not print a second value.
+KILLED_COUNT=$(grep -ciE 'killed process' <<< "$OOM_LINES" || true)
+# Up to 10 distinct kills (by PID and name). "|| true" keeps set -e/pipefail
+# from aborting the run when no line matches or head closes the pipe early.
+PROCESSES=$(grep -oE 'Killed process [0-9]+ \([^)]+\)' <<< "$OOM_LINES" \
+  | sort -u \
+  | head -10 \
+  | sed 's/Killed process [0-9]* /  /' || true)
+
+log "Detected ${KILLED_COUNT} OOM kill(s) in last ${LOOKBACK_SECS}s"
+
+# ── Map killed processes back to containers when possible ────────────────────
+CONTAINER_HITS=""
+if command -v docker >/dev/null 2>&1; then
+  # Container IDs appear as "/docker/<id>" (cgroupfs driver; both the legacy
+  # "limit of" line and the newer oom_memcg=/task_memcg= fields) or as
+  # "docker-<id>.scope" (systemd cgroup driver).
+  CGROUPS=$(grep -oE '(/docker/|docker-)[a-f0-9]{12,64}' <<< "$OOM_LINES" \
+    | sed -E 's#^(/docker/|docker-)##' \
+    | sort -u || true)
+  while IFS= read -r cid; do
+    [[ -z "$cid" ]] && continue
+    name=$(docker inspect --format '{{.Name}}' "$cid" 2>/dev/null \
+      | sed 's|^/||' || true)
+    if [[ -n "$name" ]]; then
+      CONTAINER_HITS+="  • ${name}"$'\n'
+    fi
+  done <<< "$CGROUPS"
+fi
+
+# ── Build Telegram message ───────────────────────────────────────────────────
+HOST=$(hostname)
+MSG="🚨 OOM Kill Detected — ${HOST}
+Last ${LOOKBACK_SECS}s: ${KILLED_COUNT} kill(s)
+
+Processes:
+${PROCESSES:-  (unable to parse)}"
+
+if [[ -n "$CONTAINER_HITS" ]]; then
+  MSG+=$'\n\nContainers:\n'"${CONTAINER_HITS%$'\n'}"
+fi
+
+MSG+=$'\n\nReview: journalctl -k --since "1 hour ago" | grep -i oom'
+
+# ── Send Telegram ────────────────────────────────────────────────────────────
+TELEGRAM_TOKEN=""
+TELEGRAM_CHAT_ID=""
+if [[ -f "$TOKEN_FILE" ]]; then
+  TELEGRAM_TOKEN=$(read_env_value TELEGRAM_BOT_TOKEN "$TOKEN_FILE")
+  TELEGRAM_CHAT_ID=$(read_env_value TELEGRAM_CHAT_ID "$TOKEN_FILE")
+fi
+
+if [[ -z "$TELEGRAM_TOKEN" || -z "$TELEGRAM_CHAT_ID" ]]; then
+  log "No Telegram credentials — alert NOT sent"
+  record_alert
+  save_cursor
+  exit 0
+fi
+
+# --data-urlencode keeps "&", "+", "%" and newlines in the message intact;
+# --max-time stops a stalled connection from hanging the oneshot unit.
+if curl -sf --max-time 30 -X POST \
+    "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
+    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
+    --data-urlencode "text=${MSG}" > /dev/null 2>&1; then
+  log "Telegram alert sent"
+else
+  # Keep the alert and leave the cursor alone so the next run retries.
+  log "ERROR: Telegram send failed — cursor not advanced, will retry"
+  record_alert
+  exit 1
+fi
+
+# ── Advance cursor ───────────────────────────────────────────────────────────
+save_cursor
diff --git a/systemd/vm-oom-watchdog.service b/systemd/vm-oom-watchdog.service
new file mode 100644
index 0000000..ce8e4cc
--- /dev/null
+++ b/systemd/vm-oom-watchdog.service
@@ -0,0 +1,10 @@
+[Unit]
+Description=Detect kernel OOM-kill events and alert via Telegram
+After=docker.service network-online.target
+
+[Service]
+Type=oneshot
+User=root
+Group=root
+Environment="HERMES_HOME=/root/.hermes"
+ExecStart=/usr/local/bin/vm-oom-watchdog.sh
diff --git a/systemd/vm-oom-watchdog.timer b/systemd/vm-oom-watchdog.timer
new file mode 100644
index 0000000..7ab1ad9
--- /dev/null
+++ b/systemd/vm-oom-watchdog.timer
@@ -0,0 +1,12 @@
+[Unit]
+Description=Run OOM watchdog hourly
+After=docker.service
+
+[Timer]
+OnBootSec=10min
+OnUnitActiveSec=1h
+AccuracySec=5min
+Persistent=true
+
+[Install]
+WantedBy=timers.target