#!/usr/bin/env bash
# =============================================================================
# vm-weekly-digest.sh — Weekly Telegram summary for srv1491630
#
# Queries Prometheus via the devops-backend container (which is on the same
# Docker network as Prometheus), collects cleanup history, and sends a
# formatted summary to Telegram.
#
# Runs via systemd timer every Monday at 08:00 UTC.
#
# Host tools used: docker, python3, curl, awk, grep, date, hostname.
# Environment:     HERMES_HOME (optional) — directory containing the .env file
#                  with TELEGRAM_BOT_TOKEN / TELEGRAM_CHAT_ID
#                  (default: /root/.hermes).
# Exit status:     0 on success (or when the digest is printed to stdout
#                  because no Telegram credentials are configured),
#                  1 if the backend container is not running or the Telegram
#                  send fails.
# =============================================================================
set -Eeuo pipefail

readonly TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"
readonly BACKEND_CONTAINER="devops-backend"
readonly PROM="http://learning_ai_common_plat-prometheus-1:9090"
readonly CLEANUP_LOG="/var/log/vm-cleanup.log"

# ── Helpers ─────────────────────────────────────────────────────────────────

# Write a UTC-timestamped diagnostic line to stderr.
log() { printf '[%s] %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$*" >&2; }

# Percent-encode $1 for use as a URL query value (runs python3 on the host).
urlencode() {
  python3 -c 'import sys, urllib.parse; print(urllib.parse.quote(sys.argv[1]))' "$1"
}

# GET "${PROM}$1" from inside the backend container and print the body.
#   $1 - API path including the query string
#   $2 - curl timeout in seconds
# Returns non-zero if docker exec or curl fails. stderr is discarded on
# purpose: callers turn any failure into the "?" placeholder.
prom_get() {
  docker exec "$BACKEND_CONTAINER" \
    curl -sf --max-time "$2" "${PROM}$1" 2>/dev/null
}

# Run a Prometheus instant query.
#   $1 - PromQL expression
# Prints the first result value rounded to one decimal, or exactly one "?"
# on any failure (the previous pipeline could print "?" twice under pipefail).
# Always returns 0.
prom_query() {
  local query="$1" encoded body value
  encoded=$(urlencode "$query") || { echo "?"; return 0; }
  body=$(prom_get "/api/v1/query?query=${encoded}" 10) || { echo "?"; return 0; }
  value=$(printf '%s' "$body" | python3 -c '
import json, sys
try:
    r = json.load(sys.stdin)["data"]["result"]
    print(round(float(r[0]["value"][1]), 1) if r else "?")
except Exception:
    print("?")
' 2>/dev/null) || value="?"
  printf '%s\n' "${value:-?}"
}

# Run a 7-day range query (1 h step) and average every returned sample.
#   $1 - PromQL expression
# Prints the average rounded to one decimal, or exactly one "?" on failure.
# Always returns 0.
prom_range_avg() {
  local query="$1" now start step=3600 encoded body value
  now=$(date +%s)
  start=$(( now - 7 * 86400 ))
  encoded=$(urlencode "$query") || { echo "?"; return 0; }
  body=$(prom_get \
    "/api/v1/query_range?query=${encoded}&start=${start}&end=${now}&step=${step}" \
    15) || { echo "?"; return 0; }
  value=$(printf '%s' "$body" | python3 -c '
import json, sys
try:
    d = json.load(sys.stdin)
    vals = [float(v) for s in d["data"]["result"] for _, v in s["values"]]
    print(round(sum(vals) / len(vals), 1) if vals else "?")
except Exception:
    print("?")
' 2>/dev/null) || value="?"
  printf '%s\n' "${value:-?}"
}

# Map a metric value to a severity icon.
#   $1 - value (non-numeric values such as "?" yield the OK icon)
#   $2 - direction: "gt" (higher is worse) or "lt" (lower is worse)
#   $3 - warning threshold
#   $4 - critical threshold
# The critical threshold takes precedence over the warning threshold.
# Prints one of ✅ / ⚠️ / 🚨. Always returns 0.
severity_icon() {
  local value="$1" direction="$2" warn="$3" crit="$4" level
  local numeric_re='^-?[0-9]+(\.[0-9]+)?([eE][-+]?[0-9]+)?$'
  if [[ ! "$value" =~ $numeric_re ]]; then
    printf '%s\n' "✅"
    return 0
  fi
  # awk does the floating-point comparison; it only ever sees validated
  # numbers passed via -v, never interpolated code.
  level=$(awk -v v="$value" -v w="$warn" -v c="$crit" -v d="$direction" 'BEGIN {
    v += 0; w += 0; c += 0; lvl = 0
    if (d == "lt") {
      if (v < c) lvl = 2
      else if (v < w) lvl = 1
    } else {
      if (v > c) lvl = 2
      else if (v > w) lvl = 1
    }
    print lvl
  }') || level=0
  case "$level" in
    2) printf '%s\n' "🚨" ;;
    1) printf '%s\n' "⚠️" ;;
    *) printf '%s\n' "✅" ;;
  esac
}

# Count cleanup runs in a log whose timestamp is at or after a cutoff.
#   $1 - log file
#   $2 - cutoff prefix, e.g. "2024-05-06T" (compared as a string)
# A run is a "[START]" line; the first following (or same) line carrying a
# "[YYYY-MM-DDT…]" timestamp decides whether the run is counted.
# Uses only POSIX awk features (no gawk 3-argument match, no {n} intervals).
count_recent_cleanups() {
  awk -v cutoff="$2" '
    /\[START\]/ { in_block = 1 }
    in_block && /\[[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]T/ {
      if (match($0, /\[[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]T[0-9:Z]+\]/)) {
        stamp = substr($0, RSTART + 1, RLENGTH - 2)   # drop the brackets
        if (stamp >= cutoff) count++
      }
      in_block = 0
    }
    END { print count + 0 }
  ' "$1"
}

# Print the value of KEY from a dotenv-style file (empty if absent).
#   $1 - key name
#   $2 - file path
# Only the first "KEY=" (optionally "export KEY=") at the start of a line is
# used; the value ends at the first whitespace and one pair of surrounding
# quotes is stripped. Always returns 0.
read_env_value() {
  local key="$1" file="$2" line value
  line=$(grep -m1 -E "^[[:space:]]*(export[[:space:]]+)?${key}=" -- "$file" 2>/dev/null) \
    || return 0
  value=${line#*=}
  value=${value%%[[:space:]]*}
  value=${value#\"}; value=${value%\"}
  value=${value#\'}; value=${value%\'}
  printf '%s' "$value"
}

# ── Main flow ────────────────────────────────────────────────────────────────
main() {
  # ── Check backend container is running ──────────────────────────────────
  # Capture the list first: piping `docker ps` into `grep -q` under pipefail
  # can report a false failure when grep exits early and docker gets SIGPIPE.
  local running
  running=$(docker ps --format '{{.Names}}' 2>/dev/null) || running=""
  if ! grep -Fxq -- "$BACKEND_CONTAINER" <<<"$running"; then
    log "ERROR: ${BACKEND_CONTAINER} is not running — skipping weekly digest"
    exit 1
  fi

  # ── Collect metrics ─────────────────────────────────────────────────────
  log "Collecting 7-day metrics from Prometheus..."
  local steal_avg disk_now ram_avg swap_avg io_avg
  steal_avg=$(prom_range_avg 'avg(rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100')
  disk_now=$(prom_query '(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100')
  ram_avg=$(prom_range_avg 'node_memory_MemAvailable_bytes / 1073741824')
  swap_avg=$(prom_range_avg '(node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / 1073741824')
  io_avg=$(prom_range_avg 'rate(node_disk_written_bytes_total{device="sda"}[5m]) * 3600 / 1073741824')

  # Unhealthy containers (current); "?" only when docker itself fails.
  local unhealthy unhealthy_names
  if unhealthy_names=$(docker ps --filter health=unhealthy --format '{{.Names}}' 2>/dev/null); then
    # grep -c exits 1 when the count is 0; that is not an error here.
    unhealthy=$(grep -c . <<<"$unhealthy_names" || true)
  else
    unhealthy="?"
  fi

  # Cleanup summary from log (last 7 days)
  local cleanups_this_week=0 week_ago
  if [[ -f "$CLEANUP_LOG" ]]; then
    # GNU date first, BSD date as fallback; empty if neither works.
    week_ago=$(date -u -d "7 days ago" '+%Y-%m-%dT' 2>/dev/null \
      || date -u -v-7d '+%Y-%m-%dT' 2>/dev/null \
      || true)
    if [[ -n "$week_ago" ]]; then
      cleanups_this_week=$(count_recent_cleanups "$CLEANUP_LOG" "$week_ago" 2>/dev/null) \
        || cleanups_this_week=0
    fi
  fi

  # ── Build Telegram message ──────────────────────────────────────────────
  # Severity indicators: <value> <direction> <warn> <critical>
  local steal_icon disk_icon ram_icon svc_icon io_icon
  steal_icon=$(severity_icon "$steal_avg" gt 5 15)
  disk_icon=$(severity_icon "$disk_now" gt 55 70)
  ram_icon=$(severity_icon "$ram_avg" lt 3 1)
  svc_icon=$(severity_icon "$unhealthy" gt 0 5)
  io_icon=$(severity_icon "$io_avg" gt 0.5 1.5)

  local week_end week_start msg
  week_end=$(date -u '+%Y-%m-%d')
  week_start=$(date -u -d "7 days ago" '+%Y-%m-%d' 2>/dev/null \
    || date -u -v-7d '+%Y-%m-%d' 2>/dev/null \
    || echo "N/A")

  msg="📊 Weekly VM Digest — $(hostname)
Week ${week_start} → ${week_end}

${steal_icon} CPU Steal: ${steal_avg}% avg
${disk_icon} Disk: ${disk_now}% used
${ram_icon} RAM: ${ram_avg} GB free avg
⏩ Swap: ${swap_avg} GB avg
${svc_icon} Containers: ${unhealthy} unhealthy now
${io_icon} Disk Writes: ${io_avg} GB/hr avg (sda total)
🧹 Cleanups: ${cleanups_this_week} this week

Dashboard: https://devops.bytelyst.com"

  # ── Send Telegram ───────────────────────────────────────────────────────
  local telegram_token="" telegram_chat_id=""
  if [[ -f "$TOKEN_FILE" ]]; then
    telegram_token=$(read_env_value TELEGRAM_BOT_TOKEN "$TOKEN_FILE")
    telegram_chat_id=$(read_env_value TELEGRAM_CHAT_ID "$TOKEN_FILE")
  fi

  if [[ -z "$telegram_token" || -z "$telegram_chat_id" ]]; then
    log "No Telegram credentials — printing digest to stdout:"
    printf '%s\n' "$msg"
    exit 0
  fi

  log "Sending weekly digest to Telegram..."
  # --data-urlencode is required: the text contains '%', newlines and
  # non-ASCII characters that plain -d would send unencoded.
  # NOTE(review): the bot token is part of the URL and therefore visible in
  # the process list while curl runs; consider `curl --config -` if that
  # matters on this host.
  if curl -sf --max-time 30 -X POST \
      "https://api.telegram.org/bot${telegram_token}/sendMessage" \
      --data-urlencode "chat_id=${telegram_chat_id}" \
      --data-urlencode "text=${msg}" > /dev/null; then
    log "Weekly digest sent"
  else
    log "ERROR: Telegram send failed"
    exit 1
  fi
}

main "$@"