- prometheus.ts: new Prometheus client with 7d/30d range queries for disk, memory, swap, CPU steal, and disk I/O (GB/hr); getWeeklyDigestData() aggregates all metrics for digest and API endpoint - routes.ts: GET /api/vm/metrics/trend?metric=…&range=… and GET /api/vm/weekly-digest endpoints - api.ts: TrendPoint/TrendSeries types; getTrend() and getMemoryTrend() added to vmApi - vm/page.tsx: Sparkline (pure SVG polyline+fill), TrendCard with latest/avg/peak and threshold colouring, TrendsPanel with lazy load on first open; Promise.allSettled() isolation for all 5 data panels - vm-weekly-digest.sh: weekly Telegram digest via docker exec into devops-backend to reach Prometheus; emoji severity indicators; cron summary from /var/log/vm-cleanup.log - systemd timer: Mon 08:00 UTC, Persistent=true (fires on next boot if missed); first trigger 2026-06-02 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
161 lines
7.0 KiB
Bash
Executable File
161 lines
7.0 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# =============================================================================
|
|
# vm-weekly-digest.sh — Weekly Telegram summary for srv1491630
|
|
#
|
|
# Queries Prometheus via the devops-backend container (which is on the same
|
|
# Docker network as Prometheus), collects cleanup history, and sends a
|
|
# formatted summary to Telegram.
|
|
#
|
|
# Runs via systemd timer every Monday at 08:00 UTC.
|
|
# =============================================================================
|
|
set -Eeuo pipefail

# ── Configuration ───────────────────────────────────────────────────────────
# These values are never reassigned anywhere in the script, so they are marked
# readonly to turn an accidental overwrite into an immediate error.

# Dotenv-style file that holds TELEGRAM_BOT_TOKEN / TELEGRAM_CHAT_ID.
# HERMES_HOME may be set in the environment to relocate it.
readonly TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"

# Container used as a jump host for `docker exec ... curl` to Prometheus.
readonly BACKEND_CONTAINER="devops-backend"

# Prometheus base URL as resolvable from inside BACKEND_CONTAINER.
readonly PROM="http://learning_ai_common_plat-prometheus-1:9090"
|
|
|
|
# ── Helpers ─────────────────────────────────────────────────────────────────
|
|
|
|
# Write a UTC-timestamped diagnostic line to stderr.
# Arguments: message words; they are joined with single spaces.
log() {
  local stamp
  stamp="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
  printf '[%s] %s\n' "$stamp" "$*" >&2
}
|
|
|
|
#######################################
# Run a Prometheus instant query from inside the backend container.
# Globals:   BACKEND_CONTAINER (read), PROM (read)
# Arguments: $1 - PromQL expression
# Outputs:   the first result's value rounded to 1 decimal, or exactly one
#            "?" on any failure (curl error, bad JSON, empty result)
# Returns:   always 0, so callers can use it in $(...) under `set -e`
#######################################
prom_query() {
  local query="$1"
  local response

  # Fetch first, parse second. Piping curl straight into the parser under
  # `pipefail` made a curl failure print "?" twice (once from the parser on
  # empty input, once from the `||` fallback), producing "?\n?".
  # -G + --data-urlencode lets curl do the URL encoding itself.
  if ! response=$(docker exec "$BACKEND_CONTAINER" \
    curl -sf --max-time 10 -G \
    --data-urlencode "query=${query}" \
    "${PROM}/api/v1/query" 2>/dev/null); then
    echo "?"
    return 0
  fi

  # The parser prints exactly one line; the `||` fallback only fires when
  # python3 itself cannot run (in which case nothing was printed).
  python3 -c '
import json, sys
try:
    result = json.load(sys.stdin)["data"]["result"]
    print(round(float(result[0]["value"][1]), 1) if result else "?")
except Exception:
    print("?")
' 2>/dev/null <<<"$response" || echo "?"
}
|
|
|
|
#######################################
# Run a 7-day Prometheus range query (1h step) from inside the backend
# container and average every sample of every returned series.
# Globals:   BACKEND_CONTAINER (read), PROM (read)
# Arguments: $1 - PromQL expression
# Outputs:   the average rounded to 1 decimal, or exactly one "?" on any
#            failure (curl error, bad JSON, no usable samples)
# Returns:   always 0, so callers can use it in $(...) under `set -e`
#######################################
prom_range_avg() {
  local query="$1"
  local now start step response
  now=$(date +%s)
  start=$(( now - 7 * 86400 ))
  step="3600"

  # Fetch first, parse second: with `pipefail`, piping a failed curl into the
  # parser emitted "?" twice ("?\n?"), which defeated the callers' `!= "?"`
  # checks. -G + --data-urlencode lets curl build and encode the query string.
  if ! response=$(docker exec "$BACKEND_CONTAINER" \
    curl -sf --max-time 15 -G \
    --data-urlencode "query=${query}" \
    --data-urlencode "start=${start}" \
    --data-urlencode "end=${now}" \
    --data-urlencode "step=${step}" \
    "${PROM}/api/v1/query_range" 2>/dev/null); then
    echo "?"
    return 0
  fi

  # NaN/Inf samples are skipped so a single bad point cannot turn the whole
  # weekly average into "nan".
  python3 -c '
import json, math, sys
try:
    series = json.load(sys.stdin)["data"]["result"]
    vals = [float(v) for s in series for _, v in s["values"]]
    vals = [v for v in vals if math.isfinite(v)]
    print(round(sum(vals) / len(vals), 1) if vals else "?")
except Exception:
    print("?")
' 2>/dev/null <<<"$response" || echo "?"
}
|
|
|
|
# ── Check backend container is running ───────────────────────────────────────
|
|
|
|
# Abort early when the jump-host container is missing: every Prometheus query
# below is executed through `docker exec` into it.
#
# The container list is captured before matching instead of piping into
# `grep -q`: under `pipefail`, grep exiting on the first hit can kill
# `docker ps` with SIGPIPE and make a running container look absent.
# -F/-x match the name literally and as a whole line.
running_containers=$(docker ps --format '{{.Names}}' 2>/dev/null || true)
if ! grep -Fxq -- "$BACKEND_CONTAINER" <<<"$running_containers"; then
  log "ERROR: ${BACKEND_CONTAINER} is not running — skipping weekly digest"
  exit 1
fi
|
|
|
|
# ── Collect metrics ──────────────────────────────────────────────────────────
|
|
|
|
log "Collecting 7-day metrics from Prometheus..."

# Each helper prints a number rounded to 1 decimal, or "?" when unavailable.
STEAL_AVG=$(prom_range_avg 'avg(rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100')
DISK_NOW=$(prom_query '(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100')
RAM_AVG=$(prom_range_avg 'node_memory_MemAvailable_bytes / 1073741824')
SWAP_AVG=$(prom_range_avg '(node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / 1073741824')
IO_AVG=$(prom_range_avg 'rate(node_disk_written_bytes_total{device="sda"}[5m]) * 3600 / 1073741824')

# Unhealthy containers (current).
# docker's status is checked on its own: with `docker ... | wc -l || echo "?"`
# under pipefail, a docker failure produced "0" followed by "?".
if unhealthy_names=$(docker ps --filter health=unhealthy --format '{{.Names}}' 2>/dev/null); then
  # grep -c exits 1 when the count is 0; that is not an error here.
  UNHEALTHY=$(grep -c . <<<"$unhealthy_names" || true)
else
  UNHEALTHY="?"
fi

# Cleanup summary from log (last 7 days)
CLEANUP_LOG="/var/log/vm-cleanup.log"
CLEANUPS_THIS_WEEK=0
if [[ -f "$CLEANUP_LOG" ]]; then
  # GNU date first, BSD date second; empty when neither form is supported.
  WEEK_AGO=$(date -u -d "7 days ago" '+%Y-%m-%dT' 2>/dev/null || date -u -v-7d '+%Y-%m-%dT' 2>/dev/null || true)
  if [[ -n "$WEEK_AGO" ]]; then
    # For each [START] marker, take the first bracketed ISO timestamp found on
    # that line or a following one, and count the run if it is on/after the
    # cutoff (plain string comparison works for ISO-8601 timestamps).
    # Written in POSIX awk: the 3-argument match() and {n} intervals used
    # previously are not available in every awk (e.g. mawk), where the program
    # failed to parse and the count silently fell back to 0.
    if ! CLEANUPS_THIS_WEEK=$(awk -v cutoff="$WEEK_AGO" '
      /\[START\]/ { in_block = 1 }
      in_block && /\[[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]T/ {
        if (match($0, /\[[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]T[0-9:Z]+\]/)) {
          stamp = substr($0, RSTART + 1, RLENGTH - 2)
          if (stamp >= cutoff) count++
        }
        in_block = 0
      }
      END { print count + 0 }
    ' "$CLEANUP_LOG" 2>/dev/null); then
      CLEANUPS_THIS_WEEK=0
    fi
  fi
fi
|
|
|
|
# ── Build Telegram message ────────────────────────────────────────────────────
|
|
|
|
# Determine severity indicators
|
|
#######################################
# Map a metric value to a severity emoji.
# Arguments:
#   $1 - metric value (a number, or anything else such as "?" when unknown)
#   $2 - "above" when higher is worse, "below" when lower is worse
#   $3 - warning threshold
#   $4 - critical threshold
# Outputs: 🚨 past the critical threshold, ⚠️ past the warning threshold,
#          otherwise ✅ (also ✅ for non-numeric input)
# Returns: always 0
#######################################
severity_icon() {
  local value="$1" direction="$2" warn="$3" crit="$4"
  local level=0
  local numeric_re='^-?[0-9]+(\.[0-9]+)?([eE][-+]?[0-9]+)?$'

  # Only validated numeric text reaches awk; nothing is eval'd.
  if [[ "$value" =~ $numeric_re ]]; then
    level=$(awk -v v="$value" -v w="$warn" -v c="$crit" -v d="$direction" 'BEGIN {
      level = 0
      if (d == "below") {
        if (v + 0 < w + 0) level = 1
        if (v + 0 < c + 0) level = 2
      } else {
        if (v + 0 > w + 0) level = 1
        if (v + 0 > c + 0) level = 2
      }
      print level
    }') || level=0
  fi

  case "$level" in
    2) printf '%s\n' "🚨" ;;
    1) printf '%s\n' "⚠️" ;;
    *) printf '%s\n' "✅" ;;
  esac
}

# Determine severity indicators.
# The critical level is evaluated last so it wins over the warning level;
# previously the warning check ran second and always overwrote 🚨 with ⚠️.
steal_icon=$(severity_icon "${STEAL_AVG:-?}" above 5 15)
disk_icon=$(severity_icon "${DISK_NOW:-?}" above 55 70)
ram_icon=$(severity_icon "${RAM_AVG:-?}" below 3 1)
io_icon=$(severity_icon "${IO_AVG:-?}" above 0.5 1.5)

# Strip whitespace first: some `wc -l` implementations pad their output.
unhealthy_count="${UNHEALTHY:-?}"
unhealthy_count="${unhealthy_count//[[:space:]]/}"
svc_icon=$(severity_icon "$unhealthy_count" above 0 5)
|
|
|
|
# Reporting window: today (UTC) and the date seven days earlier.
WEEK_END=$(date -u '+%Y-%m-%d')

# Try GNU date syntax, then BSD date syntax, then give up with "N/A".
if ! WEEK_START=$(date -u -d "7 days ago" '+%Y-%m-%d' 2>/dev/null); then
  if ! WEEK_START=$(date -u -v-7d '+%Y-%m-%d' 2>/dev/null); then
    WEEK_START="N/A"
  fi
fi

digest_host=$(hostname)

# One array element per message line; empty elements are blank separator lines.
digest_lines=(
  "📊 Weekly VM Digest — ${digest_host}"
  "Week ${WEEK_START} → ${WEEK_END}"
  ""
  "${steal_icon} CPU Steal: ${STEAL_AVG}% avg"
  "${disk_icon} Disk: ${DISK_NOW}% used"
  "${ram_icon} RAM: ${RAM_AVG} GB free avg"
  "⏩ Swap: ${SWAP_AVG} GB avg"
  "${svc_icon} Containers: ${UNHEALTHY} unhealthy now"
  "${io_icon} Disk Writes: ${IO_AVG} GB/hr avg (sda total)"
  "🧹 Cleanups: ${CLEANUPS_THIS_WEEK} this week"
  ""
  "Dashboard: https://devops.bytelyst.com"
)

# Join with newlines, then drop the trailing newline printf leaves behind.
printf -v MSG '%s\n' "${digest_lines[@]}"
MSG="${MSG%$'\n'}"
|
|
|
|
# ── Send Telegram ─────────────────────────────────────────────────────────────
|
|
|
|
#######################################
# Print the value of KEY from a dotenv-style file.
# Only real assignments are considered (optionally prefixed by `export`);
# commented-out lines and keys that merely end in KEY are ignored. The value
# stops at the first whitespace, one pair of surrounding quotes is removed,
# and only the first matching line is used.
# Arguments: $1 - key name, $2 - file path
# Outputs:   the value, or nothing when the key is absent or unreadable
# Returns:   always 0
#######################################
read_env_value() {
  local key="$1" file="$2"
  awk -v key="$key" '
    {
      line = $0
      sub(/^[ \t]*(export[ \t]+)?/, "", line)
      if (index(line, key "=") != 1) next
      value = substr(line, length(key) + 2)
      sub(/[ \t\r].*$/, "", value)
      sub(/^["\047]/, "", value)
      sub(/["\047]$/, "", value)
      print value
      exit
    }
  ' "$file" 2>/dev/null || true
}

TELEGRAM_TOKEN=""
TELEGRAM_CHAT_ID=""
if [[ -f "$TOKEN_FILE" ]]; then
  TELEGRAM_TOKEN=$(read_env_value TELEGRAM_BOT_TOKEN "$TOKEN_FILE")
  TELEGRAM_CHAT_ID=$(read_env_value TELEGRAM_CHAT_ID "$TOKEN_FILE")
fi

# Without credentials the digest is still useful in the journal / terminal.
if [[ -z "$TELEGRAM_TOKEN" || -z "$TELEGRAM_CHAT_ID" ]]; then
  log "No Telegram credentials — printing digest to stdout:"
  echo "$MSG"
  exit 0
fi

log "Sending weekly digest to Telegram..."
# --data-urlencode is required: the message contains "%", newlines and
# non-ASCII characters, and plain -d sends the body without any encoding.
# --max-time keeps a stalled connection from hanging the timer unit.
if curl -sf --max-time 30 -X POST "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
  --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
  --data-urlencode "text=${MSG}" > /dev/null; then
  log "Weekly digest sent"
else
  log "ERROR: Telegram send failed"
  exit 1
fi
|