From 8d32cb79806dd61afeb6e0e2f2ff7795d2919bef Mon Sep 17 00:00:00 2001 From: Hermes VM Date: Fri, 29 May 2026 00:15:11 +0000 Subject: [PATCH] =?UTF-8?q?feat(dashboard/vm):=20Phases=204.1-4.3=20?= =?UTF-8?q?=E2=80=94=20Prometheus=20trends,=20sparklines,=20weekly=20diges?= =?UTF-8?q?t?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - prometheus.ts: new Prometheus client with 7d/30d range queries for disk, memory, swap, CPU steal, and disk I/O (GB/hr); getWeeklyDigestData() aggregates all metrics for digest and API endpoint - routes.ts: GET /api/vm/metrics/trend?metric=…&range=… and GET /api/vm/weekly-digest endpoints - api.ts: TrendPoint/TrendSeries types; getTrend() and getMemoryTrend() added to vmApi - vm/page.tsx: Sparkline (pure SVG polyline+fill), TrendCard with latest/avg/peak and threshold colouring, TrendsPanel with lazy load on first open; Promise.allSettled() isolation for all 5 data panels - vm-weekly-digest.sh: weekly Telegram digest via docker exec into devops-backend to reach Prometheus; emoji severity indicators; cron summary from /var/log/vm-cleanup.log - systemd timer: Mon 08:00 UTC, Persistent=true (fires on next boot if missed); first trigger 2026-06-02 Co-Authored-By: Claude Sonnet 4.6 --- .../backend/src/modules/vm/prometheus.ts | 172 +++++++++++ dashboard/backend/src/modules/vm/routes.ts | 42 +++ dashboard/web/src/app/vm/page.tsx | 287 ++++++++++++++++++ dashboard/web/src/lib/api.ts | 17 ++ scripts/VMs/HostingerVM/vm-weekly-digest.sh | 160 ++++++++++ systemd/vm-weekly-digest.service | 11 + systemd/vm-weekly-digest.timer | 11 + 7 files changed, 700 insertions(+) create mode 100644 dashboard/backend/src/modules/vm/prometheus.ts create mode 100755 scripts/VMs/HostingerVM/vm-weekly-digest.sh create mode 100644 systemd/vm-weekly-digest.service create mode 100644 systemd/vm-weekly-digest.timer diff --git a/dashboard/backend/src/modules/vm/prometheus.ts b/dashboard/backend/src/modules/vm/prometheus.ts new 
file mode 100644
index 0000000..a5ec455
--- /dev/null
+++ b/dashboard/backend/src/modules/vm/prometheus.ts
@@ -0,0 +1,172 @@
+const PROMETHEUS_BASE =
+  process.env.PROMETHEUS_URL ?? 'http://learning_ai_common_plat-prometheus-1:9090';
+
+// ── Types ──────────────────────────────────────────────────────────────────
+
+export interface TrendPoint {
+  t: number; // unix ms
+  v: number;
+}
+
+export interface TrendSeries {
+  metric: string;
+  unit: string;
+  points: TrendPoint[];
+  latest: number; // last sample, rounded to 1 decimal by summarize()
+  avg: number; // mean of all samples, rounded to 1 decimal
+  peak: number; // maximum sample, rounded to 1 decimal
+}
+
+export interface MemoryTrend {
+  available: TrendSeries;
+  swap: TrendSeries;
+}
+
+// ── Internal helpers ────────────────────────────────────────────────────────
+
+function rangeParams(rangeStr: string): { start: number; end: number; step: string } {
+  const end = Date.now();
+  const days = rangeStr === '30d' ? 30 : 7; // anything other than '30d' falls back to 7 days
+  const start = end - days * 86_400_000;
+  const step = days <= 7 ? '1h' : '4h'; // coarser step for the longer window
+  return { start, end, step };
+}
+
+async function queryRange(query: string, start: number, end: number, step: string): Promise<TrendPoint[]> {
+  const url = new URL(`${PROMETHEUS_BASE}/api/v1/query_range`);
+  url.searchParams.set('query', query);
+  url.searchParams.set('start', String(Math.floor(start / 1000))); // ms → unix seconds
+  url.searchParams.set('end', String(Math.floor(end / 1000)));
+  url.searchParams.set('step', step);
+
+  const res = await fetch(url.toString(), {
+    signal: AbortSignal.timeout(15_000),
+  });
+  if (!res.ok) throw new Error(`Prometheus ${res.status}: ${res.statusText}`);
+
+  const body = (await res.json()) as {
+    status: string;
+    error?: string;
+    data?: { result: Array<{ values: [number, string][] }> };
+  };
+  if (body.status !== 'success') throw new Error(`Prometheus: ${body.error ?? 'unknown error'}`);
+
+  const result = body.data?.result ?? [];
+  if (result.length === 0) return [];
+
+  if (result.length === 1) {
+    return result[0].values
+      .map(([ts, v]) => ({ t: ts * 1000, v: parseFloat(v) }))
+      .filter(p => !isNaN(p.v));
+  }
+
+  // Multi-series (e.g. per-CPU steal) → average by timestamp
+  const byTime = new Map<number, number[]>();
+  for (const series of result) {
+    for (const [ts, v] of series.values) {
+      const ms = ts * 1000;
+      const val = parseFloat(v);
+      if (!isNaN(val)) {
+        const bucket = byTime.get(ms);
+        if (bucket) bucket.push(val);
+        else byTime.set(ms, [val]);
+      }
+    }
+  }
+  return Array.from(byTime.entries())
+    .sort(([a], [b]) => a - b)
+    .map(([t, vals]) => ({ t, v: vals.reduce((s, x) => s + x, 0) / vals.length }));
+}
+
+function summarize(points: TrendPoint[]): Pick<TrendSeries, 'latest' | 'avg' | 'peak'> {
+  if (points.length === 0) return { latest: 0, avg: 0, peak: 0 };
+  const vals = points.map(p => p.v);
+  const latest = vals[vals.length - 1];
+  const avg = vals.reduce((s, v) => s + v, 0) / vals.length;
+  const peak = Math.max(...vals);
+  const round1 = (n: number) => Math.round(n * 10) / 10;
+  return { latest: round1(latest), avg: round1(avg), peak: round1(peak) };
+}
+
+// ── Public trend queries ────────────────────────────────────────────────────
+
+export async function getDiskTrend(range = '7d'): Promise<TrendSeries> {
+  const { start, end, step } = rangeParams(range);
+  const points = await queryRange(
+    '(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100',
+    start, end, step,
+  );
+  return { metric: 'disk', unit: '%', points, ...summarize(points) };
+}
+
+export async function getMemoryTrend(range = '7d'): Promise<MemoryTrend> {
+  const { start, end, step } = rangeParams(range);
+  const [availPts, swapPts] = await Promise.all([
+    queryRange('node_memory_MemAvailable_bytes / 1073741824', start, end, step),
+    queryRange(
+      '(node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / 1073741824',
+      start, end, step,
+    ),
+  ]);
+  return {
+    available: { metric: 'ram_available', unit: 'GB', points: availPts, ...summarize(availPts) },
+    swap: { metric: 'swap_used', unit: 'GB', points: swapPts, ...summarize(swapPts) },
+  };
+}
+
+export async function getStealTrend(range = '7d'): Promise<TrendSeries> {
+  const { start, end, step } = rangeParams(range);
+  // avg() across all CPUs so multi-CPU hosts get a single % value
+  const points = await queryRange(
+    'avg(rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100',
+    start, end, step,
+  );
+  return { metric: 'steal', unit: '%', points, ...summarize(points) };
+}
+
+export async function getIoTrend(range = '7d'): Promise<TrendSeries> {
+  const { start, end, step } = rangeParams(range);
+  // Total VM block write rate in GB/hr (sda = primary disk).
+  // cAdvisor does not expose per-container blkio in this setup, so we use
+  // the node-exporter metric which covers all processes including invttrdg.
+  const points = await queryRange(
+    'rate(node_disk_written_bytes_total{device="sda"}[5m]) * 3600 / 1073741824',
+    start, end, step,
+  );
+  return { metric: 'disk_io_write', unit: 'GB/hr', points, ...summarize(points) };
+}
+
+// ── Weekly digest summary (used by digest endpoint + cron) ─────────────────
+
+export interface WeeklyDigestData {
+  period: { from: string; to: string };
+  steal: { avg: number; peak: number };
+  disk: { latest: number; peak: number };
+  ram: { avg: number; low: number };
+  swap: { avg: number; peak: number };
+  io: { avg: number; peak: number };
+}
+
+export async function getWeeklyDigestData(): Promise<WeeklyDigestData> {
+  const [diskData, memData, stealData, ioData] = await Promise.all([
+    getDiskTrend('7d'),
+    getMemoryTrend('7d'),
+    getStealTrend('7d'),
+    getIoTrend('7d'),
+  ]);
+
+  const to = new Date();
+  const from = new Date(to.getTime() - 7 * 86_400_000);
+
+  const ramVals = memData.available.points.map(p => p.v);
+  const round1 = (n: number) => Math.round(n * 10) / 10; // same 1-decimal rounding summarize() applies
+
+  return {
+    period: { from: from.toISOString(), to: to.toISOString() },
+    steal: { avg: stealData.avg, peak: stealData.peak },
+    disk: { latest: diskData.latest, peak: diskData.peak },
+    ram: { avg: memData.available.avg, low: ramVals.length ? round1(Math.min(...ramVals)) : 0 },
+    swap: { avg: memData.swap.avg, peak: memData.swap.peak }, // summarize() already yields the rounded max (0 when empty)
+    io: { avg: ioData.avg, peak: ioData.peak },
+  };
+}
diff --git a/dashboard/backend/src/modules/vm/routes.ts b/dashboard/backend/src/modules/vm/routes.ts
index 8697472..5972d68 100644
--- a/dashboard/backend/src/modules/vm/routes.ts
+++ b/dashboard/backend/src/modules/vm/routes.ts
@@ -10,6 +10,13 @@ import {
   getOllamaModels,
   unloadOllamaModel,
 } from './repository.js';
+import {
+  getDiskTrend,
+  getMemoryTrend,
+  getStealTrend,
+  getIoTrend,
+  getWeeklyDigestData,
+} from './prometheus.js';
 import { VmCleanupParamsSchema, VmContainerRestartParamsSchema } from './types.js';
 
 export async function vmRoutes(fastify: FastifyInstance) {
@@ -127,4 +134,39 @@ export async function vmRoutes(fastify: FastifyInstance) {
       return reply.code(500).send({ error: error.message || 'Unload failed' });
     }
   });
+
+  // ── Prometheus trend queries ───────────────────────────────────────────────
+
+  // GET /api/vm/metrics/trend?metric=disk|memory|steal|io&range=7d|30d
+  fastify.get('/vm/metrics/trend', {
+    preHandler: async (req) => requireAdmin(req),
+  }, async (req, reply) => {
+    const { metric = 'disk', range = '7d' } = req.query as Record<string, string>;
+    const validRange = range === '30d' ?
'30d' : '7d'; + try { + switch (metric) { + case 'disk': return reply.send(await getDiskTrend(validRange)); + case 'steal': return reply.send(await getStealTrend(validRange)); + case 'io': return reply.send(await getIoTrend(validRange)); + case 'memory': return reply.send(await getMemoryTrend(validRange)); + default: + return reply.code(400).send({ error: `Unknown metric: ${metric}` }); + } + } catch (error: any) { + fastify.log.error(error, 'Prometheus trend query failed'); + return reply.code(502).send({ error: 'Prometheus unavailable' }); + } + }); + + // GET /api/vm/weekly-digest — 7-day summary for Telegram digest + fastify.get('/vm/weekly-digest', { + preHandler: async (req) => requireAdmin(req), + }, async (_req, reply) => { + try { + return reply.send(await getWeeklyDigestData()); + } catch (error: any) { + fastify.log.error(error, 'Weekly digest data failed'); + return reply.code(502).send({ error: 'Prometheus unavailable' }); + } + }); } diff --git a/dashboard/web/src/app/vm/page.tsx b/dashboard/web/src/app/vm/page.tsx index a87918a..c5d5e90 100644 --- a/dashboard/web/src/app/vm/page.tsx +++ b/dashboard/web/src/app/vm/page.tsx @@ -9,6 +9,7 @@ import { type CronStatusResponse, type UnhealthyContainer, type OllamaModelsResponse, + type TrendSeries, } from '@/lib/api'; import { CheckCircle, @@ -33,6 +34,8 @@ import { Shield, Zap, MemoryStick, + TrendingUp, + AlertCircle, } from 'lucide-react'; // ── Types ────────────────────────────────────────────────────────────────── @@ -513,6 +516,256 @@ function OllamaPanel({ ); } +// ── Trend charts ────────────────────────────────────────────────────────── + +function Sparkline({ + points, + color = '#3b82f6', + fillColor, + min: minOverride, + max: maxOverride, + height = 48, +}: { + points: { t: number; v: number }[]; + color?: string; + fillColor?: string; + min?: number; + max?: number; + height?: number; +}) { + if (points.length < 2) { + return
; + } + const W = 400; + const H = height; + const vals = points.map(p => p.v); + const lo = minOverride ?? Math.min(...vals); + const hi = maxOverride ?? Math.max(...vals); + const range = hi - lo || 1; + + const toX = (i: number) => (i / (points.length - 1)) * W; + const toY = (v: number) => H - ((v - lo) / range) * (H - 4) - 2; + + const pts = points.map((p, i) => `${toX(i)},${toY(p.v)}`).join(' '); + const fillPath = `M${toX(0)},${H} ` + + points.map((p, i) => `L${toX(i)},${toY(p.v)}`).join(' ') + + ` L${toX(points.length - 1)},${H} Z`; + + return ( + + {fillColor && } + + {/* Latest value dot */} + + + ); +} + +function TrendCard({ + title, + series, + color, + fillColor, + unit, + warnThreshold, + critThreshold, + higherIsBetter = false, + min, + max, + note, +}: { + title: string; + series: TrendSeries | null | undefined; + color: string; + fillColor: string; + unit: string; + warnThreshold?: number; + critThreshold?: number; + higherIsBetter?: boolean; + min?: number; + max?: number; + note?: string; +}) { + if (!series) { + return ( +
+

{title}

+
+
+ ); + } + + const { latest, avg, peak, points } = series; + const isBad = (v: number) => { + if (higherIsBetter) return critThreshold !== undefined && v < critThreshold; + return critThreshold !== undefined && v >= critThreshold; + }; + const isWarn = (v: number) => { + if (higherIsBetter) return warnThreshold !== undefined && v < warnThreshold; + return warnThreshold !== undefined && v >= warnThreshold; + }; + + const latestColor = isBad(latest) + ? 'text-red-600' + : isWarn(latest) + ? 'text-yellow-600' + : 'text-gray-900'; + + const fmt = (v: number) => + v < 10 ? v.toFixed(1) : Math.round(v).toString(); + + return ( +
+
+

{title}

+ + {fmt(latest)}{unit} + +
+ +
+ avg {fmt(avg)}{unit} + peak {fmt(peak)}{unit} + {points.length}pts · 7d +
+ {note && ( +

{note}

+ )} +
+ ); +} + +interface TrendsData { + disk: TrendSeries | null; + steal: TrendSeries | null; + ram: TrendSeries | null; + swap: TrendSeries | null; + io: TrendSeries | null; +} + +function TrendsPanel({ + data, + loading, + onOpen, +}: { + data: TrendsData; + loading: boolean; + onOpen: () => void; +}) { + const [open, setOpen] = useState(false); + + const handleToggle = () => { + const next = !open; + setOpen(next); + if (next) onOpen(); + }; + + const ioNote = data.io && data.io.avg > 0.5 + ? `VM writes avg ${data.io.avg.toFixed(1)} GB/hr (${(data.io.avg * 24).toFixed(0)} GB/day) — investigate source if sustained` + : undefined; + + return ( +
+ + + {open && ( +
+ {loading ? ( +
+ {Array.from({ length: 5 }).map((_, i) => ( +
+ ))} +
+ ) : ( +
+ + + + + +
+ )} +
+ )} +
+ ); +} + // ── Check card meta ──────────────────────────────────────────────────────── const CHECK_META: Record = { @@ -559,6 +812,10 @@ export default function VmHealthPage() { const [unloading, setUnloading] = useState>(new Set()); + const [trends, setTrends] = useState({ disk: null, steal: null, ram: null, swap: null, io: null }); + const [trendsLoading, setTrendsLoading] = useState(false); + const [trendsLoaded, setTrendsLoaded] = useState(false); + const [showLog, setShowLog] = useState(false); const [lastRefreshed, setLastRefreshed] = useState(null); @@ -595,6 +852,29 @@ export default function VmHealthPage() { const handleRefresh = () => { setRefreshing(true); loadAll(); }; + const loadTrends = useCallback(async () => { + if (trendsLoading) return; + setTrendsLoading(true); + try { + const [diskRes, stealRes, memRes, ioRes] = await Promise.allSettled([ + vmApi.getTrend('disk', '7d'), + vmApi.getTrend('steal', '7d'), + vmApi.getMemoryTrend('7d'), + vmApi.getTrend('io', '7d'), + ]); + setTrends({ + disk: diskRes.status === 'fulfilled' ? diskRes.value : null, + steal: stealRes.status === 'fulfilled' ? stealRes.value : null, + ram: memRes.status === 'fulfilled' ? memRes.value.available : null, + swap: memRes.status === 'fulfilled' ? memRes.value.swap : null, + io: ioRes.status === 'fulfilled' ? ioRes.value : null, + }); + setTrendsLoaded(true); + } finally { + setTrendsLoading(false); + } + }, [trendsLoading]); + const handleCleanup = async (mode: 'weekly' | 'monthly' | 'dry-run') => { const msg = mode === 'monthly' ? 'Run MONTHLY full cleanup? This removes build cache, pnpm store, old logs, and HOLD node_modules.' : @@ -787,6 +1067,13 @@ export default function VmHealthPage() { unloading={unloading} /> + {/* ── 7-day trends ── */} + { if (!trendsLoaded && !trendsLoading) loadTrends(); }} + /> + {/* ── Cleanup section ── */}
diff --git a/dashboard/web/src/lib/api.ts b/dashboard/web/src/lib/api.ts
index 5b0c359..8d2200e 100644
--- a/dashboard/web/src/lib/api.ts
+++ b/dashboard/web/src/lib/api.ts
@@ -512,8 +512,25 @@ export const vmApi = {
       `/api/vm/ollama/models/${encodeURIComponent(name)}`,
       { method: 'DELETE' },
     ),
+  getTrend: (metric: 'disk' | 'steal' | 'io', range: '7d' | '30d') =>
+    apiRequest<TrendSeries>(`/api/vm/metrics/trend?metric=${metric}&range=${range}`),
+  getMemoryTrend: (range: '7d' | '30d') =>
+    apiRequest<{ available: TrendSeries; swap: TrendSeries }>(
+      `/api/vm/metrics/trend?metric=memory&range=${range}`,
+    ),
 };
 
+export interface TrendPoint { t: number; v: number }
+
+export interface TrendSeries {
+  metric: string;
+  unit: string;
+  points: TrendPoint[];
+  latest: number;
+  avg: number;
+  peak: number;
+}
+
 // Auth API - calls platform-service for authentication
 export interface LoginRequest {
   email: string;
diff --git a/scripts/VMs/HostingerVM/vm-weekly-digest.sh b/scripts/VMs/HostingerVM/vm-weekly-digest.sh
new file mode 100755
index 0000000..cf3c148
--- /dev/null
+++ b/scripts/VMs/HostingerVM/vm-weekly-digest.sh
@@ -0,0 +1,160 @@
+#!/usr/bin/env bash
+# =============================================================================
+# vm-weekly-digest.sh — Weekly Telegram summary for srv1491630
+#
+# Queries Prometheus via the devops-backend container (which is on the same
+# Docker network as Prometheus), collects cleanup history, and sends a
+# formatted summary to Telegram.
+#
+# Runs via systemd timer every Monday at 08:00 UTC.
+# =============================================================================
+set -Eeuo pipefail
+
+TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"
+BACKEND_CONTAINER="devops-backend"
+PROM="http://learning_ai_common_plat-prometheus-1:9090"
+
+# ── Helpers ─────────────────────────────────────────────────────────────────
+
+log() { echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*" >&2; }
+
+prom_query() {
+    # Run a Prometheus instant query from inside the backend container.
+    # Prints the first result value, or exactly one "?" on failure (curl errors are masked so pipefail cannot add a second "?").
+    local query="$1"
+    { docker exec "$BACKEND_CONTAINER" \
+        curl -sf --max-time 10 \
+        "${PROM}/api/v1/query?$(printf 'query=%s' "$(python3 -c "import urllib.parse,sys; print(urllib.parse.quote(sys.argv[1]))" "$query")")" \
+        2>/dev/null || true; } \
+    | python3 -c "
+import json,sys
+try:
+    d=json.load(sys.stdin)
+    r=d['data']['result']
+    print(round(float(r[0]['value'][1]),1) if r else '?')
+except Exception:
+    print('?')
+" 2>/dev/null || echo "?"
+}
+
+prom_range_avg() {
+    # 7-day range_query at 1h step; prints the average of all values, or exactly one "?" on failure.
+    local query="$1"
+    local now step start
+    now=$(date +%s)
+    start=$(( now - 7 * 86400 ))
+    step="3600"
+    { docker exec "$BACKEND_CONTAINER" \
+        curl -sf --max-time 15 \
+        "${PROM}/api/v1/query_range?$(printf 'query=%s&start=%s&end=%s&step=%s' \
+            "$(python3 -c "import urllib.parse,sys; print(urllib.parse.quote(sys.argv[1]))" "$query")" \
+            "$start" "$now" "$step")" \
+        2>/dev/null || true; } \
+    | python3 -c "
+import json,sys
+try:
+    d=json.load(sys.stdin)
+    vals=[float(v) for s in d['data']['result'] for _,v in s['values']]
+    if vals: print(round(sum(vals)/len(vals),1))
+    else: print('?')
+except Exception:
+    print('?')
+" 2>/dev/null || echo "?"
+}
+
+# ── Check backend container is running ───────────────────────────────────────
+
+if ! docker ps --format '{{.Names}}' 2>/dev/null | grep -q "^${BACKEND_CONTAINER}$"; then
+    log "ERROR: ${BACKEND_CONTAINER} is not running — skipping weekly digest"
+    exit 1
+fi
+
+# ── Collect metrics ──────────────────────────────────────────────────────────
+
+log "Collecting 7-day metrics from Prometheus..."
+
+STEAL_AVG=$(prom_range_avg 'avg(rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100')
+DISK_NOW=$(prom_query '(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100')
+RAM_AVG=$(prom_range_avg 'node_memory_MemAvailable_bytes / 1073741824')
+SWAP_AVG=$(prom_range_avg '(node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / 1073741824')
+IO_AVG=$(prom_range_avg 'rate(node_disk_written_bytes_total{device="sda"}[5m]) * 3600 / 1073741824')
+
+# Unhealthy containers (current); "?" if docker itself fails (pipefail surfaces that through the assignment status)
+UNHEALTHY=$(docker ps --filter health=unhealthy --format '{{.Names}}' 2>/dev/null | wc -l) || UNHEALTHY="?"
+
+# Cleanup summary from log (last 7 days)
+CLEANUP_LOG="/var/log/vm-cleanup.log"
+CLEANUPS_THIS_WEEK=0
+if [[ -f "$CLEANUP_LOG" ]]; then
+    WEEK_AGO=$(date -u -d "7 days ago" '+%Y-%m-%dT' 2>/dev/null || date -u -v-7d '+%Y-%m-%dT' 2>/dev/null || true)
+    if [[ -n "$WEEK_AGO" ]]; then
+        CLEANUPS_THIS_WEEK=$(awk -v cutoff="$WEEK_AGO" '
+            /\[START\]/ { in_block=1 }
+            in_block && /\[[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]T/ {
+                ts = ""; if (match($0, /\[[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]T[0-9:Z]+\]/)) ts = substr($0, RSTART + 1, RLENGTH - 2)
+                if (ts >= cutoff) count++
+                in_block=0
+            }
+            END { print count+0 }
+        ' "$CLEANUP_LOG" 2>/dev/null || echo 0)
+    fi
+fi
+
+# ── Build Telegram message ────────────────────────────────────────────────────
+
+# Determine severity indicators: warning threshold first, critical second, so the higher severity wins
+steal_icon="✅"; [[ "$STEAL_AVG" != "?" ]] && (( $(echo "$STEAL_AVG > 5" | python3 -c "import sys; print(int(eval(sys.stdin.read())))") )) && steal_icon="⚠️"
+[[ "$STEAL_AVG" != "?" ]] && (( $(echo "$STEAL_AVG > 15" | python3 -c "import sys; print(int(eval(sys.stdin.read())))") )) && steal_icon="🚨"
+
+disk_icon="✅"; [[ "$DISK_NOW" != "?" ]] && (( $(echo "$DISK_NOW > 55" | python3 -c "import sys; print(int(eval(sys.stdin.read())))") )) && disk_icon="⚠️"
+[[ "$DISK_NOW" != "?" ]] && (( $(echo "$DISK_NOW > 70" | python3 -c "import sys; print(int(eval(sys.stdin.read())))") )) && disk_icon="🚨"
+
+ram_icon="✅"; [[ "$RAM_AVG" != "?" ]] && (( $(echo "$RAM_AVG < 3" | python3 -c "import sys; print(int(eval(sys.stdin.read())))") )) && ram_icon="⚠️"
+[[ "$RAM_AVG" != "?" ]] && (( $(echo "$RAM_AVG < 1" | python3 -c "import sys; print(int(eval(sys.stdin.read())))") )) && ram_icon="🚨"
+
+svc_icon="✅"; [[ "$UNHEALTHY" -gt 0 ]] 2>/dev/null && svc_icon="⚠️"
+[[ "$UNHEALTHY" -gt 5 ]] 2>/dev/null && svc_icon="🚨"
+
+io_icon="✅"; [[ "$IO_AVG" != "?" ]] && (( $(echo "$IO_AVG > 0.5" | python3 -c "import sys; print(int(eval(sys.stdin.read())))") )) && io_icon="⚠️"
+[[ "$IO_AVG" != "?" ]] && (( $(echo "$IO_AVG > 1.5" | python3 -c "import sys; print(int(eval(sys.stdin.read())))") )) && io_icon="🚨"
+
+WEEK_END=$(date -u '+%Y-%m-%d')
+WEEK_START=$(date -u -d "7 days ago" '+%Y-%m-%d' 2>/dev/null || date -u -v-7d '+%Y-%m-%d' 2>/dev/null || echo "N/A")
+
+MSG="📊 Weekly VM Digest — $(hostname)
+Week ${WEEK_START} → ${WEEK_END}
+
+${steal_icon} CPU Steal: ${STEAL_AVG}% avg
+${disk_icon} Disk: ${DISK_NOW}% used
+${ram_icon} RAM: ${RAM_AVG} GB free avg
+⏩ Swap: ${SWAP_AVG} GB avg
+${svc_icon} Containers: ${UNHEALTHY} unhealthy now
+${io_icon} Disk Writes: ${IO_AVG} GB/hr avg (sda total)
+🧹 Cleanups: ${CLEANUPS_THIS_WEEK} this week
+
+Dashboard: https://devops.bytelyst.com"
+
+# ── Send Telegram (form fields are URL-encoded: the message contains %, newlines and non-ASCII) ──
+
+TELEGRAM_TOKEN=""
+TELEGRAM_CHAT_ID=""
+if [[ -f "$TOKEN_FILE" ]]; then
+    TELEGRAM_TOKEN=$(grep -oP '(?<=TELEGRAM_BOT_TOKEN=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
+    TELEGRAM_CHAT_ID=$(grep -oP '(?<=TELEGRAM_CHAT_ID=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
+fi
+
+if [[ -z "$TELEGRAM_TOKEN" || -z "$TELEGRAM_CHAT_ID" ]]; then
+    log "No Telegram credentials — printing digest to stdout:"
+    echo "$MSG"
+    exit 0
+fi
+
+log "Sending weekly digest to Telegram..."
+if curl -sf -X POST "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
+    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
+    --data-urlencode "text=${MSG}" > /dev/null; then
+    log "Weekly digest sent"
+else
+    log "ERROR: Telegram send failed"
+    exit 1
+fi
diff --git a/systemd/vm-weekly-digest.service b/systemd/vm-weekly-digest.service
new file mode 100644
index 0000000..097e38c
--- /dev/null
+++ b/systemd/vm-weekly-digest.service
@@ -0,0 +1,11 @@
+[Unit]
+Description=Send weekly VM health digest via Telegram
+After=docker.service network-online.target
+Requires=docker.service
+
+[Service]
+Type=oneshot
+User=root
+Group=root
+Environment="HERMES_HOME=/root/.hermes"
+ExecStart=/usr/local/bin/vm-weekly-digest.sh
diff --git a/systemd/vm-weekly-digest.timer b/systemd/vm-weekly-digest.timer
new file mode 100644
index 0000000..83f8524
--- /dev/null
+++ b/systemd/vm-weekly-digest.timer
@@ -0,0 +1,11 @@
+[Unit]
+Description=Run weekly VM Telegram digest every Monday 08:00 UTC
+After=docker.service
+
+[Timer]
+OnCalendar=Mon 08:00 UTC
+AccuracySec=5min
+Persistent=true
+
+[Install]
+WantedBy=timers.target