feat(dashboard/vm): Phases 4.1-4.3 — Prometheus trends, sparklines, weekly digest
- prometheus.ts: new Prometheus client with 7d/30d range queries for disk, memory, swap, CPU steal, and disk I/O (GB/hr); getWeeklyDigestData() aggregates all metrics for the digest and the API endpoint
- routes.ts: GET /api/vm/metrics/trend?metric=…&range=… and GET /api/vm/weekly-digest endpoints
- api.ts: TrendPoint/TrendSeries types; getTrend() and getMemoryTrend() added to vmApi
- vm/page.tsx: Sparkline (pure SVG polyline+fill), TrendCard with latest/avg/peak and threshold colouring, TrendsPanel with lazy load on first open; Promise.allSettled() isolation for all 5 data panels
- vm-weekly-digest.sh: weekly Telegram digest via docker exec into devops-backend to reach Prometheus; emoji severity indicators; cron summary from /var/log/vm-cleanup.log
- systemd timer: Mon 08:00 UTC, Persistent=true (fires on next boot if missed); first trigger 2026-06-02

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
9a073ef225
commit
8d32cb7980
172
dashboard/backend/src/modules/vm/prometheus.ts
Normal file
172
dashboard/backend/src/modules/vm/prometheus.ts
Normal file
@ -0,0 +1,172 @@
|
|||||||
|
// Base URL of the Prometheus HTTP API. The PROMETHEUS_URL environment variable
// overrides the default, which addresses the Prometheus container by hostname.
const PROMETHEUS_BASE =
  process.env.PROMETHEUS_URL ?? 'http://learning_ai_common_plat-prometheus-1:9090';
|
||||||
|
|
||||||
|
// ── Types ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** One sample of a trend series. */
export interface TrendPoint {
  t: number; // unix ms
  v: number; // sample value, expressed in the unit of the series it belongs to
}
|
||||||
|
|
||||||
|
/** A named time series together with its summary statistics. */
export interface TrendSeries {
  metric: string; // series identifier, e.g. 'disk', 'steal', 'ram_available'
  unit: string; // display unit, e.g. '%', 'GB', 'GB/hr'
  points: TrendPoint[]; // samples in ascending time order
  latest: number; // value of the last point (0 when there are no points)
  avg: number; // arithmetic mean of all points
  peak: number; // maximum of all points
}
|
||||||
|
|
||||||
|
/** Result of the memory trend query: two series covering the same time window. */
export interface MemoryTrend {
  available: TrendSeries; // available RAM in GB
  swap: TrendSeries; // swap in use in GB
}
|
||||||
|
|
||||||
|
// ── Internal helpers ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Translates a range label into a Prometheus query window that ends now.
 *
 * '30d' yields a 30-day window sampled every 4 hours; any other label falls
 * back to a 7-day window sampled every hour.
 *
 * @param rangeStr range label ('7d' or '30d')
 * @returns start/end as unix milliseconds plus the Prometheus step string
 */
function rangeParams(rangeStr: string): { start: number; end: number; step: string } {
  const MS_PER_DAY = 86_400_000;
  const isMonthly = rangeStr === '30d';
  const windowDays = isMonthly ? 30 : 7;
  const now = Date.now();

  return {
    start: now - windowDays * MS_PER_DAY,
    end: now,
    step: isMonthly ? '4h' : '1h',
  };
}
|
||||||
|
|
||||||
|
/**
 * Runs a Prometheus `query_range` request and flattens the answer into one
 * list of points.
 *
 * @param query PromQL expression
 * @param start window start in unix milliseconds (sent to Prometheus as whole seconds)
 * @param end   window end in unix milliseconds (sent to Prometheus as whole seconds)
 * @param step  Prometheus step string, e.g. '1h'
 * @returns points with `t` in unix milliseconds; an empty list when Prometheus
 *          returns no series. When several series come back they are averaged
 *          per timestamp and returned in ascending time order.
 * @throws Error when the HTTP status is not OK or the response status is not 'success'
 */
async function queryRange(query: string, start: number, end: number, step: string): Promise<TrendPoint[]> {
  const toEpochSeconds = (ms: number) => String(Math.floor(ms / 1000));

  const endpoint = new URL(`${PROMETHEUS_BASE}/api/v1/query_range`);
  endpoint.searchParams.set('query', query);
  endpoint.searchParams.set('start', toEpochSeconds(start));
  endpoint.searchParams.set('end', toEpochSeconds(end));
  endpoint.searchParams.set('step', step);

  // Abort the request if Prometheus does not answer within 15 seconds.
  const response = await fetch(endpoint.toString(), {
    signal: AbortSignal.timeout(15_000),
  });
  if (!response.ok) {
    throw new Error(`Prometheus ${response.status}: ${response.statusText}`);
  }

  const payload = (await response.json()) as {
    status: string;
    error?: string;
    data?: { result: Array<{ values: [number, string][] }> };
  };
  if (payload.status !== 'success') {
    throw new Error(`Prometheus: ${payload.error ?? 'unknown error'}`);
  }

  const seriesList = payload.data?.result ?? [];
  if (seriesList.length === 0) return [];

  if (seriesList.length === 1) {
    // Single series: convert the samples directly, dropping unparsable values.
    const samples: TrendPoint[] = [];
    for (const [seconds, raw] of seriesList[0].values) {
      const value = parseFloat(raw);
      if (!isNaN(value)) samples.push({ t: seconds * 1000, v: value });
    }
    return samples;
  }

  // Multi-series (e.g. per-CPU steal) → average by timestamp
  const totals = new Map<number, { sum: number; count: number }>();
  for (const { values } of seriesList) {
    for (const [seconds, raw] of values) {
      const value = parseFloat(raw);
      if (isNaN(value)) continue;

      const ms = seconds * 1000;
      let slot = totals.get(ms);
      if (!slot) {
        slot = { sum: 0, count: 0 };
        totals.set(ms, slot);
      }
      slot.sum += value;
      slot.count += 1;
    }
  }

  const averaged: TrendPoint[] = [];
  for (const [t, { sum, count }] of totals) {
    averaged.push({ t, v: sum / count });
  }
  averaged.sort((left, right) => left.t - right.t);
  return averaged;
}
|
||||||
|
|
||||||
|
/**
 * Derives the summary statistics of a series.
 *
 * @param points samples in time order
 * @returns the last value, the arithmetic mean and the maximum, each rounded
 *          to one decimal place; all zeros for an empty list
 */
function summarize(points: TrendPoint[]): Pick<TrendSeries, 'latest' | 'avg' | 'peak'> {
  if (points.length === 0) return { latest: 0, avg: 0, peak: 0 };

  const toOneDecimal = (n: number) => Math.round(n * 10) / 10;

  let total = 0;
  let highest = -Infinity;
  for (const { v } of points) {
    total += v;
    highest = Math.max(highest, v);
  }

  return {
    latest: toOneDecimal(points[points.length - 1].v),
    avg: toOneDecimal(total / points.length),
    peak: toOneDecimal(highest),
  };
}
|
||||||
|
|
||||||
|
// ── Public trend queries ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Trend of root-filesystem usage as a percentage.
 *
 * @param range '7d' (default) or '30d'
 * @returns series with metric 'disk' and unit '%'
 * @throws Error propagated from the Prometheus query
 */
export async function getDiskTrend(range = '7d'): Promise<TrendSeries> {
  const usedPercentQuery =
    '(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100';

  const window = rangeParams(range);
  const points = await queryRange(usedPercentQuery, window.start, window.end, window.step);
  const stats = summarize(points);

  return { metric: 'disk', unit: '%', points, ...stats };
}
|
||||||
|
|
||||||
|
/**
 * Trends of available RAM and swap in use, both in GB (bytes / 2^30).
 * The two queries run in parallel over the same window.
 *
 * @param range '7d' (default) or '30d'
 * @returns the 'ram_available' and 'swap_used' series
 * @throws Error propagated from either Prometheus query
 */
export async function getMemoryTrend(range = '7d'): Promise<MemoryTrend> {
  const availableQuery = 'node_memory_MemAvailable_bytes / 1073741824';
  const swapUsedQuery = '(node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / 1073741824';

  const window = rangeParams(range);
  const run = (promql: string) => queryRange(promql, window.start, window.end, window.step);
  const [availablePoints, swapPoints] = await Promise.all([run(availableQuery), run(swapUsedQuery)]);

  const available: TrendSeries = {
    metric: 'ram_available',
    unit: 'GB',
    points: availablePoints,
    ...summarize(availablePoints),
  };
  const swap: TrendSeries = {
    metric: 'swap_used',
    unit: 'GB',
    points: swapPoints,
    ...summarize(swapPoints),
  };
  return { available, swap };
}
|
||||||
|
|
||||||
|
/**
 * Trend of CPU steal time as a percentage.
 *
 * @param range '7d' (default) or '30d'
 * @returns series with metric 'steal' and unit '%'
 * @throws Error propagated from the Prometheus query
 */
export async function getStealTrend(range = '7d'): Promise<TrendSeries> {
  // avg() collapses the per-CPU rates so a multi-CPU host yields one % value.
  const stealPercentQuery = 'avg(rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100';

  const window = rangeParams(range);
  const points = await queryRange(stealPercentQuery, window.start, window.end, window.step);
  const stats = summarize(points);

  return { metric: 'steal', unit: '%', points, ...stats };
}
|
||||||
|
|
||||||
|
/**
 * Trend of the block write rate of device sda, in GB per hour.
 *
 * @param range '7d' (default) or '30d'
 * @returns series with metric 'disk_io_write' and unit 'GB/hr'
 * @throws Error propagated from the Prometheus query
 */
export async function getIoTrend(range = '7d'): Promise<TrendSeries> {
  // Total VM block write rate in GB/hr (sda = primary disk).
  // cAdvisor does not expose per-container blkio in this setup, so the
  // node-exporter metric is used instead; it covers all processes,
  // including invttrdg.
  const writeRateQuery = 'rate(node_disk_written_bytes_total{device="sda"}[5m]) * 3600 / 1073741824';

  const window = rangeParams(range);
  const points = await queryRange(writeRateQuery, window.start, window.end, window.step);
  const stats = summarize(points);

  return { metric: 'disk_io_write', unit: 'GB/hr', points, ...stats };
}
|
||||||
|
|
||||||
|
// ── Weekly digest summary (used by digest endpoint + cron) ─────────────────
|
||||||
|
|
||||||
|
/** Seven-day summary returned by getWeeklyDigestData(). */
export interface WeeklyDigestData {
  period: { from: string; to: string }; // ISO-8601 timestamps bounding the 7-day window
  steal: { avg: number; peak: number }; // CPU steal, %
  disk: { latest: number; peak: number }; // root filesystem usage, %
  ram: { avg: number; low: number }; // available RAM in GB; low is the minimum sample
  swap: { avg: number; peak: number }; // swap in use, GB
  io: { avg: number; peak: number }; // sda write rate, GB/hr
}
|
||||||
|
|
||||||
|
/**
 * Collects the 7-day disk, memory, CPU-steal and disk-write trends in parallel
 * and condenses them into the weekly digest summary.
 *
 * Every number in the result is rounded to one decimal place: `ram.low` is
 * rounded here, and `swap.peak` reuses the already-rounded peak of the swap
 * series instead of recomputing an unrounded maximum.
 *
 * @returns the digest summary; `period` spans the 7 days ending when the
 *          queries completed
 * @throws Error propagated from any of the underlying Prometheus queries
 */
export async function getWeeklyDigestData(): Promise<WeeklyDigestData> {
  const [diskData, memData, stealData, ioData] = await Promise.all([
    getDiskTrend('7d'),
    getMemoryTrend('7d'),
    getStealTrend('7d'),
    getIoTrend('7d'),
  ]);

  const to = new Date();
  const from = new Date(to.getTime() - 7 * 86_400_000);

  // The series summary has no minimum, so derive the low-water mark here and
  // round it the same way the other statistics are rounded.
  const ramVals = memData.available.points.map(p => p.v);
  const ramLow = ramVals.length ? Math.round(Math.min(...ramVals) * 10) / 10 : 0;

  return {
    period: { from: from.toISOString(), to: to.toISOString() },
    steal: { avg: stealData.avg, peak: stealData.peak },
    disk: { latest: diskData.latest, peak: diskData.peak },
    ram: { avg: memData.available.avg, low: ramLow },
    swap: { avg: memData.swap.avg, peak: memData.swap.peak },
    io: { avg: ioData.avg, peak: ioData.peak },
  };
}
|
||||||
@ -10,6 +10,13 @@ import {
|
|||||||
getOllamaModels,
|
getOllamaModels,
|
||||||
unloadOllamaModel,
|
unloadOllamaModel,
|
||||||
} from './repository.js';
|
} from './repository.js';
|
||||||
|
import {
|
||||||
|
getDiskTrend,
|
||||||
|
getMemoryTrend,
|
||||||
|
getStealTrend,
|
||||||
|
getIoTrend,
|
||||||
|
getWeeklyDigestData,
|
||||||
|
} from './prometheus.js';
|
||||||
import { VmCleanupParamsSchema, VmContainerRestartParamsSchema } from './types.js';
|
import { VmCleanupParamsSchema, VmContainerRestartParamsSchema } from './types.js';
|
||||||
|
|
||||||
export async function vmRoutes(fastify: FastifyInstance) {
|
export async function vmRoutes(fastify: FastifyInstance) {
|
||||||
@ -127,4 +134,39 @@ export async function vmRoutes(fastify: FastifyInstance) {
|
|||||||
return reply.code(500).send({ error: error.message || 'Unload failed' });
|
return reply.code(500).send({ error: error.message || 'Unload failed' });
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// ── Prometheus trend queries ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
// GET /api/vm/metrics/trend?metric=disk|memory|steal|io&range=7d|30d
// Admin only. `metric` defaults to 'disk'; any range other than '30d' is
// treated as '7d'. Unknown metrics answer 400, query failures answer 502.
fastify.get('/vm/metrics/trend', {
  preHandler: async (req) => requireAdmin(req),
}, async (req, reply) => {
  const params = req.query as Record<string, string>;
  const metric = params.metric === undefined ? 'disk' : params.metric;
  const range = params.range === undefined ? '7d' : params.range;
  const validRange = range === '30d' ? '30d' : '7d';

  // Lookup table from metric name to the function that loads its trend.
  const loaders = new Map<string, (r: string) => Promise<unknown>>([
    ['disk', getDiskTrend],
    ['steal', getStealTrend],
    ['io', getIoTrend],
    ['memory', getMemoryTrend],
  ]);

  try {
    const load = loaders.get(metric);
    if (!load) {
      return reply.code(400).send({ error: `Unknown metric: ${metric}` });
    }
    return reply.send(await load(validRange));
  } catch (error: any) {
    fastify.log.error(error, 'Prometheus trend query failed');
    return reply.code(502).send({ error: 'Prometheus unavailable' });
  }
});
|
||||||
|
|
||||||
|
// GET /api/vm/weekly-digest — 7-day summary for Telegram digest
// Admin only. Any failure while gathering the data is logged and answered
// with 502.
fastify.get('/vm/weekly-digest', {
  preHandler: async (req) => requireAdmin(req),
}, async (_req, reply) => {
  try {
    const digest = await getWeeklyDigestData();
    return reply.send(digest);
  } catch (error: any) {
    fastify.log.error(error, 'Weekly digest data failed');
    return reply.code(502).send({ error: 'Prometheus unavailable' });
  }
});
|
||||||
}
|
}
|
||||||
|
|||||||
@ -9,6 +9,7 @@ import {
|
|||||||
type CronStatusResponse,
|
type CronStatusResponse,
|
||||||
type UnhealthyContainer,
|
type UnhealthyContainer,
|
||||||
type OllamaModelsResponse,
|
type OllamaModelsResponse,
|
||||||
|
type TrendSeries,
|
||||||
} from '@/lib/api';
|
} from '@/lib/api';
|
||||||
import {
|
import {
|
||||||
CheckCircle,
|
CheckCircle,
|
||||||
@ -33,6 +34,8 @@ import {
|
|||||||
Shield,
|
Shield,
|
||||||
Zap,
|
Zap,
|
||||||
MemoryStick,
|
MemoryStick,
|
||||||
|
TrendingUp,
|
||||||
|
AlertCircle,
|
||||||
} from 'lucide-react';
|
} from 'lucide-react';
|
||||||
|
|
||||||
// ── Types ──────────────────────────────────────────────────────────────────
|
// ── Types ──────────────────────────────────────────────────────────────────
|
||||||
@ -513,6 +516,256 @@ function OllamaPanel({
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Trend charts ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Minimal SVG sparkline: a polyline over the samples, an optional translucent
 * area fill beneath it, and a dot on the most recent sample.
 *
 * Points are spaced evenly by index (timestamps are not used for the x axis).
 * The y axis spans `min`..`max` when given, otherwise the data's own extent.
 * With fewer than two points a plain placeholder box is rendered.
 */
function Sparkline({
  points,
  color = '#3b82f6',
  fillColor,
  min: minOverride,
  max: maxOverride,
  height = 48,
}: {
  points: { t: number; v: number }[];
  color?: string;
  fillColor?: string;
  min?: number;
  max?: number;
  height?: number;
}) {
  if (points.length < 2) {
    return <div className="w-full bg-gray-50 rounded" style={{ height }} />;
  }

  const VIEW_WIDTH = 400;
  const values = points.map(p => p.v);
  const lastIndex = values.length - 1;

  const floor = minOverride ?? Math.min(...values);
  const ceiling = maxOverride ?? Math.max(...values);
  // A flat series would give a zero span; fall back to 1 to avoid dividing by zero.
  const span = ceiling - floor || 1;

  const xAt = (index: number) => (index / lastIndex) * VIEW_WIDTH;
  // Keeps a 2px margin at the top and bottom of the viewBox.
  const yAt = (value: number) => height - ((value - floor) / span) * (height - 4) - 2;

  const vertices = values.map((value, index) => `${xAt(index)},${yAt(value)}`);
  const linePoints = vertices.join(' ');
  // Closed shape: down-left corner, along the line, down-right corner.
  const areaPath =
    `M${xAt(0)},${height} ` +
    vertices.map(vertex => `L${vertex}`).join(' ') +
    ` L${xAt(lastIndex)},${height} Z`;

  return (
    <svg
      viewBox={`0 0 ${VIEW_WIDTH} ${height}`}
      className="w-full"
      style={{ height }}
      preserveAspectRatio="none"
    >
      {fillColor && <path d={areaPath} fill={fillColor} opacity="0.15" />}
      <polyline points={linePoints} fill="none" stroke={color} strokeWidth="1.5" strokeLinejoin="round" />
      {/* Latest value dot */}
      <circle cx={xAt(lastIndex)} cy={yAt(values[lastIndex])} r="3" fill={color} />
    </svg>
  );
}
|
||||||
|
|
||||||
|
/**
 * Card for one trend series: title, latest value (coloured by threshold),
 * sparkline, and an avg / peak / point-count footer.
 *
 * A null or undefined `series` renders a pulsing skeleton instead.
 * With `higherIsBetter`, a value counts as breaching a threshold when it is
 * below it; otherwise when it is at or above it. The critical threshold takes
 * precedence over the warning threshold.
 */
function TrendCard({
  title,
  series,
  color,
  fillColor,
  unit,
  warnThreshold,
  critThreshold,
  higherIsBetter = false,
  min,
  max,
  note,
}: {
  title: string;
  series: TrendSeries | null | undefined;
  color: string;
  fillColor: string;
  unit: string;
  warnThreshold?: number;
  critThreshold?: number;
  higherIsBetter?: boolean;
  min?: number;
  max?: number;
  note?: string;
}) {
  if (!series) {
    return (
      <div className="bg-white border border-gray-200 rounded-lg p-4 space-y-2">
        <p className="text-sm font-medium text-gray-500">{title}</p>
        <div className="h-12 bg-gray-50 rounded animate-pulse" />
      </div>
    );
  }

  const { latest, avg, peak, points } = series;

  // True when `value` is on the wrong side of `threshold`; an unset threshold never trips.
  const breaches = (threshold: number | undefined, value: number): boolean => {
    if (threshold === undefined) return false;
    return higherIsBetter ? value < threshold : value >= threshold;
  };

  let latestColor = 'text-gray-900';
  if (breaches(critThreshold, latest)) {
    latestColor = 'text-red-600';
  } else if (breaches(warnThreshold, latest)) {
    latestColor = 'text-yellow-600';
  }

  // One decimal below 10, whole numbers from 10 upwards.
  const fmt = (v: number) => {
    if (v < 10) return v.toFixed(1);
    return Math.round(v).toString();
  };

  return (
    <div className="bg-white border border-gray-200 rounded-lg p-4 space-y-2">
      <div className="flex items-center justify-between">
        <p className="text-sm font-medium text-gray-600">{title}</p>
        <span className={`text-lg font-bold tabular-nums ${latestColor}`}>
          {fmt(latest)}<span className="text-xs font-normal text-gray-400 ml-0.5">{unit}</span>
        </span>
      </div>
      <Sparkline points={points} color={color} fillColor={fillColor} min={min} max={max} />
      <div className="flex items-center justify-between text-xs text-gray-400">
        <span>avg {fmt(avg)}{unit}</span>
        <span>peak {fmt(peak)}{unit}</span>
        <span>{points.length}pts · 7d</span>
      </div>
      {note && (
        <p className="text-xs text-yellow-700 bg-yellow-50 rounded px-2 py-1">{note}</p>
      )}
    </div>
  );
}
|
||||||
|
|
||||||
|
/** The five series shown in the trends panel; null while a series is not available. */
interface TrendsData {
  disk: TrendSeries | null;
  steal: TrendSeries | null;
  ram: TrendSeries | null;
  swap: TrendSeries | null;
  io: TrendSeries | null;
}
|
||||||
|
|
||||||
|
/**
 * Collapsible "7-Day Trends" panel holding one TrendCard per series.
 *
 * `onOpen` is invoked every time the panel goes from closed to open, so the
 * parent decides whether data needs to be fetched. While `loading` is true
 * five placeholder tiles are shown. An "I/O anomaly" badge appears in the
 * header, and a note on the disk-writes card, when the average write rate
 * exceeds 0.5 GB/hr.
 */
function TrendsPanel({
  data,
  loading,
  onOpen,
}: {
  data: TrendsData;
  loading: boolean;
  onOpen: () => void;
}) {
  const [open, setOpen] = useState(false);

  const handleToggle = () => {
    const willOpen = !open;
    setOpen(willOpen);
    if (willOpen) onOpen();
  };

  const ioElevated = data.io ? data.io.avg > 0.5 : false;
  const ioNote = data.io && ioElevated
    ? `VM writes avg ${data.io.avg.toFixed(1)} GB/hr (${(data.io.avg * 24).toFixed(0)} GB/day) — investigate source if sustained`
    : undefined;

  // Declarative description of the five cards, in display order.
  const cards: React.ComponentProps<typeof TrendCard>[] = [
    {
      title: 'Disk Used %',
      series: data.disk,
      color: '#ef4444',
      fillColor: '#ef4444',
      unit: '%',
      warnThreshold: 55,
      critThreshold: 70,
      min: 0,
      max: 100,
    },
    {
      title: 'RAM Available',
      series: data.ram,
      color: '#3b82f6',
      fillColor: '#3b82f6',
      unit: 'GB',
      warnThreshold: 3,
      critThreshold: 1,
      higherIsBetter: true,
      min: 0,
    },
    {
      title: 'Swap Used',
      series: data.swap,
      color: '#f59e0b',
      fillColor: '#f59e0b',
      unit: 'GB',
      warnThreshold: 1.5,
      critThreshold: 3,
      min: 0,
    },
    {
      title: 'CPU Steal %',
      series: data.steal,
      color: '#8b5cf6',
      fillColor: '#8b5cf6',
      unit: '%',
      warnThreshold: 5,
      critThreshold: 15,
      min: 0,
    },
    {
      title: 'VM Disk Writes (sda)',
      series: data.io,
      color: '#f97316',
      fillColor: '#f97316',
      unit: 'GB/hr',
      warnThreshold: 0.5,
      critThreshold: 1.5,
      min: 0,
      note: ioNote,
    },
  ];

  const gridClass = 'grid grid-cols-1 sm:grid-cols-2 xl:grid-cols-3 gap-4';

  return (
    <div className="bg-white border border-gray-200 rounded-lg overflow-hidden">
      <button
        className="w-full flex items-center justify-between px-6 py-4 text-left hover:bg-gray-50"
        onClick={handleToggle}
      >
        <div className="flex items-center gap-2">
          <TrendingUp className="w-5 h-5 text-gray-500" />
          <span className="font-semibold text-gray-900">7-Day Trends</span>
          <span className="text-xs text-gray-400">Prometheus · 1h resolution</span>
          {ioElevated && (
            <span className="flex items-center gap-1 text-xs text-yellow-700 bg-yellow-50 border border-yellow-200 px-2 py-0.5 rounded-full">
              <AlertCircle className="w-3 h-3" /> I/O anomaly
            </span>
          )}
        </div>
        {open
          ? <ChevronUp className="w-4 h-4 text-gray-400" />
          : <ChevronDown className="w-4 h-4 text-gray-400" />}
      </button>

      {open && (
        <div className="border-t border-gray-100 p-4">
          {loading ? (
            <div className={gridClass}>
              {Array.from({ length: 5 }).map((_, i) => (
                <div key={i} className="bg-gray-50 rounded-lg h-28 animate-pulse" />
              ))}
            </div>
          ) : (
            <div className={gridClass}>
              {cards.map(card => (
                <TrendCard key={card.title} {...card} />
              ))}
            </div>
          )}
        </div>
      )}
    </div>
  );
}
|
||||||
|
|
||||||
// ── Check card meta ────────────────────────────────────────────────────────
|
// ── Check card meta ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
const CHECK_META: Record<string, { label: string; icon: React.ElementType }> = {
|
const CHECK_META: Record<string, { label: string; icon: React.ElementType }> = {
|
||||||
@ -559,6 +812,10 @@ export default function VmHealthPage() {
|
|||||||
|
|
||||||
const [unloading, setUnloading] = useState<Set<string>>(new Set());
|
const [unloading, setUnloading] = useState<Set<string>>(new Set());
|
||||||
|
|
||||||
|
const [trends, setTrends] = useState<TrendsData>({ disk: null, steal: null, ram: null, swap: null, io: null });
|
||||||
|
const [trendsLoading, setTrendsLoading] = useState(false);
|
||||||
|
const [trendsLoaded, setTrendsLoaded] = useState(false);
|
||||||
|
|
||||||
const [showLog, setShowLog] = useState(false);
|
const [showLog, setShowLog] = useState(false);
|
||||||
const [lastRefreshed, setLastRefreshed] = useState<Date | null>(null);
|
const [lastRefreshed, setLastRefreshed] = useState<Date | null>(null);
|
||||||
|
|
||||||
@ -595,6 +852,29 @@ export default function VmHealthPage() {
|
|||||||
|
|
||||||
const handleRefresh = () => { setRefreshing(true); loadAll(); };
|
const handleRefresh = () => { setRefreshing(true); loadAll(); };
|
||||||
|
|
||||||
|
// Fetches the four 7-day trend endpoints in parallel and stores the results.
// Promise.allSettled isolates the requests from each other: a request that
// fails leaves its series as null while the others are still displayed.
const loadTrends = useCallback(async () => {
  if (trendsLoading) return; // a load is already in flight
  setTrendsLoading(true);
  try {
    const [diskRes, stealRes, memRes, ioRes] = await Promise.allSettled([
      vmApi.getTrend('disk', '7d'),
      vmApi.getTrend('steal', '7d'),
      vmApi.getMemoryTrend('7d'),
      vmApi.getTrend('io', '7d'),
    ]);
    setTrends({
      disk: diskRes.status === 'fulfilled' ? diskRes.value : null,
      steal: stealRes.status === 'fulfilled' ? stealRes.value : null,
      // The memory endpoint returns both series in one response.
      ram: memRes.status === 'fulfilled' ? memRes.value.available : null,
      swap: memRes.status === 'fulfilled' ? memRes.value.swap : null,
      io: ioRes.status === 'fulfilled' ? ioRes.value : null,
    });
    // Mark the data as loaded only if at least one request succeeded. When
    // every request failed, leaving the flag false lets the next panel open
    // retry instead of showing empty skeleton cards indefinitely.
    const anySucceeded = [diskRes, stealRes, memRes, ioRes].some(
      result => result.status === 'fulfilled',
    );
    setTrendsLoaded(anySucceeded);
  } finally {
    setTrendsLoading(false);
  }
}, [trendsLoading]);
|
||||||
|
|
||||||
const handleCleanup = async (mode: 'weekly' | 'monthly' | 'dry-run') => {
|
const handleCleanup = async (mode: 'weekly' | 'monthly' | 'dry-run') => {
|
||||||
const msg =
|
const msg =
|
||||||
mode === 'monthly' ? 'Run MONTHLY full cleanup? This removes build cache, pnpm store, old logs, and HOLD node_modules.' :
|
mode === 'monthly' ? 'Run MONTHLY full cleanup? This removes build cache, pnpm store, old logs, and HOLD node_modules.' :
|
||||||
@ -787,6 +1067,13 @@ export default function VmHealthPage() {
|
|||||||
unloading={unloading}
|
unloading={unloading}
|
||||||
/>
|
/>
|
||||||
|
|
||||||
|
{/* ── 7-day trends ── */}
|
||||||
|
<TrendsPanel
|
||||||
|
data={trends}
|
||||||
|
loading={trendsLoading}
|
||||||
|
onOpen={() => { if (!trendsLoaded && !trendsLoading) loadTrends(); }}
|
||||||
|
/>
|
||||||
|
|
||||||
{/* ── Cleanup section ── */}
|
{/* ── Cleanup section ── */}
|
||||||
<div className="bg-white border border-gray-200 rounded-lg p-6">
|
<div className="bg-white border border-gray-200 rounded-lg p-6">
|
||||||
<div className="flex items-center gap-3 mb-5">
|
<div className="flex items-center gap-3 mb-5">
|
||||||
|
|||||||
@ -512,8 +512,25 @@ export const vmApi = {
|
|||||||
`/api/vm/ollama/models/${encodeURIComponent(name)}`,
|
`/api/vm/ollama/models/${encodeURIComponent(name)}`,
|
||||||
{ method: 'DELETE' },
|
{ method: 'DELETE' },
|
||||||
),
|
),
|
||||||
|
getTrend: (metric: 'disk' | 'steal' | 'io', range: '7d' | '30d') =>
|
||||||
|
apiRequest<TrendSeries>(`/api/vm/metrics/trend?metric=${metric}&range=${range}`),
|
||||||
|
getMemoryTrend: (range: '7d' | '30d') =>
|
||||||
|
apiRequest<{ available: TrendSeries; swap: TrendSeries }>(
|
||||||
|
`/api/vm/metrics/trend?metric=memory&range=${range}`,
|
||||||
|
),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/** One sample of a trend series: `t` is the timestamp (unix ms), `v` the value. */
export interface TrendPoint { t: number; v: number }
|
||||||
|
|
||||||
|
/** A trend series as returned by GET /api/vm/metrics/trend. */
export interface TrendSeries {
  metric: string; // series identifier
  unit: string; // display unit
  points: TrendPoint[]; // samples of the series
  latest: number; // most recent value
  avg: number; // mean over the window
  peak: number; // maximum over the window
}
|
||||||
|
|
||||||
// Auth API - calls platform-service for authentication
|
// Auth API - calls platform-service for authentication
|
||||||
export interface LoginRequest {
|
export interface LoginRequest {
|
||||||
email: string;
|
email: string;
|
||||||
|
|||||||
160
scripts/VMs/HostingerVM/vm-weekly-digest.sh
Executable file
160
scripts/VMs/HostingerVM/vm-weekly-digest.sh
Executable file
@ -0,0 +1,160 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# =============================================================================
|
||||||
|
# vm-weekly-digest.sh — Weekly Telegram summary for srv1491630
|
||||||
|
#
|
||||||
|
# Queries Prometheus via the devops-backend container (which is on the same
|
||||||
|
# Docker network as Prometheus), collects cleanup history, and sends a
|
||||||
|
# formatted summary to Telegram.
|
||||||
|
#
|
||||||
|
# Runs via systemd timer every Monday at 08:00 UTC.
|
||||||
|
# =============================================================================
|
||||||
|
# Strict mode: exit on errors (-e), treat unset variables as errors (-u),
# fail a pipeline if any stage fails (pipefail), inherit ERR traps (-E).
set -Eeuo pipefail

# Env file that is searched for TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID.
TOKEN_FILE="${HERMES_HOME:-/root/.hermes}/.env"
# Container used to run curl against Prometheus.
BACKEND_CONTAINER="devops-backend"
# Prometheus base URL as seen from inside that container.
PROM="http://learning_ai_common_plat-prometheus-1:9090"
|
||||||
|
|
||||||
|
# ── Helpers ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Write a UTC-timestamped message to stderr, keeping stdout free for data.
log() {
  local stamp
  stamp=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
  printf '[%s] %s\n' "$stamp" "$*" >&2
}
|
||||||
|
|
||||||
|
prom_query() {
  # Run a Prometheus instant query from inside the backend container.
  # Prints the first result value rounded to one decimal, or exactly one "?"
  # on any failure. Always returns 0 so callers under `set -e` keep running.
  #
  # The HTTP response is captured before it is parsed. Piping curl straight
  # into the parser under `pipefail` made a failed request print "?" twice:
  # once from the parser and once from the `|| echo` fallback.
  local query="$1"
  local encoded response

  encoded=$(python3 -c "import urllib.parse,sys; print(urllib.parse.quote(sys.argv[1]))" "$query" 2>/dev/null) \
    || { echo "?"; return 0; }

  response=$(docker exec "$BACKEND_CONTAINER" \
    curl -sf --max-time 10 \
    "${PROM}/api/v1/query?query=${encoded}" \
    2>/dev/null) || { echo "?"; return 0; }

  python3 -c "
import json,sys
try:
    d=json.load(sys.stdin)
    r=d['data']['result']
    print(round(float(r[0]['value'][1]),1) if r else '?')
except Exception:
    print('?')
" <<<"$response" 2>/dev/null || echo "?"
}
|
||||||
|
|
||||||
|
prom_range_avg() {
  # 7-day range query at a 1-hour step; prints the average of all returned
  # values (across every series) rounded to one decimal, or exactly one "?"
  # on any failure. Always returns 0.
  #
  # The HTTP response is captured before it is parsed so that a failed
  # request cannot produce two "?" lines under `pipefail`.
  local query="$1"
  local now start step encoded response
  now=$(date +%s)
  start=$(( now - 7 * 86400 ))
  step="3600"

  encoded=$(python3 -c "import urllib.parse,sys; print(urllib.parse.quote(sys.argv[1]))" "$query" 2>/dev/null) \
    || { echo "?"; return 0; }

  response=$(docker exec "$BACKEND_CONTAINER" \
    curl -sf --max-time 15 \
    "${PROM}/api/v1/query_range?query=${encoded}&start=${start}&end=${now}&step=${step}" \
    2>/dev/null) || { echo "?"; return 0; }

  python3 -c "
import json,sys
try:
    d=json.load(sys.stdin)
    vals=[float(v) for s in d['data']['result'] for _,v in s['values']]
    if vals: print(round(sum(vals)/len(vals),1))
    else: print('?')
except Exception:
    print('?')
" <<<"$response" 2>/dev/null || echo "?"
}
|
||||||
|
|
||||||
|
# ── Check backend container is running ───────────────────────────────────────
|
||||||
|
|
||||||
|
# Abort early when the backend container is not running: every Prometheus
# query is executed through it. The container list is captured first so that
# grep exiting early cannot fail the check via pipefail, and the name is
# matched as a fixed whole-line string rather than as a regular expression.
running_containers=$(docker ps --format '{{.Names}}' 2>/dev/null) || running_containers=""
if ! grep -Fxq -- "$BACKEND_CONTAINER" <<<"$running_containers"; then
  log "ERROR: ${BACKEND_CONTAINER} is not running — skipping weekly digest"
  exit 1
fi
|
||||||
|
|
||||||
|
# ── Collect metrics ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
log "Collecting 7-day metrics from Prometheus..."

# 7-day averages, plus the current disk usage. Each value is a number or "?".
STEAL_AVG=$(prom_range_avg 'avg(rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100')
DISK_NOW=$(prom_query '(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100')
RAM_AVG=$(prom_range_avg 'node_memory_MemAvailable_bytes / 1073741824')
SWAP_AVG=$(prom_range_avg '(node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / 1073741824')
IO_AVG=$(prom_range_avg 'rate(node_disk_written_bytes_total{device="sda"}[5m]) * 3600 / 1073741824')

# Unhealthy containers (current)
# The list is captured before counting: piping docker into `wc -l` under
# pipefail produced "0" followed by "?" when docker itself failed.
if unhealthy_names=$(docker ps --filter health=unhealthy --format '{{.Names}}' 2>/dev/null); then
  # grep -c counts non-empty lines; it exits 1 when the count is 0, hence `|| true`.
  UNHEALTHY=$(grep -c . <<<"$unhealthy_names" || true)
else
  UNHEALTHY="?"
fi

# Cleanup summary from log (last 7 days)
CLEANUP_LOG="/var/log/vm-cleanup.log"
CLEANUPS_THIS_WEEK=0
if [[ -f "$CLEANUP_LOG" ]]; then
  # GNU date syntax first, BSD date syntax as a fallback.
  WEEK_AGO=$(date -u -d "7 days ago" '+%Y-%m-%dT' 2>/dev/null || date -u -v-7d '+%Y-%m-%dT' 2>/dev/null || true)
  if [[ -n "$WEEK_AGO" ]]; then
    # Counts [START] lines whose bracketed ISO timestamp is not older than the
    # cutoff (string comparison).
    # NOTE(review): the three-argument match() is a gawk extension; with
    # another awk this presumably fails and the `|| echo 0` fallback reports
    # 0 — confirm gawk is installed on the host.
    CLEANUPS_THIS_WEEK=$(awk -v cutoff="$WEEK_AGO" '
      /\[START\]/ { in_block=1 }
      in_block && /\[([0-9]{4}-[0-9]{2}-[0-9]{2}T)/ {
        match($0, /\[([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9:Z]+)\]/, a)
        if (a[1] >= cutoff) count++
        in_block=0
      }
      END { print count+0 }
    ' "$CLEANUP_LOG" 2>/dev/null || echo 0)
  fi
fi
|
||||||
|
|
||||||
|
# ── Build Telegram message ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Determine severity indicators

# float_cmp A OP B — succeeds when the float comparison "A OP B" holds.
# OP is ">" or "<". Any non-numeric operand makes the comparison fail.
float_cmp() {
  python3 -c '
import sys
a, op, b = float(sys.argv[1]), sys.argv[2], float(sys.argv[3])
sys.exit(0 if (a > b if op == ">" else a < b) else 1)
' "$1" "$2" "$3" 2>/dev/null
}

# severity_icon VALUE OP WARN CRIT — prints 🚨 when VALUE is beyond CRIT,
# ⚠️ when it is beyond WARN, otherwise ✅ (also for the unknown value "?").
# The critical threshold is tested first. Previously the warning test ran
# after the critical one and overwrote 🚨 with ⚠️, so a critical value was
# only ever reported as a warning.
severity_icon() {
  local value="$1" op="$2" warn="$3" crit="$4"
  if [[ "$value" == "?" ]]; then
    echo "✅"
  elif float_cmp "$value" "$op" "$crit"; then
    echo "🚨"
  elif float_cmp "$value" "$op" "$warn"; then
    echo "⚠️"
  else
    echo "✅"
  fi
}

steal_icon=$(severity_icon "$STEAL_AVG" '>' 5 15)
disk_icon=$(severity_icon "$DISK_NOW" '>' 55 70)
# RAM is "higher is better": low values are the bad ones.
ram_icon=$(severity_icon "$RAM_AVG" '<' 3 1)

# Container health: any unhealthy container warns, more than five is critical.
# A non-numeric count fails the test quietly and keeps ✅.
svc_icon="✅"; [[ "$UNHEALTHY" -gt 0 ]] 2>/dev/null && svc_icon="⚠️"
[[ "$UNHEALTHY" -gt 5 ]] 2>/dev/null && svc_icon="🚨"

io_icon=$(severity_icon "$IO_AVG" '>' 0.5 1.5)
|
||||||
|
|
||||||
|
# Reporting window shown in the message header. GNU date syntax is tried
# first, BSD date syntax second; "N/A" is used if neither works.
WEEK_END=$(date -u '+%Y-%m-%d')
WEEK_START=$(date -u -d "7 days ago" '+%Y-%m-%d' 2>/dev/null || date -u -v-7d '+%Y-%m-%d' 2>/dev/null || echo "N/A")

# Plain-text message body; each metric line is prefixed with its severity icon.
MSG="📊 Weekly VM Digest — $(hostname)
Week ${WEEK_START} → ${WEEK_END}

${steal_icon} CPU Steal: ${STEAL_AVG}% avg
${disk_icon} Disk: ${DISK_NOW}% used
${ram_icon} RAM: ${RAM_AVG} GB free avg
⏩ Swap: ${SWAP_AVG} GB avg
${svc_icon} Containers: ${UNHEALTHY} unhealthy now
${io_icon} Disk Writes: ${IO_AVG} GB/hr avg (sda total)
🧹 Cleanups: ${CLEANUPS_THIS_WEEK} this week

Dashboard: https://devops.bytelyst.com"
|
||||||
|
|
||||||
|
# ── Send Telegram ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Read the bot token and chat id from the env file, if it exists.
TELEGRAM_TOKEN=""
TELEGRAM_CHAT_ID=""
if [[ -f "$TOKEN_FILE" ]]; then
  TELEGRAM_TOKEN=$(grep -oP '(?<=TELEGRAM_BOT_TOKEN=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
  TELEGRAM_CHAT_ID=$(grep -oP '(?<=TELEGRAM_CHAT_ID=)\S+' "$TOKEN_FILE" 2>/dev/null || true)
fi

# Without credentials the digest is printed instead of sent; this is not an error.
if [[ -z "$TELEGRAM_TOKEN" || -z "$TELEGRAM_CHAT_ID" ]]; then
  log "No Telegram credentials — printing digest to stdout:"
  echo "$MSG"
  exit 0
fi

log "Sending weekly digest to Telegram..."
# --data-urlencode percent-encodes the field values. The message contains
# "%", newlines and multi-byte characters, which plain -d would send
# unencoded in the form body. --max-time keeps a stalled connection from
# hanging the oneshot service.
if curl -sf --max-time 20 -X POST "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
    --data-urlencode chat_id="$TELEGRAM_CHAT_ID" \
    --data-urlencode text="$MSG" > /dev/null; then
  log "Weekly digest sent"
else
  log "ERROR: Telegram send failed"
  exit 1
fi
|
||||||
11
systemd/vm-weekly-digest.service
Normal file
11
systemd/vm-weekly-digest.service
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
[Unit]
Description=Send weekly VM health digest via Telegram
After=docker.service network-online.target
Requires=docker.service
# After= only orders this unit relative to network-online.target; Wants= is
# what actually pulls the target into the transaction.
Wants=network-online.target

[Service]
Type=oneshot
User=root
Group=root
Environment="HERMES_HOME=/root/.hermes"
ExecStart=/usr/local/bin/vm-weekly-digest.sh
|
||||||
11
systemd/vm-weekly-digest.timer
Normal file
11
systemd/vm-weekly-digest.timer
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Run weekly VM Telegram digest every Monday 08:00 UTC
|
||||||
|
After=docker.service
|
||||||
|
|
||||||
|
[Timer]
|
||||||
|
OnCalendar=Mon 08:00 UTC
|
||||||
|
AccuracySec=5min
|
||||||
|
Persistent=true
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=timers.target
|
||||||
Loading…
Reference in New Issue
Block a user