/*
 * Change summary:
 * - prometheus.ts: new Prometheus client with 7d/30d range queries for disk, memory, swap,
 *   CPU steal, and disk I/O (GB/hr); getWeeklyDigestData() aggregates all metrics for the
 *   digest and the API endpoint.
 * - routes.ts: GET /api/vm/metrics/trend?metric=…&range=… and GET /api/vm/weekly-digest endpoints.
 * - api.ts: TrendPoint/TrendSeries types; getTrend() and getMemoryTrend() added to vmApi.
 * - vm/page.tsx: Sparkline (pure SVG polyline + fill), TrendCard with latest/avg/peak and
 *   threshold colouring, TrendsPanel with lazy load on first open; Promise.allSettled()
 *   isolation for all 5 data panels.
 * - vm-weekly-digest.sh: weekly Telegram digest via docker exec into devops-backend to reach
 *   Prometheus; emoji severity indicators; cron summary from /var/log/vm-cleanup.log.
 * - systemd timer: Mon 08:00 UTC, Persistent=true (fires on next boot if missed);
 *   first trigger 2026-06-02.
 *
 * Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
 *
 * 173 lines · 6.4 KiB · TypeScript
 */
/**
 * Base URL of the Prometheus HTTP API, without a trailing slash.
 *
 * Taken from the `PROMETHEUS_URL` environment variable; falls back to the
 * hard-coded default when the variable is unset or blank.
 */
const PROMETHEUS_BASE = (
  // `||` rather than `??` on purpose: an empty or whitespace-only value must
  // fall back to the default instead of producing an invalid URL.
  process.env.PROMETHEUS_URL?.trim() || 'http://learning_ai_common_plat-prometheus-1:9090'
).replace(/\/+$/, ''); // paths are appended as `/api/v1/...`, so drop trailing slashes
|
|
|
|
// ── Types ──────────────────────────────────────────────────────────────────
|
|
|
|
/** One sample of a time series. */
export interface TrendPoint {
  /** Sample timestamp in Unix milliseconds. */
  t: number;
  /** Sample value, expressed in the unit of the series that owns the point. */
  v: number;
}
|
|
|
|
/**
 * A named time series plus summary statistics over its points.
 *
 * The trend queries in this module fill `latest`, `avg` and `peak` rounded to
 * one decimal place, and set all three to 0 when the series has no points.
 */
export interface TrendSeries {
  /** Metric identifier, e.g. `'disk'` or `'swap_used'`. */
  metric: string;
  /** Unit the values are expressed in, e.g. `'%'` or `'GB'`. */
  unit: string;
  /** The samples of the series. */
  points: TrendPoint[];
  /** Value of the last point. */
  latest: number;
  /** Arithmetic mean of all point values. */
  avg: number;
  /** Largest point value. */
  peak: number;
}
|
|
|
|
/** Memory-related trends returned together by `getMemoryTrend`. */
export interface MemoryTrend {
  /** Available RAM over time (metric `'ram_available'`). */
  available: TrendSeries;
  /** Swap in use over time (metric `'swap_used'`). */
  swap: TrendSeries;
}
|
|
|
|
// ── Internal helpers ────────────────────────────────────────────────────────
|
|
|
|
/**
 * Translates a range label into the window parameters of a range query.
 *
 * @param rangeStr - `'30d'` selects a 30-day window; every other value,
 *   including unrecognised ones, selects the 7-day window.
 * @returns `start` and `end` in Unix milliseconds (`end` is the current time)
 *   and the resolution `step`: `'1h'` for 7 days, `'4h'` for 30 days.
 */
function rangeParams(rangeStr: string): { start: number; end: number; step: string } {
  const MS_PER_DAY = 86_400_000;
  const days = rangeStr === '30d' ? 30 : 7;
  const end = Date.now();
  return {
    start: end - days * MS_PER_DAY,
    end,
    // A coarser step on the longer window keeps the point count similar
    // (7d / 1h = 168 points, 30d / 4h = 180 points).
    step: days <= 7 ? '1h' : '4h',
  };
}
|
|
|
|
/**
 * Flattens the series of a Prometheus range-query result into one
 * chronologically sorted list of points.
 *
 * Samples whose value does not parse as a number (e.g. `"NaN"`) are dropped.
 * When several series report the same timestamp (e.g. one series per CPU),
 * their values are averaged; a single series therefore passes through as is.
 */
function mergeSeries(result: ReadonlyArray<{ values: [number, string][] }>): TrendPoint[] {
  const byTime = new Map<number, number[]>();
  for (const series of result) {
    for (const [ts, raw] of series.values) {
      const val = parseFloat(raw);
      if (Number.isNaN(val)) continue;
      const ms = ts * 1000; // sample timestamps arrive in seconds
      const bucket = byTime.get(ms);
      if (bucket) bucket.push(val);
      else byTime.set(ms, [val]);
    }
  }
  return Array.from(byTime.entries())
    .sort(([a], [b]) => a - b)
    // Buckets are never empty, so reduce() without a seed is safe here.
    .map(([t, vals]) => ({ t, v: vals.reduce((sum, x) => sum + x) / vals.length }));
}

/**
 * Runs a PromQL range query against `${PROMETHEUS_BASE}/api/v1/query_range`.
 *
 * @param query - PromQL expression to evaluate.
 * @param start - Window start in Unix milliseconds (sent as whole seconds).
 * @param end - Window end in Unix milliseconds (sent as whole seconds).
 * @param step - Query resolution, e.g. `'1h'`.
 * @returns Points sorted by time; an empty array when the query matched no
 *   series. Multiple series are averaged per timestamp.
 * @throws Error when the HTTP status is not 2xx or the response body does not
 *   report `status: "success"`. The request is aborted after 15 seconds, in
 *   which case the `fetch` rejection propagates unchanged.
 */
async function queryRange(query: string, start: number, end: number, step: string): Promise<TrendPoint[]> {
  const url = new URL(`${PROMETHEUS_BASE}/api/v1/query_range`);
  url.searchParams.set('query', query);
  url.searchParams.set('start', String(Math.floor(start / 1000)));
  url.searchParams.set('end', String(Math.floor(end / 1000)));
  url.searchParams.set('step', step);

  const res = await fetch(url.toString(), {
    signal: AbortSignal.timeout(15_000),
  });

  if (!res.ok) {
    // Prometheus explains failures (e.g. a PromQL parse error) in a JSON body;
    // surface that text when present, since the status line alone rarely helps.
    let detail = '';
    try {
      const errBody = (await res.json()) as { error?: unknown } | null;
      if (typeof errBody?.error === 'string' && errBody.error !== '') {
        detail = ` (${errBody.error})`;
      }
    } catch {
      // Body was not JSON (e.g. a proxy error page) - keep the status line only.
    }
    throw new Error(`Prometheus ${res.status}: ${res.statusText}${detail}`);
  }

  const body = (await res.json()) as {
    status: string;
    error?: string;
    data?: { result: Array<{ values: [number, string][] }> };
  };
  if (body.status !== 'success') throw new Error(`Prometheus: ${body.error ?? 'unknown error'}`);

  return mergeSeries(body.data?.result ?? []);
}
|
|
|
|
/**
 * Computes the summary statistics of a series.
 *
 * @param points - Samples in chronological order.
 * @returns `latest` (value of the last point), `avg` (mean) and `peak`
 *   (maximum), each rounded to one decimal place; all zeros for an empty list.
 */
function summarize(points: TrendPoint[]): Pick<TrendSeries, 'latest' | 'avg' | 'peak'> {
  if (points.length === 0) return { latest: 0, avg: 0, peak: 0 };

  // Single pass instead of `Math.max(...values)`: spreading a very large array
  // into call arguments can exceed the engine's argument limit and throw.
  let sum = 0;
  let peak = -Infinity;
  let latest = 0;
  for (const { v } of points) {
    sum += v;
    if (v > peak) peak = v;
    latest = v;
  }

  const round1 = (n: number): number => Math.round(n * 10) / 10;
  return {
    latest: round1(latest),
    avg: round1(sum / points.length),
    peak: round1(peak),
  };
}
|
|
|
|
// ── Public trend queries ────────────────────────────────────────────────────
|
|
|
|
/**
 * Fetches the usage trend of the filesystem mounted at `/`.
 *
 * @param range - Window label passed to `rangeParams` (`'7d'` by default).
 * @returns Series `'disk'` in `%` (used share of the filesystem size) with its
 *   summary statistics.
 * @throws Whatever `queryRange` throws when the Prometheus request fails.
 */
export async function getDiskTrend(range = '7d'): Promise<TrendSeries> {
  const { start, end, step } = rangeParams(range);
  // used % = (1 - available / size) * 100
  const usedPercentQuery =
    '(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100';
  const points = await queryRange(usedPercentQuery, start, end, step);
  return { metric: 'disk', unit: '%', points, ...summarize(points) };
}
|
|
|
|
/**
 * Fetches the available-RAM and used-swap trends; both queries run in parallel.
 *
 * Values are byte counts divided by 1073741824 (2^30) and labelled `'GB'`.
 *
 * @param range - Window label passed to `rangeParams` (`'7d'` by default).
 * @returns Series `'ram_available'` and `'swap_used'` with their summary
 *   statistics.
 * @throws Whatever `queryRange` throws; if either query fails the whole call
 *   rejects.
 */
export async function getMemoryTrend(range = '7d'): Promise<MemoryTrend> {
  const { start, end, step } = rangeParams(range);

  const availableQuery = 'node_memory_MemAvailable_bytes / 1073741824';
  // swap in use = total - free
  const swapUsedQuery = '(node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / 1073741824';

  const [availPts, swapPts] = await Promise.all([
    queryRange(availableQuery, start, end, step),
    queryRange(swapUsedQuery, start, end, step),
  ]);

  return {
    available: { metric: 'ram_available', unit: 'GB', points: availPts, ...summarize(availPts) },
    swap: { metric: 'swap_used', unit: 'GB', points: swapPts, ...summarize(swapPts) },
  };
}
|
|
|
|
/**
 * Fetches the CPU steal-time trend.
 *
 * @param range - Window label passed to `rangeParams` (`'7d'` by default).
 * @returns Series `'steal'` in `%` with its summary statistics.
 * @throws Whatever `queryRange` throws when the Prometheus request fails.
 */
export async function getStealTrend(range = '7d'): Promise<TrendSeries> {
  const { start, end, step } = rangeParams(range);
  // avg() across all CPUs so multi-CPU hosts get a single % value
  const stealPercentQuery = 'avg(rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100';
  const points = await queryRange(stealPercentQuery, start, end, step);
  return { metric: 'steal', unit: '%', points, ...summarize(points) };
}
|
|
|
|
/**
 * Fetches the disk write-rate trend for device `sda`.
 *
 * @param range - Window label passed to `rangeParams` (`'7d'` by default).
 * @returns Series `'disk_io_write'` in `GB/hr` with its summary statistics.
 *   The rate is bytes per second multiplied by 3600 and divided by 1073741824
 *   (2^30).
 * @throws Whatever `queryRange` throws when the Prometheus request fails.
 */
export async function getIoTrend(range = '7d'): Promise<TrendSeries> {
  const { start, end, step } = rangeParams(range);
  // Total VM block write rate in GB/hr (sda = primary disk).
  // cAdvisor does not expose per-container blkio in this setup, so we use
  // the node-exporter metric which covers all processes including invttrdg.
  // NOTE(review): rate(...[5m]) evaluated at 1h/4h steps presumably reflects only
  // the 5 minutes before each step, so short write bursts between steps may be
  // missed - confirm whether that is acceptable for avg/peak.
  const writeRateQuery = 'rate(node_disk_written_bytes_total{device="sda"}[5m]) * 3600 / 1073741824';
  const points = await queryRange(writeRateQuery, start, end, step);
  return { metric: 'disk_io_write', unit: 'GB/hr', points, ...summarize(points) };
}
|
|
|
|
// ── Weekly digest summary (used by digest endpoint + cron) ─────────────────
|
|
|
|
/** Seven-day summary of all VM metrics, as returned by `getWeeklyDigestData`. */
export interface WeeklyDigestData {
  /** Reporting window as ISO-8601 timestamps. */
  period: { from: string; to: string };
  /** CPU steal in `%`. */
  steal: { avg: number; peak: number };
  /** Disk usage in `%`. */
  disk: { latest: number; peak: number };
  /** Available RAM in `GB`; `low` is the smallest value in the window. */
  ram: { avg: number; low: number };
  /** Swap in use, in `GB`. */
  swap: { avg: number; peak: number };
  /** Disk write rate in `GB/hr`. */
  io: { avg: number; peak: number };
}
|
|
|
|
/**
 * Collects the 7-day disk, memory, CPU-steal and disk-I/O trends in parallel
 * and condenses them into the weekly digest figures.
 *
 * All figures are rounded to one decimal place. `ram.low` is 0 when no RAM
 * samples were returned.
 *
 * @returns The digest for the seven days ending at call time.
 * @throws If any of the four trend queries fails (the queries are combined
 *   with `Promise.all`, so one failure rejects the whole digest).
 */
export async function getWeeklyDigestData(): Promise<WeeklyDigestData> {
  // Capture the window end before querying so `period` describes the seven days
  // ending at call time rather than at the time the slowest response arrived.
  const to = new Date();
  const from = new Date(to.getTime() - 7 * 86_400_000);

  const [diskData, memData, stealData, ioData] = await Promise.all([
    getDiskTrend('7d'),
    getMemoryTrend('7d'),
    getStealTrend('7d'),
    getIoTrend('7d'),
  ]);

  // The series summaries carry no minimum, so derive it here and round it the
  // same way as every other figure in the digest.
  const round1 = (n: number): number => Math.round(n * 10) / 10;
  const ramPoints = memData.available.points;
  const ramLow = ramPoints.reduce((low, p) => Math.min(low, p.v), Infinity);

  return {
    period: { from: from.toISOString(), to: to.toISOString() },
    steal: { avg: stealData.avg, peak: stealData.peak },
    disk: { latest: diskData.latest, peak: diskData.peak },
    ram: { avg: memData.available.avg, low: ramPoints.length > 0 ? round1(ramLow) : 0 },
    // Reuse the series' own peak instead of recomputing an unrounded one.
    swap: { avg: memData.swap.avg, peak: memData.swap.peak },
    io: { avg: ioData.avg, peak: ioData.peak },
  };
}
|