feat: add VM Health page to devops dashboard

Backend (Fastify):
- New module: modules/vm/ (types, repository, routes)
- GET  /api/vm/health      — runs vm-health-check.sh --json, returns structured result
- GET  /api/vm/cleanup-log — tails /var/log/vm-cleanup.log
- POST /api/vm/cleanup     — triggers vm-cleanup.sh (weekly / monthly / dry-run)
- Registered vmRoutes in server.ts

Frontend (Next.js):
- New page: /vm — VM Health
  - Overall status banner (OK/WARN/CRIT) with issue summary
  - Per-check cards: disk, load, RAM, swap, crash loops, container health,
    build cache, docker images, journal, syslog — color-coded by level
  - Cleanup trigger buttons (dry-run, weekly, monthly) with output viewer
  - Collapsible cleanup log viewer (last 40 lines)
  - Auto-refresh every 60s
- sidebar-nav.tsx: added 'VM Health' entry with Server icon

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Hermes VM 2026-05-27 12:39:17 +00:00
parent 678430d77d
commit d0b8ce2c74
6 changed files with 553 additions and 0 deletions

View File

@ -0,0 +1,86 @@
import { exec } from 'child_process';
import { promisify } from 'util';
import { hostname } from 'os';
// Promise-returning wrapper around child_process.exec; the functions below
// await it and read `stdout` / `stderr` from the resolved value.
const execAsync = promisify(exec);
// Absolute path of the health-check script that runVmHealthCheck runs with `--json`.
const HEALTH_SCRIPT =
'/opt/bytelyst/learning_ai_devops_tools/scripts/VMs/HostingerVM/vm-health-check.sh';
// Absolute path of the cleanup script that runVmCleanup runs through `sudo bash`.
const CLEANUP_SCRIPT =
'/opt/bytelyst/learning_ai_devops_tools/scripts/VMs/HostingerVM/vm-cleanup.sh';
// Log file that getCleanupLog tails.
const CLEANUP_LOG = '/var/log/vm-cleanup.log';
// ---------------------------------------------------------------------------
// Health check
// ---------------------------------------------------------------------------
/**
 * Runs the VM health-check script in `--json` mode and returns its parsed report.
 *
 * The script exits 1 (WARN) or 2 (CRIT) while still printing valid JSON, so a
 * rejected exec is not necessarily a failed check: the stdout captured on the
 * error is parsed first, and only when that is missing or unparsable is a
 * synthetic CRIT result returned instead.
 *
 * @returns The parsed script output, or a fallback object with
 *   `overall: 'CRIT'`, empty `checks`, and an `error` description.
 *   This function never rejects.
 */
export async function runVmHealthCheck() {
  try {
    // Script exits 1 (WARN) or 2 (CRIT) but still emits valid JSON on stdout.
    const { stdout } = await execAsync(`bash ${HEALTH_SCRIPT} --json 2>/dev/null`, {
      timeout: 30_000,
    });
    return JSON.parse(stdout);
  } catch (error: unknown) {
    // Anything can be thrown; only read exec's extra fields after confirming
    // we actually have an object to read them from.
    const failure = (typeof error === 'object' && error !== null ? error : {}) as {
      stdout?: unknown;
      stderr?: unknown;
      message?: unknown;
    };

    // Non-zero exit — stdout may still contain valid JSON.
    if (failure.stdout) {
      try {
        return JSON.parse(String(failure.stdout));
      } catch {
        // Not JSON after all: fall through to the synthetic error response.
      }
    }

    return {
      timestamp: new Date().toISOString(),
      hostname: hostname(),
      overall: 'CRIT',
      checks: {},
      error: String(failure.stderr || failure.message || error),
    };
  }
}
// ---------------------------------------------------------------------------
// Cleanup log
// ---------------------------------------------------------------------------
/**
 * Returns the last `lines` lines of the cleanup log.
 *
 * `lines` ends up inside a shell command, so it is first reduced to a
 * non-negative safe integer. Anything else (NaN, negative, fractional,
 * values that stringify in exponent form, or a non-number smuggled past the
 * type system) falls back to the default of 30 rather than producing an
 * invalid or injected `tail` invocation.
 *
 * @param lines Number of trailing lines to return (default 30).
 * @returns The trimmed log tail; `(log not found)` when `tail` fails, or
 *   `(log not available)` when the command itself cannot be run. Never rejects.
 */
export async function getCleanupLog(lines = 30): Promise<string> {
  const DEFAULT_LINES = 30;
  const requested = Number(lines);
  const count = Number.isSafeInteger(requested) && requested >= 0 ? requested : DEFAULT_LINES;

  try {
    // `-n <count>` is the POSIX form; the bare `-<count>` shorthand is obsolescent.
    const { stdout } = await execAsync(
      `tail -n ${count} ${CLEANUP_LOG} 2>/dev/null || echo "(log not found)"`,
      { timeout: 5_000 }
    );
    return stdout.trim();
  } catch {
    return '(log not available)';
  }
}
// ---------------------------------------------------------------------------
// Trigger cleanup (requires sudo — configure sudoers if needed)
// ---------------------------------------------------------------------------
/**
 * Runs the VM cleanup script through `sudo bash` in the requested mode.
 *
 * Mode to script flags:
 * - `weekly`  → `--quiet`
 * - `monthly` → `--full --quiet`
 * - `dry-run` → `--dry-run`
 *
 * Any other value is rejected without running anything. Previously an
 * unrecognised mode fell through to the weekly flags and triggered a real
 * cleanup.
 *
 * @param mode Which cleanup variant to run.
 * @returns `success: true` with the combined script output when the script
 *   exits 0; otherwise `success: false` with whatever output or error message
 *   is available. Never rejects.
 */
export async function runVmCleanup(
  mode: 'weekly' | 'monthly' | 'dry-run'
): Promise<{ success: boolean; output: string }> {
  let args: string;
  switch (mode) {
    case 'weekly':
      args = '--quiet';
      break;
    case 'monthly':
      args = '--full --quiet';
      break;
    case 'dry-run':
      args = '--dry-run';
      break;
    default:
      // Reachable only when a caller bypasses the type system; refuse rather
      // than guess which (possibly destructive) variant was meant.
      return { success: false, output: `Unknown cleanup mode: ${String(mode)}` };
  }

  try {
    // `2>&1` already folds stderr into stdout; stderr is still appended in
    // case the redirect is ever removed.
    const { stdout, stderr } = await execAsync(
      `sudo bash ${CLEANUP_SCRIPT} ${args} 2>&1`,
      { timeout: 120_000 }
    );
    return { success: true, output: (stdout + stderr).trim() };
  } catch (error: unknown) {
    // Anything can be thrown; only read exec's extra fields after confirming
    // we actually have an object to read them from.
    const failure = (typeof error === 'object' && error !== null ? error : {}) as {
      stdout?: unknown;
      stderr?: unknown;
      message?: unknown;
    };
    const captured = String(failure.stdout || '') + String(failure.stderr || '');
    return {
      success: false,
      output: captured.trim() || String(failure.message || error),
    };
  }
}

View File

@ -0,0 +1,47 @@
import type { FastifyInstance } from 'fastify';
import { requireAdmin } from '../../lib/auth.js';
import { runVmHealthCheck, getCleanupLog, runVmCleanup } from './repository.js';
import { VmCleanupParamsSchema } from './types.js';
/**
 * Registers the VM health and cleanup endpoints on the given Fastify instance.
 * Every route runs `requireAdmin` as a preHandler.
 *
 * - GET  /vm/health      — runs the health-check script and returns its result
 * - GET  /vm/cleanup-log — returns the tail of the cleanup log (`?lines=N`)
 * - POST /vm/cleanup     — runs the cleanup script for `{ mode }`
 */
export async function vmRoutes(fastify: FastifyInstance) {
  const DEFAULT_LOG_LINES = 30;
  // Upper bound on `?lines=` so one request cannot ask for an arbitrarily large tail.
  const MAX_LOG_LINES = 1_000;

  // GET /api/vm/health — run vm-health-check.sh --json and return result (admin only)
  fastify.get('/vm/health', {
    preHandler: async (req) => requireAdmin(req),
  }, async (_req, reply) => {
    try {
      const result = await runVmHealthCheck();
      return reply.send(result);
    } catch (error) {
      fastify.log.error(error, 'VM health check failed');
      return reply.code(500).send({ error: 'VM health check failed' });
    }
  });

  // GET /api/vm/cleanup-log — tail the cleanup log (admin only)
  fastify.get('/vm/cleanup-log', {
    preHandler: async (req) => requireAdmin(req),
  }, async (req, reply) => {
    try {
      // Accept only a positive integer; anything else (missing, NaN, negative,
      // fractional) uses the default, and large values are capped.
      const { lines: rawLines } = req.query as { lines?: unknown };
      const requested = Number(rawLines);
      const lines =
        Number.isInteger(requested) && requested > 0
          ? Math.min(requested, MAX_LOG_LINES)
          : DEFAULT_LOG_LINES;
      const log = await getCleanupLog(lines);
      return reply.send({ log });
    } catch (error) {
      fastify.log.error(error, 'Failed to read cleanup log');
      return reply.code(500).send({ error: 'Failed to read cleanup log' });
    }
  });

  // POST /api/vm/cleanup — trigger vm-cleanup.sh (admin only)
  fastify.post('/vm/cleanup', {
    preHandler: async (req) => requireAdmin(req),
  }, async (req, reply) => {
    // A malformed body is the caller's mistake, not a server fault: answer 400
    // instead of letting the validation error surface as a 500.
    const parsed = VmCleanupParamsSchema.safeParse(req.body);
    if (!parsed.success) {
      return reply.code(400).send({
        error: "Invalid request body: 'mode' must be one of 'weekly', 'monthly', 'dry-run'",
      });
    }

    try {
      const result = await runVmCleanup(parsed.data.mode);
      return reply.send(result);
    } catch (error: unknown) {
      fastify.log.error(error, 'VM cleanup failed');
      const message =
        error instanceof Error && error.message ? error.message : 'VM cleanup failed';
      return reply.code(500).send({ error: message });
    }
  });
}

View File

@ -0,0 +1,31 @@
import { z } from 'zod';
/** Severity level used for the overall result and for each individual check. */
export const VmCheckLevelSchema = z.enum(['OK', 'WARN', 'CRIT']);
export type VmCheckLevel = z.infer<typeof VmCheckLevelSchema>;
/**
 * One health check entry: its severity plus two free-form strings.
 * Presumably `value` is the raw measurement and `message` the human-readable
 * summary — confirm against the health-check script's JSON output.
 */
export const VmCheckSchema = z.object({
level: VmCheckLevelSchema,
value: z.string(),
message: z.string(),
});
export type VmCheck = z.infer<typeof VmCheckSchema>;
/**
 * Shape of a complete health report: when and where it was taken, the overall
 * level, the per-check entries keyed by check name, and an optional error text.
 * NOTE(review): nothing visible in this module validates parsed script output
 * against this schema — confirm whether a caller is expected to.
 */
export const VmHealthResultSchema = z.object({
timestamp: z.string(),
hostname: z.string(),
overall: VmCheckLevelSchema,
checks: z.record(z.string(), VmCheckSchema),
error: z.string().optional(),
});
export type VmHealthResult = z.infer<typeof VmHealthResultSchema>;
/** Request body accepted by the cleanup endpoint: which cleanup variant to run. */
export const VmCleanupParamsSchema = z.object({
mode: z.enum(['weekly', 'monthly', 'dry-run']),
});
export type VmCleanupParams = z.infer<typeof VmCleanupParamsSchema>;
/** Outcome of a cleanup run: a success flag and the captured text output. */
export const VmCleanupResultSchema = z.object({
success: z.boolean(),
output: z.string(),
});
export type VmCleanupResult = z.infer<typeof VmCleanupResultSchema>;

View File

@ -14,6 +14,7 @@ import { azureConfigRoutes } from './modules/azure-config/routes.js';
import { codeQualityRoutes } from './modules/code-quality/routes.js'; import { codeQualityRoutes } from './modules/code-quality/routes.js';
import { cosmosConfigRoutes } from './modules/cosmos-config/routes.js'; import { cosmosConfigRoutes } from './modules/cosmos-config/routes.js';
import { hermesOpsRoutes } from './modules/hermes-ops/routes.js'; import { hermesOpsRoutes } from './modules/hermes-ops/routes.js';
import { vmRoutes } from './modules/vm/routes.js';
// import sse from 'fastify-sse-v2'; // import sse from 'fastify-sse-v2';
import rateLimit from '@fastify/rate-limit'; import rateLimit from '@fastify/rate-limit';
import swagger from '@fastify/swagger'; import swagger from '@fastify/swagger';
@ -271,6 +272,7 @@ await fastify.register(azureConfigRoutes, { prefix: '/api' });
await fastify.register(codeQualityRoutes, { prefix: '/api' }); await fastify.register(codeQualityRoutes, { prefix: '/api' });
await fastify.register(cosmosConfigRoutes, { prefix: '/api' }); await fastify.register(cosmosConfigRoutes, { prefix: '/api' });
await fastify.register(hermesOpsRoutes, { prefix: '/api' }); await fastify.register(hermesOpsRoutes, { prefix: '/api' });
await fastify.register(vmRoutes, { prefix: '/api' });
// Start server // Start server
async function start() { async function start() {

View File

@ -0,0 +1,385 @@
'use client';
import { useEffect, useState, useCallback } from 'react';
import { SidebarNav } from '@/components/sidebar-nav';
import {
CheckCircle,
AlertTriangle,
XCircle,
RefreshCw,
HardDrive,
Cpu,
Database,
Server,
Activity,
Layers,
ScrollText,
Trash2,
Terminal,
ChevronDown,
ChevronUp,
} from 'lucide-react';
// Base URL of the devops API; `||` (not `??`) so an empty env value also
// falls back to the local default.
const API = process.env.NEXT_PUBLIC_DEVOPS_API_URL || 'http://localhost:4004';
// ── Types ──────────────────────────────────────────────────────────────────
/** Severity level of a single check or of the overall result. */
type Level = 'OK' | 'WARN' | 'CRIT';
/** One entry of `VmHealthResult.checks`, rendered as a card on the page. */
interface VmCheck {
level: Level;
// Rendered in a small monospace line under the message.
value: string;
// Rendered as the card's main text and in the banner summary.
message: string;
}
/** JSON body this page expects from GET /api/vm/health. */
interface VmHealthResult {
timestamp: string;
hostname: string;
overall: Level;
// Keyed by check name; see CHECK_META for the names given a label and icon.
checks: Record<string, VmCheck>;
// When present, shown in the status banner in place of the per-check messages.
error?: string;
}
// ── Helpers ────────────────────────────────────────────────────────────────
/**
 * Tailwind text/background/border classes for a banner or card at `level`.
 *
 * The health payload is fetched JSON that this file does not validate, so a
 * level outside the declared union can arrive at runtime. Such values get a
 * neutral gray palette instead of `undefined`, which would otherwise be
 * interpolated into `className` as the literal text "undefined".
 */
function levelColor(level: Level) {
  switch (level) {
    case 'OK':
      return 'text-green-700 bg-green-50 border-green-200';
    case 'WARN':
      return 'text-yellow-700 bg-yellow-50 border-yellow-200';
    case 'CRIT':
      return 'text-red-700 bg-red-50 border-red-200';
    default:
      return 'text-gray-700 bg-gray-50 border-gray-200';
  }
}
/**
 * Tailwind background/text classes for the small level badge.
 *
 * The health payload is fetched JSON that this file does not validate, so a
 * level outside the declared union can arrive at runtime. Such values get a
 * neutral gray badge instead of `undefined`, which would otherwise be
 * interpolated into `className` as the literal text "undefined".
 */
function levelBadge(level: Level) {
  switch (level) {
    case 'OK':
      return 'bg-green-100 text-green-800';
    case 'WARN':
      return 'bg-yellow-100 text-yellow-800';
    case 'CRIT':
      return 'bg-red-100 text-red-800';
    default:
      return 'bg-gray-100 text-gray-800';
  }
}
/**
 * Status icon for `level`, tinted to match it.
 *
 * @param level Severity to depict.
 * @param className Size/layout classes applied to the icon (default `w-5 h-5`).
 *
 * A level outside the declared union (the health payload is not validated in
 * this file) renders a gray warning triangle, so the component always returns
 * an element and the surrounding layout keeps its icon slot.
 */
function LevelIcon({ level, className = 'w-5 h-5' }: { level: Level; className?: string }) {
  switch (level) {
    case 'OK':
      return <CheckCircle className={`${className} text-green-600`} />;
    case 'WARN':
      return <AlertTriangle className={`${className} text-yellow-600`} />;
    case 'CRIT':
      return <XCircle className={`${className} text-red-600`} />;
    default:
      return <AlertTriangle className={`${className} text-gray-500`} />;
  }
}
// Display label and icon for each known check key in `VmHealthResult.checks`.
// Keys missing from this table are still rendered by the page, using the raw
// key as the label and the Activity icon.
const CHECK_META: Record<string, { label: string; icon: React.ElementType }> = {
disk: { label: 'Disk', icon: HardDrive },
load: { label: 'CPU Load', icon: Cpu },
ram: { label: 'Memory', icon: Database },
swap: { label: 'Swap', icon: Server },
container_loops: { label: 'Crash Loops', icon: Activity },
container_health: { label: 'Container Health', icon: Layers },
build_cache: { label: 'Build Cache', icon: Layers },
docker_images: { label: 'Docker Images', icon: Layers },
journal: { label: 'Journal Logs', icon: ScrollText },
syslog: { label: 'Syslog', icon: ScrollText },
docker_daemon: { label: 'Docker Daemon', icon: Activity },
};
// Preferred display order of the check cards. Keys present in the payload but
// absent from this list are appended after these, in the payload's own order.
const CHECK_ORDER = [
'disk', 'load', 'ram', 'swap',
'container_loops', 'container_health', 'docker_daemon',
'build_cache', 'docker_images',
'journal', 'syslog',
];
// ── Component ──────────────────────────────────────────────────────────────
/**
 * VM Health dashboard page.
 *
 * Fetches the health report and the cleanup-log tail on mount and every 60 s,
 * renders an overall status banner plus one card per check, and offers buttons
 * that trigger the cleanup endpoint (dry-run / weekly / monthly) after a
 * confirmation prompt.
 *
 * Failure handling:
 * - A rejected or non-2xx health request is recorded and shown in the banner,
 *   so stale or missing data is never presented as a fresh result.
 * - A non-2xx or malformed cleanup response is turned into a failed
 *   `{ success, output }` result carrying the server's error text.
 */
export default function VmHealthPage() {
  const [health, setHealth] = useState<VmHealthResult | null>(null);
  const [cleanupLog, setCleanupLog] = useState<string>('');
  const [loading, setLoading] = useState(true);
  const [refreshing, setRefreshing] = useState(false);
  const [cleanupRunning, setCleanupRunning] = useState(false);
  const [cleanupResult, setCleanupResult] = useState<{ success: boolean; output: string } | null>(null);
  const [showLog, setShowLog] = useState(false);
  const [lastRefreshed, setLastRefreshed] = useState<Date | null>(null);
  // Why the most recent health request failed; null when it succeeded.
  const [loadError, setLoadError] = useState<string | null>(null);

  // Read the token at call time so a token stored after mount is picked up.
  const authHeader = () => ({
    Authorization: `Bearer ${localStorage.getItem('access_token')}`,
  });

  const loadHealth = useCallback(async () => {
    try {
      const [healthRes, logRes] = await Promise.all([
        fetch(`${API}/api/vm/health`, { headers: authHeader() }),
        fetch(`${API}/api/vm/cleanup-log?lines=40`, { headers: authHeader() }),
      ]);
      if (healthRes.ok) {
        setHealth(await healthRes.json());
        setLoadError(null);
      } else {
        // Keep whatever data we already have, but say that it is not current.
        setLoadError(`health request failed with HTTP ${healthRes.status}`);
      }
      if (logRes.ok) {
        const { log } = await logRes.json();
        setCleanupLog(log);
      }
      setLastRefreshed(new Date());
    } catch (e) {
      console.error('Failed to load VM health:', e);
      setLoadError(e instanceof Error ? e.message : String(e));
    } finally {
      setLoading(false);
      setRefreshing(false);
    }
  }, []);

  useEffect(() => {
    loadHealth();
    const interval = setInterval(loadHealth, 60_000);
    return () => clearInterval(interval);
  }, [loadHealth]);

  const handleRefresh = () => {
    setRefreshing(true);
    loadHealth();
  };

  const handleCleanup = async (mode: 'weekly' | 'monthly' | 'dry-run') => {
    const confirmMsg =
      mode === 'monthly'
        ? 'Run MONTHLY full cleanup? This removes Docker build cache, pnpm store, old logs, and HOLD node_modules.'
        : mode === 'dry-run'
          ? 'Run cleanup in DRY-RUN mode? Nothing will be deleted.'
          : 'Run weekly cleanup? This prunes Docker build cache, journal, apt, and .next/cache.';
    if (!confirm(confirmMsg)) return;

    setCleanupRunning(true);
    setCleanupResult(null);
    try {
      const res = await fetch(`${API}/api/vm/cleanup`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json', ...authHeader() },
        body: JSON.stringify({ mode }),
      });
      const payload: unknown = await res.json();
      const body = (typeof payload === 'object' && payload !== null ? payload : {}) as {
        success?: unknown;
        output?: unknown;
        error?: unknown;
      };

      if (res.ok && typeof body.success === 'boolean') {
        setCleanupResult({
          success: body.success,
          output: typeof body.output === 'string' ? body.output : '',
        });
      } else {
        // Error responses carry `{ error }`, not `{ success, output }`; surface
        // the reason instead of storing an object with both fields undefined.
        const reason =
          typeof body.error === 'string' && body.error
            ? body.error
            : typeof body.output === 'string' && body.output
              ? body.output
              : `Request failed with HTTP ${res.status}`;
        setCleanupResult({ success: false, output: reason });
      }

      // Refresh health after cleanup
      await loadHealth();
    } catch (e) {
      setCleanupResult({ success: false, output: String(e) });
    } finally {
      setCleanupRunning(false);
    }
  };

  // ── Render ───────────────────────────────────────────────────────────────
  if (loading) {
    return (
      <div className="flex min-h-screen bg-gray-50">
        <SidebarNav />
        <main className="flex-1 flex items-center justify-center">
          <div className="text-gray-500">Loading VM health</div>
        </main>
      </div>
    );
  }

  // No data at all is treated as critical rather than silently healthy.
  const overall = health?.overall ?? 'CRIT';
  const checks = health?.checks ?? {};

  // Sort checks into preferred order, then anything else
  const sortedKeys = [
    ...CHECK_ORDER.filter(k => k in checks),
    ...Object.keys(checks).filter(k => !CHECK_ORDER.includes(k)),
  ];
  const warnings = sortedKeys.filter(k => checks[k]?.level === 'WARN');
  const crits = sortedKeys.filter(k => checks[k]?.level === 'CRIT');

  // Banner headline. The "no data" and "check itself failed" cases get their
  // own wording so they do not read as "0 critical issues".
  let headline: string;
  if (!health) {
    headline = 'Health data unavailable';
  } else if (overall === 'OK') {
    headline = 'All checks passing';
  } else if (overall === 'WARN') {
    headline = `${warnings.length} warning${warnings.length !== 1 ? 's' : ''}`;
  } else if (crits.length === 0 && health.error) {
    headline = 'Health check failed';
  } else {
    headline = `${crits.length} critical issue${crits.length !== 1 ? 's' : ''}`;
  }

  return (
    <div className="flex min-h-screen bg-gray-50">
      <SidebarNav />
      <main className="flex-1 min-w-0 overflow-y-auto">
        <div className="p-8 max-md:p-4 space-y-6">
          {/* ── Header ── */}
          <div className="flex items-center justify-between">
            <div>
              <h1 className="text-2xl font-bold text-gray-900">VM Health</h1>
              <p className="text-sm text-gray-500">
                {health?.hostname ?? 'srv1491630'} ·{' '}
                {lastRefreshed
                  ? `last checked ${lastRefreshed.toLocaleTimeString()}`
                  : 'checking…'}
              </p>
            </div>
            <button
              onClick={handleRefresh}
              disabled={refreshing}
              className="flex items-center gap-2 px-4 py-2 text-sm font-medium text-gray-700 bg-white border border-gray-300 rounded-md hover:bg-gray-50 disabled:opacity-50"
            >
              <RefreshCw className={`w-4 h-4 ${refreshing ? 'animate-spin' : ''}`} />
              Refresh
            </button>
          </div>

          {/* ── Overall status banner ── */}
          <div className={`rounded-lg border p-4 flex items-center gap-3 ${levelColor(overall)}`}>
            <LevelIcon level={overall} className="w-6 h-6 flex-shrink-0" />
            <div className="flex-1">
              <p className="font-semibold">{headline}</p>
              {loadError && (
                <p className="text-sm mt-1 opacity-80">Could not refresh: {loadError}</p>
              )}
              {health?.error && (
                <p className="text-sm mt-1 opacity-80">{health.error}</p>
              )}
              {(crits.length > 0 || warnings.length > 0) && !health?.error && (
                <p className="text-sm mt-1 opacity-80">
                  {[...crits, ...warnings].map(k => checks[k]?.message).join(' · ')}
                </p>
              )}
            </div>
            <span className={`px-3 py-1 rounded-full text-sm font-bold ${levelBadge(overall)}`}>
              {overall}
            </span>
          </div>

          {/* ── Check cards grid ── */}
          <div className="grid grid-cols-1 sm:grid-cols-2 xl:grid-cols-3 gap-4">
            {sortedKeys.map(key => {
              const check = checks[key];
              if (!check) return null;
              const meta = CHECK_META[key] ?? { label: key, icon: Activity };
              const Icon = meta.icon;
              return (
                <div
                  key={key}
                  className={`rounded-lg border p-4 ${levelColor(check.level)}`}
                >
                  <div className="flex items-start gap-3">
                    <div className="mt-0.5">
                      <LevelIcon level={check.level} />
                    </div>
                    <div className="flex-1 min-w-0">
                      <div className="flex items-center gap-2 mb-1">
                        <Icon className="w-4 h-4 opacity-60 flex-shrink-0" />
                        <span className="text-sm font-semibold">{meta.label}</span>
                        <span className={`ml-auto px-2 py-0.5 rounded text-xs font-bold ${levelBadge(check.level)}`}>
                          {check.level}
                        </span>
                      </div>
                      <p className="text-sm leading-snug">{check.message}</p>
                      <p className="text-xs opacity-60 mt-1 font-mono truncate">{check.value}</p>
                    </div>
                  </div>
                </div>
              );
            })}
          </div>

          {/* ── Cleanup section ── */}
          <div className="bg-white border border-gray-200 rounded-lg p-6">
            <div className="flex items-center gap-3 mb-5">
              <Trash2 className="w-5 h-5 text-gray-500" />
              <div>
                <h2 className="text-lg font-semibold text-gray-900">VM Cleanup</h2>
                <p className="text-sm text-gray-500">
                  Cron runs automatically: daily build-cache prune, weekly cleanup, monthly full cleanup.
                  Use buttons below to trigger manually.
                </p>
              </div>
            </div>

            {/* Cleanup result */}
            {cleanupResult && (
              <div className={`rounded-lg border p-4 mb-4 ${cleanupResult.success ? 'bg-green-50 border-green-200' : 'bg-red-50 border-red-200'}`}>
                <div className="flex items-center gap-2 mb-2">
                  {cleanupResult.success
                    ? <CheckCircle className="w-4 h-4 text-green-600" />
                    : <XCircle className="w-4 h-4 text-red-600" />}
                  <span className={`text-sm font-medium ${cleanupResult.success ? 'text-green-800' : 'text-red-800'}`}>
                    {cleanupResult.success ? 'Cleanup completed' : 'Cleanup failed'}
                  </span>
                  <button
                    onClick={() => setCleanupResult(null)}
                    className="ml-auto text-gray-400 hover:text-gray-600 text-xs"
                  >
                    Dismiss
                  </button>
                </div>
                {cleanupResult.output && (
                  <pre className="text-xs font-mono whitespace-pre-wrap text-gray-700 bg-white/60 rounded p-2 max-h-48 overflow-y-auto">
                    {cleanupResult.output}
                  </pre>
                )}
              </div>
            )}

            {/* Cleanup buttons */}
            <div className="flex flex-wrap gap-3">
              <button
                onClick={() => handleCleanup('dry-run')}
                disabled={cleanupRunning}
                className="flex items-center gap-2 px-4 py-2 text-sm font-medium text-gray-700 bg-gray-50 border border-gray-300 rounded-md hover:bg-gray-100 disabled:opacity-50"
              >
                <Terminal className="w-4 h-4" />
                Dry Run
              </button>
              <button
                onClick={() => handleCleanup('weekly')}
                disabled={cleanupRunning}
                className="flex items-center gap-2 px-4 py-2 text-sm font-medium text-blue-700 bg-blue-50 border border-blue-300 rounded-md hover:bg-blue-100 disabled:opacity-50"
              >
                {cleanupRunning
                  ? <RefreshCw className="w-4 h-4 animate-spin" />
                  : <Trash2 className="w-4 h-4" />}
                Weekly Cleanup
              </button>
              <button
                onClick={() => handleCleanup('monthly')}
                disabled={cleanupRunning}
                className="flex items-center gap-2 px-4 py-2 text-sm font-medium text-orange-700 bg-orange-50 border border-orange-300 rounded-md hover:bg-orange-100 disabled:opacity-50"
              >
                {cleanupRunning
                  ? <RefreshCw className="w-4 h-4 animate-spin" />
                  : <Trash2 className="w-4 h-4" />}
                Monthly Full Cleanup
              </button>
            </div>
            <p className="text-xs text-gray-400 mt-3">
              Cleanup requires the backend process to have sudo access to vm-cleanup.sh.
              If the button returns an error, trigger manually via SSH:{' '}
              <code className="bg-gray-100 px-1 rounded">sudo bash scripts/VMs/HostingerVM/vm-cleanup.sh</code>
            </p>
          </div>

          {/* ── Cleanup log ── */}
          {cleanupLog && (
            <div className="bg-white border border-gray-200 rounded-lg overflow-hidden">
              <button
                className="w-full flex items-center justify-between px-6 py-4 text-left"
                onClick={() => setShowLog(v => !v)}
              >
                <div className="flex items-center gap-2">
                  <ScrollText className="w-5 h-5 text-gray-500" />
                  <span className="font-semibold text-gray-900">Cleanup Log</span>
                  <span className="text-xs text-gray-400">(last 40 lines of /var/log/vm-cleanup.log)</span>
                </div>
                {showLog
                  ? <ChevronUp className="w-4 h-4 text-gray-400" />
                  : <ChevronDown className="w-4 h-4 text-gray-400" />}
              </button>
              {showLog && (
                <div className="border-t border-gray-100 px-6 py-4">
                  <pre className="text-xs font-mono whitespace-pre-wrap text-gray-700 bg-gray-50 rounded p-3 max-h-80 overflow-y-auto">
                    {cleanupLog}
                  </pre>
                </div>
              )}
            </div>
          )}
        </div>
      </main>
    </div>
  );
}

View File

@ -18,6 +18,7 @@ import {
Moon, Moon,
HeartPulse, HeartPulse,
Sparkles, Sparkles,
Server,
} from 'lucide-react'; } from 'lucide-react';
import { useAuth } from '@/lib/auth'; import { useAuth } from '@/lib/auth';
@ -26,6 +27,7 @@ const navItems = [
{ href: '/hermes', label: 'Hermes', icon: Sparkles }, { href: '/hermes', label: 'Hermes', icon: Sparkles },
{ href: '/health', label: 'Health', icon: HeartPulse }, { href: '/health', label: 'Health', icon: HeartPulse },
{ href: '/metrics', label: 'Metrics', icon: BarChart3 }, { href: '/metrics', label: 'Metrics', icon: BarChart3 },
{ href: '/vm', label: 'VM Health', icon: Server },
{ href: '/system', label: 'System', icon: Cpu }, { href: '/system', label: 'System', icon: Cpu },
{ href: '/env', label: 'Environment', icon: Key }, { href: '/env', label: 'Environment', icon: Key },
{ href: '/code-quality', label: 'Code Quality', icon: Code2 }, { href: '/code-quality', label: 'Code Quality', icon: Code2 },