From d0b8ce2c74181830978e3081179c1642e80e6275 Mon Sep 17 00:00:00 2001
From: Hermes VM
Date: Wed, 27 May 2026 12:39:17 +0000
Subject: [PATCH] feat: add VM Health page to devops dashboard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Backend (Fastify):
- New module: modules/vm/ (types, repository, routes)
- GET /api/vm/health — runs vm-health-check.sh --json, returns structured result
- GET /api/vm/cleanup-log — tails /var/log/vm-cleanup.log
- POST /api/vm/cleanup — triggers vm-cleanup.sh (weekly / monthly / dry-run)
- Registered vmRoutes in server.ts

Frontend (Next.js):
- New page: /vm — VM Health
- Overall status banner (OK/WARN/CRIT) with issue summary
- Per-check cards: disk, load, RAM, swap, crash loops, container health,
  build cache, docker images, journal, syslog — color-coded by level
- Cleanup trigger buttons (dry-run, weekly, monthly) with output viewer
- Collapsible cleanup log viewer (last 40 lines)
- Auto-refresh every 60s
- sidebar-nav.tsx: added 'VM Health' entry with Server icon

Co-Authored-By: Claude Sonnet 4.6
---
 .../backend/src/modules/vm/repository.ts     | 134 ++++++
 dashboard/backend/src/modules/vm/routes.ts   |  60 +++
 dashboard/backend/src/modules/vm/types.ts    |  36 ++
 dashboard/backend/src/server.ts              |   2 +
 dashboard/web/src/app/vm/page.tsx            | 385 ++++++++++++++++++
 dashboard/web/src/components/sidebar-nav.tsx |   2 +
 6 files changed, 619 insertions(+)
 create mode 100644 dashboard/backend/src/modules/vm/repository.ts
 create mode 100644 dashboard/backend/src/modules/vm/routes.ts
 create mode 100644 dashboard/backend/src/modules/vm/types.ts
 create mode 100644 dashboard/web/src/app/vm/page.tsx

diff --git a/dashboard/backend/src/modules/vm/repository.ts b/dashboard/backend/src/modules/vm/repository.ts
new file mode 100644
index 0000000..bacd966
--- /dev/null
+++ b/dashboard/backend/src/modules/vm/repository.ts
@@ -0,0 +1,134 @@
+import { exec } from 'child_process';
+import { promisify } from 'util';
+import { hostname } from 'os';
+
+const execAsync = promisify(exec);
+
+const HEALTH_SCRIPT =
+  '/opt/bytelyst/learning_ai_devops_tools/scripts/VMs/HostingerVM/vm-health-check.sh';
+const CLEANUP_SCRIPT =
+  '/opt/bytelyst/learning_ai_devops_tools/scripts/VMs/HostingerVM/vm-cleanup.sh';
+const CLEANUP_LOG = '/var/log/vm-cleanup.log';
+
+/** Line count used when the caller passes no usable value. */
+const DEFAULT_LOG_LINES = 30;
+/** Upper bound on the number of log lines a single request may tail. */
+const MAX_LOG_LINES = 1_000;
+
+/** Fields read from a rejected `execAsync` call; all are optional. */
+interface ExecFailure {
+  stdout?: string;
+  stderr?: string;
+  message?: string;
+}
+
+/** Narrows an unknown rejection value so its exec fields can be read safely. */
+function toExecFailure(error: unknown): ExecFailure {
+  return typeof error === 'object' && error !== null ? (error as ExecFailure) : {};
+}
+
+// ---------------------------------------------------------------------------
+// Health check
+// ---------------------------------------------------------------------------
+
+/**
+ * Runs vm-health-check.sh with --json and returns the parsed report.
+ *
+ * When the command fails but still printed parseable JSON on stdout, that JSON
+ * is returned. Otherwise a synthetic CRIT result carrying the failure text is
+ * returned, so callers get a result object rather than a rejection.
+ */
+export async function runVmHealthCheck() {
+  try {
+    // Script exits 1 (WARN) or 2 (CRIT) but still emits valid JSON on stdout.
+    const { stdout } = await execAsync(`bash ${HEALTH_SCRIPT} --json 2>/dev/null`, {
+      timeout: 30_000,
+    });
+    return JSON.parse(stdout);
+  } catch (error: unknown) {
+    const failure = toExecFailure(error);
+    // Non-zero exit — stdout may still contain valid JSON
+    if (failure.stdout) {
+      try {
+        return JSON.parse(failure.stdout);
+      } catch {
+        // fall through to error response
+      }
+    }
+    return {
+      timestamp: new Date().toISOString(),
+      hostname: hostname(),
+      overall: 'CRIT',
+      checks: {},
+      error: String(failure.stderr || failure.message || error),
+    };
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Cleanup log
+// ---------------------------------------------------------------------------
+
+/**
+ * Returns the last `lines` lines of the cleanup log, trimmed.
+ *
+ * `lines` is truncated to an integer and clamped to 1..MAX_LOG_LINES before it
+ * is interpolated into the shell command; a non-finite value falls back to
+ * DEFAULT_LOG_LINES. Resolves to a placeholder string when the log cannot be
+ * read.
+ */
+export async function getCleanupLog(lines = DEFAULT_LOG_LINES): Promise<string> {
+  const count = Number.isFinite(lines)
+    ? Math.min(Math.max(Math.trunc(lines), 1), MAX_LOG_LINES)
+    : DEFAULT_LOG_LINES;
+
+  try {
+    const { stdout } = await execAsync(
+      `tail -n ${count} ${CLEANUP_LOG} 2>/dev/null || echo "(log not found)"`,
+      { timeout: 5_000 }
+    );
+    return stdout.trim();
+  } catch {
+    return '(log not available)';
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Trigger cleanup (requires sudo — configure sudoers if needed)
+// ---------------------------------------------------------------------------
+
+/**
+ * Runs vm-cleanup.sh through sudo in the requested mode.
+ *
+ * @param mode - 'monthly' passes `--full --quiet`, 'dry-run' passes
+ *   `--dry-run`, and 'weekly' passes `--quiet`.
+ * @returns `success` is false when the exec call rejects; `output` holds
+ *   whatever the command printed, or the error message when nothing was
+ *   printed.
+ */
+export async function runVmCleanup(
+  mode: 'weekly' | 'monthly' | 'dry-run'
+): Promise<{ success: boolean; output: string }> {
+  const args =
+    mode === 'monthly'
+      ? '--full --quiet'
+      : mode === 'dry-run'
+        ? '--dry-run'
+        : '--quiet';
+
+  try {
+    // `2>&1` folds stderr into stdout inside the shell.
+    const { stdout, stderr } = await execAsync(
+      `sudo bash ${CLEANUP_SCRIPT} ${args} 2>&1`,
+      { timeout: 120_000 }
+    );
+    return { success: true, output: (stdout + stderr).trim() };
+  } catch (error: unknown) {
+    const failure = toExecFailure(error);
+    const out = (failure.stdout || '') + (failure.stderr || '');
+    return {
+      success: false,
+      output: out.trim() || String(failure.message || error),
+    };
+  }
+}
diff --git a/dashboard/backend/src/modules/vm/routes.ts b/dashboard/backend/src/modules/vm/routes.ts
new file mode 100644
index 0000000..a7fe693
--- /dev/null
+++ b/dashboard/backend/src/modules/vm/routes.ts
@@ -0,0 +1,60 @@
+import type { FastifyInstance } from 'fastify';
+import { requireAdmin } from '../../lib/auth.js';
+import { runVmHealthCheck, getCleanupLog, runVmCleanup } from './repository.js';
+import { VmCleanupParamsSchema } from './types.js';
+
+/**
+ * Registers the admin-only VM endpoints on the given Fastify instance:
+ * GET /vm/health, GET /vm/cleanup-log and POST /vm/cleanup.
+ */
+export async function vmRoutes(fastify: FastifyInstance) {
+  // GET /api/vm/health — run vm-health-check.sh --json and return result (admin only)
+  fastify.get('/vm/health', {
+    preHandler: async (req) => requireAdmin(req),
+  }, async (_req, reply) => {
+    try {
+      const result = await runVmHealthCheck();
+      return reply.send(result);
+    } catch (error) {
+      fastify.log.error(error, 'VM health check failed');
+      return reply.code(500).send({ error: 'VM health check failed' });
+    }
+  });
+
+  // GET /api/vm/cleanup-log — tail the cleanup log (admin only)
+  fastify.get('/vm/cleanup-log', {
+    preHandler: async (req) => requireAdmin(req),
+  }, async (req, reply) => {
+    try {
+      // A missing, zero or non-numeric `lines` value falls back to 30.
+      const { lines: rawLines } = req.query as { lines?: unknown };
+      const lines = Number(rawLines) || 30;
+      const log = await getCleanupLog(lines);
+      return reply.send({ log });
+    } catch (error) {
+      fastify.log.error(error, 'Failed to read cleanup log');
+      return reply.code(500).send({ error: 'Failed to read cleanup log' });
+    }
+  });
+
+  // POST /api/vm/cleanup — trigger vm-cleanup.sh (admin only)
+  fastify.post('/vm/cleanup', {
+    preHandler: async (req) => requireAdmin(req),
+  }, async (req, reply) => {
+    // A malformed body is the caller's mistake, so answer 400 rather than 500.
+    const parsed = VmCleanupParamsSchema.safeParse(req.body);
+    if (!parsed.success) {
+      return reply.code(400).send({ error: parsed.error.message });
+    }
+
+    try {
+      const result = await runVmCleanup(parsed.data.mode);
+      return reply.send(result);
+    } catch (error: unknown) {
+      fastify.log.error(error, 'VM cleanup failed');
+      const message =
+        error instanceof Error && error.message ? error.message : 'VM cleanup failed';
+      return reply.code(500).send({ error: message });
+    }
+  });
+}
diff --git a/dashboard/backend/src/modules/vm/types.ts b/dashboard/backend/src/modules/vm/types.ts
new file mode 100644
index 0000000..fabc7e7
--- /dev/null
+++ b/dashboard/backend/src/modules/vm/types.ts
@@ -0,0 +1,36 @@
+import { z } from 'zod';
+
+/** Severity level reported for a single check and for the overall result. */
+export const VmCheckLevelSchema = z.enum(['OK', 'WARN', 'CRIT']);
+export type VmCheckLevel = z.infer<typeof VmCheckLevelSchema>;
+
+/** One entry of the health report's `checks` map. */
+export const VmCheckSchema = z.object({
+  level: VmCheckLevelSchema,
+  value: z.string(),
+  message: z.string(),
+});
+export type VmCheck = z.infer<typeof VmCheckSchema>;
+
+/** Shape of the health report; `error` is optional. */
+export const VmHealthResultSchema = z.object({
+  timestamp: z.string(),
+  hostname: z.string(),
+  overall: VmCheckLevelSchema,
+  checks: z.record(z.string(), VmCheckSchema),
+  error: z.string().optional(),
+});
+export type VmHealthResult = z.infer<typeof VmHealthResultSchema>;
+
+/** Request body accepted by POST /vm/cleanup. */
+export const VmCleanupParamsSchema = z.object({
+  mode: z.enum(['weekly', 'monthly', 'dry-run']),
+});
+export type VmCleanupParams = z.infer<typeof VmCleanupParamsSchema>;
+
+/** Result of a cleanup run: success flag plus captured output. */
+export const VmCleanupResultSchema = z.object({
+  success: z.boolean(),
+  output: z.string(),
+});
+export type VmCleanupResult = z.infer<typeof VmCleanupResultSchema>;
diff --git a/dashboard/backend/src/server.ts b/dashboard/backend/src/server.ts
index bb14354..08b4e84 100644
--- a/dashboard/backend/src/server.ts
+++ b/dashboard/backend/src/server.ts
@@ -14,6 +14,7 @@ import { azureConfigRoutes } from './modules/azure-config/routes.js';
 import { codeQualityRoutes } from './modules/code-quality/routes.js';
 import { cosmosConfigRoutes } from './modules/cosmos-config/routes.js';
 import { hermesOpsRoutes } from './modules/hermes-ops/routes.js';
+import { vmRoutes } from './modules/vm/routes.js';
 // import sse from 'fastify-sse-v2';
 import rateLimit from
'@fastify/rate-limit'; import swagger from '@fastify/swagger'; @@ -271,6 +272,7 @@ await fastify.register(azureConfigRoutes, { prefix: '/api' }); await fastify.register(codeQualityRoutes, { prefix: '/api' }); await fastify.register(cosmosConfigRoutes, { prefix: '/api' }); await fastify.register(hermesOpsRoutes, { prefix: '/api' }); +await fastify.register(vmRoutes, { prefix: '/api' }); // Start server async function start() { diff --git a/dashboard/web/src/app/vm/page.tsx b/dashboard/web/src/app/vm/page.tsx new file mode 100644 index 0000000..33aec72 --- /dev/null +++ b/dashboard/web/src/app/vm/page.tsx @@ -0,0 +1,385 @@ +'use client'; + +import { useEffect, useState, useCallback } from 'react'; +import { SidebarNav } from '@/components/sidebar-nav'; +import { + CheckCircle, + AlertTriangle, + XCircle, + RefreshCw, + HardDrive, + Cpu, + Database, + Server, + Activity, + Layers, + ScrollText, + Trash2, + Terminal, + ChevronDown, + ChevronUp, +} from 'lucide-react'; + +const API = process.env.NEXT_PUBLIC_DEVOPS_API_URL || 'http://localhost:4004'; + +// ── Types ────────────────────────────────────────────────────────────────── + +type Level = 'OK' | 'WARN' | 'CRIT'; + +interface VmCheck { + level: Level; + value: string; + message: string; +} + +interface VmHealthResult { + timestamp: string; + hostname: string; + overall: Level; + checks: Record; + error?: string; +} + +// ── Helpers ──────────────────────────────────────────────────────────────── + +function levelColor(level: Level) { + switch (level) { + case 'OK': return 'text-green-700 bg-green-50 border-green-200'; + case 'WARN': return 'text-yellow-700 bg-yellow-50 border-yellow-200'; + case 'CRIT': return 'text-red-700 bg-red-50 border-red-200'; + } +} + +function levelBadge(level: Level) { + switch (level) { + case 'OK': return 'bg-green-100 text-green-800'; + case 'WARN': return 'bg-yellow-100 text-yellow-800'; + case 'CRIT': return 'bg-red-100 text-red-800'; + } +} + +function LevelIcon({ level, className 
= 'w-5 h-5' }: { level: Level; className?: string }) { + switch (level) { + case 'OK': return ; + case 'WARN': return ; + case 'CRIT': return ; + } +} + +const CHECK_META: Record = { + disk: { label: 'Disk', icon: HardDrive }, + load: { label: 'CPU Load', icon: Cpu }, + ram: { label: 'Memory', icon: Database }, + swap: { label: 'Swap', icon: Server }, + container_loops: { label: 'Crash Loops', icon: Activity }, + container_health: { label: 'Container Health', icon: Layers }, + build_cache: { label: 'Build Cache', icon: Layers }, + docker_images: { label: 'Docker Images', icon: Layers }, + journal: { label: 'Journal Logs', icon: ScrollText }, + syslog: { label: 'Syslog', icon: ScrollText }, + docker_daemon: { label: 'Docker Daemon', icon: Activity }, +}; + +// Preferred display order +const CHECK_ORDER = [ + 'disk', 'load', 'ram', 'swap', + 'container_loops', 'container_health', 'docker_daemon', + 'build_cache', 'docker_images', + 'journal', 'syslog', +]; + +// ── Component ────────────────────────────────────────────────────────────── + +export default function VmHealthPage() { + const [health, setHealth] = useState(null); + const [cleanupLog, setCleanupLog] = useState(''); + const [loading, setLoading] = useState(true); + const [refreshing, setRefreshing] = useState(false); + const [cleanupRunning, setCleanupRunning] = useState(false); + const [cleanupResult, setCleanupResult] = useState<{ success: boolean; output: string } | null>(null); + const [showLog, setShowLog] = useState(false); + const [lastRefreshed, setLastRefreshed] = useState(null); + + const authHeader = () => ({ + Authorization: `Bearer ${localStorage.getItem('access_token')}`, + }); + + const loadHealth = useCallback(async () => { + try { + const [healthRes, logRes] = await Promise.all([ + fetch(`${API}/api/vm/health`, { headers: authHeader() }), + fetch(`${API}/api/vm/cleanup-log?lines=40`, { headers: authHeader() }), + ]); + if (healthRes.ok) setHealth(await healthRes.json()); + if (logRes.ok) { 
+ const { log } = await logRes.json(); + setCleanupLog(log); + } + setLastRefreshed(new Date()); + } catch (e) { + console.error('Failed to load VM health:', e); + } finally { + setLoading(false); + setRefreshing(false); + } + }, []); + + useEffect(() => { + loadHealth(); + const interval = setInterval(loadHealth, 60_000); + return () => clearInterval(interval); + }, [loadHealth]); + + const handleRefresh = () => { + setRefreshing(true); + loadHealth(); + }; + + const handleCleanup = async (mode: 'weekly' | 'monthly' | 'dry-run') => { + const confirmMsg = + mode === 'monthly' + ? 'Run MONTHLY full cleanup? This removes Docker build cache, pnpm store, old logs, and HOLD node_modules.' + : mode === 'dry-run' + ? 'Run cleanup in DRY-RUN mode? Nothing will be deleted.' + : 'Run weekly cleanup? This prunes Docker build cache, journal, apt, and .next/cache.'; + if (!confirm(confirmMsg)) return; + + setCleanupRunning(true); + setCleanupResult(null); + try { + const res = await fetch(`${API}/api/vm/cleanup`, { + method: 'POST', + headers: { 'Content-Type': 'application/json', ...authHeader() }, + body: JSON.stringify({ mode }), + }); + const result = await res.json(); + setCleanupResult(result); + // Refresh health after cleanup + await loadHealth(); + } catch (e) { + setCleanupResult({ success: false, output: String(e) }); + } finally { + setCleanupRunning(false); + } + }; + + // ── Render ─────────────────────────────────────────────────────────────── + + if (loading) { + return ( +
+ +
+
Loading VM health…
+
+
+ ); + } + + const overall = health?.overall ?? 'CRIT'; + const checks = health?.checks ?? {}; + + // Sort checks into preferred order, then anything else + const sortedKeys = [ + ...CHECK_ORDER.filter(k => k in checks), + ...Object.keys(checks).filter(k => !CHECK_ORDER.includes(k)), + ]; + + const warnings = sortedKeys.filter(k => checks[k]?.level === 'WARN'); + const crits = sortedKeys.filter(k => checks[k]?.level === 'CRIT'); + + return ( +
+ + +
+
+ + {/* ── Header ── */} +
+
+

VM Health

+

+ {health?.hostname ?? 'srv1491630'} ·{' '} + {lastRefreshed + ? `last checked ${lastRefreshed.toLocaleTimeString()}` + : 'checking…'} +

+
+ +
+ + {/* ── Overall status banner ── */} +
+ +
+

+ {overall === 'OK' + ? 'All checks passing' + : overall === 'WARN' + ? `${warnings.length} warning${warnings.length !== 1 ? 's' : ''}` + : `${crits.length} critical issue${crits.length !== 1 ? 's' : ''}`} +

+ {health?.error && ( +

{health.error}

+ )} + {(crits.length > 0 || warnings.length > 0) && !health?.error && ( +

+ {[...crits, ...warnings].map(k => checks[k]?.message).join(' · ')} +

+ )} +
+ + {overall} + +
+ + {/* ── Check cards grid ── */} +
+ {sortedKeys.map(key => { + const check = checks[key]; + if (!check) return null; + const meta = CHECK_META[key] ?? { label: key, icon: Activity }; + const Icon = meta.icon; + + return ( +
+
+
+ +
+
+
+ + {meta.label} + + {check.level} + +
+

{check.message}

+

{check.value}

+
+
+
+ ); + })} +
+ + {/* ── Cleanup section ── */} +
+
+ +
+

VM Cleanup

+

+ Cron runs automatically: daily build-cache prune, weekly cleanup, monthly full cleanup. + Use buttons below to trigger manually. +

+
+
+ + {/* Cleanup result */} + {cleanupResult && ( +
+
+ {cleanupResult.success + ? + : } + + {cleanupResult.success ? 'Cleanup completed' : 'Cleanup failed'} + + +
+ {cleanupResult.output && ( +
+                    {cleanupResult.output}
+                  
+ )} +
+ )} + + {/* Cleanup buttons */} +
+ + + +
+ +

+ Cleanup requires the backend process to have sudo access to vm-cleanup.sh. + If the button returns an error, trigger manually via SSH:{' '} + sudo bash scripts/VMs/HostingerVM/vm-cleanup.sh +

+
+ + {/* ── Cleanup log ── */} + {cleanupLog && ( +
+ + {showLog && ( +
+
+                    {cleanupLog}
+                  
+
+ )} +
+ )} + +
+
+
+ ); +} diff --git a/dashboard/web/src/components/sidebar-nav.tsx b/dashboard/web/src/components/sidebar-nav.tsx index f35e2a8..9a41844 100644 --- a/dashboard/web/src/components/sidebar-nav.tsx +++ b/dashboard/web/src/components/sidebar-nav.tsx @@ -18,6 +18,7 @@ import { Moon, HeartPulse, Sparkles, + Server, } from 'lucide-react'; import { useAuth } from '@/lib/auth'; @@ -26,6 +27,7 @@ const navItems = [ { href: '/hermes', label: 'Hermes', icon: Sparkles }, { href: '/health', label: 'Health', icon: HeartPulse }, { href: '/metrics', label: 'Metrics', icon: BarChart3 }, + { href: '/vm', label: 'VM Health', icon: Server }, { href: '/system', label: 'System', icon: Cpu }, { href: '/env', label: 'Environment', icon: Key }, { href: '/code-quality', label: 'Code Quality', icon: Code2 },