feat(dashboard/vm): Phase 3.3 — All Containers panel with CPU/RAM, logs, bulk restart

- repository.ts: getAllContainers() — batch docker inspect + docker stats
  --no-stream merged by container name; returns state, health, uptime,
  CPU%, RAM, memLimitMiB (0=no limit), restart count, stack from compose
  label; getContainerLogs() — docker logs --tail --timestamps
- routes.ts: GET /api/vm/containers (all, with stats; ~3s for 38
  containers), GET /api/vm/containers/:name/logs?lines=N
- api.ts: ContainerInfo interface; vmApi.getAllContainers(),
  vmApi.getContainerLogs()
- vm/page.tsx: ContainersPanel — collapsible (lazy-loads on first open);
  filter chips (All/Running/Unhealthy/No Limit) + stack dropdown;
  per-row log viewer (inline pre, dark bg, 50-line tail); per-row
  restart button; bulk "Restart N unhealthy" with confirmation modal;
  Fragment key pattern for row+log-row pairs

I/O anomaly (Phase 0.3) root cause identified: invttrdg-backend and
trading-backend write bot_state.json + .bak on every market tick
(5×/min and 2×/min respectively) into container overlay layer →
~6 GB/day — intentional bot behaviour, no fix needed, trend chart
already in place to monitor.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Hermes VM 2026-05-29 00:42:11 +00:00
parent 8d32cb7980
commit 42c3b9cdd5
4 changed files with 468 additions and 1 deletions

View File

@ -352,3 +352,116 @@ export async function unloadOllamaModel(name: string): Promise<{ success: boolea
return { success: false, message: String(error.message ?? error) };
}
}
// ---------------------------------------------------------------------------
// Full container list with CPU/RAM stats
// ---------------------------------------------------------------------------
/**
 * One entry of the container list built by getAllContainers(): data from
 * `docker inspect` merged with a `docker stats` sample matched by name.
 */
export interface ContainerInfo {
/** Container name from `docker inspect`, with the leading '/' removed. */
name: string;
/** Image reference from the container config; '' when absent. */
image: string;
/** Value of the `com.docker.compose.project` label; when the label is missing, the name minus its last '-' segment (or the whole name if it has no '-'). */
stack: string;
/** Docker state status (compared against 'running' by consumers); 'unknown' when inspect reports none. */
state: string;
/** Health-check status from inspect; 'none' when the container reports no health section. */
health: string;
/** Seconds since the container was started; 0 for containers that are not running. */
uptimeSecs: number;
/** CPU usage in percent, rounded to one decimal; 0 when no stats sample matched this name. */
cpuPercent: number;
/** Memory in use, rounded to whole MiB; 0 when no stats sample matched this name. */
memMiB: number;
/** Configured memory limit in whole MiB. */
memLimitMiB: number; // 0 = no limit configured
/** Restart count reported by inspect; 0 when absent. */
restartCount: number;
}
/**
 * Converts the "used" half of a `docker stats` MemUsage value
 * (e.g. "95.36MiB / 15.62GiB") to MiB.
 *
 * @param memUsage Raw MemUsage column as printed by `docker stats`.
 * @returns Used memory in MiB (unrounded), or 0 when no size is recognised.
 */
function parseMemUsedMiB(memUsage: string): number {
  const [used = ''] = memUsage.split('/');
  const match = used.trim().match(/([\d.]+)\s*(TiB|GiB|MiB|KiB|B)/i);
  if (!match) return 0;
  const [, amount = '', unit = ''] = match;
  const value = parseFloat(amount);
  if (!Number.isFinite(value)) return 0;
  switch (unit.toLowerCase()) {
    case 'tib': return value * 1_048_576;
    case 'gib': return value * 1024;
    case 'kib': return value / 1024;
    case 'b': return value / 1_048_576;
    default: return value; // already MiB
  }
}

/**
 * Parses tab-separated `name<TAB>cpu%<TAB>memUsage` lines into a lookup keyed
 * by container name. Lines with fewer than three columns are ignored.
 *
 * @param statsOut Raw stdout of the `docker stats --no-stream` call.
 * @returns Map of container name to CPU % (one decimal) and used memory (whole MiB).
 */
function parseDockerStats(statsOut: string): Map<string, { cpu: number; memMiB: number }> {
  const byName = new Map<string, { cpu: number; memMiB: number }>();
  for (const line of statsOut.trim().split('\n').filter(Boolean)) {
    const parts = line.split('\t');
    if (parts.length < 3) continue;
    const [cname = '', cpuStr = '', memStr = ''] = parts;
    // parseFloat stops at the trailing '%'; unparsable values count as 0.
    const cpu = parseFloat(cpuStr) || 0;
    byName.set(cname.trim(), {
      cpu: Math.round(cpu * 10) / 10,
      memMiB: Math.round(parseMemUsedMiB(memStr)),
    });
  }
  return byName;
}

/**
 * Lists Docker containers (running or not) with state, health, uptime,
 * compose stack, memory limit, restart count and, where a `docker stats`
 * sample exists for the container name, current CPU and RAM usage.
 *
 * `docker inspect` and `docker stats` are independent, so they run
 * concurrently. Stats are best-effort: if that command fails or times out the
 * list is still returned, with `cpuPercent` and `memMiB` left at 0.
 *
 * @returns Containers sorted running-first, then by name. An empty array when
 *   there are no containers or when listing/inspecting fails (the error is
 *   logged, never thrown).
 */
export async function getAllContainers(): Promise<ContainerInfo[]> {
  // Subset of the `docker inspect` JSON that is read below.
  type InspectEntry = {
    Name?: string;
    RestartCount?: number;
    State?: { Status?: string; StartedAt?: string; Health?: { Status?: string } };
    Config?: { Image?: string; Labels?: Record<string, string | undefined> | null };
    HostConfig?: { Memory?: number };
  };
  // Upper bound on the ids handed to a single `docker inspect` invocation.
  const MAX_INSPECTED = 120;

  try {
    // 1. All container IDs (including stopped)
    const { stdout: idsOut } = await execAsync('docker ps -aq 2>/dev/null', { timeout: 5_000 });
    const ids: string[] = idsOut.trim().split('\n').filter(Boolean);
    if (!ids.length) return [];
    if (ids.length > MAX_INSPECTED) {
      console.warn(
        `getAllContainers: ${ids.length} containers found, only the first ${MAX_INSPECTED} are inspected`,
      );
    }

    // 2. Batch inspect and 3. CPU/RAM stats, in parallel.
    const [inspectRes, statsOut] = await Promise.all([
      execAsync(
        `docker inspect ${ids.slice(0, MAX_INSPECTED).join(' ')} 2>/dev/null`,
        { timeout: 20_000 },
      ),
      execAsync(
        "docker stats --no-stream --format '{{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}' 2>/dev/null",
        { timeout: 25_000 },
      ).then(
        (res: { stdout: string }) => res.stdout,
        (err: unknown) => {
          // Losing the stats sample must not hide the container list itself.
          console.error('getAllContainers: docker stats failed, returning list without CPU/RAM:', err);
          return '';
        },
      ),
    ]);
    const inspected: InspectEntry[] = JSON.parse(inspectRes.stdout);
    const statsMap = parseDockerStats(statsOut);

    // 4. Merge
    const result: ContainerInfo[] = inspected.map((c) => {
      const name = (c.Name ?? '').replace(/^\//, '');
      const state = c.State?.Status ?? 'unknown';
      const health = c.State?.Health?.Status ?? 'none';

      let uptimeSecs = 0;
      if (state === 'running' && c.State?.StartedAt) {
        const startedMs = new Date(c.State.StartedAt).getTime();
        // An unparsable timestamp would yield NaN, which serializes as null.
        if (Number.isFinite(startedMs)) {
          uptimeSecs = Math.max(0, Math.round((Date.now() - startedMs) / 1000));
        }
      }

      const labels = c.Config?.Labels ?? {};
      let stack = labels['com.docker.compose.project'];
      if (stack == null) {
        // No compose label: drop the last '-' segment of the name as a best guess.
        const parts = name.split('-');
        stack = parts.length > 1 ? parts.slice(0, -1).join('-') : name;
      }

      const memLimitBytes = c.HostConfig?.Memory ?? 0;
      const memLimitMiB = memLimitBytes > 0 ? Math.round(memLimitBytes / 1_048_576) : 0;
      const stats = statsMap.get(name);

      return {
        name,
        image: c.Config?.Image ?? '',
        stack,
        state,
        health,
        uptimeSecs,
        cpuPercent: stats?.cpu ?? 0,
        memMiB: stats?.memMiB ?? 0,
        memLimitMiB,
        restartCount: c.RestartCount ?? 0,
      } satisfies ContainerInfo;
    });

    // Running containers first, then alphabetical.
    return result.sort((a, b) => {
      if (a.state === 'running' && b.state !== 'running') return -1;
      if (a.state !== 'running' && b.state === 'running') return 1;
      return a.name.localeCompare(b.name);
    });
  } catch (err) {
    console.error('getAllContainers failed:', err);
    return [];
  }
}
/**
 * Returns the tail of a container's log (stdout and stderr combined, with
 * timestamps) via `docker logs`.
 *
 * Both arguments end up in a shell command line, so both are validated here
 * rather than trusting the caller.
 *
 * @param name Container name. Must start with a letter or digit and contain
 *   only letters, digits, `_`, `.` and `-` (Docker's own naming rule), so it
 *   cannot carry shell metacharacters or be read as a command-line option.
 * @param lines Number of trailing lines to fetch (default 50). Fractions are
 *   floored; negative or non-finite values fall back to 50 so the call can
 *   never ask docker for the entire log.
 * @returns Trimmed log text. If the docker command fails: whatever it printed,
 *   or the error message when it printed nothing.
 * @throws Error with message 'Invalid container name' when `name` fails validation.
 */
export async function getContainerLogs(name: string, lines = 50): Promise<string> {
  if (!/^[a-zA-Z0-9][\w.-]*$/.test(name)) throw new Error('Invalid container name');
  // typeof guard: a non-number slipping past the types must not reach the shell.
  const tail = typeof lines === 'number' && Number.isFinite(lines) && lines >= 0
    ? Math.floor(lines)
    : 50;
  try {
    const { stdout } = await execAsync(
      `docker logs --tail ${tail} --timestamps "${name}" 2>&1`,
      { timeout: 10_000 },
    );
    return stdout.trim();
  } catch (error: unknown) {
    // exec-style rejections carry the captured output on the error object.
    const failure = (error ?? {}) as { stdout?: string; stderr?: string; message?: string };
    const output = ((failure.stdout ?? '') + (failure.stderr ?? '')).trim();
    return output || String(failure.message ?? 'Failed to get logs');
  }
}

View File

@ -9,6 +9,8 @@ import {
restartContainer,
getOllamaModels,
unloadOllamaModel,
getAllContainers,
getContainerLogs,
} from './repository.js';
import {
getDiskTrend,
@ -79,6 +81,34 @@ export async function vmRoutes(fastify: FastifyInstance) {
}
});
// ── All containers (full list with CPU/RAM) ───────────────────────────────
// GET /api/vm/containers
// Responds with the full container list; requireAdmin runs as pre-handler.
fastify.get(
  '/vm/containers',
  { preHandler: async (request) => requireAdmin(request) },
  async (_request, reply) => {
    try {
      const containers = await getAllContainers();
      return reply.send(containers);
    } catch (err) {
      fastify.log.error(err, 'Failed to get containers');
      return reply.code(500).send({ error: 'Failed to get containers' });
    }
  },
);
// GET /api/vm/containers/:name/logs?lines=50
// `lines` is truncated to an integer and clamped to 1..200; a missing,
// non-numeric, zero or negative value falls back to 50. Without the lower
// bound, `?lines=-1` would make docker return the entire log.
fastify.get('/vm/containers/:name/logs', {
  preHandler: async (req) => requireAdmin(req),
}, async (req, reply) => {
  try {
    const { name } = VmContainerRestartParamsSchema.parse(req.params);
    const requested = Math.trunc(Number((req.query as { lines?: unknown }).lines));
    // NaN fails the comparison and takes the default; Infinity is capped at 200.
    const lines = requested > 0 ? Math.min(requested, 200) : 50;
    return reply.send({ logs: await getContainerLogs(name, lines) });
  } catch (error: any) {
    fastify.log.error(error, 'Failed to get container logs');
    return reply.code(500).send({ error: error.message || 'Failed to get container logs' });
  }
});
// ── Unhealthy containers ──────────────────────────────────────────────────
// GET /api/vm/containers/unhealthy

View File

@ -1,6 +1,6 @@
'use client';
import { useEffect, useState, useCallback } from 'react';
import { useEffect, useState, useCallback, Fragment } from 'react';
import { SidebarNav } from '@/components/sidebar-nav';
import {
vmApi,
@ -10,6 +10,7 @@ import {
type UnhealthyContainer,
type OllamaModelsResponse,
type TrendSeries,
type ContainerInfo,
} from '@/lib/api';
import {
CheckCircle,
@ -766,6 +767,307 @@ function TrendsPanel({
);
}
// ── Containers panel ───────────────────────────────────────────────────────
/**
 * Collapsible "All Containers" panel.
 *
 * The container list is fetched lazily the first time the panel is opened and
 * can be refreshed manually. Rows can be narrowed with the filter chips and the
 * stack dropdown; each row can show a 50-line log tail inline; running
 * containers can be restarted individually; all unhealthy containers can be
 * restarted in one go after a confirmation dialog.
 *
 * @param restarting Container names whose row restart button is rendered
 *   disabled and spinning (presumably restarts in progress, tracked by the parent).
 * @param onRestart Called with a container name to restart it.
 */
function ContainersPanel({
  restarting,
  onRestart,
}: {
  restarting: Set<string>;
  onRestart: (name: string) => Promise<void>;
}) {
  const [open, setOpen] = useState(false);
  const [containers, setContainers] = useState<ContainerInfo[]>([]);
  const [loading, setLoading] = useState(false);
  const [loaded, setLoaded] = useState(false);
  const [filter, setFilter] = useState<'all' | 'running' | 'unhealthy' | 'nolimit'>('all');
  const [stackFilter, setStackFilter] = useState('');
  const [expandedLog, setExpandedLog] = useState<string | null>(null);
  const [logCache, setLogCache] = useState<Record<string, string>>({});
  const [logsLoading, setLogsLoading] = useState<string | null>(null);
  const [bulkConfirm, setBulkConfirm] = useState(false);

  // Fetches the container list; ignored while a fetch is already running.
  const loadContainers = async () => {
    if (loading) return;
    setLoading(true);
    try {
      const data = await vmApi.getAllContainers();
      setContainers(data);
      setLoaded(true);
    } catch (err) {
      console.error('Failed to load containers:', err);
    } finally {
      setLoading(false);
    }
  };

  // Opens/closes the panel; the first open triggers the initial load.
  const handleToggle = () => {
    const next = !open;
    setOpen(next);
    if (next && !loaded) loadContainers();
  };

  // Toggles the inline log row for `name`, fetching the tail once per container.
  const handleViewLogs = async (name: string) => {
    if (expandedLog === name) { setExpandedLog(null); return; }
    setExpandedLog(name);
    // Compare against undefined, not truthiness: an empty log ('') is a valid
    // cached result and must not be refetched on every open.
    if (logCache[name] !== undefined) return;
    setLogsLoading(name);
    try {
      const { logs } = await vmApi.getContainerLogs(name, 50);
      setLogCache(prev => ({ ...prev, [name]: logs }));
    } catch {
      setLogCache(prev => ({ ...prev, [name]: '(failed to load logs)' }));
    } finally {
      // Only clear the spinner if it still belongs to this request; another
      // row's fetch may have been started in the meantime.
      setLogsLoading(current => (current === name ? null : current));
    }
  };

  const isRunning = (c: ContainerInfo) => c.state === 'running';
  const hasNoLimit = (c: ContainerInfo) => isRunning(c) && c.memLimitMiB === 0;

  const unhealthy = containers.filter(c => c.health === 'unhealthy');
  const runningCount = containers.filter(isRunning).length;
  const noLimitCount = containers.filter(hasNoLimit).length;
  const stacks = [...new Set(containers.map(c => c.stack))].sort();

  // Restarts every currently unhealthy container one after another, then
  // reloads the list. A failed restart is logged and does not stop the rest.
  const handleBulkRestart = async () => {
    setBulkConfirm(false);
    for (const c of unhealthy) {
      try {
        await onRestart(c.name);
      } catch (err) {
        console.error(`Failed to restart ${c.name}:`, err);
      }
    }
    await loadContainers();
  };

  const filtered = containers.filter(c => {
    const main =
      filter === 'running' ? isRunning(c) :
      filter === 'unhealthy' ? c.health === 'unhealthy' :
      filter === 'nolimit' ? hasNoLimit(c) :
      true;
    return main && (!stackFilter || c.stack === stackFilter);
  });

  const chips: Array<{ key: typeof filter; label: string; accent?: boolean }> = [
    { key: 'all', label: `All (${containers.length})` },
    { key: 'running', label: `Running (${runningCount})` },
    { key: 'unhealthy', label: `Unhealthy (${unhealthy.length})`, accent: unhealthy.length > 0 },
    { key: 'nolimit', label: `No Limit (${noLimitCount})` },
  ];

  const healthClass = (h: string) =>
    h === 'healthy' ? 'text-green-600' : h === 'unhealthy' ? 'text-red-600' : h === 'starting' ? 'text-yellow-600' : 'text-gray-400';
  // Coarse uptime: largest whole unit only (s, m, h, d).
  const fmtUptime = (s: number) =>
    s < 60 ? `${s}s` : s < 3600 ? `${Math.floor(s / 60)}m` : s < 86400 ? `${Math.floor(s / 3600)}h` : `${Math.floor(s / 86400)}d`;
  const fmtMem = (mib: number) =>
    mib >= 1024 ? `${(mib / 1024).toFixed(1)}G` : `${mib}M`;

  return (
    <div className="bg-white border border-gray-200 rounded-lg overflow-hidden">
      {/* Header toggle */}
      <button
        type="button"
        aria-expanded={open}
        className="w-full flex items-center justify-between px-6 py-4 text-left hover:bg-gray-50"
        onClick={handleToggle}
      >
        <div className="flex items-center gap-2">
          <Server className="w-5 h-5 text-gray-500" />
          <span className="font-semibold text-gray-900">All Containers</span>
          {loaded && (
            <span className="text-xs text-gray-400">
              {runningCount} running · {containers.length} total
            </span>
          )}
          {!open && unhealthy.length > 0 && (
            <span className="flex items-center gap-1 text-xs text-red-700 bg-red-50 border border-red-200 px-2 py-0.5 rounded-full">
              <AlertCircle className="w-3 h-3" /> {unhealthy.length} unhealthy
            </span>
          )}
        </div>
        {open ? <ChevronUp className="w-4 h-4 text-gray-400" /> : <ChevronDown className="w-4 h-4 text-gray-400" />}
      </button>
      {open && (
        <div className="border-t border-gray-100">
          {/* Toolbar */}
          <div className="px-4 py-3 border-b border-gray-100 flex flex-wrap items-center gap-3 justify-between">
            <div className="flex flex-wrap items-center gap-2">
              {chips.map(chip => (
                <button
                  type="button"
                  key={chip.key}
                  onClick={() => setFilter(chip.key)}
                  className={`px-3 py-1 text-xs rounded-full border transition-colors ${
                    filter === chip.key
                      ? 'bg-blue-600 border-blue-500 text-white'
                      : chip.accent
                      ? 'bg-red-50 border-red-200 text-red-700 hover:bg-red-100'
                      : 'bg-gray-100 border-gray-200 text-gray-600 hover:bg-gray-200'
                  }`}
                >
                  {chip.label}
                </button>
              ))}
              <select
                value={stackFilter}
                onChange={e => setStackFilter(e.target.value)}
                className="px-2 py-1 text-xs border border-gray-200 rounded-full bg-white text-gray-600"
              >
                <option value="">Stack</option>
                {stacks.map(s => <option key={s} value={s}>{s}</option>)}
              </select>
            </div>
            <div className="flex items-center gap-2">
              {unhealthy.length > 0 && (
                <button
                  type="button"
                  onClick={() => setBulkConfirm(true)}
                  className="flex items-center gap-1.5 px-3 py-1.5 text-xs font-medium text-red-700 bg-red-50 border border-red-200 rounded-md hover:bg-red-100"
                >
                  <RotateCw className="w-3.5 h-3.5" />
                  Restart {unhealthy.length} unhealthy
                </button>
              )}
              <button
                type="button"
                onClick={loadContainers}
                disabled={loading}
                className="p-1.5 text-gray-500 bg-gray-50 border border-gray-200 rounded-md hover:bg-gray-100 disabled:opacity-50"
                title="Refresh"
              >
                <RefreshCw className={`w-3.5 h-3.5 ${loading ? 'animate-spin' : ''}`} />
              </button>
            </div>
          </div>
          {/* Table */}
          {loading && !loaded ? (
            <div className="py-10 text-center text-sm text-gray-400">
              Collecting container stats
            </div>
          ) : (
            <div className="overflow-x-auto">
              <table className="w-full text-sm">
                <thead>
                  <tr className="border-b border-gray-100 bg-gray-50">
                    <th className="px-4 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wide">Container</th>
                    <th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wide">Health</th>
                    <th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wide">Uptime</th>
                    <th className="px-3 py-2 text-right text-xs font-medium text-gray-500 uppercase tracking-wide">CPU</th>
                    <th className="px-3 py-2 text-right text-xs font-medium text-gray-500 uppercase tracking-wide">RAM</th>
                    {/* Restart-count column: previously rendered with an empty header */}
                    <th className="px-3 py-2 text-right text-xs font-medium text-gray-500 uppercase tracking-wide" title="Restart count">Restarts</th>
                    <th className="px-3 py-2 w-16"></th>
                  </tr>
                </thead>
                <tbody className="divide-y divide-gray-50">
                  {filtered.map(c => (
                    <Fragment key={c.name}>
                      <tr className={`hover:bg-gray-50 ${c.health === 'unhealthy' ? 'bg-red-50/40' : ''}`}>
                        <td className="px-4 py-2.5">
                          <p className="font-mono text-xs font-medium text-gray-900 truncate max-w-[240px]" title={c.name}>{c.name}</p>
                          <p className="text-xs text-gray-400">{c.stack}</p>
                        </td>
                        <td className="px-3 py-2.5">
                          <span className={`text-xs font-medium ${healthClass(c.health)}`}>
                            {c.health === 'none' ? '—' : c.health}
                          </span>
                          {!isRunning(c) && (
                            <p className="text-xs text-gray-400">{c.state}</p>
                          )}
                        </td>
                        <td className="px-3 py-2.5 text-xs text-gray-600 tabular-nums">
                          {isRunning(c) ? fmtUptime(c.uptimeSecs) : '—'}
                        </td>
                        <td className="px-3 py-2.5 text-xs text-right tabular-nums text-gray-600">
                          {isRunning(c) ? `${c.cpuPercent}%` : '—'}
                        </td>
                        <td className="px-3 py-2.5 text-xs text-right tabular-nums text-gray-600">
                          {isRunning(c) ? (
                            <>
                              {fmtMem(c.memMiB)}
                              {c.memLimitMiB === 0 && (
                                // Visible marker; an empty span showed nothing to the user.
                                <span className="ml-0.5 text-yellow-600" title="No memory limit set">⚠</span>
                              )}
                            </>
                          ) : '—'}
                        </td>
                        <td className="px-3 py-2.5 text-right tabular-nums">
                          {c.restartCount > 0 ? (
                            <span className={`text-xs font-medium ${c.restartCount > 5 ? 'text-red-600' : 'text-yellow-700'}`}>
                              {c.restartCount}
                            </span>
                          ) : (
                            <span className="text-xs text-gray-300">0</span>
                          )}
                        </td>
                        <td className="px-3 py-2.5">
                          <div className="flex items-center gap-1 justify-end">
                            <button
                              type="button"
                              onClick={() => handleViewLogs(c.name)}
                              className={`p-1 rounded hover:bg-gray-100 ${expandedLog === c.name ? 'text-blue-600' : 'text-gray-400 hover:text-gray-600'}`}
                              title="View logs"
                            >
                              <Terminal className="w-3.5 h-3.5" />
                            </button>
                            {isRunning(c) && (
                              <button
                                type="button"
                                onClick={() => onRestart(c.name)}
                                disabled={restarting.has(c.name)}
                                className="p-1 rounded text-gray-400 hover:text-blue-700 hover:bg-blue-50 disabled:opacity-40"
                                title="Restart"
                              >
                                <RotateCw className={`w-3.5 h-3.5 ${restarting.has(c.name) ? 'animate-spin' : ''}`} />
                              </button>
                            )}
                          </div>
                        </td>
                      </tr>
                      {expandedLog === c.name && (
                        <tr>
                          <td colSpan={7} className="px-4 pb-3 pt-0 bg-gray-50">
                            {logsLoading === c.name ? (
                              <p className="text-xs text-gray-400 py-2 text-center">Loading logs</p>
                            ) : (
                              <pre className="text-xs font-mono bg-gray-900 text-gray-200 rounded p-3 max-h-52 overflow-y-auto whitespace-pre-wrap leading-relaxed">
                                {/* `||` so an empty log also shows the placeholder */}
                                {logCache[c.name] || '(no logs)'}
                              </pre>
                            )}
                          </td>
                        </tr>
                      )}
                    </Fragment>
                  ))}
                </tbody>
              </table>
              {filtered.length === 0 && (
                <p className="py-8 text-center text-sm text-gray-400">No containers match filter</p>
              )}
            </div>
          )}
        </div>
      )}
      {/* Bulk restart modal */}
      {bulkConfirm && (
        <div
          role="dialog"
          aria-modal="true"
          className="fixed inset-0 bg-black/40 flex items-center justify-center z-50"
        >
          <div className="bg-white rounded-xl shadow-xl p-6 max-w-sm w-full mx-4">
            <h4 className="font-semibold text-gray-900 mb-2">
              Restart {unhealthy.length} unhealthy container{unhealthy.length !== 1 ? 's' : ''}?
            </h4>
            <ul className="mb-4 space-y-1 max-h-32 overflow-y-auto">
              {unhealthy.map(c => (
                <li key={c.name} className="text-sm font-mono text-red-700">{c.name}</li>
              ))}
            </ul>
            <div className="flex gap-3 justify-end">
              <button
                type="button"
                onClick={() => setBulkConfirm(false)}
                className="px-4 py-2 text-sm text-gray-700 bg-gray-100 rounded-md hover:bg-gray-200"
              >
                Cancel
              </button>
              <button
                type="button"
                onClick={handleBulkRestart}
                className="px-4 py-2 text-sm font-medium text-white bg-red-600 rounded-md hover:bg-red-700"
              >
                Restart All
              </button>
            </div>
          </div>
        </div>
      )}
    </div>
  );
}
// ── Check card meta ────────────────────────────────────────────────────────
const CHECK_META: Record<string, { label: string; icon: React.ElementType }> = {
@ -1074,6 +1376,12 @@ export default function VmHealthPage() {
onOpen={() => { if (!trendsLoaded && !trendsLoading) loadTrends(); }}
/>
{/* ── All containers ── */}
<ContainersPanel
restarting={restarting}
onRestart={handleRestart}
/>
{/* ── Cleanup section ── */}
<div className="bg-white border border-gray-200 rounded-lg p-6">
<div className="flex items-center gap-3 mb-5">

View File

@ -489,6 +489,19 @@ export interface OllamaModelsResponse {
running: OllamaRunning[];
}
/**
 * Client-side shape of one element of the array returned by
 * GET /api/vm/containers (see vmApi.getAllContainers).
 */
export interface ContainerInfo {
/** Container name; also used as the path segment when requesting its logs. */
name: string;
/** Image reference of the container. */
image: string;
/** Stack the container is grouped under in the stack filter. */
stack: string;
/** Container state; the UI treats 'running' specially and shows other values as text. */
state: string;
/** Health status; the UI styles 'healthy', 'unhealthy' and 'starting', and renders 'none' as a dash. */
health: string;
/** Uptime in seconds; only displayed for running containers. */
uptimeSecs: number;
/** CPU usage in percent; only displayed for running containers. */
cpuPercent: number;
/** Memory in use, in MiB; only displayed for running containers. */
memMiB: number;
/** Memory limit in MiB; 0 is treated as "no limit" by the No Limit filter. */
memLimitMiB: number;
/** Number of restarts; highlighted when greater than 0, and in red above 5. */
restartCount: number;
}
export const vmApi = {
getHealth: () => apiRequest<VmHealthResult>('/api/vm/health'),
getCleanupLog: (lines = 40) =>
@ -518,6 +531,9 @@ export const vmApi = {
apiRequest<{ available: TrendSeries; swap: TrendSeries }>(
`/api/vm/metrics/trend?metric=memory&range=${range}`,
),
getAllContainers: () => apiRequest<ContainerInfo[]>('/api/vm/containers'),
getContainerLogs: (name: string, lines = 50) =>
apiRequest<{ logs: string }>(`/api/vm/containers/${encodeURIComponent(name)}/logs?lines=${lines}`),
};
export interface TrendPoint { t: number; v: number }