bytelyst-devops-tools/dashboard/backend/src/modules/vm/repository.ts
Hermes VM 2fc23d6baa feat(vm): fix devops-backend VM module — Phase 0.1 complete
- Switch backend runner from node:20-alpine to node:20-slim so GNU df
  flags (--output=pcent/avail) work inside the container
- Add volume mounts to docker-compose.yml: scripts (ro), VM logs (rw),
  docker.sock; set VM_SCRIPTS_PATH + VM_LOG_DIR env vars
- Rebuild repository.ts: env-configurable paths, cron history parser,
  unhealthy-container inspector, Ollama model endpoints
- Add routes: GET /api/vm/cron-status, unhealthy containers, Ollama
  models, container restart, model unload
- vm-cleanup.sh: add step_cosmos_pglog, step_docker_aged_images; fix
  (( count++ )) → count=$(( count + 1 )) for set -e compatibility
- Add docs/VM_OBSERVABILITY_ROADMAP.md

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 21:13:45 +00:00

355 lines
12 KiB
TypeScript

import { exec } from 'child_process';
import { promisify } from 'util';
import { hostname } from 'os';
import { readFile } from 'fs/promises';
// Promise-based wrapper around child_process.exec. Callers in this module
// read `stdout` from the resolved value and, on failure, `stdout`/`stderr`/
// `message` from the rejection.
const execAsync = promisify(exec);
// Paths are env-configurable so they work both in the Docker container (via
// volume mounts) and when the backend is run directly on the host for dev.
// VM_SCRIPTS_PATH overrides the directory holding the maintenance scripts.
const SCRIPTS_PATH = process.env.VM_SCRIPTS_PATH
?? '/opt/bytelyst/learning_ai_devops_tools/scripts/VMs/HostingerVM';
// VM_LOG_DIR overrides the directory the cleanup log is read from.
const LOG_DIR = process.env.VM_LOG_DIR ?? '/var/log';
// Script invoked by runVmHealthCheck (with --json).
const HEALTH_SCRIPT = `${SCRIPTS_PATH}/vm-health-check.sh`;
// Script invoked by runVmCleanup.
const CLEANUP_SCRIPT = `${SCRIPTS_PATH}/vm-cleanup.sh`;
// Log file tailed by getCleanupLog and parsed by getCronStatus.
const CLEANUP_LOG = `${LOG_DIR}/vm-cleanup.log`;
// ---------------------------------------------------------------------------
// Health check (vm-health-check.sh --json)
// ---------------------------------------------------------------------------
/**
 * Runs the health-check script in JSON mode and returns its parsed report.
 *
 * The script exits 1 (WARN) or 2 (CRIT) while still printing a JSON report,
 * so a failed invocation is first inspected for usable stdout. When no report
 * can be recovered, a synthetic CRIT result carrying the failure reason is
 * returned instead.
 *
 * @returns The parsed script output, or a fallback object with
 *          `overall: 'CRIT'`, empty `checks` and an `error` string.
 */
export async function runVmHealthCheck() {
  try {
    const result = await execAsync(`bash "${HEALTH_SCRIPT}" --json 2>/dev/null`, {
      timeout: 30_000,
    });
    return JSON.parse(result.stdout);
  } catch (error: any) {
    // A non-zero exit rejects the promise, but the rejection may still carry
    // the report the script wrote before exiting.
    const partialOutput = error.stdout;
    if (partialOutput) {
      try {
        return JSON.parse(partialOutput);
      } catch {
        // stdout was not a JSON report; use the synthetic result below.
      }
    }

    const reason = error.stderr || error.message || error;
    return {
      timestamp: new Date().toISOString(),
      hostname: hostname(),
      overall: 'CRIT',
      checks: {},
      error: String(reason),
    };
  }
}
// ---------------------------------------------------------------------------
// Cleanup log — raw tail
// ---------------------------------------------------------------------------
/**
 * Returns the last `lines` lines of the cleanup log as a trimmed string.
 *
 * `lines` ends up inside a shell command, so it is coerced to a non-negative
 * integer first. Anything else (NaN, negative, non-finite, or a non-numeric
 * value that slipped past the type system, e.g. from a query string) falls
 * back to the default of 50 rather than reaching the shell.
 *
 * @param lines Number of trailing lines to return (default 50).
 * @returns The log tail, `(log not found)` when `tail` fails, or
 *          `(log not available)` when the command itself cannot be run.
 */
export async function getCleanupLog(lines = 50): Promise<string> {
  const requested = Math.floor(Number(lines));
  const count = Number.isFinite(requested) && requested >= 0 ? requested : 50;
  try {
    const { stdout } = await execAsync(
      // `-n N` is the portable spelling of the legacy `-N` form.
      `tail -n ${count} "${CLEANUP_LOG}" 2>/dev/null || echo "(log not found)"`,
      { timeout: 5_000 }
    );
    return stdout.trim();
  } catch {
    return '(log not available)';
  }
}
// ---------------------------------------------------------------------------
// Cron status — parsed run history + next scheduled times
// ---------------------------------------------------------------------------
/** One cleanup run as reconstructed from a `[START]`-delimited block of the cleanup log. */
export interface CronRunSummary {
/** Timestamp string captured from the run's start line. */
timestamp: string;
/** `'full'` when the start line says `mode=full`, otherwise `'standard'`. */
mode: 'standard' | 'full';
/** Raw `before=` token of the `[DISK]` line; empty string when the line is absent. */
diskBefore: string;
/** Raw `after=` token of the `[DISK]` line; empty string when the line is absent. */
diskAfter: string;
/** Difference between the `G`-suffixed before/after figures, converted to MB and rounded. */
freedMB: number;
/** Seconds between the start timestamp and the timestamp preceding `[END]`; 0 when unknown. */
durationSecs: number;
/** True when the block contains an `[END]` marker. */
success: boolean;
/** Text following each `[CMD]` marker, in log order. */
steps: string[];
/** Parsed payload of the block's `[JSON]` line, when one is present. */
jsonSummary?: Record<string, unknown>;
}
/** A scheduled maintenance job as reported by getCronStatus. */
export interface CronJob {
/** Short identifier of the job. */
name: string;
/** Five-field cron expression describing when the job runs. */
schedule: string;
/** Human-readable summary of what the job does. */
description: string;
/** Most recent parsed cleanup run associated with this job, or null when none was found. */
lastRun: CronRunSummary | null;
/** ISO-8601 timestamp of the next scheduled run, or null when it is not known. */
nextRun: string | null;
}
/**
 * Collects the cron overview: the known maintenance jobs with their last and
 * next run, plus the 20 most recent runs parsed from the cleanup log.
 *
 * Both inputs are best-effort. An unreadable log or a failing `crontab -l`
 * is treated as empty text, so this function still resolves with whatever
 * can be derived.
 */
export async function getCronStatus(): Promise<{ jobs: CronJob[]; recentRuns: CronRunSummary[] }> {
  const readLogText = async (): Promise<string> => {
    try {
      return await readFile(CLEANUP_LOG, 'utf8');
    } catch {
      return '';
    }
  };

  const readCrontabText = async (): Promise<string> => {
    try {
      const listing = await execAsync('crontab -l 2>/dev/null');
      return listing.stdout;
    } catch {
      return '';
    }
  };

  // Start both reads before awaiting so they run concurrently.
  const [logText, crontabText] = await Promise.all([readLogText(), readCrontabText()]);

  const allRuns = parseCleanupLog(logText);
  const jobs = buildJobList(crontabText, allRuns);
  const recentRuns = allRuns.slice(0, 20);
  return { jobs, recentRuns };
}
/**
 * Parses the raw cleanup log into per-run summaries, most recent first.
 *
 * The log is split on `[START]` markers; text before the first marker is
 * ignored. Within each block the parser looks for:
 *   - `[<timestamp>] mode=<word>`  — required; blocks without it are skipped
 *   - `[DISK] before=<tok> after=<tok>`
 *   - `[CMD] <text>`               — any number of lines
 *   - `[JSON] {...}`               — optional machine-readable summary
 *   - `[END]`                      — marks the run as successful
 *
 * A `[JSON]` payload that is not valid JSON only leaves `jsonSummary`
 * undefined; it no longer discards the whole run.
 *
 * @param raw Full text of the cleanup log (may be empty).
 * @returns One summary per recognised block, in reverse log order.
 */
function parseCleanupLog(raw: string): CronRunSummary[] {
  const runs: CronRunSummary[] = [];
  // Runs are delimited by [START] lines; index 0 is the preamble before the first run.
  const blocks = raw.split(/\[START\]/).slice(1);

  for (const block of blocks) {
    try {
      const startLine = block.match(/\[(\d{4}-\d{2}-\d{2}T[\d:Z]+)\] mode=(\w+)/);
      if (!startLine) continue;

      const timestamp = startLine[1];
      const mode = startLine[2] === 'full' ? 'full' : 'standard';

      const diskLine = block.match(/\[DISK\] before=([^\s]+) after=([^\s]+)/);
      const cmdLines = [...block.matchAll(/\[CMD\] (.+)/g)].map(m => m[1]);
      const jsonMatch = block.match(/\[JSON\] ({.+})/);
      const endIdx = block.indexOf('[END]');
      const finished = endIdx !== -1;

      // Freed space is derived from the G-suffixed figures only; tokens in
      // other units contribute 0, matching the log format this was written for.
      let freedMB = 0;
      let diskBefore = '';
      let diskAfter = '';
      if (diskLine) {
        diskBefore = diskLine[1].trim();
        diskAfter = diskLine[2].trim();
        const gbBefore = parseFloat(diskLine[1].match(/([\d.]+)G/)?.[1] ?? '0');
        const gbAfter = parseFloat(diskLine[2].match(/([\d.]+)G/)?.[1] ?? '0');
        freedMB = Math.round((gbBefore - gbAfter) * 1024);
      }

      // Rough duration: start timestamp to the timestamp printed just before
      // the [END] marker. The offsets are clamped so that an [END] close to
      // the start of the block cannot turn into a negative slice index, which
      // would read from the end of the string instead.
      const startTs = new Date(timestamp).getTime();
      let endTs = startTs;
      if (finished) {
        const preceding = block.slice(Math.max(0, endIdx - 28), Math.max(0, endIdx - 2));
        const endStamp = preceding.match(/\d{4}-\d{2}-\d{2}T[\d:Z]+/)?.[0] ?? timestamp;
        endTs = new Date(endStamp).getTime();
      }
      const durationSecs = Math.round((endTs - startTs) / 1000);

      // Parse the summary on its own so a truncated or corrupt payload does
      // not throw away the rest of the run.
      let jsonSummary: Record<string, unknown> | undefined;
      if (jsonMatch) {
        try {
          jsonSummary = JSON.parse(jsonMatch[1]);
        } catch {
          jsonSummary = undefined;
        }
      }

      runs.push({
        timestamp,
        mode,
        diskBefore,
        diskAfter,
        freedMB,
        durationSecs: isNaN(durationSecs) ? 0 : durationSecs,
        success: finished,
        steps: cmdLines,
        jsonSummary,
      });
    } catch {
      // Skip malformed blocks
    }
  }

  return runs.reverse(); // most recent first
}
/**
 * Builds the list of known maintenance jobs, attaching to each the latest
 * matching cleanup run and the next scheduled time.
 *
 * The job definitions are a fixed table. The job flagged `mode: 'full'` is
 * paired with the newest `full` run; every other job is paired with the
 * newest `standard` run.
 * NOTE(review): this means `build-cache-prune` and `health-check` report the
 * latest standard cleanup run as their own — confirm that is intended.
 *
 * @param crontab Output of `crontab -l`. Currently not consulted: the
 *                previous implementation extracted the managed section into
 *                a local that was never read, so that dead code was removed.
 * @param runs    Parsed cleanup runs, most recent first.
 */
function buildJobList(crontab: string, runs: CronRunSummary[]): CronJob[] {
  // Kept in the signature for callers; the schedules come from the table below.
  void crontab;

  const defs: Array<{ name: string; schedule: string; description: string; mode?: string }> = [
    { name: 'build-cache-prune', schedule: '0 3 * * *', description: 'Daily build cache prune' },
    { name: 'weekly-cleanup', schedule: '0 2 * * 0', description: 'Weekly standard cleanup' },
    { name: 'monthly-full', schedule: '0 1 1 * *', description: 'Monthly full cleanup', mode: 'full' },
    { name: 'health-check', schedule: '0 7 * * *', description: 'Daily health check + Telegram alert' },
  ];

  return defs.map(def => {
    // `runs` is newest-first, so the first hit is the latest run of that mode.
    const matchingRun = def.mode === 'full'
      ? runs.find(r => r.mode === 'full')
      : runs.find(r => r.mode === 'standard');
    const nextRun = computeNextRun(def.schedule);
    return { ...def, lastRun: matchingRun ?? null, nextRun };
  });
}
/**
 * Very lightweight cron next-run calculator for 5-field expressions whose
 * minute and hour are plain numbers.
 *
 * Supported shapes, evaluated in the process's local time zone:
 *   - `m h * * d`  weekly on day-of-week `d`
 *   - `m h D * *`  monthly on day-of-month `D` (day-of-week is ignored)
 *   - `m h * * *`  daily
 * The month field is ignored, and steps, lists and ranges are not understood.
 *
 * For weekly schedules, a run that is still due later today is now returned
 * as today's run. Previously, being on the target weekday always pushed the
 * result a full week ahead.
 *
 * @param expr Cron expression; fields may be separated by any whitespace.
 * @param now  Reference instant (defaults to the current time).
 * @returns ISO-8601 timestamp of the next run strictly after `now`.
 */
function computeNextRun(expr: string, now: Date = new Date()): string {
  const [min, hr, dom, , dow] = expr.trim().split(/\s+/);
  const next = new Date(now);
  next.setSeconds(0, 0);
  next.setMinutes(parseInt(min, 10));
  next.setHours(parseInt(hr, 10));

  if (dom === '*' && dow !== '*') {
    // Weekly: advance to the correct day-of-week. A difference of 0 means
    // "today", which is only valid while the scheduled time is still ahead.
    const targetDow = parseInt(dow, 10);
    let dayDiff = (targetDow - next.getDay() + 7) % 7;
    if (dayDiff === 0 && next <= now) dayDiff = 7;
    next.setDate(next.getDate() + dayDiff);
  } else if (dom !== '*') {
    // Monthly: advance to the correct day-of-month
    next.setDate(parseInt(dom, 10));
    if (next <= now) next.setMonth(next.getMonth() + 1);
  } else {
    // Daily: just advance to tomorrow if already passed today
    if (next <= now) next.setDate(next.getDate() + 1);
  }
  return next.toISOString();
}
// ---------------------------------------------------------------------------
// Trigger cleanup (container runs as root — no sudo needed)
// ---------------------------------------------------------------------------
/**
 * Runs the cleanup script in the requested mode and reports its output.
 *
 * Mode to script flags:
 *   - `monthly`  → `--full --quiet`
 *   - `dry-run`  → `--dry-run`
 *   - `weekly`   → `--quiet`
 *
 * stderr is redirected into stdout by the command itself. The call is given
 * 120 seconds before it is treated as failed.
 *
 * @param mode Which cleanup variant to run.
 * @returns `success: true` with the combined output when the script exits 0;
 *          otherwise `success: false` with whatever output was captured, or
 *          the error message when nothing was captured.
 */
export async function runVmCleanup(
  mode: 'weekly' | 'monthly' | 'dry-run',
): Promise<{ success: boolean; output: string }> {
  let args: string;
  switch (mode) {
    case 'monthly':
      args = '--full --quiet';
      break;
    case 'dry-run':
      args = '--dry-run';
      break;
    default:
      args = '--quiet';
  }

  try {
    const result = await execAsync(
      `bash "${CLEANUP_SCRIPT}" ${args} 2>&1`,
      { timeout: 120_000 },
    );
    const combined = result.stdout + result.stderr;
    return { success: true, output: combined.trim() };
  } catch (error: any) {
    const captured = ((error.stdout ?? '') + (error.stderr ?? '')).trim();
    if (captured) {
      return { success: false, output: captured };
    }
    return { success: false, output: String(error.message ?? error) };
  }
}
// ---------------------------------------------------------------------------
// Unhealthy containers (docker inspect via shell — no Docker SDK needed)
// ---------------------------------------------------------------------------
/** Summary of one container that `docker ps --filter health=unhealthy` listed. */
export interface UnhealthyContainer {
/** Container name as printed by `docker ps`. */
name: string;
/** `State.Status` from `docker inspect`, or `'unknown'` when it could not be read. */
status: string;
/** `RestartCount` from `docker inspect`; 0 when unavailable. */
restartCount: number;
/** Trimmed `Output` of the last three entries of `State.Health.Log`. */
lastHealthLogs: string[];
/** `Start` of the first entry in `State.Health.Log`, or null when there is none. */
unhealthySince: string | null;
}
/**
 * Lists containers Docker currently reports as unhealthy, with a few details
 * taken from `docker inspect` for each of them.
 *
 * Failures are absorbed at two levels: if one container cannot be inspected
 * it is still returned with placeholder values, and if the listing itself
 * fails the result is an empty array.
 *
 * `unhealthySince` is the `Start` of the first entry in the container's
 * health log, i.e. the oldest probe present in the inspect output.
 */
export async function getUnhealthyContainers(): Promise<UnhealthyContainer[]> {
  // Inspect a single container, degrading to placeholders on any error.
  const describe = async (name: string): Promise<UnhealthyContainer> => {
    try {
      const inspected = await execAsync(
        `docker inspect "${name}" 2>/dev/null`,
        { timeout: 5_000 },
      );
      const data = JSON.parse(inspected.stdout)?.[0];
      const health = data?.State?.Health ?? {};
      const recentProbes: any[] = (health.Log ?? []).slice(-3);
      const lastHealthLogs: string[] = recentProbes.map((probe: any) => probe.Output?.trim() ?? '');
      return {
        name,
        status: data?.State?.Status ?? 'unknown',
        restartCount: data?.RestartCount ?? 0,
        lastHealthLogs,
        unhealthySince: health.Log?.[0]?.Start ?? null,
      };
    } catch {
      return { name, status: 'unknown', restartCount: 0, lastHealthLogs: [], unhealthySince: null };
    }
  };

  try {
    const listing = await execAsync(
      `docker ps --filter health=unhealthy --format '{{.Names}}' 2>/dev/null`,
      { timeout: 10_000 },
    );
    const names = listing.stdout.trim().split('\n').filter(Boolean);
    if (names.length === 0) {
      return [];
    }
    return await Promise.all(names.map(name => describe(name)));
  } catch {
    return [];
  }
}
/**
 * Restarts a Docker container by name via `docker restart`.
 *
 * The name is interpolated into a shell command, so it is validated against
 * Docker's own name grammar first: an alphanumeric first character followed
 * by alphanumerics, `_`, `.` or `-`. Requiring an alphanumeric first
 * character stops values such as `--time` from being parsed by the docker
 * CLI as options. Allowing `.` lets valid container names that contain dots
 * be restarted.
 *
 * @param name Container name (or ID) to restart.
 * @returns `success: true` when docker exits 0; otherwise `success: false`
 *          with a validation message or docker's error text.
 */
export async function restartContainer(name: string): Promise<{ success: boolean; message: string }> {
  if (!/^[a-zA-Z0-9][a-zA-Z0-9_.-]*$/.test(name)) {
    return { success: false, message: 'Invalid container name' };
  }
  try {
    await execAsync(`docker restart "${name}"`, { timeout: 30_000 });
    return { success: true, message: `${name} restarted` };
  } catch (error: any) {
    return { success: false, message: String(error.stderr || error.message || error) };
  }
}
// ---------------------------------------------------------------------------
// Ollama models
// ---------------------------------------------------------------------------
/** An installed Ollama model as reported by getOllamaModels. */
export interface OllamaModel {
/** Model name as returned by the Ollama API; empty string when missing. */
name: string;
/** Reported size divided by 1e9 and rounded to two decimals. */
sizeGB: number;
/** The API's `modified_at` value, passed through as-is; empty string when missing. */
modifiedAt: string;
}
/** A currently loaded Ollama model as reported by getOllamaModels. */
export interface OllamaRunning {
/** Model name as returned by the Ollama API; empty string when missing. */
name: string;
/** Reported size divided by 1e9 and rounded to two decimals. */
sizeGB: number;
/** Descriptive string filled in by getOllamaModels; empty when unavailable. */
processor: string;
/** The API's `expires_at` value, passed through as-is; empty string when missing. */
expiresAt: string;
}
// Base URL of the Ollama REST API. OLLAMA_BASE_URL overrides it; the default
// targets host-gateway, which resolves to the Docker host where
// `ollama serve` listens on port 11434.
const OLLAMA_BASE = process.env.OLLAMA_BASE_URL ?? 'http://host-gateway:11434';

/**
 * Performs a request against the Ollama API and returns the decoded JSON body.
 *
 * A 10-second abort signal is applied by default. Because `opts` is spread
 * after it, a caller-supplied `signal` takes precedence.
 *
 * @param path API path appended verbatim to the base URL (e.g. `/api/tags`).
 * @param opts Optional fetch options merged over the defaults.
 * @throws Error `Ollama <path>: <status>` when the response status is not OK;
 *         network failures and timeouts propagate from `fetch`.
 */
async function ollamaFetch(path: string, opts?: RequestInit): Promise<unknown> {
  const url = `${OLLAMA_BASE}${path}`;
  const init: RequestInit = { signal: AbortSignal.timeout(10_000), ...opts };
  const response = await fetch(url, init);
  if (response.ok) {
    return response.json();
  }
  throw new Error(`Ollama ${path}: ${response.status}`);
}
/**
 * Returns the installed Ollama models (`/api/tags`) and the currently loaded
 * ones (`/api/ps`).
 *
 * Each endpoint is queried independently and falls back to an empty list on
 * failure, so one unavailable endpoint does not hide the other. Any
 * unexpected error while shaping the data yields two empty lists.
 *
 * `processor` describes where a loaded model resides, derived from the
 * `size_vram` and `size` fields of `/api/ps` the same way `ollama ps` prints
 * its PROCESSOR column (`100% GPU`, `100% CPU`, or a `CPU/GPU` split). It was
 * previously filled from `details.families`, which names the model
 * architecture rather than the processor. When the sizes are not reported
 * the field is an empty string.
 */
export async function getOllamaModels(): Promise<{ models: OllamaModel[]; running: OllamaRunning[] }> {
  // Bytes → GB with two decimals.
  const toGB = (bytes: any): number => parseFloat(((bytes ?? 0) / 1e9).toFixed(2));

  // Mirrors the PROCESSOR column of `ollama ps`.
  const describeProcessor = (m: any): string => {
    const total = m.size;
    const inVram = m.size_vram;
    if (typeof total !== 'number' || typeof inVram !== 'number') return '';
    if (inVram === 0) return '100% CPU';
    if (inVram === total) return '100% GPU';
    if (inVram > total || total === 0) return 'Unknown';
    const cpuPercent = Math.round(((total - inVram) / total) * 100);
    return `${cpuPercent}%/${100 - cpuPercent}% CPU/GPU`;
  };

  try {
    const [tagsData, psData] = await Promise.all([
      ollamaFetch('/api/tags').catch(() => ({ models: [] })),
      ollamaFetch('/api/ps').catch(() => ({ models: [] })),
    ]);
    const models = ((tagsData as any).models ?? []).map((m: any) => ({
      name: m.name ?? '',
      sizeGB: toGB(m.size),
      modifiedAt: m.modified_at ?? '',
    }));
    const running = ((psData as any).models ?? []).map((m: any) => ({
      name: m.name ?? '',
      sizeGB: toGB(m.size),
      processor: describeProcessor(m),
      expiresAt: m.expires_at ?? '',
    }));
    return { models, running };
  } catch {
    return { models: [], running: [] };
  }
}
/**
 * Asks Ollama to unload a model from memory.
 *
 * The unload is requested by posting to `/api/generate` with `keep_alive`
 * set to 0 and no prompt. Names containing anything other than word
 * characters, `.`, `:`, `-` or `/` are rejected before any request is made.
 *
 * @param name Model name, e.g. `llama3:8b`.
 * @returns `success: true` when the API accepted the request; otherwise
 *          `success: false` with a validation or error message.
 */
export async function unloadOllamaModel(name: string): Promise<{ success: boolean; message: string }> {
  const nameIsValid = /^[\w.:\-/]+$/.test(name);
  if (!nameIsValid) {
    return { success: false, message: 'Invalid model name' };
  }

  try {
    const request: RequestInit = {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      // keep_alive: 0 tells Ollama to drop the model immediately.
      body: JSON.stringify({ model: name, keep_alive: 0 }),
    };
    await ollamaFetch('/api/generate', request);
    return { success: true, message: `${name} unloaded` };
  } catch (error: any) {
    const reason = error.message ?? error;
    return { success: false, message: String(reason) };
  }
}