- Switch backend runner from node:20-alpine to node:20-slim so GNU df flags (--output=pcent/avail) work inside the container
- Add volume mounts to docker-compose.yml: scripts (ro), VM logs (rw), docker.sock; set VM_SCRIPTS_PATH + VM_LOG_DIR env vars
- Rebuild repository.ts: env-configurable paths, cron history parser, unhealthy-container inspector, Ollama model endpoints
- Add routes: GET /api/vm/cron-status, unhealthy containers, Ollama models, container restart, model unload
- vm-cleanup.sh: add step_cosmos_pglog, step_docker_aged_images; fix (( count++ )) → count=$(( count + 1 )) for set -e compatibility
- Add docs/VM_OBSERVABILITY_ROADMAP.md

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
355 lines
12 KiB
TypeScript
355 lines
12 KiB
TypeScript
import { exec } from 'child_process';
|
|
import { promisify } from 'util';
|
|
import { hostname } from 'os';
|
|
import { readFile } from 'fs/promises';
|
|
|
|
/** Promise-returning wrapper around `child_process.exec`. */
const execAsync = promisify(exec);

// Script and log locations can be overridden through environment variables so
// the same code works inside the Docker container (where the paths come from
// volume mounts) and when the backend runs directly on the host for development.
const SCRIPTS_PATH =
  process.env.VM_SCRIPTS_PATH ?? '/opt/bytelyst/learning_ai_devops_tools/scripts/VMs/HostingerVM';
const LOG_DIR = process.env.VM_LOG_DIR ?? '/var/log';

// Derived file locations used by the functions below.
const HEALTH_SCRIPT = SCRIPTS_PATH + '/vm-health-check.sh';
const CLEANUP_SCRIPT = SCRIPTS_PATH + '/vm-cleanup.sh';
const CLEANUP_LOG = LOG_DIR + '/vm-cleanup.log';
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Health check (vm-health-check.sh --json)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Runs `vm-health-check.sh --json` and returns the parsed JSON report.
 *
 * The script exits 1 (WARN) or 2 (CRIT) while still printing valid JSON on
 * stdout, so a rejected exec is inspected for usable output before giving up.
 * When no JSON can be recovered, a synthetic report with `overall: 'CRIT'`,
 * empty `checks`, and the failure text in `error` is returned instead.
 */
export async function runVmHealthCheck() {
  const command = `bash "${HEALTH_SCRIPT}" --json 2>/dev/null`;

  try {
    const result = await execAsync(command, { timeout: 30_000 });
    return JSON.parse(result.stdout);
  } catch (error: unknown) {
    // Runtime shape of an exec rejection; the cast only informs the type checker.
    const failure = error as { stdout?: string; stderr?: string; message?: string };

    // Non-zero exit: stdout may still hold the WARN/CRIT report.
    if (failure.stdout) {
      try {
        return JSON.parse(failure.stdout);
      } catch {
        // Not JSON either; fall through to the synthetic report.
      }
    }

    const reason = String(failure.stderr || failure.message || error);
    return {
      timestamp: new Date().toISOString(),
      hostname: hostname(),
      overall: 'CRIT',
      checks: {},
      error: reason,
    };
  }
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Cleanup log — raw tail
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Returns the last `lines` lines of the cleanup log as a trimmed string.
 *
 * @param lines Number of trailing lines to return (default 50). Anything that
 *   is not a non-negative safe integer falls back to the default instead of
 *   being interpolated into the shell command.
 * @returns The log tail; `(log not found)` when `tail` itself fails (for
 *   example because the file is missing); `(log not available)` when the
 *   command cannot be run or times out.
 */
export async function getCleanupLog(lines = 50): Promise<string> {
  // The `number` annotation is erased at runtime, so a value forwarded from a
  // request (NOTE(review): callers are not visible here) could be an arbitrary
  // string. Only a validated integer is ever placed in the command line.
  const requested = typeof lines === 'number' ? lines : Number.parseInt(String(lines), 10);
  const lineCount = Number.isSafeInteger(requested) && requested >= 0 ? requested : 50;

  try {
    const { stdout } = await execAsync(
      // `-n <count>` is the portable spelling; the bare `-<count>` form is obsolete.
      `tail -n ${lineCount} "${CLEANUP_LOG}" 2>/dev/null || echo "(log not found)"`,
      { timeout: 5_000 },
    );
    return stdout.trim();
  } catch {
    return '(log not available)';
  }
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Cron status — parsed run history + next scheduled times
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/** Summary of a single cleanup run as reconstructed from the cleanup log. */
export interface CronRunSummary {
  /** Timestamp string captured from the run's start line. */
  timestamp: string;
  /** `'full'` when the log recorded `mode=full`, otherwise `'standard'`. */
  mode: 'standard' | 'full';
  /** Raw `before=` token of the run's `[DISK]` line, or `''` when absent. */
  diskBefore: string;
  /** Raw `after=` token of the run's `[DISK]` line, or `''` when absent. */
  diskAfter: string;
  /** Space reclaimed by the run in MB, derived from the two disk tokens. */
  freedMB: number;
  /** Seconds between the start timestamp and the timestamp next to `[END]`. */
  durationSecs: number;
  /** Whether an `[END]` marker was found for the run. */
  success: boolean;
  /** Text of every `[CMD]` line logged by the run, in log order. */
  steps: string[];
  /** Parsed payload of the run's `[JSON]` line, when one was logged. */
  jsonSummary?: Record<string, unknown>;
}
|
|
|
|
/** A scheduled maintenance job together with its last and next execution. */
export interface CronJob {
  /** Short identifier of the job, e.g. `weekly-cleanup`. */
  name: string;
  /** Five-field cron expression describing when the job runs. */
  schedule: string;
  /** Human-readable explanation of what the job does. */
  description: string;
  /** Most recent run attributed to this job, or `null` when none is known. */
  lastRun: CronRunSummary | null;
  /** ISO-8601 timestamp of the next expected run, or `null` when unknown. */
  nextRun: string | null;
}
|
|
|
|
/**
 * Collects the maintenance job list and the recent cleanup history.
 *
 * The cleanup log and the output of `crontab -l` are read concurrently; either
 * source degrades to an empty string when it cannot be read, so this function
 * does not reject because of a missing log or crontab.
 *
 * @returns `jobs` built from the full run history, and `recentRuns` limited to
 *   the first 20 parsed runs (the parser returns them newest first).
 */
export async function getCronStatus(): Promise<{ jobs: CronJob[]; recentRuns: CronRunSummary[] }> {
  const logText = readFile(CLEANUP_LOG, 'utf8').catch(() => '');
  const crontabText = execAsync('crontab -l 2>/dev/null')
    .then(result => result.stdout)
    .catch(() => '');

  const [rawLog, crontab] = await Promise.all([logText, crontabText]);

  const allRuns = parseCleanupLog(rawLog);
  return {
    jobs: buildJobList(crontab, allRuns),
    recentRuns: allRuns.slice(0, 20),
  };
}
|
|
|
|
/**
 * Converts a size token such as `70G`, `900M` or `1.5T` to megabytes
 * (1 G = 1024 M). Returns `null` when no number + unit pair is present.
 */
function cleanupSizeToMB(token: string): number | null {
  const mbPerUnit: Record<string, number> = { K: 1 / 1024, M: 1, G: 1024, T: 1024 * 1024 };
  const match = token.match(/([\d.]+)([KMGT])/i);
  if (!match) return null;
  const value = parseFloat(match[1] ?? '');
  const factor = mbPerUnit[(match[2] ?? '').toUpperCase()];
  return Number.isFinite(value) && factor !== undefined ? value * factor : null;
}

/** Parses a `[JSON]` payload, yielding `undefined` instead of throwing on bad input. */
function parseCleanupJsonSummary(text: string): Record<string, unknown> | undefined {
  try {
    const parsed: unknown = JSON.parse(text);
    return typeof parsed === 'object' && parsed !== null
      ? (parsed as Record<string, unknown>)
      : undefined;
  } catch {
    return undefined;
  }
}

/**
 * Parses the raw cleanup log into one summary per run, most recent first.
 *
 * Runs are delimited by `[START]` markers; text before the first marker is
 * ignored, and a block without a `[<timestamp>] mode=<word>` start line is
 * skipped. Within a block the parser reads:
 *  - `[DISK] before=<token> after=<token>` for disk usage,
 *  - every `[CMD] <text>` line as a step,
 *  - `[JSON] {...}` as an optional structured summary,
 *  - `[END]`, whose presence marks success and whose preceding timestamp ends
 *    the duration measurement.
 *
 * A malformed `[JSON]` payload only drops `jsonSummary`; the run is still reported.
 *
 * @param raw Full text of the cleanup log (may be empty).
 * @returns Parsed runs in reverse log order.
 */
function parseCleanupLog(raw: string): CronRunSummary[] {
  const runs: CronRunSummary[] = [];

  for (const block of raw.split(/\[START\]/).slice(1)) {
    const startLine = block.match(/\[(\d{4}-\d{2}-\d{2}T[\d:Z]+)\] mode=(\w+)/);
    if (!startLine) continue;
    const timestamp = startLine[1] ?? '';
    const mode: CronRunSummary['mode'] = startLine[2] === 'full' ? 'full' : 'standard';

    // The captured tokens contain no whitespace (e.g. "70G"), so no trimming is needed.
    const diskLine = block.match(/\[DISK\] before=([^\s]+) after=([^\s]+)/);
    const diskBefore = diskLine?.[1] ?? '';
    const diskAfter = diskLine?.[2] ?? '';

    // Only report a difference when both sides carry a recognisable size;
    // otherwise one unparsable side would masquerade as a huge gain or loss.
    const beforeMB = cleanupSizeToMB(diskBefore);
    const afterMB = cleanupSizeToMB(diskAfter);
    const freedMB = beforeMB !== null && afterMB !== null ? Math.round(beforeMB - afterMB) : 0;

    // Rough duration: start timestamp to the timestamp written just before [END].
    const startTs = new Date(timestamp).getTime();
    const endIndex = block.indexOf('[END]');
    const finished = endIndex !== -1;
    let endTs = startTs;
    if (finished) {
      // Clamp the look-behind window: a negative slice index would wrap around
      // and read from the end of the block instead.
      const window = block.slice(Math.max(0, endIndex - 28), Math.max(0, endIndex - 2));
      const endStamp = window.match(/\d{4}-\d{2}-\d{2}T[\d:Z]+/)?.[0];
      if (endStamp) endTs = new Date(endStamp).getTime();
    }
    const durationSecs = Math.round((endTs - startTs) / 1000);

    const jsonMatch = block.match(/\[JSON\] ({.+})/);

    runs.push({
      timestamp,
      mode,
      diskBefore,
      diskAfter,
      freedMB,
      durationSecs: Number.isNaN(durationSecs) ? 0 : durationSecs,
      success: finished,
      steps: Array.from(block.matchAll(/\[CMD\] (.+)/g), m => m[1] ?? ''),
      jsonSummary: jsonMatch ? parseCleanupJsonSummary(jsonMatch[1] ?? '') : undefined,
    });
  }

  return runs.reverse(); // most recent first
}
|
|
|
|
/**
 * Builds the list of maintenance jobs with their last and next runs.
 *
 * The job definitions are hard-coded here rather than read from the crontab.
 *
 * @param crontab Output of `crontab -l`. Kept in the signature for the caller,
 *   but currently unused: the managed-section lookup it used to feed was never
 *   read and has been removed.
 * @param runs Parsed cleanup runs. The first match per mode is used, so the
 *   list is expected to be newest first.
 * @returns One entry per job definition, in definition order.
 */
function buildJobList(crontab: string, runs: CronRunSummary[]): CronJob[] {
  void crontab; // intentionally unused, see above

  const defs: Array<{ name: string; schedule: string; description: string; mode?: string }> = [
    { name: 'build-cache-prune', schedule: '0 3 * * *', description: 'Daily build cache prune' },
    { name: 'weekly-cleanup', schedule: '0 2 * * 0', description: 'Weekly standard cleanup' },
    { name: 'monthly-full', schedule: '0 1 1 * *', description: 'Monthly full cleanup', mode: 'full' },
    { name: 'health-check', schedule: '0 7 * * *', description: 'Daily health check + Telegram alert' },
  ];

  // Look up each mode's latest run once instead of once per job definition.
  const latestFull = runs.find(r => r.mode === 'full') ?? null;
  const latestStandard = runs.find(r => r.mode === 'standard') ?? null;

  // NOTE(review): every job without `mode: 'full'`, including `health-check`
  // and `build-cache-prune`, is paired with the latest *standard cleanup* run.
  // Confirm that is intended; those jobs may not write to the cleanup log.
  return defs.map(def => ({
    ...def,
    lastRun: def.mode === 'full' ? latestFull : latestStandard,
    nextRun: computeNextRun(def.schedule),
  }));
}
|
|
|
|
/**
 * Very lightweight cron next-run calculator for five-field expressions whose
 * minute and hour are plain numbers.
 *
 * Supported shapes (the month field is ignored):
 *  - daily:   `M H * * *`
 *  - weekly:  `M H * * D`  (D = 0-7, where both 0 and 7 mean Sunday)
 *  - monthly: `M H N * *`  (when a day-of-month is given, the day-of-week
 *    field is ignored)
 *
 * The result is the first matching local-time instant strictly after `now`.
 * Candidate days are scanned one at a time, so a run later today is found even
 * when today is the target weekday, and a day-of-month that the current month
 * lacks (e.g. the 31st in April) resolves to the next month that has it.
 *
 * @param expr Cron expression, fields separated by whitespace.
 * @param now Reference instant; defaults to the current time.
 * @returns ISO-8601 (UTC) timestamp of the next run.
 * @throws RangeError when the expression has fewer than five fields, uses
 *   syntax that is not a plain number or `*`, or matches no day within a year.
 */
function computeNextRun(expr: string, now: Date = new Date()): string {
  const fields = expr.trim().split(/\s+/);
  if (fields.length < 5) {
    throw new RangeError(`Unsupported cron expression "${expr}"`);
  }
  const [minField = '', hourField = '', dom = '*', , dow = '*'] = fields;

  const minute = Number.parseInt(minField, 10);
  const hour = Number.parseInt(hourField, 10);
  const targetDom = dom === '*' ? null : Number.parseInt(dom, 10);
  // The weekday is only consulted when no day-of-month is given.
  const targetDow = dom === '*' && dow !== '*' ? Number.parseInt(dow, 10) % 7 : null;

  if ([minute, hour, targetDom ?? 0, targetDow ?? 0].some(Number.isNaN)) {
    throw new RangeError(`Unsupported cron expression "${expr}"`);
  }

  // 366 days covers every weekly and day-of-month gap with room to spare.
  for (let offset = 0; offset <= 366; offset++) {
    const candidate = new Date(
      now.getFullYear(), now.getMonth(), now.getDate() + offset, hour, minute, 0, 0,
    );
    if (candidate.getTime() <= now.getTime()) continue;
    if (targetDom !== null && candidate.getDate() !== targetDom) continue;
    if (targetDow !== null && candidate.getDay() !== targetDow) continue;
    return candidate.toISOString();
  }

  throw new RangeError(`No upcoming run found for cron expression "${expr}"`);
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Trigger cleanup (container runs as root — no sudo needed)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Runs `vm-cleanup.sh` in the requested mode and captures its output.
 *
 * @param mode `'monthly'` passes `--full --quiet`, `'dry-run'` passes
 *   `--dry-run`, and anything else (i.e. `'weekly'`) passes `--quiet`.
 * @returns `success: true` with the combined output when the script exits 0.
 *   Otherwise `success: false` with whatever output was captured, or the error
 *   message when nothing was captured. The call never rejects.
 */
export async function runVmCleanup(
  mode: 'weekly' | 'monthly' | 'dry-run',
): Promise<{ success: boolean; output: string }> {
  let args: string;
  switch (mode) {
    case 'monthly':
      args = '--full --quiet';
      break;
    case 'dry-run':
      args = '--dry-run';
      break;
    default:
      args = '--quiet';
  }

  // stderr is folded into stdout by the shell redirect, so ordering is preserved.
  const command = `bash "${CLEANUP_SCRIPT}" ${args} 2>&1`;

  try {
    const result = await execAsync(command, { timeout: 120_000 });
    const combined = result.stdout + result.stderr;
    return { success: true, output: combined.trim() };
  } catch (error: unknown) {
    // Runtime shape of an exec rejection; the cast only informs the type checker.
    const failure = error as { stdout?: string; stderr?: string; message?: string };
    const captured = ((failure.stdout ?? '') + (failure.stderr ?? '')).trim();
    if (captured) {
      return { success: false, output: captured };
    }
    return { success: false, output: String(failure.message ?? error) };
  }
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Unhealthy containers (docker inspect via shell — no Docker SDK needed)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/** Diagnostic snapshot of a container that Docker reports as unhealthy. */
export interface UnhealthyContainer {
  /** Container name as printed by `docker ps`. */
  name: string;
  /** Value of `State.Status` from `docker inspect`, or `'unknown'`. */
  status: string;
  /** Value of `RestartCount` from `docker inspect`, or 0 when unavailable. */
  restartCount: number;
  /** Trimmed output of the most recent health-check probes (at most three). */
  lastHealthLogs: string[];
  /** Start time taken from the health-check log, or `null` when unavailable. */
  unhealthySince: string | null;
}
|
|
|
|
/** Subset of a `State.Health.Log` entry from `docker inspect` that is read here. */
type DockerHealthProbe = { Start?: string; ExitCode?: number; Output?: string };

/**
 * Returns the start time of the first probe in the trailing run of failing
 * probes (non-zero exit code). Falls back to the oldest retained probe when the
 * latest probe did not fail, and to `null` when the log is empty.
 */
function unhealthyStreakStart(probes: DockerHealthProbe[]): string | null {
  let since: string | null = null;
  for (let i = probes.length - 1; i >= 0 && probes[i]?.ExitCode !== 0; i--) {
    since = probes[i]?.Start ?? since;
  }
  return since ?? probes[0]?.Start ?? null;
}

/** Inspects one container; resolves to an `'unknown'` placeholder on any failure. */
async function inspectUnhealthyContainer(name: string): Promise<UnhealthyContainer> {
  const placeholder: UnhealthyContainer = {
    name,
    status: 'unknown',
    restartCount: 0,
    lastHealthLogs: [],
    unhealthySince: null,
  };

  // The name is interpolated into a shell command, so refuse anything outside
  // the characters Docker itself allows in container names.
  if (!/^[a-zA-Z0-9][a-zA-Z0-9_.-]*$/.test(name)) return placeholder;

  try {
    const { stdout: raw } = await execAsync(`docker inspect "${name}" 2>/dev/null`, {
      timeout: 5_000,
    });
    const data = JSON.parse(raw)?.[0];
    const rawLog: unknown = data?.State?.Health?.Log;
    const probes: DockerHealthProbe[] = Array.isArray(rawLog) ? rawLog : [];

    return {
      name,
      status: data?.State?.Status ?? 'unknown',
      restartCount: data?.RestartCount ?? 0,
      lastHealthLogs: probes.slice(-3).map(probe => probe.Output?.trim() ?? ''),
      unhealthySince: unhealthyStreakStart(probes),
    };
  } catch {
    return placeholder;
  }
}

/**
 * Lists containers that `docker ps --filter health=unhealthy` reports, each
 * enriched with status, restart count and recent health-check output from
 * `docker inspect`.
 *
 * `unhealthySince` is the start of the current streak of failing probes rather
 * than simply the oldest retained probe, which may have been a passing one.
 *
 * @returns One entry per unhealthy container; an empty array when there are
 *   none or when `docker ps` cannot be run. The call never rejects.
 */
export async function getUnhealthyContainers(): Promise<UnhealthyContainer[]> {
  try {
    const { stdout } = await execAsync(
      `docker ps --filter health=unhealthy --format '{{.Names}}' 2>/dev/null`,
      { timeout: 10_000 },
    );
    const names = stdout.trim().split('\n').filter(Boolean);
    if (!names.length) return [];

    // Inspect containers in parallel; one failing inspect only affects its own entry.
    return await Promise.all(names.map(name => inspectUnhealthyContainer(name)));
  } catch {
    return [];
  }
}
|
|
|
|
/**
 * Restarts a container via `docker restart`.
 *
 * @param name Container name or ID. It must start with an alphanumeric
 *   character and may continue with alphanumerics, `_`, `.` or `-`.
 * @returns `success: true` once the restart command completes; otherwise
 *   `success: false` with a validation message or the command's error text.
 *   The call never rejects.
 */
export async function restartContainer(name: string): Promise<{ success: boolean; message: string }> {
  // The leading character must be alphanumeric: a value such as "-t" or
  // "--help" would otherwise be parsed by the docker CLI as an option. Dots are
  // permitted because Docker allows them in container names.
  if (!/^[a-zA-Z0-9][a-zA-Z0-9_.-]*$/.test(name)) {
    return { success: false, message: 'Invalid container name' };
  }

  try {
    await execAsync(`docker restart "${name}"`, { timeout: 30_000 });
    return { success: true, message: `${name} restarted` };
  } catch (error: unknown) {
    const failure = error as { stderr?: unknown; message?: unknown } | null;
    return { success: false, message: String(failure?.stderr || failure?.message || error) };
  }
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Ollama models
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/** A model listed by Ollama's `/api/tags` endpoint. */
export interface OllamaModel {
  /** Model name as reported by Ollama, or `''` when missing. */
  name: string;
  /** Reported size divided by 1e9, rounded to two decimals. */
  sizeGB: number;
  /** Value of the model's `modified_at` field, or `''` when missing. */
  modifiedAt: string;
}
|
|
|
|
/** A model listed by Ollama's `/api/ps` endpoint. */
export interface OllamaRunning {
  /** Model name as reported by Ollama, or `''` when missing. */
  name: string;
  /** Reported size divided by 1e9, rounded to two decimals. */
  sizeGB: number;
  /** Comma-separated `details.families` of the model, or `''` when missing. */
  processor: string;
  /** Value of the model's `expires_at` field, or `''` when missing. */
  expiresAt: string;
}
|
|
|
|
// Base URL of the Ollama REST API, overridable through OLLAMA_BASE_URL. The
// default relies on `host-gateway` resolving to the Docker host, where
// `ollama serve` listens on port 11434.
const OLLAMA_BASE = process.env.OLLAMA_BASE_URL ?? 'http://host-gateway:11434';

/**
 * Performs a request against the Ollama REST API and returns the decoded JSON body.
 *
 * @param path Path appended verbatim to the base URL, e.g. `/api/tags`.
 * @param opts Extra `fetch` options. They are spread after the default
 *   10-second abort signal, so a caller-supplied `signal` replaces it.
 * @throws Error when the response status is not OK; fetch and JSON decoding
 *   failures propagate unchanged.
 */
async function ollamaFetch(path: string, opts?: RequestInit): Promise<unknown> {
  const init: RequestInit = {
    signal: AbortSignal.timeout(10_000),
    ...opts,
  };
  const response = await fetch(`${OLLAMA_BASE}${path}`, init);
  if (response.ok) {
    return response.json();
  }
  throw new Error(`Ollama ${path}: ${response.status}`);
}
|
|
|
|
/**
 * Fetches the installed models (`/api/tags`) and the currently loaded models
 * (`/api/ps`) from Ollama in parallel.
 *
 * Each request degrades to an empty list on its own failure, and any error
 * while shaping the responses yields two empty lists, so the call never rejects.
 *
 * NOTE(review): `processor` is filled from the model's `details.families`
 * list. Confirm that is the information the consumer expects under that name.
 */
export async function getOllamaModels(): Promise<{ models: OllamaModel[]; running: OllamaRunning[] }> {
  const emptyListing = () => ({ models: [] });
  // Bytes -> decimal gigabytes, rounded to two decimals.
  const toGB = (bytes: any): number => parseFloat(((bytes ?? 0) / 1e9).toFixed(2));

  try {
    const [tagsData, psData] = await Promise.all([
      ollamaFetch('/api/tags').catch(emptyListing),
      ollamaFetch('/api/ps').catch(emptyListing),
    ]);

    const installed: any[] = (tagsData as any).models ?? [];
    const loaded: any[] = (psData as any).models ?? [];

    const models = installed.map((entry: any) => ({
      name: entry.name ?? '',
      sizeGB: toGB(entry.size),
      modifiedAt: entry.modified_at ?? '',
    }));

    const running = loaded.map((entry: any) => ({
      name: entry.name ?? '',
      sizeGB: toGB(entry.size),
      processor: entry.details?.families?.join(', ') ?? '',
      expiresAt: entry.expires_at ?? '',
    }));

    return { models, running };
  } catch {
    return { models: [], running: [] };
  }
}
|
|
|
|
/**
 * Asks Ollama to unload a model by posting to `/api/generate` with
 * `keep_alive` set to 0.
 *
 * @param name Model name; only word characters, `.`, `:`, `-` and `/` are accepted.
 * @returns `success: true` when the request succeeds; otherwise `success: false`
 *   with a validation message or the request's error message. The call never rejects.
 */
export async function unloadOllamaModel(name: string): Promise<{ success: boolean; message: string }> {
  const allowedName = /^[\w.:\-/]+$/;
  if (!allowedName.test(name)) {
    return { success: false, message: 'Invalid model name' };
  }

  // A keep_alive of 0 is what triggers the unload.
  const payload = JSON.stringify({ model: name, keep_alive: 0 });

  try {
    await ollamaFetch('/api/generate', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: payload,
    });
  } catch (error: unknown) {
    const failure = error as { message?: unknown };
    return { success: false, message: String(failure.message ?? error) };
  }

  return { success: true, message: `${name} unloaded` };
}
|