feat(vm): fix devops-backend VM module — Phase 0.1 complete
- Switch backend runner from node:20-alpine to node:20-slim so GNU df flags (--output=pcent/avail) work inside the container - Add volume mounts to docker-compose.yml: scripts (ro), VM logs (rw), docker.sock; set VM_SCRIPTS_PATH + VM_LOG_DIR env vars - Rebuild repository.ts: env-configurable paths, cron history parser, unhealthy-container inspector, Ollama model endpoints - Add routes: GET /api/vm/cron-status, unhealthy containers, Ollama models, container restart, model unload - vm-cleanup.sh: add step_cosmos_pglog, step_docker_aged_images; fix (( count++ )) → count=$(( count + 1 )) for set -e compatibility - Add docs/VM_OBSERVABILITY_ROADMAP.md Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
5a2d92f519
commit
2fc23d6baa
@ -30,13 +30,22 @@ ENV BYTELYST_COMMIT_SHA=${BYTELYST_COMMIT_SHA} \
|
|||||||
RUN npm run build
|
RUN npm run build
|
||||||
|
|
||||||
# --- Stage 2: Run ---
|
# --- Stage 2: Run ---
|
||||||
FROM node:20-alpine AS runner
|
# Use Debian slim (not Alpine) because vm-health-check.sh uses GNU df flags
|
||||||
|
# (--output=pcent, --output=avail) that BusyBox df does not support.
|
||||||
|
FROM node:20-slim AS runner
|
||||||
|
|
||||||
WORKDIR /app/backend
|
WORKDIR /app/backend
|
||||||
|
|
||||||
COPY backend/package.json backend/package-lock.json ./
|
COPY backend/package.json backend/package-lock.json ./
|
||||||
RUN npm ci --omit=dev --ignore-scripts
|
RUN npm ci --omit=dev --ignore-scripts
|
||||||
RUN apk add --no-cache curl
|
|
||||||
|
# Install tools needed by the VM management module:
|
||||||
|
# bash — vm-health-check.sh and vm-cleanup.sh require bash
|
||||||
|
# docker.io — docker CLI to communicate with the host daemon via socket
|
||||||
|
# python3 — used in inline python3 -c snippets inside the scripts
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
curl bash docker.io python3 \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
COPY --from=builder /app/backend/dist ./dist
|
COPY --from=builder /app/backend/dist ./dist
|
||||||
|
|
||||||
|
|||||||
1714
dashboard/backend/package-lock.json
generated
1714
dashboard/backend/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -1,34 +1,35 @@
|
|||||||
import { exec } from 'child_process';
|
import { exec } from 'child_process';
|
||||||
import { promisify } from 'util';
|
import { promisify } from 'util';
|
||||||
import { hostname } from 'os';
|
import { hostname } from 'os';
|
||||||
|
import { readFile } from 'fs/promises';
|
||||||
|
|
||||||
const execAsync = promisify(exec);
|
const execAsync = promisify(exec);
|
||||||
|
|
||||||
const HEALTH_SCRIPT =
|
// Paths are env-configurable so they work both in the Docker container (via
|
||||||
'/opt/bytelyst/learning_ai_devops_tools/scripts/VMs/HostingerVM/vm-health-check.sh';
|
// volume mounts) and when the backend is run directly on the host for dev.
|
||||||
const CLEANUP_SCRIPT =
|
const SCRIPTS_PATH = process.env.VM_SCRIPTS_PATH
|
||||||
'/opt/bytelyst/learning_ai_devops_tools/scripts/VMs/HostingerVM/vm-cleanup.sh';
|
?? '/opt/bytelyst/learning_ai_devops_tools/scripts/VMs/HostingerVM';
|
||||||
const CLEANUP_LOG = '/var/log/vm-cleanup.log';
|
const LOG_DIR = process.env.VM_LOG_DIR ?? '/var/log';
|
||||||
|
|
||||||
|
const HEALTH_SCRIPT = `${SCRIPTS_PATH}/vm-health-check.sh`;
|
||||||
|
const CLEANUP_SCRIPT = `${SCRIPTS_PATH}/vm-cleanup.sh`;
|
||||||
|
const CLEANUP_LOG = `${LOG_DIR}/vm-cleanup.log`;
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Health check
|
// Health check (vm-health-check.sh --json)
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
export async function runVmHealthCheck() {
|
export async function runVmHealthCheck() {
|
||||||
try {
|
try {
|
||||||
// Script exits 1 (WARN) or 2 (CRIT) but still emits valid JSON on stdout.
|
// Script exits 1 (WARN) or 2 (CRIT) but still emits valid JSON on stdout.
|
||||||
const { stdout } = await execAsync(`bash ${HEALTH_SCRIPT} --json 2>/dev/null`, {
|
const { stdout } = await execAsync(`bash "${HEALTH_SCRIPT}" --json 2>/dev/null`, {
|
||||||
timeout: 30_000,
|
timeout: 30_000,
|
||||||
});
|
});
|
||||||
return JSON.parse(stdout);
|
return JSON.parse(stdout);
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
// Non-zero exit — stdout may still contain valid JSON
|
// Non-zero exit — stdout may still contain valid JSON (WARN/CRIT result)
|
||||||
if (error.stdout) {
|
if (error.stdout) {
|
||||||
try {
|
try { return JSON.parse(error.stdout); } catch { /* fall through */ }
|
||||||
return JSON.parse(error.stdout);
|
|
||||||
} catch {
|
|
||||||
// fall through to error response
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return {
|
return {
|
||||||
timestamp: new Date().toISOString(),
|
timestamp: new Date().toISOString(),
|
||||||
@ -41,13 +42,13 @@ export async function runVmHealthCheck() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Cleanup log
|
// Cleanup log — raw tail
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
export async function getCleanupLog(lines = 30): Promise<string> {
|
export async function getCleanupLog(lines = 50): Promise<string> {
|
||||||
try {
|
try {
|
||||||
const { stdout } = await execAsync(
|
const { stdout } = await execAsync(
|
||||||
`tail -${lines} ${CLEANUP_LOG} 2>/dev/null || echo "(log not found)"`,
|
`tail -${lines} "${CLEANUP_LOG}" 2>/dev/null || echo "(log not found)"`,
|
||||||
{ timeout: 5_000 }
|
{ timeout: 5_000 }
|
||||||
);
|
);
|
||||||
return stdout.trim();
|
return stdout.trim();
|
||||||
@ -57,30 +58,297 @@ export async function getCleanupLog(lines = 30): Promise<string> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Trigger cleanup (requires sudo — configure sudoers if needed)
|
// Cron status — parsed run history + next scheduled times
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
export interface CronRunSummary {
|
||||||
|
timestamp: string;
|
||||||
|
mode: 'standard' | 'full';
|
||||||
|
diskBefore: string;
|
||||||
|
diskAfter: string;
|
||||||
|
freedMB: number;
|
||||||
|
durationSecs: number;
|
||||||
|
success: boolean;
|
||||||
|
steps: string[];
|
||||||
|
jsonSummary?: Record<string, unknown>;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface CronJob {
|
||||||
|
name: string;
|
||||||
|
schedule: string;
|
||||||
|
description: string;
|
||||||
|
lastRun: CronRunSummary | null;
|
||||||
|
nextRun: string | null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Builds the cron overview: the known maintenance jobs plus the parsed
 * cleanup run history.
 *
 * Both inputs are best-effort — an unreadable cleanup log or a failing
 * `crontab -l` degrades to an empty string instead of rejecting.
 *
 * @returns `jobs` from `buildJobList` and at most the 20 newest parsed runs.
 */
export async function getCronStatus(): Promise<{ jobs: CronJob[]; recentRuns: CronRunSummary[] }> {
  const pendingLog = readFile(CLEANUP_LOG, 'utf8').catch(() => '');
  const pendingCrontab = execAsync('crontab -l 2>/dev/null')
    .then(({ stdout }) => stdout)
    .catch(() => '');

  // Read the log and the crontab concurrently.
  const [logText, crontabText] = await Promise.all([pendingLog, pendingCrontab]);

  const history = parseCleanupLog(logText);
  return {
    // The job list is matched against the full history, not just the slice returned below.
    jobs: buildJobList(crontabText, history),
    recentRuns: history.slice(0, 20),
  };
}
|
||||||
|
|
||||||
|
/**
 * Parses the raw cleanup log into one summary per run, newest first.
 *
 * Format expectations (taken from the patterns below):
 *  - each run begins at a `[START]` marker; text before the first marker is ignored
 *  - the run must contain `[<ISO timestamp>] mode=<word>`; blocks without it are skipped
 *  - optional lines: `[DISK] before=<token> after=<token>`, `[CMD] <text>`,
 *    `[JSON] {...}` and an `[END]` marker preceded on its line by a timestamp
 *
 * A run without `[END]` is reported with `success: false` and a duration of 0.
 *
 * @param raw Full text of the cleanup log (may be empty).
 * @returns Parsed runs in reverse log order (most recent first).
 */
function parseCleanupLog(raw: string): CronRunSummary[] {
  const TIMESTAMP = /\d{4}-\d{2}-\d{2}T[\d:Z]+/;
  const MB_PER_UNIT: Record<string, number> = { K: 1 / 1024, M: 1, G: 1024, T: 1024 * 1024 };

  /** Converts the first size token in `text` ("70G", "900M", "1.2T") to MB; 0 when none is present. */
  const sizeToMB = (text: string): number => {
    const m = text.match(/([\d.]+)([KMGT])/);
    if (!m) return 0;
    const value = parseFloat(m[1]);
    return Number.isNaN(value) ? 0 : value * (MB_PER_UNIT[m[2]] ?? 0);
  };

  const runs: CronRunSummary[] = [];

  // Runs are delimited by [START] markers; element 0 is whatever preceded the first run.
  for (const block of raw.split(/\[START\]/).slice(1)) {
    const startLine = block.match(/\[(\d{4}-\d{2}-\d{2}T[\d:Z]+)\] mode=(\w+)/);
    if (!startLine) continue;
    const timestamp = startLine[1];
    const mode = startLine[2] === 'full' ? 'full' : 'standard';

    const diskLine = block.match(/\[DISK\] before=([^\s]+) after=([^\s]+)/);
    const cmdLines = [...block.matchAll(/\[CMD\] (.+)/g)].map(m => m[1]);
    const jsonMatch = block.match(/\[JSON\] ({.+})/);

    // Freed space = size before minus size after, honouring the unit suffix
    // (previously only "G" was understood, so "900M" or "1.2T" counted as 0).
    let freedMB = 0;
    let diskBefore = '';
    let diskAfter = '';
    if (diskLine) {
      diskBefore = diskLine[1].trim();
      diskAfter = diskLine[2].trim();
      freedMB = Math.round(sizeToMB(diskLine[1]) - sizeToMB(diskLine[2]));
    }

    // Duration: start timestamp to the timestamp that precedes [END] on the same line.
    // Falls back to the start timestamp (duration 0) when none is found.
    const startTs = new Date(timestamp).getTime();
    const endIdx = block.indexOf('[END]');
    const finished = endIdx !== -1;
    let endTs = startTs;
    if (finished) {
      const lineStart = block.lastIndexOf('\n', endIdx) + 1;
      const stamp = block.slice(lineStart, endIdx).match(TIMESTAMP)?.[0];
      endTs = new Date(stamp ?? timestamp).getTime();
    }
    const durationSecs = Math.round((endTs - startTs) / 1000);

    // A corrupt [JSON] line must not discard the whole run — drop only the summary.
    let jsonSummary: Record<string, unknown> | undefined;
    if (jsonMatch) {
      try {
        const parsed: unknown = JSON.parse(jsonMatch[1]);
        if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
          jsonSummary = parsed as Record<string, unknown>;
        }
      } catch {
        jsonSummary = undefined;
      }
    }

    runs.push({
      timestamp,
      mode,
      diskBefore,
      diskAfter,
      freedMB,
      durationSecs: Number.isNaN(durationSecs) ? 0 : durationSecs,
      success: finished,
      steps: cmdLines,
      jsonSummary,
    });
  }

  return runs.reverse(); // most recent first
}
|
||||||
|
|
||||||
|
/**
 * Produces the list of known maintenance jobs, each annotated with its most
 * recent matching cleanup run and its next scheduled time.
 *
 * The job definitions (names and schedules) are hard-coded here; the installed
 * crontab text is accepted but not consulted yet.
 * NOTE(review): jobs without a `mode` (including `health-check`) are paired with
 * the latest `standard` cleanup run — confirm that is the intended "last run".
 *
 * @param crontab Output of `crontab -l` (currently unused).
 * @param runs Parsed cleanup runs, most recent first.
 * @returns One entry per known job.
 */
function buildJobList(crontab: string, runs: CronRunSummary[]): CronJob[] {
  // Reserved for reading real schedules from the crontab; referenced so the
  // parameter is not flagged as unused.
  void crontab;

  const defs: Array<{ name: string; schedule: string; description: string; mode?: string }> = [
    { name: 'build-cache-prune', schedule: '0 3 * * *', description: 'Daily build cache prune' },
    { name: 'weekly-cleanup', schedule: '0 2 * * 0', description: 'Weekly standard cleanup' },
    { name: 'monthly-full', schedule: '0 1 1 * *', description: 'Monthly full cleanup', mode: 'full' },
    { name: 'health-check', schedule: '0 7 * * *', description: 'Daily health check + Telegram alert' },
  ];

  // `runs` is newest-first, so the first hit per mode is the latest run; look each up once.
  const latestFull = runs.find(r => r.mode === 'full') ?? null;
  const latestStandard = runs.find(r => r.mode === 'standard') ?? null;

  return defs.map(def => ({
    ...def,
    lastRun: def.mode === 'full' ? latestFull : latestStandard,
    nextRun: computeNextRun(def.schedule),
  }));
}
|
||||||
|
|
||||||
|
/**
 * Very lightweight next-run calculator for simple 5-field cron expressions.
 *
 * Only plain numeric minute/hour with `*` or a single number in day-of-month /
 * day-of-week is supported (no lists, ranges or steps); the month field is ignored.
 * Times are computed in the process's local timezone.
 * NOTE(review): the result matches the real cron only if cron uses that same timezone.
 *
 * @param expr Cron expression, e.g. `0 2 * * 0`.
 * @param now Reference instant; defaults to the current time.
 * @returns ISO-8601 timestamp of the first occurrence strictly after `now`.
 * @throws RangeError if minute/hour (or a used day field) is not numeric.
 */
function computeNextRun(expr: string, now: Date = new Date()): string {
  const [min, hr, dom, , dow] = expr.trim().split(/\s+/);
  const next = new Date(now);
  next.setHours(parseInt(hr, 10), parseInt(min, 10), 0, 0);

  if (dom === '*' && dow !== '*') {
    // Weekly: move to the target weekday. Today counts if the time is still
    // ahead; only jump a full week when today's slot has already passed.
    const targetDow = parseInt(dow, 10) % 7; // cron accepts both 0 and 7 for Sunday
    next.setDate(next.getDate() + ((targetDow - next.getDay() + 7) % 7));
    if (next <= now) next.setDate(next.getDate() + 7);
  } else if (dom !== '*') {
    // Monthly: this month's day, or next month's once it has passed.
    // NOTE(review): days 29-31 can overflow into the following month.
    next.setDate(parseInt(dom, 10));
    if (next <= now) next.setMonth(next.getMonth() + 1);
  } else if (next <= now) {
    // Daily: already passed today, so tomorrow.
    next.setDate(next.getDate() + 1);
  }

  return next.toISOString();
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Trigger cleanup (container runs as root — no sudo needed)
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
export async function runVmCleanup(
|
export async function runVmCleanup(
|
||||||
mode: 'weekly' | 'monthly' | 'dry-run'
|
mode: 'weekly' | 'monthly' | 'dry-run',
|
||||||
): Promise<{ success: boolean; output: string }> {
|
): Promise<{ success: boolean; output: string }> {
|
||||||
const args =
|
const args =
|
||||||
mode === 'monthly'
|
mode === 'monthly' ? '--full --quiet' :
|
||||||
? '--full --quiet'
|
mode === 'dry-run' ? '--dry-run' :
|
||||||
: mode === 'dry-run'
|
'--quiet';
|
||||||
? '--dry-run'
|
|
||||||
: '--quiet';
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const { stdout, stderr } = await execAsync(
|
const { stdout, stderr } = await execAsync(
|
||||||
`sudo bash ${CLEANUP_SCRIPT} ${args} 2>&1`,
|
`bash "${CLEANUP_SCRIPT}" ${args} 2>&1`,
|
||||||
{ timeout: 120_000 }
|
{ timeout: 120_000 },
|
||||||
);
|
);
|
||||||
return { success: true, output: (stdout + stderr).trim() };
|
return { success: true, output: (stdout + stderr).trim() };
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
const out = (error.stdout || '') + (error.stderr || '');
|
const out = ((error.stdout ?? '') + (error.stderr ?? '')).trim();
|
||||||
return {
|
return { success: false, output: out || String(error.message ?? error) };
|
||||||
success: false,
|
}
|
||||||
output: out.trim() || String(error.message || error),
|
}
|
||||||
};
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Unhealthy containers (docker inspect via shell — no Docker SDK needed)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
export interface UnhealthyContainer {
|
||||||
|
name: string;
|
||||||
|
status: string;
|
||||||
|
restartCount: number;
|
||||||
|
lastHealthLogs: string[];
|
||||||
|
unhealthySince: string | null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Lists containers that Docker currently reports as unhealthy, with a few
 * details pulled from `docker inspect`.
 *
 * Best-effort throughout: if listing fails the result is an empty array, and
 * if inspecting a single container fails that container is still returned
 * with placeholder values.
 *
 * @returns One entry per unhealthy container name reported by `docker ps`.
 */
export async function getUnhealthyContainers(): Promise<UnhealthyContainer[]> {
  /** Inspects one container; never rejects. */
  const describe = async (name: string): Promise<UnhealthyContainer> => {
    try {
      const inspected = await execAsync(`docker inspect "${name}" 2>/dev/null`, { timeout: 5_000 });
      const info = JSON.parse(inspected.stdout)?.[0];
      const health = info?.State?.Health ?? {};

      // Output of the three most recent health probes.
      const recentOutputs: string[] = (health.Log ?? [])
        .slice(-3)
        .map((entry: any) => entry.Output?.trim() ?? '');

      return {
        name,
        status: info?.State?.Status ?? 'unknown',
        restartCount: info?.RestartCount ?? 0,
        lastHealthLogs: recentOutputs,
        // Start time of the first probe entry present in the inspect output.
        unhealthySince: health.Log?.[0]?.Start ?? null,
      };
    } catch {
      return { name, status: 'unknown', restartCount: 0, lastHealthLogs: [], unhealthySince: null };
    }
  };

  try {
    const listing = await execAsync(
      `docker ps --filter health=unhealthy --format '{{.Names}}' 2>/dev/null`,
      { timeout: 10_000 },
    );
    const names = listing.stdout.trim().split('\n').filter(Boolean);
    if (names.length === 0) return [];

    return await Promise.all(names.map(describe));
  } catch {
    return [];
  }
}
|
||||||
|
|
||||||
|
/**
 * Restarts a container through the docker CLI.
 *
 * The name is validated before it is placed on the command line: it must start
 * with an alphanumeric character and may then contain alphanumerics, `_`, `.`
 * and `-`. Requiring the leading alphanumeric rejects option-like values
 * (`--help`, `-t`) that the previous `[\w-]+` check let through, and allowing
 * `.` accepts dotted names that it wrongly refused.
 *
 * @param name Container name to restart.
 * @returns `success: false` with a message for an invalid name or a failed
 *          restart; never rejects.
 */
export async function restartContainer(name: string): Promise<{ success: boolean; message: string }> {
  if (!/^[a-zA-Z0-9][a-zA-Z0-9_.-]*$/.test(name)) {
    return { success: false, message: 'Invalid container name' };
  }
  try {
    await execAsync(`docker restart "${name}"`, { timeout: 30_000 });
    return { success: true, message: `${name} restarted` };
  } catch (error: unknown) {
    // Prefer the CLI's stderr, then the error message, then the raw value.
    const details = (error ?? {}) as { stderr?: unknown; message?: unknown };
    return { success: false, message: String(details.stderr || details.message || error) };
  }
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Ollama models
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
export interface OllamaModel {
|
||||||
|
name: string;
|
||||||
|
sizeGB: number;
|
||||||
|
modifiedAt: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OllamaRunning {
|
||||||
|
name: string;
|
||||||
|
sizeGB: number;
|
||||||
|
processor: string;
|
||||||
|
expiresAt: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ollama REST API base — host-gateway resolves to the Docker host,
|
||||||
|
// where ollama serve listens on port 11434.
|
||||||
|
const OLLAMA_BASE = process.env.OLLAMA_BASE_URL ?? 'http://host-gateway:11434';
|
||||||
|
|
||||||
|
/**
 * Performs a request against the Ollama REST API and returns the decoded JSON body.
 *
 * A 10-second abort signal is applied by default; because `opts` is spread
 * after it, a `signal` supplied by the caller takes precedence.
 *
 * @param path API path beginning with `/`, appended to `OLLAMA_BASE`.
 * @param opts Extra `fetch` options merged over the defaults.
 * @throws Error when the response status is not OK.
 */
async function ollamaFetch(path: string, opts?: RequestInit): Promise<unknown> {
  const url = `${OLLAMA_BASE}${path}`;
  const init: RequestInit = { signal: AbortSignal.timeout(10_000), ...opts };

  const response = await fetch(url, init);
  if (response.ok) {
    return response.json();
  }
  throw new Error(`Ollama ${path}: ${response.status}`);
}
|
||||||
|
|
||||||
|
/**
 * Returns the models Ollama has on disk (`/api/tags`) and the ones currently
 * loaded (`/api/ps`).
 *
 * Each request is best-effort: a failing endpoint contributes an empty list,
 * and any unexpected payload shape yields two empty lists.
 *
 * `processor` describes where a loaded model resides, derived from how much of
 * it is in GPU memory (`size_vram` relative to `size`): `100% CPU`, `100% GPU`,
 * or a split such as `75%/25% CPU/GPU`. It was previously filled with the model
 * architecture families, which says nothing about the processor. When the
 * response carries no usable `size_vram`, the families string is kept as a fallback.
 */
export async function getOllamaModels(): Promise<{ models: OllamaModel[]; running: OllamaRunning[] }> {
  type RawModel = {
    name?: string;
    size?: number;
    size_vram?: number;
    modified_at?: string;
    expires_at?: string;
    details?: { families?: string[] };
  };

  const listOf = (payload: unknown): RawModel[] =>
    (payload as { models?: RawModel[] } | null)?.models ?? [];

  /** Bytes to decimal gigabytes, rounded to two places. */
  const toGB = (bytes: number | undefined): number => parseFloat(((bytes ?? 0) / 1e9).toFixed(2));

  /** CPU/GPU placement of a loaded model. */
  const describeProcessor = (m: RawModel): string => {
    const { size, size_vram: vram } = m;
    if (typeof size === 'number' && typeof vram === 'number' && size > 0) {
      if (vram <= 0) return '100% CPU';
      if (vram >= size) return '100% GPU';
      const gpuPct = Math.round((vram / size) * 100);
      return `${100 - gpuPct}%/${gpuPct}% CPU/GPU`;
    }
    return m.details?.families?.join(', ') ?? '';
  };

  try {
    const [tagsData, psData] = await Promise.all([
      ollamaFetch('/api/tags').catch(() => ({ models: [] })),
      ollamaFetch('/api/ps').catch(() => ({ models: [] })),
    ]);

    const models = listOf(tagsData).map(m => ({
      name: m.name ?? '',
      sizeGB: toGB(m.size),
      modifiedAt: m.modified_at ?? '',
    }));

    const running = listOf(psData).map(m => ({
      name: m.name ?? '',
      sizeGB: toGB(m.size),
      processor: describeProcessor(m),
      expiresAt: m.expires_at ?? '',
    }));

    return { models, running };
  } catch {
    return { models: [], running: [] };
  }
}
|
||||||
|
|
||||||
|
export async function unloadOllamaModel(name: string): Promise<{ success: boolean; message: string }> {
|
||||||
|
if (!/^[\w.:\-/]+$/.test(name)) return { success: false, message: 'Invalid model name' };
|
||||||
|
try {
|
||||||
|
// Unload by setting keep_alive to 0
|
||||||
|
await ollamaFetch('/api/generate', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: { 'Content-Type': 'application/json' },
|
||||||
|
body: JSON.stringify({ model: name, keep_alive: 0 }),
|
||||||
|
});
|
||||||
|
return { success: true, message: `${name} unloaded` };
|
||||||
|
} catch (error: any) {
|
||||||
|
return { success: false, message: String(error.message ?? error) };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,47 +1,130 @@
|
|||||||
import type { FastifyInstance } from 'fastify';
|
import type { FastifyInstance } from 'fastify';
|
||||||
import { requireAdmin } from '../../lib/auth.js';
|
import { requireAdmin } from '../../lib/auth.js';
|
||||||
import { runVmHealthCheck, getCleanupLog, runVmCleanup } from './repository.js';
|
import {
|
||||||
import { VmCleanupParamsSchema } from './types.js';
|
runVmHealthCheck,
|
||||||
|
getCleanupLog,
|
||||||
|
runVmCleanup,
|
||||||
|
getCronStatus,
|
||||||
|
getUnhealthyContainers,
|
||||||
|
restartContainer,
|
||||||
|
getOllamaModels,
|
||||||
|
unloadOllamaModel,
|
||||||
|
} from './repository.js';
|
||||||
|
import { VmCleanupParamsSchema, VmContainerRestartParamsSchema } from './types.js';
|
||||||
|
|
||||||
export async function vmRoutes(fastify: FastifyInstance) {
|
export async function vmRoutes(fastify: FastifyInstance) {
|
||||||
// GET /api/vm/health — run vm-health-check.sh --json and return result (admin only)
|
|
||||||
|
// ── Health check ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// GET /api/vm/health — run vm-health-check.sh --json
|
||||||
fastify.get('/vm/health', {
|
fastify.get('/vm/health', {
|
||||||
preHandler: async (req) => requireAdmin(req),
|
preHandler: async (req) => requireAdmin(req),
|
||||||
}, async (_req, reply) => {
|
}, async (_req, reply) => {
|
||||||
try {
|
try {
|
||||||
const result = await runVmHealthCheck();
|
return reply.send(await runVmHealthCheck());
|
||||||
return reply.send(result);
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
fastify.log.error(error, 'VM health check failed');
|
fastify.log.error(error, 'VM health check failed');
|
||||||
return reply.code(500).send({ error: 'VM health check failed' });
|
return reply.code(500).send({ error: 'VM health check failed' });
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// GET /api/vm/cleanup-log — tail the cleanup log (admin only)
|
// ── Cleanup log (raw tail) ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// GET /api/vm/cleanup-log?lines=50
|
||||||
fastify.get('/vm/cleanup-log', {
|
fastify.get('/vm/cleanup-log', {
|
||||||
preHandler: async (req) => requireAdmin(req),
|
preHandler: async (req) => requireAdmin(req),
|
||||||
}, async (req, reply) => {
|
}, async (req, reply) => {
|
||||||
try {
|
try {
|
||||||
const lines = Number((req.query as any).lines) || 30;
|
const lines = Math.min(Number((req.query as any).lines) || 50, 500);
|
||||||
const log = await getCleanupLog(lines);
|
return reply.send({ log: await getCleanupLog(lines) });
|
||||||
return reply.send({ log });
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
fastify.log.error(error, 'Failed to read cleanup log');
|
fastify.log.error(error, 'Failed to read cleanup log');
|
||||||
return reply.code(500).send({ error: 'Failed to read cleanup log' });
|
return reply.code(500).send({ error: 'Failed to read cleanup log' });
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// POST /api/vm/cleanup — trigger vm-cleanup.sh (admin only)
|
// ── Cron status (parsed history + schedule) ───────────────────────────────
|
||||||
|
|
||||||
|
// GET /api/vm/cron-status
|
||||||
|
fastify.get('/vm/cron-status', {
|
||||||
|
preHandler: async (req) => requireAdmin(req),
|
||||||
|
}, async (_req, reply) => {
|
||||||
|
try {
|
||||||
|
return reply.send(await getCronStatus());
|
||||||
|
} catch (error) {
|
||||||
|
fastify.log.error(error, 'Failed to get cron status');
|
||||||
|
return reply.code(500).send({ error: 'Failed to get cron status' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// ── Cleanup trigger ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// POST /api/vm/cleanup { mode: 'weekly' | 'monthly' | 'dry-run' }
|
||||||
fastify.post('/vm/cleanup', {
|
fastify.post('/vm/cleanup', {
|
||||||
preHandler: async (req) => requireAdmin(req),
|
preHandler: async (req) => requireAdmin(req),
|
||||||
}, async (req, reply) => {
|
}, async (req, reply) => {
|
||||||
try {
|
try {
|
||||||
const params = VmCleanupParamsSchema.parse(req.body);
|
const params = VmCleanupParamsSchema.parse(req.body);
|
||||||
const result = await runVmCleanup(params.mode);
|
return reply.send(await runVmCleanup(params.mode));
|
||||||
return reply.send(result);
|
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
fastify.log.error(error, 'VM cleanup failed');
|
fastify.log.error(error, 'VM cleanup failed');
|
||||||
return reply.code(500).send({ error: error.message || 'VM cleanup failed' });
|
return reply.code(500).send({ error: error.message || 'VM cleanup failed' });
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// ── Unhealthy containers ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// GET /api/vm/containers/unhealthy
|
||||||
|
fastify.get('/vm/containers/unhealthy', {
|
||||||
|
preHandler: async (req) => requireAdmin(req),
|
||||||
|
}, async (_req, reply) => {
|
||||||
|
try {
|
||||||
|
return reply.send({ containers: await getUnhealthyContainers() });
|
||||||
|
} catch (error) {
|
||||||
|
fastify.log.error(error, 'Failed to get unhealthy containers');
|
||||||
|
return reply.code(500).send({ error: 'Failed to get unhealthy containers' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// POST /api/vm/containers/:name/restart
|
||||||
|
fastify.post('/vm/containers/:name/restart', {
|
||||||
|
preHandler: async (req) => requireAdmin(req),
|
||||||
|
}, async (req, reply) => {
|
||||||
|
try {
|
||||||
|
const { name } = VmContainerRestartParamsSchema.parse(req.params);
|
||||||
|
const result = await restartContainer(name);
|
||||||
|
return reply.code(result.success ? 200 : 400).send(result);
|
||||||
|
} catch (error: any) {
|
||||||
|
fastify.log.error(error, 'Container restart failed');
|
||||||
|
return reply.code(500).send({ error: error.message || 'Container restart failed' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// ── Ollama / LLM models ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// GET /api/vm/ollama/models
|
||||||
|
fastify.get('/vm/ollama/models', {
|
||||||
|
preHandler: async (req) => requireAdmin(req),
|
||||||
|
}, async (_req, reply) => {
|
||||||
|
try {
|
||||||
|
return reply.send(await getOllamaModels());
|
||||||
|
} catch (error) {
|
||||||
|
fastify.log.error(error, 'Failed to get Ollama models');
|
||||||
|
return reply.code(500).send({ error: 'Failed to get Ollama models' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// DELETE /api/vm/ollama/models/:name — unload running model
|
||||||
|
fastify.delete('/vm/ollama/models/:name', {
|
||||||
|
preHandler: async (req) => requireAdmin(req),
|
||||||
|
}, async (req, reply) => {
|
||||||
|
try {
|
||||||
|
const name = decodeURIComponent((req.params as any).name ?? '');
|
||||||
|
const result = await unloadOllamaModel(name);
|
||||||
|
return reply.code(result.success ? 200 : 400).send(result);
|
||||||
|
} catch (error: any) {
|
||||||
|
fastify.log.error(error, 'Failed to unload Ollama model');
|
||||||
|
return reply.code(500).send({ error: error.message || 'Unload failed' });
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@ -29,3 +29,10 @@ export const VmCleanupResultSchema = z.object({
|
|||||||
output: z.string(),
|
output: z.string(),
|
||||||
});
|
});
|
||||||
export type VmCleanupResult = z.infer<typeof VmCleanupResultSchema>;
|
export type VmCleanupResult = z.infer<typeof VmCleanupResultSchema>;
|
||||||
|
|
||||||
|
// ── Container restart ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// Route params for POST /vm/containers/:name/restart.
// The name must start with an alphanumeric character and may then contain
// alphanumerics, '_', '.' and '-'. Requiring the leading alphanumeric rejects
// option-like values such as "--help" or "-t" (which the previous /^[\w-]+$/
// accepted), and '.' is now permitted because it is valid in container names.
export const VmContainerRestartParamsSchema = z.object({
  name: z.string().regex(/^[a-zA-Z0-9][a-zA-Z0-9_.-]*$/, 'Invalid container name'),
});
|
||||||
|
export type VmContainerRestartParams = z.infer<typeof VmContainerRestartParamsSchema>;
|
||||||
|
|||||||
@ -22,11 +22,27 @@ services:
|
|||||||
container_name: devops-backend
|
container_name: devops-backend
|
||||||
env_file:
|
env_file:
|
||||||
- backend/.env
|
- backend/.env
|
||||||
|
environment:
|
||||||
|
- VM_SCRIPTS_PATH=/vm-scripts/VMs/HostingerVM
|
||||||
|
- VM_LOG_DIR=/host-logs
|
||||||
ports:
|
ports:
|
||||||
- '4004:4004'
|
- '4004:4004'
|
||||||
networks:
|
networks:
|
||||||
- default
|
- default
|
||||||
- platform_net
|
- platform_net
|
||||||
|
volumes:
|
||||||
|
# Read-only access to VM management scripts
|
||||||
|
- /opt/bytelyst/learning_ai_devops_tools/scripts:/vm-scripts:ro
|
||||||
|
# Read-write access to VM log files (cleanup + health-check write here)
|
||||||
|
- /var/log/vm-cleanup.log:/host-logs/vm-cleanup.log
|
||||||
|
- /var/log/vm-health-check.log:/host-logs/vm-health-check.log
|
||||||
|
- /var/log/docker-watchdog.log:/host-logs/docker-watchdog.log
|
||||||
|
# Docker socket — allows running docker commands against the host daemon
|
||||||
|
# (same pattern as Portainer/cAdvisor; container already runs as root)
|
||||||
|
- /var/run/docker.sock:/var/run/docker.sock
|
||||||
|
extra_hosts:
|
||||||
|
# Reach the host for Ollama API (port 11434) and host-only services
|
||||||
|
- "host-gateway:host-gateway"
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ['CMD', 'curl', '-f', 'http://localhost:4004/health']
|
test: ['CMD', 'curl', '-f', 'http://localhost:4004/health']
|
||||||
|
|||||||
399
docs/VM_OBSERVABILITY_ROADMAP.md
Normal file
399
docs/VM_OBSERVABILITY_ROADMAP.md
Normal file
@ -0,0 +1,399 @@
|
|||||||
|
# VM Observability & Control Roadmap — v2
|
||||||
|
**Status:** Draft — Pending Approval
|
||||||
|
**Last updated:** 2026-05-27
|
||||||
|
**Scope:** `srv1491630` (Hostinger VM) + DevOps Dashboard (`devops.bytelyst.com`)
|
||||||
|
**Reviewed:** Yes — v1 audited against live system; 11 issues corrected (see change log at bottom)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Current State Snapshot
|
||||||
|
|
||||||
|
| Layer | What exists today | Verified gap |
|
||||||
|
|---|---|---|
|
||||||
|
| **Health check** | `vm-health-check.sh` — disk, load, RAM, swap, Docker | No steal time metric; no per-container detail |
|
||||||
|
| **Cleanup** | `vm-cleanup.sh` — build cache, images, logs, apt, pnpm, HOLD | Runs silently; no structured outcome record |
|
||||||
|
| **Cron** | 4 scheduled jobs (daily / weekly / monthly) | No execution history; no "last ran / freed X" |
|
||||||
|
| **Dashboard /vm** | Health check + cleanup log tail + trigger button | **VM module is non-functional** — container has no host volume mounts; all backend calls to host scripts fail silently |
|
||||||
|
| **Dashboard /system** | CPU, RAM, disk, Docker stats | Missing steal %, container detail, unhealthy drill-down |
|
||||||
|
| **Prometheus stack** | Prometheus + cAdvisor + node-exporter + Loki — ~2 weeks history | **No Grafana**; trend data exists but no UI to query it |
|
||||||
|
| **Alerting** | Telegram on WARN/CRIT at 07:00 UTC | No steal time alert; no weekly digest; no cron failure alert |
|
||||||
|
| **Container restart** | 38/39 containers have `unless-stopped` | `unless-stopped` restarts on *process exit only* — does NOT react to health check failures. 7 containers running but unhealthy (process alive, health endpoint dead) |
|
||||||
|
| **LLMs (Ollama)** | 9 models on disk; `qwen2.5-coder:1.5b` currently loaded (1.1 GB, 100% CPU) | No RAM impact warning before loading; no dashboard visibility |
|
||||||
|
| **I/O anomaly** | `invttrdg-backend` writing ~22 GB/day to block storage | Unexplained — no alert, no investigation |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architectural Decisions (settle these before building)
|
||||||
|
|
||||||
|
### A. Trend chart data source
|
||||||
|
**Options:**
|
||||||
|
- ✅ **Query existing Prometheus** from DevOps dashboard (recommended) — data already there, no new store needed. Add Prometheus query endpoints to dashboard backend, render with a chart library.
|
||||||
|
- ➕ **Add Grafana container** alongside Prometheus — purpose-built for metrics UI, out-of-box dashboards. Extra 80–150 MB RAM.
|
||||||
|
- ❌ **New Cosmos DB vm-metrics container** — redundant with Prometheus; wrong tool for time-series.
|
||||||
|
|
||||||
|
**Recommendation:** Query Prometheus from the dashboard for Phase 4.2 charts (keeps everything in one UI). Add Grafana in Phase 5 only if dashboard charts feel limiting.
|
||||||
|
|
||||||
|
### B. Dashboard → host script execution
|
||||||
|
The `devops-backend` container currently has **no host volume mounts** and **no sudoers entry**. Phase 3.2 "Run cleanup from dashboard" requires one of:
|
||||||
|
- ✅ **Mount host script + Docker socket** into devops-backend (simplest, lowest risk)
|
||||||
|
- ➕ **Thin host-side agent** (systemd socket-activated, receives commands via Unix socket)
|
||||||
|
- ❌ **SSH from container to host** — unnecessary complexity
|
||||||
|
|
||||||
|
**Recommendation:** Mount `/opt/bytelyst/learning_ai_devops_tools/scripts` read-only + `/var/log` for log reading into devops-backend. Add sudoers entry for the cleanup script only.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 0 — Fix Broken Foundations *(Day 1–2, prerequisite for all UI phases)*
|
||||||
|
|
||||||
|
These are not new features — they are bugs in the current system.
|
||||||
|
|
||||||
|
#### 0.1 Fix devops-backend VM module (host volume mounts)
|
||||||
|
**Problem:** `GET /api/vm/health`, `GET /api/vm/cleanup-log`, `POST /api/vm/cleanup` all fail because the container has no access to the host filesystem.
|
||||||
|
**Fix:** Update `docker-compose.yml` for devops-backend:
|
||||||
|
```yaml
|
||||||
|
volumes:
|
||||||
|
- /opt/bytelyst/learning_ai_devops_tools/scripts:/scripts:ro
|
||||||
|
- /var/log/vm-cleanup.log:/var/log/vm-cleanup.log:ro
|
||||||
|
- /var/log/vm-health-check.log:/var/log/vm-health-check.log:ro
|
||||||
|
```
|
||||||
|
Update `repository.ts` to use `/scripts/VMs/HostingerVM/vm-cleanup.sh` path, or use env var `VM_SCRIPTS_PATH`.
|
||||||
|
Add sudoers entry: `nobody ALL=(ALL) NOPASSWD: /scripts/VMs/HostingerVM/vm-cleanup.sh`
|
||||||
|
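**Risk:** Low. Scripts are mounted read-only. Note that the log mounts shown above are also `:ro`; a bind mount cannot be made append-only, so drop `:ro` on the log files if the container itself must write to them.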
|
||||||
|
**Validation:** Run `curl http://localhost:4004/api/vm/health` and confirm it returns a JSON response.
|
||||||
|
|
||||||
|
#### 0.2 Add logrotate entry for new log files
|
||||||
|
**Problem:** `/var/log/vm-cleanup.log` and `/var/log/vm-health-check.log` have no rotation policy. Will grow unbounded.
|
||||||
|
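**Fix:** Create `/etc/logrotate.d/bytelyst-vm` (if these files are bind-mounted individually into a container, as in Phase 0.1, consider `copytruncate` instead of `create`: rotation by rename leaves the container's mount pointing at the old, rotated file):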
|
||||||
|
```
|
||||||
|
/var/log/vm-cleanup.log /var/log/vm-health-check.log /var/log/docker-watchdog.log {
|
||||||
|
weekly
|
||||||
|
rotate 8
|
||||||
|
compress
|
||||||
|
delaycompress
|
||||||
|
missingok
|
||||||
|
notifempty
|
||||||
|
create 0644 root root
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 0.3 Investigate `invttrdg-backend` I/O anomaly
|
||||||
|
**Problem:** 22.2 GB of block writes in 13 hours (~1.7 GB/hr). At this rate that is ~40 GB/day; if the written data is retained on disk, the 123 GB of free space would fill in ~3 days of heavy trading activity.
|
||||||
|
**Fix path:** Check what's being written (WAL logs? tick data? verbose debug logging?). Likely a log level or persistence config issue. Add disk usage alert specific to this container.
|
||||||
|
**Risk of not fixing:** Disk fills up, all services go down.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 1 — Observability Gaps *(Week 1)*
|
||||||
|
|
||||||
|
Read-only additions to existing scripts and the `/vm` dashboard page.
|
||||||
|
|
||||||
|
#### 1.1 Cron Job Execution History Panel
|
||||||
|
**Where:** Dashboard `/vm` page — new "Maintenance Schedule" card
|
||||||
|
**What:** Add `GET /api/vm/cron-status` endpoint that:
|
||||||
|
1. Parses crontab entries for the 4 managed jobs (look for `# bytelyst-vm-maintenance` block)
|
||||||
|
2. Parses `/var/log/vm-cleanup.log` into structured run objects: `{ timestamp, mode, diskBefore, diskAfter, freedMB, steps[], success }`
|
||||||
|
3. Calculates next run from cron expression
|
||||||
|
|
||||||
|
**UI:** Table — job name | schedule | last run | freed | status | next run. Expandable row shows step-by-step log.
|
||||||
|
**Dependency:** Requires Phase 0.1 (volume mount for log access).
|
||||||
|
|
||||||
|
#### 1.2 CPU Steal Time Metric
|
||||||
|
**Where:** `vm-health-check.sh` + dashboard `/vm` health cards
|
||||||
|
**What:** Sample `/proc/stat` twice 1 second apart, compute steal %:
|
||||||
|
```bash
|
||||||
|
read_steal() { awk '/^cpu /{print $9" "$2+$3+$4+$5+$6+$7+$8+$9+$10}' /proc/stat; }
|
||||||
|
s1=$(read_steal); sleep 1; s2=$(read_steal)
|
||||||
|
steal_pct=$(awk -v s1="$s1" -v s2="$s2" 'BEGIN{
|
||||||
|
split(s1,a," "); split(s2,b," ")
|
||||||
|
delta_steal=b[1]-a[1]; delta_total=b[2]-a[2]
|
||||||
|
printf "%.1f", (delta_steal/delta_total)*100
|
||||||
|
}')
|
||||||
|
```
|
||||||
|
Thresholds: `> 5%` = WARN, `> 15%` = CRIT.
|
||||||
|
**Why:** Currently at **8.2%** — silently degrading every API response and LLM inference call.
|
||||||
|
**Dependency:** None. Self-contained script change.
|
||||||
|
|
||||||
|
#### 1.3 Unhealthy Container Detail Panel
|
||||||
|
**Where:** Dashboard `/vm` — expand container health card
|
||||||
|
**What:** New `GET /api/vm/containers/unhealthy` endpoint:
|
||||||
|
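- Container name, `unhealthy` since (approximated from `docker inspect` `.State.Health.Log[0].Start` — Docker retains only the most recent health-check results, so this is the oldest retained check, not necessarily the first failure)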
|
||||||
|
- Last 3 health check log lines
|
||||||
|
- Current restart count
|
||||||
|
|
||||||
|
**UI:** Expandable per-container row with one-click restart button (calls existing or new `POST /api/vm/containers/:name/restart`).
|
||||||
|
**Dependency:** Requires Phase 0.1.
|
||||||
|
|
||||||
|
#### 1.4 Swap Pressure Indicator
|
||||||
|
**Where:** `vm-health-check.sh` + dashboard
|
||||||
|
**What:** Add `SwapCached` as secondary metric. High SwapCached relative to SwapUsed = system was recently under pressure even if swap looks ok now. Surface in daily Telegram alert even when overall = WARN not CRIT.
|
||||||
|
**Threshold change:** Current `SWAP_USED_WARN_GB=1` triggers today (1.4 GB in use). Consider raising to `1.5` to reduce noise while keeping the `SwapCached > 200MB` as an early warning signal.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 2 — Self-Healing Automation *(Week 2)*
|
||||||
|
|
||||||
|
Scripts that fix known recurring issues automatically.
|
||||||
|
|
||||||
|
#### 2.1 Health-Check-Aware Container Watchdog
|
||||||
|
**Why the existing policy isn't enough:** 38 of the 39 containers already have `unless-stopped`. That policy restarts on *container process exit* only. When the web server process is alive but the health check endpoint returns `Connection refused`, Docker marks the container `unhealthy` but **does not restart it** — it keeps running indefinitely broken.
|
||||||
|
**Fix:** Systemd timer `docker-health-watchdog.timer` (runs every 10 minutes):
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
# /usr/local/bin/docker-health-watchdog.sh
|
||||||
|
UNHEALTHY=$(docker ps --filter health=unhealthy --format '{{.Names}}')
|
||||||
|
for container in $UNHEALTHY; do
|
||||||
|
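# Only restart if the last 3 recorded health checks all failed (entries in .State.Health.Log are spaced by the container's own healthcheck interval, not by the 10-minute watchdog timer)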
|
||||||
|
failures=$(docker inspect "$container" | \
|
||||||
|
python3 -c "import json,sys; h=json.load(sys.stdin)[0]['State']['Health']['Log']; \
|
||||||
|
print(sum(1 for l in h[-3:] if l['ExitCode']!=0))")
|
||||||
|
if [[ "$failures" -eq 3 ]]; then
|
||||||
|
docker restart "$container"
|
||||||
|
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] Auto-restarted: $container (unhealthy 3x)" \
|
||||||
|
>> /var/log/docker-watchdog.log
|
||||||
|
# Telegram notify (reads token from $HERMES_HOME/.env)
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
```
|
||||||
|
**Safety:** Never restarts a container that just became unhealthy (3-check cooldown). Logs every restart. Only targets health-check failures, not intentionally stopped containers.
|
||||||
|
**Rollback:** `systemctl disable docker-health-watchdog.timer`
|
||||||
|
|
||||||
|
#### 2.2 Fix `hermes-root-backup` Git Diverge
|
||||||
|
**Current failure:** Git fast-forward fails every ~10 minutes since 16:25 today (~30+ silent failures).
|
||||||
|
**Fix:** Patch the backup script to handle diverge gracefully:
|
||||||
|
```bash
|
||||||
|
if ! git pull --ff-only 2>/dev/null; then
|
||||||
|
# Log the diverge
|
||||||
|
git log --oneline -3 HEAD > /tmp/hermes-diverge-before.txt
|
||||||
|
git log --oneline -3 origin/main >> /tmp/hermes-diverge-before.txt
|
||||||
|
# Try rebase first (preserves local commits if intentional)
|
||||||
|
if ! git pull --rebase 2>/dev/null; then
|
||||||
|
# If rebase fails, reset to origin (backup is the source of truth)
|
||||||
|
git reset --hard origin/main
|
||||||
|
notify_telegram "⚠️ hermes-root-backup: diverged branch reset to origin/main"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
```
|
||||||
|
**Risk:** `git reset --hard` loses any local-only commits on the backup repo. Acceptable here because the backup script's job is to *push to* origin — local commits shouldn't exist. Add a pre-check: if local commits exist that aren't on origin, alert instead of resetting.
|
||||||
|
|
||||||
|
#### 2.3 Container Memory Limits
|
||||||
|
**Validated against actual RSS data (Phase 2 data collected 2026-05-27):**
|
||||||
|
|
||||||
|
| Category | Current RSS | Proposed Limit | Reservation | Notes |
|
||||||
|
|---|---|---|---|---|
|
||||||
|
| Next.js web frontends | 17–37 MB | `256m` | `64m` | 7× headroom for webpack spikes |
|
||||||
|
| Node/Fastify backends | 20–67 MB | `384m` | `128m` | Allows burst for LLM calls |
|
||||||
|
| `invttrdg-backend` | 107 MB | `512m` | `256m` | High I/O service; watch after 0.3 |
|
||||||
|
| `trading-backend` | 92 MB | `512m` | `256m` | Active algo trading service |
|
||||||
|
| `platform-service` | 66 MB | `384m` | `128m` | Shared auth/platform layer |
|
||||||
|
| CosmosDB emulator | 145 MB | `1g` | `512m` | Can spike on write bursts |
|
||||||
|
| Prometheus | 57 MB | `256m` | `128m` | Stable but grows with series |
|
||||||
|
| Loki | 53 MB | `256m` | `128m` | Log ingestion can spike |
|
||||||
|
| Caddy | 27 MB | `128m` | `64m` | Proxy, very stable |
|
||||||
|
| Valkey (Redis) | 3.5 MB | `128m` | `32m` | Cache, tiny |
|
||||||
|
| Gitea | 79 MB | `512m` | `256m` | Git operations can spike |
|
||||||
|
| Ollama | 130 MB idle | **No limit** | — | Must accommodate model load (up to 8 GB) |
|
||||||
|
|
||||||
|
**Rollout strategy:**
|
||||||
|
1. Run `docker stats` baseline for 24h to confirm no container spikes beyond proposed limits
|
||||||
|
2. Apply limits per stack in docker-compose files (not `docker update` — recreate on next deploy)
|
||||||
|
3. Monitor for OOMKill events: `dmesg | grep -i oom` for 48h after rollout
|
||||||
|
4. **Never set limits on Ollama** — model loading is unpredictable and limits would kill inference
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 3 — Dashboard Control Plane *(Weeks 3–4)*
|
||||||
|
|
||||||
|
**Prerequisite for all Phase 3 items:** Phase 0.1 (host volume mount) must be complete.
|
||||||
|
|
||||||
|
#### 3.1 VM Score Card (Automated)
|
||||||
|
**Where:** Dashboard `/vm` — top summary widget, auto-refreshes every 5 min
|
||||||
|
**Scoring algorithm (0–100):**
|
||||||
|
```
|
||||||
|
CPU efficiency: 20 pts (steal < 2% = 20, < 5% = 15, < 10% = 10, ≥ 10% = 5)
|
||||||
|
Memory pressure: 20 pts (available > 6 GB = 20, > 3 GB = 15, > 1 GB = 5, else = 0)
|
||||||
|
Disk health: 15 pts (< 40% used = 15, < 55% = 10, < 70% = 5, else = 0)
|
||||||
|
Service health: 20 pts (0 unhealthy = 20, 1–2 = 15, 3–5 = 8, 6+ = 2)
|
||||||
|
Maintenance hygiene: 15 pts (last cleanup < 7 days + freed > 0 = 15, < 30 days = 8, else = 0)
|
||||||
|
LLM readiness: 10 pts (> 8 GB free RAM = 10, > 4 GB = 7, > 2 GB = 4, else = 1)
|
||||||
|
```
|
||||||
|
Score = sum. Display as gauge. Each dimension clickable to drill into its data.
|
||||||
|
**Dependencies:** Phase 1.2 (steal time in health check output).
|
||||||
|
|
||||||
|
#### 3.2 Cron Schedule & History Panel
|
||||||
|
**Where:** Dashboard `/vm` — "Maintenance" tab
|
||||||
|
**What:**
|
||||||
|
- Live table: 4 cron jobs × (last run, result, freed MB, next scheduled, "Run now" button)
|
||||||
|
- Last 30 cleanup runs as a sparkline: date vs MB freed
|
||||||
|
- One-click trigger for weekly / monthly / dry-run
|
||||||
|
|
||||||
|
**Backend endpoint:** `GET /api/vm/cron-status` — parse structured log + crontab
|
||||||
|
**Dependency:** Phase 0.1 (volume mount), Phase 1.1 (structured log parser).
|
||||||
|
|
||||||
|
#### 3.3 Container Management Panel
|
||||||
|
**Where:** Dashboard `/vm` — "Containers" tab
|
||||||
|
**What:**
|
||||||
|
- Full list: name, stack, health status, uptime, CPU %, RAM, restart count
|
||||||
|
- Filter chips: All | Unhealthy | No Memory Limit | By stack
|
||||||
|
- Per-container: Restart, View last 50 log lines, Show health check history
|
||||||
|
- Bulk: "Restart all unhealthy" with confirmation modal
|
||||||
|
|
||||||
|
**New backend endpoints:** `GET /api/vm/containers`, `POST /api/vm/containers/:name/restart`, `GET /api/vm/containers/:name/logs`
|
||||||
|
|
||||||
|
#### 3.4 Ollama / LLM Panel
|
||||||
|
**Where:** Dashboard `/vm` — "Models" tab
|
||||||
|
**What:**
|
||||||
|
- Models list: name, size, last used timestamp
|
||||||
|
- Currently loaded (from `ollama ps`): model name, RAM used, CPU %, expires in
|
||||||
|
- RAM visualisation bar: [used by system] [model if loaded] [free]
|
||||||
|
- Warning banner: "Loading llama3.2-vision (7.8 GB) will leave ~1.2 GB free — swap pressure likely"
|
||||||
|
- Load / Unload model buttons
|
||||||
|
|
||||||
|
**Backend endpoints:** `GET /api/vm/ollama/models`, `POST /api/vm/ollama/load`, `DELETE /api/vm/ollama/unload`
|
||||||
|
**Note:** `qwen2.5-coder:1.5b` is currently loaded — confirmed via `ollama ps`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 4 — Trend Analysis *(Weeks 5–6)*
|
||||||
|
|
||||||
|
**Key architectural note:** Prometheus + cAdvisor + node-exporter are **already running and storing ~2 weeks of metrics history** including steal time, disk I/O, memory, container CPU/RAM. Do NOT create a separate Cosmos DB store. Query Prometheus directly.
|
||||||
|
|
||||||
|
#### 4.1 Prometheus Query Endpoints in Dashboard Backend
|
||||||
|
**Where:** New `GET /api/vm/metrics/trend` endpoint group
|
||||||
|
**What:** Proxy queries to internal Prometheus (http://prometheus:9090 within Docker network):
|
||||||
|
```
|
||||||
|
/api/vm/metrics/trend/disk?range=7d → disk usage % over time
|
||||||
|
/api/vm/metrics/trend/memory?range=7d → available RAM + swap used over time
|
||||||
|
/api/vm/metrics/trend/steal?range=7d → CPU steal % over time (once 1.2 is deployed)
|
||||||
|
/api/vm/metrics/trend/containers?range=7d → unhealthy container count over time
|
||||||
|
/api/vm/metrics/trend/io?range=7d → block write rate (flag invttrdg spikes)
|
||||||
|
```
|
||||||
|
**Note:** `devops-backend` is on `dashboard_default` network, Prometheus is on `learning_ai_common_plat_default`. Either add devops-backend to Prometheus network, or expose Prometheus on a host port (internal only, not via Caddy).
|
||||||
|
|
||||||
|
#### 4.2 Trend Charts on Dashboard
|
||||||
|
**Where:** Dashboard `/vm` — collapsible "Trends" section below score card
|
||||||
|
**What (7-day / 30-day toggle):**
|
||||||
|
- Disk % over time + linear projection line → "estimated to hit 55% warning in X days"
|
||||||
|
- Swap used over time (detect slow memory leak)
|
||||||
|
- CPU steal % over time (detect host degradation trend)
|
||||||
|
- Unhealthy container count per day
|
||||||
|
- Block write rate: flag days with `invttrdg-backend` anomalies
|
||||||
|
|
||||||
|
**Library recommendation:** Recharts (likely already in the Next.js project) or a lightweight Chart.js wrapper.
|
||||||
|
|
||||||
|
#### 4.3 Weekly Digest (Telegram)
|
||||||
|
**Where:** New cron job — Monday 08:00 UTC — `vm-cleanup.sh --weekly-digest`
|
||||||
|
**What:**
|
||||||
|
```
|
||||||
|
📊 Weekly VM Digest — srv1491630
|
||||||
|
Week ending 2026-06-01
|
||||||
|
|
||||||
|
🖥 CPU Steal: 8.2% avg ⚠️ (host contention — escalate if > 10%)
|
||||||
|
💾 Disk: 37% (freed 257 MB this week via cleanup)
|
||||||
|
🧠 RAM: 10 GB free avg ✓
|
||||||
|
🔄 Swap peak: 1.4 GB ⚠️
|
||||||
|
🐳 Containers: 7 unhealthy (action required)
|
||||||
|
🤖 LLMs run: qwen2.5-coder:1.5b (3 sessions this week)
|
||||||
|
🧹 Cleanups: 1 standard, 0 full
|
||||||
|
📅 Next full: 2026-06-01
|
||||||
|
|
||||||
|
Top action: Restart 7 unhealthy web containers
|
||||||
|
```
|
||||||
|
**Dependency:** Phase 4.1 (needs Prometheus for weekly averages), Phase 1.2 (steal metric must be in Prometheus).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 5 — Advanced / Backlog
|
||||||
|
|
||||||
|
| Item | Description | Trigger condition |
|
||||||
|
|---|---|---|
|
||||||
|
| **Add Grafana** | Container alongside Prometheus for richer dashboards; pre-built node-exporter dashboards available | Phase 4 charts feel limited |
|
||||||
|
| **Deployment ↔ health correlation** | Mark deploys on trend charts; correlate health dips to specific releases | After Phase 4.2 exists |
|
||||||
|
| **Multi-VM support** | Extend all above to aggregate across VMs | Adding second VM |
|
||||||
|
| **`invttrdg-backend` write audit** | Persistent investigation: what generates 22 GB/day of block writes? Add per-container I/O alert | After Phase 0.3 |
|
||||||
|
| **Chaos validation** | Monthly: watchdog stops a test container, verify restart within 10 min, report result | After Phase 2.1 |
|
||||||
|
| **Ollama GPU readiness check** | Detect GPU availability, surface in LLM panel as "GPU: none — inference will be slow" | Before adding large models |
|
||||||
|
| **Container image freshness** | Alert when container is running image > 30 days old (not rebuilt) | When deploy pipeline matures |
|
||||||
|
| **Cost attribution** | Tag containers by product (trading, notes, clock...) — RAM/CPU cost per product | When billing needed |
|
||||||
|
| **Backup health tracking** | `hermes-root-backup` and `uma-hermes-backup` results surfaced in dashboard | After Phase 2.2 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Implementation Order
|
||||||
|
|
||||||
|
```
|
||||||
|
Day 1–2 Phase 0 ── Fix broken foundations (VM module, logrotate, I/O investigation)
|
||||||
|
⚠️ MUST complete before any Phase 3 dashboard work
|
||||||
|
|
||||||
|
Week 1 Phase 1 ── Observability (steal metric, cron history, unhealthy detail, swap)
|
||||||
|
1.2 (steal) → unblocks 3.1 (score card)
|
||||||
|
1.1 (cron log format) → unblocks 3.2 (cron panel)
|
||||||
|
|
||||||
|
Week 2 Phase 2 ── Self-healing (watchdog, hermes-backup fix, memory limits)
|
||||||
|
2.1 requires: logrotate entry (Phase 0.2)
|
||||||
|
2.3 requires: 24h baseline observation first
|
||||||
|
|
||||||
|
Weeks 3–4 Phase 3 ── Dashboard control (score card, cron panel, containers, Ollama)
|
||||||
|
All require: Phase 0.1 (host volume mount)
|
||||||
|
3.1 requires: Phase 1.2 deployed
|
||||||
|
3.2 requires: Phase 1.1 deployed
|
||||||
|
|
||||||
|
Weeks 5–6 Phase 4 ── Trend analysis (Prometheus queries, charts, weekly digest)
|
||||||
|
4.1 requires: devops-backend on same Docker network as Prometheus
|
||||||
|
4.2 requires: Phase 4.1
|
||||||
|
4.3 requires: Phase 4.1 + Phase 1.2
|
||||||
|
|
||||||
|
Backlog Phase 5 ── Advanced items, trigger-based
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Success Criteria (how to know each phase is done)
|
||||||
|
|
||||||
|
| Phase | Done when… |
|
||||||
|
|---|---|
|
||||||
|
| 0.1 | `curl localhost:4004/api/vm/health` returns valid JSON with disk/load/swap data |
|
||||||
|
| 0.2 | `logrotate -d /etc/logrotate.d/bytelyst-vm` exits 0; logs present in `/var/log` |
|
||||||
|
| 0.3 | Root cause of 22 GB/day writes identified + alert configured |
|
||||||
|
| 1.1 | Dashboard `/vm` shows "Last cleanup: [date], freed [MB]" parsed from log |
|
||||||
|
| 1.2 | `vm-health-check.sh` includes steal % in output; Telegram sends steal alert at > 5% |
|
||||||
|
| 1.3 | Dashboard shows each unhealthy container's last health log + restart button works |
|
||||||
|
| 2.1 | Watchdog restarts an intentionally-broken test container within 30 min |
|
||||||
|
| 2.2 | `hermes-root-backup` runs 10 times without failure after fix deployed |
|
||||||
|
| 2.3 | All containers show memory limits in `docker inspect`; 48h with 0 OOMKill events |
|
||||||
|
| 3.1 | Score card renders live score; each dimension links to its detail |
|
||||||
|
| 4.1 | `/api/vm/metrics/trend/disk?range=7d` returns valid Prometheus time-series JSON |
|
||||||
|
| 4.3 | Telegram receives weekly digest on Monday 08:00 UTC |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## What This Roadmap Delivers
|
||||||
|
|
||||||
|
| Today | After roadmap |
|
||||||
|
|---|---|
|
||||||
|
| `/api/vm/health` silently fails | VM module works; health data feeds dashboard |
|
||||||
|
| 8.2% steal is invisible | Daily alert + trend chart + score card dimension |
|
||||||
|
| "7 unhealthy" — no context, no fix | Drill-down to health log; auto-restart within 30 min |
|
||||||
|
| Cleanup log is a raw text dump | Structured panel: when, what, how much freed |
|
||||||
|
| invttrdg writing 22 GB/day — undetected | I/O alert + investigation complete |
|
||||||
|
| No memory guardrails on 39 containers | Per-container limits; OOM events alerted |
|
||||||
|
| 2 weeks of Prometheus data — no UI | Trend charts: disk projection, swap, steal over time |
|
||||||
|
| Manual VM diagnosis = 30 min SSH session | Score card auto-refreshes every 5 min |
|
||||||
|
| Ollama loads silently, may cause swap storm | RAM impact warning before load |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Change Log (v1 → v2)
|
||||||
|
|
||||||
|
| # | What changed | Why |
|
||||||
|
|---|---|---|
|
||||||
|
| 1 | Added **Phase 0** (fix broken foundations) | devops-backend VM module non-functional; must fix first |
|
||||||
|
| 2 | Phase 4.1 changed from Cosmos DB → **Prometheus queries** | Prometheus already running with 2 weeks of history; Cosmos would be duplicate |
|
||||||
|
| 3 | Phase 2.1 restart explanation corrected | `unless-stopped` does not react to health check failures; process is alive |
|
||||||
|
| 4 | Phase 1.2 steal time corrected | Requires **2 samples** 1s apart, not single `/proc/stat` read |
|
||||||
|
| 5 | Phase 2.3 memory limits **validated against actual RSS data** | Prevents proposing limits that would OOM running services |
|
||||||
|
| 6 | Phase 5 added **invttrdg I/O investigation** + Grafana option | 22 GB/day block writes is the highest-risk untracked issue on the machine |
|
||||||
|
| 7 | Added Phase 0.2 **logrotate** for new log files | `/var/log/docker-watchdog.log` would grow unbounded |
|
||||||
|
| 8 | Added **architectural decisions** section (Prometheus vs Cosmos, host exec strategy) | Prevents wasted build on wrong approach |
|
||||||
|
| 9 | Added **success criteria** per phase | Makes "done" objective and testable |
|
||||||
|
| 10 | Added explicit **phase dependency map** | Phase 3 items would fail if built before Phase 0 |
|
||||||
|
| 11 | Corrected LLM status: `qwen2.5-coder:1.5b` **is currently loaded** | `ollama ps` confirmed; not idle as v1 stated |
|
||||||
@ -183,6 +183,47 @@ do_uninstall_cron() {
|
|||||||
|
|
||||||
# ── Cleanup steps ─────────────────────────────────────────────────────────────
|
# ── Cleanup steps ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Cleanup step: truncate the CosmosDB emulator's pglog.log on the host when it
# has grown to 100 MB or more. Logs SKIP and returns early when the container
# is not running, the file cannot be located, or the file is under 100 MB.
step_cosmos_pglog() {
|
||||||
|
# The Azure CosmosDB emulator uses an embedded Postgres instance that logs
|
||||||
|
# every SQL statement to /logs/pglog.log inside its overlay layer.
|
||||||
|
# It grows ~275 MB/hr during heavy trading activity. Truncate it safely —
|
||||||
|
# Postgres keeps the file descriptor open so truncation doesn't break it.
|
||||||
|
log_header "CosmosDB Emulator Postgres Log"
|
||||||
|
local container="learning_ai_common_plat-cosmos-emulator-1"
|
||||||
|
# Anchored match: only proceed when a running container has exactly this name.
if ! docker ps --format '{{.Names}}' 2>/dev/null | grep -q "^${container}$"; then
|
||||||
|
log_step SKIP "CosmosDB emulator not running"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
# Locate the overlay upper dir for this container
|
||||||
|
local pglog
|
||||||
|
# Feed `docker inspect` JSON to an inline Python snippet that prints the
# host-side path of pglog.log: GraphDriver.Data.UpperDir is tried first, then
# a glob under /var/lib/docker/rootfs/overlayfs.
# NOTE(review): the glob fallback is not scoped to this container; it prints
# the first match from any directory there. Confirm that is acceptable.
# NOTE(review): the snippet's leading whitespace is not visible in this view,
# so the nesting of exit() relative to the exists-check could not be confirmed.
pglog=$(docker inspect "$container" 2>/dev/null \
|
||||||
|
| python3 -c "
|
||||||
|
import json,sys,os
|
||||||
|
d=json.load(sys.stdin)[0]
|
||||||
|
# Try direct GraphDriver path first
|
||||||
|
upper=d.get('GraphDriver',{}).get('Data',{}).get('UpperDir','')
|
||||||
|
if upper:
|
||||||
|
p=os.path.join(upper,'logs','pglog.log')
|
||||||
|
if os.path.exists(p): print(p)
|
||||||
|
exit()
|
||||||
|
# Fallback: scan rootfs overlayfs dirs
|
||||||
|
import glob
|
||||||
|
for f in glob.glob('/var/lib/docker/rootfs/overlayfs/*/logs/pglog.log'):
|
||||||
|
print(f); exit()
|
||||||
|
" 2>/dev/null || true)
|
||||||
|
# Lookup errors are swallowed above (stderr discarded, `|| true`), so any
# failure leaves $pglog empty and is reported as a SKIP here.
if [[ -z "$pglog" || ! -f "$pglog" ]]; then
|
||||||
|
log_step SKIP "pglog.log not found (overlay path changed?)"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
local size_mb
|
||||||
|
# du -sm reports the file's disk usage in MB; cut keeps the numeric column.
# NOTE(review): unless pipefail is set, `|| echo 0` reflects cut's exit status,
# not du's, so a du failure would leave size_mb empty rather than 0 — confirm.
size_mb=$(du -sm "$pglog" 2>/dev/null | cut -f1 || echo 0)
|
||||||
|
if (( size_mb < 100 )); then
|
||||||
|
log_step SKIP "pglog.log is ${size_mb}MB — no truncation needed (<100 MB)"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
# run_cmd is defined elsewhere in this script; presumably it logs the
# description and honours dry-run mode before executing — confirm.
run_cmd SAFE "Truncate CosmosDB pglog.log (${size_mb}MB → 0)" truncate -s 0 "$pglog"
|
||||||
|
}
|
||||||
|
|
||||||
step_docker_build_cache() {
|
step_docker_build_cache() {
|
||||||
log_header "Docker Build Cache"
|
log_header "Docker Build Cache"
|
||||||
if ! docker info &>/dev/null 2>&1; then
|
if ! docker info &>/dev/null 2>&1; then
|
||||||
@ -207,6 +248,21 @@ step_docker_system_prune() {
|
|||||||
docker system prune -f
|
docker system prune -f
|
||||||
}
|
}
|
||||||
|
|
||||||
|
step_docker_aged_images() {
  # Prunes every image (tagged or dangling) that no container references AND
  # that was created more than 7 days (168h) ago.
  #
  # Docker's `until` filter compares against the image *creation* timestamp,
  # not the time the image was last used, so an image built more than a week
  # ago becomes eligible as soon as its last container is removed.
  # Images referenced by an existing container are never removed by `prune`.
  log_header "Docker Aged Image Prune (unused >7 days)"
  if ! docker info &>/dev/null; then
    log_step SKIP "Docker not running"
    return
  fi

  # Ask docker for the RECLAIMABLE figure by field name rather than by column
  # position: in the default table, column 4 is SIZE (total), not RECLAIMABLE.
  # `|| true` keeps a docker failure from aborting the script under `set -e`;
  # an empty result is rendered as "?" in the log line below.
  local reclaimable
  reclaimable=$(docker system df --format '{{.Type}}|{{.Reclaimable}}' 2>/dev/null \
    | awk -F'|' '$1 == "Images" {print $2}' || true)

  run_cmd SAFE "Prune images unused for >7 days (currently ${reclaimable:-?} reclaimable)" \
    docker image prune -a -f --filter "until=168h"
}
|
||||||
|
|
||||||
step_docker_crash_loop_check() {
|
step_docker_crash_loop_check() {
|
||||||
log_header "Crash Loop Check"
|
log_header "Crash Loop Check"
|
||||||
if ! docker info &>/dev/null 2>&1; then return; fi
|
if ! docker info &>/dev/null 2>&1; then return; fi
|
||||||
@ -260,7 +316,7 @@ step_next_cache() {
|
|||||||
while IFS= read -r cache_dir; do
|
while IFS= read -r cache_dir; do
|
||||||
log_step SAFE "Remove $cache_dir"
|
log_step SAFE "Remove $cache_dir"
|
||||||
if ! $DRY_RUN; then rm -rf "$cache_dir"; fi
|
if ! $DRY_RUN; then rm -rf "$cache_dir"; fi
|
||||||
(( count++ ))
|
count=$(( count + 1 ))
|
||||||
done < <(
|
done < <(
|
||||||
find /opt/bytelyst -name ".next" -maxdepth 7 -type d 2>/dev/null \
|
find /opt/bytelyst -name ".next" -maxdepth 7 -type d 2>/dev/null \
|
||||||
| while read -r d; do
|
| while read -r d; do
|
||||||
@ -287,7 +343,7 @@ step_old_logs() {
|
|||||||
for f in /var/log/syslog.1 /var/log/kern.log.1 /var/log/ufw.log.1; do
|
for f in /var/log/syslog.1 /var/log/kern.log.1 /var/log/ufw.log.1; do
|
||||||
if [[ -f "$f" && ! -f "${f}.gz" ]]; then
|
if [[ -f "$f" && ! -f "${f}.gz" ]]; then
|
||||||
run_cmd SAFE "Compress $f" gzip -9 "$f"
|
run_cmd SAFE "Compress $f" gzip -9 "$f"
|
||||||
(( count++ ))
|
count=$(( count + 1 ))
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
# Remove log rotations older than 30 days
|
# Remove log rotations older than 30 days
|
||||||
@ -307,7 +363,7 @@ step_hold_cleanup() {
|
|||||||
size=$(du -sm "$nm" 2>/dev/null | cut -f1 || echo "0")
|
size=$(du -sm "$nm" 2>/dev/null | cut -f1 || echo "0")
|
||||||
run_cmd CAREFUL "Delete archived node_modules: $nm (~${size}MB)" rm -rf "$nm"
|
run_cmd CAREFUL "Delete archived node_modules: $nm (~${size}MB)" rm -rf "$nm"
|
||||||
total_freed=$(( total_freed + size ))
|
total_freed=$(( total_freed + size ))
|
||||||
(( found++ ))
|
found=$(( found + 1 ))
|
||||||
done < <(
|
done < <(
|
||||||
find /opt/bytelyst/HOLD -name "node_modules" -maxdepth 4 -type d 2>/dev/null || true
|
find /opt/bytelyst/HOLD -name "node_modules" -maxdepth 4 -type d 2>/dev/null || true
|
||||||
)
|
)
|
||||||
@ -345,6 +401,7 @@ log "[START] mode=${_mode} dry=$DRY_RUN"
|
|||||||
record_disk_before
|
record_disk_before
|
||||||
|
|
||||||
# ── WEEKLY (always run) ──────────────────────────────────────────────────────
|
# ── WEEKLY (always run) ──────────────────────────────────────────────────────
|
||||||
|
step_cosmos_pglog
|
||||||
step_docker_build_cache
|
step_docker_build_cache
|
||||||
step_docker_crash_loop_check
|
step_docker_crash_loop_check
|
||||||
step_journal
|
step_journal
|
||||||
@ -355,6 +412,7 @@ step_next_cache
|
|||||||
# ── MONTHLY (only with --full) ───────────────────────────────────────────────
|
# ── MONTHLY (only with --full) ───────────────────────────────────────────────
|
||||||
if $FULL_MODE; then
|
if $FULL_MODE; then
|
||||||
step_docker_system_prune
|
step_docker_system_prune
|
||||||
|
step_docker_aged_images
|
||||||
step_pnpm_store
|
step_pnpm_store
|
||||||
step_old_logs
|
step_old_logs
|
||||||
step_hold_cleanup
|
step_hold_cleanup
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user