From cf5428acd1dd3ab68efa44f4961794804955d77d Mon Sep 17 00:00:00 2001 From: Hermes VM Date: Sat, 30 May 2026 06:33:59 +0000 Subject: [PATCH] =?UTF-8?q?feat(dashboard):=20Phase=201=20=E2=80=94=20hard?= =?UTF-8?q?en=20hermes-ops=20backend=20+=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Short-TTL (30s) snapshot cache + in-flight coalescing so the panel poll and concurrent refreshes don't fan out ~20 systemctl/git/ps/du subprocesses each time; snapshot carries a `cached` flag and `getHermesOpsSnapshot({force})`. - Distinguish "unit inactive" (down) from "probe couldn't run" (unknown): a new exec() wrapper reports whether the command actually ran (ENOENT/timeout = unknown) vs exited non-zero with output (e.g. systemctl is-active -> inactive). Per-field ProbeStatus on gateway/dashboard/timer/repo; warnings differentiate "is not active" from "status could not be determined". - Robust Bheem/Uma checks: `runuser -u uma -- systemctl --user is-active/ is-enabled` with a ps / existsSync fallback so a failed probe degrades to the legacy check instead of a false "down". - Zod schema (HermesOpsSnapshotSchema) as the stable typed contract; the route validates output before sending. New status fields are additive (active/ enabled/url/etc. preserved) so the existing web client is unaffected. - Unit tests (mock execFile/fs): healthy snapshot, down vs unknown mapping, runuser->ps fallback, unreadable repo, cache hit + force bypass, request coalescing. Backend: 16 tests green. Roadmap: check off Phase 1 items and Phase 5 P0 in hermes_dashboard_v2_roadmap.md. 
Co-Authored-By: Claude Opus 4.8 --- .../src/modules/hermes-ops/hermes-ops.test.ts | 186 +++++++++++++ .../src/modules/hermes-ops/repository.ts | 247 +++++++++++++----- .../backend/src/modules/hermes-ops/routes.ts | 7 +- .../backend/src/modules/hermes-ops/types.ts | 162 +++++++----- docs/hermes_dashboard_v2_roadmap.md | 18 +- 5 files changed, 484 insertions(+), 136 deletions(-) create mode 100644 dashboard/backend/src/modules/hermes-ops/hermes-ops.test.ts diff --git a/dashboard/backend/src/modules/hermes-ops/hermes-ops.test.ts b/dashboard/backend/src/modules/hermes-ops/hermes-ops.test.ts new file mode 100644 index 0000000..50a740e --- /dev/null +++ b/dashboard/backend/src/modules/hermes-ops/hermes-ops.test.ts @@ -0,0 +1,186 @@ +import { describe, it, expect, beforeEach, vi } from 'vitest'; +import { HermesOpsSnapshotSchema } from './types.js'; + +// --- Mocks for all I/O the repository performs --------------------------------- +const execFileMock = vi.hoisted(() => vi.fn()); +vi.mock('child_process', () => ({ execFile: execFileMock })); + +const readFileMock = vi.hoisted(() => vi.fn()); +const statMock = vi.hoisted(() => vi.fn()); +vi.mock('fs/promises', () => ({ readFile: readFileMock, stat: statMock })); + +const existsSyncMock = vi.hoisted(() => vi.fn()); +vi.mock('fs', () => ({ existsSync: existsSyncMock })); + +const { getHermesOpsSnapshot, clearHermesOpsCache } = await import('./repository.js'); + +type CmdResult = { stdout?: string; error?: Error & { code?: string | number; killed?: boolean; stdout?: string } }; +type Handler = (command: string, args: string[]) => CmdResult; + +// promisify(execFile) calls execFile(cmd, args, options, callback) and resolves +// with the callback's second arg, or rejects with the first. 
+function setExec(handler: Handler) { + execFileMock.mockImplementation( + (command: string, args: string[], _opts: unknown, cb: (err: unknown, result?: { stdout: string }) => void) => { + const res = handler(command, args); + if (res.error) cb(res.error); + else cb(null, { stdout: res.stdout ?? '' }); + }); +} + +// A fully-healthy fleet: every probe succeeds and reports good state. +function healthyHandler(): Handler { + return (command, args) => { + if (command === 'systemctl') { + if (args[0] === 'is-active') return { stdout: 'active\n' }; + if (args[0] === 'is-enabled') return { stdout: 'enabled\n' }; + if (args[0] === 'show') { + return { stdout: 'NextElapseUSecRealtime=Sat 2026-05-31 02:00:00\nLastTriggerUSec=Fri 2026-05-30 02:00:00\n' }; + } + } + if (command === 'runuser') { + // -u uma -- systemctl --user is-active|is-enabled ... + if (args.includes('is-active')) return { stdout: 'active\n' }; + if (args.includes('is-enabled')) return { stdout: 'enabled\n' }; + } + if (command === 'git') { + if (args[0] === 'branch') return { stdout: 'main\n' }; + if (args[0] === 'status') return { stdout: '' }; + if (args[0] === 'rev-parse') return { stdout: 'abc1234\n' }; + if (args[0] === 'log') return { stdout: '2026-05-30T02:00:00+00:00\n' }; + } + if (command === 'du') return { stdout: '12M\t.git\n' }; + if (command === 'tailscale') return { stdout: '100.87.53.10\n' }; + if (command === 'ps') return { stdout: '' }; + return { stdout: '' }; + }; +} + +function inactiveError(stdout: string): CmdResult { + // systemctl is-active for an inactive unit: exit 3, prints "inactive". + const error = Object.assign(new Error('exit 3'), { code: 3, stdout }); + return { error }; +} + +function enoentError(): CmdResult { + return { error: Object.assign(new Error('not found'), { code: 'ENOENT' }) }; +} + +beforeEach(() => { + vi.clearAllMocks(); + clearHermesOpsCache(); + // Default fs: manifests/jobs readable, google token present, emergency token present. 
+ readFileMock.mockImplementation(async (p: string) => { + if (p.endsWith('MANIFEST.json')) return JSON.stringify({ files: [1, 2, 3] }); + if (p.endsWith('jobs.json')) return JSON.stringify({ jobs: [{ id: 'a' }, { id: 'b' }] }); + throw new Error('no such file'); + }); + statMock.mockResolvedValue({ isFile: () => true, size: 500 }); + existsSyncMock.mockReturnValue(true); +}); + +describe('hermes-ops repository', () => { + it('produces a schema-valid, fully-healthy snapshot with no warnings', async () => { + setExec(healthyHandler()); + const snapshot = await getHermesOpsSnapshot({ force: true }); + + expect(() => HermesOpsSnapshotSchema.parse(snapshot)).not.toThrow(); + expect(snapshot.cached).toBe(false); + expect(snapshot.instances).toHaveLength(2); + for (const inst of snapshot.instances) { + expect(inst.gateway.status).toBe('up'); + expect(inst.gateway.active).toBe(true); + expect(inst.gateway.enabled).toBe(true); + expect(inst.dashboard.status).toBe('up'); + expect(inst.backup.timer.status).toBe('up'); + expect(inst.backup.repo.status).toBe('up'); + expect(inst.backup.repo.clean).toBe(true); + } + expect(snapshot.warnings).toHaveLength(0); + }); + + it('maps a confirmed-inactive unit to status "down" with a warning', async () => { + setExec((command, args) => { + if (command === 'systemctl' && args[0] === 'is-active' && args[1] === 'hermes-gateway.service') { + return inactiveError('inactive\n'); + } + return healthyHandler()(command, args); + }); + + const snapshot = await getHermesOpsSnapshot({ force: true }); + const vijay = snapshot.instances.find((i) => i.id === 'vijay')!; + expect(vijay.gateway.status).toBe('down'); + expect(vijay.gateway.active).toBe(false); + expect(snapshot.warnings).toContain('Vijay / root gateway is not active'); + }); + + it('maps an un-runnable probe to status "unknown" (not a false "down")', async () => { + setExec((command, args) => { + if (command === 'systemctl' && args[0] === 'is-active' && args[1] === 
'hermes-root-dashboard.service') { + return enoentError(); + } + return healthyHandler()(command, args); + }); + + const snapshot = await getHermesOpsSnapshot({ force: true }); + const vijay = snapshot.instances.find((i) => i.id === 'vijay')!; + expect(vijay.dashboard.status).toBe('unknown'); + expect(snapshot.warnings).toContain('Vijay / root private dashboard status could not be determined'); + expect(snapshot.warnings).not.toContain('Vijay / root private dashboard is not active'); + }); + + it('uses runuser --user for Bheem and falls back to ps when it cannot run', async () => { + setExec((command, args) => { + // runuser probe unavailable in this environment. + if (command === 'runuser') return enoentError(); + // Legacy ps fallback shows uma's gateway process running. + if (command === 'ps' && args[0] === '-eo') { + return { stdout: 'uma /usr/bin/python -m hermes_cli.main gateway\nroot /usr/sbin/sshd\n' }; + } + return healthyHandler()(command, args); + }); + + const snapshot = await getHermesOpsSnapshot({ force: true }); + const bheem = snapshot.instances.find((i) => i.id === 'bheem')!; + expect(bheem.gateway.active).toBe(true); + expect(bheem.gateway.status).toBe('up'); + }); + + it('reports unknown repo status when git cannot be read', async () => { + setExec((command, args) => { + if (command === 'git') return enoentError(); + return healthyHandler()(command, args); + }); + + const snapshot = await getHermesOpsSnapshot({ force: true }); + const vijay = snapshot.instances.find((i) => i.id === 'vijay')!; + expect(vijay.backup.repo.status).toBe('unknown'); + expect(vijay.backup.repo.head).toBeNull(); + expect(snapshot.warnings).toContain('Vijay / root backup repo HEAD could not be read'); + }); + + it('serves a cached snapshot within the TTL without re-probing', async () => { + setExec(healthyHandler()); + + const first = await getHermesOpsSnapshot(); + const callsAfterFirst = execFileMock.mock.calls.length; + expect(callsAfterFirst).toBeGreaterThan(0); + 
expect(first.cached).toBe(false); + + const second = await getHermesOpsSnapshot(); + expect(second.cached).toBe(true); + // No additional subprocesses were spawned for the cached read. + expect(execFileMock.mock.calls.length).toBe(callsAfterFirst); + + // force: true bypasses the cache and re-probes. + const third = await getHermesOpsSnapshot({ force: true }); + expect(third.cached).toBe(false); + expect(execFileMock.mock.calls.length).toBeGreaterThan(callsAfterFirst); + }); + + it('coalesces concurrent requests onto one computation', async () => { + setExec(healthyHandler()); + const [a, b] = await Promise.all([getHermesOpsSnapshot(), getHermesOpsSnapshot()]); + expect(a.generatedAt).toBe(b.generatedAt); + }); +}); diff --git a/dashboard/backend/src/modules/hermes-ops/repository.ts b/dashboard/backend/src/modules/hermes-ops/repository.ts index 039486a..184f10e 100644 --- a/dashboard/backend/src/modules/hermes-ops/repository.ts +++ b/dashboard/backend/src/modules/hermes-ops/repository.ts @@ -2,10 +2,24 @@ import { execFile } from 'child_process'; import { promisify } from 'util'; import { readFile, stat } from 'fs/promises'; import { existsSync } from 'fs'; -import type { HermesOpsCronJob, HermesOpsInstance, HermesOpsRepo, HermesOpsSnapshot, HermesOpsTimer } from './types.js'; +import type { + HermesOpsCronJob, + HermesOpsInstance, + HermesOpsRepo, + HermesOpsSnapshot, + HermesOpsTimer, + ProbeStatus, +} from './types.js'; const execFileAsync = promisify(execFile); +// Serve the snapshot from a short-TTL cache so the panel poll (~60s) and any +// concurrent refreshes don't fan out ~20 systemctl/git/ps/du subprocesses each +// time. Mirrors the health module's caching approach. 
+const CACHE_TTL = 30000; // 30 seconds +let cache: { snapshot: HermesOpsSnapshot; at: number } | null = null; +let inflight: Promise<HermesOpsSnapshot> | null = null; + const instances = [ { id: 'vijay' as const, @@ -33,30 +47,101 @@ const instances = [ }, ]; -async function run(command: string, args: string[], cwd?: string): Promise<string | null> { +interface ExecResult { + // Trimmed stdout. Present even when the command exited non-zero (e.g. + // `systemctl is-active` prints "inactive" and exits 3). + stdout: string; + // Whether the command actually executed. False only when it could not run at + // all — binary missing (ENOENT) or killed by the timeout. A non-zero exit with + // output still counts as `ran: true` so callers can read the output. + ran: boolean; +} + +async function exec(command: string, args: string[], cwd?: string): Promise<ExecResult> { try { const { stdout } = await execFileAsync(command, args, { cwd, timeout: 5000, maxBuffer: 1024 * 1024, }); - return stdout.trim(); - } catch { - return null; + return { stdout: stdout.trim(), ran: true }; + } catch (error) { + const err = error as NodeJS.ErrnoException & { stdout?: string; killed?: boolean }; + // Command could not be spawned, or was killed by the timeout → unknown. + if (err?.code === 'ENOENT' || err?.killed) { + return { stdout: '', ran: false }; + } + // Ran but exited non-zero; the output is still meaningful. + if (typeof err?.stdout === 'string') { + return { stdout: err.stdout.trim(), ran: true }; + } + return { stdout: '', ran: false }; } } -async function isActive(unit: string): Promise<boolean> { - return (await run('systemctl', ['is-active', unit])) === 'active'; +function activeFromResult(result: ExecResult): { active: boolean; status: ProbeStatus } { + if (!result.ran) return { active: false, status: 'unknown' }; + const active = result.stdout === 'active'; + return { active, status: active ? 
'up' : 'down' }; } -async function isEnabled(unit: string): Promise { - return (await run('systemctl', ['is-enabled', unit])) === 'enabled'; +async function probeSystemActive(unit: string): Promise<{ active: boolean; status: ProbeStatus }> { + return activeFromResult(await exec('systemctl', ['is-active', unit])); +} + +async function probeSystemEnabled(unit: string): Promise { + const result = await exec('systemctl', ['is-enabled', unit]); + return result.ran && result.stdout === 'enabled'; +} + +// Bheem's gateway runs under Uma's *user* systemd. Use the authoritative +// `systemctl --user` check via `runuser`; if that probe can't run (no root, +// no user runtime dir, etc.) fall back to the legacy process-table scan so we +// degrade to the previous behaviour rather than reporting a false "down". +async function probeUmaGatewayActive(): Promise<{ active: boolean; status: ProbeStatus }> { + const userCheck = await exec('runuser', [ + '-u', + 'uma', + '--', + 'systemctl', + '--user', + 'is-active', + 'uma-hermes-gateway.service', + ]); + if (userCheck.ran) { + const active = userCheck.stdout === 'active'; + return { active, status: active ? 'up' : 'down' }; + } + + const ps = await exec('ps', ['-eo', 'user=,args=']); + if (ps.ran) { + const active = ps.stdout.split('\n').some((line) => { + const trimmed = line.trimStart(); + return trimmed.startsWith('uma ') && trimmed.includes('hermes_cli.main gateway'); + }); + return { active, status: active ? 
'up' : 'down' }; + } + + return { active: false, status: 'unknown' }; +} + +async function probeUmaGatewayEnabled(): Promise { + const userCheck = await exec('runuser', [ + '-u', + 'uma', + '--', + 'systemctl', + '--user', + 'is-enabled', + 'uma-hermes-gateway.service', + ]); + if (userCheck.ran) return userCheck.stdout === 'enabled'; + return existsSync('/home/uma/.config/systemd/user/default.target.wants/uma-hermes-gateway.service'); } async function getTimer(name: string): Promise { - const active = await isActive(name); - const show = await run('systemctl', [ + const { active, status } = await probeSystemActive(name); + const show = await exec('systemctl', [ 'show', name, '-p', @@ -66,7 +151,7 @@ async function getTimer(name: string): Promise { '--no-pager', ]); const properties = Object.fromEntries( - (show ?? '') + (show.ran ? show.stdout : '') .split('\n') .map((line) => { const [key, ...value] = line.split('='); @@ -78,43 +163,38 @@ async function getTimer(name: string): Promise { return { name, active, + status, nextRun: properties.NextElapseUSecRealtime ?? null, lastRun: properties.LastTriggerUSec ?? 
null, }; } -async function isUmaGatewayActive(): Promise { - const output = await run('ps', ['-eo', 'user=,args=']); - return Boolean( - output?.split('\n').some((line) => { - const trimmed = line.trimStart(); - return trimmed.startsWith('uma ') && trimmed.includes('hermes_cli.main gateway'); - }), - ); -} - -async function isUmaGatewayEnabled(): Promise { - return existsSync('/home/uma/.config/systemd/user/default.target.wants/uma-hermes-gateway.service'); -} - async function getRepo(path: string): Promise { - const [branch, status, head, lastCommitAt, gitSize, backupSize] = await Promise.all([ - run('git', ['branch', '--show-current'], path), - run('git', ['status', '--porcelain'], path), - run('git', ['rev-parse', '--short', 'HEAD'], path), - run('git', ['log', '-1', '--format=%cI'], path), - run('du', ['-sh', '.git'], path), - run('du', ['-sh', 'hermes_persistent_backup'], path), + const [branch, statusOut, head, lastCommitAt, gitSize, backupSize] = await Promise.all([ + exec('git', ['branch', '--show-current'], path), + exec('git', ['status', '--porcelain'], path), + exec('git', ['rev-parse', '--short', 'HEAD'], path), + exec('git', ['log', '-1', '--format=%cI'], path), + exec('du', ['-sh', '.git'], path), + exec('du', ['-sh', 'hermes_persistent_backup'], path), ]); - const size = [gitSize, backupSize].filter(Boolean).join(' / '); + const size = [gitSize, backupSize] + .filter((r) => r.ran && r.stdout) + .map((r) => r.stdout) + .join(' / '); + + // HEAD readable ⇒ the repo could be inspected; otherwise we can't tell. + const status: ProbeStatus = head.ran ? 'up' : 'unknown'; return { path, - branch: branch || null, - clean: status === '', - head: head || null, - lastCommitAt: lastCommitAt || null, + branch: branch.ran ? branch.stdout || null : null, + // `clean` only means something when we could actually read status. + clean: statusOut.ran ? statusOut.stdout === '' : false, + head: head.ran ? head.stdout || null : null, + lastCommitAt: lastCommitAt.ran ? 
lastCommitAt.stdout || null : null, size: size ? size.replace(/\n/g, ' / ') : null, + status, }; } @@ -144,20 +224,21 @@ async function tokenExists(path: string): Promise { } async function getTailscaleIp(): Promise { - const output = await run('tailscale', ['ip', '-4']); - return output?.split('\n')[0] || null; + const result = await exec('tailscale', ['ip', '-4']); + if (!result.ran) return null; + return result.stdout.split('\n')[0] || null; } async function getActiveHermesSessionCount(): Promise { - const output = await run('ps', ['-ef']); - if (!output) return 0; - return output + const result = await exec('ps', ['-ef']); + if (!result.ran) return 0; + return result.stdout .split('\n') .filter((line) => line.includes('hermes_cli.main') && !line.includes('gateway') && !line.includes('grep')) .length; } -export async function getHermesOpsSnapshot(): Promise { +async function buildSnapshot(): Promise { const tailscaleIp = await getTailscaleIp(); const warnings: string[] = []; const emergencyDriveUpload = await getTimer('hermes-emergency-drive-upload.timer'); @@ -166,13 +247,13 @@ export async function getHermesOpsSnapshot(): Promise { const results: HermesOpsInstance[] = []; for (const item of instances) { const gatewayActiveCheck = - item.gatewayKind === 'uma-user' ? isUmaGatewayActive() : isActive(item.gatewayService); + item.gatewayKind === 'uma-user' ? probeUmaGatewayActive() : probeSystemActive(item.gatewayService); const gatewayEnabledCheck = - item.gatewayKind === 'uma-user' ? isUmaGatewayEnabled() : isEnabled(item.gatewayService); - const [gatewayActive, gatewayEnabled, dashboardActive, backupTimer, repo, stats, googleToken] = await Promise.all([ + item.gatewayKind === 'uma-user' ? 
probeUmaGatewayEnabled() : probeSystemEnabled(item.gatewayService); + const [gateway, gatewayEnabled, dashboard, backupTimer, repo, stats, googleToken] = await Promise.all([ gatewayActiveCheck, gatewayEnabledCheck, - isActive(item.dashboardService), + probeSystemActive(item.dashboardService), getTimer(item.backupTimer), getRepo(item.repoPath), manifestStats(`${item.repoPath}/hermes_persistent_backup`), @@ -180,12 +261,22 @@ export async function getHermesOpsSnapshot(): Promise { ]); const dashboardUrl = tailscaleIp ? `http://${tailscaleIp}:${item.dashboardPort}/` : `:${item.dashboardPort}`; - if (!gatewayActive) warnings.push(`${item.label} gateway is not active`); - if (!gatewayEnabled) warnings.push(`${item.label} gateway auto-start is not enabled`); - if (!dashboardActive) warnings.push(`${item.label} private dashboard is not active`); - if (!backupTimer.active) warnings.push(`${item.label} backup timer is not active`); - if (!repo.head) warnings.push(`${item.label} backup repo HEAD could not be read`); - if (!repo.clean) warnings.push(`${item.label} backup repo has uncommitted changes`); + + if (gateway.status === 'down') warnings.push(`${item.label} gateway is not active`); + else if (gateway.status === 'unknown') warnings.push(`${item.label} gateway status could not be determined`); + if (gateway.status !== 'unknown' && !gatewayEnabled) { + warnings.push(`${item.label} gateway auto-start is not enabled`); + } + if (dashboard.status === 'down') warnings.push(`${item.label} private dashboard is not active`); + else if (dashboard.status === 'unknown') { + warnings.push(`${item.label} private dashboard status could not be determined`); + } + if (backupTimer.status === 'down') warnings.push(`${item.label} backup timer is not active`); + else if (backupTimer.status === 'unknown') { + warnings.push(`${item.label} backup timer status could not be determined`); + } + if (repo.status === 'unknown') warnings.push(`${item.label} backup repo HEAD could not be read`); + 
else if (!repo.clean) warnings.push(`${item.label} backup repo has uncommitted changes`); if (!googleToken) warnings.push(`${item.label} Google Workspace token is missing`); results.push({ @@ -194,13 +285,15 @@ export async function getHermesOpsSnapshot(): Promise { hermesHome: item.hermesHome, gateway: { service: item.gatewayService, - active: gatewayActive, + active: gateway.active, enabled: gatewayEnabled, + status: gateway.status, }, dashboard: { service: item.dashboardService, - active: dashboardActive, + active: dashboard.active, url: dashboardUrl, + status: dashboard.status, }, backup: { timer: backupTimer, @@ -215,7 +308,10 @@ export async function getHermesOpsSnapshot(): Promise { }); } - if (!emergencyDriveUpload.active) warnings.push('Emergency Google Drive upload timer is not active'); + if (emergencyDriveUpload.status === 'down') warnings.push('Emergency Google Drive upload timer is not active'); + else if (emergencyDriveUpload.status === 'unknown') { + warnings.push('Emergency Google Drive upload timer status could not be determined'); + } if (!existsSync('/root/.config/hermes-google-drive/user-token.json')) { warnings.push('Emergency Drive OAuth token is missing'); } @@ -225,6 +321,7 @@ export async function getHermesOpsSnapshot(): Promise { name: emergencyDriveUpload.name, label: 'Emergency Drive upload', active: emergencyDriveUpload.active, + status: emergencyDriveUpload.status, nextRun: emergencyDriveUpload.nextRun, lastRun: emergencyDriveUpload.lastRun, }, @@ -232,18 +329,21 @@ export async function getHermesOpsSnapshot(): Promise { name: instance.backup.timer.name, label: `${instance.label} backup`, active: instance.backup.timer.active, + status: instance.backup.timer.status, nextRun: instance.backup.timer.nextRun, lastRun: instance.backup.timer.lastRun, })), ]; + const now = new Date().toISOString(); return { - generatedAt: new Date().toISOString(), + generatedAt: now, + cached: false, tailscaleIp, emergencyDriveUpload, activeSessions: { 
active: activeSessions, - updatedAt: new Date().toISOString(), + updatedAt: now, }, cronJobs, recentAlerts: warnings.slice(0, 6), @@ -268,3 +368,32 @@ export async function getHermesOpsSnapshot(): Promise { warnings, }; } + +export async function getHermesOpsSnapshot(options?: { force?: boolean }): Promise { + const force = options?.force ?? false; + + if (!force && cache && Date.now() - cache.at < CACHE_TTL) { + return { ...cache.snapshot, cached: true }; + } + + // Coalesce concurrent requests onto a single in-flight computation. + if (!force && inflight) return inflight; + + const promise = buildSnapshot() + .then((snapshot) => { + cache = { snapshot, at: Date.now() }; + return snapshot; + }) + .finally(() => { + if (inflight === promise) inflight = null; + }); + + if (!force) inflight = promise; + return promise; +} + +// Test hook: reset the module-level cache between cases. +export function clearHermesOpsCache(): void { + cache = null; + inflight = null; +} diff --git a/dashboard/backend/src/modules/hermes-ops/routes.ts b/dashboard/backend/src/modules/hermes-ops/routes.ts index 356b5e5..9c5b536 100644 --- a/dashboard/backend/src/modules/hermes-ops/routes.ts +++ b/dashboard/backend/src/modules/hermes-ops/routes.ts @@ -1,10 +1,15 @@ import type { FastifyInstance } from 'fastify'; import { getHermesOpsSnapshot } from './repository.js'; +import { HermesOpsSnapshotSchema } from './types.js'; export async function hermesOpsRoutes(fastify: FastifyInstance) { fastify.get('/hermes/ops', async (req, reply) => { try { - return reply.send(await getHermesOpsSnapshot()); + const snapshot = await getHermesOpsSnapshot(); + // Validate our own output against the stable contract before sending, so a + // shape regression surfaces as a 500 here rather than corrupt UI state. 
+ const validated = HermesOpsSnapshotSchema.parse(snapshot); + return reply.send(validated); } catch (error) { fastify.log.error(error, 'Failed to get Hermes operations snapshot'); return reply.code(500).send({ error: 'Failed to get Hermes operations snapshot' }); diff --git a/dashboard/backend/src/modules/hermes-ops/types.ts b/dashboard/backend/src/modules/hermes-ops/types.ts index 410b7aa..a0e334e 100644 --- a/dashboard/backend/src/modules/hermes-ops/types.ts +++ b/dashboard/backend/src/modules/hermes-ops/types.ts @@ -1,74 +1,102 @@ -export interface HermesOpsTimer { - name: string; - active: boolean; - nextRun: string | null; - lastRun: string | null; -} +import { z } from 'zod'; -export interface HermesOpsRepo { - path: string; - branch: string | null; - clean: boolean; - head: string | null; - lastCommitAt: string | null; - size: string | null; -} +// A probed value is `up` (confirmed healthy), `down` (confirmed unhealthy/inactive), +// or `unknown` (the probe itself could not run — command missing, timed out, or no +// permission). This lets the UI distinguish "definitely down" from "couldn't tell". 
+export const ProbeStatusSchema = z.enum(['up', 'down', 'unknown']); +export type ProbeStatus = z.infer<typeof ProbeStatusSchema>; -export interface HermesOpsGoogle { - workspaceToken: boolean; - driveFolder: string; -} +export const HermesOpsTimerSchema = z.object({ + name: z.string(), + active: z.boolean(), + status: ProbeStatusSchema, + nextRun: z.string().nullable(), + lastRun: z.string().nullable(), +}); +export type HermesOpsTimer = z.infer<typeof HermesOpsTimerSchema>; -export interface HermesOpsInstance { - id: 'vijay' | 'bheem'; - label: string; - hermesHome: string; - gateway: { - service: string; - active: boolean; - enabled: boolean; - }; - dashboard: { - service: string; - active: boolean; - url: string; - }; - backup: { - timer: HermesOpsTimer; - repo: HermesOpsRepo; - restoredFileCount: number | null; - restoredCronJobs: number | null; - }; - google: HermesOpsGoogle; -} +export const HermesOpsRepoSchema = z.object({ + path: z.string(), + branch: z.string().nullable(), + clean: z.boolean(), + head: z.string().nullable(), + lastCommitAt: z.string().nullable(), + size: z.string().nullable(), + // `up` = HEAD was readable; `unknown` = git could not be read (path/permission). 
+ status: ProbeStatusSchema, +}); +export type HermesOpsRepo = z.infer<typeof HermesOpsRepoSchema>; -export interface HermesOpsSessionSummary { - active: number; - updatedAt: string | null; -} +export const HermesOpsGoogleSchema = z.object({ + workspaceToken: z.boolean(), + driveFolder: z.string(), +}); +export type HermesOpsGoogle = z.infer<typeof HermesOpsGoogleSchema>; -export interface HermesOpsCronJob { - name: string; - label: string; - active: boolean; - nextRun: string | null; - lastRun: string | null; -} +export const HermesOpsGatewaySchema = z.object({ + service: z.string(), + active: z.boolean(), + enabled: z.boolean(), + status: ProbeStatusSchema, +}); -export interface HermesOpsLink { - label: string; - href: string; - description: string; -} +export const HermesOpsDashboardSchema = z.object({ + service: z.string(), + active: z.boolean(), + url: z.string(), + status: ProbeStatusSchema, +}); -export interface HermesOpsSnapshot { - generatedAt: string; - tailscaleIp: string | null; - emergencyDriveUpload: HermesOpsTimer; - activeSessions: HermesOpsSessionSummary; - cronJobs: HermesOpsCronJob[]; - recentAlerts: string[]; - quickLinks: HermesOpsLink[]; - instances: HermesOpsInstance[]; - warnings: string[]; -} +export const HermesOpsInstanceSchema = z.object({ + id: z.enum(['vijay', 'bheem']), + label: z.string(), + hermesHome: z.string(), + gateway: HermesOpsGatewaySchema, + dashboard: HermesOpsDashboardSchema, + backup: z.object({ + timer: HermesOpsTimerSchema, + repo: HermesOpsRepoSchema, + restoredFileCount: z.number().nullable(), + restoredCronJobs: z.number().nullable(), + }), + google: HermesOpsGoogleSchema, +}); +export type HermesOpsInstance = z.infer<typeof HermesOpsInstanceSchema>; + +export const HermesOpsSessionSummarySchema = z.object({ + active: z.number(), + updatedAt: z.string().nullable(), +}); +export type HermesOpsSessionSummary = z.infer<typeof HermesOpsSessionSummarySchema>; + +export const HermesOpsCronJobSchema = z.object({ + name: z.string(), + label: z.string(), + active: z.boolean(), + status: ProbeStatusSchema, + nextRun: z.string().nullable(), + 
+ lastRun: z.string().nullable(), +}); +export type HermesOpsCronJob = z.infer<typeof HermesOpsCronJobSchema>; + +export const HermesOpsLinkSchema = z.object({ + label: z.string(), + href: z.string(), + description: z.string(), +}); +export type HermesOpsLink = z.infer<typeof HermesOpsLinkSchema>; + +export const HermesOpsSnapshotSchema = z.object({ + generatedAt: z.string(), + // True when this payload was served from the short-TTL cache rather than freshly probed. + cached: z.boolean(), + tailscaleIp: z.string().nullable(), + emergencyDriveUpload: HermesOpsTimerSchema, + activeSessions: HermesOpsSessionSummarySchema, + cronJobs: z.array(HermesOpsCronJobSchema), + recentAlerts: z.array(z.string()), + quickLinks: z.array(HermesOpsLinkSchema), + instances: z.array(HermesOpsInstanceSchema), + warnings: z.array(z.string()), +}); +export type HermesOpsSnapshot = z.infer<typeof HermesOpsSnapshotSchema>; diff --git a/docs/hermes_dashboard_v2_roadmap.md b/docs/hermes_dashboard_v2_roadmap.md index dfe33c8..5107c59 100644 --- a/docs/hermes_dashboard_v2_roadmap.md +++ b/docs/hermes_dashboard_v2_roadmap.md @@ -76,13 +76,13 @@ A single private dashboard where, for **both Vijay and Bheem**, S can see at a g The `hermes-ops` snapshot becomes the single source of truth for live status. Before building UI on it, harden it. -- [ ] Add a short-TTL cache (mirror the health module's 30s cache) so the 60s panel poll doesn't fan out ~20 `systemctl`/`git`/`ps`/`du` subprocesses every refresh; serve cached snapshot with `generatedAt`. -- [ ] Replace brittle Bheem/Uma checks in `repository.ts`: +- [x] Add a short-TTL cache (mirror the health module's 30s cache) so the 60s panel poll doesn't fan out ~20 `systemctl`/`git`/`ps`/`du` subprocesses every refresh; serve cached snapshot with `generatedAt`. 
+- [x] Replace brittle Bheem/Uma checks in `repository.ts` *(runuser `systemctl --user` with ps/existsSync fallback so a failed probe degrades to the legacy check, not a false "down")*: - `isUmaGatewayActive()` (currently `ps -eo` string match) → `runuser -u uma -- systemctl --user is-active uma-hermes-gateway.service` (or `--machine=uma@.host`). - `isUmaGatewayEnabled()` (currently hardcoded `existsSync` of a wants-symlink) → `systemctl --user is-enabled` via the same path. -- [ ] Stop swallowing every failure to `null` indiscriminately: distinguish "unit inactive" from "probe failed/timed out" and surface per-field status so the UI can show *unknown* vs *down*. -- [ ] Add Zod validation + a stable typed contract for `HermesOpsSnapshot` on the route. -- [ ] **Add unit tests for the `hermes-ops` repository** (mock `execFile`/fs) — closes the REVIEW_ACTIONS "only `services` has tests" gap for this module. +- [x] Stop swallowing every failure to `null` indiscriminately: distinguish "unit inactive" from "probe failed/timed out" and surface per-field status so the UI can show *unknown* vs *down*. +- [x] Add Zod validation + a stable typed contract for `HermesOpsSnapshot` on the route. +- [x] **Add unit tests for the `hermes-ops` repository** (mock `execFile`/fs) — closes the REVIEW_ACTIONS "only `services` has tests" gap for this module. - [ ] Read Bheem/Uma state via a **self-reporting ops exporter** (Decision #2): a read-only `uma` user-systemd timer writes a sanitized JSON snapshot to a known path; the root backend reads + aggregates it (Vijay gets a symmetric exporter). Interim stopgap until it ships: `runuser -u uma -- systemctl --user is-active/is-enabled` instead of the `ps`/`existsSync` checks. 
## Phase 2 — Instance dimension across Mission Control (G2) @@ -120,8 +120,8 @@ This is the biggest operational asymmetry and the reason half the ops-panel warn ## Phase 5 — Dashboard app hardening (G5) -- [ ] **P0:** Fix the CI workspace path (`${{ gitea.workspace }}`) in `.gitea/workflows/ci.yml`, `DEPLOYMENT.md`, `scripts/deploy-hotcopy.sh` (currently point at non-existent `/opt/bytelyst/bytelyst-devops-tools/...`). -- [ ] **P0:** Replace the no-op `lint` echo with real linting (`next lint` for web, minimal ESLint for backend); make `pnpm lint` fail on bad code. +- [x] **P0:** Fix the CI workspace path (`${{ gitea.workspace }}`) in `.gitea/workflows/ci.yml`, `DEPLOYMENT.md`, `scripts/deploy-hotcopy.sh` (currently point at non-existent `/opt/bytelyst/bytelyst-devops-tools/...`). +- [x] **P0:** Replace the no-op `lint` echo with real linting (`next lint` for web, minimal ESLint for backend); make `pnpm lint` fail on bad code. - [ ] **P1:** Add tests for `auth`, `csrf`, `deployments/orchestrator`, `health`, **and `hermes-ops`**; add `pnpm test:coverage` gate. - [ ] **P1:** Resolve the SSE TODO — either ship a Fastify-5-compatible log-stream or remove the SSE claim from docs/UI. - [ ] **P1:** Fix doc drift (web port 3000 vs 3049; endpoint URLs; merge duplicate deployment docs). @@ -185,11 +185,11 @@ This roadmap is complete when: Update only with evidence (source review, tests, build output, or browser/VM verification). - [ ] Phase 0 — Guardrails reconfirmed -- [ ] Phase 1 — `hermes-ops` hardened + tested +- [x] Phase 1 — `hermes-ops` hardened + tested - [ ] Phase 2 — Instance dimension + switcher - [ ] Phase 3 — Real telemetry ingestion + panes converted - [ ] Phase 4 — Bheem/Uma parity (backup, watchdog, restore drill) -- [ ] Phase 5 — App/CI hardening (P0 → P2) +- [ ] Phase 5 — App/CI hardening (P0 done; P1/P2 pending) - [ ] Phase 6 — UX polish - [ ] Phase 7 — Security & access - [ ] Phase 8 — Notifications & Telegram