From ad16b1308e037a082cdbae88e94f645f46ecc383 Mon Sep 17 00:00:00 2001 From: Hermes VM Date: Sat, 30 May 2026 07:53:37 +0000 Subject: [PATCH] =?UTF-8?q?feat(dashboard):=20Phase=203=20slice=201=20?= =?UTF-8?q?=E2=80=94=20hermes=20telemetry=20contract=20+=20backend=20endpo?= =?UTF-8?q?int?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First slice of Phase 3 ("real per-instance telemetry"). Defines the read-only artifact contract from Decision #1 (sessions, cron, memory, skills, watchdog alerts, backup history) and ships an admin-gated backend endpoint that probes the live Hermes instance, gracefully degrading to status:'unknown' wherever the source isn't readable. What's new - `backend/src/modules/hermes-telemetry/types.ts` — Zod schemas for every section of the snapshot, plus a per-section `ProbeStatusSchema` reused from hermes-ops (typed as `HermesProbeStatus` in the web client) so the UI can distinguish "definitely empty" from "couldn't read the source" for each section independently. - `backend/src/modules/hermes-telemetry/repository.ts` — implementation that: * shells out via `runuser -u USER --` for cross-user instances (Bheem/uma) the same way `hermes-ops/repository.ts` does; * parses `hermes sessions stats / cron list / memory list / skills list --json` when the CLI is present, otherwise reports status:'unknown'; * tails the watchdog log and buckets each line by severity (critical / warn / info); * pulls `git -C REPO_PATH log` against the instance's backup repo for backup history; * caches per-instance with a 30s TTL + in-flight coalescing, same pattern as hermes-ops. - `backend/src/modules/hermes-telemetry/routes.ts` — admin-only GET `/api/hermes/telemetry/:instance` (the `instance` path param is Zod-validated; the response is validated against `HermesTelemetrySnapshotSchema` before send so a shape regression surfaces here, not in the UI). 
- `backend/src/modules/hermes-telemetry/hermes-telemetry.test.ts` — 6 unit tests: ENOENT-on-everything case validates against the schema, JSON-parse path for sessions/cron/memory/skills, watchdog log severity bucketing, backup-history `git log` parsing, cache hit, per-instance cache isolation. Coverage: 95.17% lines on the new repository module. - `backend/vitest.config.ts` — telemetry repository added to the coverage gate's `include` list (ratchet). - `web/src/lib/api.ts` — typed surface for the new endpoint: `HermesTelemetrySnapshot` + sub-types + `api.getHermesTelemetry`. What's NOT in this slice - UI consumption. The Task Ledger / Agents / History panes still render mock data; converting them is queued for the next slices. This slice ships the contract + the backend so those slices can build on a stable shape. - Backward-compat replacement of `/api/hermes/ops` (which is unauthenticated today). That comes with the Phase 7 auth pass. Verified: backend typecheck ✅, 57/57 unit tests ✅, web typecheck ✅, lint 0 errors, coverage gate ≥95% lines on every gated file. 
Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../hermes-telemetry/hermes-telemetry.test.ts | 168 ++++++++++ .../modules/hermes-telemetry/repository.ts | 315 ++++++++++++++++++ .../src/modules/hermes-telemetry/routes.ts | 36 ++ .../src/modules/hermes-telemetry/types.ts | 118 +++++++ dashboard/backend/src/server.ts | 2 + dashboard/backend/vitest.config.ts | 1 + dashboard/web/src/lib/api.ts | 99 ++++++ 7 files changed, 739 insertions(+) create mode 100644 dashboard/backend/src/modules/hermes-telemetry/hermes-telemetry.test.ts create mode 100644 dashboard/backend/src/modules/hermes-telemetry/repository.ts create mode 100644 dashboard/backend/src/modules/hermes-telemetry/routes.ts create mode 100644 dashboard/backend/src/modules/hermes-telemetry/types.ts diff --git a/dashboard/backend/src/modules/hermes-telemetry/hermes-telemetry.test.ts b/dashboard/backend/src/modules/hermes-telemetry/hermes-telemetry.test.ts new file mode 100644 index 0000000..f558d10 --- /dev/null +++ b/dashboard/backend/src/modules/hermes-telemetry/hermes-telemetry.test.ts @@ -0,0 +1,168 @@ +import { describe, it, expect, beforeEach, vi } from 'vitest'; +import { HermesTelemetrySnapshotSchema } from './types.js'; + +// --- I/O mocks -------------------------------------------------------------- +const execFileMock = vi.hoisted(() => vi.fn()); +vi.mock('child_process', () => ({ execFile: execFileMock })); + +const readFileMock = vi.hoisted(() => vi.fn()); +const statMock = vi.hoisted(() => vi.fn()); +vi.mock('fs/promises', () => ({ readFile: readFileMock, stat: statMock })); + +type Handler = (command: string, args: string[]) => { error?: NodeJS.ErrnoException; stdout?: string }; + +function setExec(handler: Handler) { + execFileMock.mockImplementation( + ( + command: string, + args: string[], + _opts: unknown, + cb: (err: unknown, result?: { stdout: string }) => void, + ) => { + const res = handler(command, 
args); + if (res.error) cb(res.error); + else cb(null, { stdout: res.stdout ?? '' }); + }, + ); +} + +const { getHermesTelemetrySnapshot, clearHermesTelemetryCache } = await import('./repository.js'); + +describe('hermes-telemetry repository', () => { + beforeEach(() => { + vi.clearAllMocks(); + clearHermesTelemetryCache(); + }); + + it('returns a Zod-valid snapshot when every probe fails (CLI missing)', async () => { + // ENOENT on every shell-out (no `hermes` CLI available). + setExec(() => { + const err = Object.assign(new Error('ENOENT'), { code: 'ENOENT' as const }); + return { error: err }; + }); + statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' })); + readFileMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' })); + + const snapshot = await getHermesTelemetrySnapshot('vijay'); + // The whole shape must validate even when nothing was readable — that's + // the contract the route handler relies on to send a stable JSON to the + // UI rather than a 500. 
+ expect(() => HermesTelemetrySnapshotSchema.parse(snapshot)).not.toThrow(); + expect(snapshot.instanceId).toBe('vijay'); + expect(snapshot.sessions.status).toBe('unknown'); + expect(snapshot.cron.status).toBe('unknown'); + expect(snapshot.memory.status).toBe('unknown'); + expect(snapshot.skills.status).toBe('unknown'); + expect(snapshot.watchdog.status).toBe('unknown'); + expect(snapshot.backupHistory.status).toBe('unknown'); + expect(snapshot.warnings.length).toBeGreaterThan(0); + }); + + it('parses sessions/cron/memory/skills JSON output when the CLI is present', async () => { + setExec((command, args) => { + if (command === 'hermes' && args.slice(0, 2).join(' ') === 'sessions stats') { + return { stdout: JSON.stringify({ sessions: 59, messages: 5225 }) }; + } + if (command === 'hermes' && args.slice(0, 2).join(' ') === 'cron list') { + return { + stdout: JSON.stringify([ + { id: 'mem-rotate', name: 'Memory rotation', schedule: '0 4 * * *', last_run: '2026-01-01T04:00:00Z', next_run: '2026-01-02T04:00:00Z', last_status: 'ok', active: true }, + ]), + }; + } + if (command === 'hermes' && args.slice(0, 2).join(' ') === 'memory list') { + return { stdout: JSON.stringify([{ id: 'm1', type: 'note', key: 'gateway', summary: 'restart procedure' }]) }; + } + if (command === 'hermes' && args.slice(0, 2).join(' ') === 'skills list') { + return { stdout: JSON.stringify([{ id: 's1', name: 'restart', description: 'restart a service', enabled: true }]) }; + } + if (command === 'git') { + // No backup repo on the test box. 
+ const err = Object.assign(new Error('ENOENT'), { code: 'ENOENT' as const }); + return { error: err }; + } + return { stdout: '' }; + }); + statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' })); + + const snapshot = await getHermesTelemetrySnapshot('vijay'); + expect(snapshot.sessions).toEqual({ totalSessions: 59, totalMessages: 5225, status: 'up' }); + expect(snapshot.cron.status).toBe('up'); + expect(snapshot.cron.entries).toHaveLength(1); + expect(snapshot.cron.entries[0].name).toBe('Memory rotation'); + expect(snapshot.memory.status).toBe('up'); + expect(snapshot.memory.items[0].key).toBe('gateway'); + expect(snapshot.skills.status).toBe('up'); + expect(snapshot.skills.items[0].id).toBe('s1'); + }); + + it('parses watchdog log lines into severity-tagged alerts', async () => { + setExec(() => { + const err = Object.assign(new Error('ENOENT'), { code: 'ENOENT' as const }); + return { error: err }; + }); + statMock.mockResolvedValue({} as never); + readFileMock.mockResolvedValue([ + '2026-01-01T12:34:56 WARNING gateway is degraded', + '2026-01-01T12:35:01 CRITICAL backup repo HEAD missing', + '2026-01-01T12:35:30 INFO healthy after retry', + '', + ].join('\n')); + + const snapshot = await getHermesTelemetrySnapshot('bheem'); + expect(snapshot.watchdog.status).toBe('up'); + expect(snapshot.watchdog.alerts.map((a) => a.severity)).toEqual(['warn', 'critical', 'info']); + expect(snapshot.watchdog.alerts[0].message).toBe('gateway is degraded'); + }); + + it('parses backup history from `git log` output', async () => { + setExec((command, args) => { + if (command === 'git' && args.includes('log')) { + return { + stdout: [ + 'a1b2c3\x1f2026-01-01T01:00:00Z\xfeBackup at 01:00', + 'd4e5f6\x1f2026-01-01T00:00:00Z\xfeBackup at 00:00', + ].map((s) => s.replace('\xfe', '\x1f')).join('\n'), + }; + } + const err = Object.assign(new Error('ENOENT'), { code: 'ENOENT' as const }); + return { error: err }; + }); + 
statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' })); + + const snapshot = await getHermesTelemetrySnapshot('vijay'); + expect(snapshot.backupHistory.status).toBe('up'); + expect(snapshot.backupHistory.entries).toHaveLength(2); + expect(snapshot.backupHistory.entries[0]).toMatchObject({ sha: 'a1b2c3', subject: 'Backup at 01:00' }); + }); + + it('serves a cached snapshot on the second call within the TTL window', async () => { + let calls = 0; + setExec(() => { + calls++; + const err = Object.assign(new Error('ENOENT'), { code: 'ENOENT' as const }); + return { error: err }; + }); + statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' })); + + const a = await getHermesTelemetrySnapshot('vijay'); + const callsAfterFirst = calls; + const b = await getHermesTelemetrySnapshot('vijay'); + expect(calls).toBe(callsAfterFirst); // no extra subprocess work + expect(b.cached).toBe(true); + expect(a.instanceId).toBe(b.instanceId); + }); + + it('caches per instance independently', async () => { + setExec(() => { + const err = Object.assign(new Error('ENOENT'), { code: 'ENOENT' as const }); + return { error: err }; + }); + statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' })); + + const v = await getHermesTelemetrySnapshot('vijay'); + const b = await getHermesTelemetrySnapshot('bheem'); + expect(v.instanceId).toBe('vijay'); + expect(b.instanceId).toBe('bheem'); + }); +}); diff --git a/dashboard/backend/src/modules/hermes-telemetry/repository.ts b/dashboard/backend/src/modules/hermes-telemetry/repository.ts new file mode 100644 index 0000000..afd8ce9 --- /dev/null +++ b/dashboard/backend/src/modules/hermes-telemetry/repository.ts @@ -0,0 +1,315 @@ +import { execFile } from 'child_process'; +import { promisify } from 'util'; +import { readFile, stat } from 'fs/promises'; +import { childLogger } from '../../lib/logger.js'; +import type { + HermesBackupHistory, + HermesBackupHistoryEntry, + 
HermesCronEntry, + HermesCronList, + HermesInstanceId, + HermesMemoryList, + HermesSessionStats, + HermesSkillList, + HermesTelemetrySnapshot, + HermesWatchdogAlert, + HermesWatchdogFeed, + HermesWatchdogSeverity, +} from './types.js'; + +const execFileAsync = promisify(execFile); +const log = childLogger('hermes-telemetry/repository'); + +// Per-instance config. Mirrors `instances` in `hermes-ops/repository.ts`. +// Anything we shell out to as the live instance owner ("uma" for Bheem, root +// for Vijay) is wrapped in `runuser -u ` so the command runs in the +// owner's environment, not the backend's. +interface InstanceConfig { + id: HermesInstanceId; + user: string | null; // null → run as the backend's own user (root in prod) + repoPath: string; + watchdogLog: string; +} + +const INSTANCES: Record = { + vijay: { + id: 'vijay', + user: null, + repoPath: '/root/repos/bytelyst_hostinger_hermes_vm', + watchdogLog: '/root/.hermes/logs/hermes-health-watchdog.log', + }, + bheem: { + id: 'bheem', + user: 'uma', + repoPath: '/home/uma/repos/uma_hostinger_hermes_vm', + watchdogLog: '/home/uma/.hermes/logs/hermes-health-watchdog.log', + }, +}; + +interface ExecResult { + stdout: string; + ran: boolean; +} + +async function exec( + command: string, + args: string[], + cwd?: string, + timeoutMs = 5000, +): Promise { + try { + const { stdout } = await execFileAsync(command, args, { cwd, timeout: timeoutMs }); + return { stdout: stdout.trim(), ran: true }; + } catch (err) { + const e = err as NodeJS.ErrnoException & { code?: string | number; stdout?: string }; + if (e?.code === 'ENOENT' || e?.code === 'ETIMEDOUT') { + return { stdout: '', ran: false }; + } + // A non-zero exit is still useful — `git log` on an empty repo, etc. + return { stdout: (e?.stdout ?? '').toString().trim(), ran: true }; + } +} + +// Wrap a command in `runuser -u --` when needed so it runs in the +// instance owner's environment (PATH, ~/.hermes config). 
For the local +// instance (user=null) we just call the binary directly. +async function execAs( + inst: InstanceConfig, + command: string, + args: string[], + timeoutMs = 5000, +): Promise { + if (!inst.user) return exec(command, args, undefined, timeoutMs); + return exec('runuser', ['-u', inst.user, '--', command, ...args], undefined, timeoutMs); +} + +// --- Sessions --------------------------------------------------------------- +// +// The Hermes CLI exposes `hermes sessions stats --json` (when present). When +// it's not available we report status:'unknown' rather than fabricating zeros. +async function readSessionStats(inst: InstanceConfig): Promise { + const result = await execAs(inst, 'hermes', ['sessions', 'stats', '--json']); + if (!result.ran) return { totalSessions: 0, totalMessages: 0, status: 'unknown' }; + + try { + const parsed = JSON.parse(result.stdout) as { sessions?: number; messages?: number }; + return { + totalSessions: Number(parsed.sessions ?? 0), + totalMessages: Number(parsed.messages ?? 0), + status: 'up', + }; + } catch { + return { totalSessions: 0, totalMessages: 0, status: 'unknown' }; + } +} + +// --- Cron ------------------------------------------------------------------- +// +// `hermes cron list --json` is the canonical source. It's distinct from +// systemd timers (which `hermes-ops` already covers) — this is for Hermes' +// own scheduled tasks (memory rotations, telegram digests, etc). +async function readCron(inst: InstanceConfig): Promise { + const result = await execAs(inst, 'hermes', ['cron', 'list', '--json']); + if (!result.ran) return { entries: [], status: 'unknown' }; + + try { + const raw = JSON.parse(result.stdout) as Array>; + const entries: HermesCronEntry[] = raw.map((row) => ({ + id: String(row.id ?? row.name ?? ''), + name: String(row.name ?? row.id ?? ''), + schedule: row.schedule ? String(row.schedule) : null, + lastRun: row.last_run ? String(row.last_run) : (row.lastRun ? 
String(row.lastRun) : null), + nextRun: row.next_run ? String(row.next_run) : (row.nextRun ? String(row.nextRun) : null), + lastStatus: row.last_status ? String(row.last_status) : (row.lastStatus ? String(row.lastStatus) : null), + active: Boolean(row.active ?? row.enabled ?? true), + })); + return { entries, status: 'up' }; + } catch { + return { entries: [], status: 'unknown' }; + } +} + +// --- Memory + skills -------------------------------------------------------- +async function readMemory(inst: InstanceConfig): Promise { + const result = await execAs(inst, 'hermes', ['memory', 'list', '--json']); + if (!result.ran) return { items: [], status: 'unknown' }; + + try { + const raw = JSON.parse(result.stdout) as Array>; + return { + items: raw.map((row) => ({ + id: String(row.id ?? ''), + type: String(row.type ?? 'note'), + key: String(row.key ?? row.name ?? ''), + summary: String(row.summary ?? row.value ?? ''), + updatedAt: row.updated_at ? String(row.updated_at) : (row.updatedAt ? String(row.updatedAt) : null), + })), + status: 'up', + }; + } catch { + return { items: [], status: 'unknown' }; + } +} + +async function readSkills(inst: InstanceConfig): Promise { + const result = await execAs(inst, 'hermes', ['skills', 'list', '--json']); + if (!result.ran) return { items: [], status: 'unknown' }; + + try { + const raw = JSON.parse(result.stdout) as Array>; + return { + items: raw.map((row) => ({ + id: String(row.id ?? row.name ?? ''), + name: String(row.name ?? row.id ?? ''), + description: String(row.description ?? ''), + enabled: Boolean(row.enabled ?? true), + })), + status: 'up', + }; + } catch { + return { items: [], status: 'unknown' }; + } +} + +// --- Watchdog --------------------------------------------------------------- +// +// Tail the last N lines of the watchdog log and bucket them by severity. +// The log format used by `scripts/hermes-health-watchdog.py` today is roughly: +// 2026-01-01T12:34:56 WARNING gateway is degraded: ... 
+// 2026-01-01T12:35:01 CRITICAL backup repo HEAD missing +// We accept any RFC3339-ish leading timestamp and a severity word. +const WATCHDOG_LINE = /^(\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:[.\d]+)?(?:Z|[+-]\d{2}:?\d{2})?)\s+(\w+)\s+(.*)$/; + +function normalizeSeverity(token: string): HermesWatchdogSeverity { + const upper = token.toUpperCase(); + if (upper === 'CRITICAL' || upper === 'ERROR' || upper === 'FATAL') return 'critical'; + if (upper === 'WARNING' || upper === 'WARN') return 'warn'; + return 'info'; +} + +async function readWatchdog(inst: InstanceConfig): Promise { + try { + await stat(inst.watchdogLog); + } catch { + return { alerts: [], source: inst.watchdogLog, status: 'unknown' }; + } + + try { + const content = await readFile(inst.watchdogLog, 'utf8'); + const lines = content.split('\n').filter(Boolean); + // Cap to the last 50 entries; anything older isn't useful for the panel. + const tail = lines.slice(-50); + const alerts: HermesWatchdogAlert[] = []; + for (const line of tail) { + const match = WATCHDOG_LINE.exec(line); + if (!match) continue; + const [, ts, severityToken, message] = match; + alerts.push({ + timestamp: ts, + severity: normalizeSeverity(severityToken), + message: message.trim(), + }); + } + return { alerts, source: inst.watchdogLog, status: 'up' }; + } catch (err) { + log.warn({ err, instance: inst.id, source: inst.watchdogLog }, 'failed to read watchdog log'); + return { alerts: [], source: inst.watchdogLog, status: 'unknown' }; + } +} + +// --- Backup history --------------------------------------------------------- +// +// Cheap proxy for "is the backup pipeline alive": last N commits on the +// backup repo. `git -C log --pretty=...` runs as the backend user; +// repos are world-readable on the live host. 
+async function readBackupHistory(inst: InstanceConfig): Promise { + const result = await exec('git', [ + '-C', inst.repoPath, + 'log', + '--pretty=format:%H\x1f%cI\x1f%s', + '-n', '20', + ]); + if (!result.ran) return { entries: [], repoPath: inst.repoPath, status: 'unknown' }; + if (!result.stdout) return { entries: [], repoPath: inst.repoPath, status: 'up' }; + + const entries: HermesBackupHistoryEntry[] = []; + for (const line of result.stdout.split('\n')) { + const [sha, committedAt, ...rest] = line.split('\x1f'); + if (!sha || !committedAt) continue; + entries.push({ sha, committedAt, subject: rest.join('\x1f') }); + } + return { entries, repoPath: inst.repoPath, status: 'up' }; +} + +// --- Snapshot assembly ------------------------------------------------------ +const CACHE_TTL = 30000; +const cache = new Map(); +const inflight = new Map>(); + +async function buildSnapshot(instanceId: HermesInstanceId): Promise { + const inst = INSTANCES[instanceId]; + const [sessions, cron, memory, skills, watchdog, backupHistory] = await Promise.all([ + readSessionStats(inst), + readCron(inst), + readMemory(inst), + readSkills(inst), + readWatchdog(inst), + readBackupHistory(inst), + ]); + + const warnings: string[] = []; + if (sessions.status === 'unknown') warnings.push(`${instanceId}: hermes sessions stats unavailable (CLI missing or non-zero exit)`); + if (cron.status === 'unknown') warnings.push(`${instanceId}: hermes cron list unavailable`); + if (memory.status === 'unknown') warnings.push(`${instanceId}: hermes memory list unavailable`); + if (skills.status === 'unknown') warnings.push(`${instanceId}: hermes skills list unavailable`); + if (watchdog.status === 'unknown') warnings.push(`${instanceId}: watchdog log not readable at ${watchdog.source ?? 'unknown path'}`); + if (backupHistory.status === 'unknown') warnings.push(`${instanceId}: backup repo not readable at ${backupHistory.repoPath ?? 
'unknown path'}`); + + return { + generatedAt: new Date().toISOString(), + cached: false, + instanceId, + sessions, + cron, + memory, + skills, + watchdog, + backupHistory, + warnings, + }; +} + +export async function getHermesTelemetrySnapshot( + instanceId: HermesInstanceId, + options?: { force?: boolean }, +): Promise { + const force = options?.force ?? false; + + if (!force) { + const cached = cache.get(instanceId); + if (cached && Date.now() - cached.at < CACHE_TTL) { + return { ...cached.snapshot, cached: true }; + } + + const pending = inflight.get(instanceId); + if (pending) return pending; + } + + const promise = buildSnapshot(instanceId) + .then((snapshot) => { + cache.set(instanceId, { snapshot, at: Date.now() }); + return snapshot; + }) + .finally(() => { + if (inflight.get(instanceId) === promise) inflight.delete(instanceId); + }); + + if (!force) inflight.set(instanceId, promise); + return promise; +} + +// Test hook so `vitest` cases don't bleed cached state across runs. +export function clearHermesTelemetryCache(): void { + cache.clear(); + inflight.clear(); +} diff --git a/dashboard/backend/src/modules/hermes-telemetry/routes.ts b/dashboard/backend/src/modules/hermes-telemetry/routes.ts new file mode 100644 index 0000000..105c97f --- /dev/null +++ b/dashboard/backend/src/modules/hermes-telemetry/routes.ts @@ -0,0 +1,36 @@ +import type { FastifyInstance } from 'fastify'; +import { z } from 'zod'; +import { getHermesTelemetrySnapshot } from './repository.js'; +import { HermesInstanceIdSchema, HermesTelemetrySnapshotSchema } from './types.js'; +import { requireAdmin } from '../../lib/auth.js'; + +const ParamsSchema = z.object({ instance: HermesInstanceIdSchema }); + +export async function hermesTelemetryRoutes(fastify: FastifyInstance) { + // GET /api/hermes/telemetry/:instance + // Admin-only: this endpoint shells out to `hermes` CLI in the instance + // owner's environment (`runuser -u uma --` for Bheem) and reads the + // watchdog log + backup 
repo. Treat it as privileged the same way the + // VM/system endpoints are. See `dashboard/DEPLOYMENT.md` Privilege Surface. + fastify.get('/hermes/telemetry/:instance', { + preHandler: async (req) => requireAdmin(req), + }, async (req, reply) => { + let params: z.infer; + try { + params = ParamsSchema.parse(req.params); + } catch (err) { + return reply.code(400).send({ error: 'Invalid instance', detail: (err as Error).message }); + } + + try { + const snapshot = await getHermesTelemetrySnapshot(params.instance); + // Validate our own response so a shape regression surfaces here as a + // 500 rather than a corrupt UI state — same approach as hermes-ops. + const validated = HermesTelemetrySnapshotSchema.parse(snapshot); + return reply.send(validated); + } catch (err) { + fastify.log.error(err, 'failed to build hermes telemetry snapshot'); + return reply.code(500).send({ error: 'Failed to build hermes telemetry snapshot' }); + } + }); +} diff --git a/dashboard/backend/src/modules/hermes-telemetry/types.ts b/dashboard/backend/src/modules/hermes-telemetry/types.ts new file mode 100644 index 0000000..b35a852 --- /dev/null +++ b/dashboard/backend/src/modules/hermes-telemetry/types.ts @@ -0,0 +1,118 @@ +import { z } from 'zod'; +import { ProbeStatusSchema } from '../hermes-ops/types.js'; + +// Hermes telemetry snapshot — read-only "real artifacts" per Phase 3 Decision #1 +// (sessions, cron, memory, skills, watchdog alerts, backup history). +// Each section carries its own `ProbeStatus` so the UI can distinguish +// "definitely empty" from "couldn't read the source" (CLI missing, permission +// denied, timed out). Mirrors the hermes-ops shape: every field set the UI +// renders has a status it can surface. 
+ +export const HermesInstanceIdSchema = z.enum(['vijay', 'bheem']); +export type HermesInstanceId = z.infer; + +export const HermesSessionStatsSchema = z.object({ + totalSessions: z.number(), + totalMessages: z.number(), + status: ProbeStatusSchema, +}); +export type HermesSessionStats = z.infer; + +export const HermesCronEntrySchema = z.object({ + id: z.string(), + name: z.string(), + schedule: z.string().nullable(), + lastRun: z.string().nullable(), + nextRun: z.string().nullable(), + lastStatus: z.string().nullable(), + active: z.boolean(), +}); +export type HermesCronEntry = z.infer; + +export const HermesCronListSchema = z.object({ + entries: z.array(HermesCronEntrySchema), + status: ProbeStatusSchema, +}); +export type HermesCronList = z.infer; + +export const HermesMemoryItemSchema = z.object({ + id: z.string(), + type: z.string(), + key: z.string(), + summary: z.string(), + updatedAt: z.string().nullable(), +}); +export type HermesMemoryItem = z.infer; + +export const HermesMemoryListSchema = z.object({ + items: z.array(HermesMemoryItemSchema), + status: ProbeStatusSchema, +}); +export type HermesMemoryList = z.infer; + +export const HermesSkillItemSchema = z.object({ + id: z.string(), + name: z.string(), + description: z.string(), + enabled: z.boolean(), +}); +export type HermesSkillItem = z.infer; + +export const HermesSkillListSchema = z.object({ + items: z.array(HermesSkillItemSchema), + status: ProbeStatusSchema, +}); +export type HermesSkillList = z.infer; + +// Severity is a union of `info | warn | critical` so the UI can colour-code. +// Watchdog scripts emit "WARNING" / "CRITICAL" prefixes today; we normalize. 
+export const HermesWatchdogSeveritySchema = z.enum(['info', 'warn', 'critical']); +export type HermesWatchdogSeverity = z.infer; + +export const HermesWatchdogAlertSchema = z.object({ + timestamp: z.string(), + severity: HermesWatchdogSeveritySchema, + message: z.string(), +}); +export type HermesWatchdogAlert = z.infer; + +export const HermesWatchdogFeedSchema = z.object({ + alerts: z.array(HermesWatchdogAlertSchema), + // Path the alerts were read from (or where they would be read from when + // the source becomes available). Null when no canonical path is known. + source: z.string().nullable(), + status: ProbeStatusSchema, +}); +export type HermesWatchdogFeed = z.infer; + +export const HermesBackupHistoryEntrySchema = z.object({ + sha: z.string(), + committedAt: z.string(), + subject: z.string(), +}); +export type HermesBackupHistoryEntry = z.infer; + +export const HermesBackupHistorySchema = z.object({ + entries: z.array(HermesBackupHistoryEntrySchema), + // Repo path probed (informational; useful when status is `unknown`). + repoPath: z.string().nullable(), + status: ProbeStatusSchema, +}); +export type HermesBackupHistory = z.infer; + +export const HermesTelemetrySnapshotSchema = z.object({ + generatedAt: z.string(), + // True when this payload was served from the short-TTL cache. + cached: z.boolean(), + instanceId: HermesInstanceIdSchema, + sessions: HermesSessionStatsSchema, + cron: HermesCronListSchema, + memory: HermesMemoryListSchema, + skills: HermesSkillListSchema, + watchdog: HermesWatchdogFeedSchema, + backupHistory: HermesBackupHistorySchema, + // Roll-up of any "couldn't tell" / probe-failed conditions; the UI renders + // these inline without changing the structural shape of the snapshot. 
+ warnings: z.array(z.string()), +}); +export type HermesTelemetrySnapshot = z.infer; diff --git a/dashboard/backend/src/server.ts b/dashboard/backend/src/server.ts index 0e86d29..61dd02b 100644 --- a/dashboard/backend/src/server.ts +++ b/dashboard/backend/src/server.ts @@ -15,6 +15,7 @@ import { azureConfigRoutes } from './modules/azure-config/routes.js'; import { codeQualityRoutes } from './modules/code-quality/routes.js'; import { cosmosConfigRoutes } from './modules/cosmos-config/routes.js'; import { hermesOpsRoutes } from './modules/hermes-ops/routes.js'; +import { hermesTelemetryRoutes } from './modules/hermes-telemetry/routes.js'; import { vmRoutes } from './modules/vm/routes.js'; import rateLimit from '@fastify/rate-limit'; import swagger from '@fastify/swagger'; @@ -277,6 +278,7 @@ await fastify.register(azureConfigRoutes, { prefix: '/api' }); await fastify.register(codeQualityRoutes, { prefix: '/api' }); await fastify.register(cosmosConfigRoutes, { prefix: '/api' }); await fastify.register(hermesOpsRoutes, { prefix: '/api' }); +await fastify.register(hermesTelemetryRoutes, { prefix: '/api' }); await fastify.register(vmRoutes, { prefix: '/api' }); // Start server diff --git a/dashboard/backend/vitest.config.ts b/dashboard/backend/vitest.config.ts index 2fae32d..6b6e363 100644 --- a/dashboard/backend/vitest.config.ts +++ b/dashboard/backend/vitest.config.ts @@ -17,6 +17,7 @@ export default defineConfig({ 'src/lib/csrf.ts', 'src/modules/health/repository.ts', 'src/modules/hermes-ops/repository.ts', + 'src/modules/hermes-telemetry/repository.ts', 'src/modules/deployments/orchestrator.ts', 'src/modules/services/repository.ts', ], diff --git a/dashboard/web/src/lib/api.ts b/dashboard/web/src/lib/api.ts index 51eff7e..03fab61 100644 --- a/dashboard/web/src/lib/api.ts +++ b/dashboard/web/src/lib/api.ts @@ -117,6 +117,98 @@ export interface HermesOpsLink { description: string; } +// --- Hermes telemetry (Phase 3) --------------------------------------------- +// 
Per-instance read-only telemetry: sessions, cron, memory/skills, watchdog +// alerts, backup history. Probe sources (`hermes` CLI, watchdog log, backup +// repo) may be unavailable on a given host; each section carries its own +// `status` so the UI can show "definitely empty" vs "couldn't read". +export type HermesProbeStatus = 'up' | 'down' | 'unknown'; + +export interface HermesSessionStats { + totalSessions: number; + totalMessages: number; + status: HermesProbeStatus; +} + +export interface HermesCronEntry { + id: string; + name: string; + schedule: string | null; + lastRun: string | null; + nextRun: string | null; + lastStatus: string | null; + active: boolean; +} + +export interface HermesCronList { + entries: HermesCronEntry[]; + status: HermesProbeStatus; +} + +export interface HermesMemoryItem { + id: string; + type: string; + key: string; + summary: string; + updatedAt: string | null; +} + +export interface HermesMemoryList { + items: HermesMemoryItem[]; + status: HermesProbeStatus; +} + +export interface HermesSkillItem { + id: string; + name: string; + description: string; + enabled: boolean; +} + +export interface HermesSkillList { + items: HermesSkillItem[]; + status: HermesProbeStatus; +} + +export type HermesWatchdogSeverity = 'info' | 'warn' | 'critical'; + +export interface HermesWatchdogAlert { + timestamp: string; + severity: HermesWatchdogSeverity; + message: string; +} + +export interface HermesWatchdogFeed { + alerts: HermesWatchdogAlert[]; + source: string | null; + status: HermesProbeStatus; +} + +export interface HermesBackupHistoryEntry { + sha: string; + committedAt: string; + subject: string; +} + +export interface HermesBackupHistory { + entries: HermesBackupHistoryEntry[]; + repoPath: string | null; + status: HermesProbeStatus; +} + +export interface HermesTelemetrySnapshot { + generatedAt: string; + cached: boolean; + instanceId: 'vijay' | 'bheem'; + sessions: HermesSessionStats; + cron: HermesCronList; + memory: HermesMemoryList; + 
skills: HermesSkillList; + watchdog: HermesWatchdogFeed; + backupHistory: HermesBackupHistory; + warnings: string[]; +} + export interface HermesOpsSnapshot { generatedAt: string; tailscaleIp: string | null; @@ -284,6 +376,13 @@ export const api = { // Hermes operations getHermesOps: () => apiRequest('/api/hermes/ops'), + // Hermes per-instance telemetry (Phase 3 — sessions/cron/memory/skills/ + // watchdog/backup-history). Returns a Zod-validated snapshot from the + // backend; sections may report status:'unknown' if their underlying + // source isn't readable in the current environment (CI / dev box). + getHermesTelemetry: (instance: 'vijay' | 'bheem') => + apiRequest(`/api/hermes/telemetry/${instance}`), + // Seed seedServices: () => apiRequest<{ message: string }>('/api/seed', { method: 'POST' }),