feat(dashboard): Phase 3 slice 1 — hermes telemetry contract + backend endpoint
First slice of Phase 3 ("real per-instance telemetry"). Defines the
read-only artifact contract from Decision #1 (sessions, cron, memory,
skills, watchdog alerts, backup history) and ships an admin-gated
backend endpoint that probes the live Hermes instance, gracefully
degrading to status:'unknown' wherever the source isn't readable.
What's new
- `backend/src/modules/hermes-telemetry/types.ts` — Zod schemas for
every section of the snapshot, plus a `HermesProbeStatus` reused
from hermes-ops so the UI can distinguish "definitely empty" from
"couldn't read the source" for each section independently.
- `backend/src/modules/hermes-telemetry/repository.ts` — implementation
that:
* shells out via `runuser -u <user> --` for cross-user instances
(Bheem/uma) the same way `hermes-ops/repository.ts` does;
* runs `hermes sessions stats`, `hermes cron list`, `hermes memory list`
and `hermes skills list` (each with `--json`) and parses the output
when the CLI is present, otherwise reports status:'unknown';
* tails the watchdog log and buckets each line by severity
(critical / warn / info);
* pulls `git -C <repo> log` against the instance's backup repo
for backup history;
* caches per-instance with a 30s TTL + in-flight coalescing,
same pattern as hermes-ops.
- `backend/src/modules/hermes-telemetry/routes.ts` — admin-only GET
`/api/hermes/telemetry/:instance` (the `instance` path param is
Zod-validated; the response is validated against
`HermesTelemetrySnapshotSchema` before send so a shape regression
surfaces here, not in the UI).
- `backend/src/modules/hermes-telemetry/hermes-telemetry.test.ts` —
6 unit tests: ENOENT-on-everything case validates against the
schema, JSON-parse path for sessions/cron/memory/skills, watchdog
log severity bucketing, backup-history `git log` parsing, cache
hit, per-instance cache isolation. Coverage: 95.17% lines on the
new repository module.
- `backend/vitest.config.ts` — telemetry repository added to the
coverage gate's `include` list (ratchet).
- `web/src/lib/api.ts` — typed surface for the new endpoint:
`HermesTelemetrySnapshot` + sub-types + `api.getHermesTelemetry`.
What's NOT in this slice
- UI consumption. The Task Ledger / Agents / History panes still
render mock data; converting them is queued for the next slices.
This slice ships the contract + the backend so those slices can
build on a stable shape.
- Backward-compat replacement of `/api/hermes/ops` (which is
unauthenticated today). That comes with the Phase 7 auth pass.
Verified: backend typecheck ✅, 57/57 unit tests ✅, web typecheck ✅,
lint 0 errors, coverage gate ≥95% lines on every gated file.
Generated with [Devin](https://cli.devin.ai/docs)
Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
This commit is contained in:
parent
ecd1f20d59
commit
ad16b1308e
@ -0,0 +1,168 @@
|
||||
import { describe, it, expect, beforeEach, vi } from 'vitest';
|
||||
import { HermesTelemetrySnapshotSchema } from './types.js';
|
||||
|
||||
// --- I/O mocks --------------------------------------------------------------
|
||||
const execFileMock = vi.hoisted(() => vi.fn());
|
||||
vi.mock('child_process', () => ({ execFile: execFileMock }));
|
||||
|
||||
const readFileMock = vi.hoisted(() => vi.fn());
|
||||
const statMock = vi.hoisted(() => vi.fn());
|
||||
vi.mock('fs/promises', () => ({ readFile: readFileMock, stat: statMock }));
|
||||
|
||||
// A handler decides, per (command, args), whether the fake subprocess fails
// (`error`) or succeeds with the given `stdout` (defaults to '').
type Handler = (command: string, args: string[]) => { error?: NodeJS.ErrnoException; stdout?: string };

// Installs `handler` as the behaviour of the mocked `execFile`. The fake
// honours the node-style callback contract: `cb(error)` on failure,
// `cb(null, { stdout })` on success.
function setExec(handler: Handler) {
  const fakeExecFile = (
    command: string,
    args: string[],
    _opts: unknown,
    cb: (err: unknown, result?: { stdout: string }) => void,
  ) => {
    const { error, stdout } = handler(command, args);
    if (error) {
      cb(error);
    } else {
      cb(null, { stdout: stdout ?? '' });
    }
  };
  execFileMock.mockImplementation(fakeExecFile);
}
|
||||
|
||||
const { getHermesTelemetrySnapshot, clearHermesTelemetryCache } = await import('./repository.js');
|
||||
|
||||
describe('hermes-telemetry repository', () => {
  // Fresh error shaped like the one Node raises for a missing binary or file.
  const enoent = (): NodeJS.ErrnoException =>
    Object.assign(new Error('ENOENT'), { code: 'ENOENT' });

  // Every shell-out fails with ENOENT (no `hermes` / `git` binary available).
  const failEveryExec = () => setExec(() => ({ error: enoent() }));

  beforeEach(() => {
    vi.clearAllMocks();
    clearHermesTelemetryCache();
  });

  it('returns a Zod-valid snapshot when every probe fails (CLI missing)', async () => {
    failEveryExec();
    statMock.mockRejectedValue(enoent());
    readFileMock.mockRejectedValue(enoent());

    const snapshot = await getHermesTelemetrySnapshot('vijay');

    // The whole shape must validate even when nothing was readable — that's
    // the contract the route handler relies on to send a stable JSON to the
    // UI rather than a 500.
    expect(() => HermesTelemetrySnapshotSchema.parse(snapshot)).not.toThrow();
    expect(snapshot.instanceId).toBe('vijay');
    expect(snapshot.sessions.status).toBe('unknown');
    expect(snapshot.cron.status).toBe('unknown');
    expect(snapshot.memory.status).toBe('unknown');
    expect(snapshot.skills.status).toBe('unknown');
    expect(snapshot.watchdog.status).toBe('unknown');
    expect(snapshot.backupHistory.status).toBe('unknown');
    expect(snapshot.warnings.length).toBeGreaterThan(0);
  });

  it('parses sessions/cron/memory/skills JSON output when the CLI is present', async () => {
    const cronRow = {
      id: 'mem-rotate',
      name: 'Memory rotation',
      schedule: '0 4 * * *',
      last_run: '2026-01-01T04:00:00Z',
      next_run: '2026-01-02T04:00:00Z',
      last_status: 'ok',
      active: true,
    };

    setExec((command, args) => {
      // No backup repo on the test box.
      if (command === 'git') return { error: enoent() };
      if (command !== 'hermes') return { stdout: '' };

      switch (args.slice(0, 2).join(' ')) {
        case 'sessions stats':
          return { stdout: JSON.stringify({ sessions: 59, messages: 5225 }) };
        case 'cron list':
          return { stdout: JSON.stringify([cronRow]) };
        case 'memory list':
          return { stdout: JSON.stringify([{ id: 'm1', type: 'note', key: 'gateway', summary: 'restart procedure' }]) };
        case 'skills list':
          return { stdout: JSON.stringify([{ id: 's1', name: 'restart', description: 'restart a service', enabled: true }]) };
        default:
          return { stdout: '' };
      }
    });
    statMock.mockRejectedValue(enoent());

    const snapshot = await getHermesTelemetrySnapshot('vijay');

    expect(snapshot.sessions).toEqual({ totalSessions: 59, totalMessages: 5225, status: 'up' });
    expect(snapshot.cron.status).toBe('up');
    expect(snapshot.cron.entries).toHaveLength(1);
    expect(snapshot.cron.entries[0].name).toBe('Memory rotation');
    expect(snapshot.memory.status).toBe('up');
    expect(snapshot.memory.items[0].key).toBe('gateway');
    expect(snapshot.skills.status).toBe('up');
    expect(snapshot.skills.items[0].id).toBe('s1');
  });

  it('parses watchdog log lines into severity-tagged alerts', async () => {
    failEveryExec();
    statMock.mockResolvedValue({} as never);
    readFileMock.mockResolvedValue([
      '2026-01-01T12:34:56 WARNING gateway is degraded',
      '2026-01-01T12:35:01 CRITICAL backup repo HEAD missing',
      '2026-01-01T12:35:30 INFO healthy after retry',
      '',
    ].join('\n'));

    const snapshot = await getHermesTelemetrySnapshot('bheem');

    expect(snapshot.watchdog.status).toBe('up');
    expect(snapshot.watchdog.alerts.map((a) => a.severity)).toEqual(['warn', 'critical', 'info']);
    expect(snapshot.watchdog.alerts[0].message).toBe('gateway is degraded');
  });

  it('parses backup history from `git log` output', async () => {
    // Fields are separated by the unit separator (0x1f), matching the
    // `--pretty=format:%H\x1f%cI\x1f%s` layout the repository requests.
    const gitLogOutput = [
      ['a1b2c3', '2026-01-01T01:00:00Z', 'Backup at 01:00'].join('\x1f'),
      ['d4e5f6', '2026-01-01T00:00:00Z', 'Backup at 00:00'].join('\x1f'),
    ].join('\n');

    setExec((command, args) => {
      if (command === 'git' && args.includes('log')) return { stdout: gitLogOutput };
      return { error: enoent() };
    });
    statMock.mockRejectedValue(enoent());

    const snapshot = await getHermesTelemetrySnapshot('vijay');

    expect(snapshot.backupHistory.status).toBe('up');
    expect(snapshot.backupHistory.entries).toHaveLength(2);
    expect(snapshot.backupHistory.entries[0]).toMatchObject({ sha: 'a1b2c3', subject: 'Backup at 01:00' });
  });

  it('serves a cached snapshot on the second call within the TTL window', async () => {
    let calls = 0;
    setExec(() => {
      calls++;
      return { error: enoent() };
    });
    statMock.mockRejectedValue(enoent());

    const a = await getHermesTelemetrySnapshot('vijay');
    const callsAfterFirst = calls;
    const b = await getHermesTelemetrySnapshot('vijay');

    expect(a.cached).toBe(false); // first call did the real probing
    expect(calls).toBe(callsAfterFirst); // no extra subprocess work
    expect(b.cached).toBe(true);
    expect(a.instanceId).toBe(b.instanceId);
  });

  it('caches per instance independently', async () => {
    failEveryExec();
    statMock.mockRejectedValue(enoent());

    const v = await getHermesTelemetrySnapshot('vijay');
    const b = await getHermesTelemetrySnapshot('bheem');

    expect(v.instanceId).toBe('vijay');
    expect(b.instanceId).toBe('bheem');
    // A cache entry for one instance must not satisfy a request for another.
    expect(b.cached).toBe(false);

    // ...while each instance still hits its own entry on a repeat request.
    const vAgain = await getHermesTelemetrySnapshot('vijay');
    expect(vAgain.cached).toBe(true);
    expect(vAgain.instanceId).toBe('vijay');
  });
});
|
||||
315
dashboard/backend/src/modules/hermes-telemetry/repository.ts
Normal file
315
dashboard/backend/src/modules/hermes-telemetry/repository.ts
Normal file
@ -0,0 +1,315 @@
|
||||
import { execFile } from 'child_process';
|
||||
import { promisify } from 'util';
|
||||
import { readFile, stat } from 'fs/promises';
|
||||
import { childLogger } from '../../lib/logger.js';
|
||||
import type {
|
||||
HermesBackupHistory,
|
||||
HermesBackupHistoryEntry,
|
||||
HermesCronEntry,
|
||||
HermesCronList,
|
||||
HermesInstanceId,
|
||||
HermesMemoryList,
|
||||
HermesSessionStats,
|
||||
HermesSkillList,
|
||||
HermesTelemetrySnapshot,
|
||||
HermesWatchdogAlert,
|
||||
HermesWatchdogFeed,
|
||||
HermesWatchdogSeverity,
|
||||
} from './types.js';
|
||||
|
||||
const execFileAsync = promisify(execFile);
|
||||
const log = childLogger('hermes-telemetry/repository');
|
||||
|
||||
/**
 * Per-instance probe configuration. Mirrors `instances` in
 * `hermes-ops/repository.ts`.
 *
 * Anything we shell out to as the live instance owner ("uma" for Bheem, root
 * for Vijay) is wrapped in `runuser -u <user>` so the command runs in the
 * owner's environment, not the backend's.
 */
interface InstanceConfig {
  /** Instance identifier; also the key into `INSTANCES`. */
  id: HermesInstanceId;

  /** Account to run CLI probes as; null → run as the backend's own user (root in prod). */
  user: string | null;

  /** Path of the instance's backup git repository. */
  repoPath: string;

  /** Path of the instance's health-watchdog log file. */
  watchdogLog: string;
}
|
||||
|
||||
// Static table of the known Hermes instances, keyed by instance id.
const INSTANCES: Record<HermesInstanceId, InstanceConfig> = {
  // Probed directly as the backend's own user (`user: null`).
  vijay: {
    id: 'vijay',
    user: null,
    repoPath: '/root/repos/bytelyst_hostinger_hermes_vm',
    watchdogLog: '/root/.hermes/logs/hermes-health-watchdog.log',
  },

  // CLI probes for this instance are wrapped in `runuser -u uma --`.
  bheem: {
    id: 'bheem',
    user: 'uma',
    repoPath: '/home/uma/repos/uma_hostinger_hermes_vm',
    watchdogLog: '/home/uma/.hermes/logs/hermes-health-watchdog.log',
  },
};
|
||||
|
||||
/** Outcome of a best-effort subprocess invocation (see `exec`). */
interface ExecResult {
  /** Trimmed standard output; empty when nothing was captured. */
  stdout: string;

  /** False when the command could not be run at all, true otherwise. */
  ran: boolean;
}
|
||||
|
||||
/**
 * Runs `command` with `args` and captures its trimmed stdout. Never throws.
 *
 * @param command   Binary to execute (no shell is involved).
 * @param args      Argument vector passed verbatim to the binary.
 * @param cwd       Optional working directory.
 * @param timeoutMs Time budget handed to `execFile` as `timeout`.
 * @returns `ran: false` when the command could not be started or did not run
 *          to completion (spawn failure, timeout, killed by a signal);
 *          `ran: true` with whatever stdout was produced otherwise, including
 *          after a plain non-zero exit.
 */
async function exec(
  command: string,
  args: string[],
  cwd?: string,
  timeoutMs = 5000,
): Promise<ExecResult> {
  try {
    const { stdout } = await execFileAsync(command, args, { cwd, timeout: timeoutMs });
    return { stdout: stdout.trim(), ran: true };
  } catch (err) {
    const e = err as NodeJS.ErrnoException & {
      code?: string | number;
      stdout?: string;
      killed?: boolean;
      signal?: string | null;
    };

    // A string `code` is an errno / Node error name (ENOENT, EACCES,
    // ETIMEDOUT, ...): the process never ran to a normal exit. A normal
    // non-zero exit carries a numeric `code` instead.
    const failedToRun = typeof e?.code === 'string';

    // When the `timeout` option fires, execFile kills the child and rejects
    // with `killed: true` and the kill signal — `code` is not 'ETIMEDOUT'.
    // Output from a killed process is partial, so treat it as "did not run".
    const terminatedEarly = e?.killed === true || (e?.signal !== undefined && e?.signal !== null);

    if (failedToRun || terminatedEarly) {
      return { stdout: '', ran: false };
    }

    // A non-zero exit is still useful — `git log` on an empty repo, etc.
    return { stdout: (e?.stdout ?? '').toString().trim(), ran: true };
  }
}
|
||||
|
||||
/**
 * Runs `command` on behalf of an instance.
 *
 * When the instance names an owner account, the command is prefixed with
 * `runuser -u <user> --` so it executes in that owner's environment (PATH,
 * ~/.hermes config). For the local instance (`user` is null) the binary is
 * invoked directly.
 */
async function execAs(
  inst: InstanceConfig,
  command: string,
  args: string[],
  timeoutMs = 5000,
): Promise<ExecResult> {
  let binary = command;
  let argv = args;

  if (inst.user) {
    binary = 'runuser';
    argv = ['-u', inst.user, '--', command, ...args];
  }

  return exec(binary, argv, undefined, timeoutMs);
}
|
||||
|
||||
// --- Sessions ---------------------------------------------------------------

/**
 * Reads session/message totals from `hermes sessions stats --json`.
 *
 * Reports `status: 'unknown'` (with zeroed counters) instead of fabricating
 * numbers whenever the CLI could not be run, its output is not a JSON object,
 * or a counter is not a finite number. The last case matters because a `NaN`
 * counter would fail `z.number()` validation of the assembled snapshot.
 */
async function readSessionStats(inst: InstanceConfig): Promise<HermesSessionStats> {
  const unavailable: HermesSessionStats = { totalSessions: 0, totalMessages: 0, status: 'unknown' };

  const result = await execAs(inst, 'hermes', ['sessions', 'stats', '--json']);
  if (!result.ran) return unavailable;

  try {
    const parsed: unknown = JSON.parse(result.stdout);
    // Only a plain JSON object can carry the counters; arrays, numbers,
    // strings and null mean the output is not what we expect.
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      return unavailable;
    }

    const { sessions, messages } = parsed as { sessions?: unknown; messages?: unknown };
    // A missing counter still defaults to 0, as before.
    const totalSessions = Number(sessions ?? 0);
    const totalMessages = Number(messages ?? 0);
    if (!Number.isFinite(totalSessions) || !Number.isFinite(totalMessages)) {
      return unavailable;
    }

    return { totalSessions, totalMessages, status: 'up' };
  } catch {
    // Empty or malformed stdout (e.g. the CLI exited non-zero).
    return unavailable;
  }
}
|
||||
|
||||
// --- Cron -------------------------------------------------------------------

/**
 * Lists Hermes' own scheduled tasks via `hermes cron list --json`, the
 * canonical source. These are distinct from systemd timers (which
 * `hermes-ops` already covers) — memory rotations, telegram digests, etc.
 *
 * Returns `status: 'unknown'` with no entries when the CLI could not be run
 * or its output could not be parsed and mapped.
 */
async function readCron(inst: InstanceConfig): Promise<HermesCronList> {
  // First truthy candidate rendered as text, or null when none is truthy.
  // Used for fields that may arrive in snake_case or camelCase.
  const textOrNull = (...candidates: unknown[]): string | null => {
    for (const candidate of candidates) {
      if (candidate) return String(candidate);
    }
    return null;
  };

  const { ran, stdout } = await execAs(inst, 'hermes', ['cron', 'list', '--json']);
  if (!ran) {
    return { entries: [], status: 'unknown' };
  }

  try {
    const rows = JSON.parse(stdout) as Array<Record<string, unknown>>;
    const entries: HermesCronEntry[] = [];
    for (const row of rows) {
      entries.push({
        id: String(row.id ?? row.name ?? ''),
        name: String(row.name ?? row.id ?? ''),
        schedule: textOrNull(row.schedule),
        lastRun: textOrNull(row.last_run, row.lastRun),
        nextRun: textOrNull(row.next_run, row.nextRun),
        lastStatus: textOrNull(row.last_status, row.lastStatus),
        active: Boolean(row.active ?? row.enabled ?? true),
      });
    }
    return { entries, status: 'up' };
  } catch {
    return { entries: [], status: 'unknown' };
  }
}
|
||||
|
||||
// --- Memory + skills --------------------------------------------------------
|
||||
/**
 * Lists memory items via `hermes memory list --json`.
 *
 * Returns `status: 'unknown'` with no items when the CLI could not be run or
 * its output could not be parsed and mapped.
 */
async function readMemory(inst: InstanceConfig): Promise<HermesMemoryList> {
  const { ran, stdout } = await execAs(inst, 'hermes', ['memory', 'list', '--json']);
  if (!ran) {
    return { items: [], status: 'unknown' };
  }

  try {
    const rows = JSON.parse(stdout) as Array<Record<string, unknown>>;
    const items = rows.map((row) => {
      // Accept either snake_case or camelCase for the timestamp.
      let updatedAt: string | null = null;
      if (row.updated_at) {
        updatedAt = String(row.updated_at);
      } else if (row.updatedAt) {
        updatedAt = String(row.updatedAt);
      }

      return {
        id: String(row.id ?? ''),
        type: String(row.type ?? 'note'),
        key: String(row.key ?? row.name ?? ''),
        summary: String(row.summary ?? row.value ?? ''),
        updatedAt,
      };
    });
    return { items, status: 'up' };
  } catch {
    return { items: [], status: 'unknown' };
  }
}
|
||||
|
||||
/**
 * Lists skills via `hermes skills list --json`.
 *
 * Returns `status: 'unknown'` with no items when the CLI could not be run or
 * its output could not be parsed and mapped.
 */
async function readSkills(inst: InstanceConfig): Promise<HermesSkillList> {
  const { ran, stdout } = await execAs(inst, 'hermes', ['skills', 'list', '--json']);
  if (!ran) {
    return { items: [], status: 'unknown' };
  }

  try {
    const rows = JSON.parse(stdout) as Array<Record<string, unknown>>;
    const items = rows.map((row) => {
      // `id` and `name` fall back to each other when one is absent.
      const id = String(row.id ?? row.name ?? '');
      const name = String(row.name ?? row.id ?? '');
      const description = String(row.description ?? '');
      // A skill with no explicit flag is treated as enabled.
      const enabled = Boolean(row.enabled ?? true);
      return { id, name, description, enabled };
    });
    return { items, status: 'up' };
  } catch {
    return { items: [], status: 'unknown' };
  }
}
|
||||
|
||||
// --- Watchdog ---------------------------------------------------------------
//
// Tail the last N lines of the watchdog log and bucket them by severity.
// The log format used by `scripts/hermes-health-watchdog.py` today is roughly:
//   2026-01-01T12:34:56 WARNING gateway is degraded: ...
//   2026-01-01T12:35:01 CRITICAL backup repo HEAD missing
// We accept any RFC3339-ish leading timestamp and a severity word.
//
// Capture groups: 1 = timestamp, 2 = severity token, 3 = message.
// The optional fraction accepts both `.` and `,` as the separator, because
// Python's default `logging` timestamp is written `12:34:56,123`.
const WATCHDOG_LINE = /^(\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:[.,]\d+)?(?:Z|[+-]\d{2}:?\d{2})?)\s+(\w+)\s+(.*)$/;
|
||||
|
||||
// Severity words (upper-cased) that map to each non-default bucket. The
// abbreviated forms are included so a `CRIT` / `ERR` line is not silently
// downgraded to `info`.
const CRITICAL_SEVERITY_TOKENS: ReadonlySet<string> = new Set([
  'CRITICAL', 'CRIT', 'ERROR', 'ERR', 'FATAL', 'EMERGENCY', 'EMERG',
]);
const WARN_SEVERITY_TOKENS: ReadonlySet<string> = new Set(['WARNING', 'WARN']);

/**
 * Maps a log severity word onto the three-level scale the UI colour-codes.
 *
 * @param token Severity word as it appears in the log (any letter case).
 * @returns 'critical' or 'warn' for recognised tokens; 'info' for anything
 *          else (INFO, DEBUG, unrecognised words).
 */
function normalizeSeverity(token: string): HermesWatchdogSeverity {
  const upper = token.toUpperCase();
  if (CRITICAL_SEVERITY_TOKENS.has(upper)) return 'critical';
  if (WARN_SEVERITY_TOKENS.has(upper)) return 'warn';
  return 'info';
}
|
||||
|
||||
/**
 * Reads the instance's watchdog log and turns its most recent lines into
 * severity-tagged alerts.
 *
 * Only the last 50 non-empty lines are considered; lines that do not match
 * `WATCHDOG_LINE` are skipped. Returns `status: 'unknown'` (no alerts) when
 * the log file cannot be stat'ed or read, and `status: 'up'` otherwise — even
 * when no line produced an alert.
 */
async function readWatchdog(inst: InstanceConfig): Promise<HermesWatchdogFeed> {
  const source = inst.watchdogLog;

  // A missing log is an expected condition, so it is reported without logging.
  try {
    await stat(source);
  } catch {
    return { alerts: [], source, status: 'unknown' };
  }

  try {
    const content = await readFile(source, 'utf8');
    // Split on LF or CRLF: a stray trailing '\r' would stop the line regex
    // from matching, because `.` does not match carriage returns.
    const lines = content.split(/\r?\n/).filter(Boolean);
    // Cap to the last 50 entries; anything older isn't useful for the panel.
    const tail = lines.slice(-50);

    const alerts: HermesWatchdogAlert[] = [];
    for (const line of tail) {
      const match = WATCHDOG_LINE.exec(line);
      if (!match) continue;
      const [, ts, severityToken, message] = match;
      alerts.push({
        timestamp: ts,
        severity: normalizeSeverity(severityToken),
        message: message.trim(),
      });
    }
    return { alerts, source, status: 'up' };
  } catch (err) {
    log.warn({ err, instance: inst.id, source: inst.watchdogLog }, 'failed to read watchdog log');
    return { alerts: [], source, status: 'unknown' };
  }
}
|
||||
|
||||
// --- Backup history ---------------------------------------------------------

/**
 * Cheap proxy for "is the backup pipeline alive": the last 20 commits on the
 * instance's backup repo, newest first as printed by `git log`.
 *
 * `git -C <repo> log --pretty=...` runs as the backend user; repos are
 * world-readable on the live host.
 *
 * Status semantics:
 * - 'unknown' — git could not be run, or the path could not be opened as a
 *   repository (so "no entries" would be a guess, not a fact);
 * - 'up' — the repository was readable; `entries` may legitimately be empty
 *   (for example a repository with no commits yet).
 */
async function readBackupHistory(inst: InstanceConfig): Promise<HermesBackupHistory> {
  const { repoPath } = inst;
  const unreadable: HermesBackupHistory = { entries: [], repoPath, status: 'unknown' };

  const result = await exec('git', [
    '-C', repoPath,
    'log',
    '--pretty=format:%H\x1f%cI\x1f%s',
    '-n', '20',
  ]);
  if (!result.ran) return unreadable;

  if (!result.stdout) {
    // Empty output is ambiguous: `exec` hides the exit status, so this could
    // be a repository without commits or a failed `git log` (path is not a
    // repository, git refused to open it, ...). `rev-parse --git-dir` prints
    // the git directory only when the repository can actually be opened.
    const probe = await exec('git', ['-C', repoPath, 'rev-parse', '--git-dir']);
    if (!probe.ran || !probe.stdout) return unreadable;
    return { entries: [], repoPath, status: 'up' };
  }

  const entries: HermesBackupHistoryEntry[] = [];
  for (const line of result.stdout.split('\n')) {
    // Fields are separated by the unit separator (0x1f); re-join the rest in
    // case the subject itself contains one.
    const [sha, committedAt, ...rest] = line.split('\x1f');
    if (!sha || !committedAt) continue;
    entries.push({ sha, committedAt, subject: rest.join('\x1f') });
  }
  return { entries, repoPath, status: 'up' };
}
|
||||
|
||||
// --- Snapshot assembly ------------------------------------------------------

// How long (in milliseconds) a built snapshot is served from `cache`.
const CACHE_TTL = 30_000;

// Most recent snapshot per instance, with the time it was stored.
type CachedSnapshot = { snapshot: HermesTelemetrySnapshot; at: number };
const cache = new Map<HermesInstanceId, CachedSnapshot>();

// Builds currently in progress, so concurrent callers can share one promise.
const inflight = new Map<HermesInstanceId, Promise<HermesTelemetrySnapshot>>();
|
||||
|
||||
/**
 * Probes every telemetry source for one instance concurrently and assembles
 * the snapshot. Each section whose status came back 'unknown' contributes one
 * human-readable line to `warnings`. The result is always marked
 * `cached: false`.
 */
async function buildSnapshot(instanceId: HermesInstanceId): Promise<HermesTelemetrySnapshot> {
  const inst = INSTANCES[instanceId];

  const [sessions, cron, memory, skills, watchdog, backupHistory] = await Promise.all([
    readSessionStats(inst),
    readCron(inst),
    readMemory(inst),
    readSkills(inst),
    readWatchdog(inst),
    readBackupHistory(inst),
  ]);

  // One candidate warning per section, in snapshot order; only the sections
  // that could not be read keep theirs.
  const candidates: Array<{ unreadable: boolean; text: string }> = [
    {
      unreadable: sessions.status === 'unknown',
      text: `${instanceId}: hermes sessions stats unavailable (CLI missing or non-zero exit)`,
    },
    {
      unreadable: cron.status === 'unknown',
      text: `${instanceId}: hermes cron list unavailable`,
    },
    {
      unreadable: memory.status === 'unknown',
      text: `${instanceId}: hermes memory list unavailable`,
    },
    {
      unreadable: skills.status === 'unknown',
      text: `${instanceId}: hermes skills list unavailable`,
    },
    {
      unreadable: watchdog.status === 'unknown',
      text: `${instanceId}: watchdog log not readable at ${watchdog.source ?? 'unknown path'}`,
    },
    {
      unreadable: backupHistory.status === 'unknown',
      text: `${instanceId}: backup repo not readable at ${backupHistory.repoPath ?? 'unknown path'}`,
    },
  ];
  const warnings: string[] = candidates
    .filter((candidate) => candidate.unreadable)
    .map((candidate) => candidate.text);

  const generatedAt = new Date().toISOString();
  return {
    generatedAt,
    cached: false,
    instanceId,
    sessions,
    cron,
    memory,
    skills,
    watchdog,
    backupHistory,
    warnings,
  };
}
|
||||
|
||||
/**
 * Returns the telemetry snapshot for one instance.
 *
 * Unless `options.force` is set, a snapshot stored less than `CACHE_TTL`
 * milliseconds ago is returned (as a copy flagged `cached: true`), and a
 * build already in progress for the same instance is shared rather than
 * started again. A forced call always starts its own build and is not
 * registered for sharing, but its result still refreshes the cache.
 *
 * @param instanceId Instance to probe.
 * @param options    `force: true` bypasses the cache and in-flight sharing.
 */
export async function getHermesTelemetrySnapshot(
  instanceId: HermesInstanceId,
  options?: { force?: boolean },
): Promise<HermesTelemetrySnapshot> {
  const bypassCache = options?.force ?? false;

  if (!bypassCache) {
    const hit = cache.get(instanceId);
    if (hit && Date.now() - hit.at < CACHE_TTL) {
      return { ...hit.snapshot, cached: true };
    }

    const pending = inflight.get(instanceId);
    if (pending) {
      return pending;
    }
  }

  // Store the result on success; on settle (success or failure) drop the
  // in-flight entry, but only if it is still this very build.
  const storeInCache = (snapshot: HermesTelemetrySnapshot): HermesTelemetrySnapshot => {
    cache.set(instanceId, { snapshot, at: Date.now() });
    return snapshot;
  };
  const promise = buildSnapshot(instanceId)
    .then(storeInCache)
    .finally(() => {
      if (inflight.get(instanceId) === promise) {
        inflight.delete(instanceId);
      }
    });

  if (!bypassCache) {
    inflight.set(instanceId, promise);
  }
  return promise;
}
|
||||
|
||||
/**
 * Test hook: drops every cached snapshot and every in-flight build reference
 * so `vitest` cases don't bleed cached state across runs.
 */
export function clearHermesTelemetryCache(): void {
  for (const store of [cache, inflight]) {
    store.clear();
  }
}
|
||||
36
dashboard/backend/src/modules/hermes-telemetry/routes.ts
Normal file
36
dashboard/backend/src/modules/hermes-telemetry/routes.ts
Normal file
@ -0,0 +1,36 @@
|
||||
import type { FastifyInstance } from 'fastify';
|
||||
import { z } from 'zod';
|
||||
import { getHermesTelemetrySnapshot } from './repository.js';
|
||||
import { HermesInstanceIdSchema, HermesTelemetrySnapshotSchema } from './types.js';
|
||||
import { requireAdmin } from '../../lib/auth.js';
|
||||
|
||||
// Path parameters accepted by the telemetry route: a known instance id.
const ParamsSchema = z.object({ instance: HermesInstanceIdSchema });

/**
 * Registers `GET /hermes/telemetry/:instance` on the given Fastify instance.
 *
 * Admin-only: this endpoint shells out to `hermes` CLI in the instance
 * owner's environment (`runuser -u uma --` for Bheem) and reads the
 * watchdog log + backup repo. Treat it as privileged the same way the
 * VM/system endpoints are. See `dashboard/DEPLOYMENT.md` Privilege Surface.
 *
 * Responses: 400 when `:instance` is not a known instance id, 500 when the
 * snapshot cannot be built or fails its own schema validation, otherwise the
 * validated snapshot.
 */
export async function hermesTelemetryRoutes(fastify: FastifyInstance) {
  fastify.get(
    '/hermes/telemetry/:instance',
    { preHandler: async (req) => requireAdmin(req) },
    async (req, reply) => {
      const parsedParams = ParamsSchema.safeParse(req.params);
      if (!parsedParams.success) {
        return reply.code(400).send({ error: 'Invalid instance', detail: parsedParams.error.message });
      }
      const { instance } = parsedParams.data;

      try {
        const snapshot = await getHermesTelemetrySnapshot(instance);
        // Validate our own response so a shape regression surfaces here as a
        // 500 rather than a corrupt UI state — same approach as hermes-ops.
        return reply.send(HermesTelemetrySnapshotSchema.parse(snapshot));
      } catch (err) {
        fastify.log.error(err, 'failed to build hermes telemetry snapshot');
        return reply.code(500).send({ error: 'Failed to build hermes telemetry snapshot' });
      }
    },
  );
}
|
||||
118
dashboard/backend/src/modules/hermes-telemetry/types.ts
Normal file
118
dashboard/backend/src/modules/hermes-telemetry/types.ts
Normal file
@ -0,0 +1,118 @@
|
||||
import { z } from 'zod';
|
||||
import { ProbeStatusSchema } from '../hermes-ops/types.js';
|
||||
|
||||
// Hermes telemetry snapshot — read-only "real artifacts" per Phase 3 Decision #1
// (sessions, cron, memory, skills, watchdog alerts, backup history).
//
// Every section carries its own `ProbeStatus`, which lets the UI tell
// "definitely empty" apart from "couldn't read the source" (CLI missing,
// permission denied, timed out). Mirrors the hermes-ops shape: every field set
// the UI renders has a status it can surface.

// Shared building block for the many "text or null" fields below.
const nullableText = z.string().nullable();

/** Identifiers of the Hermes instances that can be probed. */
export const HermesInstanceIdSchema = z.enum(['vijay', 'bheem']);
export type HermesInstanceId = z.infer<typeof HermesInstanceIdSchema>;

/** Session and message totals for one instance. */
export const HermesSessionStatsSchema = z.object({
  totalSessions: z.number(),
  totalMessages: z.number(),
  status: ProbeStatusSchema,
});
export type HermesSessionStats = z.infer<typeof HermesSessionStatsSchema>;

/** One scheduled task; the nullable fields are null when not reported. */
export const HermesCronEntrySchema = z.object({
  id: z.string(),
  name: z.string(),
  schedule: nullableText,
  lastRun: nullableText,
  nextRun: nullableText,
  lastStatus: nullableText,
  active: z.boolean(),
});
export type HermesCronEntry = z.infer<typeof HermesCronEntrySchema>;

/** The scheduled-task section of the snapshot. */
export const HermesCronListSchema = z.object({
  entries: z.array(HermesCronEntrySchema),
  status: ProbeStatusSchema,
});
export type HermesCronList = z.infer<typeof HermesCronListSchema>;

/** One memory item. */
export const HermesMemoryItemSchema = z.object({
  id: z.string(),
  type: z.string(),
  key: z.string(),
  summary: z.string(),
  updatedAt: nullableText,
});
export type HermesMemoryItem = z.infer<typeof HermesMemoryItemSchema>;

/** The memory section of the snapshot. */
export const HermesMemoryListSchema = z.object({
  items: z.array(HermesMemoryItemSchema),
  status: ProbeStatusSchema,
});
export type HermesMemoryList = z.infer<typeof HermesMemoryListSchema>;

/** One skill. */
export const HermesSkillItemSchema = z.object({
  id: z.string(),
  name: z.string(),
  description: z.string(),
  enabled: z.boolean(),
});
export type HermesSkillItem = z.infer<typeof HermesSkillItemSchema>;

/** The skills section of the snapshot. */
export const HermesSkillListSchema = z.object({
  items: z.array(HermesSkillItemSchema),
  status: ProbeStatusSchema,
});
export type HermesSkillList = z.infer<typeof HermesSkillListSchema>;

/**
 * Severity is a union of `info | warn | critical` so the UI can colour-code.
 * Watchdog scripts emit "WARNING" / "CRITICAL" prefixes today; we normalize.
 */
export const HermesWatchdogSeveritySchema = z.enum(['info', 'warn', 'critical']);
export type HermesWatchdogSeverity = z.infer<typeof HermesWatchdogSeveritySchema>;

/** One watchdog log line, reduced to timestamp, severity and message. */
export const HermesWatchdogAlertSchema = z.object({
  timestamp: z.string(),
  severity: HermesWatchdogSeveritySchema,
  message: z.string(),
});
export type HermesWatchdogAlert = z.infer<typeof HermesWatchdogAlertSchema>;

/** The watchdog section of the snapshot. */
export const HermesWatchdogFeedSchema = z.object({
  alerts: z.array(HermesWatchdogAlertSchema),
  // Path the alerts were read from (or where they would be read from when
  // the source becomes available). Null when no canonical path is known.
  source: nullableText,
  status: ProbeStatusSchema,
});
export type HermesWatchdogFeed = z.infer<typeof HermesWatchdogFeedSchema>;

/** One commit on the backup repository. */
export const HermesBackupHistoryEntrySchema = z.object({
  sha: z.string(),
  committedAt: z.string(),
  subject: z.string(),
});
export type HermesBackupHistoryEntry = z.infer<typeof HermesBackupHistoryEntrySchema>;

/** The backup-history section of the snapshot. */
export const HermesBackupHistorySchema = z.object({
  entries: z.array(HermesBackupHistoryEntrySchema),
  // Repo path probed (informational; useful when status is `unknown`).
  repoPath: nullableText,
  status: ProbeStatusSchema,
});
export type HermesBackupHistory = z.infer<typeof HermesBackupHistorySchema>;

/** The complete per-instance telemetry payload. */
export const HermesTelemetrySnapshotSchema = z.object({
  generatedAt: z.string(),
  // True when this payload was served from the short-TTL cache.
  cached: z.boolean(),
  instanceId: HermesInstanceIdSchema,
  sessions: HermesSessionStatsSchema,
  cron: HermesCronListSchema,
  memory: HermesMemoryListSchema,
  skills: HermesSkillListSchema,
  watchdog: HermesWatchdogFeedSchema,
  backupHistory: HermesBackupHistorySchema,
  // Roll-up of any "couldn't tell" / probe-failed conditions; the UI renders
  // these inline without changing the structural shape of the snapshot.
  warnings: z.array(z.string()),
});
export type HermesTelemetrySnapshot = z.infer<typeof HermesTelemetrySnapshotSchema>;
|
||||
@ -15,6 +15,7 @@ import { azureConfigRoutes } from './modules/azure-config/routes.js';
|
||||
import { codeQualityRoutes } from './modules/code-quality/routes.js';
|
||||
import { cosmosConfigRoutes } from './modules/cosmos-config/routes.js';
|
||||
import { hermesOpsRoutes } from './modules/hermes-ops/routes.js';
|
||||
import { hermesTelemetryRoutes } from './modules/hermes-telemetry/routes.js';
|
||||
import { vmRoutes } from './modules/vm/routes.js';
|
||||
import rateLimit from '@fastify/rate-limit';
|
||||
import swagger from '@fastify/swagger';
|
||||
@ -277,6 +278,7 @@ await fastify.register(azureConfigRoutes, { prefix: '/api' });
|
||||
await fastify.register(codeQualityRoutes, { prefix: '/api' });
|
||||
await fastify.register(cosmosConfigRoutes, { prefix: '/api' });
|
||||
await fastify.register(hermesOpsRoutes, { prefix: '/api' });
|
||||
await fastify.register(hermesTelemetryRoutes, { prefix: '/api' });
|
||||
await fastify.register(vmRoutes, { prefix: '/api' });
|
||||
|
||||
// Start server
|
||||
|
||||
@ -17,6 +17,7 @@ export default defineConfig({
|
||||
'src/lib/csrf.ts',
|
||||
'src/modules/health/repository.ts',
|
||||
'src/modules/hermes-ops/repository.ts',
|
||||
'src/modules/hermes-telemetry/repository.ts',
|
||||
'src/modules/deployments/orchestrator.ts',
|
||||
'src/modules/services/repository.ts',
|
||||
],
|
||||
|
||||
@ -117,6 +117,98 @@ export interface HermesOpsLink {
|
||||
description: string;
|
||||
}
|
||||
|
||||
// --- Hermes telemetry (Phase 3) ---------------------------------------------
// Per-instance read-only telemetry: sessions, cron, memory/skills, watchdog
// alerts, backup history. Probe sources (`hermes` CLI, watchdog log, backup
// repo) may be unavailable on a given host; each section carries its own
// `status` so the UI can show "definitely empty" vs "couldn't read".

/**
 * Outcome of probing one telemetry source. `'unknown'` is how a section
 * reports that its source couldn't be read.
 */
export type HermesProbeStatus = 'up' | 'down' | 'unknown';
|
||||
|
||||
/**
 * Session section of the telemetry snapshot: two aggregate counters plus the
 * probe status for this section.
 */
export interface HermesSessionStats {
  totalSessions: number;
  totalMessages: number;
  /** Probe outcome for this section; lets the UI tell "empty" from "couldn't read". */
  status: HermesProbeStatus;
}
|
||||
|
||||
/**
 * One cron entry reported for a Hermes instance.
 *
 * The nullable fields are `null` when the value is not available.
 * NOTE(review): timestamp format of `lastRun` / `nextRun` is not fixed by
 * this type — confirm against the backend payload.
 */
export interface HermesCronEntry {
  id: string;
  name: string;
  schedule: string | null;
  lastRun: string | null;
  nextRun: string | null;
  lastStatus: string | null;
  active: boolean;
}
|
||||
|
||||
/** Cron section of the telemetry snapshot: the cron entries plus this section's probe status. */
export interface HermesCronList {
  entries: HermesCronEntry[];
  /** Probe outcome for this section; lets the UI tell "empty" from "couldn't read". */
  status: HermesProbeStatus;
}
|
||||
|
||||
/**
 * One memory item reported for a Hermes instance.
 *
 * NOTE(review): the set of valid `type` values and the format of `updatedAt`
 * are not constrained by this type — confirm against the backend payload.
 */
export interface HermesMemoryItem {
  id: string;
  type: string;
  key: string;
  summary: string;
  updatedAt: string | null;
}
|
||||
|
||||
/** Memory section of the telemetry snapshot: the memory items plus this section's probe status. */
export interface HermesMemoryList {
  items: HermesMemoryItem[];
  /** Probe outcome for this section; lets the UI tell "empty" from "couldn't read". */
  status: HermesProbeStatus;
}
|
||||
|
||||
/** One skill reported for a Hermes instance, including whether it is enabled. */
export interface HermesSkillItem {
  id: string;
  name: string;
  description: string;
  enabled: boolean;
}
|
||||
|
||||
/** Skills section of the telemetry snapshot: the skill items plus this section's probe status. */
export interface HermesSkillList {
  items: HermesSkillItem[];
  /** Probe outcome for this section; lets the UI tell "empty" from "couldn't read". */
  status: HermesProbeStatus;
}
|
||||
|
||||
/** Severity bucket attached to a watchdog alert. */
export type HermesWatchdogSeverity = 'info' | 'warn' | 'critical';
|
||||
|
||||
/**
 * One watchdog alert: when it happened, how severe it is, and its message.
 *
 * NOTE(review): `timestamp` format is not constrained by this type — confirm
 * against the backend payload.
 */
export interface HermesWatchdogAlert {
  timestamp: string;
  severity: HermesWatchdogSeverity;
  message: string;
}
|
||||
|
||||
/** Watchdog section of the telemetry snapshot: the alerts plus this section's probe status. */
export interface HermesWatchdogFeed {
  alerts: HermesWatchdogAlert[];
  /** Presumably identifies where the alerts were read from (e.g. the log path); `null` when not available — TODO confirm. */
  source: string | null;
  /** Probe outcome for this section; lets the UI tell "empty" from "couldn't read". */
  status: HermesProbeStatus;
}
|
||||
|
||||
/**
 * One entry of an instance's backup history: commit identifier, commit time
 * and subject line.
 *
 * NOTE(review): `committedAt` format is not constrained by this type —
 * confirm against the backend payload.
 */
export interface HermesBackupHistoryEntry {
  sha: string;
  committedAt: string;
  subject: string;
}
|
||||
|
||||
/** Backup-history section of the telemetry snapshot: the history entries plus this section's probe status. */
export interface HermesBackupHistory {
  entries: HermesBackupHistoryEntry[];
  /** Presumably the path of the backup repository that was inspected; `null` when not available — TODO confirm. */
  repoPath: string | null;
  /** Probe outcome for this section; lets the UI tell "empty" from "couldn't read". */
  status: HermesProbeStatus;
}
|
||||
|
||||
/**
 * Complete telemetry snapshot for a single Hermes instance, as consumed by
 * the dashboard. Each section carries its own `status`, so one unreadable
 * source does not hide the others.
 */
export interface HermesTelemetrySnapshot {
  generatedAt: string;
  /** Whether the payload was served from a cache rather than freshly probed. */
  cached: boolean;
  instanceId: 'vijay' | 'bheem';
  sessions: HermesSessionStats;
  cron: HermesCronList;
  memory: HermesMemoryList;
  skills: HermesSkillList;
  watchdog: HermesWatchdogFeed;
  backupHistory: HermesBackupHistory;
  /** Snapshot-level warning messages, kept separate from the per-section data. */
  warnings: string[];
}
|
||||
|
||||
export interface HermesOpsSnapshot {
|
||||
generatedAt: string;
|
||||
tailscaleIp: string | null;
|
||||
@ -284,6 +376,13 @@ export const api = {
|
||||
// Hermes operations
|
||||
getHermesOps: () => apiRequest<HermesOpsSnapshot>('/api/hermes/ops'),
|
||||
|
||||
// Hermes per-instance telemetry (Phase 3 — sessions/cron/memory/skills/
// watchdog/backup-history). Returns a Zod-validated snapshot from the
// backend; sections may report status:'unknown' if their underlying
// source isn't readable in the current environment (CI / dev box).
// `instance` is restricted to the known instance ids, so it is interpolated
// into the path as-is.
getHermesTelemetry: (instance: 'vijay' | 'bheem') =>
  apiRequest<HermesTelemetrySnapshot>(`/api/hermes/telemetry/${instance}`),
|
||||
|
||||
// Seed
|
||||
seedServices: () => apiRequest<{ message: string }>('/api/seed', { method: 'POST' }),
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user