feat(dashboard): Phase 3 slice 1 — hermes telemetry contract + backend endpoint

First slice of Phase 3 ("real per-instance telemetry"). Defines the
read-only artifact contract from Decision #1 (sessions, cron, memory,
skills, watchdog alerts, backup history) and ships an admin-gated
backend endpoint that probes the live Hermes instance, gracefully
degrading to status:'unknown' wherever the source isn't readable.

What's new
  - `backend/src/modules/hermes-telemetry/types.ts` — Zod schemas for
    every section of the snapshot, plus the `ProbeStatusSchema` reused
    from hermes-ops so the UI can distinguish "definitely empty" from
    "couldn't read the source" for each section independently.
  - `backend/src/modules/hermes-telemetry/repository.ts` — implementation
    that:
      * shells out via `runuser -u <user> --` for cross-user instances
        (Bheem/uma) the same way `hermes-ops/repository.ts` does;
      * parses `hermes sessions stats / cron list / memory list /
        skills list --json` when the CLI is present, otherwise
        reports status:'unknown';
      * tails the watchdog log and buckets each line by severity
        (critical / warn / info);
      * pulls `git -C <repo> log` against the instance's backup repo
        for backup history;
      * caches per-instance with a 30s TTL + in-flight coalescing,
        same pattern as hermes-ops.
  - `backend/src/modules/hermes-telemetry/routes.ts` — admin-only GET
    `/api/hermes/telemetry/:instance` (the `instance` path param is
    Zod-validated; the response is validated against
    `HermesTelemetrySnapshotSchema` before send so a shape regression
    surfaces here, not in the UI).
  - `backend/src/modules/hermes-telemetry/hermes-telemetry.test.ts` —
    6 unit tests: ENOENT-on-everything case validates against the
    schema, JSON-parse path for sessions/cron/memory/skills, watchdog
    log severity bucketing, backup-history `git log` parsing, cache
    hit, per-instance cache isolation. Coverage: 95.17% lines on the
    new repository module.
  - `backend/vitest.config.ts` — telemetry repository added to the
    coverage gate's `include` list (ratchet).
  - `web/src/lib/api.ts` — typed surface for the new endpoint:
    `HermesTelemetrySnapshot` + sub-types + `api.getHermesTelemetry`.

What's NOT in this slice
  - UI consumption. The Task Ledger / Agents / History panes still
    render mock data; converting them is queued for the next slices.
    This slice ships the contract + the backend so those slices can
    build on a stable shape.
  - Backward-compat replacement of `/api/hermes/ops` (which is
    unauthenticated today). That comes with the Phase 7 auth pass.

Verified: backend typecheck passes, 57/57 unit tests pass, web typecheck
passes, lint 0 errors, coverage gate ≥95% lines on every gated file.

Generated with [Devin](https://cli.devin.ai/docs)

Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
This commit is contained in:
Hermes VM 2026-05-30 07:53:37 +00:00
parent ecd1f20d59
commit ad16b1308e
7 changed files with 739 additions and 0 deletions

View File

@ -0,0 +1,168 @@
import { describe, it, expect, beforeEach, vi } from 'vitest';
import { HermesTelemetrySnapshotSchema } from './types.js';
// --- I/O mocks --------------------------------------------------------------
const execFileMock = vi.hoisted(() => vi.fn());
vi.mock('child_process', () => ({ execFile: execFileMock }));
const readFileMock = vi.hoisted(() => vi.fn());
const statMock = vi.hoisted(() => vi.fn());
vi.mock('fs/promises', () => ({ readFile: readFileMock, stat: statMock }));
type Handler = (command: string, args: string[]) => { error?: NodeJS.ErrnoException; stdout?: string };
/**
 * Routes every call to the mocked `execFile` through `handler`.
 *
 * The handler sees only the command and its argument list; whatever it returns
 * is replayed through the Node-style callback — `error` as the first argument
 * when present, otherwise `{ stdout }` (defaulting to an empty string).
 */
function setExec(handler: Handler) {
  type ExecCallback = (err: unknown, result?: { stdout: string }) => void;

  const fakeExecFile = (
    command: string,
    args: string[],
    _opts: unknown,
    done: ExecCallback,
  ) => {
    const { error, stdout } = handler(command, args);
    if (error) {
      done(error);
      return;
    }
    done(null, { stdout: stdout ?? '' });
  };

  execFileMock.mockImplementation(fakeExecFile);
}
const { getHermesTelemetrySnapshot, clearHermesTelemetryCache } = await import('./repository.js');
describe('hermes-telemetry repository', () => {
  // Error shape used throughout to simulate a missing binary or file.
  const enoent = () => Object.assign(new Error('ENOENT'), { code: 'ENOENT' as const });
  // Every shell-out fails as if neither `hermes` nor `git` were installed.
  const failEveryExec = () => setExec(() => ({ error: enoent() }));

  beforeEach(() => {
    vi.clearAllMocks();
    clearHermesTelemetryCache();
  });

  it('returns a Zod-valid snapshot when every probe fails (CLI missing)', async () => {
    // ENOENT on every shell-out (no `hermes` CLI available).
    failEveryExec();
    statMock.mockRejectedValue(enoent());
    readFileMock.mockRejectedValue(enoent());

    const snapshot = await getHermesTelemetrySnapshot('vijay');

    // The whole shape must validate even when nothing was readable — that's
    // the contract the route handler relies on to send a stable JSON to the
    // UI rather than a 500.
    expect(() => HermesTelemetrySnapshotSchema.parse(snapshot)).not.toThrow();
    expect(snapshot.instanceId).toBe('vijay');
    expect(snapshot.sessions.status).toBe('unknown');
    expect(snapshot.cron.status).toBe('unknown');
    expect(snapshot.memory.status).toBe('unknown');
    expect(snapshot.skills.status).toBe('unknown');
    expect(snapshot.watchdog.status).toBe('unknown');
    expect(snapshot.backupHistory.status).toBe('unknown');
    expect(snapshot.warnings.length).toBeGreaterThan(0);
  });

  it('parses sessions/cron/memory/skills JSON output when the CLI is present', async () => {
    setExec((command, args) => {
      const subcommand = args.slice(0, 2).join(' ');
      if (command === 'hermes' && subcommand === 'sessions stats') {
        return { stdout: JSON.stringify({ sessions: 59, messages: 5225 }) };
      }
      if (command === 'hermes' && subcommand === 'cron list') {
        return {
          stdout: JSON.stringify([
            {
              id: 'mem-rotate',
              name: 'Memory rotation',
              schedule: '0 4 * * *',
              last_run: '2026-01-01T04:00:00Z',
              next_run: '2026-01-02T04:00:00Z',
              last_status: 'ok',
              active: true,
            },
          ]),
        };
      }
      if (command === 'hermes' && subcommand === 'memory list') {
        return {
          stdout: JSON.stringify([
            { id: 'm1', type: 'note', key: 'gateway', summary: 'restart procedure' },
          ]),
        };
      }
      if (command === 'hermes' && subcommand === 'skills list') {
        return {
          stdout: JSON.stringify([
            { id: 's1', name: 'restart', description: 'restart a service', enabled: true },
          ]),
        };
      }
      if (command === 'git') {
        // No backup repo on the test box.
        return { error: enoent() };
      }
      return { stdout: '' };
    });
    statMock.mockRejectedValue(enoent());

    const snapshot = await getHermesTelemetrySnapshot('vijay');

    expect(snapshot.sessions).toEqual({ totalSessions: 59, totalMessages: 5225, status: 'up' });
    expect(snapshot.cron.status).toBe('up');
    expect(snapshot.cron.entries).toHaveLength(1);
    expect(snapshot.cron.entries[0].name).toBe('Memory rotation');
    expect(snapshot.memory.status).toBe('up');
    expect(snapshot.memory.items[0].key).toBe('gateway');
    expect(snapshot.skills.status).toBe('up');
    expect(snapshot.skills.items[0].id).toBe('s1');
  });

  it('parses watchdog log lines into severity-tagged alerts', async () => {
    failEveryExec();
    statMock.mockResolvedValue({} as never);
    readFileMock.mockResolvedValue(
      [
        '2026-01-01T12:34:56 WARNING gateway is degraded',
        '2026-01-01T12:35:01 CRITICAL backup repo HEAD missing',
        '2026-01-01T12:35:30 INFO healthy after retry',
        '',
      ].join('\n'),
    );

    const snapshot = await getHermesTelemetrySnapshot('bheem');

    expect(snapshot.watchdog.status).toBe('up');
    expect(snapshot.watchdog.alerts.map((a) => a.severity)).toEqual(['warn', 'critical', 'info']);
    expect(snapshot.watchdog.alerts[0].message).toBe('gateway is degraded');
  });

  it('parses backup history from `git log` output', async () => {
    setExec((command, args) => {
      if (command === 'git' && args.includes('log')) {
        // Fields are separated by \x1f, matching the `--pretty=format:` string
        // the repository passes to git.
        return {
          stdout: [
            'a1b2c3\x1f2026-01-01T01:00:00Z\x1fBackup at 01:00',
            'd4e5f6\x1f2026-01-01T00:00:00Z\x1fBackup at 00:00',
          ].join('\n'),
        };
      }
      return { error: enoent() };
    });
    statMock.mockRejectedValue(enoent());

    const snapshot = await getHermesTelemetrySnapshot('vijay');

    expect(snapshot.backupHistory.status).toBe('up');
    expect(snapshot.backupHistory.entries).toHaveLength(2);
    expect(snapshot.backupHistory.entries[0]).toMatchObject({
      sha: 'a1b2c3',
      subject: 'Backup at 01:00',
    });
  });

  it('serves a cached snapshot on the second call within the TTL window', async () => {
    let calls = 0;
    setExec(() => {
      calls++;
      return { error: enoent() };
    });
    statMock.mockRejectedValue(enoent());

    const a = await getHermesTelemetrySnapshot('vijay');
    const callsAfterFirst = calls;
    const b = await getHermesTelemetrySnapshot('vijay');

    expect(calls).toBe(callsAfterFirst); // no extra subprocess work
    expect(a.cached).toBe(false); // the first call is a real build
    expect(b.cached).toBe(true);
    expect(a.instanceId).toBe(b.instanceId);
  });

  it('caches per instance independently', async () => {
    failEveryExec();
    statMock.mockRejectedValue(enoent());

    const v = await getHermesTelemetrySnapshot('vijay');
    const b = await getHermesTelemetrySnapshot('bheem');

    expect(v.instanceId).toBe('vijay');
    expect(b.instanceId).toBe('bheem');
    // A warm `vijay` entry must not be served for the first `bheem` request.
    expect(b.cached).toBe(false);

    // Both entries survive side by side and are served from cache afterwards.
    const vAgain = await getHermesTelemetrySnapshot('vijay');
    const bAgain = await getHermesTelemetrySnapshot('bheem');
    expect(vAgain).toMatchObject({ instanceId: 'vijay', cached: true });
    expect(bAgain).toMatchObject({ instanceId: 'bheem', cached: true });
  });
});

View File

@ -0,0 +1,315 @@
import { execFile } from 'child_process';
import { promisify } from 'util';
import { readFile, stat } from 'fs/promises';
import { childLogger } from '../../lib/logger.js';
import type {
HermesBackupHistory,
HermesBackupHistoryEntry,
HermesCronEntry,
HermesCronList,
HermesInstanceId,
HermesMemoryList,
HermesSessionStats,
HermesSkillList,
HermesTelemetrySnapshot,
HermesWatchdogAlert,
HermesWatchdogFeed,
HermesWatchdogSeverity,
} from './types.js';
const execFileAsync = promisify(execFile);
const log = childLogger('hermes-telemetry/repository');
// Per-instance config. Mirrors `instances` in `hermes-ops/repository.ts`.
// Anything we shell out to as the live instance owner ("uma" for Bheem, root
// for Vijay) is wrapped in `runuser -u <user>` so the command runs in the
// owner's environment, not the backend's.
/** Static description of one Hermes instance and where its artifacts live. */
interface InstanceConfig {
// Instance identifier; also the key of this entry in `INSTANCES`.
id: HermesInstanceId;
user: string | null; // null → run as the backend's own user (root in prod)
// Path handed to `git -C` when reading backup history.
repoPath: string;
// Absolute path of the watchdog log that `readWatchdog` stats and reads.
watchdogLog: string;
}
// Lookup table from instance id to its configuration. Typing it as a `Record`
// over `HermesInstanceId` makes the compiler require one entry per id.
const INSTANCES: Record<HermesInstanceId, InstanceConfig> = {
// `user: null` — commands for this instance are executed directly.
vijay: {
id: 'vijay',
user: null,
repoPath: '/root/repos/bytelyst_hostinger_hermes_vm',
watchdogLog: '/root/.hermes/logs/hermes-health-watchdog.log',
},
// Commands for this instance are wrapped in `runuser -u uma --` by `execAs`.
bheem: {
id: 'bheem',
user: 'uma',
repoPath: '/home/uma/repos/uma_hostinger_hermes_vm',
watchdogLog: '/home/uma/.hermes/logs/hermes-health-watchdog.log',
},
};
/** Outcome of one shell-out performed through `exec`. */
interface ExecResult {
// Trimmed standard output; empty when the command could not be run.
stdout: string;
// False when `exec` decided the command never ran to completion (for example
// the binary is missing); true otherwise, including non-zero exits.
ran: boolean;
}
/**
 * Runs `command` with `args` and folds every failure into the returned
 * `ExecResult` instead of rejecting.
 *
 * @param command   Binary to execute (resolved via PATH by `execFile`).
 * @param args      Argument vector; no shell is involved.
 * @param cwd       Optional working directory.
 * @param timeoutMs Upper bound on run time before Node kills the child.
 * @returns `ran: false` when the probe produced no trustworthy output because
 *          the binary could not be started (`ENOENT`, `EACCES`) or Node killed
 *          the child (timeout or output-buffer overflow). Any other failure is
 *          treated as a completed run and whatever stdout the error carries is
 *          returned.
 */
async function exec(
  command: string,
  args: string[],
  cwd?: string,
  timeoutMs = 5000,
): Promise<ExecResult> {
  try {
    const { stdout } = await execFileAsync(command, args, { cwd, timeout: timeoutMs });
    return { stdout: stdout.trim(), ran: true };
  } catch (err) {
    const e = err as NodeJS.ErrnoException & {
      code?: string | number;
      stdout?: string;
      killed?: boolean;
    };
    // The binary is missing or not executable: nothing ran at all.
    const couldNotStart = e?.code === 'ENOENT' || e?.code === 'EACCES';
    // `execFile` does not report a timeout through `code`; it kills the child
    // and marks the error with `killed: true`. `ETIMEDOUT` is kept for callers
    // or mocks that surface timeouts that way.
    const cutShort = e?.killed === true || e?.code === 'ETIMEDOUT';
    if (couldNotStart || cutShort) {
      return { stdout: '', ran: false };
    }
    // A non-zero exit is still useful — `git log` on an empty repo, etc.
    return { stdout: (e?.stdout ?? '').toString().trim(), ran: true };
  }
}
// Wrap a command in `runuser -u <user> --` when needed so it runs in the
// instance owner's environment (PATH, ~/.hermes config). For the local
// instance (user=null) we just call the binary directly.
/**
 * Executes a command on behalf of an instance.
 *
 * When the instance names an owning user, the command line is prefixed with
 * `runuser -u <user> --`; otherwise the binary is invoked as-is. Both paths
 * delegate to `exec`, so the result follows the same never-rejecting contract.
 */
async function execAs(
  inst: InstanceConfig,
  command: string,
  args: string[],
  timeoutMs = 5000,
): Promise<ExecResult> {
  const owner = inst.user;
  if (owner) {
    const wrapped = ['-u', owner, '--', command, ...args];
    return exec('runuser', wrapped, undefined, timeoutMs);
  }
  return exec(command, args, undefined, timeoutMs);
}
// --- Sessions ---------------------------------------------------------------
//
// The Hermes CLI exposes `hermes sessions stats --json` (when present). When
// it's not available we report status:'unknown' rather than fabricating zeros.
/**
 * Reads session/message totals from `hermes sessions stats --json`.
 *
 * Reports `status: 'unknown'` (with zeroed totals) whenever the numbers cannot
 * be trusted: the command did not run, its output is not JSON, the JSON is not
 * a plain object, or a total is not a finite number. Fields that are simply
 * absent from an otherwise valid object default to 0.
 */
async function readSessionStats(inst: InstanceConfig): Promise<HermesSessionStats> {
  const unavailable: HermesSessionStats = { totalSessions: 0, totalMessages: 0, status: 'unknown' };

  const result = await execAs(inst, 'hermes', ['sessions', 'stats', '--json']);
  if (!result.ran) return unavailable;

  try {
    const parsed: unknown = JSON.parse(result.stdout);
    // Arrays and primitives are valid JSON but carry no stats; treating them
    // as "0 sessions, up" would fabricate data.
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      return unavailable;
    }
    const { sessions, messages } = parsed as { sessions?: unknown; messages?: unknown };
    const totalSessions = Number(sessions ?? 0);
    const totalMessages = Number(messages ?? 0);
    // NaN/Infinity would fail response validation and take the whole snapshot
    // down with a 500; degrade just this section instead.
    if (!Number.isFinite(totalSessions) || !Number.isFinite(totalMessages)) {
      return unavailable;
    }
    return { totalSessions, totalMessages, status: 'up' };
  } catch {
    return unavailable;
  }
}
// --- Cron -------------------------------------------------------------------
//
// `hermes cron list --json` is the canonical source. It's distinct from
// systemd timers (which `hermes-ops` already covers) — this is for Hermes'
// own scheduled tasks (memory rotations, telegram digests, etc).
/**
 * Lists Hermes' own scheduled tasks via `hermes cron list --json`.
 *
 * Each row is normalised to `HermesCronEntry`, accepting both snake_case and
 * camelCase keys for the run/status fields. Any failure to run the command or
 * to interpret its output yields an empty list with `status: 'unknown'`.
 */
async function readCron(inst: InstanceConfig): Promise<HermesCronList> {
  const { ran, stdout } = await execAs(inst, 'hermes', ['cron', 'list', '--json']);
  if (!ran) return { entries: [], status: 'unknown' };

  // First truthy candidate rendered as a string, or null when none is truthy.
  const textOrNull = (...candidates: unknown[]): string | null => {
    const hit = candidates.find(Boolean);
    return hit ? String(hit) : null;
  };

  try {
    const rows = JSON.parse(stdout) as Array<Record<string, unknown>>;
    const entries: HermesCronEntry[] = rows.map((row) => {
      const isActive = row.active ?? row.enabled ?? true;
      return {
        id: String(row.id ?? row.name ?? ''),
        name: String(row.name ?? row.id ?? ''),
        schedule: textOrNull(row.schedule),
        lastRun: textOrNull(row.last_run, row.lastRun),
        nextRun: textOrNull(row.next_run, row.nextRun),
        lastStatus: textOrNull(row.last_status, row.lastStatus),
        active: Boolean(isActive),
      };
    });
    return { entries, status: 'up' };
  } catch {
    return { entries: [], status: 'unknown' };
  }
}
// --- Memory + skills --------------------------------------------------------
/**
 * Lists memory items via `hermes memory list --json`.
 *
 * Rows are normalised to the snapshot's memory-item shape with string
 * fallbacks for missing fields. If the command did not run, or its output
 * cannot be parsed and mapped, the result is an empty list with
 * `status: 'unknown'`.
 */
async function readMemory(inst: InstanceConfig): Promise<HermesMemoryList> {
  const { ran, stdout } = await execAs(inst, 'hermes', ['memory', 'list', '--json']);
  if (!ran) return { items: [], status: 'unknown' };

  try {
    const rows = JSON.parse(stdout) as Array<Record<string, unknown>>;
    const items = rows.map((row) => {
      // Truthiness (not nullishness) decides here: an empty snake_case value
      // falls through to the camelCase one.
      const stamp = row.updated_at || row.updatedAt;
      return {
        id: String(row.id ?? ''),
        type: String(row.type ?? 'note'),
        key: String(row.key ?? row.name ?? ''),
        summary: String(row.summary ?? row.value ?? ''),
        updatedAt: stamp ? String(stamp) : null,
      };
    });
    return { items, status: 'up' };
  } catch {
    return { items: [], status: 'unknown' };
  }
}
/**
 * Lists skills via `hermes skills list --json`.
 *
 * `id` and `name` fall back to each other when one is missing, and a skill
 * with no `enabled` field counts as enabled. If the command did not run, or
 * its output cannot be parsed and mapped, the result is an empty list with
 * `status: 'unknown'`.
 */
async function readSkills(inst: InstanceConfig): Promise<HermesSkillList> {
  const { ran, stdout } = await execAs(inst, 'hermes', ['skills', 'list', '--json']);
  if (!ran) return { items: [], status: 'unknown' };

  try {
    const rows = JSON.parse(stdout) as Array<Record<string, unknown>>;
    const items = rows.map((row) => {
      const enabledFlag = row.enabled ?? true;
      return {
        id: String(row.id ?? row.name ?? ''),
        name: String(row.name ?? row.id ?? ''),
        description: String(row.description ?? ''),
        enabled: Boolean(enabledFlag),
      };
    });
    return { items, status: 'up' };
  } catch {
    return { items: [], status: 'unknown' };
  }
}
// --- Watchdog ---------------------------------------------------------------
//
// Tail the last N lines of the watchdog log and bucket them by severity.
// The log format used by `scripts/hermes-health-watchdog.py` today is roughly:
// 2026-01-01T12:34:56 WARNING gateway is degraded: ...
// 2026-01-01T12:35:01 CRITICAL backup repo HEAD missing
// We accept any RFC3339-ish leading timestamp and a severity word.
// Capture groups: (1) timestamp — date and time joined by `T` or a space, with
// optional fractional digits and an optional `Z` / numeric UTC offset;
// (2) a single word, later interpreted as the severity; (3) the rest of the
// line. Lines that do not match the whole pattern yield no match.
const WATCHDOG_LINE = /^(\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:[.\d]+)?(?:Z|[+-]\d{2}:?\d{2})?)\s+(\w+)\s+(.*)$/;
/**
 * Maps a log-level word onto the three severities the snapshot exposes.
 *
 * Matching is case-insensitive. CRITICAL/ERROR/FATAL become `critical`,
 * WARNING/WARN become `warn`, and every other token (including an empty one)
 * is reported as `info`.
 */
function normalizeSeverity(token: string): HermesWatchdogSeverity {
  switch (token.toUpperCase()) {
    case 'CRITICAL':
    case 'ERROR':
    case 'FATAL':
      return 'critical';
    case 'WARNING':
    case 'WARN':
      return 'warn';
    default:
      return 'info';
  }
}
/**
 * Turns the tail of the instance's watchdog log into severity-tagged alerts.
 *
 * The log is first stat'ed; if that fails the feed is `unknown` without any
 * logging. Otherwise the file is read as UTF-8, blank lines are dropped, and
 * only the last 50 remaining lines are considered. Lines that do not match
 * `WATCHDOG_LINE` are skipped silently. A read failure after a successful stat
 * is logged as a warning and also reported as `unknown`.
 */
async function readWatchdog(inst: InstanceConfig): Promise<HermesWatchdogFeed> {
  const source = inst.watchdogLog;

  try {
    await stat(source);
  } catch {
    return { alerts: [], source, status: 'unknown' };
  }

  try {
    const content = await readFile(source, 'utf8');
    // Anything older than the last 50 entries isn't useful for the panel.
    const recent = content
      .split('\n')
      .filter(Boolean)
      .slice(-50);
    const alerts = recent.flatMap((line): HermesWatchdogAlert[] => {
      const parts = WATCHDOG_LINE.exec(line);
      if (!parts) return [];
      const [, timestamp, level, text] = parts;
      return [{ timestamp, severity: normalizeSeverity(level), message: text.trim() }];
    });
    return { alerts, source, status: 'up' };
  } catch (err) {
    log.warn({ err, instance: inst.id, source }, 'failed to read watchdog log');
    return { alerts: [], source, status: 'unknown' };
  }
}
// --- Backup history ---------------------------------------------------------
//
// Cheap proxy for "is the backup pipeline alive": last N commits on the
// backup repo. `git -C <repo> log --pretty=...` runs as the backend user;
// repos are world-readable on the live host.
/**
 * Reads up to 20 commits from the instance's backup repo with `git log`.
 *
 * git is asked for `sha`, committer date and subject separated by the \x1f
 * unit separator, one commit per line. The section is `unknown` only when
 * `exec` reports that git did not run; an empty stdout is reported as `up`
 * with no entries.
 *
 * NOTE(review): because `exec` treats other git failures as a completed run,
 * a missing or non-git `repoPath` presumably also ends up as `up`/empty here —
 * confirm whether that is intended.
 */
async function readBackupHistory(inst: InstanceConfig): Promise<HermesBackupHistory> {
  const { repoPath } = inst;
  const gitArgs = ['-C', repoPath, 'log', '--pretty=format:%H\x1f%cI\x1f%s', '-n', '20'];

  const { ran, stdout } = await exec('git', gitArgs);
  if (!ran) return { entries: [], repoPath, status: 'unknown' };
  if (!stdout) return { entries: [], repoPath, status: 'up' };

  const entries = stdout.split('\n').flatMap((line): HermesBackupHistoryEntry[] => {
    const [sha, committedAt, ...subjectParts] = line.split('\x1f');
    // Lines without both a sha and a date are dropped.
    if (!sha || !committedAt) return [];
    // Re-join so a subject that itself contains \x1f is kept intact.
    return [{ sha, committedAt, subject: subjectParts.join('\x1f') }];
  });
  return { entries, repoPath, status: 'up' };
}
// --- Snapshot assembly ------------------------------------------------------
// Maximum age, in milliseconds, at which a stored snapshot is still served.
const CACHE_TTL = 30000;
// Latest built snapshot per instance, with the `Date.now()` value at store time.
const cache = new Map<HermesInstanceId, { snapshot: HermesTelemetrySnapshot; at: number }>();
// Builds in progress per instance; non-forced callers await the same promise.
const inflight = new Map<HermesInstanceId, Promise<HermesTelemetrySnapshot>>();
/**
 * Runs all six probes for one instance concurrently and assembles the result.
 *
 * Every section whose status came back `unknown` contributes one line to
 * `warnings`, in probe order. The returned snapshot always has
 * `cached: false`; the caller decides whether to relabel it.
 */
async function buildSnapshot(instanceId: HermesInstanceId): Promise<HermesTelemetrySnapshot> {
  const inst = INSTANCES[instanceId];
  const [sessions, cron, memory, skills, watchdog, backupHistory] = await Promise.all([
    readSessionStats(inst),
    readCron(inst),
    readMemory(inst),
    readSkills(inst),
    readWatchdog(inst),
    readBackupHistory(inst),
  ]);

  // [probe failed?, message to surface] — order here is the order in `warnings`.
  const checks: ReadonlyArray<readonly [boolean, string]> = [
    [
      sessions.status === 'unknown',
      `${instanceId}: hermes sessions stats unavailable (CLI missing or non-zero exit)`,
    ],
    [cron.status === 'unknown', `${instanceId}: hermes cron list unavailable`],
    [memory.status === 'unknown', `${instanceId}: hermes memory list unavailable`],
    [skills.status === 'unknown', `${instanceId}: hermes skills list unavailable`],
    [
      watchdog.status === 'unknown',
      `${instanceId}: watchdog log not readable at ${watchdog.source ?? 'unknown path'}`,
    ],
    [
      backupHistory.status === 'unknown',
      `${instanceId}: backup repo not readable at ${backupHistory.repoPath ?? 'unknown path'}`,
    ],
  ];
  const warnings: string[] = checks.filter(([failed]) => failed).map(([, text]) => text);

  const generatedAt = new Date().toISOString();
  return {
    generatedAt,
    cached: false,
    instanceId,
    sessions,
    cron,
    memory,
    skills,
    watchdog,
    backupHistory,
    warnings,
  };
}
/**
 * Returns the telemetry snapshot for an instance, reusing recent work.
 *
 * Unless `options.force` is set, a stored snapshot younger than `CACHE_TTL` is
 * returned as a copy flagged `cached: true`, and a build already in progress
 * for the same instance is awaited instead of starting another. A forced call
 * always starts its own build and is not registered for sharing, but its
 * result is still stored in the cache.
 *
 * @param instanceId Instance to probe.
 * @param options    `force: true` bypasses both the cache and in-flight sharing.
 */
export async function getHermesTelemetrySnapshot(
  instanceId: HermesInstanceId,
  options?: { force?: boolean },
): Promise<HermesTelemetrySnapshot> {
  const force = options?.force ?? false;

  if (!force) {
    const stored = cache.get(instanceId);
    if (stored && Date.now() - stored.at < CACHE_TTL) {
      return { ...stored.snapshot, cached: true };
    }
    const running = inflight.get(instanceId);
    if (running) return running;
  }

  const remember = (snapshot: HermesTelemetrySnapshot): HermesTelemetrySnapshot => {
    cache.set(instanceId, { snapshot, at: Date.now() });
    return snapshot;
  };

  const job = buildSnapshot(instanceId)
    .then(remember)
    .finally(() => {
      // Only unregister our own entry; a newer build may have replaced it.
      if (inflight.get(instanceId) === job) inflight.delete(instanceId);
    });

  if (!force) inflight.set(instanceId, job);
  return job;
}
// Test hook so `vitest` cases don't bleed cached state across runs.
/** Drops every stored snapshot and every in-flight registration. */
export function clearHermesTelemetryCache(): void {
  inflight.clear();
  cache.clear();
}

View File

@ -0,0 +1,36 @@
import type { FastifyInstance } from 'fastify';
import { z } from 'zod';
import { getHermesTelemetrySnapshot } from './repository.js';
import { HermesInstanceIdSchema, HermesTelemetrySnapshotSchema } from './types.js';
import { requireAdmin } from '../../lib/auth.js';
const ParamsSchema = z.object({ instance: HermesInstanceIdSchema });
/**
 * Registers `GET /hermes/telemetry/:instance` on the given Fastify instance.
 *
 * Responses:
 * - 400 `{ error, detail }` when `:instance` fails `ParamsSchema`.
 * - 200 with the snapshot once it has passed `HermesTelemetrySnapshotSchema`.
 * - 500 `{ error }` when building or validating the snapshot throws; the
 *   error is logged through `fastify.log`.
 */
export async function hermesTelemetryRoutes(fastify: FastifyInstance) {
  // Admin-only: this endpoint shells out to the `hermes` CLI in the instance
  // owner's environment (`runuser -u uma --` for Bheem) and reads the watchdog
  // log + backup repo, so it is gated like the VM/system endpoints. See
  // `dashboard/DEPLOYMENT.md` Privilege Surface.
  fastify.get(
    '/hermes/telemetry/:instance',
    { preHandler: async (req) => requireAdmin(req) },
    async (req, reply) => {
      const parsedParams = ParamsSchema.safeParse(req.params);
      if (!parsedParams.success) {
        return reply
          .code(400)
          .send({ error: 'Invalid instance', detail: parsedParams.error.message });
      }

      try {
        const snapshot = await getHermesTelemetrySnapshot(parsedParams.data.instance);
        // Re-validating our own payload turns a shape regression into a 500
        // here rather than a corrupt UI state — same approach as hermes-ops.
        return reply.send(HermesTelemetrySnapshotSchema.parse(snapshot));
      } catch (err) {
        fastify.log.error(err, 'failed to build hermes telemetry snapshot');
        return reply.code(500).send({ error: 'Failed to build hermes telemetry snapshot' });
      }
    },
  );
}

View File

@ -0,0 +1,118 @@
import { z } from 'zod';
import { ProbeStatusSchema } from '../hermes-ops/types.js';
// Hermes telemetry snapshot — read-only "real artifacts" per Phase 3 Decision #1
// (sessions, cron, memory, skills, watchdog alerts, backup history).
// Each section carries its own `ProbeStatus` so the UI can distinguish
// "definitely empty" from "couldn't read the source" (CLI missing, permission
// denied, timed out). Mirrors the hermes-ops shape: every field set the UI
// renders has a status it can surface.
/** The closed set of Hermes instance ids this module accepts. */
export const HermesInstanceIdSchema = z.enum(['vijay', 'bheem']);
/** Instance id type inferred from {@link HermesInstanceIdSchema}. */
export type HermesInstanceId = z.infer<typeof HermesInstanceIdSchema>;
/**
 * Aggregate session/message counts for one instance. The totals are plain
 * numbers even when the probe failed, so `status` must be consulted before
 * they are treated as real.
 */
export const HermesSessionStatsSchema = z.object({
totalSessions: z.number(),
totalMessages: z.number(),
status: ProbeStatusSchema,
});
/** Type inferred from {@link HermesSessionStatsSchema}. */
export type HermesSessionStats = z.infer<typeof HermesSessionStatsSchema>;
/**
 * One scheduled task. Schedule, run times and last status are strings passed
 * through as-is (no format is enforced here) and are null when absent.
 */
export const HermesCronEntrySchema = z.object({
id: z.string(),
name: z.string(),
schedule: z.string().nullable(),
lastRun: z.string().nullable(),
nextRun: z.string().nullable(),
lastStatus: z.string().nullable(),
active: z.boolean(),
});
/** Type inferred from {@link HermesCronEntrySchema}. */
export type HermesCronEntry = z.infer<typeof HermesCronEntrySchema>;
/** Cron section of the snapshot: the entries plus the probe's own status. */
export const HermesCronListSchema = z.object({
entries: z.array(HermesCronEntrySchema),
status: ProbeStatusSchema,
});
/** Type inferred from {@link HermesCronListSchema}. */
export type HermesCronList = z.infer<typeof HermesCronListSchema>;
/** One memory item; `updatedAt` is null when no timestamp is available. */
export const HermesMemoryItemSchema = z.object({
id: z.string(),
type: z.string(),
key: z.string(),
summary: z.string(),
updatedAt: z.string().nullable(),
});
/** Type inferred from {@link HermesMemoryItemSchema}. */
export type HermesMemoryItem = z.infer<typeof HermesMemoryItemSchema>;
/** Memory section of the snapshot: the items plus the probe's own status. */
export const HermesMemoryListSchema = z.object({
items: z.array(HermesMemoryItemSchema),
status: ProbeStatusSchema,
});
/** Type inferred from {@link HermesMemoryListSchema}. */
export type HermesMemoryList = z.infer<typeof HermesMemoryListSchema>;
/** One skill known to the instance, with its enabled flag. */
export const HermesSkillItemSchema = z.object({
id: z.string(),
name: z.string(),
description: z.string(),
enabled: z.boolean(),
});
/** Type inferred from {@link HermesSkillItemSchema}. */
export type HermesSkillItem = z.infer<typeof HermesSkillItemSchema>;
/** Skills section of the snapshot: the items plus the probe's own status. */
export const HermesSkillListSchema = z.object({
items: z.array(HermesSkillItemSchema),
status: ProbeStatusSchema,
});
/** Type inferred from {@link HermesSkillListSchema}. */
export type HermesSkillList = z.infer<typeof HermesSkillListSchema>;
// Severity is one of `info | warn | critical` so the UI can colour-code.
// Watchdog scripts emit "WARNING" / "CRITICAL" prefixes today; those are
// normalized to this three-value set before they reach the snapshot.
export const HermesWatchdogSeveritySchema = z.enum(['info', 'warn', 'critical']);
/** Type inferred from {@link HermesWatchdogSeveritySchema}. */
export type HermesWatchdogSeverity = z.infer<typeof HermesWatchdogSeveritySchema>;
/**
 * One watchdog log line split into timestamp, severity and message. The
 * timestamp is kept as the string found in the log; it is not validated here.
 */
export const HermesWatchdogAlertSchema = z.object({
timestamp: z.string(),
severity: HermesWatchdogSeveritySchema,
message: z.string(),
});
/** Type inferred from {@link HermesWatchdogAlertSchema}. */
export type HermesWatchdogAlert = z.infer<typeof HermesWatchdogAlertSchema>;
/** Watchdog section of the snapshot: parsed alerts, their source, and status. */
export const HermesWatchdogFeedSchema = z.object({
alerts: z.array(HermesWatchdogAlertSchema),
// Path the alerts were read from (or where they would be read from when
// the source becomes available). Null when no canonical path is known.
source: z.string().nullable(),
status: ProbeStatusSchema,
});
/** Type inferred from {@link HermesWatchdogFeedSchema}. */
export type HermesWatchdogFeed = z.infer<typeof HermesWatchdogFeedSchema>;
/** One commit from the backup repo: hash, commit date string, and subject. */
export const HermesBackupHistoryEntrySchema = z.object({
sha: z.string(),
committedAt: z.string(),
subject: z.string(),
});
/** Type inferred from {@link HermesBackupHistoryEntrySchema}. */
export type HermesBackupHistoryEntry = z.infer<typeof HermesBackupHistoryEntrySchema>;
/** Backup-history section of the snapshot: commits, repo path, and status. */
export const HermesBackupHistorySchema = z.object({
entries: z.array(HermesBackupHistoryEntrySchema),
// Repo path probed (informational; useful when status is `unknown`).
repoPath: z.string().nullable(),
status: ProbeStatusSchema,
});
/** Type inferred from {@link HermesBackupHistorySchema}. */
export type HermesBackupHistory = z.infer<typeof HermesBackupHistorySchema>;
/**
 * Full per-instance telemetry payload. Every section is always present and
 * carries its own status, so a failed probe changes values, not the shape.
 */
export const HermesTelemetrySnapshotSchema = z.object({
// Time the snapshot was built, as a string.
generatedAt: z.string(),
// True when this payload was served from the short-TTL cache.
cached: z.boolean(),
instanceId: HermesInstanceIdSchema,
sessions: HermesSessionStatsSchema,
cron: HermesCronListSchema,
memory: HermesMemoryListSchema,
skills: HermesSkillListSchema,
watchdog: HermesWatchdogFeedSchema,
backupHistory: HermesBackupHistorySchema,
// Roll-up of any "couldn't tell" / probe-failed conditions; the UI renders
// these inline without changing the structural shape of the snapshot.
warnings: z.array(z.string()),
});
/** Type inferred from {@link HermesTelemetrySnapshotSchema}. */
export type HermesTelemetrySnapshot = z.infer<typeof HermesTelemetrySnapshotSchema>;

View File

@ -15,6 +15,7 @@ import { azureConfigRoutes } from './modules/azure-config/routes.js';
import { codeQualityRoutes } from './modules/code-quality/routes.js';
import { cosmosConfigRoutes } from './modules/cosmos-config/routes.js';
import { hermesOpsRoutes } from './modules/hermes-ops/routes.js';
import { hermesTelemetryRoutes } from './modules/hermes-telemetry/routes.js';
import { vmRoutes } from './modules/vm/routes.js';
import rateLimit from '@fastify/rate-limit';
import swagger from '@fastify/swagger';
@ -277,6 +278,7 @@ await fastify.register(azureConfigRoutes, { prefix: '/api' });
await fastify.register(codeQualityRoutes, { prefix: '/api' });
await fastify.register(cosmosConfigRoutes, { prefix: '/api' });
await fastify.register(hermesOpsRoutes, { prefix: '/api' });
await fastify.register(hermesTelemetryRoutes, { prefix: '/api' });
await fastify.register(vmRoutes, { prefix: '/api' });
// Start server

View File

@ -17,6 +17,7 @@ export default defineConfig({
'src/lib/csrf.ts',
'src/modules/health/repository.ts',
'src/modules/hermes-ops/repository.ts',
'src/modules/hermes-telemetry/repository.ts',
'src/modules/deployments/orchestrator.ts',
'src/modules/services/repository.ts',
],

View File

@ -117,6 +117,98 @@ export interface HermesOpsLink {
description: string;
}
// --- Hermes telemetry (Phase 3) ---------------------------------------------
// Per-instance read-only telemetry: sessions, cron, memory/skills, watchdog
// alerts, backup history. Probe sources (`hermes` CLI, watchdog log, backup
// repo) may be unavailable on a given host; each section carries its own
// `status` so the UI can show "definitely empty" vs "couldn't read".
/** Result of probing one telemetry source; every snapshot section has one. */
export type HermesProbeStatus = 'up' | 'down' | 'unknown';
/** Session/message totals; check `status` before trusting the numbers. */
export interface HermesSessionStats {
totalSessions: number;
totalMessages: number;
status: HermesProbeStatus;
}
/** One scheduled task; nullable fields are null when the value is absent. */
export interface HermesCronEntry {
id: string;
name: string;
schedule: string | null;
lastRun: string | null;
nextRun: string | null;
lastStatus: string | null;
active: boolean;
}
/** Cron section of the telemetry snapshot. */
export interface HermesCronList {
entries: HermesCronEntry[];
status: HermesProbeStatus;
}
/** One memory item; `updatedAt` is null when no timestamp is available. */
export interface HermesMemoryItem {
id: string;
type: string;
key: string;
summary: string;
updatedAt: string | null;
}
/** Memory section of the telemetry snapshot. */
export interface HermesMemoryList {
items: HermesMemoryItem[];
status: HermesProbeStatus;
}
/** One skill known to the instance, with its enabled flag. */
export interface HermesSkillItem {
id: string;
name: string;
description: string;
enabled: boolean;
}
/** Skills section of the telemetry snapshot. */
export interface HermesSkillList {
items: HermesSkillItem[];
status: HermesProbeStatus;
}
/** Severity levels a watchdog alert can carry. */
export type HermesWatchdogSeverity = 'info' | 'warn' | 'critical';
/** One watchdog alert: timestamp string, severity, and message text. */
export interface HermesWatchdogAlert {
timestamp: string;
severity: HermesWatchdogSeverity;
message: string;
}
/** Watchdog section of the telemetry snapshot. */
export interface HermesWatchdogFeed {
alerts: HermesWatchdogAlert[];
// Path of the log the alerts come from; may be null.
source: string | null;
status: HermesProbeStatus;
}
/** One commit from the backup repo: hash, commit date string, and subject. */
export interface HermesBackupHistoryEntry {
sha: string;
committedAt: string;
subject: string;
}
/** Backup-history section of the telemetry snapshot. */
export interface HermesBackupHistory {
entries: HermesBackupHistoryEntry[];
// Path of the repo that was probed; may be null.
repoPath: string | null;
status: HermesProbeStatus;
}
/**
 * Response body of `GET /api/hermes/telemetry/:instance`. All sections are
 * always present; a section that could not be read reports it via `status`.
 */
export interface HermesTelemetrySnapshot {
generatedAt: string;
// True when the backend served this payload from its cache.
cached: boolean;
instanceId: 'vijay' | 'bheem';
sessions: HermesSessionStats;
cron: HermesCronList;
memory: HermesMemoryList;
skills: HermesSkillList;
watchdog: HermesWatchdogFeed;
backupHistory: HermesBackupHistory;
// Human-readable notes about sections that could not be read.
warnings: string[];
}
export interface HermesOpsSnapshot {
generatedAt: string;
tailscaleIp: string | null;
@ -284,6 +376,13 @@ export const api = {
// Hermes operations
getHermesOps: () => apiRequest<HermesOpsSnapshot>('/api/hermes/ops'),
// Hermes per-instance telemetry (Phase 3 — sessions/cron/memory/skills/
// watchdog/backup-history). Returns a Zod-validated snapshot from the
// backend; sections may report status:'unknown' if their underlying
// source isn't readable in the current environment (CI / dev box).
getHermesTelemetry: (instance: 'vijay' | 'bheem') =>
apiRequest<HermesTelemetrySnapshot>(`/api/hermes/telemetry/${instance}`),
// Seed
seedServices: () => apiRequest<{ message: string }>('/api/seed', { method: 'POST' }),