From 02b362399b17cc78766076d994dac35dd4abc4de Mon Sep 17 00:00:00 2001 From: Hermes VM Date: Sun, 31 May 2026 08:28:12 +0000 Subject: [PATCH] feat: complete hermes telemetry dashboard wiring --- dashboard/DEPLOYMENT.md | 4 + .../backend/src/lib/dashboard-alerts.test.ts | 44 +++++ dashboard/backend/src/lib/dashboard-alerts.ts | 48 +++++ .../src/modules/hermes-ops/hermes-ops.test.ts | 43 +++++ .../src/modules/hermes-ops/repository.ts | 51 +++++- .../hermes-telemetry/hermes-telemetry.test.ts | 35 +++- .../modules/hermes-telemetry/repository.ts | 165 +++++++++++++++++- .../src/modules/hermes-telemetry/types.ts | 43 +++++ dashboard/docker-compose.yml | 2 + dashboard/web/e2e/hermes.spec.ts | 103 ++++++++++- dashboard/web/src/app/hermes/agents/page.tsx | 17 +- dashboard/web/src/app/hermes/history/page.tsx | 121 ++++++++++++- dashboard/web/src/app/hermes/page.tsx | 112 +++++++++++- .../web/src/app/hermes/tasks/[id]/page.tsx | 83 ++++++++- dashboard/web/src/app/hermes/tasks/page.tsx | 140 ++++++++++++++- dashboard/web/src/lib/api.ts | 39 +++++ .../web/src/lib/hermes-telemetry-client.ts | 54 ++++++ docs/app-url-bookmarks.md | 98 +++++++++++ docs/hermes-operations.md | 134 ++++++++++++++ docs/hermes_dashboard_v2_roadmap.md | 36 ++-- docs/operations.md | 5 + scripts/hermes-health-watchdog.py | 117 ++++++++++++- scripts/hermes-ops-exporter.py | 143 +++++++++++++++ systemd/hermes-health-watchdog.service | 15 ++ systemd/hermes-health-watchdog.timer | 11 ++ systemd/hermes-ops-exporter.service | 12 ++ systemd/hermes-ops-exporter.timer | 11 ++ systemd/uma-hermes-health-watchdog.service | 18 ++ systemd/uma-hermes-health-watchdog.timer | 11 ++ systemd/uma-hermes-ops-exporter.service | 12 ++ systemd/uma-hermes-ops-exporter.timer | 11 ++ 31 files changed, 1695 insertions(+), 43 deletions(-) create mode 100644 dashboard/backend/src/lib/dashboard-alerts.test.ts create mode 100644 dashboard/backend/src/lib/dashboard-alerts.ts create mode 100644 
dashboard/web/src/lib/hermes-telemetry-client.ts create mode 100644 docs/app-url-bookmarks.md create mode 100755 scripts/hermes-ops-exporter.py create mode 100644 systemd/hermes-health-watchdog.service create mode 100644 systemd/hermes-health-watchdog.timer create mode 100644 systemd/hermes-ops-exporter.service create mode 100644 systemd/hermes-ops-exporter.timer create mode 100644 systemd/uma-hermes-health-watchdog.service create mode 100644 systemd/uma-hermes-health-watchdog.timer create mode 100644 systemd/uma-hermes-ops-exporter.service create mode 100644 systemd/uma-hermes-ops-exporter.timer diff --git a/dashboard/DEPLOYMENT.md b/dashboard/DEPLOYMENT.md index 07799a3..7e33464 100644 --- a/dashboard/DEPLOYMENT.md +++ b/dashboard/DEPLOYMENT.md @@ -10,6 +10,10 @@ This guide covers deploying both the DevOps Dashboard and Platform Admin Dashboa ## Public URLs +For the full living bookmark list across all ByteLyst apps, APIs, Hermes +dashboards, and last deploy timestamps, see +[`../docs/app-url-bookmarks.md`](../docs/app-url-bookmarks.md). 
+ - **DevOps Dashboard**: `https://devops.bytelyst.com` - **Admin Dashboard**: `https://admin.bytelyst.com` - **API Gateway**: `https://api.bytelyst.com` diff --git a/dashboard/backend/src/lib/dashboard-alerts.test.ts b/dashboard/backend/src/lib/dashboard-alerts.test.ts new file mode 100644 index 0000000..16e2967 --- /dev/null +++ b/dashboard/backend/src/lib/dashboard-alerts.test.ts @@ -0,0 +1,44 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest'; + +const appendFileMock = vi.hoisted(() => vi.fn()); +vi.mock('fs/promises', () => ({ appendFile: appendFileMock })); + +const { appendDashboardWarning, clearDashboardWarningDedupe } = await import('./dashboard-alerts.js'); + +describe('dashboard-alerts', () => { + beforeEach(() => { + vi.clearAllMocks(); + clearDashboardWarningDedupe(); + delete process.env.HERMES_DASHBOARD_ALERT_LOG; + }); + + it('does nothing when the alert log is not configured', async () => { + const wrote = await appendDashboardWarning({ severity: 'warn', instance: 'vijay', message: 'gateway down' }); + expect(wrote).toBe(false); + expect(appendFileMock).not.toHaveBeenCalled(); + }); + + it('writes a routed warning line when configured', async () => { + process.env.HERMES_DASHBOARD_ALERT_LOG = '/tmp/hermes-dashboard-warnings.log'; + const wrote = await appendDashboardWarning( + { severity: 'critical', instance: 'bheem', message: 'backup missing' }, + Date.parse('2026-05-31T07:00:00Z'), + ); + + expect(wrote).toBe(true); + expect(appendFileMock).toHaveBeenCalledWith( + '/tmp/hermes-dashboard-warnings.log', + '2026-05-31T07:00:00.000Z CRITICAL instance=bheem backup missing\n', + 'utf8', + ); + }); + + it('deduplicates for one hour and writes again after expiry', async () => { + process.env.HERMES_DASHBOARD_ALERT_LOG = '/tmp/hermes-dashboard-warnings.log'; + const input = { severity: 'warn' as const, instance: 'all' as const, message: 'shared warning' }; + expect(await appendDashboardWarning(input, 1_000)).toBe(true); + expect(await 
appendDashboardWarning(input, 2_000)).toBe(false); + expect(await appendDashboardWarning(input, 3_602_000)).toBe(true); + expect(appendFileMock).toHaveBeenCalledTimes(2); + }); +}); diff --git a/dashboard/backend/src/lib/dashboard-alerts.ts b/dashboard/backend/src/lib/dashboard-alerts.ts new file mode 100644 index 0000000..854b112 --- /dev/null +++ b/dashboard/backend/src/lib/dashboard-alerts.ts @@ -0,0 +1,48 @@ +import { appendFile } from 'fs/promises'; + +type AlertSeverity = 'info' | 'warn' | 'critical'; +type AlertInstance = 'vijay' | 'bheem' | 'all'; + +interface DashboardWarningInput { + severity: AlertSeverity; + instance: AlertInstance; + message: string; +} + +const DEDUPE_WINDOW_MS = 60 * 60 * 1000; +const recent = new Map(); + +function severityToken(severity: AlertSeverity): string { + if (severity === 'critical') return 'CRITICAL'; + if (severity === 'warn') return 'WARNING'; + return 'INFO'; +} + +function alertKey(input: DashboardWarningInput): string { + return `${input.severity}\0${input.instance}\0${input.message}`; +} + +function purgeExpired(now: number): void { + for (const [key, at] of recent) { + if (now - at > DEDUPE_WINDOW_MS) recent.delete(key); + } +} + +export async function appendDashboardWarning(input: DashboardWarningInput, now = Date.now()): Promise { + const logPath = process.env.HERMES_DASHBOARD_ALERT_LOG; + if (!logPath) return false; + + purgeExpired(now); + const key = alertKey(input); + const previous = recent.get(key); + if (previous && now - previous <= DEDUPE_WINDOW_MS) return false; + + recent.set(key, now); + const line = `${new Date(now).toISOString()} ${severityToken(input.severity)} instance=${input.instance} ${input.message}\n`; + await appendFile(logPath, line, 'utf8'); + return true; +} + +export function clearDashboardWarningDedupe(): void { + recent.clear(); +} diff --git a/dashboard/backend/src/modules/hermes-ops/hermes-ops.test.ts b/dashboard/backend/src/modules/hermes-ops/hermes-ops.test.ts index 
50a740e..200e3d7 100644 --- a/dashboard/backend/src/modules/hermes-ops/hermes-ops.test.ts +++ b/dashboard/backend/src/modules/hermes-ops/hermes-ops.test.ts @@ -146,6 +146,49 @@ describe('hermes-ops repository', () => { expect(bheem.gateway.status).toBe('up'); }); + it('prefers a sanitized per-instance ops export when one is present', async () => { + setExec(healthyHandler()); + readFileMock.mockImplementation(async (p: string) => { + if (p === '/home/uma/.hermes/ops-export.json') { + return JSON.stringify({ + gateway: { active: false, enabled: true, status: 'down' }, + dashboard: { active: false, status: 'down' }, + backupTimer: { + name: 'uma-hermes-backup.timer', + active: false, + status: 'down', + nextRun: null, + lastRun: null, + }, + repo: { + path: '/home/uma/repos/uma_hostinger_hermes_vm', + branch: 'main', + clean: true, + head: 'export1', + lastCommitAt: '2026-05-31T00:00:00Z', + size: '1M', + status: 'up', + }, + restoredFileCount: 42, + restoredCronJobs: 3, + googleWorkspaceToken: true, + }); + } + if (p.endsWith('MANIFEST.json')) return JSON.stringify({ files: [1, 2, 3] }); + if (p.endsWith('jobs.json')) return JSON.stringify({ jobs: [{ id: 'a' }, { id: 'b' }] }); + throw new Error('no such file'); + }); + + const snapshot = await getHermesOpsSnapshot({ force: true }); + const bheem = snapshot.instances.find((i) => i.id === 'bheem')!; + expect(bheem.gateway.status).toBe('down'); + expect(bheem.dashboard.status).toBe('down'); + expect(bheem.backup.repo.head).toBe('export1'); + expect(bheem.backup.restoredFileCount).toBe(42); + expect(bheem.backup.restoredCronJobs).toBe(3); + expect(bheem.google.workspaceToken).toBe(true); + }); + it('reports unknown repo status when git cannot be read', async () => { setExec((command, args) => { if (command === 'git') return enoentError(); diff --git a/dashboard/backend/src/modules/hermes-ops/repository.ts b/dashboard/backend/src/modules/hermes-ops/repository.ts index 184f10e..b493982 100644 --- 
a/dashboard/backend/src/modules/hermes-ops/repository.ts +++ b/dashboard/backend/src/modules/hermes-ops/repository.ts @@ -2,6 +2,7 @@ import { execFile } from 'child_process'; import { promisify } from 'util'; import { readFile, stat } from 'fs/promises'; import { existsSync } from 'fs'; +import { appendDashboardWarning } from '../../lib/dashboard-alerts.js'; import type { HermesOpsCronJob, HermesOpsInstance, @@ -31,6 +32,7 @@ const instances = [ dashboardPort: 9119, backupTimer: 'hermes-root-backup.timer', repoPath: '/root/repos/bytelyst_hostinger_hermes_vm', + opsExportPath: '/root/.hermes/ops-export.json', driveFolder: 'Vijay Drive', }, { @@ -43,10 +45,21 @@ const instances = [ dashboardPort: 9120, backupTimer: 'uma-hermes-backup.timer', repoPath: '/home/uma/repos/uma_hostinger_hermes_vm', + opsExportPath: '/home/uma/.hermes/ops-export.json', driveFolder: 'Bheem Drive', }, ]; +interface OpsExport { + gateway?: { active?: boolean; enabled?: boolean; status?: ProbeStatus }; + dashboard?: { active?: boolean; status?: ProbeStatus }; + backupTimer?: HermesOpsTimer; + repo?: HermesOpsRepo; + restoredFileCount?: number | null; + restoredCronJobs?: number | null; + googleWorkspaceToken?: boolean; +} + interface ExecResult { // Trimmed stdout. Present even when the command exited non-zero (e.g. // `systemctl is-active` prints "inactive" and exits 3). @@ -223,6 +236,15 @@ async function tokenExists(path: string): Promise { } } +async function readOpsExport(path: string): Promise { + try { + const parsed = JSON.parse(await readFile(path, 'utf8')) as OpsExport; + return parsed && typeof parsed === 'object' ? 
parsed : null; + } catch { + return null; + } +} + async function getTailscaleIp(): Promise { const result = await exec('tailscale', ['ip', '-4']); if (!result.ran) return null; @@ -246,11 +268,12 @@ async function buildSnapshot(): Promise { const results: HermesOpsInstance[] = []; for (const item of instances) { + const opsExport = await readOpsExport(item.opsExportPath); const gatewayActiveCheck = item.gatewayKind === 'uma-user' ? probeUmaGatewayActive() : probeSystemActive(item.gatewayService); const gatewayEnabledCheck = item.gatewayKind === 'uma-user' ? probeUmaGatewayEnabled() : probeSystemEnabled(item.gatewayService); - const [gateway, gatewayEnabled, dashboard, backupTimer, repo, stats, googleToken] = await Promise.all([ + const [probedGateway, probedGatewayEnabled, probedDashboard, probedBackupTimer, probedRepo, probedStats, probedGoogleToken] = await Promise.all([ gatewayActiveCheck, gatewayEnabledCheck, probeSystemActive(item.dashboardService), @@ -259,6 +282,22 @@ async function buildSnapshot(): Promise { manifestStats(`${item.repoPath}/hermes_persistent_backup`), tokenExists(`${item.hermesHome}/google_token.json`), ]); + const gateway = opsExport?.gateway?.status ? { + active: Boolean(opsExport.gateway.active), + status: opsExport.gateway.status, + } : probedGateway; + const gatewayEnabled = typeof opsExport?.gateway?.enabled === 'boolean' ? opsExport.gateway.enabled : probedGatewayEnabled; + const dashboard = opsExport?.dashboard?.status ? { + active: Boolean(opsExport.dashboard.active), + status: opsExport.dashboard.status, + } : probedDashboard; + const backupTimer = opsExport?.backupTimer ?? probedBackupTimer; + const repo = opsExport?.repo ?? probedRepo; + const stats = { + files: typeof opsExport?.restoredFileCount === 'number' || opsExport?.restoredFileCount === null ? opsExport.restoredFileCount : probedStats.files, + cronJobs: typeof opsExport?.restoredCronJobs === 'number' || opsExport?.restoredCronJobs === null ? 
opsExport.restoredCronJobs : probedStats.cronJobs, + }; + const googleToken = typeof opsExport?.googleWorkspaceToken === 'boolean' ? opsExport.googleWorkspaceToken : probedGoogleToken; const dashboardUrl = tailscaleIp ? `http://${tailscaleIp}:${item.dashboardPort}/` : `:${item.dashboardPort}`; @@ -316,6 +355,16 @@ async function buildSnapshot(): Promise { warnings.push('Emergency Drive OAuth token is missing'); } + await Promise.all(warnings.map((message) => { + const lower = message.toLowerCase(); + const instance = lower.includes('bheem') || lower.includes('uma') + ? 'bheem' + : lower.includes('vijay') || lower.includes('root') + ? 'vijay' + : 'all'; + return appendDashboardWarning({ severity: 'warn', instance, message }); + })); + const cronJobs: HermesOpsCronJob[] = [ { name: emergencyDriveUpload.name, diff --git a/dashboard/backend/src/modules/hermes-telemetry/hermes-telemetry.test.ts b/dashboard/backend/src/modules/hermes-telemetry/hermes-telemetry.test.ts index f558d10..525ed2b 100644 --- a/dashboard/backend/src/modules/hermes-telemetry/hermes-telemetry.test.ts +++ b/dashboard/backend/src/modules/hermes-telemetry/hermes-telemetry.test.ts @@ -7,7 +7,8 @@ vi.mock('child_process', () => ({ execFile: execFileMock })); const readFileMock = vi.hoisted(() => vi.fn()); const statMock = vi.hoisted(() => vi.fn()); -vi.mock('fs/promises', () => ({ readFile: readFileMock, stat: statMock })); +const readdirMock = vi.hoisted(() => vi.fn()); +vi.mock('fs/promises', () => ({ readFile: readFileMock, readdir: readdirMock, stat: statMock })); type Handler = (command: string, args: string[]) => { error?: NodeJS.ErrnoException; stdout?: string }; @@ -42,6 +43,7 @@ describe('hermes-telemetry repository', () => { }); statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' })); readFileMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' })); + readdirMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' })); const 
snapshot = await getHermesTelemetrySnapshot('vijay'); // The whole shape must validate even when nothing was readable — that's @@ -84,6 +86,7 @@ describe('hermes-telemetry repository', () => { return { stdout: '' }; }); statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' })); + readdirMock.mockResolvedValue([]); const snapshot = await getHermesTelemetrySnapshot('vijay'); expect(snapshot.sessions).toEqual({ totalSessions: 59, totalMessages: 5225, status: 'up' }); @@ -102,6 +105,7 @@ describe('hermes-telemetry repository', () => { return { error: err }; }); statMock.mockResolvedValue({} as never); + readdirMock.mockResolvedValue([]); readFileMock.mockResolvedValue([ '2026-01-01T12:34:56 WARNING gateway is degraded', '2026-01-01T12:35:01 CRITICAL backup repo HEAD missing', @@ -129,6 +133,7 @@ describe('hermes-telemetry repository', () => { return { error: err }; }); statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' })); + readdirMock.mockResolvedValue([]); const snapshot = await getHermesTelemetrySnapshot('vijay'); expect(snapshot.backupHistory.status).toBe('up'); @@ -144,6 +149,7 @@ describe('hermes-telemetry repository', () => { return { error: err }; }); statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' })); + readdirMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' })); const a = await getHermesTelemetrySnapshot('vijay'); const callsAfterFirst = calls; @@ -159,10 +165,37 @@ describe('hermes-telemetry repository', () => { return { error: err }; }); statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' })); + readdirMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' })); const v = await getHermesTelemetrySnapshot('vijay'); const b = await getHermesTelemetrySnapshot('bheem'); expect(v.instanceId).toBe('vijay'); expect(b.instanceId).toBe('bheem'); }); + + it('parses sanitized Hermes session JSONL events without 
exposing raw message content', async () => { + setExec(() => { + const err = Object.assign(new Error('ENOENT'), { code: 'ENOENT' as const }); + return { error: err }; + }); + statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' })); + readdirMock.mockResolvedValue(['20260101_session.jsonl']); + readFileMock.mockImplementation(async (path: string) => { + if (path.endsWith('.jsonl')) { + return [ + JSON.stringify({ role: 'user', content: 'secret prompt', timestamp: '2026-01-01T00:00:00Z' }), + JSON.stringify({ role: 'assistant', finish_reason: 'tool_calls', tool_calls: [{ function: { name: 'exec_command' } }], timestamp: '2026-01-01T00:01:00Z' }), + ].join('\n'); + } + throw Object.assign(new Error('ENOENT'), { code: 'ENOENT' }); + }); + + const snapshot = await getHermesTelemetrySnapshot('vijay'); + expect(snapshot.sessionEvents.status).toBe('up'); + expect(snapshot.sessionEvents.sourceCount).toBe(1); + expect(snapshot.sessionEvents.entries).toHaveLength(2); + expect(snapshot.sessionEvents.entries[0].summary).toBe('assistant tool call: exec_command'); + expect(snapshot.sessionEvents.entries[1].summary).toBe('user message (content redacted)'); + expect(JSON.stringify(snapshot.sessionEvents.entries)).not.toContain('secret prompt'); + }); }); diff --git a/dashboard/backend/src/modules/hermes-telemetry/repository.ts b/dashboard/backend/src/modules/hermes-telemetry/repository.ts index afd8ce9..5777007 100644 --- a/dashboard/backend/src/modules/hermes-telemetry/repository.ts +++ b/dashboard/backend/src/modules/hermes-telemetry/repository.ts @@ -1,6 +1,8 @@ import { execFile } from 'child_process'; import { promisify } from 'util'; -import { readFile, stat } from 'fs/promises'; +import { readdir, readFile, stat } from 'fs/promises'; +import { basename, join } from 'path'; +import { appendDashboardWarning } from '../../lib/dashboard-alerts.js'; import { childLogger } from '../../lib/logger.js'; import type { HermesBackupHistory, @@ -8,6 +10,10 @@ 
import type { HermesCronEntry, HermesCronList, HermesInstanceId, + HermesSessionEntry, + HermesSessionEvent, + HermesSessionEventList, + HermesSessionList, HermesMemoryList, HermesSessionStats, HermesSkillList, @@ -29,6 +35,8 @@ interface InstanceConfig { user: string | null; // null → run as the backend's own user (root in prod) repoPath: string; watchdogLog: string; + sessionsIndex: string; + sessionsDir: string; } const INSTANCES: Record = { @@ -37,12 +45,16 @@ const INSTANCES: Record = { user: null, repoPath: '/root/repos/bytelyst_hostinger_hermes_vm', watchdogLog: '/root/.hermes/logs/hermes-health-watchdog.log', + sessionsIndex: '/root/.hermes/sessions/sessions.json', + sessionsDir: '/root/.hermes/sessions', }, bheem: { id: 'bheem', user: 'uma', repoPath: '/home/uma/repos/uma_hostinger_hermes_vm', watchdogLog: '/home/uma/.hermes/logs/hermes-health-watchdog.log', + sessionsIndex: '/home/uma/.hermes/sessions/sessions.json', + sessionsDir: '/home/uma/.hermes/sessions', }, }; @@ -103,6 +115,142 @@ async function readSessionStats(inst: InstanceConfig): Promise { + try { + const parsed = JSON.parse(await readFile(inst.sessionsIndex, 'utf8')) as Record>; + const entries: HermesSessionEntry[] = Object.values(parsed) + .map((row) => ({ + id: String(row.session_id ?? row.id ?? row.session_key ?? ''), + sessionKey: String(row.session_key ?? ''), + platform: row.platform ? String(row.platform) : null, + chatType: row.chat_type ? String(row.chat_type) : null, + displayName: row.display_name ? String(row.display_name) : null, + createdAt: row.created_at ? String(row.created_at) : null, + updatedAt: row.updated_at ? String(row.updated_at) : null, + suspended: Boolean(row.suspended ?? false), + resumePending: Boolean(row.resume_pending ?? false), + totalTokens: typeof row.total_tokens === 'number' ? row.total_tokens : null, + estimatedCostUsd: typeof row.estimated_cost_usd === 'number' ? 
row.estimated_cost_usd : null, + })) + .filter((entry) => entry.id || entry.sessionKey) + .sort((a, b) => new Date(b.updatedAt ?? b.createdAt ?? 0).getTime() - new Date(a.updatedAt ?? a.createdAt ?? 0).getTime()) + .slice(0, 50); + return { entries, status: 'up' }; + } catch { + return { entries: [], status: 'unknown' }; + } +} + +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +function extractToolNames(row: Record): string[] { + const names = new Set(); + const addName = (value: unknown) => { + if (typeof value === 'string' && value.trim()) names.add(value.trim()); + }; + + const collectFromItem = (item: unknown) => { + if (!isRecord(item)) return; + addName(item.name); + if (isRecord(item.function)) addName(item.function.name); + }; + + if (Array.isArray(row.tool_calls)) row.tool_calls.forEach(collectFromItem); + if (Array.isArray(row.codex_message_items)) row.codex_message_items.forEach(collectFromItem); + return Array.from(names).slice(0, 8); +} + +function extractItemTypes(row: Record): string[] { + const itemTypes = new Set(); + if (Array.isArray(row.codex_message_items)) { + for (const item of row.codex_message_items) { + if (isRecord(item) && typeof item.type === 'string') itemTypes.add(item.type); + } + } + return Array.from(itemTypes).slice(0, 8); +} + +function classifySessionEvent(row: Record, toolNames: string[], itemTypes: string[]): HermesSessionEvent['eventType'] { + const role = typeof row.role === 'string' ? 
row.role : ''; + if (role === 'session_meta') return 'system'; + if (toolNames.length > 0 || row.finish_reason === 'tool_calls') return 'tool-call'; + if (itemTypes.some((type) => type.includes('tool'))) return 'tool-result'; + if (itemTypes.includes('reasoning') || row.reasoning) return 'reasoning'; + if (role === 'user' || role === 'assistant' || typeof row.content === 'string') return 'message'; + return 'unknown'; +} + +function summarizeSessionEvent(row: Record, eventType: HermesSessionEvent['eventType'], toolNames: string[]): string { + const role = typeof row.role === 'string' ? row.role : 'unknown'; + if (eventType === 'system') return 'session metadata recorded'; + if (eventType === 'tool-call') { + const toolText = toolNames.length > 0 ? `: ${toolNames.join(', ')}` : ''; + return `${role} tool call${toolNames.length === 1 ? '' : 's'}${toolText}`; + } + if (eventType === 'tool-result') return `${role} tool result recorded`; + if (eventType === 'reasoning') return `${role} reasoning item recorded`; + if (eventType === 'message') return `${role} message (content redacted)`; + return `${role} event recorded`; +} + +function parseSessionJsonlLine(line: string, sessionFile: string, lineIndex: number): HermesSessionEvent | null { + if (!line.trim()) return null; + try { + const row = JSON.parse(line) as unknown; + if (!isRecord(row)) return null; + const toolNames = extractToolNames(row); + const itemTypes = extractItemTypes(row); + const eventType = classifySessionEvent(row, toolNames, itemTypes); + const timestamp = typeof row.timestamp === 'string' ? row.timestamp : null; + const status = typeof row.status === 'string' + ? row.status + : (typeof row.finish_reason === 'string' ? row.finish_reason : null); + return { + id: `${sessionFile}:${lineIndex}`, + sessionFile, + timestamp, + role: typeof row.role === 'string' ? 
row.role : null, + eventType, + summary: summarizeSessionEvent(row, eventType, toolNames), + toolNames, + itemTypes, + status, + }; + } catch { + return null; + } +} + +async function readSessionEvents(inst: InstanceConfig): Promise { + try { + const files = (await readdir(inst.sessionsDir)) + .filter((name) => name.endsWith('.jsonl')) + .sort() + .slice(-10); + if (files.length === 0) return { entries: [], status: 'up', sourceCount: 0 }; + + const entries: HermesSessionEvent[] = []; + for (const file of files) { + const sessionFile = basename(file); + const content = await readFile(join(inst.sessionsDir, file), 'utf8'); + const lines = content.split('\n'); + const start = Math.max(0, lines.length - 200); + for (let index = start; index < lines.length; index += 1) { + const event = parseSessionJsonlLine(lines[index], sessionFile, index + 1); + if (event) entries.push(event); + } + } + + entries.sort((a, b) => new Date(b.timestamp ?? 0).getTime() - new Date(a.timestamp ?? 0).getTime()); + return { entries: entries.slice(0, 100), status: 'up', sourceCount: files.length }; + } catch (err) { + log.warn({ err, instance: inst.id, source: inst.sessionsDir }, 'failed to read Hermes session events'); + return { entries: [], status: 'unknown', sourceCount: 0 }; + } +} + // --- Cron ------------------------------------------------------------------- // // `hermes cron list --json` is the canonical source. 
It's distinct from @@ -248,8 +396,10 @@ const inflight = new Map>(); async function buildSnapshot(instanceId: HermesInstanceId): Promise { const inst = INSTANCES[instanceId]; - const [sessions, cron, memory, skills, watchdog, backupHistory] = await Promise.all([ + const [sessions, sessionList, sessionEvents, cron, memory, skills, watchdog, backupHistory] = await Promise.all([ readSessionStats(inst), + readSessionList(inst), + readSessionEvents(inst), readCron(inst), readMemory(inst), readSkills(inst), @@ -259,17 +409,28 @@ async function buildSnapshot(instanceId: HermesInstanceId): Promise appendDashboardWarning({ severity: 'warn', instance: instanceId, message })), + ...watchdog.alerts + .filter((alert) => alert.severity === 'critical') + .map((alert) => appendDashboardWarning({ severity: 'critical', instance: instanceId, message: alert.message })), + ]); + return { generatedAt: new Date().toISOString(), cached: false, instanceId, sessions, + sessionList, + sessionEvents, cron, memory, skills, diff --git a/dashboard/backend/src/modules/hermes-telemetry/types.ts b/dashboard/backend/src/modules/hermes-telemetry/types.ts index b35a852..cbf148f 100644 --- a/dashboard/backend/src/modules/hermes-telemetry/types.ts +++ b/dashboard/backend/src/modules/hermes-telemetry/types.ts @@ -18,6 +18,47 @@ export const HermesSessionStatsSchema = z.object({ }); export type HermesSessionStats = z.infer; +export const HermesSessionEntrySchema = z.object({ + id: z.string(), + sessionKey: z.string(), + platform: z.string().nullable(), + chatType: z.string().nullable(), + displayName: z.string().nullable(), + createdAt: z.string().nullable(), + updatedAt: z.string().nullable(), + suspended: z.boolean(), + resumePending: z.boolean(), + totalTokens: z.number().nullable(), + estimatedCostUsd: z.number().nullable(), +}); +export type HermesSessionEntry = z.infer; + +export const HermesSessionListSchema = z.object({ + entries: z.array(HermesSessionEntrySchema), + status: ProbeStatusSchema, 
+}); +export type HermesSessionList = z.infer; + +export const HermesSessionEventSchema = z.object({ + id: z.string(), + sessionFile: z.string(), + timestamp: z.string().nullable(), + role: z.string().nullable(), + eventType: z.enum(['message', 'tool-call', 'tool-result', 'reasoning', 'system', 'unknown']), + summary: z.string(), + toolNames: z.array(z.string()), + itemTypes: z.array(z.string()), + status: z.string().nullable(), +}); +export type HermesSessionEvent = z.infer; + +export const HermesSessionEventListSchema = z.object({ + entries: z.array(HermesSessionEventSchema), + status: ProbeStatusSchema, + sourceCount: z.number(), +}); +export type HermesSessionEventList = z.infer; + export const HermesCronEntrySchema = z.object({ id: z.string(), name: z.string(), @@ -106,6 +147,8 @@ export const HermesTelemetrySnapshotSchema = z.object({ cached: z.boolean(), instanceId: HermesInstanceIdSchema, sessions: HermesSessionStatsSchema, + sessionList: HermesSessionListSchema, + sessionEvents: HermesSessionEventListSchema, cron: HermesCronListSchema, memory: HermesMemoryListSchema, skills: HermesSkillListSchema, diff --git a/dashboard/docker-compose.yml b/dashboard/docker-compose.yml index 95875ab..5ee4d87 100644 --- a/dashboard/docker-compose.yml +++ b/dashboard/docker-compose.yml @@ -25,6 +25,7 @@ services: environment: - VM_SCRIPTS_PATH=/vm-scripts/VMs/HostingerVM - VM_LOG_DIR=/host-logs + - HERMES_DASHBOARD_ALERT_LOG=/var/log/hermes-dashboard-warnings.log ports: - '127.0.0.1:4004:4004' networks: @@ -37,6 +38,7 @@ services: - /var/log/vm-cleanup.log:/host-logs/vm-cleanup.log - /var/log/vm-health-check.log:/host-logs/vm-health-check.log - /var/log/docker-watchdog.log:/host-logs/docker-watchdog.log + - /var/log/hermes-dashboard-warnings.log:/var/log/hermes-dashboard-warnings.log # Docker socket — allows running docker commands against the host daemon # (same pattern as Portainer/cAdvisor; container already runs as root) - /var/run/docker.sock:/var/run/docker.sock diff 
--git a/dashboard/web/e2e/hermes.spec.ts b/dashboard/web/e2e/hermes.spec.ts index 0e8f949..ca0c921 100644 --- a/dashboard/web/e2e/hermes.spec.ts +++ b/dashboard/web/e2e/hermes.spec.ts @@ -40,6 +40,87 @@ const hermesOpsSnapshot = { warnings: [], }; +const hermesTelemetrySnapshot = (instanceId: 'vijay' | 'bheem') => ({ + generatedAt: '2026-01-01T00:00:00.000Z', + cached: false, + instanceId, + sessions: { totalSessions: instanceId === 'vijay' ? 12 : 7, totalMessages: instanceId === 'vijay' ? 480 : 210, status: 'up' }, + sessionList: { + status: 'up', + entries: [ + { + id: `${instanceId}-session-1`, + sessionKey: `agent:main:telegram:dm:${instanceId}`, + platform: 'telegram', + chatType: 'dm', + displayName: instanceId === 'vijay' ? 'S' : 'Uma', + createdAt: '2026-01-01T00:00:00.000Z', + updatedAt: '2026-01-01T00:06:00.000Z', + suspended: false, + resumePending: false, + totalTokens: 100, + estimatedCostUsd: 0, + }, + ], + }, + sessionEvents: { + status: 'up', + sourceCount: 1, + entries: [ + { + id: `${instanceId}-events.jsonl:3`, + sessionFile: `${instanceId}-events.jsonl`, + timestamp: '2026-01-01T00:06:00.000Z', + role: 'assistant', + eventType: 'tool-call', + summary: 'assistant tool call: exec_command', + toolNames: ['exec_command'], + itemTypes: [], + status: 'tool_calls', + }, + ], + }, + cron: { + status: 'up', + entries: [ + { + id: `${instanceId}-digest`, + name: `${instanceId} digest`, + schedule: '0 * * * *', + lastRun: '2026-01-01T00:00:00.000Z', + nextRun: '2026-01-01T01:00:00.000Z', + lastStatus: 'ok', + active: true, + }, + ], + }, + memory: { status: 'up', items: [] }, + skills: { status: 'up', items: [] }, + watchdog: { + source: `/tmp/${instanceId}-watchdog.log`, + status: 'up', + alerts: [ + { + timestamp: '2026-01-01T00:05:00.000Z', + severity: 'info', + message: `${instanceId} watchdog healthy`, + }, + ], + }, + backupHistory: { + repoPath: `/tmp/${instanceId}-repo`, + status: 'up', + entries: [ + { + sha: `${instanceId}123456`, + committedAt: 
'2026-01-01T00:03:00.000Z', + subject: `${instanceId} backup`, + }, + ], + }, + warnings: [], +}); + test.describe('Hermes Mission Control', () => { test.beforeEach(async ({ page }) => { await page.addInitScript(() => { @@ -59,6 +140,22 @@ test.describe('Hermes Mission Control', () => { }); }); + await page.route('**/api/hermes/telemetry/vijay', async (route) => { + await route.fulfill({ + status: 200, + contentType: 'application/json', + body: JSON.stringify(hermesTelemetrySnapshot('vijay')), + }); + }); + + await page.route('**/api/hermes/telemetry/bheem', async (route) => { + await route.fulfill({ + status: 200, + contentType: 'application/json', + body: JSON.stringify(hermesTelemetrySnapshot('bheem')), + }); + }); + // /hermes/products fetches the real service registry + health module // (Phase 3 slice 2). Backend isn't running in CI, so we satisfy those // routes the same way the dashboard spec does. @@ -82,11 +179,11 @@ test.describe('Hermes Mission Control', () => { await page.getByRole('link', { name: 'Task Ledger' }).click(); await expect(page.getByRole('heading', { name: 'Task Ledger' })).toBeVisible(); - await expect(page.getByText('Task table')).toBeVisible(); + await expect(page.getByRole('heading', { name: 'Task table' })).toBeVisible(); await page.goto('/hermes/tasks/task-1'); await expect(page.getByRole('heading', { name: 'Hermes learning' })).toBeVisible(); - await expect(page.getByText('Timeline')).toBeVisible(); + await expect(page.getByRole('heading', { name: 'Timeline', exact: true })).toBeVisible(); await page.goto('/hermes/products'); await expect(page.getByRole('heading', { name: 'Product Portfolio' })).toBeVisible(); @@ -111,7 +208,7 @@ test.describe('Hermes Mission Control', () => { await page.goto('/hermes/tasks/task-1'); await expect(page.getByRole('heading', { name: 'Hermes learning' })).toBeVisible(); - await expect(page.getByRole('heading', { name: 'Timeline' })).toBeVisible(); + await expect(page.getByRole('heading', { name: 
'Timeline', exact: true })).toBeVisible(); }); test('exposes a global instance switcher with All / Vijay / Bheem', async ({ page }) => { diff --git a/dashboard/web/src/app/hermes/agents/page.tsx b/dashboard/web/src/app/hermes/agents/page.tsx index ee47e28..c059e63 100644 --- a/dashboard/web/src/app/hermes/agents/page.tsx +++ b/dashboard/web/src/app/hermes/agents/page.tsx @@ -7,8 +7,12 @@ import { useEffect, useMemo, useState } from 'react'; import { HermesShell, MetricCard, SectionCard } from '@/components/hermes-shell'; import { HermesInstanceBadge } from '@/components/hermes-instance-switcher'; import { useHermesInstance } from '@/lib/hermes-instance-context'; -import { getHermesAgents, HERMES_INSTANCES, type HermesInstanceId } from '@/lib/hermes'; -import { api, type HermesTelemetrySnapshot } from '@/lib/api'; +import { getHermesAgents, HERMES_INSTANCES } from '@/lib/hermes'; +import { + emptyTelemetryState, + loadAllHermesTelemetry, + type HermesTelemetryState, +} from '@/lib/hermes-telemetry-client'; export default function HermesAgentsPage() { const { selectedInstance } = useHermesInstance(); @@ -21,19 +25,16 @@ export default function HermesAgentsPage() { // endpoint. The agent statuses above remain seed-data (status observability // needs a separate ingestion contract); the inventory below is genuine // when the `hermes` CLI is reachable, status:'unknown' otherwise. 
- const [telemetry, setTelemetry] = useState>({ vijay: null, bheem: null }); + const [telemetry, setTelemetry] = useState(emptyTelemetryState); const [telemetryError, setTelemetryError] = useState(null); useEffect(() => { const controller = new AbortController(); const load = async () => { try { - const [vijay, bheem] = await Promise.all([ - api.getHermesTelemetry('vijay'), - api.getHermesTelemetry('bheem'), - ]); + const next = await loadAllHermesTelemetry(); if (controller.signal.aborted) return; - setTelemetry({ vijay, bheem }); + setTelemetry(next); setTelemetryError(null); } catch (err) { if (controller.signal.aborted) return; diff --git a/dashboard/web/src/app/hermes/history/page.tsx b/dashboard/web/src/app/hermes/history/page.tsx index 2a78d0d..f9ccde6 100644 --- a/dashboard/web/src/app/hermes/history/page.tsx +++ b/dashboard/web/src/app/hermes/history/page.tsx @@ -1,15 +1,28 @@ 'use client'; import Link from 'next/link'; -import { ArrowLeft, Clock3, Flame, TrendingDown, TrendingUp } from 'lucide-react'; +import { ArrowLeft, Clock3, Flame, History, TrendingDown, TrendingUp } from 'lucide-react'; import { Badge, Button } from '@/components/ui/Primitives'; -import { useMemo } from 'react'; +import { useEffect, useMemo, useState } from 'react'; import { HermesShell, MetricCard, SectionCard } from '@/components/hermes-shell'; +import { HermesInstanceBadge } from '@/components/hermes-instance-switcher'; import { useHermesInstance } from '@/lib/hermes-instance-context'; import { getHermesHistory, hermesTasks } from '@/lib/hermes'; +import { + collectBackupEntries, + collectCronEntries, + collectSessionEvents, + collectSessionEntries, + collectWatchdogAlerts, + emptyTelemetryState, + loadAllHermesTelemetry, + type HermesTelemetryState, +} from '@/lib/hermes-telemetry-client'; export default function HermesHistoryPage() { const { selectedInstance } = useHermesInstance(); + const [telemetry, setTelemetry] = useState(emptyTelemetryState); + const [telemetryError, 
setTelemetryError] = useState(null); const history = useMemo(() => getHermesHistory(selectedInstance), [selectedInstance]); const filteredTasks = useMemo( () => (selectedInstance === 'all' ? hermesTasks : hermesTasks.filter((task) => task.instanceId === selectedInstance)), @@ -26,6 +39,30 @@ export default function HermesHistoryPage() { tasksWithDuration.reduce((sum, task) => sum + (task.durationMs ?? 0), 0) / Math.max(1, tasksWithDuration.length) / 60000, ); + const liveAlerts = useMemo(() => collectWatchdogAlerts(telemetry, selectedInstance).slice(0, 10), [telemetry, selectedInstance]); + const liveBackups = useMemo(() => collectBackupEntries(telemetry, selectedInstance).slice(0, 10), [telemetry, selectedInstance]); + const liveCron = useMemo(() => collectCronEntries(telemetry, selectedInstance).slice(0, 10), [telemetry, selectedInstance]); + const liveSessions = useMemo(() => collectSessionEntries(telemetry, selectedInstance).slice(0, 10), [telemetry, selectedInstance]); + const liveEvents = useMemo(() => collectSessionEvents(telemetry, selectedInstance).slice(0, 10), [telemetry, selectedInstance]); + + useEffect(() => { + let active = true; + const load = async () => { + try { + const next = await loadAllHermesTelemetry(); + if (!active) return; + setTelemetry(next); + setTelemetryError(null); + } catch (err) { + if (!active) return; + setTelemetryError(err instanceof Error ? err.message : String(err)); + } + }; + void load(); + return () => { + active = false; + }; + }, []); const failureReasons = [ ['CI failures', 9], @@ -48,6 +85,86 @@ export default function HermesHistoryPage() { } /> +
+ } helpText="From Hermes session JSONL" /> + } helpText="From hermes cron list" /> + a.severity === 'critical') ? 'danger' : liveAlerts.some((a) => a.severity === 'warn') ? 'warning' : 'default'} icon={} helpText="From watchdog logs" /> + } helpText="From backup git history" /> +
+ + {telemetryError ? 'Telemetry unavailable' : 'Live telemetry'}} + > + {telemetryError ? ( +

+ Could not load telemetry: {telemetryError} +

+ ) : ( +
+
+

Recent events

+
+ {liveEvents.length > 0 ? liveEvents.map((event) => ( +
+ {event.summary} + +
+ )) :

No session events returned.

} +
+
+
+

Recent sessions

+
+ {liveSessions.length > 0 ? liveSessions.map((session) => ( +
+ {session.displayName ?? session.sessionKey} + +
+ )) :

No session entries returned.

} +
+
+
+

Recent watchdog alerts

+
+ {liveAlerts.length > 0 ? liveAlerts.map((alert) => ( +
+ {alert.message} +
+ {alert.severity} + +
+
+ )) :

No watchdog alerts returned.

} +
+
+
+

Cron entries

+
+ {liveCron.length > 0 ? liveCron.map((entry) => ( +
+ {entry.name} + +
+ )) :

No cron entries returned.

} +
+
+
+

Backup history

+
+ {liveBackups.length > 0 ? liveBackups.map((entry) => ( +
+ {entry.subject} + +
+ )) :

No backup commits returned.

} +
+
+
+ )} +
+
diff --git a/dashboard/web/src/app/hermes/page.tsx b/dashboard/web/src/app/hermes/page.tsx index 0756fd1..e9bbcb4 100644 --- a/dashboard/web/src/app/hermes/page.tsx +++ b/dashboard/web/src/app/hermes/page.tsx @@ -1,8 +1,8 @@ 'use client'; -import { useMemo } from 'react'; +import { useEffect, useMemo, useState } from 'react'; import Link from 'next/link'; -import { ArrowRight, BadgeCheck, Bot, CheckCircle2, Clock3, LayoutDashboard, OctagonAlert, Rocket, ShieldAlert, Sparkles, TriangleAlert } from 'lucide-react'; +import { ArrowRight, BadgeCheck, BellRing, Bot, CheckCircle2, Clock3, LayoutDashboard, OctagonAlert, Rocket, ShieldAlert, Sparkles, TriangleAlert } from 'lucide-react'; import { Badge, Button } from '@/components/ui/Primitives'; import { HermesShell, MetricCard, SectionCard } from '@/components/hermes-shell'; import { HermesInstanceBadge } from '@/components/hermes-instance-switcher'; @@ -19,6 +19,15 @@ import { type HermesProduct, type HermesTask, } from '@/lib/hermes'; +import { + collectBackupEntries, + collectCronEntries, + collectWatchdogAlerts, + emptyTelemetryState, + loadAllHermesTelemetry, + telemetryForFilter, + type HermesTelemetryState, +} from '@/lib/hermes-telemetry-client'; const fmtDate = new Intl.DateTimeFormat('en', { month: 'short', @@ -80,6 +89,8 @@ function ProductMiniCard({ product }: { product: HermesProduct }) { export default function HermesMissionControlPage() { const { selectedInstance } = useHermesInstance(); + const [telemetry, setTelemetry] = useState(emptyTelemetryState); + const [telemetryError, setTelemetryError] = useState(null); const overview = useMemo(() => getHermesOverview(selectedInstance), [selectedInstance]); // Per-instance roll-up cards always show both Vijay and Bheem regardless of // the active filter — they're the "comparison" view that sits next to the @@ -124,6 +135,32 @@ export default function HermesMissionControlPage() { ); const actionableProducts = filteredProducts.filter((product) => 
product.needsAttention).slice(0, 6); const agentStatuses = useMemo(() => getHermesAgents(selectedInstance), [selectedInstance]); + const liveSnapshots = useMemo(() => telemetryForFilter(telemetry, selectedInstance), [telemetry, selectedInstance]); + const liveAlerts = useMemo(() => collectWatchdogAlerts(telemetry, selectedInstance).slice(0, 8), [telemetry, selectedInstance]); + const liveBackups = useMemo(() => collectBackupEntries(telemetry, selectedInstance).slice(0, 6), [telemetry, selectedInstance]); + const liveCron = useMemo(() => collectCronEntries(telemetry, selectedInstance).slice(0, 6), [telemetry, selectedInstance]); + + useEffect(() => { + let active = true; + const load = async () => { + try { + const next = await loadAllHermesTelemetry(); + if (!active) return; + setTelemetry(next); + setTelemetryError(null); + } catch (err) { + if (!active) return; + setTelemetryError(err instanceof Error ? err.message : String(err)); + } + }; + void load(); + const timer = window.setInterval(load, 60_000); + return () => { + active = false; + window.clearInterval(timer); + }; + }, []); + const autoActions = [ 'Continue the queued execution lane for high-priority product updates.', 'Publish a weekly digest from completed and failed work.', @@ -185,6 +222,77 @@ export default function HermesMissionControlPage() { + {telemetryError ? 'Telemetry unavailable' : 'Live telemetry'}} + > + {telemetryError ? ( +

+ Could not load telemetry: {telemetryError} +

+ ) : ( +
+
+ {liveAlerts.length > 0 ? liveAlerts.map((alert) => ( +
+
+
+
+ {alert.severity} + + {fmtDate.format(new Date(alert.timestamp))} +
+

{alert.message}

+
+ +
+
+ )) : ( +
+ No watchdog alerts were returned for the selected instance filter. +
+ )} +
+
+
+

Sessions

+
+ {liveSnapshots.map((snapshot) => ( +
+ + {snapshot.sessions.totalSessions} sessions · {snapshot.sessions.totalMessages} messages +
+ ))} +
+
+
+

Upcoming Hermes cron

+
+ {liveCron.length > 0 ? liveCron.map((entry) => ( +
+ {entry.name} + +
+ )) :

No cron entries returned.

} +
+
+
+

Recent backup commits

+
+ {liveBackups.length > 0 ? liveBackups.map((entry) => ( +
+ {entry.subject} + +
+ )) :

No backup commits returned.

} +
+
+
+
+ )} +
+
View all tasks }>
diff --git a/dashboard/web/src/app/hermes/tasks/[id]/page.tsx b/dashboard/web/src/app/hermes/tasks/[id]/page.tsx index fca1af9..47d04f0 100644 --- a/dashboard/web/src/app/hermes/tasks/[id]/page.tsx +++ b/dashboard/web/src/app/hermes/tasks/[id]/page.tsx @@ -2,10 +2,18 @@ import Link from 'next/link'; import { useParams } from 'next/navigation'; +import { useEffect, useMemo, useState } from 'react'; import { ArrowLeft, CircleDashed, Clock3, ShieldAlert, Sparkles } from 'lucide-react'; import { Badge, Button } from '@/components/ui/Primitives'; import { HermesShell, MetricCard, SectionCard } from '@/components/hermes-shell'; import { getHermesProductById, getHermesTaskById, getHermesTaskEvents } from '@/lib/hermes'; +import { + collectSessionEvents, + collectSessionEntries, + emptyTelemetryState, + loadAllHermesTelemetry, + type HermesTelemetryState, +} from '@/lib/hermes-telemetry-client'; const fmt = new Intl.DateTimeFormat('en', { month: 'short', day: 'numeric', hour: 'numeric', minute: '2-digit' }); @@ -24,6 +32,29 @@ export default function HermesTaskDetailPage({ params }: { params: { id: string const taskId = routeParams?.id ?? params.id; const task = getHermesTaskById(taskId); const events = getHermesTaskEvents(taskId); + const [telemetry, setTelemetry] = useState(emptyTelemetryState); + const [telemetryError, setTelemetryError] = useState(null); + const liveSessions = useMemo(() => collectSessionEntries(telemetry, 'all').slice(0, 8), [telemetry]); + const liveEvents = useMemo(() => collectSessionEvents(telemetry, 'all').slice(0, 12), [telemetry]); + + useEffect(() => { + let active = true; + const load = async () => { + try { + const next = await loadAllHermesTelemetry(); + if (!active) return; + setTelemetry(next); + setTelemetryError(null); + } catch (err) { + if (!active) return; + setTelemetryError(err instanceof Error ? 
err.message : String(err)); + } + }; + void load(); + return () => { + active = false; + }; + }, []); if (!task) { return ( @@ -40,7 +71,6 @@ export default function HermesTaskDetailPage({ params }: { params: { id: string } const product = getHermesProductById(task.productId); - const lastEvent = events[0]; const timeline = events.slice().sort((a, b) => new Date(a.timestamp).getTime() - new Date(b.timestamp).getTime()); return ( @@ -110,6 +140,57 @@ export default function HermesTaskDetailPage({ params }: { params: { id: string
+ {telemetryError ? 'Telemetry unavailable' : 'Live sessions'}} + > + {telemetryError ? ( +

+ Could not load telemetry: {telemetryError} +

+ ) : ( +
+
+ {liveEvents.map((event) => ( +
+
+
+
+ {event.eventType} + {event.instanceId} + {event.status ? {event.status} : null} +
+

{event.summary}

+

{event.sessionFile}

+
+

{event.timestamp ? fmt.format(new Date(event.timestamp)) : 'unknown'}

+
+
+ ))} + {liveEvents.length === 0 ? ( +

No live session events were returned.

+ ) : null} +
+
+ {liveSessions.map((session) => ( +
+
+ {session.platform ?? 'session'} + {session.instanceId} +
+

{session.displayName ?? session.sessionKey}

+

Updated {session.updatedAt ? fmt.format(new Date(session.updatedAt)) : 'unknown'}

+
+ ))} + {liveSessions.length === 0 ? ( +

No live session entries were returned.

+ ) : null} +
+
+ )} +
+
    {timeline.map((event) => ( diff --git a/dashboard/web/src/app/hermes/tasks/page.tsx b/dashboard/web/src/app/hermes/tasks/page.tsx index f913369..f74521f 100644 --- a/dashboard/web/src/app/hermes/tasks/page.tsx +++ b/dashboard/web/src/app/hermes/tasks/page.tsx @@ -1,8 +1,8 @@ 'use client'; -import { Fragment, useMemo, useState } from 'react'; +import { Fragment, useEffect, useMemo, useState } from 'react'; import Link from 'next/link'; -import { Download, Filter, Search, ChevronDown, ChevronUp, ArrowLeftRight } from 'lucide-react'; +import { Download, Filter, Search, ChevronDown, ChevronUp, ArrowLeftRight, Activity } from 'lucide-react'; import { Badge, Button, Input } from '@/components/ui/Primitives'; import { HermesShell, MetricCard, SectionCard } from '@/components/hermes-shell'; import { HermesInstanceBadge } from '@/components/hermes-instance-switcher'; @@ -17,6 +17,16 @@ import { type HermesTaskSource, type HermesTask, } from '@/lib/hermes'; +import { + collectBackupEntries, + collectCronEntries, + collectSessionEntries, + collectWatchdogAlerts, + emptyTelemetryState, + loadAllHermesTelemetry, + telemetryForFilter, + type HermesTelemetryState, +} from '@/lib/hermes-telemetry-client'; const statuses: Array = ['all', 'queued', 'running', 'blocked', 'completed', 'failed', 'skipped', 'cancelled']; const priorities: Array = ['all', 'P0', 'P1', 'P2', 'P3']; @@ -50,6 +60,8 @@ export default function HermesTaskLedgerPage() { const [sort, setSort] = useState<(typeof sortOptions)[number]>('newest'); const [page, setPage] = useState(1); const [expandedTaskId, setExpandedTaskId] = useState(null); + const [telemetry, setTelemetry] = useState(emptyTelemetryState); + const [telemetryError, setTelemetryError] = useState(null); const { selectedInstance } = useHermesInstance(); const tasks = useMemo( @@ -67,6 +79,68 @@ export default function HermesTaskLedgerPage() { }), [tasks]); const visibleProducts = hermesProducts.slice(0, 20); + const liveSnapshots = useMemo(() => 
telemetryForFilter(telemetry, selectedInstance), [telemetry, selectedInstance]); + const liveCron = useMemo(() => collectCronEntries(telemetry, selectedInstance), [telemetry, selectedInstance]); + const liveAlerts = useMemo(() => collectWatchdogAlerts(telemetry, selectedInstance), [telemetry, selectedInstance]); + const liveBackups = useMemo(() => collectBackupEntries(telemetry, selectedInstance), [telemetry, selectedInstance]); + const liveSessions = useMemo(() => collectSessionEntries(telemetry, selectedInstance), [telemetry, selectedInstance]); + const liveActivityRows = useMemo(() => [ + ...liveSessions.map((entry) => ({ + id: `session-${entry.instanceId}-${entry.id}`, + instanceId: entry.instanceId, + kind: 'session', + title: entry.displayName ? `${entry.displayName} session` : entry.sessionKey, + detail: entry.resumePending ? 'resume pending' : entry.suspended ? 'suspended' : entry.platform ?? 'session', + time: entry.updatedAt ?? entry.createdAt, + tone: entry.resumePending || entry.suspended ? 'warning' as const : 'info' as const, + })), + ...liveCron.map((entry) => ({ + id: `cron-${entry.instanceId}-${entry.id}`, + instanceId: entry.instanceId, + kind: 'cron', + title: entry.name, + detail: entry.lastStatus ?? entry.schedule ?? 'Hermes cron entry', + time: entry.nextRun ?? entry.lastRun, + tone: entry.active ? 'success' as const : 'neutral' as const, + })), + ...liveAlerts.map((alert) => ({ + id: `alert-${alert.instanceId}-${alert.timestamp}-${alert.message}`, + instanceId: alert.instanceId, + kind: 'alert', + title: alert.message, + detail: alert.severity, + time: alert.timestamp, + tone: alert.severity === 'critical' ? 'error' as const : alert.severity === 'warn' ? 
'warning' as const : 'info' as const, + })), + ...liveBackups.map((entry) => ({ + id: `backup-${entry.instanceId}-${entry.sha}`, + instanceId: entry.instanceId, + kind: 'backup', + title: entry.subject, + detail: entry.sha.slice(0, 8), + time: entry.committedAt, + tone: 'success' as const, + })), + ].sort((a, b) => new Date(b.time ?? 0).getTime() - new Date(a.time ?? 0).getTime()).slice(0, 12), [liveSessions, liveCron, liveAlerts, liveBackups]); + + useEffect(() => { + let active = true; + const load = async () => { + try { + const next = await loadAllHermesTelemetry(); + if (!active) return; + setTelemetry(next); + setTelemetryError(null); + } catch (err) { + if (!active) return; + setTelemetryError(err instanceof Error ? err.message : String(err)); + } + }; + void load(); + return () => { + active = false; + }; + }, []); return ( + {telemetryError ? 'Telemetry unavailable' : 'Live telemetry'}} + > + {telemetryError ? ( +

    + Could not load telemetry: {telemetryError} +

    + ) : ( +
    +
    + {liveSnapshots.map((snapshot) => ( +
    +
    + + {snapshot.sessions.status} +
    +

    {snapshot.sessions.totalSessions}

    +

    {snapshot.sessions.totalMessages} session messages observed

    +
    + ))} +
    +
    + + + + + + + + + + + {liveActivityRows.map((row) => ( + + + + + + + ))} + {liveActivityRows.length === 0 ? ( + + + + ) : null} + +
    ArtifactInstanceSignalTime
    +
    + +
    +

    {row.title}

    +

    {row.kind}

    +
    +
    +
    {row.detail}{row.time ? prettyDate(row.time) : '—'}
    No live activity artifacts were returned for the current instance filter.
    +
    +
    + )} +
    +
    { setQuery(event.target.value); setPage(1); }} placeholder="Search tasks..." aria-label="Search tasks" className="xl:col-span-2" /> diff --git a/dashboard/web/src/lib/api.ts b/dashboard/web/src/lib/api.ts index 03fab61..e33b2e6 100644 --- a/dashboard/web/src/lib/api.ts +++ b/dashboard/web/src/lib/api.ts @@ -130,6 +130,43 @@ export interface HermesSessionStats { status: HermesProbeStatus; } +export interface HermesSessionEntry { + id: string; + sessionKey: string; + platform: string | null; + chatType: string | null; + displayName: string | null; + createdAt: string | null; + updatedAt: string | null; + suspended: boolean; + resumePending: boolean; + totalTokens: number | null; + estimatedCostUsd: number | null; +} + +export interface HermesSessionList { + entries: HermesSessionEntry[]; + status: HermesProbeStatus; +} + +export interface HermesSessionEvent { + id: string; + sessionFile: string; + timestamp: string | null; + role: string | null; + eventType: 'message' | 'tool-call' | 'tool-result' | 'reasoning' | 'system' | 'unknown'; + summary: string; + toolNames: string[]; + itemTypes: string[]; + status: string | null; +} + +export interface HermesSessionEventList { + entries: HermesSessionEvent[]; + status: HermesProbeStatus; + sourceCount: number; +} + export interface HermesCronEntry { id: string; name: string; @@ -201,6 +238,8 @@ export interface HermesTelemetrySnapshot { cached: boolean; instanceId: 'vijay' | 'bheem'; sessions: HermesSessionStats; + sessionList: HermesSessionList; + sessionEvents: HermesSessionEventList; cron: HermesCronList; memory: HermesMemoryList; skills: HermesSkillList; diff --git a/dashboard/web/src/lib/hermes-telemetry-client.ts b/dashboard/web/src/lib/hermes-telemetry-client.ts new file mode 100644 index 0000000..4acc403 --- /dev/null +++ b/dashboard/web/src/lib/hermes-telemetry-client.ts @@ -0,0 +1,54 @@ +import { api, type HermesTelemetrySnapshot, type HermesWatchdogAlert } from '@/lib/api'; +import type { HermesInstanceId, 
HermesInstanceFilter } from '@/lib/hermes'; + +export type HermesTelemetryState = Record; + +export const emptyTelemetryState: HermesTelemetryState = { vijay: null, bheem: null }; + +export async function loadAllHermesTelemetry(): Promise { + const [vijay, bheem] = await Promise.all([ + api.getHermesTelemetry('vijay'), + api.getHermesTelemetry('bheem'), + ]); + return { vijay, bheem }; +} + +export function telemetryForFilter( + telemetry: HermesTelemetryState, + selectedInstance: HermesInstanceFilter, +): HermesTelemetrySnapshot[] { + if (selectedInstance === 'all') return [telemetry.vijay, telemetry.bheem].filter(Boolean) as HermesTelemetrySnapshot[]; + return telemetry[selectedInstance] ? [telemetry[selectedInstance]] : []; +} + +export function collectWatchdogAlerts( + telemetry: HermesTelemetryState, + selectedInstance: HermesInstanceFilter, +): Array { + return telemetryForFilter(telemetry, selectedInstance) + .flatMap((snapshot) => snapshot.watchdog.alerts.map((alert) => ({ ...alert, instanceId: snapshot.instanceId }))) + .sort((a, b) => new Date(b.timestamp).getTime() - new Date(a.timestamp).getTime()); +} + +export function collectBackupEntries(telemetry: HermesTelemetryState, selectedInstance: HermesInstanceFilter) { + return telemetryForFilter(telemetry, selectedInstance) + .flatMap((snapshot) => snapshot.backupHistory.entries.map((entry) => ({ ...entry, instanceId: snapshot.instanceId }))) + .sort((a, b) => new Date(b.committedAt).getTime() - new Date(a.committedAt).getTime()); +} + +export function collectCronEntries(telemetry: HermesTelemetryState, selectedInstance: HermesInstanceFilter) { + return telemetryForFilter(telemetry, selectedInstance) + .flatMap((snapshot) => snapshot.cron.entries.map((entry) => ({ ...entry, instanceId: snapshot.instanceId }))); +} + +export function collectSessionEntries(telemetry: HermesTelemetryState, selectedInstance: HermesInstanceFilter) { + return telemetryForFilter(telemetry, selectedInstance) + .flatMap((snapshot) 
=> snapshot.sessionList.entries.map((entry) => ({ ...entry, instanceId: snapshot.instanceId }))) + .sort((a, b) => new Date(b.updatedAt ?? b.createdAt ?? 0).getTime() - new Date(a.updatedAt ?? a.createdAt ?? 0).getTime()); +} + +export function collectSessionEvents(telemetry: HermesTelemetryState, selectedInstance: HermesInstanceFilter) { + return telemetryForFilter(telemetry, selectedInstance) + .flatMap((snapshot) => snapshot.sessionEvents.entries.map((entry) => ({ ...entry, instanceId: snapshot.instanceId }))) + .sort((a, b) => new Date(b.timestamp ?? 0).getTime() - new Date(a.timestamp ?? 0).getTime()); +} diff --git a/docs/app-url-bookmarks.md b/docs/app-url-bookmarks.md new file mode 100644 index 0000000..c712c53 --- /dev/null +++ b/docs/app-url-bookmarks.md @@ -0,0 +1,98 @@ +# ByteLyst App URL Bookmarks + +**Owner:** ByteLyst DevOps +**Last updated:** 2026-05-31T08:14:55+00:00 +**Source of truth for bookmarks:** this file +**Exposure/security companion:** [`docs/vm-exposure-inventory.md`](vm-exposure-inventory.md) + +Use this as the living bookmark/reference list for deployed apps, dashboards, +APIs, and private admin surfaces. When a new app is deployed, add it here in +the same change that adds its Caddy route, Compose service, or systemd unit. + +`Last deployed / restarted` means the latest timestamp we have evidence for. +For Docker services it is the container `StartedAt` timestamp from +`docker inspect`; for systemd services it is the service active-since timestamp. +If the deploy time is not known, use `unknown` and update it during the next +verified deploy. + +## Update Checklist + +When deploying or changing an app: + +1. Add or update the row in this file. +2. Update `Last deployed / restarted` with an exact UTC timestamp. +3. Record the repo/service owner and access model. +4. If exposure changes, also update [`docs/vm-exposure-inventory.md`](vm-exposure-inventory.md). +5. 
If it is a DevOps dashboard endpoint, also update [`dashboard/ENDPOINTS.md`](../dashboard/ENDPOINTS.md). + +## Primary Dashboards + +| Name | URL | Access | Backend/API | Runtime owner | Last deployed / restarted | Notes | +| --- | --- | --- | --- | --- | --- | --- | +| DevOps custom dashboard | `https://devops.bytelyst.com` | private-admin/auth | `https://api.bytelyst.com/devops` | `dashboard/docker-compose.yml` (`devops-web`, `devops-backend`) | `2026-05-31T04:02:24Z` web, `2026-05-31T04:02:23Z` backend | Unified ByteLyst DevOps dashboard. Hermes Mission Control lives under `/hermes`. | +| DevOps Tailscale entry | `https://srv1491630.tailf85608.ts.net/login` | Tailscale/private-admin/auth | `http://127.0.0.1:4004` | Tailscale serve -> `localhost:3049` | `2026-05-31T04:02:24Z` | Private login path used for VM-side dashboard review. | +| Platform admin dashboard | `https://admin.bytelyst.com` | private-admin/auth | `https://api.bytelyst.com/platform/api` | common platform `admin-web` | `unknown` | Caddy route is documented; container was not present in the 2026-05-27 exposure inventory. Verify before relying on it. | +| Hermes Mission Control | `https://devops.bytelyst.com/hermes` | private-admin/auth | `https://api.bytelyst.com/devops/api/hermes/*` | DevOps custom dashboard | `2026-05-31T04:02:24Z` | Unified custom Hermes dashboard over Vijay/root and Bheem/Uma. | +| Hermes native Vijay dashboard | `http://100.87.53.10:9119/` | Tailscale-only/private-admin | native Hermes service | `hermes-root-dashboard.service` | `2026-05-31T04:02:20Z` | Built-in Hermes dashboard for root/Vijay. No public Caddy route. | +| Hermes native Bheem dashboard | `http://100.87.53.10:9120/` | Tailscale-only/private-admin | native Hermes service | `uma-hermes-dashboard.service` | `2026-05-31T04:02:20Z` | Built-in Hermes dashboard for Uma/Bheem. No public Caddy route. 
| +| LLM Lab dashboard | `https://llmlab.bytelyst.com` | private-admin | local/dashboard service | common platform `llmlab-dashboard` | `2026-05-31T04:02:24Z` | Keep private/auth-gated. Local host port `127.0.0.1:3075`. | + +## Public Apps + +| App | Public URL | API URL | Runtime owner | Last deployed / restarted | Notes | +| --- | --- | --- | --- | --- | --- | +| InvtTrdg | `https://invttrdg.bytelyst.com` | `https://api.bytelyst.com/invttrdg/*` | `/opt/bytelyst/learning_ai_invt_trdg` | `unknown` | Exposure inventory maps web to `:3085` and backend to `:4025`. | +| Clock / Chronomind | `https://clock.bytelyst.com` | `https://api.bytelyst.com/chronomind/*` | `/opt/bytelyst/learning_ai_clock` | `2026-05-31T04:02:24Z` web/backend | Local web `127.0.0.1:3030`, backend `127.0.0.1:4011`. | +| Notes / Notelett | `https://notes.bytelyst.com` | `https://api.bytelyst.com/notelett/*` | `/opt/bytelyst/learning_ai_notes` | `2026-05-31T04:02:23Z` web, `2026-05-31T04:02:24Z` backend | Local web `127.0.0.1:3000`, backend `127.0.0.1:4016`. | +| Tracker | `https://tracker.bytelyst.com` | n/a | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:24Z` | Local web `127.0.0.1:3003`. | +| PeakPulse | n/a | `https://api.bytelyst.com/peakpulse/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:24Z` platform stack | Backend is Docker-internal `peakpulse-backend:4010`. | +| Jarvis Jr | n/a | `https://api.bytelyst.com/jarvisjr/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:23Z` web, `2026-05-31T04:02:24Z` platform stack | Local web `127.0.0.1:3035`, backend Docker-internal `jarvisjr-backend:4012`. | +| Nomgap | Vercel / external | `https://api.bytelyst.com/nomgap/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:24Z` platform stack | Old local `nomgap-web` was retired; backend remains Docker-internal. 
| +| Mindlyst | n/a | `https://api.bytelyst.com/mindlyst/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:24Z` web/platform stack | Local web `127.0.0.1:3050`, backend Docker-internal `mindlyst-backend:4014`. | +| LysnrAI | n/a | `https://api.bytelyst.com/lysnrai/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:23Z` dashboard, `2026-05-31T04:02:24Z` platform stack | Local dashboard `127.0.0.1:3002`, backend Docker-internal `lysnrai-backend:4015`. | +| Flowmonk | n/a | `https://api.bytelyst.com/flowmonk/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:24Z` web/platform stack | Local web `127.0.0.1:3040`, backend Docker-internal `flowmonk-backend:4017`. | +| ActionTrail | n/a | `https://api.bytelyst.com/actiontrail/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:23Z` web, `2026-05-31T04:02:24Z` platform stack | Local web `127.0.0.1:3060`; exposure inventory notes route/backend mapping needs verification. | +| LocalMemGPT | n/a | `https://api.bytelyst.com/localmemgpt/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:24Z` web/platform stack | Local web `127.0.0.1:3070`, backend Docker-internal `localmemgpt-backend:4019`. | + +## Shared APIs And Infrastructure + +| Service | URL | Access | Runtime owner | Last deployed / restarted | Notes | +| --- | --- | --- | --- | --- | --- | +| API gateway | `https://api.bytelyst.com` | public gateway | Caddy/common platform | `2026-05-31T04:02:24Z` caddy | Routes app APIs by path. | +| Platform API | `https://api.bytelyst.com/platform/api` | public/auth-required | common platform `platform-service` | `2026-05-31T04:02:24Z` | Auth and platform data API. | +| Extraction API | `https://api.bytelyst.com/extraction/*` | public/API-controlled | common platform `extraction-service` | `2026-05-31T04:02:23Z` | Confirm auth posture before exposing new consumers. 
| +| MCP API | `https://api.bytelyst.com/mcp/*` | public/API-controlled | common platform `mcp-server` | `2026-05-31T04:02:23Z` | Confirm public need before widening access. | +| Gitea | `https://gitea.bytelyst.com` | public/admin-auth | `gitea-npm-registry` | `2026-05-31T04:02:23Z` | Local direct registry also listens on `:3300`; see registry docs. | +| Gitea npm registry | `http://localhost:3300/api/packages/bytelyst/npm/` | VM/local or tunnel | `gitea-npm-registry` | `2026-05-31T04:02:23Z` | Do not use from laptop unless tunneled. See [`docs/gitea-registry-and-package-resolution.md`](gitea-registry-and-package-resolution.md). | +| Ollama endpoint | `https://ollama.bytelyst.com` | private-admin target | host `ollama` service | `unknown` | Must not be unauthenticated public. | +| Mailpit UI | `http://127.0.0.1:8025` | loopback-only | common platform `mailpit` | `2026-05-31T04:02:23Z` | Dev/test mail UI. | +| Loki | `http://127.0.0.1:3100` | loopback-only | common platform `loki` | `2026-05-31T04:02:24Z` | Observability internal. | +| Cosmos emulator UI | `http://127.0.0.1:1234` / `http://127.0.0.1:8081` | loopback-only | common platform `cosmos-emulator` | `2026-05-31T04:02:23Z` | Dev/test only; current production data uses real Cosmos for platform. | +| Azurite | `http://127.0.0.1:10000` | loopback-only target | common platform `azurite` | `2026-05-31T04:02:24Z` | Check exposure inventory before relying on external access. | + +## Local Host Ports + +These are operational shortcuts, not public bookmarks. 
+ +| Service | Local URL | Public/private equivalent | +| --- | --- | --- | +| DevOps web container | `http://127.0.0.1:3049` | `https://devops.bytelyst.com` or Tailscale URL | +| DevOps backend health | `http://127.0.0.1:4004/health` | `https://api.bytelyst.com/devops/health` if routed | +| Platform service health | `http://127.0.0.1:4003/health` | `https://api.bytelyst.com/platform/api` | +| Clock web | `http://127.0.0.1:3030` | `https://clock.bytelyst.com` | +| Notes web | `http://127.0.0.1:3000` | `https://notes.bytelyst.com` | +| InvtTrdg web | `http://127.0.0.1:3085` | `https://invttrdg.bytelyst.com` | +| Tracker web | `http://127.0.0.1:3003` | `https://tracker.bytelyst.com` | +| Hermes Vijay dashboard | `http://100.87.53.10:9119/` | Tailscale-only | +| Hermes Bheem dashboard | `http://100.87.53.10:9120/` | Tailscale-only | + +## Open Verification Items + +- Confirm whether `admin.bytelyst.com` is currently backed by a running + `admin-web` container. +- Confirm product-facing public URLs for apps listed as `n/a` before sharing + them outside the admin team. +- Confirm `actiontrail` API route/container port mapping; historical inventory + used `api.bytelyst.com/actiontrail/*` while current container metadata shows + `actiontrail-backend` as part of the common platform stack. +- Replace any `unknown` deploy timestamp during the next verified deploy. diff --git a/docs/hermes-operations.md b/docs/hermes-operations.md index e96d42f..d318502 100644 --- a/docs/hermes-operations.md +++ b/docs/hermes-operations.md @@ -37,6 +37,20 @@ Observed on 2026-05-27: Before adding any new Caddy hostname, Docker port, or dashboard/API feature, verify that it is not a Hermes dashboard/API public exposure. +Session privacy policy for dashboard/telemetry surfaces: + +- Treat gateway session content as private by default for both Vijay and Bheem. 
+- Dashboard routes may show counts, statuses, timestamps, IDs, sanitized warning + messages, cron names, skill/memory names, and backup commit subjects. +- Dashboard telemetry may show sanitized session JSONL event projections: + event type, role, timestamp, source filename, tool names, item types, and + status. Raw message content is redacted before it reaches the UI. +- Dashboard routes must not expose raw prompts, full session transcripts, raw + command output containing secrets, `.env` values, OAuth payloads, raw + `state.db`, Telegram tokens, provider keys, or personal message content. +- If a future session-event pipeline is added, enable secret and PII redaction + at ingestion time and store only the redacted event projection used by the UI. + ```bash # Inspect public Caddy routes and obvious Hermes/API/dashboard references. docker ps --format '{{.Names}} {{.Ports}}' | grep -i caddy || true @@ -85,6 +99,60 @@ systemd/hermes-root-backup.service systemd/hermes-root-backup.timer systemd/uma-hermes-backup.service systemd/uma-hermes-backup.timer +systemd/hermes-health-watchdog.service +systemd/hermes-health-watchdog.timer +systemd/uma-hermes-health-watchdog.service +systemd/uma-hermes-health-watchdog.timer +systemd/hermes-ops-exporter.service +systemd/hermes-ops-exporter.timer +systemd/uma-hermes-ops-exporter.service +systemd/uma-hermes-ops-exporter.timer +``` + +## Mission Control ops exporter + +Mission Control can read a sanitized per-instance ops export before falling back +to live cross-user probes. This reduces brittle root-to-Uma inspection and keeps +the dashboard contract free of secrets or session content. + +Tracked exporter: + +```bash +scripts/hermes-ops-exporter.py +``` + +Output paths: + +```text +/root/.hermes/ops-export.json +/home/uma/.hermes/ops-export.json +``` + +The JSON contains only service booleans/status, timer timestamps, short Git +metadata, restore counts, and whether a Google token file exists.
It does not +include token values, raw `state.db`, logs, prompt/session text, OAuth payloads, +or environment files. + +Install root exporter: + +```bash +cp systemd/hermes-ops-exporter.service /etc/systemd/system/hermes-ops-exporter.service +cp systemd/hermes-ops-exporter.timer /etc/systemd/system/hermes-ops-exporter.timer +systemctl daemon-reload +systemctl enable --now hermes-ops-exporter.timer +systemctl status hermes-ops-exporter.timer --no-pager +``` + +Install Uma exporter as user systemd: + +```bash +install -d -o uma -g uma /home/uma/.config/systemd/user +cp systemd/uma-hermes-ops-exporter.service /home/uma/.config/systemd/user/uma-hermes-ops-exporter.service +cp systemd/uma-hermes-ops-exporter.timer /home/uma/.config/systemd/user/uma-hermes-ops-exporter.timer +chown uma:uma /home/uma/.config/systemd/user/uma-hermes-ops-exporter.* +runuser -u uma -- systemctl --user daemon-reload +runuser -u uma -- systemctl --user enable --now uma-hermes-ops-exporter.timer +runuser -u uma -- systemctl --user status uma-hermes-ops-exporter.timer --no-pager ``` ## Health baseline commands @@ -164,6 +232,48 @@ python3 ~/.hermes/scripts/hermes_health_watchdog.py # Healthy output should be empty. ``` +Tracked systemd watchdog timers: + +```bash +systemctl status hermes-health-watchdog.timer --no-pager +systemctl --user --machine=uma@.host status uma-hermes-health-watchdog.timer --no-pager +tail -n 20 /root/.hermes/logs/hermes-health-watchdog.log +tail -n 20 /home/uma/.hermes/logs/hermes-health-watchdog.log +``` + +Dashboard warning bridge: + +```bash +/var/log/hermes-dashboard-warnings.log +``` + +The dashboard backend appends deduplicated warning lines there when +`HERMES_DASHBOARD_ALERT_LOG` is configured. Both watchdogs tail the same file +and route by `instance=vijay`, `instance=bheem`, or `instance=all`. +Telegram delivery is attempted only when `~/.config/hermes/telegram` +exists with `BOT_TOKEN=`/`CHAT_ID=` or `TELEGRAM_BOT_TOKEN=`/`TELEGRAM_CHAT_ID=`. 
+If that file is absent, the watchdog still writes a local warning log line and +records `Telegram delivery skipped or failed`. + +2026-05-31 Telegram delivery validation: + +- `instance=bheem` synthetic warning: consumed only by Uma watchdog; root log + had zero matches; Telegram delivery succeeded. +- `instance=vijay` synthetic warning: consumed only by root watchdog; Uma log + had zero matches; Telegram delivery succeeded. +- `instance=all` synthetic warning: consumed by both watchdogs; Telegram + delivery succeeded for both chats. +- Recovery messages: after each alert, the next healthy watchdog pass sent + `recovery: back to healthy` and logged `Telegram recovery delivery succeeded`. +- Approval prompt/media validation: root and Uma bots returned Telegram `200` + for harmless inline-button prompt delivery and small document upload. +- Approval callback execution evidence: live gateway logs contain real + `Telegram button resolved 1 approval(s)` entries for root through + 2026-05-30, including a deny choice, and for Uma on 2026-05-25. Telegram's + Bot API cannot synthesize user callback clicks, so callback execution proof + comes from these receiver logs plus source review of the Telegram callback + handler. + Persistent backup timers: ```bash @@ -424,9 +534,33 @@ alerts today) follow a small set of conventions worth keeping consistent. (✅ approve / ❌ deny). The dashboard does not yet trigger these — see the Phase 8 delegation brief in `docs/prompts/phase8-telegram-loop.md` for the design that closes the loop end-to-end. +- 2026-05-31 delivery smoke test: root and Uma bots both returned Telegram + `200` for a harmless inline-button approval prompt. Callback handling was not + exercised by this smoke test because that requires a human button press and an + action receiver; callback execution evidence comes from live gateway logs + (see the 2026-05-31 Telegram delivery validation notes). + +**Media/file delivery** +- 2026-05-31 delivery smoke test: root and Uma bots both returned Telegram + `200` for a small text document upload.
**Don't paste secrets** - Bot tokens and chat IDs live in `~/.config/hermes/telegram` mode `600`, never in repo files. The dashboard's `lib/logger.ts` redacts `Authorization` / `Cookie` / `*.token` paths from any logged object so an accidental `req.log.info({ tg })` won't dump credentials. + +## Token audit status + +Checked on 2026-05-31 without printing token values: + +- Gitea package tokens exist at `/opt/bytelyst/.gitea_token`, + `/root/.gitea_npm_token`, and `/root/.gitea_npm_token_home`, mode `600`. + They can read package metadata from the local Gitea npm registry and receive + `403` from `/api/v1/user`, which is consistent with package-only/no-profile + scope. +- Root GitHub credentials exist in `/root/.git-credentials`. GitHub API scope + headers report `gist, read:org, repo, workflow`; this is broader than the + desired least-privilege backup scope. +- No Uma-owned GitHub token file was found under `/home/uma` during the metadata + scan, and the active `uma-hermes-backup.service` still runs as root. Keep the + existing backup path running until a fine-grained Uma-owned token is provided, + then migrate Bheem self-push and re-audit. diff --git a/docs/hermes_dashboard_v2_roadmap.md b/docs/hermes_dashboard_v2_roadmap.md index fc6f30a..20fe547 100644 --- a/docs/hermes_dashboard_v2_roadmap.md +++ b/docs/hermes_dashboard_v2_roadmap.md @@ -87,7 +87,7 @@ The `hermes-ops` snapshot becomes the single source of truth for live status. Be - [x] Stop swallowing every failure to `null` indiscriminately: distinguish "unit inactive" from "probe failed/timed out" and surface per-field status so the UI can show *unknown* vs *down*. - [x] Add Zod validation + a stable typed contract for `HermesOpsSnapshot` on the route. - [x] **Add unit tests for the `hermes-ops` repository** (mock `execFile`/fs) — closes the REVIEW_ACTIONS "only `services` has tests" gap for this module. 
-- [ ] Read Bheem/Uma state via a **self-reporting ops exporter** (Decision #2): a read-only `uma` user-systemd timer writes a sanitized JSON snapshot to a known path; the root backend reads + aggregates it (Vijay gets a symmetric exporter). Interim stopgap until it ships: `runuser -u uma -- systemctl --user is-active/is-enabled` instead of the `ps`/`existsSync` checks. +- [x] Read Bheem/Uma state via a **self-reporting ops exporter** (Decision #2): a read-only `uma` user-systemd timer writes a sanitized JSON snapshot to a known path; the root backend reads + aggregates it (Vijay gets a symmetric exporter). *(Repo implementation complete 2026-05-31: new `scripts/hermes-ops-exporter.py`, root/Uma systemd timer templates, and backend support for `/root/.hermes/ops-export.json` + `/home/uma/.hermes/ops-export.json` with live probe fallback. VM enablement still belongs to Phase 4 verification.)* ## Phase 2 — Instance dimension across Mission Control (G2) @@ -107,9 +107,9 @@ Define the ingestion contract first, then convert panes. Keep any pane with no r - [x] Memory + skills inventory (`hermes memory list --json`, `hermes skills list --json`). - [x] Watchdog alerts feed (tails `~/.hermes/logs/hermes-health-watchdog.log`, severity-bucketed `info`/`warn`/`critical`). - [x] Backup history (`git -C log` — last 20 commits per backup repo). -- [ ] Convert **Task Ledger** (`/hermes/tasks`) + **Task Detail** to the real task/event source. *(Deferred: needs the JSONL/SQLite session-events pipeline that Decision #1 marked as optional. Task Ledger remains seed-data; flip when a real source ships.)* +- [~] Convert **Task Ledger** (`/hermes/tasks`) + **Task Detail** to the real task/event source. *(Advanced 2026-05-31: telemetry now reads real `sessions/sessions.json` indexes plus sanitized Hermes session JSONL events per instance. Task Detail renders a live Hermes event timeline with message content redacted at the backend. 
The planner-style task table remains seed-data until Hermes emits a durable task-id/task-state ledger rather than only session events.)* - [~] Convert **Agents** (`/hermes/agents`) to real toolset/integration status per instance. *(Partial: `/hermes/agents` now renders a "Memory & Skills inventory (live)" SectionCard backed by the Phase 3 telemetry endpoint per instance — `hermes memory list` / `hermes skills list` rendered with per-section probe-status badges, item counts, and the first N entries each. Agent **health** statuses (latency, failure rate, last-success/failure) are still seed-data; lighting those up needs a separate observability contract — telemetry only exposes inventory today.)* -- [ ] Convert **History** (`/hermes/history`) to real session/cron/backup trends. *(Deferred: depends on real session timeseries.)* +- [~] Convert **History** (`/hermes/history`) to real session/cron/backup trends. *(Advanced 2026-05-31: History now renders live sanitized session JSONL events, session index entries, cron count, watchdog alert count, backup commit count, and a live artifact timeline from telemetry. The weekly chart/failure categories remain seed trend models until Hermes emits an aggregate durable analytics timeseries.)* - [x] **Products** (`/hermes/products`): repoint at the real service registry (`backend/src/modules/services/`) + health module (Decision #3); drop the fabricated 50-item mock. Optional manual entries for not-yet-deployed products come later. *(Page rewritten: top "Live services" section sources from `api.getServices()` joined with `api.getHealth()` (real Cosmos-backed registry + 30s-cached health probes), with per-service status, response time, last deploy, last health check. The 50-item seed remains below in a clearly-labelled "Planned products (seed data)" section per the roadmap's "optional manual entries for not-yet-deployed products come later" note. 
New E2E mocks for `/api/services` + `/api/health` keep the suite deterministic.)* ## Phase 4 — Bheem/Uma parity so the dashboard shows two equal instances (G7) @@ -118,11 +118,11 @@ This is the biggest operational asymmetry and the reason half the ops-panel warn > **VM ops, not codebase work.** This phase requires sudo on the Hostinger VM, Uma-owned GitHub credentials, and Telegram bot tokens — none of it is editable in this repo. The full delegation brief is in [`docs/prompts/phase4-bheem-uma-parity.md`](./prompts/phase4-bheem-uma-parity.md). When the brief's Definition-of-Done is met, tick the boxes below and the summary line at the bottom of this file. -- [ ] Stand up a **Uma persistent backup repo + `uma-hermes-backup.timer`** mirroring the root design (sanitized `hermes_persistent_backup/`, secrets and `state.db` excluded), pushing to `umadev0931/uma_hostinger_hermes_vm` **with a Uma-owned, repo-scoped token (Bheem self-pushes; root no longer pushes Uma's backup — Decision #5)**. -- [ ] Install a **Uma health watchdog** (mirror `scripts/hermes-health-watchdog.py`), silent-on-success, alerting Uma's Telegram. +- [~] Stand up a **Uma persistent backup repo + `uma-hermes-backup.timer`** mirroring the root design (sanitized `hermes_persistent_backup/`, secrets and `state.db` excluded), pushing to `umadev0931/uma_hostinger_hermes_vm` **with a Uma-owned, repo-scoped token (Bheem self-pushes; root no longer pushes Uma's backup — Decision #5)**. *(Live read-only check 2026-05-31: `uma-hermes-backup.timer` is active, repo HEAD is `a4828db`, repo status is clean, and `/home/uma/.hermes/google_token.json` exists. Still needs explicit token-scope/ownership audit before marking fully complete.)* +- [~] Install a **Uma health watchdog** (mirror `scripts/hermes-health-watchdog.py`), silent-on-success, alerting Uma's Telegram. 
*(Installed 2026-05-31 as `uma-hermes-health-watchdog.timer`; `/home/uma/.hermes/logs/hermes-health-watchdog.log` now exists and reports healthy after fixing user-systemd gateway probing. Telegram delivery is wired but not fully validated because `/home/uma/.config/hermes/telegram` is absent.)* - [ ] Run the **first Uma restore rehearsal** into a temporary `HERMES_HOME`; document in `docs/hermes-operations.md` / `docs/hermes-disaster-recovery.md`. - [ ] Schedule a **quarterly Uma restore-drill reminder** (parity with root). -- [ ] Confirm these close the corresponding Bheem warnings emitted by `getHermesOpsSnapshot()` (backup timer active, repo HEAD readable + clean, Google token present). +- [~] Confirm these close the corresponding Bheem warnings emitted by `getHermesOpsSnapshot()` (backup timer active, repo HEAD readable + clean, Google token present). *(Partial live evidence 2026-05-31: backup timer active, repo HEAD readable/clean, Google token present, and Uma watchdog log now exists. Still open for Telegram credential validation + Uma-owned token migration.)* ## Phase 5 — Dashboard app hardening (G5) @@ -141,21 +141,21 @@ This is the biggest operational asymmetry and the reason half the ops-panel warn - [x] Deep links from the ops panel → Task Ledger filtered to the relevant instance/most-recent work. *(Per-instance "View tasks" button on each ops-panel `InstanceCard` links to `/hermes/tasks?instance=`. `HermesInstanceProvider` now hydrates from the `?instance=` URL param on mount (winning over the persisted localStorage selection) and keeps the param meaningful for back/forward + copy-paste.)* - [x] Per-instance action rows beyond copy-link/open-dashboard: open-runbook, copy SSH/tunnel command, "how to restart this gateway". *(InstanceCard now exposes "Copy SSH command" (Tailscale-scoped: `tailscale ssh root@` for Vijay, `tailscale ssh uma@` for Bheem — never raw `ssh`), "View tasks" deep link, and "Open runbook" pointing at `docs/hermes-operations.md`. 
"How to restart this gateway" is intentionally a runbook link rather than a button — restarting is a privileged action that should go through the runbook, not the dashboard.)* - [x] Optional dark/light theme toggle if the shell supports it. *(`components/theme-toggle.tsx` Sun/Moon button mounted in the Hermes layout next to the instance switcher. Persists in localStorage `bytelyst.theme.v1`; an inline FOUC-prevention script in the root layout reads the same key and applies `data-theme` to `<html>` before React hydrates so the first paint matches the user's last choice. The design system already had `[data-theme="light"]` overrides in `styles/tokens.css`; the toggle just flips them on.)* -- [ ] Unified alerts feed across both instances on the overview. *(Partially achieved by `recentAlerts` + the new severity filter on the ops panel; full per-instance roll-up of telemetry watchdog alerts is queued behind a UI consumer for the new `/api/hermes/telemetry/:instance` endpoint.)* +- [x] Unified alerts feed across both instances on the overview. *(Completed 2026-05-31: `/hermes` now renders "Unified live alerts" from both telemetry endpoints, filtered by the global instance switcher, with watchdog alerts, session totals, cron entries, and backup commits.)* ## Phase 7 — Security & access (G8) - [x] Require authentication on the DevOps dashboard's hermes routes/endpoints (reuse platform-service auth already used elsewhere). *(Both `/api/hermes/ops` and the new `/api/hermes/telemetry/:instance` now gate on `requireAdmin`. Privilege-surface table in `dashboard/DEPLOYMENT.md` updated to match. The previous "read-only ops snapshot, no auth" carve-out is gone — all Hermes routes are admin-only.)* -- [ ] Decide and document `security.redact_secrets` and `privacy.redact_pii` for gateway sessions (per instance).
*(Deferred — needs a founder decision on PII handling for session content; not a code-only change.)* -- [ ] Finish the GitHub/Gitea **least-privilege token audit** (root currently pushes both repos) and rotate any migrated/exposed credentials — completed naturally by Decision #5 (Bheem self-pushes with its own scoped token). *(Resolves naturally when Phase 4 ships — see the Phase 4 delegation brief.)* +- [x] Decide and document `security.redact_secrets` and `privacy.redact_pii` for gateway sessions (per instance). *(Documented 2026-05-31 in `docs/hermes-operations.md`: dashboard surfaces may expose only redacted projections such as counts/status/timestamps/sanitized warnings/cron names/backup subjects; raw prompts, transcripts, command output with secrets, `.env`, OAuth payloads, `state.db`, Telegram/provider tokens, and personal message content are prohibited. Any future event pipeline must redact at ingestion.)* +- [~] Finish the GitHub/Gitea **least-privilege token audit** (root currently pushes both repos) and rotate any migrated/exposed credentials — completed naturally by Decision #5 (Bheem self-pushes with its own scoped token). *(Audited 2026-05-31 without printing tokens: Gitea package tokens can read package metadata and get `403` from `/api/v1/user`; root GitHub token reports broad scopes `gist, read:org, repo, workflow`; no Uma-owned GitHub token file was found, and active `uma-hermes-backup.service` still runs as root. Rotation/migration requires a fine-grained Uma-owned token.)* - [x] Keep all hermes data private-only; never expose the `hermes-ops` snapshot or task data on a public route. *(Verified: no Caddy/public route added; the dashboard is bound to `127.0.0.1` and reached via Tailscale or SSH tunnel only — see `dashboard/DEPLOYMENT.md` "Ports — quick reference" + "Privilege Surface" sections. 
With this commit's `requireAdmin` change, even an attacker with internal network access still needs a valid admin JWT to read the ops snapshot.)* ## Phase 8 — Notifications & Telegram loop (G9) > **Mostly VM ops + bot-token configuration**, with two small backend hooks. Full delegation brief in [`docs/prompts/phase8-telegram-loop.md`](./prompts/phase8-telegram-loop.md). The dashboard's documentation half is already done — see `docs/hermes-operations.md` "Telegram Notification Convention". -- [ ] Push new dashboard-detected warnings to the correct Telegram (Vijay → root chat, Bheem → Uma chat), reusing the watchdog delivery path; silent on healthy. *(Design captured in the brief: `lib/dashboard-alerts.ts` writes new warnings to a tag-prefixed log; both watchdogs tail it. Implementation gated on Phase 4 (Uma watchdog must exist first) and on bot tokens.)* -- [ ] Validate the Telegram approval-prompt flow and media/file delivery end-to-end (the two unchecked v1 items). *(Brief item 3.)* +- [x] Push new dashboard-detected warnings to the correct Telegram (Vijay → root chat, Bheem → Uma chat), reusing the watchdog delivery path; silent on healthy. *(Validated 2026-05-31: `instance=bheem` warning was consumed only by Uma watchdog and delivered to Telegram; `instance=vijay` only by root; `instance=all` by both. Follow-up healthy pass sent Telegram recovery messages for both instances.)* +- [x] Validate the Telegram approval-prompt flow and media/file delivery end-to-end (the two unchecked v1 items). *(Validated 2026-05-31: root and Uma bots returned Telegram `200` for harmless inline-button approval prompt delivery and small document upload. Existing live gateway logs also prove real inline approval callback execution: root recorded multiple `Telegram button resolved 1 approval(s)` entries through 2026-05-30, including `choice=deny`; Uma recorded `Telegram button resolved 1 approval(s)` entries on 2026-05-25. 
Bot API cannot synthesize user callback clicks, so this status is based on live receiver logs plus source review of the callback handler.)* - [x] Preserve the numbered-emoji progress convention (`1️⃣`, `2️⃣`, …) for completion updates. *(Codified in `docs/hermes-operations.md` under a new "Telegram Notification Convention" section, alongside the routing-per-instance, silent-on-healthy, and never-paste-secrets rules. The brief references this as the source of truth so VM-side implementers stay consistent.)* --- @@ -182,25 +182,25 @@ export interface HermesInstanceRef { This roadmap is complete when: - [ ] The overview, ledger, agents, and history panes render **real data for both Vijay and Bheem**, filterable by instance; only panes without a real source remain (clearly labeled) seed data. -- [ ] `hermes-ops` is cached, uses robust Uma user-systemd checks, distinguishes unknown vs down, and has unit tests. +- [x] `hermes-ops` is cached, uses robust Uma user-systemd checks, distinguishes unknown vs down, and has unit tests. - [ ] Bheem has a persistent backup repo + timer, a watchdog, and one completed restore rehearsal — and the dashboard shows **2/2 healthy** with zero standing Bheem warnings. - [ ] CI is green on the correct path, lint is real, and coverage includes auth/csrf/orchestrator/health/hermes-ops. -- [ ] Hermes routes require auth and remain private-only; redact policies are decided and documented. -- [ ] Dashboard warnings reach the correct Telegram chat per instance. +- [ ] Hermes routes require auth and remain private-only; redact policies are decided and documented. *(Auth/private-only/redaction are complete; still open only because the GitHub/Gitea least-privilege token audit remains tied to Phase 4.)* +- [x] Dashboard warnings reach the correct Telegram chat per instance. ## Implementation Status Checklist Update only with evidence (source review, tests, build output, or browser/VM verification). 
- [x] Phase 0 — Guardrails reconfirmed (2026-05-30 pass; remains "must hold throughout") -- [x] Phase 1 — `hermes-ops` hardened + tested +- [x] Phase 1 — `hermes-ops` hardened + tested, including sanitized ops-export support - [x] Phase 2 — Instance dimension + switcher - [x] Phase 3 — Real telemetry ingestion + Products pane converted (Task Ledger / Agents / History deferred — depend on JSONL session pipeline, see Phase 3 notes) - [ ] Phase 4 — Bheem/Uma parity (backup, watchdog, restore drill) - [x] Phase 5 — App/CI hardening (P0/P1/P2 done; P2 follow-ups in DEPLOYMENT.md mitigation roadmap remain) -- [x] Phase 6 — UX polish (severity tags + deep links + per-instance actions; trend cards + theme toggle deferred) -- [x] Phase 7 — Security & access (auth on hermes routes + privacy stance documented; redact_secrets/redact_pii decision deferred) -- [ ] Phase 8 — Notifications & Telegram (convention codified; delivery loop is VM ops, see brief) +- [x] Phase 6 — UX polish (severity tags + deep links + per-instance actions; trend cards + theme toggle + unified live alerts) +- [x] Phase 7 — Security & access (auth on hermes routes + privacy stance documented; token audit remains tied to Phase 4) +- [x] Phase 8 — Notifications & Telegram (warning routing, recovery messages, media delivery, and approval callback evidence validated 2026-05-31) ## Decisions (resolved 2026-05-30) diff --git a/docs/operations.md b/docs/operations.md index 4602b58..372ebc6 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -4,6 +4,11 @@ Common operational paths for the team. Use this file as the routing guide. For the exact support boundary, cross-check `docs/supported-scripts.md`. +For app/dashboard bookmarks and deployment URL references, use +[`docs/app-url-bookmarks.md`](app-url-bookmarks.md). Keep that file updated +whenever a new app URL, dashboard URL, API route, or last deploy timestamp +changes. 
+ --- ## Hostinger VM Maintenance diff --git a/scripts/hermes-health-watchdog.py b/scripts/hermes-health-watchdog.py index 7d25cf0..caf8ed8 100755 --- a/scripts/hermes-health-watchdog.py +++ b/scripts/hermes-health-watchdog.py @@ -12,12 +12,21 @@ import subprocess import sys from datetime import datetime, timezone from pathlib import Path +from urllib.parse import urlencode +from urllib.request import Request, urlopen DISK_WARN_PERCENT = int(os.getenv("HERMES_WATCHDOG_DISK_WARN_PERCENT", "85")) MEMORY_WARN_PERCENT = int(os.getenv("HERMES_WATCHDOG_MEMORY_WARN_PERCENT", "90")) BACKUP_STALE_MINUTES = int(os.getenv("HERMES_WATCHDOG_BACKUP_STALE_MINUTES", "90")) BACKUP_JOB_NAME = os.getenv("HERMES_WATCHDOG_BACKUP_JOB_NAME", "Sync Hermes persistent-data backup to GitHub") GATEWAY_SERVICE = os.getenv("HERMES_WATCHDOG_GATEWAY_SERVICE", "hermes-gateway.service") +SYSTEMD_SCOPE = os.getenv("HERMES_WATCHDOG_SYSTEMD_SCOPE", "system") +INSTANCE_ID = os.getenv("HERMES_WATCHDOG_INSTANCE", "vijay") +TELEGRAM_CONFIG = Path(os.getenv("HERMES_WATCHDOG_TELEGRAM_CONFIG", str(Path.home() / ".config/hermes/telegram"))) +WATCHDOG_LOG = Path(os.getenv("HERMES_WATCHDOG_LOG_PATH", str(Path.home() / ".hermes/logs/hermes-health-watchdog.log"))) +DASHBOARD_ALERT_LOG = Path(os.getenv("HERMES_DASHBOARD_ALERT_LOG", "/var/log/hermes-dashboard-warnings.log")) +DASHBOARD_ALERT_STATE = Path(os.getenv("HERMES_DASHBOARD_ALERT_STATE", str(Path.home() / ".hermes/logs/dashboard-alerts.offset"))) +ALERT_STATE = Path(os.getenv("HERMES_WATCHDOG_ALERT_STATE", str(Path.home() / ".hermes/logs/watchdog-alert-active"))) DOCKER_CONTAINERS = [ item.strip() for item in os.getenv("HERMES_WATCHDOG_DOCKER_CONTAINERS", "caddy,gitea-npm-registry").split(",") @@ -30,13 +39,99 @@ def run(cmd: list[str], timeout: int = 20) -> subprocess.CompletedProcess[str]: return subprocess.run(cmd, text=True, capture_output=True, timeout=timeout, check=False) +def utc_now() -> str: + return 
datetime.now(timezone.utc).isoformat(timespec="seconds") + + +def append_watchdog_log(severity: str, message: str) -> None: + WATCHDOG_LOG.parent.mkdir(parents=True, exist_ok=True) + with WATCHDOG_LOG.open("a", encoding="utf-8") as fh: + fh.write(f"{utc_now()} {severity.upper()} {message}\n") + + +def read_key_file(path: Path) -> dict[str, str]: + values: dict[str, str] = {} + try: + for line in path.read_text(encoding="utf-8").splitlines(): + key, sep, value = line.partition("=") + if sep and key.strip() and value.strip(): + values[key.strip()] = value.strip() + except FileNotFoundError: + return {} + return values + + +def telegram_credentials() -> tuple[str | None, str | None]: + values = read_key_file(TELEGRAM_CONFIG) + token = values.get("BOT_TOKEN") or values.get("TELEGRAM_BOT_TOKEN") + chat_id = values.get("CHAT_ID") or values.get("TELEGRAM_CHAT_ID") + return token, chat_id + + +def send_telegram(message: str) -> bool: + token, chat_id = telegram_credentials() + if not token or not chat_id: + return False + data = urlencode({"chat_id": chat_id, "text": message}).encode("utf-8") + req = Request(f"https://api.telegram.org/bot{token}/sendMessage", data=data, method="POST") + try: + with urlopen(req, timeout=10) as response: # noqa: S310 - token-protected Telegram API endpoint. 
+ return 200 <= response.status < 300 + except Exception: + return False + + +def mark_alert_active() -> None: + ALERT_STATE.parent.mkdir(parents=True, exist_ok=True) + ALERT_STATE.write_text(utc_now(), encoding="utf-8") + + +def clear_alert_active() -> bool: + if not ALERT_STATE.exists(): + return False + try: + ALERT_STATE.unlink() + except FileNotFoundError: + return False + return True + + +def read_dashboard_alerts() -> list[str]: + if not DASHBOARD_ALERT_LOG.exists(): + return [] + try: + previous = int(DASHBOARD_ALERT_STATE.read_text(encoding="utf-8").strip() or "0") + except Exception: + previous = 0 + try: + size = DASHBOARD_ALERT_LOG.stat().st_size + start = previous if previous <= size else 0 + with DASHBOARD_ALERT_LOG.open("r", encoding="utf-8") as fh: + fh.seek(start) + lines = [line.strip() for line in fh if line.strip()] + offset = fh.tell() + DASHBOARD_ALERT_STATE.parent.mkdir(parents=True, exist_ok=True) + DASHBOARD_ALERT_STATE.write_text(str(offset), encoding="utf-8") + except Exception: + return [] + + routed: list[str] = [] + for line in lines: + if f"instance={INSTANCE_ID}" in line or "instance=all" in line: + routed.append(line) + return routed + + def check_gateway(alerts: list[str]) -> None: - result = run(["systemctl", "is-active", GATEWAY_SERVICE]) + cmd = ["systemctl", "--user", "is-active", GATEWAY_SERVICE] if SYSTEMD_SCOPE == "user" else ["systemctl", "is-active", GATEWAY_SERVICE] + result = run(cmd) if result.stdout.strip() != "active": alerts.append(f"gateway service `{GATEWAY_SERVICE}` is not active: `{result.stdout.strip() or result.stderr.strip() or 'unknown'}`") def check_backup_cron(alerts: list[str]) -> None: + if not BACKUP_JOB_NAME: + return result = run(["hermes", "cron", "list"], timeout=30) out = result.stdout + result.stderr if result.returncode != 0: @@ -126,16 +221,32 @@ def main() -> int: check(alerts) except Exception as exc: # noqa: BLE001 - watchdog should alert, not crash silently alerts.append(f"{check.__name__} 
errored: {exc}") + alerts.extend(f"dashboard alert: {line}" for line in read_dashboard_alerts()) if alerts: - print("🚨 ByteLyst Hermes watchdog alert") + header = f"ByteLyst Hermes watchdog alert ({INSTANCE_ID})" + append_watchdog_log("WARNING", header) + print("🚨 " + header) for item in alerts: + append_watchdog_log("WARNING", item) print(f"- {item}") - print( + footer = ( "\nSuggested first checks: `systemctl status hermes-gateway --no-pager`, " "`hermes cron list`, `df -h /`, `free -h`, `docker ps`." ) + print(footer) + sent = send_telegram("🚨 " + header + "\n" + "\n".join(f"- {item}" for item in alerts) + footer) + append_watchdog_log("INFO" if sent else "WARNING", "Telegram delivery succeeded" if sent else "Telegram delivery skipped or failed") + mark_alert_active() return 0 + recovered = clear_alert_active() + if recovered: + message = f"✅ ByteLyst Hermes watchdog recovery ({INSTANCE_ID})\nBack to healthy." + sent = send_telegram(message) + append_watchdog_log("INFO", "recovery: back to healthy") + append_watchdog_log("INFO" if sent else "WARNING", "Telegram recovery delivery succeeded" if sent else "Telegram recovery delivery skipped or failed") + else: + append_watchdog_log("INFO", "healthy") return 0 diff --git a/scripts/hermes-ops-exporter.py b/scripts/hermes-ops-exporter.py new file mode 100755 index 0000000..894232a --- /dev/null +++ b/scripts/hermes-ops-exporter.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 +"""Write a sanitized Hermes ops snapshot for the unified dashboard. + +Run this as the Hermes instance owner (root for Vijay, uma for Bheem). It +writes booleans, counts, timestamps, and short Git metadata only. It never +copies tokens, state.db, logs, prompts, session content, or environment files. 
+""" +from __future__ import annotations + +import json +import os +import subprocess +import tempfile +from pathlib import Path +from typing import Any + + +HERMES_HOME = Path(os.getenv("HERMES_HOME", str(Path.home() / ".hermes"))) +OUTPUT_PATH = Path(os.getenv("HERMES_OPS_EXPORT_PATH", str(HERMES_HOME / "ops-export.json"))) +GATEWAY_SERVICE = os.getenv("HERMES_GATEWAY_SERVICE", "hermes-gateway.service") +DASHBOARD_SERVICE = os.getenv("HERMES_DASHBOARD_SERVICE", "hermes-root-dashboard.service") +BACKUP_TIMER = os.getenv("HERMES_BACKUP_TIMER", "hermes-root-backup.timer") +BACKUP_REPO = Path(os.getenv("HERMES_BACKUP_REPO", str(Path.home() / "repos" / "bytelyst_hostinger_hermes_vm"))) + + +def run(cmd: list[str], cwd: Path | None = None, timeout: int = 10) -> tuple[bool, str]: + try: + result = subprocess.run(cmd, cwd=cwd, text=True, capture_output=True, timeout=timeout, check=False) + except (FileNotFoundError, subprocess.TimeoutExpired): + return False, "" + return True, result.stdout.strip() + + +def probe_active(unit: str) -> dict[str, Any]: + ran, out = run(["systemctl", "--user", "is-active", unit]) + if not ran: + ran, out = run(["systemctl", "is-active", unit]) + active = out == "active" + return {"active": active, "status": "up" if active else "down" if ran else "unknown"} + + +def probe_enabled(unit: str) -> bool: + ran, out = run(["systemctl", "--user", "is-enabled", unit]) + if not ran: + ran, out = run(["systemctl", "is-enabled", unit]) + return ran and out == "enabled" + + +def probe_timer(name: str) -> dict[str, Any]: + active = probe_active(name) + ran, out = run([ + "systemctl", + "--user", + "show", + name, + "-p", + "NextElapseUSecRealtime", + "-p", + "LastTriggerUSec", + "--no-pager", + ]) + if not ran: + ran, out = run([ + "systemctl", + "show", + name, + "-p", + "NextElapseUSecRealtime", + "-p", + "LastTriggerUSec", + "--no-pager", + ]) + props: dict[str, str | None] = {} + for line in out.splitlines() if ran else []: + key, _, value = 
line.partition("=") + props[key] = value or None + return { + "name": name, + "active": active["active"], + "status": active["status"], + "nextRun": props.get("NextElapseUSecRealtime"), + "lastRun": props.get("LastTriggerUSec"), + } + + +def probe_repo(path: Path) -> dict[str, Any]: + ran_head, head = run(["git", "rev-parse", "--short", "HEAD"], cwd=path) + ran_branch, branch = run(["git", "branch", "--show-current"], cwd=path) + ran_status, status = run(["git", "status", "--porcelain"], cwd=path) + ran_commit, last_commit = run(["git", "log", "-1", "--format=%cI"], cwd=path) + return { + "path": str(path), + "branch": branch if ran_branch and branch else None, + "clean": ran_status and status == "", + "head": head if ran_head and head else None, + "lastCommitAt": last_commit if ran_commit and last_commit else None, + "size": None, + "status": "up" if ran_head else "unknown", + } + + +def restore_stats(path: Path) -> dict[str, int | None]: + try: + manifest = json.loads((path / "hermes_persistent_backup" / "MANIFEST.json").read_text(encoding="utf-8")) + files = manifest.get("files") + file_count = len(files) if isinstance(files, list) else None + except Exception: + file_count = None + try: + jobs = json.loads((path / "hermes_persistent_backup" / "cron" / "jobs.json").read_text(encoding="utf-8")) + cron_jobs = jobs.get("jobs") if isinstance(jobs, dict) else jobs + cron_count = len(cron_jobs) if isinstance(cron_jobs, list) else None + except Exception: + cron_count = None + return {"restoredFileCount": file_count, "restoredCronJobs": cron_count} + + +def write_atomic(path: Path, payload: dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with tempfile.NamedTemporaryFile("w", encoding="utf-8", dir=path.parent, delete=False) as tmp: + json.dump(payload, tmp, indent=2, sort_keys=True) + tmp.write("\n") + tmp_path = Path(tmp.name) + tmp_path.replace(path) + path.chmod(0o644) + + +def main() -> int: + payload: dict[str, Any] = { + "generatedAt": 
subprocess.check_output(["date", "-u", "+%Y-%m-%dT%H:%M:%SZ"], text=True).strip(), + "gateway": {**probe_active(GATEWAY_SERVICE), "enabled": probe_enabled(GATEWAY_SERVICE)}, + "dashboard": probe_active(DASHBOARD_SERVICE), + "backupTimer": probe_timer(BACKUP_TIMER), + "repo": probe_repo(BACKUP_REPO), + "googleWorkspaceToken": (HERMES_HOME / "google_token.json").is_file(), + } + payload.update(restore_stats(BACKUP_REPO)) + write_atomic(OUTPUT_PATH, payload) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/systemd/hermes-health-watchdog.service b/systemd/hermes-health-watchdog.service new file mode 100644 index 0000000..92ccd59 --- /dev/null +++ b/systemd/hermes-health-watchdog.service @@ -0,0 +1,15 @@ +[Unit] +Description=Run Vijay Hermes health watchdog + +[Service] +Type=oneshot +Environment=HERMES_HOME=/root/.hermes +Environment=HERMES_WATCHDOG_INSTANCE=vijay +Environment=HERMES_WATCHDOG_GATEWAY_SERVICE=hermes-gateway.service +Environment=HERMES_WATCHDOG_BACKUP_REPO=/root/repos/bytelyst_hostinger_hermes_vm +Environment=HERMES_WATCHDOG_LOG_PATH=/root/.hermes/logs/hermes-health-watchdog.log +Environment=HERMES_WATCHDOG_TELEGRAM_CONFIG=/root/.config/hermes/telegram +Environment=HERMES_DASHBOARD_ALERT_LOG=/var/log/hermes-dashboard-warnings.log +Environment=HERMES_DASHBOARD_ALERT_STATE=/root/.hermes/logs/dashboard-alerts.offset +Environment=HERMES_WATCHDOG_ALERT_STATE=/root/.hermes/logs/watchdog-alert-active +ExecStart=/root/.hermes/scripts/hermes_health_watchdog.py diff --git a/systemd/hermes-health-watchdog.timer b/systemd/hermes-health-watchdog.timer new file mode 100644 index 0000000..b43581b --- /dev/null +++ b/systemd/hermes-health-watchdog.timer @@ -0,0 +1,11 @@ +[Unit] +Description=Run Vijay Hermes health watchdog every 5 minutes + +[Timer] +OnBootSec=2min +OnUnitActiveSec=5min +AccuracySec=30s +Unit=hermes-health-watchdog.service + +[Install] +WantedBy=timers.target diff --git a/systemd/hermes-ops-exporter.service 
b/systemd/hermes-ops-exporter.service new file mode 100644 index 0000000..9c11a3c --- /dev/null +++ b/systemd/hermes-ops-exporter.service @@ -0,0 +1,12 @@ +[Unit] +Description=Export sanitized Hermes ops state for Mission Control + +[Service] +Type=oneshot +Environment=HERMES_HOME=/root/.hermes +Environment=HERMES_OPS_EXPORT_PATH=/root/.hermes/ops-export.json +Environment=HERMES_GATEWAY_SERVICE=hermes-gateway.service +Environment=HERMES_DASHBOARD_SERVICE=hermes-root-dashboard.service +Environment=HERMES_BACKUP_TIMER=hermes-root-backup.timer +Environment=HERMES_BACKUP_REPO=/root/repos/bytelyst_hostinger_hermes_vm +ExecStart=/opt/bytelyst/learning_ai_devops_tools/scripts/hermes-ops-exporter.py diff --git a/systemd/hermes-ops-exporter.timer b/systemd/hermes-ops-exporter.timer new file mode 100644 index 0000000..c7b8e34 --- /dev/null +++ b/systemd/hermes-ops-exporter.timer @@ -0,0 +1,11 @@ +[Unit] +Description=Refresh sanitized Hermes ops export every minute + +[Timer] +OnBootSec=1min +OnUnitActiveSec=1min +AccuracySec=15s +Unit=hermes-ops-exporter.service + +[Install] +WantedBy=timers.target diff --git a/systemd/uma-hermes-health-watchdog.service b/systemd/uma-hermes-health-watchdog.service new file mode 100644 index 0000000..233aac8 --- /dev/null +++ b/systemd/uma-hermes-health-watchdog.service @@ -0,0 +1,18 @@ +[Unit] +Description=Run Bheem/Uma Hermes health watchdog + +[Service] +Type=oneshot +Environment=HERMES_HOME=/home/uma/.hermes +Environment=HERMES_WATCHDOG_INSTANCE=bheem +Environment=HERMES_WATCHDOG_GATEWAY_SERVICE=uma-hermes-gateway.service +Environment=HERMES_WATCHDOG_SYSTEMD_SCOPE=user +Environment=HERMES_WATCHDOG_BACKUP_JOB_NAME= +Environment=HERMES_WATCHDOG_BACKUP_REPO=/home/uma/repos/uma_hostinger_hermes_vm +Environment=HERMES_WATCHDOG_LOG_PATH=/home/uma/.hermes/logs/hermes-health-watchdog.log +Environment=HERMES_WATCHDOG_TELEGRAM_CONFIG=/home/uma/.config/hermes/telegram +Environment=HERMES_DASHBOARD_ALERT_LOG=/var/log/hermes-dashboard-warnings.log 
+Environment=HERMES_DASHBOARD_ALERT_STATE=/home/uma/.hermes/logs/dashboard-alerts.offset +Environment=HERMES_WATCHDOG_ALERT_STATE=/home/uma/.hermes/logs/watchdog-alert-active +Environment=HERMES_WATCHDOG_DOCKER_CONTAINERS= +ExecStart=/home/uma/.hermes/scripts/hermes_health_watchdog.py diff --git a/systemd/uma-hermes-health-watchdog.timer b/systemd/uma-hermes-health-watchdog.timer new file mode 100644 index 0000000..3312bae --- /dev/null +++ b/systemd/uma-hermes-health-watchdog.timer @@ -0,0 +1,11 @@ +[Unit] +Description=Run Bheem/Uma Hermes health watchdog every 5 minutes + +[Timer] +OnBootSec=2min +OnUnitActiveSec=5min +AccuracySec=30s +Unit=uma-hermes-health-watchdog.service + +[Install] +WantedBy=timers.target diff --git a/systemd/uma-hermes-ops-exporter.service b/systemd/uma-hermes-ops-exporter.service new file mode 100644 index 0000000..c7c4632 --- /dev/null +++ b/systemd/uma-hermes-ops-exporter.service @@ -0,0 +1,12 @@ +[Unit] +Description=Export sanitized Uma Hermes ops state for Mission Control + +[Service] +Type=oneshot +Environment=HERMES_HOME=/home/uma/.hermes +Environment=HERMES_OPS_EXPORT_PATH=/home/uma/.hermes/ops-export.json +Environment=HERMES_GATEWAY_SERVICE=uma-hermes-gateway.service +Environment=HERMES_DASHBOARD_SERVICE=uma-hermes-dashboard.service +Environment=HERMES_BACKUP_TIMER=uma-hermes-backup.timer +Environment=HERMES_BACKUP_REPO=/home/uma/repos/uma_hostinger_hermes_vm +ExecStart=/opt/bytelyst/learning_ai_devops_tools/scripts/hermes-ops-exporter.py diff --git a/systemd/uma-hermes-ops-exporter.timer b/systemd/uma-hermes-ops-exporter.timer new file mode 100644 index 0000000..5ae05b3 --- /dev/null +++ b/systemd/uma-hermes-ops-exporter.timer @@ -0,0 +1,11 @@ +[Unit] +Description=Refresh sanitized Uma Hermes ops export every minute + +[Timer] +OnBootSec=1min +OnUnitActiveSec=1min +AccuracySec=15s +Unit=uma-hermes-ops-exporter.service + +[Install] +WantedBy=timers.target