feat: complete hermes telemetry dashboard wiring

This commit is contained in:
Hermes VM 2026-05-31 08:28:12 +00:00
parent 38aefb05e4
commit 02b362399b
31 changed files with 1695 additions and 43 deletions

View File

@ -10,6 +10,10 @@ This guide covers deploying both the DevOps Dashboard and Platform Admin Dashboa
## Public URLs
For the full living bookmark list across all ByteLyst apps, APIs, Hermes
dashboards, and last deploy timestamps, see
[`../docs/app-url-bookmarks.md`](../docs/app-url-bookmarks.md).
- **DevOps Dashboard**: `https://devops.bytelyst.com`
- **Admin Dashboard**: `https://admin.bytelyst.com`
- **API Gateway**: `https://api.bytelyst.com`

View File

@ -0,0 +1,44 @@
import { beforeEach, describe, expect, it, vi } from 'vitest';
const appendFileMock = vi.hoisted(() => vi.fn());
vi.mock('fs/promises', () => ({ appendFile: appendFileMock }));
const { appendDashboardWarning, clearDashboardWarningDedupe } = await import('./dashboard-alerts.js');
// Unit tests for the dashboard alert log writer. `fs/promises.appendFile` is
// mocked above, so no file is ever written; the tests only inspect the calls.
describe('dashboard-alerts', () => {
// Start every test from a clean slate: no recorded mock calls, an empty
// dedupe map, and no alert log configured.
beforeEach(() => {
vi.clearAllMocks();
clearDashboardWarningDedupe();
delete process.env.HERMES_DASHBOARD_ALERT_LOG;
});
// Without HERMES_DASHBOARD_ALERT_LOG the writer must report `false` and
// must not touch the filesystem.
it('does nothing when the alert log is not configured', async () => {
const wrote = await appendDashboardWarning({ severity: 'warn', instance: 'vijay', message: 'gateway down' });
expect(wrote).toBe(false);
expect(appendFileMock).not.toHaveBeenCalled();
});
// Pins the exact line format: ISO timestamp, upper-case severity token,
// `instance=<id>`, the message, and a trailing newline.
it('writes a routed warning line when configured', async () => {
process.env.HERMES_DASHBOARD_ALERT_LOG = '/tmp/hermes-dashboard-warnings.log';
const wrote = await appendDashboardWarning(
{ severity: 'critical', instance: 'bheem', message: 'backup missing' },
Date.parse('2026-05-31T07:00:00Z'),
);
expect(wrote).toBe(true);
expect(appendFileMock).toHaveBeenCalledWith(
'/tmp/hermes-dashboard-warnings.log',
'2026-05-31T07:00:00.000Z CRITICAL instance=bheem backup missing\n',
'utf8',
);
});
// The same alert one second later is suppressed; 3 601 000 ms after the
// first write (just past the one-hour window) it is written again.
it('deduplicates for one hour and writes again after expiry', async () => {
process.env.HERMES_DASHBOARD_ALERT_LOG = '/tmp/hermes-dashboard-warnings.log';
const input = { severity: 'warn' as const, instance: 'all' as const, message: 'shared warning' };
expect(await appendDashboardWarning(input, 1_000)).toBe(true);
expect(await appendDashboardWarning(input, 2_000)).toBe(false);
expect(await appendDashboardWarning(input, 3_602_000)).toBe(true);
expect(appendFileMock).toHaveBeenCalledTimes(2);
});
});

View File

@ -0,0 +1,48 @@
import { appendFile } from 'fs/promises';
/** Severity levels an alert can carry. */
type AlertSeverity = 'info' | 'warn' | 'critical';
/** Instance label written into the `instance=` field of the log line. */
type AlertInstance = 'vijay' | 'bheem' | 'all';
/** One alert to be appended to the dashboard warning log. */
interface DashboardWarningInput {
severity: AlertSeverity;
instance: AlertInstance;
message: string;
}
/** How long an identical alert is suppressed after being written: one hour, in milliseconds. */
const DEDUPE_WINDOW_MS = 60 * 60 * 1000;
/** Dedupe state: alert key -> epoch milliseconds at which that alert was last recorded. */
const recent = new Map<string, number>();
/**
 * Translates an alert severity into the upper-case token used in the log line.
 *
 * @param severity - Severity of the alert.
 * @returns `'CRITICAL'`, `'WARNING'`, or `'INFO'` for anything else.
 */
function severityToken(severity: AlertSeverity): string {
  switch (severity) {
    case 'critical':
      return 'CRITICAL';
    case 'warn':
      return 'WARNING';
    default:
      return 'INFO';
  }
}
/**
 * Builds the dedupe key for an alert: severity, instance and message
 * separated by NUL characters.
 *
 * @param input - Alert to derive the key from.
 * @returns The composite key string.
 */
function alertKey(input: DashboardWarningInput): string {
  const { severity, instance, message } = input;
  return [severity, instance, message].join('\0');
}
/**
 * Drops every dedupe entry that was recorded more than `DEDUPE_WINDOW_MS`
 * before `now`. Entries exactly at the window boundary are kept.
 *
 * @param now - Reference time in epoch milliseconds.
 */
function purgeExpired(now: number): void {
  recent.forEach((recordedAt, key) => {
    const age = now - recordedAt;
    if (age > DEDUPE_WINDOW_MS) {
      recent.delete(key);
    }
  });
}
/**
 * Appends one alert line to the file named by `HERMES_DASHBOARD_ALERT_LOG`.
 *
 * Line format: `<ISO timestamp> <SEVERITY> instance=<instance> <message>\n`.
 * An alert with the same severity, instance and message is written at most
 * once per `DEDUPE_WINDOW_MS`.
 *
 * Alert logging is best-effort: a failed write is reported on `console.error`
 * and signalled by returning `false` instead of rejecting, so callers that
 * await this while building a snapshot are not failed by an unwritable log.
 * A failed write is not remembered for deduplication, so the next call for
 * the same alert tries again.
 *
 * @param input - Alert to record.
 * @param now - Epoch milliseconds used for the line timestamp and the dedupe
 *   window; defaults to `Date.now()`.
 * @returns `true` when a line was appended; `false` when the log is not
 *   configured, the alert is a duplicate inside the window, or the write failed.
 */
export async function appendDashboardWarning(input: DashboardWarningInput, now = Date.now()): Promise<boolean> {
  const logPath = process.env.HERMES_DASHBOARD_ALERT_LOG;
  if (!logPath) return false;

  purgeExpired(now);
  const key = alertKey(input);
  const previous = recent.get(key);
  // Compare against `undefined` rather than truthiness: a recorded timestamp
  // of 0 is a valid entry and must still suppress duplicates.
  if (previous !== undefined && now - previous <= DEDUPE_WINDOW_MS) return false;

  // Reserve the key before awaiting so concurrent calls for the same alert
  // (callers fan out with Promise.all) produce a single line.
  recent.set(key, now);
  const line = `${new Date(now).toISOString()} ${severityToken(input.severity)} instance=${input.instance} ${input.message}\n`;
  try {
    await appendFile(logPath, line, 'utf8');
  } catch (err) {
    // Undo the reservation (only if it is still ours) so the alert is not
    // suppressed for a whole window without ever having been written.
    if (recent.get(key) === now) recent.delete(key);
    console.error(`failed to append dashboard warning to ${logPath}:`, err);
    return false;
  }
  return true;
}
/**
 * Empties the in-memory dedupe map so the next alert of any kind is written
 * again. The unit tests call this before every case.
 */
export function clearDashboardWarningDedupe(): void {
recent.clear();
}

View File

@ -146,6 +146,49 @@ describe('hermes-ops repository', () => {
expect(bheem.gateway.status).toBe('up');
});
it('prefers a sanitized per-instance ops export when one is present', async () => {
setExec(healthyHandler());
readFileMock.mockImplementation(async (p: string) => {
if (p === '/home/uma/.hermes/ops-export.json') {
return JSON.stringify({
gateway: { active: false, enabled: true, status: 'down' },
dashboard: { active: false, status: 'down' },
backupTimer: {
name: 'uma-hermes-backup.timer',
active: false,
status: 'down',
nextRun: null,
lastRun: null,
},
repo: {
path: '/home/uma/repos/uma_hostinger_hermes_vm',
branch: 'main',
clean: true,
head: 'export1',
lastCommitAt: '2026-05-31T00:00:00Z',
size: '1M',
status: 'up',
},
restoredFileCount: 42,
restoredCronJobs: 3,
googleWorkspaceToken: true,
});
}
if (p.endsWith('MANIFEST.json')) return JSON.stringify({ files: [1, 2, 3] });
if (p.endsWith('jobs.json')) return JSON.stringify({ jobs: [{ id: 'a' }, { id: 'b' }] });
throw new Error('no such file');
});
const snapshot = await getHermesOpsSnapshot({ force: true });
const bheem = snapshot.instances.find((i) => i.id === 'bheem')!;
expect(bheem.gateway.status).toBe('down');
expect(bheem.dashboard.status).toBe('down');
expect(bheem.backup.repo.head).toBe('export1');
expect(bheem.backup.restoredFileCount).toBe(42);
expect(bheem.backup.restoredCronJobs).toBe(3);
expect(bheem.google.workspaceToken).toBe(true);
});
it('reports unknown repo status when git cannot be read', async () => {
setExec((command, args) => {
if (command === 'git') return enoentError();

View File

@ -2,6 +2,7 @@ import { execFile } from 'child_process';
import { promisify } from 'util';
import { readFile, stat } from 'fs/promises';
import { existsSync } from 'fs';
import { appendDashboardWarning } from '../../lib/dashboard-alerts.js';
import type {
HermesOpsCronJob,
HermesOpsInstance,
@ -31,6 +32,7 @@ const instances = [
dashboardPort: 9119,
backupTimer: 'hermes-root-backup.timer',
repoPath: '/root/repos/bytelyst_hostinger_hermes_vm',
opsExportPath: '/root/.hermes/ops-export.json',
driveFolder: 'Vijay Drive',
},
{
@ -43,10 +45,21 @@ const instances = [
dashboardPort: 9120,
backupTimer: 'uma-hermes-backup.timer',
repoPath: '/home/uma/repos/uma_hostinger_hermes_vm',
opsExportPath: '/home/uma/.hermes/ops-export.json',
driveFolder: 'Bheem Drive',
},
];
/**
 * Shape of the optional per-instance ops export JSON (`opsExportPath`).
 * Every field is optional; `buildSnapshot` uses a field when it is present
 * and falls back to the live probe result otherwise.
 */
interface OpsExport {
gateway?: { active?: boolean; enabled?: boolean; status?: ProbeStatus };
dashboard?: { active?: boolean; status?: ProbeStatus };
backupTimer?: HermesOpsTimer;
repo?: HermesOpsRepo;
// `null` is accepted as an explicit value and is passed through as-is.
restoredFileCount?: number | null;
restoredCronJobs?: number | null;
googleWorkspaceToken?: boolean;
}
interface ExecResult {
// Trimmed stdout. Present even when the command exited non-zero (e.g.
// `systemctl is-active` prints "inactive" and exits 3).
@ -223,6 +236,15 @@ async function tokenExists(path: string): Promise<boolean> {
}
}
/**
 * Loads and parses the ops export JSON at `path`.
 *
 * @param path - Absolute path of the export file.
 * @returns The parsed value when it is a non-null object; `null` when the
 *   file cannot be read, is not valid JSON, or holds a non-object value.
 */
async function readOpsExport(path: string): Promise<OpsExport | null> {
  try {
    const text = await readFile(path, 'utf8');
    const candidate: unknown = JSON.parse(text);
    if (typeof candidate !== 'object' || candidate === null) {
      return null;
    }
    return candidate as OpsExport;
  } catch {
    // Missing or malformed export: the caller falls back to live probes.
    return null;
  }
}
async function getTailscaleIp(): Promise<string | null> {
const result = await exec('tailscale', ['ip', '-4']);
if (!result.ran) return null;
@ -246,11 +268,12 @@ async function buildSnapshot(): Promise<HermesOpsSnapshot> {
const results: HermesOpsInstance[] = [];
for (const item of instances) {
const opsExport = await readOpsExport(item.opsExportPath);
const gatewayActiveCheck =
item.gatewayKind === 'uma-user' ? probeUmaGatewayActive() : probeSystemActive(item.gatewayService);
const gatewayEnabledCheck =
item.gatewayKind === 'uma-user' ? probeUmaGatewayEnabled() : probeSystemEnabled(item.gatewayService);
const [gateway, gatewayEnabled, dashboard, backupTimer, repo, stats, googleToken] = await Promise.all([
const [probedGateway, probedGatewayEnabled, probedDashboard, probedBackupTimer, probedRepo, probedStats, probedGoogleToken] = await Promise.all([
gatewayActiveCheck,
gatewayEnabledCheck,
probeSystemActive(item.dashboardService),
@ -259,6 +282,22 @@ async function buildSnapshot(): Promise<HermesOpsSnapshot> {
manifestStats(`${item.repoPath}/hermes_persistent_backup`),
tokenExists(`${item.hermesHome}/google_token.json`),
]);
const gateway = opsExport?.gateway?.status ? {
active: Boolean(opsExport.gateway.active),
status: opsExport.gateway.status,
} : probedGateway;
const gatewayEnabled = typeof opsExport?.gateway?.enabled === 'boolean' ? opsExport.gateway.enabled : probedGatewayEnabled;
const dashboard = opsExport?.dashboard?.status ? {
active: Boolean(opsExport.dashboard.active),
status: opsExport.dashboard.status,
} : probedDashboard;
const backupTimer = opsExport?.backupTimer ?? probedBackupTimer;
const repo = opsExport?.repo ?? probedRepo;
const stats = {
files: typeof opsExport?.restoredFileCount === 'number' || opsExport?.restoredFileCount === null ? opsExport.restoredFileCount : probedStats.files,
cronJobs: typeof opsExport?.restoredCronJobs === 'number' || opsExport?.restoredCronJobs === null ? opsExport.restoredCronJobs : probedStats.cronJobs,
};
const googleToken = typeof opsExport?.googleWorkspaceToken === 'boolean' ? opsExport.googleWorkspaceToken : probedGoogleToken;
const dashboardUrl = tailscaleIp ? `http://${tailscaleIp}:${item.dashboardPort}/` : `:${item.dashboardPort}`;
@ -316,6 +355,16 @@ async function buildSnapshot(): Promise<HermesOpsSnapshot> {
warnings.push('Emergency Drive OAuth token is missing');
}
await Promise.all(warnings.map((message) => {
const lower = message.toLowerCase();
const instance = lower.includes('bheem') || lower.includes('uma')
? 'bheem'
: lower.includes('vijay') || lower.includes('root')
? 'vijay'
: 'all';
return appendDashboardWarning({ severity: 'warn', instance, message });
}));
const cronJobs: HermesOpsCronJob[] = [
{
name: emergencyDriveUpload.name,

View File

@ -7,7 +7,8 @@ vi.mock('child_process', () => ({ execFile: execFileMock }));
const readFileMock = vi.hoisted(() => vi.fn());
const statMock = vi.hoisted(() => vi.fn());
vi.mock('fs/promises', () => ({ readFile: readFileMock, stat: statMock }));
const readdirMock = vi.hoisted(() => vi.fn());
vi.mock('fs/promises', () => ({ readFile: readFileMock, readdir: readdirMock, stat: statMock }));
type Handler = (command: string, args: string[]) => { error?: NodeJS.ErrnoException; stdout?: string };
@ -42,6 +43,7 @@ describe('hermes-telemetry repository', () => {
});
statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
readFileMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
readdirMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
const snapshot = await getHermesTelemetrySnapshot('vijay');
// The whole shape must validate even when nothing was readable — that's
@ -84,6 +86,7 @@ describe('hermes-telemetry repository', () => {
return { stdout: '' };
});
statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
readdirMock.mockResolvedValue([]);
const snapshot = await getHermesTelemetrySnapshot('vijay');
expect(snapshot.sessions).toEqual({ totalSessions: 59, totalMessages: 5225, status: 'up' });
@ -102,6 +105,7 @@ describe('hermes-telemetry repository', () => {
return { error: err };
});
statMock.mockResolvedValue({} as never);
readdirMock.mockResolvedValue([]);
readFileMock.mockResolvedValue([
'2026-01-01T12:34:56 WARNING gateway is degraded',
'2026-01-01T12:35:01 CRITICAL backup repo HEAD missing',
@ -129,6 +133,7 @@ describe('hermes-telemetry repository', () => {
return { error: err };
});
statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
readdirMock.mockResolvedValue([]);
const snapshot = await getHermesTelemetrySnapshot('vijay');
expect(snapshot.backupHistory.status).toBe('up');
@ -144,6 +149,7 @@ describe('hermes-telemetry repository', () => {
return { error: err };
});
statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
readdirMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
const a = await getHermesTelemetrySnapshot('vijay');
const callsAfterFirst = calls;
@ -159,10 +165,37 @@ describe('hermes-telemetry repository', () => {
return { error: err };
});
statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
readdirMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
const v = await getHermesTelemetrySnapshot('vijay');
const b = await getHermesTelemetrySnapshot('bheem');
expect(v.instanceId).toBe('vijay');
expect(b.instanceId).toBe('bheem');
});
it('parses sanitized Hermes session JSONL events without exposing raw message content', async () => {
setExec(() => {
const err = Object.assign(new Error('ENOENT'), { code: 'ENOENT' as const });
return { error: err };
});
statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
readdirMock.mockResolvedValue(['20260101_session.jsonl']);
readFileMock.mockImplementation(async (path: string) => {
if (path.endsWith('.jsonl')) {
return [
JSON.stringify({ role: 'user', content: 'secret prompt', timestamp: '2026-01-01T00:00:00Z' }),
JSON.stringify({ role: 'assistant', finish_reason: 'tool_calls', tool_calls: [{ function: { name: 'exec_command' } }], timestamp: '2026-01-01T00:01:00Z' }),
].join('\n');
}
throw Object.assign(new Error('ENOENT'), { code: 'ENOENT' });
});
const snapshot = await getHermesTelemetrySnapshot('vijay');
expect(snapshot.sessionEvents.status).toBe('up');
expect(snapshot.sessionEvents.sourceCount).toBe(1);
expect(snapshot.sessionEvents.entries).toHaveLength(2);
expect(snapshot.sessionEvents.entries[0].summary).toBe('assistant tool call: exec_command');
expect(snapshot.sessionEvents.entries[1].summary).toBe('user message (content redacted)');
expect(JSON.stringify(snapshot.sessionEvents.entries)).not.toContain('secret prompt');
});
});

View File

@ -1,6 +1,8 @@
import { execFile } from 'child_process';
import { promisify } from 'util';
import { readFile, stat } from 'fs/promises';
import { readdir, readFile, stat } from 'fs/promises';
import { basename, join } from 'path';
import { appendDashboardWarning } from '../../lib/dashboard-alerts.js';
import { childLogger } from '../../lib/logger.js';
import type {
HermesBackupHistory,
@ -8,6 +10,10 @@ import type {
HermesCronEntry,
HermesCronList,
HermesInstanceId,
HermesSessionEntry,
HermesSessionEvent,
HermesSessionEventList,
HermesSessionList,
HermesMemoryList,
HermesSessionStats,
HermesSkillList,
@ -29,6 +35,8 @@ interface InstanceConfig {
user: string | null; // null → run as the backend's own user (root in prod)
repoPath: string;
watchdogLog: string;
sessionsIndex: string;
sessionsDir: string;
}
const INSTANCES: Record<HermesInstanceId, InstanceConfig> = {
@ -37,12 +45,16 @@ const INSTANCES: Record<HermesInstanceId, InstanceConfig> = {
user: null,
repoPath: '/root/repos/bytelyst_hostinger_hermes_vm',
watchdogLog: '/root/.hermes/logs/hermes-health-watchdog.log',
sessionsIndex: '/root/.hermes/sessions/sessions.json',
sessionsDir: '/root/.hermes/sessions',
},
bheem: {
id: 'bheem',
user: 'uma',
repoPath: '/home/uma/repos/uma_hostinger_hermes_vm',
watchdogLog: '/home/uma/.hermes/logs/hermes-health-watchdog.log',
sessionsIndex: '/home/uma/.hermes/sessions/sessions.json',
sessionsDir: '/home/uma/.hermes/sessions',
},
};
@ -103,6 +115,142 @@ async function readSessionStats(inst: InstanceConfig): Promise<HermesSessionStat
}
}
/**
 * Reads the session index (`inst.sessionsIndex`) and returns up to 50
 * sessions, most recently updated first.
 *
 * Rows that are not plain objects are skipped instead of failing the whole
 * list, and rows without an id or session key are dropped. Timestamps that
 * are missing or cannot be parsed sort as the epoch (i.e. last).
 *
 * @param inst - Instance whose session index should be read.
 * @returns The entries with status `'up'`, or an empty list with status
 *   `'unknown'` when the index cannot be read or is not a JSON object/array.
 */
async function readSessionList(inst: InstanceConfig): Promise<HermesSessionList> {
  // `new Date(<unparseable>)` yields NaN; a comparator that returns NaN is
  // inconsistent and leaves the sort order implementation-defined, so map
  // unparseable timestamps to 0.
  const sortTime = (entry: HermesSessionEntry): number => {
    const ms = new Date(entry.updatedAt ?? entry.createdAt ?? 0).getTime();
    return Number.isNaN(ms) ? 0 : ms;
  };
  const textOrNull = (value: unknown): string | null => (value ? String(value) : null);
  const numberOrNull = (value: unknown): number | null => (typeof value === 'number' ? value : null);

  try {
    const parsed: unknown = JSON.parse(await readFile(inst.sessionsIndex, 'utf8'));
    if (typeof parsed !== 'object' || parsed === null) {
      return { entries: [], status: 'unknown' };
    }
    const entries: HermesSessionEntry[] = Object.values(parsed)
      // A stray null/primitive row must not turn the whole index unreadable.
      .filter(isRecord)
      .map((row) => ({
        id: String(row.session_id ?? row.id ?? row.session_key ?? ''),
        sessionKey: String(row.session_key ?? ''),
        platform: textOrNull(row.platform),
        chatType: textOrNull(row.chat_type),
        displayName: textOrNull(row.display_name),
        createdAt: textOrNull(row.created_at),
        updatedAt: textOrNull(row.updated_at),
        suspended: Boolean(row.suspended ?? false),
        resumePending: Boolean(row.resume_pending ?? false),
        totalTokens: numberOrNull(row.total_tokens),
        estimatedCostUsd: numberOrNull(row.estimated_cost_usd),
      }))
      .filter((entry) => entry.id || entry.sessionKey)
      .sort((a, b) => sortTime(b) - sortTime(a))
      .slice(0, 50);
    return { entries, status: 'up' };
  } catch {
    return { entries: [], status: 'unknown' };
  }
}
/**
 * Type guard for plain keyed objects.
 *
 * @param value - Any value.
 * @returns `true` for non-null objects that are not arrays.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) return false;
  return typeof value === 'object';
}
/**
 * Collects the distinct tool names referenced by one session row.
 *
 * Looks at every object in `row.tool_calls` and `row.codex_message_items`,
 * taking both `item.name` and `item.function.name`. Names are trimmed,
 * blank names are ignored, first-seen order is kept.
 *
 * @param row - Parsed JSONL row.
 * @returns At most 8 unique tool names.
 */
function extractToolNames(row: Record<string, unknown>): string[] {
  const seen = new Set<string>();
  for (const source of [row.tool_calls, row.codex_message_items]) {
    if (!Array.isArray(source)) continue;
    for (const item of source) {
      if (!isRecord(item)) continue;
      const nestedName = isRecord(item.function) ? item.function.name : undefined;
      for (const candidate of [item.name, nestedName]) {
        if (typeof candidate !== 'string') continue;
        const trimmed = candidate.trim();
        if (trimmed) seen.add(trimmed);
      }
    }
  }
  return [...seen].slice(0, 8);
}
/**
 * Lists the distinct `type` strings found on the objects in
 * `row.codex_message_items`, in first-seen order.
 *
 * @param row - Parsed JSONL row.
 * @returns At most 8 unique item types; empty when the field is not an array.
 */
function extractItemTypes(row: Record<string, unknown>): string[] {
  const items = row.codex_message_items;
  if (!Array.isArray(items)) return [];
  const kinds = items
    .filter(isRecord)
    .map((item) => item.type)
    .filter((kind): kind is string => typeof kind === 'string');
  return [...new Set(kinds)].slice(0, 8);
}
/**
 * Assigns a coarse event type to a session row. The checks are ordered and
 * the first match wins:
 * session metadata, tool call, tool result, reasoning, plain message.
 *
 * @param row - Parsed JSONL row.
 * @param toolNames - Tool names already extracted from the row.
 * @param itemTypes - Item types already extracted from the row.
 * @returns The event type, or `'unknown'` when nothing matches.
 */
function classifySessionEvent(row: Record<string, unknown>, toolNames: string[], itemTypes: string[]): HermesSessionEvent['eventType'] {
  const role = typeof row.role === 'string' ? row.role : '';
  const requestsTools = toolNames.length > 0 || row.finish_reason === 'tool_calls';
  const hasToolItem = itemTypes.some((type) => type.includes('tool'));
  const hasReasoning = itemTypes.includes('reasoning') || Boolean(row.reasoning);
  const looksLikeMessage = ['user', 'assistant'].includes(role) || typeof row.content === 'string';

  if (role === 'session_meta') {
    return 'system';
  }
  if (requestsTools) {
    return 'tool-call';
  }
  if (hasToolItem) {
    return 'tool-result';
  }
  if (hasReasoning) {
    return 'reasoning';
  }
  return looksLikeMessage ? 'message' : 'unknown';
}
/**
 * Produces the one-line, content-free summary shown for a session event.
 * Only the role, the event type and the tool names appear in the text; the
 * row's `content` is never read.
 *
 * @param row - Parsed JSONL row (only `role` is used).
 * @param eventType - Classification of the row.
 * @param toolNames - Tool names extracted from the row.
 * @returns Human-readable summary string.
 */
function summarizeSessionEvent(row: Record<string, unknown>, eventType: HermesSessionEvent['eventType'], toolNames: string[]): string {
  const actor = typeof row.role === 'string' ? row.role : 'unknown';
  switch (eventType) {
    case 'system':
      return 'session metadata recorded';
    case 'tool-call': {
      const plural = toolNames.length === 1 ? '' : 's';
      const suffix = toolNames.length > 0 ? `: ${toolNames.join(', ')}` : '';
      return `${actor} tool call${plural}${suffix}`;
    }
    case 'tool-result':
      return `${actor} tool result recorded`;
    case 'reasoning':
      return `${actor} reasoning item recorded`;
    case 'message':
      return `${actor} message (content redacted)`;
    default:
      return `${actor} event recorded`;
  }
}
/**
 * Converts one line of a session JSONL file into a dashboard event.
 *
 * @param line - Raw line text.
 * @param sessionFile - File name the line came from; used in the event id.
 * @param lineIndex - Line number supplied by the caller; used in the event id.
 * @returns The event, or `null` for blank lines, invalid JSON, or JSON that
 *   is not a plain object.
 */
function parseSessionJsonlLine(line: string, sessionFile: string, lineIndex: number): HermesSessionEvent | null {
  if (line.trim() === '') return null;
  const stringOrNull = (value: unknown): string | null => (typeof value === 'string' ? value : null);
  try {
    const row: unknown = JSON.parse(line);
    if (!isRecord(row)) return null;

    const toolNames = extractToolNames(row);
    const itemTypes = extractItemTypes(row);
    const eventType = classifySessionEvent(row, toolNames, itemTypes);
    const summary = summarizeSessionEvent(row, eventType, toolNames);

    return {
      id: `${sessionFile}:${lineIndex}`,
      sessionFile,
      timestamp: stringOrNull(row.timestamp),
      role: stringOrNull(row.role),
      eventType,
      summary,
      toolNames,
      itemTypes,
      // Prefer an explicit status; otherwise fall back to the finish reason.
      status: stringOrNull(row.status) ?? stringOrNull(row.finish_reason),
    };
  } catch {
    return null;
  }
}
/**
 * Builds the session event feed from the JSONL files in `inst.sessionsDir`.
 *
 * Takes the last 10 `.jsonl` files in file-name order, parses at most the
 * final 200 lines of each with `parseSessionJsonlLine`, and returns the 100
 * events with the newest timestamps. Events whose timestamp is missing or
 * cannot be parsed sort as the epoch (i.e. last).
 *
 * @param inst - Instance whose session directory should be read.
 * @returns The events with status `'up'` (also when no JSONL file exists), or
 *   an empty list with status `'unknown'` when the directory or a file cannot
 *   be read; that failure is logged as a warning.
 */
async function readSessionEvents(inst: InstanceConfig): Promise<HermesSessionEventList> {
  // `new Date(<unparseable>)` yields NaN; a comparator that returns NaN is
  // inconsistent and leaves the sort order implementation-defined, so map
  // unparseable timestamps to 0.
  const eventTime = (event: HermesSessionEvent): number => {
    const ms = new Date(event.timestamp ?? 0).getTime();
    return Number.isNaN(ms) ? 0 : ms;
  };

  try {
    const files = (await readdir(inst.sessionsDir))
      .filter((name) => name.endsWith('.jsonl'))
      .sort()
      .slice(-10);
    if (files.length === 0) return { entries: [], status: 'up', sourceCount: 0 };

    const entries: HermesSessionEvent[] = [];
    // Files are read one at a time so only one file's content is held in
    // memory at once.
    for (const file of files) {
      const sessionFile = basename(file);
      const content = await readFile(join(inst.sessionsDir, file), 'utf8');
      const lines = content.split('\n');
      const start = Math.max(0, lines.length - 200);
      for (let index = start; index < lines.length; index += 1) {
        // Ids use 1-based line numbers.
        const event = parseSessionJsonlLine(lines[index], sessionFile, index + 1);
        if (event) entries.push(event);
      }
    }

    entries.sort((a, b) => eventTime(b) - eventTime(a));
    return { entries: entries.slice(0, 100), status: 'up', sourceCount: files.length };
  } catch (err) {
    log.warn({ err, instance: inst.id, source: inst.sessionsDir }, 'failed to read Hermes session events');
    return { entries: [], status: 'unknown', sourceCount: 0 };
  }
}
// --- Cron -------------------------------------------------------------------
//
// `hermes cron list --json` is the canonical source. It's distinct from
@ -248,8 +396,10 @@ const inflight = new Map<HermesInstanceId, Promise<HermesTelemetrySnapshot>>();
async function buildSnapshot(instanceId: HermesInstanceId): Promise<HermesTelemetrySnapshot> {
const inst = INSTANCES[instanceId];
const [sessions, cron, memory, skills, watchdog, backupHistory] = await Promise.all([
const [sessions, sessionList, sessionEvents, cron, memory, skills, watchdog, backupHistory] = await Promise.all([
readSessionStats(inst),
readSessionList(inst),
readSessionEvents(inst),
readCron(inst),
readMemory(inst),
readSkills(inst),
@ -259,17 +409,28 @@ async function buildSnapshot(instanceId: HermesInstanceId): Promise<HermesTeleme
const warnings: string[] = [];
if (sessions.status === 'unknown') warnings.push(`${instanceId}: hermes sessions stats unavailable (CLI missing or non-zero exit)`);
if (sessionList.status === 'unknown') warnings.push(`${instanceId}: Hermes session index not readable`);
if (sessionEvents.status === 'unknown') warnings.push(`${instanceId}: Hermes session event JSONL not readable at ${inst.sessionsDir}`);
if (cron.status === 'unknown') warnings.push(`${instanceId}: hermes cron list unavailable`);
if (memory.status === 'unknown') warnings.push(`${instanceId}: hermes memory list unavailable`);
if (skills.status === 'unknown') warnings.push(`${instanceId}: hermes skills list unavailable`);
if (watchdog.status === 'unknown') warnings.push(`${instanceId}: watchdog log not readable at ${watchdog.source ?? 'unknown path'}`);
if (backupHistory.status === 'unknown') warnings.push(`${instanceId}: backup repo not readable at ${backupHistory.repoPath ?? 'unknown path'}`);
await Promise.all([
...warnings.map((message) => appendDashboardWarning({ severity: 'warn', instance: instanceId, message })),
...watchdog.alerts
.filter((alert) => alert.severity === 'critical')
.map((alert) => appendDashboardWarning({ severity: 'critical', instance: instanceId, message: alert.message })),
]);
return {
generatedAt: new Date().toISOString(),
cached: false,
instanceId,
sessions,
sessionList,
sessionEvents,
cron,
memory,
skills,

View File

@ -18,6 +18,47 @@ export const HermesSessionStatsSchema = z.object({
});
export type HermesSessionStats = z.infer<typeof HermesSessionStatsSchema>;
/**
 * One session row as exposed in the telemetry snapshot's `sessionList`.
 * Identity fields are always strings; descriptive fields, timestamps and
 * usage numbers are `null` when the value is not available.
 */
export const HermesSessionEntrySchema = z.object({
id: z.string(),
sessionKey: z.string(),
platform: z.string().nullable(),
chatType: z.string().nullable(),
displayName: z.string().nullable(),
// Timestamps are carried as strings; no date format is enforced here.
createdAt: z.string().nullable(),
updatedAt: z.string().nullable(),
suspended: z.boolean(),
resumePending: z.boolean(),
totalTokens: z.number().nullable(),
estimatedCostUsd: z.number().nullable(),
});
/** Static type inferred from {@link HermesSessionEntrySchema}. */
export type HermesSessionEntry = z.infer<typeof HermesSessionEntrySchema>;
/** Session rows plus the probe status of the source they were read from. */
export const HermesSessionListSchema = z.object({
entries: z.array(HermesSessionEntrySchema),
status: ProbeStatusSchema,
});
/** Static type inferred from {@link HermesSessionListSchema}. */
export type HermesSessionList = z.infer<typeof HermesSessionListSchema>;
/**
 * One event in the telemetry snapshot's `sessionEvents` feed. The schema has
 * no field for raw message content; an event is described by its `summary`,
 * `toolNames` and `itemTypes`.
 */
export const HermesSessionEventSchema = z.object({
id: z.string(),
sessionFile: z.string(),
timestamp: z.string().nullable(),
role: z.string().nullable(),
eventType: z.enum(['message', 'tool-call', 'tool-result', 'reasoning', 'system', 'unknown']),
summary: z.string(),
toolNames: z.array(z.string()),
itemTypes: z.array(z.string()),
status: z.string().nullable(),
});
/** Static type inferred from {@link HermesSessionEventSchema}. */
export type HermesSessionEvent = z.infer<typeof HermesSessionEventSchema>;
/**
 * Session events plus the probe status of their source; `sourceCount` is a
 * number reported alongside the entries.
 */
export const HermesSessionEventListSchema = z.object({
entries: z.array(HermesSessionEventSchema),
status: ProbeStatusSchema,
sourceCount: z.number(),
});
/** Static type inferred from {@link HermesSessionEventListSchema}. */
export type HermesSessionEventList = z.infer<typeof HermesSessionEventListSchema>;
export const HermesCronEntrySchema = z.object({
id: z.string(),
name: z.string(),
@ -106,6 +147,8 @@ export const HermesTelemetrySnapshotSchema = z.object({
cached: z.boolean(),
instanceId: HermesInstanceIdSchema,
sessions: HermesSessionStatsSchema,
sessionList: HermesSessionListSchema,
sessionEvents: HermesSessionEventListSchema,
cron: HermesCronListSchema,
memory: HermesMemoryListSchema,
skills: HermesSkillListSchema,

View File

@ -25,6 +25,7 @@ services:
environment:
- VM_SCRIPTS_PATH=/vm-scripts/VMs/HostingerVM
- VM_LOG_DIR=/host-logs
- HERMES_DASHBOARD_ALERT_LOG=/var/log/hermes-dashboard-warnings.log
ports:
- '127.0.0.1:4004:4004'
networks:
@ -37,6 +38,7 @@ services:
- /var/log/vm-cleanup.log:/host-logs/vm-cleanup.log
- /var/log/vm-health-check.log:/host-logs/vm-health-check.log
- /var/log/docker-watchdog.log:/host-logs/docker-watchdog.log
- /var/log/hermes-dashboard-warnings.log:/var/log/hermes-dashboard-warnings.log
# Docker socket — allows running docker commands against the host daemon
# (same pattern as Portainer/cAdvisor; container already runs as root)
- /var/run/docker.sock:/var/run/docker.sock

View File

@ -40,6 +40,87 @@ const hermesOpsSnapshot = {
warnings: [],
};
/**
 * Builds the canned telemetry snapshot served by the mocked
 * `/api/hermes/telemetry/<instance>` routes. Ids, names and paths are derived
 * from `instanceId`; session totals and the display name differ per instance.
 */
const hermesTelemetrySnapshot = (instanceId: 'vijay' | 'bheem') => {
  const isVijay = instanceId === 'vijay';

  const sessionEntry = {
    id: `${instanceId}-session-1`,
    sessionKey: `agent:main:telegram:dm:${instanceId}`,
    platform: 'telegram',
    chatType: 'dm',
    displayName: isVijay ? 'S' : 'Uma',
    createdAt: '2026-01-01T00:00:00.000Z',
    updatedAt: '2026-01-01T00:06:00.000Z',
    suspended: false,
    resumePending: false,
    totalTokens: 100,
    estimatedCostUsd: 0,
  };

  const sessionEvent = {
    id: `${instanceId}-events.jsonl:3`,
    sessionFile: `${instanceId}-events.jsonl`,
    timestamp: '2026-01-01T00:06:00.000Z',
    role: 'assistant',
    eventType: 'tool-call',
    summary: 'assistant tool call: exec_command',
    toolNames: ['exec_command'],
    itemTypes: [],
    status: 'tool_calls',
  };

  const cronEntry = {
    id: `${instanceId}-digest`,
    name: `${instanceId} digest`,
    schedule: '0 * * * *',
    lastRun: '2026-01-01T00:00:00.000Z',
    nextRun: '2026-01-01T01:00:00.000Z',
    lastStatus: 'ok',
    active: true,
  };

  const watchdogAlert = {
    timestamp: '2026-01-01T00:05:00.000Z',
    severity: 'info',
    message: `${instanceId} watchdog healthy`,
  };

  const backupEntry = {
    sha: `${instanceId}123456`,
    committedAt: '2026-01-01T00:03:00.000Z',
    subject: `${instanceId} backup`,
  };

  return {
    generatedAt: '2026-01-01T00:00:00.000Z',
    cached: false,
    instanceId,
    sessions: {
      totalSessions: isVijay ? 12 : 7,
      totalMessages: isVijay ? 480 : 210,
      status: 'up',
    },
    sessionList: { status: 'up', entries: [sessionEntry] },
    sessionEvents: { status: 'up', sourceCount: 1, entries: [sessionEvent] },
    cron: { status: 'up', entries: [cronEntry] },
    memory: { status: 'up', items: [] },
    skills: { status: 'up', items: [] },
    watchdog: {
      source: `/tmp/${instanceId}-watchdog.log`,
      status: 'up',
      alerts: [watchdogAlert],
    },
    backupHistory: {
      repoPath: `/tmp/${instanceId}-repo`,
      status: 'up',
      entries: [backupEntry],
    },
    warnings: [],
  };
};
test.describe('Hermes Mission Control', () => {
test.beforeEach(async ({ page }) => {
await page.addInitScript(() => {
@ -59,6 +140,22 @@ test.describe('Hermes Mission Control', () => {
});
});
await page.route('**/api/hermes/telemetry/vijay', async (route) => {
await route.fulfill({
status: 200,
contentType: 'application/json',
body: JSON.stringify(hermesTelemetrySnapshot('vijay')),
});
});
await page.route('**/api/hermes/telemetry/bheem', async (route) => {
await route.fulfill({
status: 200,
contentType: 'application/json',
body: JSON.stringify(hermesTelemetrySnapshot('bheem')),
});
});
// /hermes/products fetches the real service registry + health module
// (Phase 3 slice 2). Backend isn't running in CI, so we satisfy those
// routes the same way the dashboard spec does.
@ -82,11 +179,11 @@ test.describe('Hermes Mission Control', () => {
await page.getByRole('link', { name: 'Task Ledger' }).click();
await expect(page.getByRole('heading', { name: 'Task Ledger' })).toBeVisible();
await expect(page.getByText('Task table')).toBeVisible();
await expect(page.getByRole('heading', { name: 'Task table' })).toBeVisible();
await page.goto('/hermes/tasks/task-1');
await expect(page.getByRole('heading', { name: 'Hermes learning' })).toBeVisible();
await expect(page.getByText('Timeline')).toBeVisible();
await expect(page.getByRole('heading', { name: 'Timeline', exact: true })).toBeVisible();
await page.goto('/hermes/products');
await expect(page.getByRole('heading', { name: 'Product Portfolio' })).toBeVisible();
@ -111,7 +208,7 @@ test.describe('Hermes Mission Control', () => {
await page.goto('/hermes/tasks/task-1');
await expect(page.getByRole('heading', { name: 'Hermes learning' })).toBeVisible();
await expect(page.getByRole('heading', { name: 'Timeline' })).toBeVisible();
await expect(page.getByRole('heading', { name: 'Timeline', exact: true })).toBeVisible();
});
test('exposes a global instance switcher with All / Vijay / Bheem', async ({ page }) => {

View File

@ -7,8 +7,12 @@ import { useEffect, useMemo, useState } from 'react';
import { HermesShell, MetricCard, SectionCard } from '@/components/hermes-shell';
import { HermesInstanceBadge } from '@/components/hermes-instance-switcher';
import { useHermesInstance } from '@/lib/hermes-instance-context';
import { getHermesAgents, HERMES_INSTANCES, type HermesInstanceId } from '@/lib/hermes';
import { api, type HermesTelemetrySnapshot } from '@/lib/api';
import { getHermesAgents, HERMES_INSTANCES } from '@/lib/hermes';
import {
emptyTelemetryState,
loadAllHermesTelemetry,
type HermesTelemetryState,
} from '@/lib/hermes-telemetry-client';
export default function HermesAgentsPage() {
const { selectedInstance } = useHermesInstance();
@ -21,19 +25,16 @@ export default function HermesAgentsPage() {
// endpoint. The agent statuses above remain seed-data (status observability
// needs a separate ingestion contract); the inventory below is genuine
// when the `hermes` CLI is reachable, status:'unknown' otherwise.
const [telemetry, setTelemetry] = useState<Record<HermesInstanceId, HermesTelemetrySnapshot | null>>({ vijay: null, bheem: null });
const [telemetry, setTelemetry] = useState<HermesTelemetryState>(emptyTelemetryState);
const [telemetryError, setTelemetryError] = useState<string | null>(null);
useEffect(() => {
const controller = new AbortController();
const load = async () => {
try {
const [vijay, bheem] = await Promise.all([
api.getHermesTelemetry('vijay'),
api.getHermesTelemetry('bheem'),
]);
const next = await loadAllHermesTelemetry();
if (controller.signal.aborted) return;
setTelemetry({ vijay, bheem });
setTelemetry(next);
setTelemetryError(null);
} catch (err) {
if (controller.signal.aborted) return;

View File

@ -1,15 +1,28 @@
'use client';
import Link from 'next/link';
import { ArrowLeft, Clock3, Flame, TrendingDown, TrendingUp } from 'lucide-react';
import { ArrowLeft, Clock3, Flame, History, TrendingDown, TrendingUp } from 'lucide-react';
import { Badge, Button } from '@/components/ui/Primitives';
import { useMemo } from 'react';
import { useEffect, useMemo, useState } from 'react';
import { HermesShell, MetricCard, SectionCard } from '@/components/hermes-shell';
import { HermesInstanceBadge } from '@/components/hermes-instance-switcher';
import { useHermesInstance } from '@/lib/hermes-instance-context';
import { getHermesHistory, hermesTasks } from '@/lib/hermes';
import {
collectBackupEntries,
collectCronEntries,
collectSessionEvents,
collectSessionEntries,
collectWatchdogAlerts,
emptyTelemetryState,
loadAllHermesTelemetry,
type HermesTelemetryState,
} from '@/lib/hermes-telemetry-client';
export default function HermesHistoryPage() {
const { selectedInstance } = useHermesInstance();
const [telemetry, setTelemetry] = useState<HermesTelemetryState>(emptyTelemetryState);
const [telemetryError, setTelemetryError] = useState<string | null>(null);
const history = useMemo(() => getHermesHistory(selectedInstance), [selectedInstance]);
const filteredTasks = useMemo(
() => (selectedInstance === 'all' ? hermesTasks : hermesTasks.filter((task) => task.instanceId === selectedInstance)),
@ -26,6 +39,30 @@ export default function HermesHistoryPage() {
tasksWithDuration.reduce((sum, task) => sum + (task.durationMs ?? 0), 0) /
Math.max(1, tasksWithDuration.length) / 60000,
);
const liveAlerts = useMemo(() => collectWatchdogAlerts(telemetry, selectedInstance).slice(0, 10), [telemetry, selectedInstance]);
const liveBackups = useMemo(() => collectBackupEntries(telemetry, selectedInstance).slice(0, 10), [telemetry, selectedInstance]);
const liveCron = useMemo(() => collectCronEntries(telemetry, selectedInstance).slice(0, 10), [telemetry, selectedInstance]);
const liveSessions = useMemo(() => collectSessionEntries(telemetry, selectedInstance).slice(0, 10), [telemetry, selectedInstance]);
const liveEvents = useMemo(() => collectSessionEvents(telemetry, selectedInstance).slice(0, 10), [telemetry, selectedInstance]);
useEffect(() => {
let active = true;
const load = async () => {
try {
const next = await loadAllHermesTelemetry();
if (!active) return;
setTelemetry(next);
setTelemetryError(null);
} catch (err) {
if (!active) return;
setTelemetryError(err instanceof Error ? err.message : String(err));
}
};
void load();
return () => {
active = false;
};
}, []);
const failureReasons = [
['CI failures', 9],
@ -48,6 +85,86 @@ export default function HermesHistoryPage() {
<MetricCard label="Avg task duration" value={`${avgDuration}m`} tone="info" icon={<Clock3 className="h-5 w-5" />} />
</section>
<section className="grid gap-4 md:grid-cols-2 xl:grid-cols-4">
<MetricCard label="Live events" value={liveEvents.length} tone="info" icon={<History className="h-5 w-5" />} helpText="From Hermes session JSONL" />
<MetricCard label="Live cron jobs" value={liveCron.length} tone="info" icon={<Clock3 className="h-5 w-5" />} helpText="From hermes cron list" />
<MetricCard label="Watchdog alerts" value={liveAlerts.length} tone={liveAlerts.some((a) => a.severity === 'critical') ? 'danger' : liveAlerts.some((a) => a.severity === 'warn') ? 'warning' : 'default'} icon={<Flame className="h-5 w-5" />} helpText="From watchdog logs" />
<MetricCard label="Backup commits" value={liveBackups.length} tone="success" icon={<TrendingUp className="h-5 w-5" />} helpText="From backup git history" />
</section>
<SectionCard
title="Live artifact timeline"
subtitle="Real session events, sessions, cron, watchdog, and backup history from the Hermes telemetry endpoint. Message content is redacted at the backend."
actions={<Badge variant={telemetryError ? 'error' : 'success'}>{telemetryError ? 'Telemetry unavailable' : 'Live telemetry'}</Badge>}
>
{telemetryError ? (
<p className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4 text-sm text-[var(--bl-warning)]">
Could not load telemetry: {telemetryError}
</p>
) : (
<div className="grid gap-4 lg:grid-cols-2 xl:grid-cols-5">
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
<p className="text-xs uppercase tracking-[0.2em] text-[var(--bl-text-tertiary)]">Recent events</p>
<div className="mt-3 space-y-2 text-sm text-[var(--bl-text-secondary)]">
{liveEvents.length > 0 ? liveEvents.map((event) => (
<div key={`${event.instanceId}-${event.id}`} className="flex items-start justify-between gap-3">
<span className="line-clamp-2">{event.summary}</span>
<HermesInstanceBadge instanceId={event.instanceId} />
</div>
)) : <p>No session events returned.</p>}
</div>
</div>
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
<p className="text-xs uppercase tracking-[0.2em] text-[var(--bl-text-tertiary)]">Recent sessions</p>
<div className="mt-3 space-y-2 text-sm text-[var(--bl-text-secondary)]">
{liveSessions.length > 0 ? liveSessions.map((session) => (
<div key={`${session.instanceId}-${session.id}`} className="flex items-center justify-between gap-3">
<span className="truncate">{session.displayName ?? session.sessionKey}</span>
<HermesInstanceBadge instanceId={session.instanceId} />
</div>
)) : <p>No session entries returned.</p>}
</div>
</div>
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
<p className="text-xs uppercase tracking-[0.2em] text-[var(--bl-text-tertiary)]">Recent watchdog alerts</p>
<div className="mt-3 space-y-2 text-sm text-[var(--bl-text-secondary)]">
{liveAlerts.length > 0 ? liveAlerts.map((alert) => (
<div key={`${alert.instanceId}-${alert.timestamp}-${alert.message}`} className="flex items-start justify-between gap-3">
<span className="line-clamp-2">{alert.message}</span>
<div className="flex shrink-0 items-center gap-2">
<Badge variant={alert.severity === 'critical' ? 'error' : alert.severity === 'warn' ? 'warning' : 'info'}>{alert.severity}</Badge>
<HermesInstanceBadge instanceId={alert.instanceId} />
</div>
</div>
)) : <p>No watchdog alerts returned.</p>}
</div>
</div>
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
<p className="text-xs uppercase tracking-[0.2em] text-[var(--bl-text-tertiary)]">Cron entries</p>
<div className="mt-3 space-y-2 text-sm text-[var(--bl-text-secondary)]">
{liveCron.length > 0 ? liveCron.map((entry) => (
<div key={`${entry.instanceId}-${entry.id}`} className="flex items-center justify-between gap-3">
<span className="truncate">{entry.name}</span>
<HermesInstanceBadge instanceId={entry.instanceId} />
</div>
)) : <p>No cron entries returned.</p>}
</div>
</div>
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
<p className="text-xs uppercase tracking-[0.2em] text-[var(--bl-text-tertiary)]">Backup history</p>
<div className="mt-3 space-y-2 text-sm text-[var(--bl-text-secondary)]">
{liveBackups.length > 0 ? liveBackups.map((entry) => (
<div key={`${entry.instanceId}-${entry.sha}`} className="flex items-center justify-between gap-3">
<span className="truncate">{entry.subject}</span>
<HermesInstanceBadge instanceId={entry.instanceId} />
</div>
)) : <p>No backup commits returned.</p>}
</div>
</div>
</div>
)}
</SectionCard>
<SectionCard title="Weekly activity chart" subtitle="Accessible bar chart built with standard layout primitives.">
<div className="overflow-x-auto">
<div className="flex min-w-[48rem] items-end gap-4 rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-5">

View File

@ -1,8 +1,8 @@
'use client';
import { useMemo } from 'react';
import { useEffect, useMemo, useState } from 'react';
import Link from 'next/link';
import { ArrowRight, BadgeCheck, Bot, CheckCircle2, Clock3, LayoutDashboard, OctagonAlert, Rocket, ShieldAlert, Sparkles, TriangleAlert } from 'lucide-react';
import { ArrowRight, BadgeCheck, BellRing, Bot, CheckCircle2, Clock3, LayoutDashboard, OctagonAlert, Rocket, ShieldAlert, Sparkles, TriangleAlert } from 'lucide-react';
import { Badge, Button } from '@/components/ui/Primitives';
import { HermesShell, MetricCard, SectionCard } from '@/components/hermes-shell';
import { HermesInstanceBadge } from '@/components/hermes-instance-switcher';
@ -19,6 +19,15 @@ import {
type HermesProduct,
type HermesTask,
} from '@/lib/hermes';
import {
collectBackupEntries,
collectCronEntries,
collectWatchdogAlerts,
emptyTelemetryState,
loadAllHermesTelemetry,
telemetryForFilter,
type HermesTelemetryState,
} from '@/lib/hermes-telemetry-client';
const fmtDate = new Intl.DateTimeFormat('en', {
month: 'short',
@ -80,6 +89,8 @@ function ProductMiniCard({ product }: { product: HermesProduct }) {
export default function HermesMissionControlPage() {
const { selectedInstance } = useHermesInstance();
const [telemetry, setTelemetry] = useState<HermesTelemetryState>(emptyTelemetryState);
const [telemetryError, setTelemetryError] = useState<string | null>(null);
const overview = useMemo(() => getHermesOverview(selectedInstance), [selectedInstance]);
// Per-instance roll-up cards always show both Vijay and Bheem regardless of
// the active filter — they're the "comparison" view that sits next to the
@ -124,6 +135,32 @@ export default function HermesMissionControlPage() {
);
const actionableProducts = filteredProducts.filter((product) => product.needsAttention).slice(0, 6);
const agentStatuses = useMemo(() => getHermesAgents(selectedInstance), [selectedInstance]);
const liveSnapshots = useMemo(() => telemetryForFilter(telemetry, selectedInstance), [telemetry, selectedInstance]);
const liveAlerts = useMemo(() => collectWatchdogAlerts(telemetry, selectedInstance).slice(0, 8), [telemetry, selectedInstance]);
const liveBackups = useMemo(() => collectBackupEntries(telemetry, selectedInstance).slice(0, 6), [telemetry, selectedInstance]);
const liveCron = useMemo(() => collectCronEntries(telemetry, selectedInstance).slice(0, 6), [telemetry, selectedInstance]);
useEffect(() => {
let active = true;
const load = async () => {
try {
const next = await loadAllHermesTelemetry();
if (!active) return;
setTelemetry(next);
setTelemetryError(null);
} catch (err) {
if (!active) return;
setTelemetryError(err instanceof Error ? err.message : String(err));
}
};
void load();
const timer = window.setInterval(load, 60_000);
return () => {
active = false;
window.clearInterval(timer);
};
}, []);
const autoActions = [
'Continue the queued execution lane for high-priority product updates.',
'Publish a weekly digest from completed and failed work.',
@ -185,6 +222,77 @@ export default function HermesMissionControlPage() {
<HermesOpsPanel />
<SectionCard
title="Unified live alerts"
subtitle="Cross-instance alert, cron, session, and backup signals from the real Hermes telemetry endpoint."
actions={<Badge variant={telemetryError ? 'error' : 'success'}>{telemetryError ? 'Telemetry unavailable' : 'Live telemetry'}</Badge>}
>
{telemetryError ? (
<p className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4 text-sm text-[var(--bl-warning)]">
Could not load telemetry: {telemetryError}
</p>
) : (
<div className="grid gap-4 xl:grid-cols-[1.2fr_0.8fr]">
<div className="space-y-3">
{liveAlerts.length > 0 ? liveAlerts.map((alert) => (
<div key={`${alert.instanceId}-${alert.timestamp}-${alert.message}`} className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
<div className="flex flex-wrap items-start justify-between gap-3">
<div className="min-w-0">
<div className="flex flex-wrap items-center gap-2">
<Badge variant={alert.severity === 'critical' ? 'error' : alert.severity === 'warn' ? 'warning' : 'info'}>{alert.severity}</Badge>
<HermesInstanceBadge instanceId={alert.instanceId} />
<span className="text-xs text-[var(--bl-text-tertiary)]">{fmtDate.format(new Date(alert.timestamp))}</span>
</div>
<p className="mt-2 text-sm text-[var(--bl-text-primary)]">{alert.message}</p>
</div>
<BellRing className="h-4 w-4 text-[var(--bl-text-tertiary)]" />
</div>
</div>
)) : (
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4 text-sm text-[var(--bl-text-secondary)]">
No watchdog alerts were returned for the selected instance filter.
</div>
)}
</div>
<div className="grid gap-3">
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
<p className="text-xs uppercase tracking-[0.2em] text-[var(--bl-text-tertiary)]">Sessions</p>
<div className="mt-3 grid gap-2">
{liveSnapshots.map((snapshot) => (
<div key={snapshot.instanceId} className="flex items-center justify-between gap-3 text-sm">
<HermesInstanceBadge instanceId={snapshot.instanceId} />
<span className="text-[var(--bl-text-secondary)]">{snapshot.sessions.totalSessions} sessions · {snapshot.sessions.totalMessages} messages</span>
</div>
))}
</div>
</div>
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
<p className="text-xs uppercase tracking-[0.2em] text-[var(--bl-text-tertiary)]">Upcoming Hermes cron</p>
<div className="mt-3 space-y-2 text-sm text-[var(--bl-text-secondary)]">
{liveCron.length > 0 ? liveCron.map((entry) => (
<div key={`${entry.instanceId}-${entry.id}`} className="flex items-center justify-between gap-3">
<span className="truncate">{entry.name}</span>
<HermesInstanceBadge instanceId={entry.instanceId} />
</div>
)) : <p>No cron entries returned.</p>}
</div>
</div>
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
<p className="text-xs uppercase tracking-[0.2em] text-[var(--bl-text-tertiary)]">Recent backup commits</p>
<div className="mt-3 space-y-2 text-sm text-[var(--bl-text-secondary)]">
{liveBackups.length > 0 ? liveBackups.map((entry) => (
<div key={`${entry.instanceId}-${entry.sha}`} className="flex items-center justify-between gap-3">
<span className="truncate">{entry.subject}</span>
<HermesInstanceBadge instanceId={entry.instanceId} />
</div>
)) : <p>No backup commits returned.</p>}
</div>
</div>
</div>
</div>
)}
</SectionCard>
<div className="grid gap-6 xl:grid-cols-[1.5fr_1fr]">
<SectionCard title="Active Missions" subtitle="What Hermes is currently running or waiting on." actions={<Button asChild variant="ghost" size="sm"><Link href="/hermes/tasks">View all tasks <ArrowRight className="ml-2 h-4 w-4" /></Link></Button>}>
<div className="space-y-3">

View File

@ -2,10 +2,18 @@
import Link from 'next/link';
import { useParams } from 'next/navigation';
import { useEffect, useMemo, useState } from 'react';
import { ArrowLeft, CircleDashed, Clock3, ShieldAlert, Sparkles } from 'lucide-react';
import { Badge, Button } from '@/components/ui/Primitives';
import { HermesShell, MetricCard, SectionCard } from '@/components/hermes-shell';
import { getHermesProductById, getHermesTaskById, getHermesTaskEvents } from '@/lib/hermes';
import {
collectSessionEvents,
collectSessionEntries,
emptyTelemetryState,
loadAllHermesTelemetry,
type HermesTelemetryState,
} from '@/lib/hermes-telemetry-client';
const fmt = new Intl.DateTimeFormat('en', { month: 'short', day: 'numeric', hour: 'numeric', minute: '2-digit' });
@ -24,6 +32,29 @@ export default function HermesTaskDetailPage({ params }: { params: { id: string
const taskId = routeParams?.id ?? params.id;
const task = getHermesTaskById(taskId);
const events = getHermesTaskEvents(taskId);
const [telemetry, setTelemetry] = useState<HermesTelemetryState>(emptyTelemetryState);
const [telemetryError, setTelemetryError] = useState<string | null>(null);
const liveSessions = useMemo(() => collectSessionEntries(telemetry, 'all').slice(0, 8), [telemetry]);
const liveEvents = useMemo(() => collectSessionEvents(telemetry, 'all').slice(0, 12), [telemetry]);
useEffect(() => {
let active = true;
const load = async () => {
try {
const next = await loadAllHermesTelemetry();
if (!active) return;
setTelemetry(next);
setTelemetryError(null);
} catch (err) {
if (!active) return;
setTelemetryError(err instanceof Error ? err.message : String(err));
}
};
void load();
return () => {
active = false;
};
}, []);
if (!task) {
return (
@ -40,7 +71,6 @@ export default function HermesTaskDetailPage({ params }: { params: { id: string
}
const product = getHermesProductById(task.productId);
const lastEvent = events[0];
const timeline = events.slice().sort((a, b) => new Date(a.timestamp).getTime() - new Date(b.timestamp).getTime());
return (
@ -110,6 +140,57 @@ export default function HermesTaskDetailPage({ params }: { params: { id: string
</SectionCard>
</div>
<SectionCard
title="Live Hermes event timeline"
subtitle="Sanitized session JSONL events read from Hermes homes, paired with durable session index context. Message content is redacted at the backend."
actions={<Badge variant={telemetryError ? 'error' : 'success'}>{telemetryError ? 'Telemetry unavailable' : 'Live sessions'}</Badge>}
>
{telemetryError ? (
<p className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4 text-sm text-[var(--bl-warning)]">
Could not load telemetry: {telemetryError}
</p>
) : (
<div className="grid gap-4 xl:grid-cols-[1.2fr_0.8fr]">
<div className="space-y-3">
{liveEvents.map((event) => (
<div key={`${event.instanceId}-${event.id}`} className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
<div className="flex flex-wrap items-start justify-between gap-3">
<div className="min-w-0">
<div className="flex flex-wrap items-center gap-2">
<Badge variant={event.eventType === 'tool-call' ? 'info' : event.eventType === 'system' ? 'neutral' : 'success'}>{event.eventType}</Badge>
<Badge variant="neutral">{event.instanceId}</Badge>
{event.status ? <Badge variant="neutral">{event.status}</Badge> : null}
</div>
<p className="mt-2 font-medium text-[var(--bl-text-primary)]">{event.summary}</p>
<p className="mt-1 truncate text-xs text-[var(--bl-text-secondary)]">{event.sessionFile}</p>
</div>
<p className="text-xs text-[var(--bl-text-tertiary)]">{event.timestamp ? fmt.format(new Date(event.timestamp)) : 'unknown'}</p>
</div>
</div>
))}
{liveEvents.length === 0 ? (
<p className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4 text-sm text-[var(--bl-text-secondary)]">No live session events were returned.</p>
) : null}
</div>
<div className="grid gap-3 sm:grid-cols-2 xl:grid-cols-1">
{liveSessions.map((session) => (
<div key={`${session.instanceId}-${session.id}`} className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
<div className="flex items-center justify-between gap-3">
<Badge variant={session.resumePending || session.suspended ? 'warning' : 'info'}>{session.platform ?? 'session'}</Badge>
<Badge variant="neutral">{session.instanceId}</Badge>
</div>
<p className="mt-3 truncate font-medium text-[var(--bl-text-primary)]">{session.displayName ?? session.sessionKey}</p>
<p className="mt-1 text-xs text-[var(--bl-text-secondary)]">Updated {session.updatedAt ? fmt.format(new Date(session.updatedAt)) : 'unknown'}</p>
</div>
))}
{liveSessions.length === 0 ? (
<p className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4 text-sm text-[var(--bl-text-secondary)]">No live session entries were returned.</p>
) : null}
</div>
</div>
)}
</SectionCard>
<SectionCard title="Timeline" subtitle="Chronological event stream for the task lifecycle.">
<ol className="space-y-4">
{timeline.map((event) => (

View File

@ -1,8 +1,8 @@
'use client';
import { Fragment, useMemo, useState } from 'react';
import { Fragment, useEffect, useMemo, useState } from 'react';
import Link from 'next/link';
import { Download, Filter, Search, ChevronDown, ChevronUp, ArrowLeftRight } from 'lucide-react';
import { Download, Filter, Search, ChevronDown, ChevronUp, ArrowLeftRight, Activity } from 'lucide-react';
import { Badge, Button, Input } from '@/components/ui/Primitives';
import { HermesShell, MetricCard, SectionCard } from '@/components/hermes-shell';
import { HermesInstanceBadge } from '@/components/hermes-instance-switcher';
@ -17,6 +17,16 @@ import {
type HermesTaskSource,
type HermesTask,
} from '@/lib/hermes';
import {
collectBackupEntries,
collectCronEntries,
collectSessionEntries,
collectWatchdogAlerts,
emptyTelemetryState,
loadAllHermesTelemetry,
telemetryForFilter,
type HermesTelemetryState,
} from '@/lib/hermes-telemetry-client';
const statuses: Array<HermesTaskStatus | 'all'> = ['all', 'queued', 'running', 'blocked', 'completed', 'failed', 'skipped', 'cancelled'];
const priorities: Array<HermesPriority | 'all'> = ['all', 'P0', 'P1', 'P2', 'P3'];
@ -50,6 +60,8 @@ export default function HermesTaskLedgerPage() {
const [sort, setSort] = useState<(typeof sortOptions)[number]>('newest');
const [page, setPage] = useState(1);
const [expandedTaskId, setExpandedTaskId] = useState<string | null>(null);
const [telemetry, setTelemetry] = useState<HermesTelemetryState>(emptyTelemetryState);
const [telemetryError, setTelemetryError] = useState<string | null>(null);
const { selectedInstance } = useHermesInstance();
const tasks = useMemo(
@ -67,6 +79,68 @@ export default function HermesTaskLedgerPage() {
}), [tasks]);
const visibleProducts = hermesProducts.slice(0, 20);
const liveSnapshots = useMemo(() => telemetryForFilter(telemetry, selectedInstance), [telemetry, selectedInstance]);
const liveCron = useMemo(() => collectCronEntries(telemetry, selectedInstance), [telemetry, selectedInstance]);
const liveAlerts = useMemo(() => collectWatchdogAlerts(telemetry, selectedInstance), [telemetry, selectedInstance]);
const liveBackups = useMemo(() => collectBackupEntries(telemetry, selectedInstance), [telemetry, selectedInstance]);
const liveSessions = useMemo(() => collectSessionEntries(telemetry, selectedInstance), [telemetry, selectedInstance]);
const liveActivityRows = useMemo(() => [
...liveSessions.map((entry) => ({
id: `session-${entry.instanceId}-${entry.id}`,
instanceId: entry.instanceId,
kind: 'session',
title: entry.displayName ? `${entry.displayName} session` : entry.sessionKey,
detail: entry.resumePending ? 'resume pending' : entry.suspended ? 'suspended' : entry.platform ?? 'session',
time: entry.updatedAt ?? entry.createdAt,
tone: entry.resumePending || entry.suspended ? 'warning' as const : 'info' as const,
})),
...liveCron.map((entry) => ({
id: `cron-${entry.instanceId}-${entry.id}`,
instanceId: entry.instanceId,
kind: 'cron',
title: entry.name,
detail: entry.lastStatus ?? entry.schedule ?? 'Hermes cron entry',
time: entry.nextRun ?? entry.lastRun,
tone: entry.active ? 'success' as const : 'neutral' as const,
})),
...liveAlerts.map((alert) => ({
id: `alert-${alert.instanceId}-${alert.timestamp}-${alert.message}`,
instanceId: alert.instanceId,
kind: 'alert',
title: alert.message,
detail: alert.severity,
time: alert.timestamp,
tone: alert.severity === 'critical' ? 'error' as const : alert.severity === 'warn' ? 'warning' as const : 'info' as const,
})),
...liveBackups.map((entry) => ({
id: `backup-${entry.instanceId}-${entry.sha}`,
instanceId: entry.instanceId,
kind: 'backup',
title: entry.subject,
detail: entry.sha.slice(0, 8),
time: entry.committedAt,
tone: 'success' as const,
})),
].sort((a, b) => new Date(b.time ?? 0).getTime() - new Date(a.time ?? 0).getTime()).slice(0, 12), [liveSessions, liveCron, liveAlerts, liveBackups]);
useEffect(() => {
let active = true;
const load = async () => {
try {
const next = await loadAllHermesTelemetry();
if (!active) return;
setTelemetry(next);
setTelemetryError(null);
} catch (err) {
if (!active) return;
setTelemetryError(err instanceof Error ? err.message : String(err));
}
};
void load();
return () => {
active = false;
};
}, []);
return (
<HermesShell
@ -86,6 +160,68 @@ export default function HermesTaskLedgerPage() {
<MetricCard label="Failed" value={counts.failed} tone="danger" />
</section>
<SectionCard
title="Live Hermes activity ledger"
subtitle="Real cron entries, watchdog alerts, backup commits, and session totals from the telemetry endpoint. The task table below remains the planner-style seed ledger until Hermes emits task-level events."
actions={<Badge variant={telemetryError ? 'error' : 'success'}>{telemetryError ? 'Telemetry unavailable' : 'Live telemetry'}</Badge>}
>
{telemetryError ? (
<p className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4 text-sm text-[var(--bl-warning)]">
Could not load telemetry: {telemetryError}
</p>
) : (
<div className="grid gap-4 xl:grid-cols-[1fr_2fr]">
<div className="grid gap-3">
{liveSnapshots.map((snapshot) => (
<div key={snapshot.instanceId} className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
<div className="flex items-center justify-between gap-3">
<HermesInstanceBadge instanceId={snapshot.instanceId} />
<Badge variant={snapshot.sessions.status === 'up' ? 'success' : 'warning'}>{snapshot.sessions.status}</Badge>
</div>
<p className="mt-3 text-2xl font-semibold text-[var(--bl-text-primary)]">{snapshot.sessions.totalSessions}</p>
<p className="text-sm text-[var(--bl-text-secondary)]">{snapshot.sessions.totalMessages} session messages observed</p>
</div>
))}
</div>
<div className="overflow-hidden rounded-2xl border border-[var(--bl-border)]">
<table className="min-w-full divide-y divide-[var(--bl-border)] text-left text-sm">
<thead className="bg-[var(--bl-surface-muted)] text-xs uppercase tracking-[0.18em] text-[var(--bl-text-tertiary)]">
<tr>
<th className="px-4 py-3">Artifact</th>
<th className="px-4 py-3">Instance</th>
<th className="px-4 py-3">Signal</th>
<th className="px-4 py-3">Time</th>
</tr>
</thead>
<tbody className="divide-y divide-[var(--bl-border)] bg-[var(--bl-surface-card)]">
{liveActivityRows.map((row) => (
<tr key={row.id}>
<td className="px-4 py-4">
<div className="flex items-start gap-2">
<Activity className="mt-0.5 h-4 w-4 text-[var(--bl-text-tertiary)]" />
<div>
<p className="font-medium text-[var(--bl-text-primary)]">{row.title}</p>
<p className="text-xs text-[var(--bl-text-secondary)]">{row.kind}</p>
</div>
</div>
</td>
<td className="px-4 py-4"><HermesInstanceBadge instanceId={row.instanceId} /></td>
<td className="px-4 py-4"><Badge variant={row.tone}>{row.detail}</Badge></td>
<td className="px-4 py-4 text-[var(--bl-text-secondary)]">{row.time ? prettyDate(row.time) : '—'}</td>
</tr>
))}
{liveActivityRows.length === 0 ? (
<tr>
<td colSpan={4} className="px-4 py-10 text-center text-[var(--bl-text-secondary)]">No live activity artifacts were returned for the current instance filter.</td>
</tr>
) : null}
</tbody>
</table>
</div>
</div>
)}
</SectionCard>
<SectionCard title="Filters" subtitle="Find work by status, product, priority, type, source, or age.">
<div className="grid gap-3 lg:grid-cols-4 xl:grid-cols-7">
<Input value={query} onChange={(event) => { setQuery(event.target.value); setPage(1); }} placeholder="Search tasks..." aria-label="Search tasks" className="xl:col-span-2" />

View File

@ -130,6 +130,43 @@ export interface HermesSessionStats {
status: HermesProbeStatus;
}
/**
 * One session record as listed in a telemetry snapshot's `sessionList`.
 *
 * Every descriptive field other than `id` and `sessionKey` is nullable, so
 * consumers must supply their own fallbacks for display and ordering.
 */
export interface HermesSessionEntry {
/** Identifier of the session entry. NOTE(review): uniqueness scope (per instance vs. global) is not visible here — confirm. */
id: string;
/** Key naming the session; always present, unlike `displayName`. */
sessionKey: string;
/** Platform label for the session, or `null` when not reported. */
platform: string | null;
/** Chat type label, or `null` when not reported. */
chatType: string | null;
/** Human-readable session name, or `null` when not reported. */
displayName: string | null;
/** Creation timestamp string, or `null`. NOTE(review): presumably ISO 8601 — confirm against the backend. */
createdAt: string | null;
/** Last-update timestamp string, or `null`. NOTE(review): presumably ISO 8601 — confirm against the backend. */
updatedAt: string | null;
/** Whether the session is flagged as suspended. */
suspended: boolean;
/** Whether the session is flagged as having a resume pending. */
resumePending: boolean;
/** Token total for the session, or `null` when not reported. */
totalTokens: number | null;
/** Estimated cost in US dollars (per the field name), or `null` when not reported. */
estimatedCostUsd: number | null;
}
/** Session entries collected for one instance, together with the status of the probe that produced them. */
export interface HermesSessionList {
/** The session records; may be empty. */
entries: HermesSessionEntry[];
/** Probe status reported alongside the entries. */
status: HermesProbeStatus;
}
/**
 * One event extracted from a session file, as carried in a telemetry
 * snapshot's `sessionEvents`.
 *
 * NOTE(review): dashboard copy states message content is redacted at the
 * backend — confirm that `summary` never carries raw message text.
 */
export interface HermesSessionEvent {
/** Identifier of the event. NOTE(review): uniqueness scope is not visible here — confirm. */
id: string;
/** Name or path of the session file the event came from. */
sessionFile: string;
/** Event timestamp string, or `null` when the event has none. */
timestamp: string | null;
/** Role attached to the event, or `null` when not reported. */
role: string | null;
/** Coarse classification of the event; `'unknown'` is the catch-all member of the union. */
eventType: 'message' | 'tool-call' | 'tool-result' | 'reasoning' | 'system' | 'unknown';
/** Short text describing the event. */
summary: string;
/** Names of tools referenced by the event; may be empty. */
toolNames: string[];
/** Item type labels found in the event; may be empty. */
itemTypes: string[];
/** Status label for the event, or `null` when not reported. */
status: string | null;
}
/** Session events collected for one instance, together with probe status and a source count. */
export interface HermesSessionEventList {
/** The extracted events; may be empty. */
entries: HermesSessionEvent[];
/** Probe status reported alongside the entries. */
status: HermesProbeStatus;
/** Number of sources read. NOTE(review): presumably the count of session files scanned — confirm. */
sourceCount: number;
}
export interface HermesCronEntry {
id: string;
name: string;
@ -201,6 +238,8 @@ export interface HermesTelemetrySnapshot {
cached: boolean;
instanceId: 'vijay' | 'bheem';
sessions: HermesSessionStats;
sessionList: HermesSessionList;
sessionEvents: HermesSessionEventList;
cron: HermesCronList;
memory: HermesMemoryList;
skills: HermesSkillList;

View File

@ -0,0 +1,54 @@
import { api, type HermesTelemetrySnapshot, type HermesWatchdogAlert } from '@/lib/api';
import type { HermesInstanceId, HermesInstanceFilter } from '@/lib/hermes';
/** Telemetry snapshot per Hermes instance id; `null` marks an instance with no snapshot available. */
export type HermesTelemetryState = Record<HermesInstanceId, HermesTelemetrySnapshot | null>;
/** State in which neither instance (`vijay`, `bheem`) has a snapshot. */
export const emptyTelemetryState: HermesTelemetryState = { vijay: null, bheem: null };
/**
 * Fetches the telemetry snapshot of both Hermes instances concurrently.
 *
 * Both requests are started before either is awaited. Because the results
 * are joined with `Promise.all`, the returned promise rejects as soon as
 * either request rejects; no partial state is returned in that case.
 *
 * @returns The snapshots keyed by instance id.
 */
export async function loadAllHermesTelemetry(): Promise<HermesTelemetryState> {
  const vijayRequest = api.getHermesTelemetry('vijay');
  const bheemRequest = api.getHermesTelemetry('bheem');
  const [vijay, bheem] = await Promise.all([vijayRequest, bheemRequest]);
  return { vijay, bheem };
}
/**
 * Resolves an instance filter to the snapshots that are actually available.
 *
 * @param telemetry - Per-instance snapshots; `null` entries are skipped.
 * @param selectedInstance - `'all'` or a single instance id.
 * @returns For `'all'`, the non-null snapshots in `vijay`, `bheem` order;
 *   for a single instance, a one-element array or an empty array when that
 *   instance has no snapshot.
 */
export function telemetryForFilter(
  telemetry: HermesTelemetryState,
  selectedInstance: HermesInstanceFilter,
): HermesTelemetrySnapshot[] {
  if (selectedInstance !== 'all') {
    // Read the slot once so the null check and the returned value agree.
    const snapshot = telemetry[selectedInstance];
    return snapshot ? [snapshot] : [];
  }
  const available: HermesTelemetrySnapshot[] = [];
  for (const candidate of [telemetry.vijay, telemetry.bheem]) {
    if (candidate) available.push(candidate);
  }
  return available;
}
/**
 * Gathers watchdog alerts from every snapshot matching the filter, tags each
 * alert with the instance it came from, and orders them newest first.
 *
 * @param telemetry - Per-instance snapshots.
 * @param selectedInstance - `'all'` or a single instance id.
 * @returns A new array of alerts sorted by descending `timestamp`.
 */
export function collectWatchdogAlerts(
  telemetry: HermesTelemetryState,
  selectedInstance: HermesInstanceFilter,
): Array<HermesWatchdogAlert & { instanceId: HermesInstanceId }> {
  const tagged: Array<HermesWatchdogAlert & { instanceId: HermesInstanceId }> = [];
  for (const snapshot of telemetryForFilter(telemetry, selectedInstance)) {
    for (const alert of snapshot.watchdog.alerts) {
      tagged.push({ ...alert, instanceId: snapshot.instanceId });
    }
  }
  return tagged.sort((left, right) => {
    const leftMs = new Date(left.timestamp).getTime();
    const rightMs = new Date(right.timestamp).getTime();
    return rightMs - leftMs;
  });
}
/**
 * Gathers backup history entries from every snapshot matching the filter,
 * tags each with its source instance, and orders them newest first.
 *
 * @param telemetry - Per-instance snapshots.
 * @param selectedInstance - `'all'` or a single instance id.
 * @returns A new array of entries sorted by descending `committedAt`.
 */
export function collectBackupEntries(telemetry: HermesTelemetryState, selectedInstance: HermesInstanceFilter) {
  const snapshots = telemetryForFilter(telemetry, selectedInstance);
  const tagged = snapshots.flatMap(({ instanceId, backupHistory }) =>
    backupHistory.entries.map((entry) => ({ ...entry, instanceId })),
  );
  return tagged.sort((left, right) => {
    const leftMs = new Date(left.committedAt).getTime();
    const rightMs = new Date(right.committedAt).getTime();
    return rightMs - leftMs;
  });
}
/**
 * Gathers cron entries from every snapshot matching the filter and tags each
 * with its source instance.
 *
 * No sorting is applied: entries keep snapshot order, then each snapshot's
 * own entry order.
 *
 * @param telemetry - Per-instance snapshots.
 * @param selectedInstance - `'all'` or a single instance id.
 * @returns A new array of cron entries carrying `instanceId`.
 */
export function collectCronEntries(telemetry: HermesTelemetryState, selectedInstance: HermesInstanceFilter) {
  const snapshots = telemetryForFilter(telemetry, selectedInstance);
  return snapshots.flatMap(({ instanceId, cron }) =>
    cron.entries.map((entry) => ({ ...entry, instanceId })),
  );
}
/**
 * Gathers session entries from every snapshot matching the filter, tags each
 * with its source instance, and orders them most recently active first.
 *
 * The sort key is `updatedAt`, falling back to `createdAt`, then to epoch 0
 * when both are null, so entries without timestamps sink to the end.
 *
 * @param telemetry - Per-instance snapshots.
 * @param selectedInstance - `'all'` or a single instance id.
 * @returns A new array of session entries carrying `instanceId`.
 */
export function collectSessionEntries(telemetry: HermesTelemetryState, selectedInstance: HermesInstanceFilter) {
  const snapshots = telemetryForFilter(telemetry, selectedInstance);
  const tagged = snapshots.flatMap(({ instanceId, sessionList }) =>
    sessionList.entries.map((entry) => ({ ...entry, instanceId })),
  );
  return tagged.sort((left, right) => {
    const leftMs = new Date(left.updatedAt ?? left.createdAt ?? 0).getTime();
    const rightMs = new Date(right.updatedAt ?? right.createdAt ?? 0).getTime();
    return rightMs - leftMs;
  });
}
/**
 * Gathers session events from every snapshot matching the filter, tags each
 * with its source instance, and orders them newest first.
 *
 * Events whose `timestamp` is null are ordered as epoch 0, so they sink to
 * the end of the list.
 *
 * @param telemetry - Per-instance snapshots.
 * @param selectedInstance - `'all'` or a single instance id.
 * @returns A new array of session events carrying `instanceId`.
 */
export function collectSessionEvents(telemetry: HermesTelemetryState, selectedInstance: HermesInstanceFilter) {
  const snapshots = telemetryForFilter(telemetry, selectedInstance);
  const tagged = snapshots.flatMap(({ instanceId, sessionEvents }) =>
    sessionEvents.entries.map((entry) => ({ ...entry, instanceId })),
  );
  return tagged.sort((left, right) => {
    const leftMs = new Date(left.timestamp ?? 0).getTime();
    const rightMs = new Date(right.timestamp ?? 0).getTime();
    return rightMs - leftMs;
  });
}

98
docs/app-url-bookmarks.md Normal file
View File

@ -0,0 +1,98 @@
# ByteLyst App URL Bookmarks
**Owner:** ByteLyst DevOps
**Last updated:** 2026-05-31T08:14:55+00:00
**Source of truth for bookmarks:** this file
**Exposure/security companion:** [`docs/vm-exposure-inventory.md`](vm-exposure-inventory.md)
Use this as the living bookmark/reference list for deployed apps, dashboards,
APIs, and private admin surfaces. When a new app is deployed, add it here in
the same change that adds its Caddy route, Compose service, or systemd unit.
`Last deployed / restarted` means the latest timestamp we have evidence for.
For Docker services it is the container `StartedAt` timestamp from
`docker inspect`; for systemd services it is the service active-since timestamp.
If the deploy time is not known, use `unknown` and update it during the next
verified deploy.
## Update Checklist
When deploying or changing an app:
1. Add or update the row in this file.
2. Update `Last deployed / restarted` with an exact UTC timestamp.
3. Record the repo/service owner and access model.
4. If exposure changes, also update [`docs/vm-exposure-inventory.md`](vm-exposure-inventory.md).
5. If it is a DevOps dashboard endpoint, also update [`dashboard/ENDPOINTS.md`](../dashboard/ENDPOINTS.md).
## Primary Dashboards
| Name | URL | Access | Backend/API | Runtime owner | Last deployed / restarted | Notes |
| --- | --- | --- | --- | --- | --- | --- |
| DevOps custom dashboard | `https://devops.bytelyst.com` | private-admin/auth | `https://api.bytelyst.com/devops` | `dashboard/docker-compose.yml` (`devops-web`, `devops-backend`) | `2026-05-31T04:02:24Z` web, `2026-05-31T04:02:23Z` backend | Unified ByteLyst DevOps dashboard. Hermes Mission Control lives under `/hermes`. |
| DevOps Tailscale entry | `https://srv1491630.tailf85608.ts.net/login` | Tailscale/private-admin/auth | `http://127.0.0.1:4004` | Tailscale serve -> `localhost:3049` | `2026-05-31T04:02:24Z` | Private login path used for VM-side dashboard review. |
| Platform admin dashboard | `https://admin.bytelyst.com` | private-admin/auth | `https://api.bytelyst.com/platform/api` | common platform `admin-web` | `unknown` | Caddy route is documented; container was not present in the 2026-05-27 exposure inventory. Verify before relying on it. |
| Hermes Mission Control | `https://devops.bytelyst.com/hermes` | private-admin/auth | `https://api.bytelyst.com/devops/api/hermes/*` | DevOps custom dashboard | `2026-05-31T04:02:24Z` | Unified custom Hermes dashboard over Vijay/root and Bheem/Uma. |
| Hermes native Vijay dashboard | `http://100.87.53.10:9119/` | Tailscale-only/private-admin | native Hermes service | `hermes-root-dashboard.service` | `2026-05-31T04:02:20Z` | Built-in Hermes dashboard for root/Vijay. No public Caddy route. |
| Hermes native Bheem dashboard | `http://100.87.53.10:9120/` | Tailscale-only/private-admin | native Hermes service | `uma-hermes-dashboard.service` | `2026-05-31T04:02:20Z` | Built-in Hermes dashboard for Uma/Bheem. No public Caddy route. |
| LLM Lab dashboard | `https://llmlab.bytelyst.com` | private-admin | local/dashboard service | common platform `llmlab-dashboard` | `2026-05-31T04:02:24Z` | Keep private/auth-gated. Local host port `127.0.0.1:3075`. |
## Public Apps
| App | Public URL | API URL | Runtime owner | Last deployed / restarted | Notes |
| --- | --- | --- | --- | --- | --- |
| InvtTrdg | `https://invttrdg.bytelyst.com` | `https://api.bytelyst.com/invttrdg/*` | `/opt/bytelyst/learning_ai_invt_trdg` | `unknown` | Exposure inventory maps web to `:3085` and backend to `:4025`. |
| Clock / Chronomind | `https://clock.bytelyst.com` | `https://api.bytelyst.com/chronomind/*` | `/opt/bytelyst/learning_ai_clock` | `2026-05-31T04:02:24Z` web/backend | Local web `127.0.0.1:3030`, backend `127.0.0.1:4011`. |
| Notes / Notelett | `https://notes.bytelyst.com` | `https://api.bytelyst.com/notelett/*` | `/opt/bytelyst/learning_ai_notes` | `2026-05-31T04:02:23Z` web, `2026-05-31T04:02:24Z` backend | Local web `127.0.0.1:3000`, backend `127.0.0.1:4016`. |
| Tracker | `https://tracker.bytelyst.com` | n/a | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:24Z` | Local web `127.0.0.1:3003`. |
| PeakPulse | n/a | `https://api.bytelyst.com/peakpulse/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:24Z` platform stack | Backend is Docker-internal `peakpulse-backend:4010`. |
| Jarvis Jr | n/a | `https://api.bytelyst.com/jarvisjr/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:23Z` web, `2026-05-31T04:02:24Z` platform stack | Local web `127.0.0.1:3035`, backend Docker-internal `jarvisjr-backend:4012`. |
| Nomgap | Vercel / external | `https://api.bytelyst.com/nomgap/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:24Z` platform stack | Old local `nomgap-web` was retired; backend remains Docker-internal. |
| Mindlyst | n/a | `https://api.bytelyst.com/mindlyst/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:24Z` web/platform stack | Local web `127.0.0.1:3050`, backend Docker-internal `mindlyst-backend:4014`. |
| LysnrAI | n/a | `https://api.bytelyst.com/lysnrai/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:23Z` dashboard, `2026-05-31T04:02:24Z` platform stack | Local dashboard `127.0.0.1:3002`, backend Docker-internal `lysnrai-backend:4015`. |
| Flowmonk | n/a | `https://api.bytelyst.com/flowmonk/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:24Z` web/platform stack | Local web `127.0.0.1:3040`, backend Docker-internal `flowmonk-backend:4017`. |
| ActionTrail | n/a | `https://api.bytelyst.com/actiontrail/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:23Z` web, `2026-05-31T04:02:24Z` platform stack | Local web `127.0.0.1:3060`; exposure inventory notes route/backend mapping needs verification. |
| LocalMemGPT | n/a | `https://api.bytelyst.com/localmemgpt/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:24Z` web/platform stack | Local web `127.0.0.1:3070`, backend Docker-internal `localmemgpt-backend:4019`. |
## Shared APIs And Infrastructure
| Service | URL | Access | Runtime owner | Last deployed / restarted | Notes |
| --- | --- | --- | --- | --- | --- |
| API gateway | `https://api.bytelyst.com` | public gateway | Caddy/common platform | `2026-05-31T04:02:24Z` caddy | Routes app APIs by path. |
| Platform API | `https://api.bytelyst.com/platform/api` | public/auth-required | common platform `platform-service` | `2026-05-31T04:02:24Z` | Auth and platform data API. |
| Extraction API | `https://api.bytelyst.com/extraction/*` | public/API-controlled | common platform `extraction-service` | `2026-05-31T04:02:23Z` | Confirm auth posture before exposing new consumers. |
| MCP API | `https://api.bytelyst.com/mcp/*` | public/API-controlled | common platform `mcp-server` | `2026-05-31T04:02:23Z` | Confirm public need before widening access. |
| Gitea | `https://gitea.bytelyst.com` | public/admin-auth | `gitea-npm-registry` | `2026-05-31T04:02:23Z` | Local direct registry also listens on `:3300`; see registry docs. |
| Gitea npm registry | `http://localhost:3300/api/packages/bytelyst/npm/` | VM/local or tunnel | `gitea-npm-registry` | `2026-05-31T04:02:23Z` | Do not use from laptop unless tunneled. See [`docs/gitea-registry-and-package-resolution.md`](gitea-registry-and-package-resolution.md). |
| Ollama endpoint | `https://ollama.bytelyst.com` | private-admin target | host `ollama` service | `unknown` | Must never be publicly reachable without authentication. |
| Mailpit UI | `http://127.0.0.1:8025` | loopback-only | common platform `mailpit` | `2026-05-31T04:02:23Z` | Dev/test mail UI. |
| Loki | `http://127.0.0.1:3100` | loopback-only | common platform `loki` | `2026-05-31T04:02:24Z` | Observability internal. |
| Cosmos emulator UI | `http://127.0.0.1:1234` / `http://127.0.0.1:8081` | loopback-only | common platform `cosmos-emulator` | `2026-05-31T04:02:23Z` | Dev/test only; current production data uses real Cosmos for platform. |
| Azurite | `http://127.0.0.1:10000` | loopback-only target | common platform `azurite` | `2026-05-31T04:02:24Z` | Check exposure inventory before relying on external access. |
## Local Host Ports
These are operational shortcuts, not public bookmarks.
| Service | Local URL | Public/private equivalent |
| --- | --- | --- |
| DevOps web container | `http://127.0.0.1:3049` | `https://devops.bytelyst.com` or Tailscale URL |
| DevOps backend health | `http://127.0.0.1:4004/health` | `https://api.bytelyst.com/devops/health` if routed |
| Platform service health | `http://127.0.0.1:4003/health` | `https://api.bytelyst.com/platform/api` |
| Clock web | `http://127.0.0.1:3030` | `https://clock.bytelyst.com` |
| Notes web | `http://127.0.0.1:3000` | `https://notes.bytelyst.com` |
| InvtTrdg web | `http://127.0.0.1:3085` | `https://invttrdg.bytelyst.com` |
| Tracker web | `http://127.0.0.1:3003` | `https://tracker.bytelyst.com` |
| Hermes Vijay dashboard | `http://100.87.53.10:9119/` | Tailscale-only |
| Hermes Bheem dashboard | `http://100.87.53.10:9120/` | Tailscale-only |
## Open Verification Items
- Confirm whether `admin.bytelyst.com` is currently backed by a running
`admin-web` container.
- Confirm product-facing public URLs for apps listed as `n/a` before sharing
them outside the admin team.
- Confirm `actiontrail` API route/container port mapping; historical inventory
used `api.bytelyst.com/actiontrail/*` while current container metadata shows
`actiontrail-backend` as part of the common platform stack.
- Replace any `unknown` deploy timestamp during the next verified deploy.

View File

@ -37,6 +37,20 @@ Observed on 2026-05-27:
Before adding any new Caddy hostname, Docker port, or dashboard/API feature, verify that it is not a Hermes dashboard/API public exposure.
Session privacy policy for dashboard/telemetry surfaces:
- Treat gateway session content as private by default for both Vijay and Bheem.
- Dashboard routes may show counts, statuses, timestamps, IDs, sanitized warning
messages, cron names, skill/memory names, and backup commit subjects.
- Dashboard telemetry may show sanitized session JSONL event projections:
event type, role, timestamp, source filename, tool names, item types, and
status. Raw message content remains redacted before it reaches the UI.
- Dashboard routes must not expose raw prompts, full session transcripts, raw
command output containing secrets, `.env` values, OAuth payloads, raw
`state.db`, Telegram tokens, provider keys, or personal message content.
- If a durable session-event pipeline or store is added later (beyond the
sanitized JSONL event projections described above), enable secret and PII
redaction at ingestion time and store only the redacted event projection used
by the UI.
```bash
# Inspect public Caddy routes and obvious Hermes/API/dashboard references.
docker ps --format '{{.Names}} {{.Ports}}' | grep -i caddy || true
@ -85,6 +99,60 @@ systemd/hermes-root-backup.service
systemd/hermes-root-backup.timer
systemd/uma-hermes-backup.service
systemd/uma-hermes-backup.timer
systemd/hermes-health-watchdog.service
systemd/hermes-health-watchdog.timer
systemd/uma-hermes-health-watchdog.service
systemd/uma-hermes-health-watchdog.timer
systemd/hermes-ops-exporter.service
systemd/hermes-ops-exporter.timer
systemd/uma-hermes-ops-exporter.service
systemd/uma-hermes-ops-exporter.timer
```
## Mission Control ops exporter
Mission Control can read a sanitized per-instance ops export before falling back
to live cross-user probes. This reduces brittle root-to-Uma inspection and keeps
the dashboard contract free of secrets or session content.
Tracked exporter:
```bash
scripts/hermes-ops-exporter.py
```
Output paths:
```text
/root/.hermes/ops-export.json
/home/uma/.hermes/ops-export.json
```
The JSON contains only service booleans/status, timer timestamps, short Git
metadata, restore counts, and whether a Google token file exists. It does not
include token values, raw `state.db`, logs, prompt/session text, OAuth payloads,
or environment files.
Install root exporter:
```bash
cp systemd/hermes-ops-exporter.service /etc/systemd/system/hermes-ops-exporter.service
cp systemd/hermes-ops-exporter.timer /etc/systemd/system/hermes-ops-exporter.timer
systemctl daemon-reload
systemctl enable --now hermes-ops-exporter.timer
systemctl status hermes-ops-exporter.timer --no-pager
```
Install Uma exporter as user systemd:
```bash
install -d -o uma -g uma /home/uma/.config/systemd/user
cp systemd/uma-hermes-ops-exporter.service /home/uma/.config/systemd/user/uma-hermes-ops-exporter.service
cp systemd/uma-hermes-ops-exporter.timer /home/uma/.config/systemd/user/uma-hermes-ops-exporter.timer
chown uma:uma /home/uma/.config/systemd/user/uma-hermes-ops-exporter.*
runuser -u uma -- systemctl --user daemon-reload
runuser -u uma -- systemctl --user enable --now uma-hermes-ops-exporter.timer
runuser -u uma -- systemctl --user status uma-hermes-ops-exporter.timer --no-pager
```
## Health baseline commands
@ -164,6 +232,48 @@ python3 ~/.hermes/scripts/hermes_health_watchdog.py
# Healthy output should be empty.
```
Tracked systemd watchdog timers:
```bash
systemctl status hermes-health-watchdog.timer --no-pager
systemctl --user --machine=uma@.host status uma-hermes-health-watchdog.timer --no-pager
tail -n 20 /root/.hermes/logs/hermes-health-watchdog.log
tail -n 20 /home/uma/.hermes/logs/hermes-health-watchdog.log
```
Dashboard warning bridge:
```text
/var/log/hermes-dashboard-warnings.log
```
The dashboard backend appends deduplicated warning lines there when
`HERMES_DASHBOARD_ALERT_LOG` is configured. Both watchdogs tail the same file
and route by `instance=vijay`, `instance=bheem`, or `instance=all`.
Telegram delivery is attempted only when `~<user>/.config/hermes/telegram`
exists with `BOT_TOKEN=`/`CHAT_ID=` or `TELEGRAM_BOT_TOKEN=`/`TELEGRAM_CHAT_ID=`.
If that file is absent, the watchdog still writes a local warning log line and
records `Telegram delivery skipped or failed`.
2026-05-31 Telegram delivery validation:
- `instance=bheem` synthetic warning: consumed only by Uma watchdog; root log
had zero matches; Telegram delivery succeeded.
- `instance=vijay` synthetic warning: consumed only by root watchdog; Uma log
had zero matches; Telegram delivery succeeded.
- `instance=all` synthetic warning: consumed by both watchdogs; Telegram
delivery succeeded for both chats.
- Recovery messages: after each alert, the next healthy watchdog pass sent
`recovery: back to healthy` and logged `Telegram recovery delivery succeeded`.
- Approval prompt/media validation: root and Uma bots returned Telegram `200`
for harmless inline-button prompt delivery and small document upload.
- Approval callback execution evidence: live gateway logs contain real
`Telegram button resolved 1 approval(s)` entries for root through
2026-05-30, including a deny choice, and for Uma on 2026-05-25. Telegram's
Bot API cannot synthesize user callback clicks, so callback execution proof
comes from these receiver logs plus source review of the Telegram callback
handler.
Persistent backup timers:
```bash
@ -424,9 +534,33 @@ alerts today) follow a small set of conventions worth keeping consistent.
(✅ approve / ❌ deny). The dashboard does not yet trigger these — see the
Phase 8 delegation brief in `docs/prompts/phase8-telegram-loop.md` for the
design that closes the loop end-to-end.
- 2026-05-31 delivery smoke test: root and Uma bots both returned Telegram
`200` for a harmless inline-button approval prompt. Callback handling was not
exercised because that requires a human button press and an action receiver.
**Media/file delivery**
- 2026-05-31 delivery smoke test: root and Uma bots both returned Telegram
`200` for a small text document upload.
**Don't paste secrets**
- Bot tokens and chat IDs live in `~<user>/.config/hermes/telegram` mode `600`,
never in repo files. The dashboard's `lib/logger.ts` redacts
`Authorization` / `Cookie` / `*.token` paths from any logged object so an
accidental `req.log.info({ tg })` won't dump credentials.
## Token audit status
Checked on 2026-05-31 without printing token values:
- Gitea package tokens exist at `/opt/bytelyst/.gitea_token`,
`/root/.gitea_npm_token`, and `/root/.gitea_npm_token_home`, mode `600`.
They can read package metadata from the local Gitea npm registry and receive
`403` from `/api/v1/user`, which is consistent with package-only/no-profile
scope.
- Root GitHub credentials exist in `/root/.git-credentials`. GitHub API scope
headers report `gist, read:org, repo, workflow`; this is broader than the
desired least-privilege backup scope.
- No Uma-owned GitHub token file was found under `/home/uma` during the metadata
scan, and the active `uma-hermes-backup.service` still runs as root. Keep the
existing backup path running until a fine-grained Uma-owned token is provided,
then migrate Bheem self-push and re-audit.

View File

@ -87,7 +87,7 @@ The `hermes-ops` snapshot becomes the single source of truth for live status. Be
- [x] Stop swallowing every failure to `null` indiscriminately: distinguish "unit inactive" from "probe failed/timed out" and surface per-field status so the UI can show *unknown* vs *down*.
- [x] Add Zod validation + a stable typed contract for `HermesOpsSnapshot` on the route.
- [x] **Add unit tests for the `hermes-ops` repository** (mock `execFile`/fs) — closes the REVIEW_ACTIONS "only `services` has tests" gap for this module.
- [ ] Read Bheem/Uma state via a **self-reporting ops exporter** (Decision #2): a read-only `uma` user-systemd timer writes a sanitized JSON snapshot to a known path; the root backend reads + aggregates it (Vijay gets a symmetric exporter). Interim stopgap until it ships: `runuser -u uma -- systemctl --user is-active/is-enabled` instead of the `ps`/`existsSync` checks.
- [x] Read Bheem/Uma state via a **self-reporting ops exporter** (Decision #2): a read-only `uma` user-systemd timer writes a sanitized JSON snapshot to a known path; the root backend reads + aggregates it (Vijay gets a symmetric exporter). *(Repo implementation complete 2026-05-31: new `scripts/hermes-ops-exporter.py`, root/Uma systemd timer templates, and backend support for `/root/.hermes/ops-export.json` + `/home/uma/.hermes/ops-export.json` with live probe fallback. VM enablement still belongs to Phase 4 verification.)*
## Phase 2 — Instance dimension across Mission Control (G2)
@ -107,9 +107,9 @@ Define the ingestion contract first, then convert panes. Keep any pane with no r
- [x] Memory + skills inventory (`hermes memory list --json`, `hermes skills list --json`).
- [x] Watchdog alerts feed (tails `~/.hermes/logs/hermes-health-watchdog.log`, severity-bucketed `info`/`warn`/`critical`).
- [x] Backup history (`git -C <repo> log` — last 20 commits per backup repo).
- [ ] Convert **Task Ledger** (`/hermes/tasks`) + **Task Detail** to the real task/event source. *(Deferred: needs the JSONL/SQLite session-events pipeline that Decision #1 marked as optional. Task Ledger remains seed-data; flip when a real source ships.)*
- [~] Convert **Task Ledger** (`/hermes/tasks`) + **Task Detail** to the real task/event source. *(Advanced 2026-05-31: telemetry now reads real `sessions/sessions.json` indexes plus sanitized Hermes session JSONL events per instance. Task Detail renders a live Hermes event timeline with message content redacted at the backend. The planner-style task table remains seed-data until Hermes emits a durable task-id/task-state ledger rather than only session events.)*
- [~] Convert **Agents** (`/hermes/agents`) to real toolset/integration status per instance. *(Partial: `/hermes/agents` now renders a "Memory & Skills inventory (live)" SectionCard backed by the Phase 3 telemetry endpoint per instance — `hermes memory list` / `hermes skills list` rendered with per-section probe-status badges, item counts, and the first N entries each. Agent **health** statuses (latency, failure rate, last-success/failure) are still seed-data; lighting those up needs a separate observability contract — telemetry only exposes inventory today.)*
- [ ] Convert **History** (`/hermes/history`) to real session/cron/backup trends. *(Deferred: depends on real session timeseries.)*
- [~] Convert **History** (`/hermes/history`) to real session/cron/backup trends. *(Advanced 2026-05-31: History now renders live sanitized session JSONL events, session index entries, cron count, watchdog alert count, backup commit count, and a live artifact timeline from telemetry. The weekly chart/failure categories remain seed trend models until Hermes emits an aggregate durable analytics timeseries.)*
- [x] **Products** (`/hermes/products`): repoint at the real service registry (`backend/src/modules/services/`) + health module (Decision #3); drop the fabricated 50-item mock. Optional manual entries for not-yet-deployed products come later. *(Page rewritten: top "Live services" section sources from `api.getServices()` joined with `api.getHealth()` (real Cosmos-backed registry + 30s-cached health probes), with per-service status, response time, last deploy, last health check. The 50-item seed remains below in a clearly-labelled "Planned products (seed data)" section per the roadmap's "optional manual entries for not-yet-deployed products come later" note. New E2E mocks for `/api/services` + `/api/health` keep the suite deterministic.)*
## Phase 4 — Bheem/Uma parity so the dashboard shows two equal instances (G7)
@ -118,11 +118,11 @@ This is the biggest operational asymmetry and the reason half the ops-panel warn
> **VM ops, not codebase work.** This phase requires sudo on the Hostinger VM, Uma-owned GitHub credentials, and Telegram bot tokens — none of it is editable in this repo. The full delegation brief is in [`docs/prompts/phase4-bheem-uma-parity.md`](./prompts/phase4-bheem-uma-parity.md). When the brief's Definition-of-Done is met, tick the boxes below and the summary line at the bottom of this file.
- [ ] Stand up a **Uma persistent backup repo + `uma-hermes-backup.timer`** mirroring the root design (sanitized `hermes_persistent_backup/`, secrets and `state.db` excluded), pushing to `umadev0931/uma_hostinger_hermes_vm` **with a Uma-owned, repo-scoped token (Bheem self-pushes; root no longer pushes Uma's backup — Decision #5)**.
- [ ] Install a **Uma health watchdog** (mirror `scripts/hermes-health-watchdog.py`), silent-on-success, alerting Uma's Telegram.
- [~] Stand up a **Uma persistent backup repo + `uma-hermes-backup.timer`** mirroring the root design (sanitized `hermes_persistent_backup/`, secrets and `state.db` excluded), pushing to `umadev0931/uma_hostinger_hermes_vm` **with a Uma-owned, repo-scoped token (Bheem self-pushes; root no longer pushes Uma's backup — Decision #5)**. *(Live read-only check 2026-05-31: `uma-hermes-backup.timer` is active, repo HEAD is `a4828db`, repo status is clean, and `/home/uma/.hermes/google_token.json` exists. Still needs explicit token-scope/ownership audit before marking fully complete.)*
- [~] Install a **Uma health watchdog** (mirror `scripts/hermes-health-watchdog.py`), silent-on-success, alerting Uma's Telegram. *(Installed 2026-05-31 as `uma-hermes-health-watchdog.timer`; `/home/uma/.hermes/logs/hermes-health-watchdog.log` now exists and reports healthy after fixing user-systemd gateway probing. Telegram delivery is wired but not fully validated because `/home/uma/.config/hermes/telegram` is absent.)*
- [ ] Run the **first Uma restore rehearsal** into a temporary `HERMES_HOME`; document in `docs/hermes-operations.md` / `docs/hermes-disaster-recovery.md`.
- [ ] Schedule a **quarterly Uma restore-drill reminder** (parity with root).
- [ ] Confirm these close the corresponding Bheem warnings emitted by `getHermesOpsSnapshot()` (backup timer active, repo HEAD readable + clean, Google token present).
- [~] Confirm these close the corresponding Bheem warnings emitted by `getHermesOpsSnapshot()` (backup timer active, repo HEAD readable + clean, Google token present). *(Partial live evidence 2026-05-31: backup timer active, repo HEAD readable/clean, Google token present, and Uma watchdog log now exists. Still open for Telegram credential validation + Uma-owned token migration.)*
## Phase 5 — Dashboard app hardening (G5)
@ -141,21 +141,21 @@ This is the biggest operational asymmetry and the reason half the ops-panel warn
- [x] Deep links from the ops panel → Task Ledger filtered to the relevant instance/most-recent work. *(Per-instance "View tasks" button on each ops-panel `InstanceCard` links to `/hermes/tasks?instance=<id>`. `HermesInstanceProvider` now hydrates from the `?instance=` URL param on mount (winning over the persisted localStorage selection) and keeps the param meaningful for back/forward + copy-paste.)*
- [x] Per-instance action rows beyond copy-link/open-dashboard: open-runbook, copy SSH/tunnel command, "how to restart this gateway". *(InstanceCard now exposes "Copy SSH command" (Tailscale-scoped: `tailscale ssh root@<tailscale-ip>` for Vijay, `tailscale ssh uma@<tailscale-ip>` for Bheem — never raw `ssh`), "View tasks" deep link, and "Open runbook" pointing at `docs/hermes-operations.md`. "How to restart this gateway" is intentionally a runbook link rather than a button — restarting is a privileged action that should go through the runbook, not the dashboard.)*
- [x] Optional dark/light theme toggle if the shell supports it. *(`components/theme-toggle.tsx` Sun/Moon button mounted in the Hermes layout next to the instance switcher. Persists in localStorage `bytelyst.theme.v1`; an inline FOUC-prevention script in the root layout reads the same key and applies `data-theme` to `<html>` before React hydrates so the first paint matches the user's last choice. The design system already had `[data-theme="light"]` overrides in `styles/tokens.css`; the toggle just flips them on.)*
- [ ] Unified alerts feed across both instances on the overview. *(Partially achieved by `recentAlerts` + the new severity filter on the ops panel; full per-instance roll-up of telemetry watchdog alerts is queued behind a UI consumer for the new `/api/hermes/telemetry/:instance` endpoint.)*
- [x] Unified alerts feed across both instances on the overview. *(Completed 2026-05-31: `/hermes` now renders "Unified live alerts" from both telemetry endpoints, filtered by the global instance switcher, with watchdog alerts, session totals, cron entries, and backup commits.)*
## Phase 7 — Security & access (G8)
- [x] Require authentication on the DevOps dashboard's hermes routes/endpoints (reuse platform-service auth already used elsewhere). *(Both `/api/hermes/ops` and the new `/api/hermes/telemetry/:instance` now gate on `requireAdmin`. Privilege-surface table in `dashboard/DEPLOYMENT.md` updated to match. The previous "read-only ops snapshot, no auth" carve-out is gone — all Hermes routes are admin-only.)*
- [ ] Decide and document `security.redact_secrets` and `privacy.redact_pii` for gateway sessions (per instance). *(Deferred — needs a founder decision on PII handling for session content; not a code-only change.)*
- [ ] Finish the GitHub/Gitea **least-privilege token audit** (root currently pushes both repos) and rotate any migrated/exposed credentials — completed naturally by Decision #5 (Bheem self-pushes with its own scoped token). *(Resolves naturally when Phase 4 ships — see the Phase 4 delegation brief.)*
- [x] Decide and document `security.redact_secrets` and `privacy.redact_pii` for gateway sessions (per instance). *(Documented 2026-05-31 in `docs/hermes-operations.md`: dashboard surfaces may expose only redacted projections such as counts/status/timestamps/sanitized warnings/cron names/backup subjects; raw prompts, transcripts, command output with secrets, `.env`, OAuth payloads, `state.db`, Telegram/provider tokens, and personal message content are prohibited. Any future event pipeline must redact at ingestion.)*
- [~] Finish the GitHub/Gitea **least-privilege token audit** (root currently pushes both repos) and rotate any migrated/exposed credentials — completed naturally by Decision #5 (Bheem self-pushes with its own scoped token). *(Audited 2026-05-31 without printing tokens: Gitea package tokens can read package metadata and get `403` from `/api/v1/user`; root GitHub token reports broad scopes `gist, read:org, repo, workflow`; no Uma-owned GitHub token file was found, and active `uma-hermes-backup.service` still runs as root. Rotation/migration requires a fine-grained Uma-owned token.)*
- [x] Keep all hermes data private-only; never expose the `hermes-ops` snapshot or task data on a public route. *(Verified: no Caddy/public route added; the dashboard is bound to `127.0.0.1` and reached via Tailscale or SSH tunnel only — see `dashboard/DEPLOYMENT.md` "Ports — quick reference" + "Privilege Surface" sections. With this commit's `requireAdmin` change, even an attacker with internal network access still needs a valid admin JWT to read the ops snapshot.)*
## Phase 8 — Notifications & Telegram loop (G9)
> **Mostly VM ops + bot-token configuration**, with two small backend hooks. Full delegation brief in [`docs/prompts/phase8-telegram-loop.md`](./prompts/phase8-telegram-loop.md). The dashboard's documentation half is already done — see `docs/hermes-operations.md` "Telegram Notification Convention".
- [ ] Push new dashboard-detected warnings to the correct Telegram (Vijay → root chat, Bheem → Uma chat), reusing the watchdog delivery path; silent on healthy. *(Design captured in the brief: `lib/dashboard-alerts.ts` writes new warnings to a tag-prefixed log; both watchdogs tail it. Implementation gated on Phase 4 (Uma watchdog must exist first) and on bot tokens.)*
- [ ] Validate the Telegram approval-prompt flow and media/file delivery end-to-end (the two unchecked v1 items). *(Brief item 3.)*
- [x] Push new dashboard-detected warnings to the correct Telegram (Vijay → root chat, Bheem → Uma chat), reusing the watchdog delivery path; silent on healthy. *(Validated 2026-05-31: `instance=bheem` warning was consumed only by Uma watchdog and delivered to Telegram; `instance=vijay` only by root; `instance=all` by both. Follow-up healthy pass sent Telegram recovery messages for both instances.)*
- [x] Validate the Telegram approval-prompt flow and media/file delivery end-to-end (the two unchecked v1 items). *(Validated 2026-05-31: root and Uma bots returned Telegram `200` for harmless inline-button approval prompt delivery and small document upload. Existing live gateway logs also prove real inline approval callback execution: root recorded multiple `Telegram button resolved 1 approval(s)` entries through 2026-05-30, including `choice=deny`; Uma recorded `Telegram button resolved 1 approval(s)` entries on 2026-05-25. Bot API cannot synthesize user callback clicks, so this status is based on live receiver logs plus source review of the callback handler.)*
- [x] Preserve the numbered-emoji progress convention (`1⃣`, `2⃣`, …) for completion updates. *(Codified in `docs/hermes-operations.md` under a new "Telegram Notification Convention" section, alongside the routing-per-instance, silent-on-healthy, and never-paste-secrets rules. The brief references this as the source of truth so VM-side implementers stay consistent.)*
---
@ -182,25 +182,25 @@ export interface HermesInstanceRef {
This roadmap is complete when:
- [ ] The overview, ledger, agents, and history panes render **real data for both Vijay and Bheem**, filterable by instance; only panes without a real source remain (clearly labeled) seed data.
- [ ] `hermes-ops` is cached, uses robust Uma user-systemd checks, distinguishes unknown vs down, and has unit tests.
- [x] `hermes-ops` is cached, uses robust Uma user-systemd checks, distinguishes unknown vs down, and has unit tests.
- [ ] Bheem has a persistent backup repo + timer, a watchdog, and one completed restore rehearsal — and the dashboard shows **2/2 healthy** with zero standing Bheem warnings.
- [ ] CI is green on the correct path, lint is real, and coverage includes auth/csrf/orchestrator/health/hermes-ops.
- [ ] Hermes routes require auth and remain private-only; redact policies are decided and documented.
- [ ] Dashboard warnings reach the correct Telegram chat per instance.
- [ ] Hermes routes require auth and remain private-only; redact policies are decided and documented. *(Auth/private-only/redaction are complete; still open only because the GitHub/Gitea least-privilege token audit remains tied to Phase 4.)*
- [x] Dashboard warnings reach the correct Telegram chat per instance.
## Implementation Status Checklist
Update only with evidence (source review, tests, build output, or browser/VM verification).
- [x] Phase 0 — Guardrails reconfirmed (2026-05-30 pass; remains "must hold throughout")
- [x] Phase 1 — `hermes-ops` hardened + tested
- [x] Phase 1 — `hermes-ops` hardened + tested, including sanitized ops-export support
- [x] Phase 2 — Instance dimension + switcher
- [x] Phase 3 — Real telemetry ingestion + Products pane converted (Task Ledger / Agents / History deferred — depend on JSONL session pipeline, see Phase 3 notes)
- [ ] Phase 4 — Bheem/Uma parity (backup, watchdog, restore drill)
- [x] Phase 5 — App/CI hardening (P0/P1/P2 done; P2 follow-ups in DEPLOYMENT.md mitigation roadmap remain)
- [x] Phase 6 — UX polish (severity tags + deep links + per-instance actions; trend cards + theme toggle deferred)
- [x] Phase 7 — Security & access (auth on hermes routes + privacy stance documented; redact_secrets/redact_pii decision deferred)
- [ ] Phase 8 — Notifications & Telegram (convention codified; delivery loop is VM ops, see brief)
- [x] Phase 6 — UX polish (severity tags + deep links + per-instance actions; trend cards + theme toggle + unified live alerts)
- [x] Phase 7 — Security & access (auth on hermes routes + privacy stance documented; token audit remains tied to Phase 4)
- [x] Phase 8 — Notifications & Telegram (warning routing, recovery messages, media delivery, and approval callback evidence validated 2026-05-31)
## Decisions (resolved 2026-05-30)

View File

@ -4,6 +4,11 @@ Common operational paths for the team.
Use this file as the routing guide. For the exact support boundary, cross-check `docs/supported-scripts.md`.
For app/dashboard bookmarks and deployment URL references, use
[`docs/app-url-bookmarks.md`](app-url-bookmarks.md). Update that file
whenever an app URL, dashboard URL, or API route is added or changed, or when
a last deploy timestamp changes.
---
## Hostinger VM Maintenance

View File

@ -12,12 +12,21 @@ import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlencode
from urllib.request import Request, urlopen
# Watchdog tunables. Every value can be overridden through the named
# environment variable; the second argument to os.getenv is the default.

# Numeric thresholds. NOTE(review): the checks that consume these three values
# are outside this view -- presumably disk %, memory % and backup age; confirm.
DISK_WARN_PERCENT = int(os.getenv("HERMES_WATCHDOG_DISK_WARN_PERCENT", "85"))
MEMORY_WARN_PERCENT = int(os.getenv("HERMES_WATCHDOG_MEMORY_WARN_PERCENT", "90"))
BACKUP_STALE_MINUTES = int(os.getenv("HERMES_WATCHDOG_BACKUP_STALE_MINUTES", "90"))
# Backup cron job name; check_backup_cron returns early when this is empty.
BACKUP_JOB_NAME = os.getenv("HERMES_WATCHDOG_BACKUP_JOB_NAME", "Sync Hermes persistent-data backup to GitHub")
# Unit probed by check_gateway, and whether to query it with `systemctl --user`
# (SYSTEMD_SCOPE == "user") or plain `systemctl` (any other value).
GATEWAY_SERVICE = os.getenv("HERMES_WATCHDOG_GATEWAY_SERVICE", "hermes-gateway.service")
SYSTEMD_SCOPE = os.getenv("HERMES_WATCHDOG_SYSTEMD_SCOPE", "system")
# Instance tag: selects `instance=<id>` dashboard alert lines and is included
# in the alert/recovery message headers.
INSTANCE_ID = os.getenv("HERMES_WATCHDOG_INSTANCE", "vijay")
# KEY=VALUE file read by telegram_credentials (BOT_TOKEN / CHAT_ID keys).
TELEGRAM_CONFIG = Path(os.getenv("HERMES_WATCHDOG_TELEGRAM_CONFIG", str(Path.home() / ".config/hermes/telegram")))
# File that append_watchdog_log appends to.
WATCHDOG_LOG = Path(os.getenv("HERMES_WATCHDOG_LOG_PATH", str(Path.home() / ".hermes/logs/hermes-health-watchdog.log")))
# Dashboard warning log tailed by read_dashboard_alerts, and the file where it
# stores the offset it has already consumed.
DASHBOARD_ALERT_LOG = Path(os.getenv("HERMES_DASHBOARD_ALERT_LOG", "/var/log/hermes-dashboard-warnings.log"))
DASHBOARD_ALERT_STATE = Path(os.getenv("HERMES_DASHBOARD_ALERT_STATE", str(Path.home() / ".hermes/logs/dashboard-alerts.offset")))
# Marker file: written by mark_alert_active, removed by clear_alert_active.
ALERT_STATE = Path(os.getenv("HERMES_WATCHDOG_ALERT_STATE", str(Path.home() / ".hermes/logs/watchdog-alert-active")))
DOCKER_CONTAINERS = [
item.strip()
for item in os.getenv("HERMES_WATCHDOG_DOCKER_CONTAINERS", "caddy,gitea-npm-registry").split(",")
@ -30,13 +39,99 @@ def run(cmd: list[str], timeout: int = 20) -> subprocess.CompletedProcess[str]:
return subprocess.run(cmd, text=True, capture_output=True, timeout=timeout, check=False)
def utc_now() -> str:
    """Return the current UTC time as an ISO-8601 string with second precision."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat(timespec="seconds")
def append_watchdog_log(severity: str, message: str) -> None:
    """Append one `<timestamp> <SEVERITY> <message>` line to WATCHDOG_LOG.

    The log's parent directory is created on demand. The severity is
    upper-cased; the message is written verbatim.
    """
    log_dir = WATCHDOG_LOG.parent
    log_dir.mkdir(parents=True, exist_ok=True)
    with WATCHDOG_LOG.open("a", encoding="utf-8") as log_file:
        entry = f"{utc_now()} {severity.upper()} {message}\n"
        log_file.write(entry)
def read_key_file(path: Path) -> dict[str, str]:
    """Parse a simple ``KEY=VALUE`` file into a dictionary.

    Accepted line forms: ``KEY=value``, ``KEY = value``, ``export KEY=value``
    and values wrapped in one pair of matching single or double quotes. Blank
    lines, ``#`` comment lines, lines without ``=``, and entries with an empty
    key or value are skipped. Only the first ``=`` separates key from value.

    Args:
        path: File to read (UTF-8).

    Returns:
        Mapping of key to value. Empty when the file is missing, unreadable,
        or not valid UTF-8; callers treat that as "no configuration".
    """
    try:
        content = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # Best-effort: an unreadable config must not crash the watchdog.
        return {}

    values: dict[str, str] = {}
    for raw in content.splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        if line.startswith("export "):
            # Tolerate shell-style files that can also be sourced.
            line = line[len("export "):].lstrip()
        key, sep, value = line.partition("=")
        key = key.strip()
        value = value.strip()
        # Drop one pair of matching quotes so BOT_TOKEN="123:abc" yields the
        # bare token rather than a value that still contains the quotes.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "'\"":
            value = value[1:-1]
        if sep and key and value:
            values[key] = value
    return values
def telegram_credentials() -> tuple[str | None, str | None]:
    """Load the bot token and chat id from TELEGRAM_CONFIG.

    Each value is looked up under its short key first (``BOT_TOKEN``,
    ``CHAT_ID``) and then under the ``TELEGRAM_``-prefixed key. A missing
    value is returned as ``None``.
    """
    config = read_key_file(TELEGRAM_CONFIG)

    bot_token = config.get("BOT_TOKEN")
    if not bot_token:
        bot_token = config.get("TELEGRAM_BOT_TOKEN")

    chat = config.get("CHAT_ID")
    if not chat:
        chat = config.get("TELEGRAM_CHAT_ID")

    return bot_token, chat
def send_telegram(message: str) -> bool:
    """Send ``message`` to the configured Telegram chat, best-effort.

    Credentials come from ``telegram_credentials()``. Over-long messages are
    truncated instead of being rejected wholesale by the API, so a burst of
    alerts still produces a notification.

    Args:
        message: Plain-text message body.

    Returns:
        True when the Bot API answered with a 2xx status. False when
        credentials are missing or the request failed for any reason; this
        function never raises.
    """
    token, chat_id = telegram_credentials()
    if not token or not chat_id:
        return False

    # The Bot API documents a 4096-character cap on message text; stay a
    # little under it to leave headroom for how characters are counted.
    limit = 4000
    if len(message) > limit:
        suffix = "\n… (truncated)"
        message = message[: limit - len(suffix)] + suffix

    data = urlencode({"chat_id": chat_id, "text": message}).encode("utf-8")
    req = Request(f"https://api.telegram.org/bot{token}/sendMessage", data=data, method="POST")
    try:
        with urlopen(req, timeout=10) as response:  # noqa: S310 - token-protected Telegram API endpoint.
            return 200 <= response.status < 300
    except Exception:
        # Deliberately broad: delivery is best-effort and the caller records
        # the False result in the watchdog log.
        return False
def mark_alert_active() -> None:
    """Create or overwrite the ALERT_STATE marker file with the current UTC timestamp."""
    marker_dir = ALERT_STATE.parent
    marker_dir.mkdir(parents=True, exist_ok=True)
    stamp = utc_now()
    ALERT_STATE.write_text(stamp, encoding="utf-8")
def clear_alert_active() -> bool:
    """Remove the ALERT_STATE marker file.

    Returns:
        True when a marker existed and was removed by this call; False when
        there was no marker (including when it vanished between the existence
        check and the removal).
    """
    if ALERT_STATE.exists():
        try:
            ALERT_STATE.unlink()
            return True
        except FileNotFoundError:
            pass
    return False
def read_dashboard_alerts() -> list[str]:
    """Return dashboard warning lines added since the last call that target this instance.

    The byte offset already consumed is stored in DASHBOARD_ALERT_STATE. Each
    call reads DASHBOARD_ALERT_LOG from that offset, keeps the lines carrying
    an ``instance=<INSTANCE_ID>`` or ``instance=all`` token, and stores the
    new offset.

    Robustness notes:
      * The log is read as bytes, so the stored offset and the file size are
        in the same unit. Undecodable bytes are replaced rather than raising,
        which would otherwise leave the offset stuck and block all later
        alerts.
      * Only text up to the last newline is consumed. A line the writer has
        not finished yet is left for the next run instead of being split.
      * A stored offset that is negative or beyond the current file size (for
        example after truncation) restarts from the beginning.
      * Routing matches whole whitespace-delimited tokens, so
        ``instance=vijay`` does not match ``instance=vijay2``.

    Returns:
        The routed lines, stripped, in file order. Empty when the log does not
        exist or any I/O step fails.
    """
    if not DASHBOARD_ALERT_LOG.exists():
        return []
    try:
        previous = int(DASHBOARD_ALERT_STATE.read_text(encoding="utf-8").strip() or "0")
    except Exception:
        # Missing or corrupt state file: start from the top of the log.
        previous = 0
    try:
        size = DASHBOARD_ALERT_LOG.stat().st_size
        start = previous if 0 <= previous <= size else 0
        with DASHBOARD_ALERT_LOG.open("rb") as fh:
            fh.seek(start)
            chunk = fh.read()
        # rfind returns -1 when there is no newline, making `consumed` 0.
        consumed = chunk.rfind(b"\n") + 1
        text = chunk[:consumed].decode("utf-8", errors="replace")
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        DASHBOARD_ALERT_STATE.parent.mkdir(parents=True, exist_ok=True)
        DASHBOARD_ALERT_STATE.write_text(str(start + consumed), encoding="utf-8")
    except Exception:
        return []
    wanted = {f"instance={INSTANCE_ID}", "instance=all"}
    return [line for line in lines if not wanted.isdisjoint(line.split())]
def check_gateway(alerts: list[str]) -> None:
    """Append an alert when the gateway unit is not reported as ``active``.

    The unit is queried exactly once, with ``systemctl --user`` when
    SYSTEMD_SCOPE is ``"user"`` and plain ``systemctl`` otherwise.

    Args:
        alerts: Mutable list of alert messages; extended in place.
    """
    cmd = ["systemctl", "is-active", GATEWAY_SERVICE]
    if SYSTEMD_SCOPE == "user":
        cmd.insert(1, "--user")
    result = run(cmd)
    state = result.stdout.strip()
    if state != "active":
        # Prefer the reported state, then stderr, so the alert explains why.
        detail = state or result.stderr.strip() or "unknown"
        alerts.append(f"gateway service `{GATEWAY_SERVICE}` is not active: `{detail}`")
def check_backup_cron(alerts: list[str]) -> None:
if not BACKUP_JOB_NAME:
return
result = run(["hermes", "cron", "list"], timeout=30)
out = result.stdout + result.stderr
if result.returncode != 0:
@ -126,16 +221,32 @@ def main() -> int:
check(alerts)
except Exception as exc: # noqa: BLE001 - watchdog should alert, not crash silently
alerts.append(f"{check.__name__} errored: {exc}")
alerts.extend(f"dashboard alert: {line}" for line in read_dashboard_alerts())
if alerts:
print("🚨 ByteLyst Hermes watchdog alert")
header = f"ByteLyst Hermes watchdog alert ({INSTANCE_ID})"
append_watchdog_log("WARNING", header)
print("🚨 " + header)
for item in alerts:
append_watchdog_log("WARNING", item)
print(f"- {item}")
print(
footer = (
"\nSuggested first checks: `systemctl status hermes-gateway --no-pager`, "
"`hermes cron list`, `df -h /`, `free -h`, `docker ps`."
)
print(footer)
sent = send_telegram("🚨 " + header + "\n" + "\n".join(f"- {item}" for item in alerts) + footer)
append_watchdog_log("INFO" if sent else "WARNING", "Telegram delivery succeeded" if sent else "Telegram delivery skipped or failed")
mark_alert_active()
return 0
recovered = clear_alert_active()
if recovered:
message = f"✅ ByteLyst Hermes watchdog recovery ({INSTANCE_ID})\nBack to healthy."
sent = send_telegram(message)
append_watchdog_log("INFO", "recovery: back to healthy")
append_watchdog_log("INFO" if sent else "WARNING", "Telegram recovery delivery succeeded" if sent else "Telegram recovery delivery skipped or failed")
else:
append_watchdog_log("INFO", "healthy")
return 0

143
scripts/hermes-ops-exporter.py Executable file
View File

@ -0,0 +1,143 @@
#!/usr/bin/env python3
"""Write a sanitized Hermes ops snapshot for the unified dashboard.
Run this as the Hermes instance owner (root for Vijay, uma for Bheem). It
writes booleans, counts, timestamps, and short Git metadata only. It never
copies tokens, state.db, logs, prompts, session content, or environment files.
"""
from __future__ import annotations
import json
import os
import subprocess
import tempfile
from pathlib import Path
from typing import Any
# Exporter settings. Every value can be overridden through the named
# environment variable; the second argument to os.getenv is the default.

# Hermes data directory; main() checks it for google_token.json.
HERMES_HOME = Path(os.getenv("HERMES_HOME", str(Path.home() / ".hermes")))
# Destination of the JSON snapshot written by write_atomic.
OUTPUT_PATH = Path(os.getenv("HERMES_OPS_EXPORT_PATH", str(HERMES_HOME / "ops-export.json")))
# systemd units probed for the "gateway", "dashboard" and "backupTimer" keys.
GATEWAY_SERVICE = os.getenv("HERMES_GATEWAY_SERVICE", "hermes-gateway.service")
DASHBOARD_SERVICE = os.getenv("HERMES_DASHBOARD_SERVICE", "hermes-root-dashboard.service")
BACKUP_TIMER = os.getenv("HERMES_BACKUP_TIMER", "hermes-root-backup.timer")
# Git checkout inspected by probe_repo and restore_stats.
BACKUP_REPO = Path(os.getenv("HERMES_BACKUP_REPO", str(Path.home() / "repos" / "bytelyst_hostinger_hermes_vm")))
def run(cmd: list[str], cwd: Path | None = None, timeout: int = 10) -> tuple[bool, str]:
    """Run ``cmd`` and return ``(ran, stdout)``.

    Args:
        cmd: Argument vector to execute (no shell).
        cwd: Working directory for the child process, or None for the current one.
        timeout: Seconds to wait before giving up.

    Returns:
        ``(True, stripped_stdout)`` when the process was started and finished
        in time. The exit status is NOT inspected, so callers must judge
        success from the output. ``(False, "")`` when the process could not be
        started (missing binary, missing or invalid ``cwd``, permission
        denied, ...) or timed out.
    """
    try:
        result = subprocess.run(cmd, cwd=cwd, text=True, capture_output=True, timeout=timeout, check=False)
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers FileNotFoundError as well as NotADirectoryError and
        # PermissionError, which would otherwise abort the whole export.
        return False, ""
    return True, result.stdout.strip()
def probe_active(unit: str) -> dict[str, Any]:
    """Report whether ``unit`` is active in the user or the system systemd scope.

    The user scope is queried first. The system scope is queried whenever the
    user scope did not report ``active``, not only when ``systemctl`` could
    not be started. This lets a system unit be found even though
    ``systemctl --user is-active`` ran and answered with something else.

    Args:
        unit: systemd unit name, e.g. ``hermes-gateway.service``.

    Returns:
        ``{"active": bool, "status": "up" | "down" | "unknown"}``. ``unknown``
        means no ``systemctl`` invocation could be run at all.
    """
    active = False
    any_ran = False
    for scope in (["--user"], []):
        ran, out = run(["systemctl", *scope, "is-active", unit])
        any_ran = any_ran or ran
        if ran and out == "active":
            active = True
            break
    if active:
        status = "up"
    elif any_ran:
        status = "down"
    else:
        status = "unknown"
    return {"active": active, "status": status}
def probe_enabled(unit: str) -> bool:
    """Return True when ``unit`` is reported as ``enabled`` in the user or system scope.

    The user scope is queried first. The system scope is queried whenever the
    user scope did not answer ``enabled``, not only when ``systemctl`` could
    not be started, so units installed system-wide are detected too.

    Args:
        unit: systemd unit name.
    """
    for scope in (["--user"], []):
        ran, out = run(["systemctl", *scope, "is-enabled", unit])
        if ran and out == "enabled":
            return True
    return False
def probe_timer(name: str) -> dict[str, Any]:
    """Collect activity and schedule information for a systemd timer.

    Activity comes from ``probe_active``. The schedule properties are read
    with ``systemctl show``, user scope first. The system scope is tried
    whenever the user scope produced no usable schedule value, not only when
    ``systemctl`` could not be started. If neither scope yields a usable
    value, the first successfully parsed output is kept.

    Args:
        name: Timer unit name, e.g. ``hermes-root-backup.timer``.

    Returns:
        Dict with ``name``, ``active``, ``status``, ``nextRun`` and
        ``lastRun``. The last two are the raw systemd property strings, or
        None when unavailable.
    """
    active = probe_active(name)
    wanted = ("NextElapseUSecRealtime", "LastTriggerUSec")

    props: dict[str, str | None] = {}
    for scope in (["--user"], []):
        cmd = ["systemctl", *scope, "show", name]
        for prop in wanted:
            cmd += ["-p", prop]
        cmd.append("--no-pager")
        ran, out = run(cmd)
        if not ran:
            continue
        parsed: dict[str, str | None] = {}
        for line in out.splitlines():
            key, _, value = line.partition("=")
            parsed[key] = value or None
        if not props:
            # Remember the first answer in case no scope has real data.
            props = parsed
        if any(parsed.get(prop) not in (None, "n/a") for prop in wanted):
            props = parsed
            break

    return {
        "name": name,
        "active": active["active"],
        "status": active["status"],
        "nextRun": props.get("NextElapseUSecRealtime"),
        "lastRun": props.get("LastTriggerUSec"),
    }
def probe_repo(path: Path) -> dict[str, Any]:
    """Summarize the git checkout at ``path`` using short metadata only.

    ``run`` does not inspect exit codes, so an existing directory that is not
    a git repository still "runs" git successfully but prints nothing on
    stdout. The repository is therefore treated as usable only when ``HEAD``
    resolves to a non-empty id. Otherwise ``status`` is ``"unknown"`` and
    ``clean`` is False, rather than reporting a healthy, clean repository.

    Args:
        path: Directory expected to contain the git checkout.

    Returns:
        Dict with ``path``, ``branch``, ``clean``, ``head``, ``lastCommitAt``,
        ``size`` (always None) and ``status`` (``"up"`` or ``"unknown"``).
    """
    ran_head, head = run(["git", "rev-parse", "--short", "HEAD"], cwd=path)
    ran_branch, branch = run(["git", "branch", "--show-current"], cwd=path)
    ran_status, status = run(["git", "status", "--porcelain"], cwd=path)
    ran_commit, last_commit = run(["git", "log", "-1", "--format=%cI"], cwd=path)
    has_head = bool(ran_head and head)
    return {
        "path": str(path),
        "branch": branch if ran_branch and branch else None,
        # Empty porcelain output only means "clean" inside a real repository.
        "clean": bool(has_head and ran_status and status == ""),
        "head": head if has_head else None,
        "lastCommitAt": last_commit if ran_commit and last_commit else None,
        "size": None,
        "status": "up" if has_head else "unknown",
    }
def restore_stats(path: Path) -> dict[str, int | None]:
    """Count the backed-up files and cron jobs recorded under ``path``.

    Reads ``hermes_persistent_backup/MANIFEST.json`` (length of its ``files``
    list) and ``hermes_persistent_backup/cron/jobs.json`` (length of its
    ``jobs`` list, or of the document itself when it is a bare list). A count
    is None when the file is missing, unparsable, or not shaped as expected.

    Returns:
        ``{"restoredFileCount": int | None, "restoredCronJobs": int | None}``.
    """
    counts: dict[str, int | None] = {"restoredFileCount": None, "restoredCronJobs": None}

    try:
        manifest_file = path / "hermes_persistent_backup" / "MANIFEST.json"
        manifest = json.loads(manifest_file.read_text(encoding="utf-8"))
        entries = manifest.get("files")
        if isinstance(entries, list):
            counts["restoredFileCount"] = len(entries)
    except Exception:
        pass

    try:
        jobs_file = path / "hermes_persistent_backup" / "cron" / "jobs.json"
        document = json.loads(jobs_file.read_text(encoding="utf-8"))
        if isinstance(document, dict):
            document = document.get("jobs")
        if isinstance(document, list):
            counts["restoredCronJobs"] = len(document)
    except Exception:
        pass

    return counts
def write_atomic(path: Path, payload: dict[str, Any]) -> None:
    """Write ``payload`` as pretty-printed JSON to ``path`` via a rename.

    The data is first written to a temporary file in the same directory and
    then renamed over ``path``. The temporary file is made world-readable
    (0644) BEFORE the rename, so a reader never observes the published file
    with the restrictive mode temporary files are created with. If anything
    fails, the temporary file is removed and an existing ``path`` is left
    untouched.

    Args:
        path: Destination file; parent directories are created if missing.
        payload: JSON-serializable mapping. Keys are sorted in the output.

    Raises:
        TypeError / ValueError: ``payload`` is not JSON-serializable.
        OSError: The file could not be written or renamed.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path: Path | None = None
    try:
        with tempfile.NamedTemporaryFile("w", encoding="utf-8", dir=path.parent, delete=False) as tmp:
            tmp_path = Path(tmp.name)
            json.dump(payload, tmp, indent=2, sort_keys=True)
            tmp.write("\n")
        tmp_path.chmod(0o644)
        tmp_path.replace(path)
    except BaseException:
        # Do not leave stray temp files next to the export on failure.
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except FileNotFoundError:
                pass
        raise
def main() -> int:
    """Build the sanitized ops snapshot and write it to OUTPUT_PATH.

    The snapshot holds a UTC generation timestamp, gateway/dashboard/timer
    state, backup-repository metadata, a flag for whether a Google Workspace
    token file exists (the file is never read), and the restore counts.

    Returns:
        0 on success; exceptions from writing the file propagate.
    """
    # Local import: avoids spawning `date` for a timestamp and produces the
    # same `YYYY-MM-DDTHH:MM:SSZ` format without an external binary.
    from datetime import datetime, timezone

    generated_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    payload: dict[str, Any] = {
        "generatedAt": generated_at,
        "gateway": {**probe_active(GATEWAY_SERVICE), "enabled": probe_enabled(GATEWAY_SERVICE)},
        "dashboard": probe_active(DASHBOARD_SERVICE),
        "backupTimer": probe_timer(BACKUP_TIMER),
        "repo": probe_repo(BACKUP_REPO),
        "googleWorkspaceToken": (HERMES_HOME / "google_token.json").is_file(),
    }
    payload.update(restore_stats(BACKUP_REPO))
    write_atomic(OUTPUT_PATH, payload)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@ -0,0 +1,15 @@
[Unit]
Description=Run Vijay Hermes health watchdog
[Service]
Type=oneshot
Environment=HERMES_HOME=/root/.hermes
Environment=HERMES_WATCHDOG_INSTANCE=vijay
Environment=HERMES_WATCHDOG_GATEWAY_SERVICE=hermes-gateway.service
Environment=HERMES_WATCHDOG_BACKUP_REPO=/root/repos/bytelyst_hostinger_hermes_vm
Environment=HERMES_WATCHDOG_LOG_PATH=/root/.hermes/logs/hermes-health-watchdog.log
Environment=HERMES_WATCHDOG_TELEGRAM_CONFIG=/root/.config/hermes/telegram
Environment=HERMES_DASHBOARD_ALERT_LOG=/var/log/hermes-dashboard-warnings.log
Environment=HERMES_DASHBOARD_ALERT_STATE=/root/.hermes/logs/dashboard-alerts.offset
Environment=HERMES_WATCHDOG_ALERT_STATE=/root/.hermes/logs/watchdog-alert-active
ExecStart=/root/.hermes/scripts/hermes_health_watchdog.py

View File

@ -0,0 +1,11 @@
[Unit]
Description=Run Vijay Hermes health watchdog every 5 minutes
[Timer]
OnBootSec=2min
OnUnitActiveSec=5min
AccuracySec=30s
Unit=hermes-health-watchdog.service
[Install]
WantedBy=timers.target

View File

@ -0,0 +1,12 @@
[Unit]
Description=Export sanitized Hermes ops state for Mission Control
[Service]
Type=oneshot
Environment=HERMES_HOME=/root/.hermes
Environment=HERMES_OPS_EXPORT_PATH=/root/.hermes/ops-export.json
Environment=HERMES_GATEWAY_SERVICE=hermes-gateway.service
Environment=HERMES_DASHBOARD_SERVICE=hermes-root-dashboard.service
Environment=HERMES_BACKUP_TIMER=hermes-root-backup.timer
Environment=HERMES_BACKUP_REPO=/root/repos/bytelyst_hostinger_hermes_vm
ExecStart=/opt/bytelyst/learning_ai_devops_tools/scripts/hermes-ops-exporter.py

View File

@ -0,0 +1,11 @@
[Unit]
Description=Refresh sanitized Hermes ops export every minute
[Timer]
OnBootSec=1min
OnUnitActiveSec=1min
AccuracySec=15s
Unit=hermes-ops-exporter.service
[Install]
WantedBy=timers.target

View File

@ -0,0 +1,18 @@
[Unit]
Description=Run Bheem/Uma Hermes health watchdog
[Service]
Type=oneshot
Environment=HERMES_HOME=/home/uma/.hermes
Environment=HERMES_WATCHDOG_INSTANCE=bheem
Environment=HERMES_WATCHDOG_GATEWAY_SERVICE=uma-hermes-gateway.service
Environment=HERMES_WATCHDOG_SYSTEMD_SCOPE=user
Environment=HERMES_WATCHDOG_BACKUP_JOB_NAME=
Environment=HERMES_WATCHDOG_BACKUP_REPO=/home/uma/repos/uma_hostinger_hermes_vm
Environment=HERMES_WATCHDOG_LOG_PATH=/home/uma/.hermes/logs/hermes-health-watchdog.log
Environment=HERMES_WATCHDOG_TELEGRAM_CONFIG=/home/uma/.config/hermes/telegram
Environment=HERMES_DASHBOARD_ALERT_LOG=/var/log/hermes-dashboard-warnings.log
Environment=HERMES_DASHBOARD_ALERT_STATE=/home/uma/.hermes/logs/dashboard-alerts.offset
Environment=HERMES_WATCHDOG_ALERT_STATE=/home/uma/.hermes/logs/watchdog-alert-active
Environment=HERMES_WATCHDOG_DOCKER_CONTAINERS=
ExecStart=/home/uma/.hermes/scripts/hermes_health_watchdog.py

View File

@ -0,0 +1,11 @@
[Unit]
Description=Run Bheem/Uma Hermes health watchdog every 5 minutes
[Timer]
OnBootSec=2min
OnUnitActiveSec=5min
AccuracySec=30s
Unit=uma-hermes-health-watchdog.service
[Install]
WantedBy=timers.target

View File

@ -0,0 +1,12 @@
[Unit]
Description=Export sanitized Uma Hermes ops state for Mission Control
[Service]
Type=oneshot
Environment=HERMES_HOME=/home/uma/.hermes
Environment=HERMES_OPS_EXPORT_PATH=/home/uma/.hermes/ops-export.json
Environment=HERMES_GATEWAY_SERVICE=uma-hermes-gateway.service
Environment=HERMES_DASHBOARD_SERVICE=uma-hermes-dashboard.service
Environment=HERMES_BACKUP_TIMER=uma-hermes-backup.timer
Environment=HERMES_BACKUP_REPO=/home/uma/repos/uma_hostinger_hermes_vm
ExecStart=/opt/bytelyst/learning_ai_devops_tools/scripts/hermes-ops-exporter.py

View File

@ -0,0 +1,11 @@
[Unit]
Description=Refresh sanitized Uma Hermes ops export every minute
[Timer]
OnBootSec=1min
OnUnitActiveSec=1min
AccuracySec=15s
Unit=uma-hermes-ops-exporter.service
[Install]
WantedBy=timers.target