feat: complete hermes telemetry dashboard wiring
This commit is contained in:
parent
38aefb05e4
commit
02b362399b
@ -10,6 +10,10 @@ This guide covers deploying both the DevOps Dashboard and Platform Admin Dashboa
|
||||
|
||||
## Public URLs
|
||||
|
||||
For the full living bookmark list across all ByteLyst apps, APIs, Hermes
|
||||
dashboards, and last deploy timestamps, see
|
||||
[`../docs/app-url-bookmarks.md`](../docs/app-url-bookmarks.md).
|
||||
|
||||
- **DevOps Dashboard**: `https://devops.bytelyst.com`
|
||||
- **Admin Dashboard**: `https://admin.bytelyst.com`
|
||||
- **API Gateway**: `https://api.bytelyst.com`
|
||||
|
||||
44
dashboard/backend/src/lib/dashboard-alerts.test.ts
Normal file
44
dashboard/backend/src/lib/dashboard-alerts.test.ts
Normal file
@ -0,0 +1,44 @@
|
||||
import { beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
|
||||
const appendFileMock = vi.hoisted(() => vi.fn());
|
||||
vi.mock('fs/promises', () => ({ appendFile: appendFileMock }));
|
||||
|
||||
const { appendDashboardWarning, clearDashboardWarningDedupe } = await import('./dashboard-alerts.js');
|
||||
|
||||
// Unit tests for the dashboard alert log writer. `fs/promises` is mocked at the
// top of this file, so these tests only inspect the arguments passed to
// `appendFile` and never touch a real file.
describe('dashboard-alerts', () => {
  // Start every test with no recorded mock calls, an empty dedupe memory, and
  // the alert log unconfigured.
  beforeEach(() => {
    vi.clearAllMocks();
    clearDashboardWarningDedupe();
    delete process.env.HERMES_DASHBOARD_ALERT_LOG;
  });

  // Without HERMES_DASHBOARD_ALERT_LOG the writer must be a no-op.
  it('does nothing when the alert log is not configured', async () => {
    const wrote = await appendDashboardWarning({ severity: 'warn', instance: 'vijay', message: 'gateway down' });
    expect(wrote).toBe(false);
    expect(appendFileMock).not.toHaveBeenCalled();
  });

  // Pins the exact line format: ISO timestamp, severity token, instance, message.
  it('writes a routed warning line when configured', async () => {
    process.env.HERMES_DASHBOARD_ALERT_LOG = '/tmp/hermes-dashboard-warnings.log';
    const wrote = await appendDashboardWarning(
      { severity: 'critical', instance: 'bheem', message: 'backup missing' },
      Date.parse('2026-05-31T07:00:00Z'),
    );

    expect(wrote).toBe(true);
    expect(appendFileMock).toHaveBeenCalledWith(
      '/tmp/hermes-dashboard-warnings.log',
      '2026-05-31T07:00:00.000Z CRITICAL instance=bheem backup missing\n',
      'utf8',
    );
  });

  // The explicit `now` values are milliseconds: 2_000 is inside the one-hour
  // window opened at 1_000, while 3_602_000 is more than one hour after it.
  it('deduplicates for one hour and writes again after expiry', async () => {
    process.env.HERMES_DASHBOARD_ALERT_LOG = '/tmp/hermes-dashboard-warnings.log';
    const input = { severity: 'warn' as const, instance: 'all' as const, message: 'shared warning' };
    expect(await appendDashboardWarning(input, 1_000)).toBe(true);
    expect(await appendDashboardWarning(input, 2_000)).toBe(false);
    expect(await appendDashboardWarning(input, 3_602_000)).toBe(true);
    expect(appendFileMock).toHaveBeenCalledTimes(2);
  });
});
|
||||
48
dashboard/backend/src/lib/dashboard-alerts.ts
Normal file
48
dashboard/backend/src/lib/dashboard-alerts.ts
Normal file
@ -0,0 +1,48 @@
|
||||
import { appendFile } from 'fs/promises';
|
||||
|
||||
type AlertSeverity = 'info' | 'warn' | 'critical';
|
||||
type AlertInstance = 'vijay' | 'bheem' | 'all';
|
||||
|
||||
interface DashboardWarningInput {
|
||||
severity: AlertSeverity;
|
||||
instance: AlertInstance;
|
||||
message: string;
|
||||
}
|
||||
|
||||
const DEDUPE_WINDOW_MS = 60 * 60 * 1000;
|
||||
const recent = new Map<string, number>();
|
||||
|
||||
/**
 * Translates an alert severity into the upper-case token written to the log.
 *
 * @param severity - Severity of the alert being logged.
 * @returns `CRITICAL`, `WARNING`, or `INFO` for anything else.
 */
function severityToken(severity: AlertSeverity): string {
  switch (severity) {
    case 'critical':
      return 'CRITICAL';
    case 'warn':
      return 'WARNING';
    default:
      return 'INFO';
  }
}
|
||||
|
||||
/**
 * Builds the dedupe key for an alert: severity, instance and message joined
 * by NUL characters so the three parts cannot run into each other.
 *
 * @param input - The alert to fingerprint.
 * @returns The composite key used to look the alert up in the dedupe map.
 */
function alertKey(input: DashboardWarningInput): string {
  const parts = [input.severity, input.instance, input.message];
  return parts.join('\0');
}
|
||||
|
||||
/**
 * Drops every remembered alert whose last write is older than the dedupe
 * window, so the map does not keep keys that can no longer suppress anything.
 *
 * @param now - Current time in epoch milliseconds.
 */
function purgeExpired(now: number): void {
  recent.forEach((writtenAt, key) => {
    const age = now - writtenAt;
    if (age > DEDUPE_WINDOW_MS) {
      recent.delete(key);
    }
  });
}
|
||||
|
||||
/**
 * Appends one alert line to the file named by `HERMES_DASHBOARD_ALERT_LOG`,
 * suppressing repeats of the same severity/instance/message for one hour.
 *
 * Line format: `<ISO timestamp> <SEVERITY> instance=<instance> <message>\n`.
 *
 * @param input - Severity, instance and message of the alert.
 * @param now - Current time in epoch milliseconds; injectable for tests.
 * @returns `true` when a line was written; `false` when the log is not
 *   configured or the alert was deduplicated.
 * @throws Whatever `appendFile` rejects with. The dedupe entry is rolled back
 *   first, so a later call can retry the write.
 */
export async function appendDashboardWarning(input: DashboardWarningInput, now = Date.now()): Promise<boolean> {
  const logPath = process.env.HERMES_DASHBOARD_ALERT_LOG;
  if (!logPath) return false;

  purgeExpired(now);
  const key = alertKey(input);
  const previous = recent.get(key);
  // Compare against undefined: a recorded timestamp of 0 is still "seen".
  if (previous !== undefined && now - previous <= DEDUPE_WINDOW_MS) return false;

  // Reserve the key before awaiting so overlapping calls for the same alert
  // do not both write.
  recent.set(key, now);
  // Keep each alert on a single line; embedded line breaks would split the record.
  const message = input.message.replace(/[\r\n]+/g, ' ');
  const line = `${new Date(now).toISOString()} ${severityToken(input.severity)} instance=${input.instance} ${message}\n`;
  try {
    await appendFile(logPath, line, 'utf8');
  } catch (err) {
    // The line never reached the log, so it must not suppress the next attempt.
    // Only undo our own reservation in case another call has replaced it.
    if (recent.get(key) === now) recent.delete(key);
    throw err;
  }
  return true;
}
|
||||
|
||||
/**
 * Empties the in-memory dedupe map so the next alert with any key is written
 * again. The test suite calls this before each test.
 */
export function clearDashboardWarningDedupe(): void {
  recent.clear();
}
|
||||
@ -146,6 +146,49 @@ describe('hermes-ops repository', () => {
|
||||
expect(bheem.gateway.status).toBe('up');
|
||||
});
|
||||
|
||||
it('prefers a sanitized per-instance ops export when one is present', async () => {
|
||||
setExec(healthyHandler());
|
||||
readFileMock.mockImplementation(async (p: string) => {
|
||||
if (p === '/home/uma/.hermes/ops-export.json') {
|
||||
return JSON.stringify({
|
||||
gateway: { active: false, enabled: true, status: 'down' },
|
||||
dashboard: { active: false, status: 'down' },
|
||||
backupTimer: {
|
||||
name: 'uma-hermes-backup.timer',
|
||||
active: false,
|
||||
status: 'down',
|
||||
nextRun: null,
|
||||
lastRun: null,
|
||||
},
|
||||
repo: {
|
||||
path: '/home/uma/repos/uma_hostinger_hermes_vm',
|
||||
branch: 'main',
|
||||
clean: true,
|
||||
head: 'export1',
|
||||
lastCommitAt: '2026-05-31T00:00:00Z',
|
||||
size: '1M',
|
||||
status: 'up',
|
||||
},
|
||||
restoredFileCount: 42,
|
||||
restoredCronJobs: 3,
|
||||
googleWorkspaceToken: true,
|
||||
});
|
||||
}
|
||||
if (p.endsWith('MANIFEST.json')) return JSON.stringify({ files: [1, 2, 3] });
|
||||
if (p.endsWith('jobs.json')) return JSON.stringify({ jobs: [{ id: 'a' }, { id: 'b' }] });
|
||||
throw new Error('no such file');
|
||||
});
|
||||
|
||||
const snapshot = await getHermesOpsSnapshot({ force: true });
|
||||
const bheem = snapshot.instances.find((i) => i.id === 'bheem')!;
|
||||
expect(bheem.gateway.status).toBe('down');
|
||||
expect(bheem.dashboard.status).toBe('down');
|
||||
expect(bheem.backup.repo.head).toBe('export1');
|
||||
expect(bheem.backup.restoredFileCount).toBe(42);
|
||||
expect(bheem.backup.restoredCronJobs).toBe(3);
|
||||
expect(bheem.google.workspaceToken).toBe(true);
|
||||
});
|
||||
|
||||
it('reports unknown repo status when git cannot be read', async () => {
|
||||
setExec((command, args) => {
|
||||
if (command === 'git') return enoentError();
|
||||
|
||||
@ -2,6 +2,7 @@ import { execFile } from 'child_process';
|
||||
import { promisify } from 'util';
|
||||
import { readFile, stat } from 'fs/promises';
|
||||
import { existsSync } from 'fs';
|
||||
import { appendDashboardWarning } from '../../lib/dashboard-alerts.js';
|
||||
import type {
|
||||
HermesOpsCronJob,
|
||||
HermesOpsInstance,
|
||||
@ -31,6 +32,7 @@ const instances = [
|
||||
dashboardPort: 9119,
|
||||
backupTimer: 'hermes-root-backup.timer',
|
||||
repoPath: '/root/repos/bytelyst_hostinger_hermes_vm',
|
||||
opsExportPath: '/root/.hermes/ops-export.json',
|
||||
driveFolder: 'Vijay Drive',
|
||||
},
|
||||
{
|
||||
@ -43,10 +45,21 @@ const instances = [
|
||||
dashboardPort: 9120,
|
||||
backupTimer: 'uma-hermes-backup.timer',
|
||||
repoPath: '/home/uma/repos/uma_hostinger_hermes_vm',
|
||||
opsExportPath: '/home/uma/.hermes/ops-export.json',
|
||||
driveFolder: 'Bheem Drive',
|
||||
},
|
||||
];
|
||||
|
||||
/**
 * Shape of the JSON document read from an instance's `opsExportPath`.
 *
 * Every field is optional: `buildSnapshot` uses a field when it is present
 * and otherwise falls back to the value it probed itself.
 */
interface OpsExport {
  // Gateway service state; used in place of the probe only when `status` is set.
  gateway?: { active?: boolean; enabled?: boolean; status?: ProbeStatus };
  // Dashboard service state; used in place of the probe only when `status` is set.
  dashboard?: { active?: boolean; status?: ProbeStatus };
  backupTimer?: HermesOpsTimer;
  repo?: HermesOpsRepo;
  // An explicit `null` is honored as-is; only a missing field falls back to the probed count.
  restoredFileCount?: number | null;
  restoredCronJobs?: number | null;
  googleWorkspaceToken?: boolean;
}
|
||||
|
||||
interface ExecResult {
|
||||
// Trimmed stdout. Present even when the command exited non-zero (e.g.
|
||||
// `systemctl is-active` prints "inactive" and exits 3).
|
||||
@ -223,6 +236,15 @@ async function tokenExists(path: string): Promise<boolean> {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Reads and parses an instance's ops-export JSON file.
 *
 * Best-effort by design: a missing file, unreadable file, invalid JSON, or a
 * document that is not a plain JSON object all yield `null`, so the caller
 * falls back to its own probes.
 *
 * Individual fields are NOT validated here; callers must check each field
 * before trusting it.
 *
 * @param path - Absolute path of the export file.
 * @returns The parsed object, or `null` when no usable export exists.
 */
async function readOpsExport(path: string): Promise<OpsExport | null> {
  try {
    // Parse as unknown and narrow, rather than asserting the shape up front.
    const parsed: unknown = JSON.parse(await readFile(path, 'utf8'));
    // Arrays are objects too, but an array is never a valid export document.
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) return null;
    return parsed as OpsExport;
  } catch {
    return null;
  }
}
|
||||
|
||||
async function getTailscaleIp(): Promise<string | null> {
|
||||
const result = await exec('tailscale', ['ip', '-4']);
|
||||
if (!result.ran) return null;
|
||||
@ -246,11 +268,12 @@ async function buildSnapshot(): Promise<HermesOpsSnapshot> {
|
||||
|
||||
const results: HermesOpsInstance[] = [];
|
||||
for (const item of instances) {
|
||||
const opsExport = await readOpsExport(item.opsExportPath);
|
||||
const gatewayActiveCheck =
|
||||
item.gatewayKind === 'uma-user' ? probeUmaGatewayActive() : probeSystemActive(item.gatewayService);
|
||||
const gatewayEnabledCheck =
|
||||
item.gatewayKind === 'uma-user' ? probeUmaGatewayEnabled() : probeSystemEnabled(item.gatewayService);
|
||||
const [gateway, gatewayEnabled, dashboard, backupTimer, repo, stats, googleToken] = await Promise.all([
|
||||
const [probedGateway, probedGatewayEnabled, probedDashboard, probedBackupTimer, probedRepo, probedStats, probedGoogleToken] = await Promise.all([
|
||||
gatewayActiveCheck,
|
||||
gatewayEnabledCheck,
|
||||
probeSystemActive(item.dashboardService),
|
||||
@ -259,6 +282,22 @@ async function buildSnapshot(): Promise<HermesOpsSnapshot> {
|
||||
manifestStats(`${item.repoPath}/hermes_persistent_backup`),
|
||||
tokenExists(`${item.hermesHome}/google_token.json`),
|
||||
]);
|
||||
const gateway = opsExport?.gateway?.status ? {
|
||||
active: Boolean(opsExport.gateway.active),
|
||||
status: opsExport.gateway.status,
|
||||
} : probedGateway;
|
||||
const gatewayEnabled = typeof opsExport?.gateway?.enabled === 'boolean' ? opsExport.gateway.enabled : probedGatewayEnabled;
|
||||
const dashboard = opsExport?.dashboard?.status ? {
|
||||
active: Boolean(opsExport.dashboard.active),
|
||||
status: opsExport.dashboard.status,
|
||||
} : probedDashboard;
|
||||
const backupTimer = opsExport?.backupTimer ?? probedBackupTimer;
|
||||
const repo = opsExport?.repo ?? probedRepo;
|
||||
const stats = {
|
||||
files: typeof opsExport?.restoredFileCount === 'number' || opsExport?.restoredFileCount === null ? opsExport.restoredFileCount : probedStats.files,
|
||||
cronJobs: typeof opsExport?.restoredCronJobs === 'number' || opsExport?.restoredCronJobs === null ? opsExport.restoredCronJobs : probedStats.cronJobs,
|
||||
};
|
||||
const googleToken = typeof opsExport?.googleWorkspaceToken === 'boolean' ? opsExport.googleWorkspaceToken : probedGoogleToken;
|
||||
|
||||
const dashboardUrl = tailscaleIp ? `http://${tailscaleIp}:${item.dashboardPort}/` : `:${item.dashboardPort}`;
|
||||
|
||||
@ -316,6 +355,16 @@ async function buildSnapshot(): Promise<HermesOpsSnapshot> {
|
||||
warnings.push('Emergency Drive OAuth token is missing');
|
||||
}
|
||||
|
||||
await Promise.all(warnings.map((message) => {
|
||||
const lower = message.toLowerCase();
|
||||
const instance = lower.includes('bheem') || lower.includes('uma')
|
||||
? 'bheem'
|
||||
: lower.includes('vijay') || lower.includes('root')
|
||||
? 'vijay'
|
||||
: 'all';
|
||||
return appendDashboardWarning({ severity: 'warn', instance, message });
|
||||
}));
|
||||
|
||||
const cronJobs: HermesOpsCronJob[] = [
|
||||
{
|
||||
name: emergencyDriveUpload.name,
|
||||
|
||||
@ -7,7 +7,8 @@ vi.mock('child_process', () => ({ execFile: execFileMock }));
|
||||
|
||||
const readFileMock = vi.hoisted(() => vi.fn());
|
||||
const statMock = vi.hoisted(() => vi.fn());
|
||||
vi.mock('fs/promises', () => ({ readFile: readFileMock, stat: statMock }));
|
||||
const readdirMock = vi.hoisted(() => vi.fn());
|
||||
vi.mock('fs/promises', () => ({ readFile: readFileMock, readdir: readdirMock, stat: statMock }));
|
||||
|
||||
type Handler = (command: string, args: string[]) => { error?: NodeJS.ErrnoException; stdout?: string };
|
||||
|
||||
@ -42,6 +43,7 @@ describe('hermes-telemetry repository', () => {
|
||||
});
|
||||
statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
|
||||
readFileMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
|
||||
readdirMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
|
||||
|
||||
const snapshot = await getHermesTelemetrySnapshot('vijay');
|
||||
// The whole shape must validate even when nothing was readable — that's
|
||||
@ -84,6 +86,7 @@ describe('hermes-telemetry repository', () => {
|
||||
return { stdout: '' };
|
||||
});
|
||||
statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
|
||||
readdirMock.mockResolvedValue([]);
|
||||
|
||||
const snapshot = await getHermesTelemetrySnapshot('vijay');
|
||||
expect(snapshot.sessions).toEqual({ totalSessions: 59, totalMessages: 5225, status: 'up' });
|
||||
@ -102,6 +105,7 @@ describe('hermes-telemetry repository', () => {
|
||||
return { error: err };
|
||||
});
|
||||
statMock.mockResolvedValue({} as never);
|
||||
readdirMock.mockResolvedValue([]);
|
||||
readFileMock.mockResolvedValue([
|
||||
'2026-01-01T12:34:56 WARNING gateway is degraded',
|
||||
'2026-01-01T12:35:01 CRITICAL backup repo HEAD missing',
|
||||
@ -129,6 +133,7 @@ describe('hermes-telemetry repository', () => {
|
||||
return { error: err };
|
||||
});
|
||||
statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
|
||||
readdirMock.mockResolvedValue([]);
|
||||
|
||||
const snapshot = await getHermesTelemetrySnapshot('vijay');
|
||||
expect(snapshot.backupHistory.status).toBe('up');
|
||||
@ -144,6 +149,7 @@ describe('hermes-telemetry repository', () => {
|
||||
return { error: err };
|
||||
});
|
||||
statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
|
||||
readdirMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
|
||||
|
||||
const a = await getHermesTelemetrySnapshot('vijay');
|
||||
const callsAfterFirst = calls;
|
||||
@ -159,10 +165,37 @@ describe('hermes-telemetry repository', () => {
|
||||
return { error: err };
|
||||
});
|
||||
statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
|
||||
readdirMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
|
||||
|
||||
const v = await getHermesTelemetrySnapshot('vijay');
|
||||
const b = await getHermesTelemetrySnapshot('bheem');
|
||||
expect(v.instanceId).toBe('vijay');
|
||||
expect(b.instanceId).toBe('bheem');
|
||||
});
|
||||
|
||||
it('parses sanitized Hermes session JSONL events without exposing raw message content', async () => {
|
||||
setExec(() => {
|
||||
const err = Object.assign(new Error('ENOENT'), { code: 'ENOENT' as const });
|
||||
return { error: err };
|
||||
});
|
||||
statMock.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' }));
|
||||
readdirMock.mockResolvedValue(['20260101_session.jsonl']);
|
||||
readFileMock.mockImplementation(async (path: string) => {
|
||||
if (path.endsWith('.jsonl')) {
|
||||
return [
|
||||
JSON.stringify({ role: 'user', content: 'secret prompt', timestamp: '2026-01-01T00:00:00Z' }),
|
||||
JSON.stringify({ role: 'assistant', finish_reason: 'tool_calls', tool_calls: [{ function: { name: 'exec_command' } }], timestamp: '2026-01-01T00:01:00Z' }),
|
||||
].join('\n');
|
||||
}
|
||||
throw Object.assign(new Error('ENOENT'), { code: 'ENOENT' });
|
||||
});
|
||||
|
||||
const snapshot = await getHermesTelemetrySnapshot('vijay');
|
||||
expect(snapshot.sessionEvents.status).toBe('up');
|
||||
expect(snapshot.sessionEvents.sourceCount).toBe(1);
|
||||
expect(snapshot.sessionEvents.entries).toHaveLength(2);
|
||||
expect(snapshot.sessionEvents.entries[0].summary).toBe('assistant tool call: exec_command');
|
||||
expect(snapshot.sessionEvents.entries[1].summary).toBe('user message (content redacted)');
|
||||
expect(JSON.stringify(snapshot.sessionEvents.entries)).not.toContain('secret prompt');
|
||||
});
|
||||
});
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
import { execFile } from 'child_process';
|
||||
import { promisify } from 'util';
|
||||
import { readFile, stat } from 'fs/promises';
|
||||
import { readdir, readFile, stat } from 'fs/promises';
|
||||
import { basename, join } from 'path';
|
||||
import { appendDashboardWarning } from '../../lib/dashboard-alerts.js';
|
||||
import { childLogger } from '../../lib/logger.js';
|
||||
import type {
|
||||
HermesBackupHistory,
|
||||
@ -8,6 +10,10 @@ import type {
|
||||
HermesCronEntry,
|
||||
HermesCronList,
|
||||
HermesInstanceId,
|
||||
HermesSessionEntry,
|
||||
HermesSessionEvent,
|
||||
HermesSessionEventList,
|
||||
HermesSessionList,
|
||||
HermesMemoryList,
|
||||
HermesSessionStats,
|
||||
HermesSkillList,
|
||||
@ -29,6 +35,8 @@ interface InstanceConfig {
|
||||
user: string | null; // null → run as the backend's own user (root in prod)
|
||||
repoPath: string;
|
||||
watchdogLog: string;
|
||||
sessionsIndex: string;
|
||||
sessionsDir: string;
|
||||
}
|
||||
|
||||
const INSTANCES: Record<HermesInstanceId, InstanceConfig> = {
|
||||
@ -37,12 +45,16 @@ const INSTANCES: Record<HermesInstanceId, InstanceConfig> = {
|
||||
user: null,
|
||||
repoPath: '/root/repos/bytelyst_hostinger_hermes_vm',
|
||||
watchdogLog: '/root/.hermes/logs/hermes-health-watchdog.log',
|
||||
sessionsIndex: '/root/.hermes/sessions/sessions.json',
|
||||
sessionsDir: '/root/.hermes/sessions',
|
||||
},
|
||||
bheem: {
|
||||
id: 'bheem',
|
||||
user: 'uma',
|
||||
repoPath: '/home/uma/repos/uma_hostinger_hermes_vm',
|
||||
watchdogLog: '/home/uma/.hermes/logs/hermes-health-watchdog.log',
|
||||
sessionsIndex: '/home/uma/.hermes/sessions/sessions.json',
|
||||
sessionsDir: '/home/uma/.hermes/sessions',
|
||||
},
|
||||
};
|
||||
|
||||
@ -103,6 +115,142 @@ async function readSessionStats(inst: InstanceConfig): Promise<HermesSessionStat
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Reads the instance's session index file and returns up to 50 sessions,
 * most recently updated first.
 *
 * Rows that are not JSON objects are skipped instead of failing the whole
 * list. Rows with neither an id nor a session key are dropped. Sessions whose
 * timestamps are missing or unparseable sort last.
 *
 * @param inst - Instance whose `sessionsIndex` file is read.
 * @returns The entries with status `up`, or an empty list with status
 *   `unknown` when the file cannot be read, is not valid JSON, or is not a
 *   JSON object/array.
 */
async function readSessionList(inst: InstanceConfig): Promise<HermesSessionList> {
  // Unparseable timestamps become 0 so the comparator never returns NaN,
  // which would leave the sort order implementation-defined.
  const sortKey = (entry: HermesSessionEntry): number => {
    const ms = new Date(entry.updatedAt ?? entry.createdAt ?? 0).getTime();
    return Number.isNaN(ms) ? 0 : ms;
  };

  try {
    const parsed: unknown = JSON.parse(await readFile(inst.sessionsIndex, 'utf8'));
    if (typeof parsed !== 'object' || parsed === null) return { entries: [], status: 'unknown' };

    const entries: HermesSessionEntry[] = Object.values(parsed)
      // A single malformed row (null, string, number) must not discard the rest.
      .filter(isRecord)
      .map((row) => ({
        id: String(row.session_id ?? row.id ?? row.session_key ?? ''),
        sessionKey: String(row.session_key ?? ''),
        platform: row.platform ? String(row.platform) : null,
        chatType: row.chat_type ? String(row.chat_type) : null,
        displayName: row.display_name ? String(row.display_name) : null,
        createdAt: row.created_at ? String(row.created_at) : null,
        updatedAt: row.updated_at ? String(row.updated_at) : null,
        suspended: Boolean(row.suspended ?? false),
        resumePending: Boolean(row.resume_pending ?? false),
        totalTokens: typeof row.total_tokens === 'number' ? row.total_tokens : null,
        estimatedCostUsd: typeof row.estimated_cost_usd === 'number' ? row.estimated_cost_usd : null,
      }))
      .filter((entry) => entry.id || entry.sessionKey)
      .sort((a, b) => sortKey(b) - sortKey(a))
      .slice(0, 50);
    return { entries, status: 'up' };
  } catch {
    return { entries: [], status: 'unknown' };
  }
}
|
||||
|
||||
/**
 * Type guard for a plain keyed object: accepts any non-null object that is
 * not an array.
 *
 * @param value - Value of unknown shape, typically parsed JSON.
 * @returns `true` when `value` can be read as `Record<string, unknown>`.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) return false;
  return typeof value === 'object';
}
|
||||
|
||||
/**
 * Collects the distinct tool names mentioned by a session row.
 *
 * Looks at each object in `tool_calls` and then `codex_message_items`, taking
 * the item's own `name` and its nested `function.name`. Names are trimmed,
 * blank names are ignored, and first-seen order is kept.
 *
 * @param row - One parsed JSONL session row.
 * @returns At most the first eight distinct names.
 */
function extractToolNames(row: Record<string, unknown>): string[] {
  const seen = new Set<string>();

  for (const source of [row.tool_calls, row.codex_message_items]) {
    if (!Array.isArray(source)) continue;

    for (const item of source) {
      if (!isRecord(item)) continue;

      const nested = isRecord(item.function) ? item.function.name : undefined;
      for (const candidate of [item.name, nested]) {
        if (typeof candidate !== 'string') continue;
        const trimmed = candidate.trim();
        if (trimmed) seen.add(trimmed);
      }
    }
  }

  return [...seen].slice(0, 8);
}
|
||||
|
||||
/**
 * Lists the distinct `type` values of the objects in a row's
 * `codex_message_items`, in first-seen order.
 *
 * @param row - One parsed JSONL session row.
 * @returns At most the first eight distinct string types; empty when the
 *   field is absent or not an array.
 */
function extractItemTypes(row: Record<string, unknown>): string[] {
  const items = row.codex_message_items;
  if (!Array.isArray(items)) return [];

  const types = items
    .filter(isRecord)
    .map((item) => item.type)
    .filter((type): type is string => typeof type === 'string');

  return [...new Set(types)].slice(0, 8);
}
|
||||
|
||||
/**
 * Assigns a coarse event type to a session row. Checks run in priority
 * order, and the first match wins:
 *
 * 1. role `session_meta` → `system`
 * 2. any tool name, or `finish_reason` of `tool_calls` → `tool-call`
 * 3. an item type containing `tool` → `tool-result`
 * 4. a `reasoning` item type or a truthy `reasoning` field → `reasoning`
 * 5. role `user`/`assistant`, or string `content` → `message`
 * 6. otherwise → `unknown`
 *
 * @param row - One parsed JSONL session row.
 * @param toolNames - Tool names already extracted from the row.
 * @param itemTypes - Item types already extracted from the row.
 */
function classifySessionEvent(row: Record<string, unknown>, toolNames: string[], itemTypes: string[]): HermesSessionEvent['eventType'] {
  const role = typeof row.role === 'string' ? row.role : '';

  const requestsTools = toolNames.length > 0 || row.finish_reason === 'tool_calls';
  const carriesToolItem = itemTypes.some((type) => type.includes('tool'));
  const carriesReasoning = itemTypes.includes('reasoning') || Boolean(row.reasoning);
  const looksLikeMessage = role === 'user' || role === 'assistant' || typeof row.content === 'string';

  if (role === 'session_meta') {
    return 'system';
  }
  if (requestsTools) {
    return 'tool-call';
  }
  if (carriesToolItem) {
    return 'tool-result';
  }
  if (carriesReasoning) {
    return 'reasoning';
  }
  return looksLikeMessage ? 'message' : 'unknown';
}
|
||||
|
||||
/**
 * Produces the one-line, content-free summary shown for a session event.
 * The row's message content is never included; only the role, the event
 * type and the tool names appear in the text.
 *
 * @param row - One parsed JSONL session row; only `role` is read.
 * @param eventType - Classification previously computed for the row.
 * @param toolNames - Tool names previously extracted from the row.
 * @returns Human-readable summary text.
 */
function summarizeSessionEvent(row: Record<string, unknown>, eventType: HermesSessionEvent['eventType'], toolNames: string[]): string {
  const role = typeof row.role === 'string' ? row.role : 'unknown';

  switch (eventType) {
    case 'system':
      return 'session metadata recorded';
    case 'tool-call': {
      // Singular only for exactly one tool; zero or many read as "calls".
      const plural = toolNames.length === 1 ? '' : 's';
      const listed = toolNames.length > 0 ? `: ${toolNames.join(', ')}` : '';
      return `${role} tool call${plural}${listed}`;
    }
    case 'tool-result':
      return `${role} tool result recorded`;
    case 'reasoning':
      return `${role} reasoning item recorded`;
    case 'message':
      return `${role} message (content redacted)`;
    default:
      return `${role} event recorded`;
  }
}
|
||||
|
||||
/**
 * Converts one line of a session JSONL file into a sanitized event.
 *
 * @param line - Raw line text.
 * @param sessionFile - Name of the file the line came from; copied into the
 *   event and used as the prefix of its id.
 * @param lineIndex - Line number used as the suffix of the event id.
 * @returns The event, or `null` for a blank line, invalid JSON, a JSON value
 *   that is not an object, or any error raised while building the event.
 */
function parseSessionJsonlLine(line: string, sessionFile: string, lineIndex: number): HermesSessionEvent | null {
  if (line.trim() === '') return null;

  const stringOrNull = (value: unknown): string | null => (typeof value === 'string' ? value : null);

  try {
    const row: unknown = JSON.parse(line);
    if (!isRecord(row)) return null;

    const toolNames = extractToolNames(row);
    const itemTypes = extractItemTypes(row);
    const eventType = classifySessionEvent(row, toolNames, itemTypes);
    const summary = summarizeSessionEvent(row, eventType, toolNames);

    return {
      id: `${sessionFile}:${lineIndex}`,
      sessionFile,
      timestamp: stringOrNull(row.timestamp),
      role: stringOrNull(row.role),
      eventType,
      summary,
      toolNames,
      itemTypes,
      // An explicit string status wins; otherwise fall back to finish_reason.
      status: stringOrNull(row.status) ?? stringOrNull(row.finish_reason),
    };
  } catch {
    return null;
  }
}
|
||||
|
||||
/**
 * Reads recent sanitized events from the instance's session JSONL files.
 *
 * Takes the last ten `.jsonl` files by name, parses the last 200 lines of
 * each, and returns the 100 newest events. Events with a missing or
 * unparseable timestamp sort last.
 *
 * @param inst - Instance whose `sessionsDir` is scanned.
 * @returns Events with status `up` (an empty directory is still `up`), or an
 *   empty list with status `unknown` when the directory or any selected file
 *   cannot be read; that failure is logged as a warning.
 */
async function readSessionEvents(inst: InstanceConfig): Promise<HermesSessionEventList> {
  // Unparseable timestamps become 0 so the comparator never returns NaN,
  // which would leave the ordering (and the top-100 cut) implementation-defined.
  const sortKey = (event: HermesSessionEvent): number => {
    const ms = new Date(event.timestamp ?? 0).getTime();
    return Number.isNaN(ms) ? 0 : ms;
  };

  try {
    const files = (await readdir(inst.sessionsDir))
      .filter((name) => name.endsWith('.jsonl'))
      .sort()
      .slice(-10);
    if (files.length === 0) return { entries: [], status: 'up', sourceCount: 0 };

    const entries: HermesSessionEvent[] = [];
    for (const file of files) {
      const sessionFile = basename(file);
      const content = await readFile(join(inst.sessionsDir, file), 'utf8');
      const lines = content.split('\n');
      // Only the tail of each file is parsed; ids keep the real 1-based line number.
      const start = Math.max(0, lines.length - 200);
      for (let index = start; index < lines.length; index += 1) {
        const event = parseSessionJsonlLine(lines[index], sessionFile, index + 1);
        if (event) entries.push(event);
      }
    }

    entries.sort((a, b) => sortKey(b) - sortKey(a));
    return { entries: entries.slice(0, 100), status: 'up', sourceCount: files.length };
  } catch (err) {
    log.warn({ err, instance: inst.id, source: inst.sessionsDir }, 'failed to read Hermes session events');
    return { entries: [], status: 'unknown', sourceCount: 0 };
  }
}
|
||||
|
||||
// --- Cron -------------------------------------------------------------------
|
||||
//
|
||||
// `hermes cron list --json` is the canonical source. It's distinct from
|
||||
@ -248,8 +396,10 @@ const inflight = new Map<HermesInstanceId, Promise<HermesTelemetrySnapshot>>();
|
||||
|
||||
async function buildSnapshot(instanceId: HermesInstanceId): Promise<HermesTelemetrySnapshot> {
|
||||
const inst = INSTANCES[instanceId];
|
||||
const [sessions, cron, memory, skills, watchdog, backupHistory] = await Promise.all([
|
||||
const [sessions, sessionList, sessionEvents, cron, memory, skills, watchdog, backupHistory] = await Promise.all([
|
||||
readSessionStats(inst),
|
||||
readSessionList(inst),
|
||||
readSessionEvents(inst),
|
||||
readCron(inst),
|
||||
readMemory(inst),
|
||||
readSkills(inst),
|
||||
@ -259,17 +409,28 @@ async function buildSnapshot(instanceId: HermesInstanceId): Promise<HermesTeleme
|
||||
|
||||
const warnings: string[] = [];
|
||||
if (sessions.status === 'unknown') warnings.push(`${instanceId}: hermes sessions stats unavailable (CLI missing or non-zero exit)`);
|
||||
if (sessionList.status === 'unknown') warnings.push(`${instanceId}: Hermes session index not readable`);
|
||||
if (sessionEvents.status === 'unknown') warnings.push(`${instanceId}: Hermes session event JSONL not readable at ${inst.sessionsDir}`);
|
||||
if (cron.status === 'unknown') warnings.push(`${instanceId}: hermes cron list unavailable`);
|
||||
if (memory.status === 'unknown') warnings.push(`${instanceId}: hermes memory list unavailable`);
|
||||
if (skills.status === 'unknown') warnings.push(`${instanceId}: hermes skills list unavailable`);
|
||||
if (watchdog.status === 'unknown') warnings.push(`${instanceId}: watchdog log not readable at ${watchdog.source ?? 'unknown path'}`);
|
||||
if (backupHistory.status === 'unknown') warnings.push(`${instanceId}: backup repo not readable at ${backupHistory.repoPath ?? 'unknown path'}`);
|
||||
|
||||
await Promise.all([
|
||||
...warnings.map((message) => appendDashboardWarning({ severity: 'warn', instance: instanceId, message })),
|
||||
...watchdog.alerts
|
||||
.filter((alert) => alert.severity === 'critical')
|
||||
.map((alert) => appendDashboardWarning({ severity: 'critical', instance: instanceId, message: alert.message })),
|
||||
]);
|
||||
|
||||
return {
|
||||
generatedAt: new Date().toISOString(),
|
||||
cached: false,
|
||||
instanceId,
|
||||
sessions,
|
||||
sessionList,
|
||||
sessionEvents,
|
||||
cron,
|
||||
memory,
|
||||
skills,
|
||||
|
||||
@ -18,6 +18,47 @@ export const HermesSessionStatsSchema = z.object({
|
||||
});
|
||||
export type HermesSessionStats = z.infer<typeof HermesSessionStatsSchema>;
|
||||
|
||||
/**
 * One session as listed in a telemetry snapshot's `sessionList`.
 *
 * Optional descriptive fields are always present but `null` when unknown,
 * so consumers never need to distinguish "missing" from "null".
 */
export const HermesSessionEntrySchema = z.object({
  id: z.string(),
  sessionKey: z.string(),
  platform: z.string().nullable(),
  chatType: z.string().nullable(),
  displayName: z.string().nullable(),
  createdAt: z.string().nullable(),
  updatedAt: z.string().nullable(),
  suspended: z.boolean(),
  resumePending: z.boolean(),
  totalTokens: z.number().nullable(),
  estimatedCostUsd: z.number().nullable(),
});
/** Static type inferred from {@link HermesSessionEntrySchema}. */
export type HermesSessionEntry = z.infer<typeof HermesSessionEntrySchema>;
|
||||
|
||||
/**
 * The session list section of a telemetry snapshot: the entries plus a probe
 * status describing whether the list could be read.
 */
export const HermesSessionListSchema = z.object({
  entries: z.array(HermesSessionEntrySchema),
  status: ProbeStatusSchema,
});
/** Static type inferred from {@link HermesSessionListSchema}. */
export type HermesSessionList = z.infer<typeof HermesSessionListSchema>;
|
||||
|
||||
/**
 * One sanitized session event. The schema has no field for raw message
 * content; an event is described only by its role, type, summary text, tool
 * names, item types and status.
 */
export const HermesSessionEventSchema = z.object({
  id: z.string(),
  sessionFile: z.string(),
  timestamp: z.string().nullable(),
  role: z.string().nullable(),
  eventType: z.enum(['message', 'tool-call', 'tool-result', 'reasoning', 'system', 'unknown']),
  summary: z.string(),
  toolNames: z.array(z.string()),
  itemTypes: z.array(z.string()),
  status: z.string().nullable(),
});
/** Static type inferred from {@link HermesSessionEventSchema}. */
export type HermesSessionEvent = z.infer<typeof HermesSessionEventSchema>;
|
||||
|
||||
/**
 * The session events section of a telemetry snapshot: the events, a probe
 * status, and `sourceCount`, a numeric count of the sources behind the events.
 */
export const HermesSessionEventListSchema = z.object({
  entries: z.array(HermesSessionEventSchema),
  status: ProbeStatusSchema,
  sourceCount: z.number(),
});
/** Static type inferred from {@link HermesSessionEventListSchema}. */
export type HermesSessionEventList = z.infer<typeof HermesSessionEventListSchema>;
|
||||
|
||||
export const HermesCronEntrySchema = z.object({
|
||||
id: z.string(),
|
||||
name: z.string(),
|
||||
@ -106,6 +147,8 @@ export const HermesTelemetrySnapshotSchema = z.object({
|
||||
cached: z.boolean(),
|
||||
instanceId: HermesInstanceIdSchema,
|
||||
sessions: HermesSessionStatsSchema,
|
||||
sessionList: HermesSessionListSchema,
|
||||
sessionEvents: HermesSessionEventListSchema,
|
||||
cron: HermesCronListSchema,
|
||||
memory: HermesMemoryListSchema,
|
||||
skills: HermesSkillListSchema,
|
||||
|
||||
@ -25,6 +25,7 @@ services:
|
||||
environment:
|
||||
- VM_SCRIPTS_PATH=/vm-scripts/VMs/HostingerVM
|
||||
- VM_LOG_DIR=/host-logs
|
||||
- HERMES_DASHBOARD_ALERT_LOG=/var/log/hermes-dashboard-warnings.log
|
||||
ports:
|
||||
- '127.0.0.1:4004:4004'
|
||||
networks:
|
||||
@ -37,6 +38,7 @@ services:
|
||||
- /var/log/vm-cleanup.log:/host-logs/vm-cleanup.log
|
||||
- /var/log/vm-health-check.log:/host-logs/vm-health-check.log
|
||||
- /var/log/docker-watchdog.log:/host-logs/docker-watchdog.log
|
||||
- /var/log/hermes-dashboard-warnings.log:/var/log/hermes-dashboard-warnings.log
|
||||
# Docker socket — allows running docker commands against the host daemon
|
||||
# (same pattern as Portainer/cAdvisor; container already runs as root)
|
||||
- /var/run/docker.sock:/var/run/docker.sock
|
||||
|
||||
@ -40,6 +40,87 @@ const hermesOpsSnapshot = {
|
||||
warnings: [],
|
||||
};
|
||||
|
||||
// Builds a canned, fully "up" telemetry snapshot for one Hermes instance.
// It is the response body of the mocked `**/api/hermes/telemetry/<id>` routes
// in this spec. Values that embed `instanceId` differ between the two
// instances so assertions can tell them apart.
const hermesTelemetrySnapshot = (instanceId: 'vijay' | 'bheem') => ({
  generatedAt: '2026-01-01T00:00:00.000Z',
  cached: false,
  instanceId,
  sessions: { totalSessions: instanceId === 'vijay' ? 12 : 7, totalMessages: instanceId === 'vijay' ? 480 : 210, status: 'up' },
  // One session per instance.
  sessionList: {
    status: 'up',
    entries: [
      {
        id: `${instanceId}-session-1`,
        sessionKey: `agent:main:telegram:dm:${instanceId}`,
        platform: 'telegram',
        chatType: 'dm',
        displayName: instanceId === 'vijay' ? 'S' : 'Uma',
        createdAt: '2026-01-01T00:00:00.000Z',
        updatedAt: '2026-01-01T00:06:00.000Z',
        suspended: false,
        resumePending: false,
        totalTokens: 100,
        estimatedCostUsd: 0,
      },
    ],
  },
  // One sanitized tool-call event; carries no message content.
  sessionEvents: {
    status: 'up',
    sourceCount: 1,
    entries: [
      {
        id: `${instanceId}-events.jsonl:3`,
        sessionFile: `${instanceId}-events.jsonl`,
        timestamp: '2026-01-01T00:06:00.000Z',
        role: 'assistant',
        eventType: 'tool-call',
        summary: 'assistant tool call: exec_command',
        toolNames: ['exec_command'],
        itemTypes: [],
        status: 'tool_calls',
      },
    ],
  },
  cron: {
    status: 'up',
    entries: [
      {
        id: `${instanceId}-digest`,
        name: `${instanceId} digest`,
        schedule: '0 * * * *',
        lastRun: '2026-01-01T00:00:00.000Z',
        nextRun: '2026-01-01T01:00:00.000Z',
        lastStatus: 'ok',
        active: true,
      },
    ],
  },
  memory: { status: 'up', items: [] },
  skills: { status: 'up', items: [] },
  watchdog: {
    source: `/tmp/${instanceId}-watchdog.log`,
    status: 'up',
    alerts: [
      {
        timestamp: '2026-01-01T00:05:00.000Z',
        severity: 'info',
        message: `${instanceId} watchdog healthy`,
      },
    ],
  },
  backupHistory: {
    repoPath: `/tmp/${instanceId}-repo`,
    status: 'up',
    entries: [
      {
        sha: `${instanceId}123456`,
        committedAt: '2026-01-01T00:03:00.000Z',
        subject: `${instanceId} backup`,
      },
    ],
  },
  warnings: [],
});
|
||||
|
||||
test.describe('Hermes Mission Control', () => {
|
||||
test.beforeEach(async ({ page }) => {
|
||||
await page.addInitScript(() => {
|
||||
@ -59,6 +140,22 @@ test.describe('Hermes Mission Control', () => {
|
||||
});
|
||||
});
|
||||
|
||||
// Stub the per-instance telemetry endpoint for each Hermes instance, answering
// every matching request with that instance's fixture snapshot.
for (const instanceId of ['vijay', 'bheem'] as const) {
  await page.route(`**/api/hermes/telemetry/${instanceId}`, (route) =>
    route.fulfill({
      status: 200,
      contentType: 'application/json',
      body: JSON.stringify(hermesTelemetrySnapshot(instanceId)),
    }),
  );
}
|
||||
|
||||
// /hermes/products fetches the real service registry + health module
|
||||
// (Phase 3 slice 2). Backend isn't running in CI, so we satisfy those
|
||||
// routes the same way the dashboard spec does.
|
||||
@ -82,11 +179,11 @@ test.describe('Hermes Mission Control', () => {
|
||||
|
||||
await page.getByRole('link', { name: 'Task Ledger' }).click();
|
||||
await expect(page.getByRole('heading', { name: 'Task Ledger' })).toBeVisible();
|
||||
await expect(page.getByText('Task table')).toBeVisible();
|
||||
await expect(page.getByRole('heading', { name: 'Task table' })).toBeVisible();
|
||||
|
||||
await page.goto('/hermes/tasks/task-1');
|
||||
await expect(page.getByRole('heading', { name: 'Hermes learning' })).toBeVisible();
|
||||
await expect(page.getByText('Timeline')).toBeVisible();
|
||||
await expect(page.getByRole('heading', { name: 'Timeline', exact: true })).toBeVisible();
|
||||
|
||||
await page.goto('/hermes/products');
|
||||
await expect(page.getByRole('heading', { name: 'Product Portfolio' })).toBeVisible();
|
||||
@ -111,7 +208,7 @@ test.describe('Hermes Mission Control', () => {
|
||||
|
||||
await page.goto('/hermes/tasks/task-1');
|
||||
await expect(page.getByRole('heading', { name: 'Hermes learning' })).toBeVisible();
|
||||
await expect(page.getByRole('heading', { name: 'Timeline' })).toBeVisible();
|
||||
await expect(page.getByRole('heading', { name: 'Timeline', exact: true })).toBeVisible();
|
||||
});
|
||||
|
||||
test('exposes a global instance switcher with All / Vijay / Bheem', async ({ page }) => {
|
||||
|
||||
@ -7,8 +7,12 @@ import { useEffect, useMemo, useState } from 'react';
|
||||
import { HermesShell, MetricCard, SectionCard } from '@/components/hermes-shell';
|
||||
import { HermesInstanceBadge } from '@/components/hermes-instance-switcher';
|
||||
import { useHermesInstance } from '@/lib/hermes-instance-context';
|
||||
import { getHermesAgents, HERMES_INSTANCES, type HermesInstanceId } from '@/lib/hermes';
|
||||
import { api, type HermesTelemetrySnapshot } from '@/lib/api';
|
||||
import { getHermesAgents, HERMES_INSTANCES } from '@/lib/hermes';
|
||||
import {
|
||||
emptyTelemetryState,
|
||||
loadAllHermesTelemetry,
|
||||
type HermesTelemetryState,
|
||||
} from '@/lib/hermes-telemetry-client';
|
||||
|
||||
export default function HermesAgentsPage() {
|
||||
const { selectedInstance } = useHermesInstance();
|
||||
@ -21,19 +25,16 @@ export default function HermesAgentsPage() {
|
||||
// endpoint. The agent statuses above remain seed-data (status observability
|
||||
// needs a separate ingestion contract); the inventory below is genuine
|
||||
// when the `hermes` CLI is reachable, status:'unknown' otherwise.
|
||||
const [telemetry, setTelemetry] = useState<Record<HermesInstanceId, HermesTelemetrySnapshot | null>>({ vijay: null, bheem: null });
|
||||
const [telemetry, setTelemetry] = useState<HermesTelemetryState>(emptyTelemetryState);
|
||||
const [telemetryError, setTelemetryError] = useState<string | null>(null);
|
||||
|
||||
useEffect(() => {
|
||||
const controller = new AbortController();
|
||||
const load = async () => {
|
||||
try {
|
||||
const [vijay, bheem] = await Promise.all([
|
||||
api.getHermesTelemetry('vijay'),
|
||||
api.getHermesTelemetry('bheem'),
|
||||
]);
|
||||
const next = await loadAllHermesTelemetry();
|
||||
if (controller.signal.aborted) return;
|
||||
setTelemetry({ vijay, bheem });
|
||||
setTelemetry(next);
|
||||
setTelemetryError(null);
|
||||
} catch (err) {
|
||||
if (controller.signal.aborted) return;
|
||||
|
||||
@ -1,15 +1,28 @@
|
||||
'use client';
|
||||
|
||||
import Link from 'next/link';
|
||||
import { ArrowLeft, Clock3, Flame, TrendingDown, TrendingUp } from 'lucide-react';
|
||||
import { ArrowLeft, Clock3, Flame, History, TrendingDown, TrendingUp } from 'lucide-react';
|
||||
import { Badge, Button } from '@/components/ui/Primitives';
|
||||
import { useMemo } from 'react';
|
||||
import { useEffect, useMemo, useState } from 'react';
|
||||
import { HermesShell, MetricCard, SectionCard } from '@/components/hermes-shell';
|
||||
import { HermesInstanceBadge } from '@/components/hermes-instance-switcher';
|
||||
import { useHermesInstance } from '@/lib/hermes-instance-context';
|
||||
import { getHermesHistory, hermesTasks } from '@/lib/hermes';
|
||||
import {
|
||||
collectBackupEntries,
|
||||
collectCronEntries,
|
||||
collectSessionEvents,
|
||||
collectSessionEntries,
|
||||
collectWatchdogAlerts,
|
||||
emptyTelemetryState,
|
||||
loadAllHermesTelemetry,
|
||||
type HermesTelemetryState,
|
||||
} from '@/lib/hermes-telemetry-client';
|
||||
|
||||
export default function HermesHistoryPage() {
|
||||
const { selectedInstance } = useHermesInstance();
|
||||
const [telemetry, setTelemetry] = useState<HermesTelemetryState>(emptyTelemetryState);
|
||||
const [telemetryError, setTelemetryError] = useState<string | null>(null);
|
||||
const history = useMemo(() => getHermesHistory(selectedInstance), [selectedInstance]);
|
||||
const filteredTasks = useMemo(
|
||||
() => (selectedInstance === 'all' ? hermesTasks : hermesTasks.filter((task) => task.instanceId === selectedInstance)),
|
||||
@ -26,6 +39,30 @@ export default function HermesHistoryPage() {
|
||||
tasksWithDuration.reduce((sum, task) => sum + (task.durationMs ?? 0), 0) /
|
||||
Math.max(1, tasksWithDuration.length) / 60000,
|
||||
);
|
||||
const liveAlerts = useMemo(() => collectWatchdogAlerts(telemetry, selectedInstance).slice(0, 10), [telemetry, selectedInstance]);
|
||||
const liveBackups = useMemo(() => collectBackupEntries(telemetry, selectedInstance).slice(0, 10), [telemetry, selectedInstance]);
|
||||
const liveCron = useMemo(() => collectCronEntries(telemetry, selectedInstance).slice(0, 10), [telemetry, selectedInstance]);
|
||||
const liveSessions = useMemo(() => collectSessionEntries(telemetry, selectedInstance).slice(0, 10), [telemetry, selectedInstance]);
|
||||
const liveEvents = useMemo(() => collectSessionEvents(telemetry, selectedInstance).slice(0, 10), [telemetry, selectedInstance]);
|
||||
|
||||
// Fetch telemetry once on mount via loadAllHermesTelemetry(). The `disposed`
// flag keeps a response that settles after unmount from touching state.
useEffect(() => {
  let disposed = false;

  async function fetchTelemetryOnce(): Promise<void> {
    try {
      const snapshots = await loadAllHermesTelemetry();
      if (disposed) return;
      setTelemetry(snapshots);
      setTelemetryError(null);
    } catch (cause) {
      if (disposed) return;
      const message = cause instanceof Error ? cause.message : String(cause);
      setTelemetryError(message);
    }
  }

  void fetchTelemetryOnce();

  return () => {
    disposed = true;
  };
}, []);
|
||||
|
||||
const failureReasons = [
|
||||
['CI failures', 9],
|
||||
@ -48,6 +85,86 @@ export default function HermesHistoryPage() {
|
||||
<MetricCard label="Avg task duration" value={`${avgDuration}m`} tone="info" icon={<Clock3 className="h-5 w-5" />} />
|
||||
</section>
|
||||
|
||||
<section className="grid gap-4 md:grid-cols-2 xl:grid-cols-4">
|
||||
<MetricCard label="Live events" value={liveEvents.length} tone="info" icon={<History className="h-5 w-5" />} helpText="From Hermes session JSONL" />
|
||||
<MetricCard label="Live cron jobs" value={liveCron.length} tone="info" icon={<Clock3 className="h-5 w-5" />} helpText="From hermes cron list" />
|
||||
<MetricCard label="Watchdog alerts" value={liveAlerts.length} tone={liveAlerts.some((a) => a.severity === 'critical') ? 'danger' : liveAlerts.some((a) => a.severity === 'warn') ? 'warning' : 'default'} icon={<Flame className="h-5 w-5" />} helpText="From watchdog logs" />
|
||||
<MetricCard label="Backup commits" value={liveBackups.length} tone="success" icon={<TrendingUp className="h-5 w-5" />} helpText="From backup git history" />
|
||||
</section>
|
||||
|
||||
<SectionCard
|
||||
title="Live artifact timeline"
|
||||
subtitle="Real session events, sessions, cron, watchdog, and backup history from the Hermes telemetry endpoint. Message content is redacted at the backend."
|
||||
actions={<Badge variant={telemetryError ? 'error' : 'success'}>{telemetryError ? 'Telemetry unavailable' : 'Live telemetry'}</Badge>}
|
||||
>
|
||||
{telemetryError ? (
|
||||
<p className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4 text-sm text-[var(--bl-warning)]">
|
||||
Could not load telemetry: {telemetryError}
|
||||
</p>
|
||||
) : (
|
||||
<div className="grid gap-4 lg:grid-cols-2 xl:grid-cols-5">
|
||||
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
|
||||
<p className="text-xs uppercase tracking-[0.2em] text-[var(--bl-text-tertiary)]">Recent events</p>
|
||||
<div className="mt-3 space-y-2 text-sm text-[var(--bl-text-secondary)]">
|
||||
{liveEvents.length > 0 ? liveEvents.map((event) => (
|
||||
<div key={`${event.instanceId}-${event.id}`} className="flex items-start justify-between gap-3">
|
||||
<span className="line-clamp-2">{event.summary}</span>
|
||||
<HermesInstanceBadge instanceId={event.instanceId} />
|
||||
</div>
|
||||
)) : <p>No session events returned.</p>}
|
||||
</div>
|
||||
</div>
|
||||
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
|
||||
<p className="text-xs uppercase tracking-[0.2em] text-[var(--bl-text-tertiary)]">Recent sessions</p>
|
||||
<div className="mt-3 space-y-2 text-sm text-[var(--bl-text-secondary)]">
|
||||
{liveSessions.length > 0 ? liveSessions.map((session) => (
|
||||
<div key={`${session.instanceId}-${session.id}`} className="flex items-center justify-between gap-3">
|
||||
<span className="truncate">{session.displayName ?? session.sessionKey}</span>
|
||||
<HermesInstanceBadge instanceId={session.instanceId} />
|
||||
</div>
|
||||
)) : <p>No session entries returned.</p>}
|
||||
</div>
|
||||
</div>
|
||||
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
|
||||
<p className="text-xs uppercase tracking-[0.2em] text-[var(--bl-text-tertiary)]">Recent watchdog alerts</p>
|
||||
<div className="mt-3 space-y-2 text-sm text-[var(--bl-text-secondary)]">
|
||||
{liveAlerts.length > 0 ? liveAlerts.map((alert) => (
|
||||
<div key={`${alert.instanceId}-${alert.timestamp}-${alert.message}`} className="flex items-start justify-between gap-3">
|
||||
<span className="line-clamp-2">{alert.message}</span>
|
||||
<div className="flex shrink-0 items-center gap-2">
|
||||
<Badge variant={alert.severity === 'critical' ? 'error' : alert.severity === 'warn' ? 'warning' : 'info'}>{alert.severity}</Badge>
|
||||
<HermesInstanceBadge instanceId={alert.instanceId} />
|
||||
</div>
|
||||
</div>
|
||||
)) : <p>No watchdog alerts returned.</p>}
|
||||
</div>
|
||||
</div>
|
||||
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
|
||||
<p className="text-xs uppercase tracking-[0.2em] text-[var(--bl-text-tertiary)]">Cron entries</p>
|
||||
<div className="mt-3 space-y-2 text-sm text-[var(--bl-text-secondary)]">
|
||||
{liveCron.length > 0 ? liveCron.map((entry) => (
|
||||
<div key={`${entry.instanceId}-${entry.id}`} className="flex items-center justify-between gap-3">
|
||||
<span className="truncate">{entry.name}</span>
|
||||
<HermesInstanceBadge instanceId={entry.instanceId} />
|
||||
</div>
|
||||
)) : <p>No cron entries returned.</p>}
|
||||
</div>
|
||||
</div>
|
||||
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
|
||||
<p className="text-xs uppercase tracking-[0.2em] text-[var(--bl-text-tertiary)]">Backup history</p>
|
||||
<div className="mt-3 space-y-2 text-sm text-[var(--bl-text-secondary)]">
|
||||
{liveBackups.length > 0 ? liveBackups.map((entry) => (
|
||||
<div key={`${entry.instanceId}-${entry.sha}`} className="flex items-center justify-between gap-3">
|
||||
<span className="truncate">{entry.subject}</span>
|
||||
<HermesInstanceBadge instanceId={entry.instanceId} />
|
||||
</div>
|
||||
)) : <p>No backup commits returned.</p>}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</SectionCard>
|
||||
|
||||
<SectionCard title="Weekly activity chart" subtitle="Accessible bar chart built with standard layout primitives.">
|
||||
<div className="overflow-x-auto">
|
||||
<div className="flex min-w-[48rem] items-end gap-4 rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-5">
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
'use client';
|
||||
|
||||
import { useMemo } from 'react';
|
||||
import { useEffect, useMemo, useState } from 'react';
|
||||
import Link from 'next/link';
|
||||
import { ArrowRight, BadgeCheck, Bot, CheckCircle2, Clock3, LayoutDashboard, OctagonAlert, Rocket, ShieldAlert, Sparkles, TriangleAlert } from 'lucide-react';
|
||||
import { ArrowRight, BadgeCheck, BellRing, Bot, CheckCircle2, Clock3, LayoutDashboard, OctagonAlert, Rocket, ShieldAlert, Sparkles, TriangleAlert } from 'lucide-react';
|
||||
import { Badge, Button } from '@/components/ui/Primitives';
|
||||
import { HermesShell, MetricCard, SectionCard } from '@/components/hermes-shell';
|
||||
import { HermesInstanceBadge } from '@/components/hermes-instance-switcher';
|
||||
@ -19,6 +19,15 @@ import {
|
||||
type HermesProduct,
|
||||
type HermesTask,
|
||||
} from '@/lib/hermes';
|
||||
import {
|
||||
collectBackupEntries,
|
||||
collectCronEntries,
|
||||
collectWatchdogAlerts,
|
||||
emptyTelemetryState,
|
||||
loadAllHermesTelemetry,
|
||||
telemetryForFilter,
|
||||
type HermesTelemetryState,
|
||||
} from '@/lib/hermes-telemetry-client';
|
||||
|
||||
const fmtDate = new Intl.DateTimeFormat('en', {
|
||||
month: 'short',
|
||||
@ -80,6 +89,8 @@ function ProductMiniCard({ product }: { product: HermesProduct }) {
|
||||
|
||||
export default function HermesMissionControlPage() {
|
||||
const { selectedInstance } = useHermesInstance();
|
||||
const [telemetry, setTelemetry] = useState<HermesTelemetryState>(emptyTelemetryState);
|
||||
const [telemetryError, setTelemetryError] = useState<string | null>(null);
|
||||
const overview = useMemo(() => getHermesOverview(selectedInstance), [selectedInstance]);
|
||||
// Per-instance roll-up cards always show both Vijay and Bheem regardless of
|
||||
// the active filter — they're the "comparison" view that sits next to the
|
||||
@ -124,6 +135,32 @@ export default function HermesMissionControlPage() {
|
||||
);
|
||||
const actionableProducts = filteredProducts.filter((product) => product.needsAttention).slice(0, 6);
|
||||
const agentStatuses = useMemo(() => getHermesAgents(selectedInstance), [selectedInstance]);
|
||||
const liveSnapshots = useMemo(() => telemetryForFilter(telemetry, selectedInstance), [telemetry, selectedInstance]);
|
||||
const liveAlerts = useMemo(() => collectWatchdogAlerts(telemetry, selectedInstance).slice(0, 8), [telemetry, selectedInstance]);
|
||||
const liveBackups = useMemo(() => collectBackupEntries(telemetry, selectedInstance).slice(0, 6), [telemetry, selectedInstance]);
|
||||
const liveCron = useMemo(() => collectCronEntries(telemetry, selectedInstance).slice(0, 6), [telemetry, selectedInstance]);
|
||||
|
||||
// Refresh telemetry on mount and then every 60 seconds. `cancelled` stops a
// response that settles after unmount from updating state, and the interval
// is cleared in the cleanup.
useEffect(() => {
  let cancelled = false;

  async function refreshTelemetry(): Promise<void> {
    try {
      const snapshots = await loadAllHermesTelemetry();
      if (cancelled) return;
      setTelemetry(snapshots);
      setTelemetryError(null);
    } catch (cause) {
      if (cancelled) return;
      setTelemetryError(cause instanceof Error ? cause.message : String(cause));
    }
  }

  void refreshTelemetry();
  const pollHandle = window.setInterval(refreshTelemetry, 60_000);

  return () => {
    cancelled = true;
    window.clearInterval(pollHandle);
  };
}, []);
|
||||
|
||||
const autoActions = [
|
||||
'Continue the queued execution lane for high-priority product updates.',
|
||||
'Publish a weekly digest from completed and failed work.',
|
||||
@ -185,6 +222,77 @@ export default function HermesMissionControlPage() {
|
||||
|
||||
<HermesOpsPanel />
|
||||
|
||||
{/*
  Unified live alerts: watchdog alerts on the left; per-instance session
  totals, cron entries and backup commits on the right. When the telemetry
  load failed, only the error message is rendered.
*/}
<SectionCard
  title="Unified live alerts"
  subtitle="Cross-instance alert, cron, session, and backup signals from the real Hermes telemetry endpoint."
  actions={<Badge variant={telemetryError ? 'error' : 'success'}>{telemetryError ? 'Telemetry unavailable' : 'Live telemetry'}</Badge>}
>
  {telemetryError ? (
    <p className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4 text-sm text-[var(--bl-warning)]">
      Could not load telemetry: {telemetryError}
    </p>
  ) : (
    <div className="grid gap-4 xl:grid-cols-[1.2fr_0.8fr]">
      <div className="space-y-3">
        {liveAlerts.length > 0 ? liveAlerts.map((alert) => {
          // Intl.DateTimeFormat#format throws a RangeError for an Invalid Date,
          // which would crash the whole page on one malformed alert timestamp.
          const raisedAt = new Date(alert.timestamp);
          const raisedLabel = Number.isNaN(raisedAt.getTime()) ? 'unknown' : fmtDate.format(raisedAt);
          return (
            <div key={`${alert.instanceId}-${alert.timestamp}-${alert.message}`} className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
              <div className="flex flex-wrap items-start justify-between gap-3">
                <div className="min-w-0">
                  <div className="flex flex-wrap items-center gap-2">
                    <Badge variant={alert.severity === 'critical' ? 'error' : alert.severity === 'warn' ? 'warning' : 'info'}>{alert.severity}</Badge>
                    <HermesInstanceBadge instanceId={alert.instanceId} />
                    <span className="text-xs text-[var(--bl-text-tertiary)]">{raisedLabel}</span>
                  </div>
                  <p className="mt-2 text-sm text-[var(--bl-text-primary)]">{alert.message}</p>
                </div>
                <BellRing className="h-4 w-4 text-[var(--bl-text-tertiary)]" />
              </div>
            </div>
          );
        }) : (
          <div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4 text-sm text-[var(--bl-text-secondary)]">
            No watchdog alerts were returned for the selected instance filter.
          </div>
        )}
      </div>
      <div className="grid gap-3">
        <div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
          <p className="text-xs uppercase tracking-[0.2em] text-[var(--bl-text-tertiary)]">Sessions</p>
          <div className="mt-3 grid gap-2">
            {liveSnapshots.map((snapshot) => (
              <div key={snapshot.instanceId} className="flex items-center justify-between gap-3 text-sm">
                <HermesInstanceBadge instanceId={snapshot.instanceId} />
                <span className="text-[var(--bl-text-secondary)]">{snapshot.sessions.totalSessions} sessions · {snapshot.sessions.totalMessages} messages</span>
              </div>
            ))}
          </div>
        </div>
        <div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
          <p className="text-xs uppercase tracking-[0.2em] text-[var(--bl-text-tertiary)]">Upcoming Hermes cron</p>
          <div className="mt-3 space-y-2 text-sm text-[var(--bl-text-secondary)]">
            {liveCron.length > 0 ? liveCron.map((entry) => (
              <div key={`${entry.instanceId}-${entry.id}`} className="flex items-center justify-between gap-3">
                <span className="truncate">{entry.name}</span>
                <HermesInstanceBadge instanceId={entry.instanceId} />
              </div>
            )) : <p>No cron entries returned.</p>}
          </div>
        </div>
        <div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
          <p className="text-xs uppercase tracking-[0.2em] text-[var(--bl-text-tertiary)]">Recent backup commits</p>
          <div className="mt-3 space-y-2 text-sm text-[var(--bl-text-secondary)]">
            {liveBackups.length > 0 ? liveBackups.map((entry) => (
              <div key={`${entry.instanceId}-${entry.sha}`} className="flex items-center justify-between gap-3">
                <span className="truncate">{entry.subject}</span>
                <HermesInstanceBadge instanceId={entry.instanceId} />
              </div>
            )) : <p>No backup commits returned.</p>}
          </div>
        </div>
      </div>
    </div>
  )}
</SectionCard>
|
||||
|
||||
<div className="grid gap-6 xl:grid-cols-[1.5fr_1fr]">
|
||||
<SectionCard title="Active Missions" subtitle="What Hermes is currently running or waiting on." actions={<Button asChild variant="ghost" size="sm"><Link href="/hermes/tasks">View all tasks <ArrowRight className="ml-2 h-4 w-4" /></Link></Button>}>
|
||||
<div className="space-y-3">
|
||||
|
||||
@ -2,10 +2,18 @@
|
||||
|
||||
import Link from 'next/link';
|
||||
import { useParams } from 'next/navigation';
|
||||
import { useEffect, useMemo, useState } from 'react';
|
||||
import { ArrowLeft, CircleDashed, Clock3, ShieldAlert, Sparkles } from 'lucide-react';
|
||||
import { Badge, Button } from '@/components/ui/Primitives';
|
||||
import { HermesShell, MetricCard, SectionCard } from '@/components/hermes-shell';
|
||||
import { getHermesProductById, getHermesTaskById, getHermesTaskEvents } from '@/lib/hermes';
|
||||
import {
|
||||
collectSessionEvents,
|
||||
collectSessionEntries,
|
||||
emptyTelemetryState,
|
||||
loadAllHermesTelemetry,
|
||||
type HermesTelemetryState,
|
||||
} from '@/lib/hermes-telemetry-client';
|
||||
|
||||
const fmt = new Intl.DateTimeFormat('en', { month: 'short', day: 'numeric', hour: 'numeric', minute: '2-digit' });
|
||||
|
||||
@ -24,6 +32,29 @@ export default function HermesTaskDetailPage({ params }: { params: { id: string
|
||||
const taskId = routeParams?.id ?? params.id;
|
||||
const task = getHermesTaskById(taskId);
|
||||
const events = getHermesTaskEvents(taskId);
|
||||
const [telemetry, setTelemetry] = useState<HermesTelemetryState>(emptyTelemetryState);
|
||||
const [telemetryError, setTelemetryError] = useState<string | null>(null);
|
||||
const liveSessions = useMemo(() => collectSessionEntries(telemetry, 'all').slice(0, 8), [telemetry]);
|
||||
const liveEvents = useMemo(() => collectSessionEvents(telemetry, 'all').slice(0, 12), [telemetry]);
|
||||
|
||||
// One-shot telemetry load on mount. `unmounted` guards the state setters so a
// late response (or late failure) is ignored once the page has gone away.
useEffect(() => {
  let unmounted = false;

  const pullTelemetry = async (): Promise<void> => {
    try {
      const snapshots = await loadAllHermesTelemetry();
      if (unmounted) return;
      setTelemetry(snapshots);
      setTelemetryError(null);
    } catch (failure) {
      if (unmounted) return;
      if (failure instanceof Error) {
        setTelemetryError(failure.message);
      } else {
        setTelemetryError(String(failure));
      }
    }
  };

  void pullTelemetry();

  return () => {
    unmounted = true;
  };
}, []);
|
||||
|
||||
if (!task) {
|
||||
return (
|
||||
@ -40,7 +71,6 @@ export default function HermesTaskDetailPage({ params }: { params: { id: string
|
||||
}
|
||||
|
||||
const product = getHermesProductById(task.productId);
|
||||
const lastEvent = events[0];
|
||||
const timeline = events.slice().sort((a, b) => new Date(a.timestamp).getTime() - new Date(b.timestamp).getTime());
|
||||
|
||||
return (
|
||||
@ -110,6 +140,57 @@ export default function HermesTaskDetailPage({ params }: { params: { id: string
|
||||
</SectionCard>
|
||||
</div>
|
||||
|
||||
{/*
  Live Hermes event timeline: session events on the left, session entries on
  the right. When the telemetry load failed, only the error message is
  rendered.
*/}
<SectionCard
  title="Live Hermes event timeline"
  subtitle="Sanitized session JSONL events read from Hermes homes, paired with durable session index context. Message content is redacted at the backend."
  actions={<Badge variant={telemetryError ? 'error' : 'success'}>{telemetryError ? 'Telemetry unavailable' : 'Live sessions'}</Badge>}
>
  {telemetryError ? (
    <p className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4 text-sm text-[var(--bl-warning)]">
      Could not load telemetry: {telemetryError}
    </p>
  ) : (
    <div className="grid gap-4 xl:grid-cols-[1.2fr_0.8fr]">
      <div className="space-y-3">
        {liveEvents.map((event) => {
          // A non-empty but unparseable timestamp yields an Invalid Date, and
          // Intl.DateTimeFormat#format throws a RangeError for it; show
          // 'unknown' in that case too, not only when the field is missing.
          const occurredAt = event.timestamp ? new Date(event.timestamp) : null;
          const occurredLabel = occurredAt && !Number.isNaN(occurredAt.getTime()) ? fmt.format(occurredAt) : 'unknown';
          return (
            <div key={`${event.instanceId}-${event.id}`} className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
              <div className="flex flex-wrap items-start justify-between gap-3">
                <div className="min-w-0">
                  <div className="flex flex-wrap items-center gap-2">
                    <Badge variant={event.eventType === 'tool-call' ? 'info' : event.eventType === 'system' ? 'neutral' : 'success'}>{event.eventType}</Badge>
                    <Badge variant="neutral">{event.instanceId}</Badge>
                    {event.status ? <Badge variant="neutral">{event.status}</Badge> : null}
                  </div>
                  <p className="mt-2 font-medium text-[var(--bl-text-primary)]">{event.summary}</p>
                  <p className="mt-1 truncate text-xs text-[var(--bl-text-secondary)]">{event.sessionFile}</p>
                </div>
                <p className="text-xs text-[var(--bl-text-tertiary)]">{occurredLabel}</p>
              </div>
            </div>
          );
        })}
        {liveEvents.length === 0 ? (
          <p className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4 text-sm text-[var(--bl-text-secondary)]">No live session events were returned.</p>
        ) : null}
      </div>
      <div className="grid gap-3 sm:grid-cols-2 xl:grid-cols-1">
        {liveSessions.map((session) => {
          // Same Invalid Date guard as for the events above.
          const updatedAt = session.updatedAt ? new Date(session.updatedAt) : null;
          const updatedLabel = updatedAt && !Number.isNaN(updatedAt.getTime()) ? fmt.format(updatedAt) : 'unknown';
          return (
            <div key={`${session.instanceId}-${session.id}`} className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
              <div className="flex items-center justify-between gap-3">
                <Badge variant={session.resumePending || session.suspended ? 'warning' : 'info'}>{session.platform ?? 'session'}</Badge>
                <Badge variant="neutral">{session.instanceId}</Badge>
              </div>
              <p className="mt-3 truncate font-medium text-[var(--bl-text-primary)]">{session.displayName ?? session.sessionKey}</p>
              <p className="mt-1 text-xs text-[var(--bl-text-secondary)]">Updated {updatedLabel}</p>
            </div>
          );
        })}
        {liveSessions.length === 0 ? (
          <p className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4 text-sm text-[var(--bl-text-secondary)]">No live session entries were returned.</p>
        ) : null}
      </div>
    </div>
  )}
</SectionCard>
|
||||
|
||||
<SectionCard title="Timeline" subtitle="Chronological event stream for the task lifecycle.">
|
||||
<ol className="space-y-4">
|
||||
{timeline.map((event) => (
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
'use client';
|
||||
|
||||
import { Fragment, useMemo, useState } from 'react';
|
||||
import { Fragment, useEffect, useMemo, useState } from 'react';
|
||||
import Link from 'next/link';
|
||||
import { Download, Filter, Search, ChevronDown, ChevronUp, ArrowLeftRight } from 'lucide-react';
|
||||
import { Download, Filter, Search, ChevronDown, ChevronUp, ArrowLeftRight, Activity } from 'lucide-react';
|
||||
import { Badge, Button, Input } from '@/components/ui/Primitives';
|
||||
import { HermesShell, MetricCard, SectionCard } from '@/components/hermes-shell';
|
||||
import { HermesInstanceBadge } from '@/components/hermes-instance-switcher';
|
||||
@ -17,6 +17,16 @@ import {
|
||||
type HermesTaskSource,
|
||||
type HermesTask,
|
||||
} from '@/lib/hermes';
|
||||
import {
|
||||
collectBackupEntries,
|
||||
collectCronEntries,
|
||||
collectSessionEntries,
|
||||
collectWatchdogAlerts,
|
||||
emptyTelemetryState,
|
||||
loadAllHermesTelemetry,
|
||||
telemetryForFilter,
|
||||
type HermesTelemetryState,
|
||||
} from '@/lib/hermes-telemetry-client';
|
||||
|
||||
const statuses: Array<HermesTaskStatus | 'all'> = ['all', 'queued', 'running', 'blocked', 'completed', 'failed', 'skipped', 'cancelled'];
|
||||
const priorities: Array<HermesPriority | 'all'> = ['all', 'P0', 'P1', 'P2', 'P3'];
|
||||
@ -50,6 +60,8 @@ export default function HermesTaskLedgerPage() {
|
||||
const [sort, setSort] = useState<(typeof sortOptions)[number]>('newest');
|
||||
const [page, setPage] = useState(1);
|
||||
const [expandedTaskId, setExpandedTaskId] = useState<string | null>(null);
|
||||
const [telemetry, setTelemetry] = useState<HermesTelemetryState>(emptyTelemetryState);
|
||||
const [telemetryError, setTelemetryError] = useState<string | null>(null);
|
||||
|
||||
const { selectedInstance } = useHermesInstance();
|
||||
const tasks = useMemo(
|
||||
@ -67,6 +79,68 @@ export default function HermesTaskLedgerPage() {
|
||||
}), [tasks]);
|
||||
|
||||
const visibleProducts = hermesProducts.slice(0, 20);
|
||||
const liveSnapshots = useMemo(() => telemetryForFilter(telemetry, selectedInstance), [telemetry, selectedInstance]);
|
||||
const liveCron = useMemo(() => collectCronEntries(telemetry, selectedInstance), [telemetry, selectedInstance]);
|
||||
const liveAlerts = useMemo(() => collectWatchdogAlerts(telemetry, selectedInstance), [telemetry, selectedInstance]);
|
||||
const liveBackups = useMemo(() => collectBackupEntries(telemetry, selectedInstance), [telemetry, selectedInstance]);
|
||||
const liveSessions = useMemo(() => collectSessionEntries(telemetry, selectedInstance), [telemetry, selectedInstance]);
|
||||
// Merge sessions, cron entries, watchdog alerts and backup commits into one
// feed of uniform rows, sorted newest first and capped at 12 rows.
const liveActivityRows = useMemo(() => {
  // Normalise a row's time to epoch milliseconds. Missing, empty or
  // unparseable values become 0 so they sort last; otherwise
  // `new Date(bad).getTime()` is NaN, the comparator returns NaN, and the
  // resulting order is no longer a consistent ordering.
  const toEpochMs = (time: string | number | Date | null | undefined): number => {
    if (time == null || time === '') return 0;
    const ms = new Date(time).getTime();
    return Number.isNaN(ms) ? 0 : ms;
  };

  return [
    ...liveSessions.map((entry) => ({
      id: `session-${entry.instanceId}-${entry.id}`,
      instanceId: entry.instanceId,
      kind: 'session',
      title: entry.displayName ? `${entry.displayName} session` : entry.sessionKey,
      detail: entry.resumePending ? 'resume pending' : entry.suspended ? 'suspended' : entry.platform ?? 'session',
      time: entry.updatedAt ?? entry.createdAt,
      tone: entry.resumePending || entry.suspended ? 'warning' as const : 'info' as const,
    })),
    ...liveCron.map((entry) => ({
      id: `cron-${entry.instanceId}-${entry.id}`,
      instanceId: entry.instanceId,
      kind: 'cron',
      title: entry.name,
      detail: entry.lastStatus ?? entry.schedule ?? 'Hermes cron entry',
      // Prefers the upcoming run over the last completed one.
      time: entry.nextRun ?? entry.lastRun,
      tone: entry.active ? 'success' as const : 'neutral' as const,
    })),
    ...liveAlerts.map((alert) => ({
      id: `alert-${alert.instanceId}-${alert.timestamp}-${alert.message}`,
      instanceId: alert.instanceId,
      kind: 'alert',
      title: alert.message,
      detail: alert.severity,
      time: alert.timestamp,
      tone: alert.severity === 'critical' ? 'error' as const : alert.severity === 'warn' ? 'warning' as const : 'info' as const,
    })),
    ...liveBackups.map((entry) => ({
      id: `backup-${entry.instanceId}-${entry.sha}`,
      instanceId: entry.instanceId,
      kind: 'backup',
      title: entry.subject,
      detail: entry.sha.slice(0, 8),
      time: entry.committedAt,
      tone: 'success' as const,
    })),
  ]
    .sort((a, b) => toEpochMs(b.time) - toEpochMs(a.time))
    .slice(0, 12);
}, [liveSessions, liveCron, liveAlerts, liveBackups]);
|
||||
|
||||
// Load telemetry a single time when the ledger mounts. Once the cleanup has
// run, `stale` is set and any pending result or error is dropped.
useEffect(() => {
  let stale = false;

  const hydrateTelemetry = async (): Promise<void> => {
    try {
      const snapshots = await loadAllHermesTelemetry();
      if (stale) return;
      setTelemetry(snapshots);
      setTelemetryError(null);
    } catch (problem) {
      if (stale) return;
      const reason = problem instanceof Error ? problem.message : String(problem);
      setTelemetryError(reason);
    }
  };

  void hydrateTelemetry();

  return () => {
    stale = true;
  };
}, []);
|
||||
|
||||
return (
|
||||
<HermesShell
|
||||
@ -86,6 +160,68 @@ export default function HermesTaskLedgerPage() {
|
||||
<MetricCard label="Failed" value={counts.failed} tone="danger" />
|
||||
</section>
|
||||
|
||||
<SectionCard
|
||||
title="Live Hermes activity ledger"
|
||||
subtitle="Real cron entries, watchdog alerts, backup commits, and session totals from the telemetry endpoint. The task table below remains the planner-style seed ledger until Hermes emits task-level events."
|
||||
actions={<Badge variant={telemetryError ? 'error' : 'success'}>{telemetryError ? 'Telemetry unavailable' : 'Live telemetry'}</Badge>}
|
||||
>
|
||||
{telemetryError ? (
|
||||
<p className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4 text-sm text-[var(--bl-warning)]">
|
||||
Could not load telemetry: {telemetryError}
|
||||
</p>
|
||||
) : (
|
||||
<div className="grid gap-4 xl:grid-cols-[1fr_2fr]">
|
||||
<div className="grid gap-3">
|
||||
{liveSnapshots.map((snapshot) => (
|
||||
<div key={snapshot.instanceId} className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
|
||||
<div className="flex items-center justify-between gap-3">
|
||||
<HermesInstanceBadge instanceId={snapshot.instanceId} />
|
||||
<Badge variant={snapshot.sessions.status === 'up' ? 'success' : 'warning'}>{snapshot.sessions.status}</Badge>
|
||||
</div>
|
||||
<p className="mt-3 text-2xl font-semibold text-[var(--bl-text-primary)]">{snapshot.sessions.totalSessions}</p>
|
||||
<p className="text-sm text-[var(--bl-text-secondary)]">{snapshot.sessions.totalMessages} session messages observed</p>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
<div className="overflow-hidden rounded-2xl border border-[var(--bl-border)]">
|
||||
<table className="min-w-full divide-y divide-[var(--bl-border)] text-left text-sm">
|
||||
<thead className="bg-[var(--bl-surface-muted)] text-xs uppercase tracking-[0.18em] text-[var(--bl-text-tertiary)]">
|
||||
<tr>
|
||||
<th className="px-4 py-3">Artifact</th>
|
||||
<th className="px-4 py-3">Instance</th>
|
||||
<th className="px-4 py-3">Signal</th>
|
||||
<th className="px-4 py-3">Time</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody className="divide-y divide-[var(--bl-border)] bg-[var(--bl-surface-card)]">
|
||||
{liveActivityRows.map((row) => (
|
||||
<tr key={row.id}>
|
||||
<td className="px-4 py-4">
|
||||
<div className="flex items-start gap-2">
|
||||
<Activity className="mt-0.5 h-4 w-4 text-[var(--bl-text-tertiary)]" />
|
||||
<div>
|
||||
<p className="font-medium text-[var(--bl-text-primary)]">{row.title}</p>
|
||||
<p className="text-xs text-[var(--bl-text-secondary)]">{row.kind}</p>
|
||||
</div>
|
||||
</div>
|
||||
</td>
|
||||
<td className="px-4 py-4"><HermesInstanceBadge instanceId={row.instanceId} /></td>
|
||||
<td className="px-4 py-4"><Badge variant={row.tone}>{row.detail}</Badge></td>
|
||||
<td className="px-4 py-4 text-[var(--bl-text-secondary)]">{row.time ? prettyDate(row.time) : '—'}</td>
|
||||
</tr>
|
||||
))}
|
||||
{liveActivityRows.length === 0 ? (
|
||||
<tr>
|
||||
<td colSpan={4} className="px-4 py-10 text-center text-[var(--bl-text-secondary)]">No live activity artifacts were returned for the current instance filter.</td>
|
||||
</tr>
|
||||
) : null}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</SectionCard>
|
||||
|
||||
<SectionCard title="Filters" subtitle="Find work by status, product, priority, type, source, or age.">
|
||||
<div className="grid gap-3 lg:grid-cols-4 xl:grid-cols-7">
|
||||
<Input value={query} onChange={(event) => { setQuery(event.target.value); setPage(1); }} placeholder="Search tasks..." aria-label="Search tasks" className="xl:col-span-2" />
|
||||
|
||||
@ -130,6 +130,43 @@ export interface HermesSessionStats {
|
||||
status: HermesProbeStatus;
|
||||
}
|
||||
|
||||
/**
 * One session row carried in `HermesSessionList.entries`.
 *
 * Optional facts are modelled as `null` rather than as missing properties.
 * `createdAt` / `updatedAt` are timestamp strings; the web client parses them
 * with `new Date(...)` and prefers `updatedAt` over `createdAt` when ordering.
 * NOTE(review): presumably sourced from the per-instance session index on the
 * backend — confirm against the telemetry repository before relying on it.
 */
export interface HermesSessionEntry {
|
||||
id: string;
|
||||
sessionKey: string;
|
||||
platform: string | null;
|
||||
chatType: string | null;
|
||||
displayName: string | null;
|
||||
createdAt: string | null;
|
||||
updatedAt: string | null;
|
||||
// The task ledger renders these two flags as a "warning" tone
|
||||
// ('resume pending' takes precedence over 'suspended').
suspended: boolean;
|
||||
resumePending: boolean;
|
||||
totalTokens: number | null;
|
||||
estimatedCostUsd: number | null;
|
||||
}
|
||||
|
||||
/**
 * Session entries for one Hermes instance together with the probe status of
 * the section that produced them.
 */
export interface HermesSessionList {
|
||||
entries: HermesSessionEntry[];
|
||||
status: HermesProbeStatus;
|
||||
}
|
||||
|
||||
/**
 * A single session event as exposed to the dashboard UI.
 *
 * The shape carries event metadata only: type, role, timestamp, source file,
 * tool names, item types and status, plus a `summary` string.
 * NOTE(review): `summary` is presumably a redacted/sanitized description, not
 * raw message content — confirm against the backend projection.
 */
export interface HermesSessionEvent {
|
||||
id: string;
|
||||
sessionFile: string;
|
||||
// Timestamp string, or null when the event has none; the client sorts null as epoch 0.
|
||||
timestamp: string | null;
|
||||
role: string | null;
|
||||
eventType: 'message' | 'tool-call' | 'tool-result' | 'reasoning' | 'system' | 'unknown';
|
||||
summary: string;
|
||||
toolNames: string[];
|
||||
itemTypes: string[];
|
||||
status: string | null;
|
||||
}
|
||||
|
||||
/**
 * Session events for one Hermes instance together with the probe status of
 * the section that produced them.
 */
export interface HermesSessionEventList {
|
||||
entries: HermesSessionEvent[];
|
||||
status: HermesProbeStatus;
|
||||
// NOTE(review): presumably the number of session source files the events were read from — confirm against the backend.
|
||||
sourceCount: number;
|
||||
}
|
||||
|
||||
export interface HermesCronEntry {
|
||||
id: string;
|
||||
name: string;
|
||||
@ -201,6 +238,8 @@ export interface HermesTelemetrySnapshot {
|
||||
cached: boolean;
|
||||
instanceId: 'vijay' | 'bheem';
|
||||
sessions: HermesSessionStats;
|
||||
sessionList: HermesSessionList;
|
||||
sessionEvents: HermesSessionEventList;
|
||||
cron: HermesCronList;
|
||||
memory: HermesMemoryList;
|
||||
skills: HermesSkillList;
|
||||
|
||||
54
dashboard/web/src/lib/hermes-telemetry-client.ts
Normal file
54
dashboard/web/src/lib/hermes-telemetry-client.ts
Normal file
@ -0,0 +1,54 @@
|
||||
import { api, type HermesTelemetrySnapshot, type HermesWatchdogAlert } from '@/lib/api';
|
||||
import type { HermesInstanceId, HermesInstanceFilter } from '@/lib/hermes';
|
||||
|
||||
/** Latest telemetry snapshot per Hermes instance; `null` means no snapshot is held for that instance. */
export type HermesTelemetryState = Record<HermesInstanceId, HermesTelemetrySnapshot | null>;
|
||||
|
||||
/** State with no snapshot for either instance. */
export const emptyTelemetryState: HermesTelemetryState = { vijay: null, bheem: null };
|
||||
|
||||
/**
 * Fetches the telemetry snapshot of both Hermes instances in parallel.
 *
 * The two requests are independent, so a failure on one instance must not
 * throw away the snapshot that was fetched successfully for the other. A
 * failed instance is reported as `null` (the "no snapshot" value that
 * `HermesTelemetryState` already models) and logged with `console.warn`.
 * The call only rejects when neither instance could be loaded.
 *
 * @returns Per-instance snapshots; `null` for an instance whose request failed.
 * @throws The failure of the `vijay` request (wrapped in an `Error` when it is
 *   not one already) when both requests fail.
 */
export async function loadAllHermesTelemetry(): Promise<HermesTelemetryState> {
  const [vijay, bheem] = await Promise.allSettled([
    api.getHermesTelemetry('vijay'),
    api.getHermesTelemetry('bheem'),
  ]);

  if (vijay.status === 'rejected' && bheem.status === 'rejected') {
    // Nothing usable came back: surface the failure so callers can show their error state.
    throw vijay.reason instanceof Error ? vijay.reason : new Error(String(vijay.reason));
  }

  // Partial failure: keep the healthy instance, but do not hide the broken one.
  if (vijay.status === 'rejected') {
    console.warn('Hermes telemetry unavailable for instance vijay:', vijay.reason);
  }
  if (bheem.status === 'rejected') {
    console.warn('Hermes telemetry unavailable for instance bheem:', bheem.reason);
  }

  return {
    vijay: vijay.status === 'fulfilled' ? vijay.value : null,
    bheem: bheem.status === 'fulfilled' ? bheem.value : null,
  };
}
|
||||
|
||||
/**
 * Converts a telemetry timestamp into epoch milliseconds for sorting.
 *
 * Missing (`null` / `undefined`) and unparseable values both map to `0`, so
 * they sort after every real timestamp in the newest-first orderings below
 * instead of feeding `NaN` into the sort comparator (which makes the
 * resulting order inconsistent).
 */
function toEpochMs(value: string | number | Date | null | undefined): number {
  const ms = new Date(value ?? 0).getTime();
  return Number.isNaN(ms) ? 0 : ms;
}

/**
 * Returns the loaded snapshots that match the instance filter.
 *
 * @param telemetry Per-instance telemetry state; instances without a snapshot are `null`.
 * @param selectedInstance `'all'` for every instance, otherwise a single instance id.
 * @returns The matching snapshots (vijay before bheem for `'all'`), with
 *   instances that have no snapshot left out.
 */
export function telemetryForFilter(
  telemetry: HermesTelemetryState,
  selectedInstance: HermesInstanceFilter,
): HermesTelemetrySnapshot[] {
  const candidates =
    selectedInstance === 'all' ? [telemetry.vijay, telemetry.bheem] : [telemetry[selectedInstance]];
  // A type predicate narrows the element type without a blanket `as` assertion.
  return candidates.filter((snapshot): snapshot is HermesTelemetrySnapshot => snapshot != null);
}

/**
 * Collects watchdog alerts from the selected instances, newest first.
 * Each alert is tagged with the id of the instance it came from.
 */
export function collectWatchdogAlerts(
  telemetry: HermesTelemetryState,
  selectedInstance: HermesInstanceFilter,
): Array<HermesWatchdogAlert & { instanceId: HermesInstanceId }> {
  return telemetryForFilter(telemetry, selectedInstance)
    .flatMap((snapshot) => snapshot.watchdog.alerts.map((alert) => ({ ...alert, instanceId: snapshot.instanceId })))
    .sort((a, b) => toEpochMs(b.timestamp) - toEpochMs(a.timestamp));
}

/**
 * Collects backup history entries from the selected instances, most recent
 * commit first. Each entry is tagged with its instance id.
 */
export function collectBackupEntries(telemetry: HermesTelemetryState, selectedInstance: HermesInstanceFilter) {
  return telemetryForFilter(telemetry, selectedInstance)
    .flatMap((snapshot) => snapshot.backupHistory.entries.map((entry) => ({ ...entry, instanceId: snapshot.instanceId })))
    .sort((a, b) => toEpochMs(b.committedAt) - toEpochMs(a.committedAt));
}

/**
 * Collects cron entries from the selected instances, tagged with their
 * instance id. The per-instance order is kept as returned (no sorting).
 */
export function collectCronEntries(telemetry: HermesTelemetryState, selectedInstance: HermesInstanceFilter) {
  return telemetryForFilter(telemetry, selectedInstance)
    .flatMap((snapshot) => snapshot.cron.entries.map((entry) => ({ ...entry, instanceId: snapshot.instanceId })));
}

/**
 * Collects session entries from the selected instances, most recently active
 * first. Ordering uses `updatedAt`, falling back to `createdAt`; entries with
 * neither sort last.
 */
export function collectSessionEntries(telemetry: HermesTelemetryState, selectedInstance: HermesInstanceFilter) {
  return telemetryForFilter(telemetry, selectedInstance)
    .flatMap((snapshot) => snapshot.sessionList.entries.map((entry) => ({ ...entry, instanceId: snapshot.instanceId })))
    .sort((a, b) => toEpochMs(b.updatedAt ?? b.createdAt) - toEpochMs(a.updatedAt ?? a.createdAt));
}

/**
 * Collects session events from the selected instances, newest first.
 * Events without a timestamp sort last.
 */
export function collectSessionEvents(telemetry: HermesTelemetryState, selectedInstance: HermesInstanceFilter) {
  return telemetryForFilter(telemetry, selectedInstance)
    .flatMap((snapshot) => snapshot.sessionEvents.entries.map((entry) => ({ ...entry, instanceId: snapshot.instanceId })))
    .sort((a, b) => toEpochMs(b.timestamp) - toEpochMs(a.timestamp));
}
|
||||
98
docs/app-url-bookmarks.md
Normal file
98
docs/app-url-bookmarks.md
Normal file
@ -0,0 +1,98 @@
|
||||
# ByteLyst App URL Bookmarks
|
||||
|
||||
**Owner:** ByteLyst DevOps
|
||||
**Last updated:** 2026-05-31T08:14:55+00:00
|
||||
**Source of truth for bookmarks:** this file
|
||||
**Exposure/security companion:** [`docs/vm-exposure-inventory.md`](vm-exposure-inventory.md)
|
||||
|
||||
Use this as the living bookmark/reference list for deployed apps, dashboards,
|
||||
APIs, and private admin surfaces. When a new app is deployed, add it here in
|
||||
the same change that adds its Caddy route, Compose service, or systemd unit.
|
||||
|
||||
`Last deployed / restarted` means the latest timestamp we have evidence for.
|
||||
For Docker services it is the container `StartedAt` timestamp from
|
||||
`docker inspect`; for systemd services it is the service active-since timestamp.
|
||||
If the deploy time is not known, use `unknown` and update it during the next
|
||||
verified deploy.
|
||||
|
||||
## Update Checklist
|
||||
|
||||
When deploying or changing an app:
|
||||
|
||||
1. Add or update the row in this file.
|
||||
2. Update `Last deployed / restarted` with an exact UTC timestamp.
|
||||
3. Record the repo/service owner and access model.
|
||||
4. If exposure changes, also update [`docs/vm-exposure-inventory.md`](vm-exposure-inventory.md).
|
||||
5. If it is a DevOps dashboard endpoint, also update [`dashboard/ENDPOINTS.md`](../dashboard/ENDPOINTS.md).
|
||||
|
||||
## Primary Dashboards
|
||||
|
||||
| Name | URL | Access | Backend/API | Runtime owner | Last deployed / restarted | Notes |
|
||||
| --- | --- | --- | --- | --- | --- | --- |
|
||||
| DevOps custom dashboard | `https://devops.bytelyst.com` | private-admin/auth | `https://api.bytelyst.com/devops` | `dashboard/docker-compose.yml` (`devops-web`, `devops-backend`) | `2026-05-31T04:02:24Z` web, `2026-05-31T04:02:23Z` backend | Unified ByteLyst DevOps dashboard. Hermes Mission Control lives under `/hermes`. |
|
||||
| DevOps Tailscale entry | `https://srv1491630.tailf85608.ts.net/login` | Tailscale/private-admin/auth | `http://127.0.0.1:4004` | Tailscale serve -> `localhost:3049` | `2026-05-31T04:02:24Z` | Private login path used for VM-side dashboard review. |
|
||||
| Platform admin dashboard | `https://admin.bytelyst.com` | private-admin/auth | `https://api.bytelyst.com/platform/api` | common platform `admin-web` | `unknown` | Caddy route is documented; container was not present in the 2026-05-27 exposure inventory. Verify before relying on it. |
|
||||
| Hermes Mission Control | `https://devops.bytelyst.com/hermes` | private-admin/auth | `https://api.bytelyst.com/devops/api/hermes/*` | DevOps custom dashboard | `2026-05-31T04:02:24Z` | Unified custom Hermes dashboard over Vijay/root and Bheem/Uma. |
|
||||
| Hermes native Vijay dashboard | `http://100.87.53.10:9119/` | Tailscale-only/private-admin | native Hermes service | `hermes-root-dashboard.service` | `2026-05-31T04:02:20Z` | Built-in Hermes dashboard for root/Vijay. No public Caddy route. |
|
||||
| Hermes native Bheem dashboard | `http://100.87.53.10:9120/` | Tailscale-only/private-admin | native Hermes service | `uma-hermes-dashboard.service` | `2026-05-31T04:02:20Z` | Built-in Hermes dashboard for Uma/Bheem. No public Caddy route. |
|
||||
| LLM Lab dashboard | `https://llmlab.bytelyst.com` | private-admin | local/dashboard service | common platform `llmlab-dashboard` | `2026-05-31T04:02:24Z` | Keep private/auth-gated. Local host port `127.0.0.1:3075`. |
|
||||
|
||||
## Public Apps
|
||||
|
||||
| App | Public URL | API URL | Runtime owner | Last deployed / restarted | Notes |
|
||||
| --- | --- | --- | --- | --- | --- |
|
||||
| InvtTrdg | `https://invttrdg.bytelyst.com` | `https://api.bytelyst.com/invttrdg/*` | `/opt/bytelyst/learning_ai_invt_trdg` | `unknown` | Exposure inventory maps web to `:3085` and backend to `:4025`. |
|
||||
| Clock / Chronomind | `https://clock.bytelyst.com` | `https://api.bytelyst.com/chronomind/*` | `/opt/bytelyst/learning_ai_clock` | `2026-05-31T04:02:24Z` web/backend | Local web `127.0.0.1:3030`, backend `127.0.0.1:4011`. |
|
||||
| Notes / Notelett | `https://notes.bytelyst.com` | `https://api.bytelyst.com/notelett/*` | `/opt/bytelyst/learning_ai_notes` | `2026-05-31T04:02:23Z` web, `2026-05-31T04:02:24Z` backend | Local web `127.0.0.1:3000`, backend `127.0.0.1:4016`. |
|
||||
| Tracker | `https://tracker.bytelyst.com` | n/a | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:24Z` | Local web `127.0.0.1:3003`. |
|
||||
| PeakPulse | n/a | `https://api.bytelyst.com/peakpulse/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:24Z` platform stack | Backend is Docker-internal `peakpulse-backend:4010`. |
|
||||
| Jarvis Jr | n/a | `https://api.bytelyst.com/jarvisjr/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:23Z` web, `2026-05-31T04:02:24Z` platform stack | Local web `127.0.0.1:3035`, backend Docker-internal `jarvisjr-backend:4012`. |
|
||||
| Nomgap | Vercel / external | `https://api.bytelyst.com/nomgap/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:24Z` platform stack | Old local `nomgap-web` was retired; backend remains Docker-internal. |
|
||||
| Mindlyst | n/a | `https://api.bytelyst.com/mindlyst/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:24Z` web/platform stack | Local web `127.0.0.1:3050`, backend Docker-internal `mindlyst-backend:4014`. |
|
||||
| LysnrAI | n/a | `https://api.bytelyst.com/lysnrai/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:23Z` dashboard, `2026-05-31T04:02:24Z` platform stack | Local dashboard `127.0.0.1:3002`, backend Docker-internal `lysnrai-backend:4015`. |
|
||||
| Flowmonk | n/a | `https://api.bytelyst.com/flowmonk/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:24Z` web/platform stack | Local web `127.0.0.1:3040`, backend Docker-internal `flowmonk-backend:4017`. |
|
||||
| ActionTrail | n/a | `https://api.bytelyst.com/actiontrail/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:23Z` web, `2026-05-31T04:02:24Z` platform stack | Local web `127.0.0.1:3060`; exposure inventory notes route/backend mapping needs verification. |
|
||||
| LocalMemGPT | n/a | `https://api.bytelyst.com/localmemgpt/*` | `/opt/bytelyst/learning_ai_common_plat` | `2026-05-31T04:02:24Z` web/platform stack | Local web `127.0.0.1:3070`, backend Docker-internal `localmemgpt-backend:4019`. |
|
||||
|
||||
## Shared APIs And Infrastructure
|
||||
|
||||
| Service | URL | Access | Runtime owner | Last deployed / restarted | Notes |
|
||||
| --- | --- | --- | --- | --- | --- |
|
||||
| API gateway | `https://api.bytelyst.com` | public gateway | Caddy/common platform | `2026-05-31T04:02:24Z` caddy | Routes app APIs by path. |
|
||||
| Platform API | `https://api.bytelyst.com/platform/api` | public/auth-required | common platform `platform-service` | `2026-05-31T04:02:24Z` | Auth and platform data API. |
|
||||
| Extraction API | `https://api.bytelyst.com/extraction/*` | public/API-controlled | common platform `extraction-service` | `2026-05-31T04:02:23Z` | Confirm auth posture before exposing new consumers. |
|
||||
| MCP API | `https://api.bytelyst.com/mcp/*` | public/API-controlled | common platform `mcp-server` | `2026-05-31T04:02:23Z` | Confirm public need before widening access. |
|
||||
| Gitea | `https://gitea.bytelyst.com` | public/admin-auth | `gitea-npm-registry` | `2026-05-31T04:02:23Z` | Local direct registry also listens on `:3300`; see registry docs. |
|
||||
| Gitea npm registry | `http://localhost:3300/api/packages/bytelyst/npm/` | VM/local or tunnel | `gitea-npm-registry` | `2026-05-31T04:02:23Z` | Do not use from laptop unless tunneled. See [`docs/gitea-registry-and-package-resolution.md`](gitea-registry-and-package-resolution.md). |
|
||||
| Ollama endpoint | `https://ollama.bytelyst.com` | private-admin target | host `ollama` service | `unknown` | Must not be unauthenticated public. |
|
||||
| Mailpit UI | `http://127.0.0.1:8025` | loopback-only | common platform `mailpit` | `2026-05-31T04:02:23Z` | Dev/test mail UI. |
|
||||
| Loki | `http://127.0.0.1:3100` | loopback-only | common platform `loki` | `2026-05-31T04:02:24Z` | Observability internal. |
|
||||
| Cosmos emulator UI | `http://127.0.0.1:1234` / `http://127.0.0.1:8081` | loopback-only | common platform `cosmos-emulator` | `2026-05-31T04:02:23Z` | Dev/test only; current production data uses real Cosmos for platform. |
|
||||
| Azurite | `http://127.0.0.1:10000` | loopback-only target | common platform `azurite` | `2026-05-31T04:02:24Z` | Check exposure inventory before relying on external access. |
|
||||
|
||||
## Local Host Ports
|
||||
|
||||
These are operational shortcuts, not public bookmarks.
|
||||
|
||||
| Service | Local URL | Public/private equivalent |
|
||||
| --- | --- | --- |
|
||||
| DevOps web container | `http://127.0.0.1:3049` | `https://devops.bytelyst.com` or Tailscale URL |
|
||||
| DevOps backend health | `http://127.0.0.1:4004/health` | `https://api.bytelyst.com/devops/health` if routed |
|
||||
| Platform service health | `http://127.0.0.1:4003/health` | `https://api.bytelyst.com/platform/api` |
|
||||
| Clock web | `http://127.0.0.1:3030` | `https://clock.bytelyst.com` |
|
||||
| Notes web | `http://127.0.0.1:3000` | `https://notes.bytelyst.com` |
|
||||
| InvtTrdg web | `http://127.0.0.1:3085` | `https://invttrdg.bytelyst.com` |
|
||||
| Tracker web | `http://127.0.0.1:3003` | `https://tracker.bytelyst.com` |
|
||||
| Hermes Vijay dashboard | `http://100.87.53.10:9119/` | Tailscale-only |
|
||||
| Hermes Bheem dashboard | `http://100.87.53.10:9120/` | Tailscale-only |
|
||||
|
||||
## Open Verification Items
|
||||
|
||||
- Confirm whether `admin.bytelyst.com` is currently backed by a running
|
||||
`admin-web` container.
|
||||
- Confirm product-facing public URLs for apps listed as `n/a` before sharing
|
||||
them outside the admin team.
|
||||
- Confirm `actiontrail` API route/container port mapping; historical inventory
|
||||
used `api.bytelyst.com/actiontrail/*` while current container metadata shows
|
||||
`actiontrail-backend` as part of the common platform stack.
|
||||
- Replace any `unknown` deploy timestamp during the next verified deploy.
|
||||
@ -37,6 +37,20 @@ Observed on 2026-05-27:
|
||||
|
||||
Before adding any new Caddy hostname, Docker port, or dashboard/API feature, verify that it is not a Hermes dashboard/API public exposure.
|
||||
|
||||
Session privacy policy for dashboard/telemetry surfaces:
|
||||
|
||||
- Treat gateway session content as private by default for both Vijay and Bheem.
|
||||
- Dashboard routes may show counts, statuses, timestamps, IDs, sanitized warning
|
||||
messages, cron names, skill/memory names, and backup commit subjects.
|
||||
- Dashboard telemetry may show sanitized session JSONL event projections:
|
||||
event type, role, timestamp, source filename, tool names, item types, and
|
||||
status. Raw message content remains redacted before it reaches the UI.
|
||||
- Dashboard routes must not expose raw prompts, full session transcripts, raw
|
||||
command output containing secrets, `.env` values, OAuth payloads, raw
|
||||
`state.db`, Telegram tokens, provider keys, or personal message content.
|
||||
- If a future session-event pipeline is added, enable secret and PII redaction
|
||||
at ingestion time and store only the redacted event projection used by the UI.
|
||||
|
||||
```bash
|
||||
# Inspect public Caddy routes and obvious Hermes/API/dashboard references.
|
||||
docker ps --format '{{.Names}} {{.Ports}}' | grep -i caddy || true
|
||||
@ -85,6 +99,60 @@ systemd/hermes-root-backup.service
|
||||
systemd/hermes-root-backup.timer
|
||||
systemd/uma-hermes-backup.service
|
||||
systemd/uma-hermes-backup.timer
|
||||
systemd/hermes-health-watchdog.service
|
||||
systemd/hermes-health-watchdog.timer
|
||||
systemd/uma-hermes-health-watchdog.service
|
||||
systemd/uma-hermes-health-watchdog.timer
|
||||
systemd/hermes-ops-exporter.service
|
||||
systemd/hermes-ops-exporter.timer
|
||||
systemd/uma-hermes-ops-exporter.service
|
||||
systemd/uma-hermes-ops-exporter.timer
|
||||
```
|
||||
|
||||
## Mission Control ops exporter
|
||||
|
||||
Mission Control can read a sanitized per-instance ops export before falling back
|
||||
to live cross-user probes. This reduces brittle root-to-Uma inspection and keeps
|
||||
the dashboard contract free of secrets or session content.
|
||||
|
||||
Tracked exporter:
|
||||
|
||||
```bash
|
||||
scripts/hermes-ops-exporter.py
|
||||
```
|
||||
|
||||
Output paths:
|
||||
|
||||
```text
|
||||
/root/.hermes/ops-export.json
|
||||
/home/uma/.hermes/ops-export.json
|
||||
```
|
||||
|
||||
The JSON contains only service booleans/status, timer timestamps, short Git
|
||||
metadata, restore counts, and whether a Google token file exists. It does not
|
||||
include token values, raw `state.db`, logs, prompt/session text, OAuth payloads,
|
||||
or environment files.
|
||||
|
||||
Install root exporter:
|
||||
|
||||
```bash
|
||||
cp systemd/hermes-ops-exporter.service /etc/systemd/system/hermes-ops-exporter.service
|
||||
cp systemd/hermes-ops-exporter.timer /etc/systemd/system/hermes-ops-exporter.timer
|
||||
systemctl daemon-reload
|
||||
systemctl enable --now hermes-ops-exporter.timer
|
||||
systemctl status hermes-ops-exporter.timer --no-pager
|
||||
```
|
||||
|
||||
Install Uma exporter as user systemd:
|
||||
|
||||
```bash
|
||||
install -d -o uma -g uma /home/uma/.config/systemd/user
|
||||
cp systemd/uma-hermes-ops-exporter.service /home/uma/.config/systemd/user/uma-hermes-ops-exporter.service
|
||||
cp systemd/uma-hermes-ops-exporter.timer /home/uma/.config/systemd/user/uma-hermes-ops-exporter.timer
|
||||
chown uma:uma /home/uma/.config/systemd/user/uma-hermes-ops-exporter.*
|
||||
runuser -u uma -- systemctl --user daemon-reload
|
||||
runuser -u uma -- systemctl --user enable --now uma-hermes-ops-exporter.timer
|
||||
runuser -u uma -- systemctl --user status uma-hermes-ops-exporter.timer --no-pager
|
||||
```
|
||||
|
||||
## Health baseline commands
|
||||
@ -164,6 +232,48 @@ python3 ~/.hermes/scripts/hermes_health_watchdog.py
|
||||
# Healthy output should be empty.
|
||||
```
|
||||
|
||||
Tracked systemd watchdog timers:
|
||||
|
||||
```bash
|
||||
systemctl status hermes-health-watchdog.timer --no-pager
|
||||
systemctl --user --machine=uma@.host status uma-hermes-health-watchdog.timer --no-pager
|
||||
tail -n 20 /root/.hermes/logs/hermes-health-watchdog.log
|
||||
tail -n 20 /home/uma/.hermes/logs/hermes-health-watchdog.log
|
||||
```
|
||||
|
||||
Dashboard warning bridge:
|
||||
|
||||
```bash
|
||||
/var/log/hermes-dashboard-warnings.log
|
||||
```
|
||||
|
||||
The dashboard backend appends deduplicated warning lines there when
|
||||
`HERMES_DASHBOARD_ALERT_LOG` is configured. Both watchdogs tail the same file
|
||||
and route by `instance=vijay`, `instance=bheem`, or `instance=all`.
|
||||
Telegram delivery is attempted only when `~<user>/.config/hermes/telegram`
|
||||
exists with `BOT_TOKEN=`/`CHAT_ID=` or `TELEGRAM_BOT_TOKEN=`/`TELEGRAM_CHAT_ID=`.
|
||||
If that file is absent, the watchdog still writes a local warning log line and
|
||||
records `Telegram delivery skipped or failed`.
|
||||
|
||||
2026-05-31 Telegram delivery validation:
|
||||
|
||||
- `instance=bheem` synthetic warning: consumed only by Uma watchdog; root log
|
||||
had zero matches; Telegram delivery succeeded.
|
||||
- `instance=vijay` synthetic warning: consumed only by root watchdog; Uma log
|
||||
had zero matches; Telegram delivery succeeded.
|
||||
- `instance=all` synthetic warning: consumed by both watchdogs; Telegram
|
||||
delivery succeeded for both chats.
|
||||
- Recovery messages: after each alert, the next healthy watchdog pass sent
|
||||
`recovery: back to healthy` and logged `Telegram recovery delivery succeeded`.
|
||||
- Approval prompt/media validation: root and Uma bots returned Telegram `200`
|
||||
for harmless inline-button prompt delivery and small document upload.
|
||||
- Approval callback execution evidence: live gateway logs contain real
|
||||
`Telegram button resolved 1 approval(s)` entries for root through
|
||||
2026-05-30, including a deny choice, and for Uma on 2026-05-25. Telegram's
|
||||
Bot API cannot synthesize user callback clicks, so callback execution proof
|
||||
comes from these receiver logs plus source review of the Telegram callback
|
||||
handler.
|
||||
|
||||
Persistent backup timers:
|
||||
|
||||
```bash
|
||||
@ -424,9 +534,33 @@ alerts today) follow a small set of conventions worth keeping consistent.
|
||||
(✅ approve / ❌ deny). The dashboard does not yet trigger these — see the
|
||||
Phase 8 delegation brief in `docs/prompts/phase8-telegram-loop.md` for the
|
||||
design that closes the loop end-to-end.
|
||||
- 2026-05-31 delivery smoke test: root and Uma bots both returned Telegram
|
||||
`200` for a harmless inline-button approval prompt. Callback handling was not
|
||||
exercised because that requires a human button press and an action receiver.
|
||||
|
||||
**Media/file delivery**
|
||||
- 2026-05-31 delivery smoke test: root and Uma bots both returned Telegram
|
||||
`200` for a small text document upload.
|
||||
|
||||
**Don't paste secrets**
|
||||
- Bot tokens and chat IDs live in `~<user>/.config/hermes/telegram` mode `600`,
|
||||
never in repo files. The dashboard's `lib/logger.ts` redacts
|
||||
`Authorization` / `Cookie` / `*.token` paths from any logged object so an
|
||||
accidental `req.log.info({ tg })` won't dump credentials.
|
||||
|
||||
## Token audit status
|
||||
|
||||
Checked on 2026-05-31 without printing token values:
|
||||
|
||||
- Gitea package tokens exist at `/opt/bytelyst/.gitea_token`,
|
||||
`/root/.gitea_npm_token`, and `/root/.gitea_npm_token_home`, mode `600`.
|
||||
They can read package metadata from the local Gitea npm registry and receive
|
||||
`403` from `/api/v1/user`, which is consistent with package-only/no-profile
|
||||
scope.
|
||||
- Root GitHub credentials exist in `/root/.git-credentials`. GitHub API scope
|
||||
headers report `gist, read:org, repo, workflow`; this is broader than the
|
||||
desired least-privilege backup scope.
|
||||
- No Uma-owned GitHub token file was found under `/home/uma` during the metadata
|
||||
scan, and the active `uma-hermes-backup.service` still runs as root. Keep the
|
||||
existing backup path running until a fine-grained Uma-owned token is provided,
|
||||
then migrate Bheem self-push and re-audit.
|
||||
|
||||
@ -87,7 +87,7 @@ The `hermes-ops` snapshot becomes the single source of truth for live status. Be
|
||||
- [x] Stop swallowing every failure to `null` indiscriminately: distinguish "unit inactive" from "probe failed/timed out" and surface per-field status so the UI can show *unknown* vs *down*.
|
||||
- [x] Add Zod validation + a stable typed contract for `HermesOpsSnapshot` on the route.
|
||||
- [x] **Add unit tests for the `hermes-ops` repository** (mock `execFile`/fs) — closes the REVIEW_ACTIONS "only `services` has tests" gap for this module.
|
||||
- [ ] Read Bheem/Uma state via a **self-reporting ops exporter** (Decision #2): a read-only `uma` user-systemd timer writes a sanitized JSON snapshot to a known path; the root backend reads + aggregates it (Vijay gets a symmetric exporter). Interim stopgap until it ships: `runuser -u uma -- systemctl --user is-active/is-enabled` instead of the `ps`/`existsSync` checks.
|
||||
- [x] Read Bheem/Uma state via a **self-reporting ops exporter** (Decision #2): a read-only `uma` user-systemd timer writes a sanitized JSON snapshot to a known path; the root backend reads + aggregates it (Vijay gets a symmetric exporter). *(Repo implementation complete 2026-05-31: new `scripts/hermes-ops-exporter.py`, root/Uma systemd timer templates, and backend support for `/root/.hermes/ops-export.json` + `/home/uma/.hermes/ops-export.json` with live probe fallback. VM enablement still belongs to Phase 4 verification.)*
|
||||
|
||||
## Phase 2 — Instance dimension across Mission Control (G2)
|
||||
|
||||
@ -107,9 +107,9 @@ Define the ingestion contract first, then convert panes. Keep any pane with no r
|
||||
- [x] Memory + skills inventory (`hermes memory list --json`, `hermes skills list --json`).
|
||||
- [x] Watchdog alerts feed (tails `~/.hermes/logs/hermes-health-watchdog.log`, severity-bucketed `info`/`warn`/`critical`).
|
||||
- [x] Backup history (`git -C <repo> log` — last 20 commits per backup repo).
|
||||
- [ ] Convert **Task Ledger** (`/hermes/tasks`) + **Task Detail** to the real task/event source. *(Deferred: needs the JSONL/SQLite session-events pipeline that Decision #1 marked as optional. Task Ledger remains seed-data; flip when a real source ships.)*
|
||||
- [~] Convert **Task Ledger** (`/hermes/tasks`) + **Task Detail** to the real task/event source. *(Advanced 2026-05-31: telemetry now reads real `sessions/sessions.json` indexes plus sanitized Hermes session JSONL events per instance. Task Detail renders a live Hermes event timeline with message content redacted at the backend. The planner-style task table remains seed-data until Hermes emits a durable task-id/task-state ledger rather than only session events.)*
|
||||
- [~] Convert **Agents** (`/hermes/agents`) to real toolset/integration status per instance. *(Partial: `/hermes/agents` now renders a "Memory & Skills inventory (live)" SectionCard backed by the Phase 3 telemetry endpoint per instance — `hermes memory list` / `hermes skills list` rendered with per-section probe-status badges, item counts, and the first N entries each. Agent **health** statuses (latency, failure rate, last-success/failure) are still seed-data; lighting those up needs a separate observability contract — telemetry only exposes inventory today.)*
|
||||
- [ ] Convert **History** (`/hermes/history`) to real session/cron/backup trends. *(Deferred: depends on real session timeseries.)*
|
||||
- [~] Convert **History** (`/hermes/history`) to real session/cron/backup trends. *(Advanced 2026-05-31: History now renders live sanitized session JSONL events, session index entries, cron count, watchdog alert count, backup commit count, and a live artifact timeline from telemetry. The weekly chart/failure categories remain seed trend models until Hermes emits an aggregate durable analytics timeseries.)*
|
||||
- [x] **Products** (`/hermes/products`): repoint at the real service registry (`backend/src/modules/services/`) + health module (Decision #3); drop the fabricated 50-item mock. Optional manual entries for not-yet-deployed products come later. *(Page rewritten: top "Live services" section sources from `api.getServices()` joined with `api.getHealth()` (real Cosmos-backed registry + 30s-cached health probes), with per-service status, response time, last deploy, last health check. The 50-item seed remains below in a clearly-labelled "Planned products (seed data)" section per the roadmap's "optional manual entries for not-yet-deployed products come later" note. New E2E mocks for `/api/services` + `/api/health` keep the suite deterministic.)*
|
||||
|
||||
## Phase 4 — Bheem/Uma parity so the dashboard shows two equal instances (G7)
|
||||
@ -118,11 +118,11 @@ This is the biggest operational asymmetry and the reason half the ops-panel warn
|
||||
|
||||
> **VM ops, not codebase work.** This phase requires sudo on the Hostinger VM, Uma-owned GitHub credentials, and Telegram bot tokens — none of it is editable in this repo. The full delegation brief is in [`docs/prompts/phase4-bheem-uma-parity.md`](./prompts/phase4-bheem-uma-parity.md). When the brief's Definition-of-Done is met, tick the boxes below and the summary line at the bottom of this file.
|
||||
|
||||
- [ ] Stand up a **Uma persistent backup repo + `uma-hermes-backup.timer`** mirroring the root design (sanitized `hermes_persistent_backup/`, secrets and `state.db` excluded), pushing to `umadev0931/uma_hostinger_hermes_vm` **with a Uma-owned, repo-scoped token (Bheem self-pushes; root no longer pushes Uma's backup — Decision #5)**.
|
||||
- [ ] Install a **Uma health watchdog** (mirror `scripts/hermes-health-watchdog.py`), silent-on-success, alerting Uma's Telegram.
|
||||
- [~] Stand up a **Uma persistent backup repo + `uma-hermes-backup.timer`** mirroring the root design (sanitized `hermes_persistent_backup/`, secrets and `state.db` excluded), pushing to `umadev0931/uma_hostinger_hermes_vm` **with a Uma-owned, repo-scoped token (Bheem self-pushes; root no longer pushes Uma's backup — Decision #5)**. *(Live read-only check 2026-05-31: `uma-hermes-backup.timer` is active, repo HEAD is `a4828db`, repo status is clean, and `/home/uma/.hermes/google_token.json` exists. Still needs explicit token-scope/ownership audit before marking fully complete.)*
|
||||
- [~] Install a **Uma health watchdog** (mirror `scripts/hermes-health-watchdog.py`), silent-on-success, alerting Uma's Telegram. *(Installed 2026-05-31 as `uma-hermes-health-watchdog.timer`; `/home/uma/.hermes/logs/hermes-health-watchdog.log` now exists and reports healthy after fixing user-systemd gateway probing. Telegram delivery is wired but not fully validated because `/home/uma/.config/hermes/telegram` is absent.)*
|
||||
- [ ] Run the **first Uma restore rehearsal** into a temporary `HERMES_HOME`; document in `docs/hermes-operations.md` / `docs/hermes-disaster-recovery.md`.
|
||||
- [ ] Schedule a **quarterly Uma restore-drill reminder** (parity with root).
|
||||
- [ ] Confirm these close the corresponding Bheem warnings emitted by `getHermesOpsSnapshot()` (backup timer active, repo HEAD readable + clean, Google token present).
|
||||
- [~] Confirm these close the corresponding Bheem warnings emitted by `getHermesOpsSnapshot()` (backup timer active, repo HEAD readable + clean, Google token present). *(Partial live evidence 2026-05-31: backup timer active, repo HEAD readable/clean, Google token present, and Uma watchdog log now exists. Still open for Telegram credential validation + Uma-owned token migration.)*
|
||||
|
||||
## Phase 5 — Dashboard app hardening (G5)
|
||||
|
||||
@ -141,21 +141,21 @@ This is the biggest operational asymmetry and the reason half the ops-panel warn
|
||||
- [x] Deep links from the ops panel → Task Ledger filtered to the relevant instance/most-recent work. *(Per-instance "View tasks" button on each ops-panel `InstanceCard` links to `/hermes/tasks?instance=<id>`. `HermesInstanceProvider` now hydrates from the `?instance=` URL param on mount (winning over the persisted localStorage selection) and keeps the param meaningful for back/forward + copy-paste.)*
|
||||
- [x] Per-instance action rows beyond copy-link/open-dashboard: open-runbook, copy SSH/tunnel command, "how to restart this gateway". *(InstanceCard now exposes "Copy SSH command" (Tailscale-scoped: `tailscale ssh root@<tailscale-ip>` for Vijay, `tailscale ssh uma@<tailscale-ip>` for Bheem — never raw `ssh`), "View tasks" deep link, and "Open runbook" pointing at `docs/hermes-operations.md`. "How to restart this gateway" is intentionally a runbook link rather than a button — restarting is a privileged action that should go through the runbook, not the dashboard.)*
|
||||
- [x] Optional dark/light theme toggle if the shell supports it. *(`components/theme-toggle.tsx` Sun/Moon button mounted in the Hermes layout next to the instance switcher. Persists in localStorage `bytelyst.theme.v1`; an inline FOUC-prevention script in the root layout reads the same key and applies `data-theme` to `<html>` before React hydrates so the first paint matches the user's last choice. The design system already had `[data-theme="light"]` overrides in `styles/tokens.css`; the toggle just flips them on.)*
|
||||
- [ ] Unified alerts feed across both instances on the overview. *(Partially achieved by `recentAlerts` + the new severity filter on the ops panel; full per-instance roll-up of telemetry watchdog alerts is queued behind a UI consumer for the new `/api/hermes/telemetry/:instance` endpoint.)*
|
||||
- [x] Unified alerts feed across both instances on the overview. *(Completed 2026-05-31: `/hermes` now renders "Unified live alerts" from both telemetry endpoints, filtered by the global instance switcher, with watchdog alerts, session totals, cron entries, and backup commits.)*
|
||||
|
||||
## Phase 7 — Security & access (G8)
|
||||
|
||||
- [x] Require authentication on the DevOps dashboard's hermes routes/endpoints (reuse platform-service auth already used elsewhere). *(Both `/api/hermes/ops` and the new `/api/hermes/telemetry/:instance` now gate on `requireAdmin`. Privilege-surface table in `dashboard/DEPLOYMENT.md` updated to match. The previous "read-only ops snapshot, no auth" carve-out is gone — all Hermes routes are admin-only.)*
|
||||
- [ ] Decide and document `security.redact_secrets` and `privacy.redact_pii` for gateway sessions (per instance). *(Deferred — needs a founder decision on PII handling for session content; not a code-only change.)*
|
||||
- [ ] Finish the GitHub/Gitea **least-privilege token audit** (root currently pushes both repos) and rotate any migrated/exposed credentials — completed naturally by Decision #5 (Bheem self-pushes with its own scoped token). *(Resolves naturally when Phase 4 ships — see the Phase 4 delegation brief.)*
|
||||
- [x] Decide and document `security.redact_secrets` and `privacy.redact_pii` for gateway sessions (per instance). *(Documented 2026-05-31 in `docs/hermes-operations.md`: dashboard surfaces may expose only redacted projections such as counts/status/timestamps/sanitized warnings/cron names/backup subjects; raw prompts, transcripts, command output with secrets, `.env`, OAuth payloads, `state.db`, Telegram/provider tokens, and personal message content are prohibited. Any future event pipeline must redact at ingestion.)*
|
||||
- [~] Finish the GitHub/Gitea **least-privilege token audit** (root currently pushes both repos) and rotate any migrated/exposed credentials — completed naturally by Decision #5 (Bheem self-pushes with its own scoped token). *(Audited 2026-05-31 without printing tokens: Gitea package tokens can read package metadata and get `403` from `/api/v1/user`; root GitHub token reports broad scopes `gist, read:org, repo, workflow`; no Uma-owned GitHub token file was found, and active `uma-hermes-backup.service` still runs as root. Rotation/migration requires a fine-grained Uma-owned token.)*
|
||||
- [x] Keep all hermes data private-only; never expose the `hermes-ops` snapshot or task data on a public route. *(Verified: no Caddy/public route added; the dashboard is bound to `127.0.0.1` and reached via Tailscale or SSH tunnel only — see `dashboard/DEPLOYMENT.md` "Ports — quick reference" + "Privilege Surface" sections. With this commit's `requireAdmin` change, even an attacker with internal network access still needs a valid admin JWT to read the ops snapshot.)*
|
||||
|
||||
## Phase 8 — Notifications & Telegram loop (G9)
|
||||
|
||||
> **Mostly VM ops + bot-token configuration**, with two small backend hooks. Full delegation brief in [`docs/prompts/phase8-telegram-loop.md`](./prompts/phase8-telegram-loop.md). The dashboard's documentation half is already done — see `docs/hermes-operations.md` "Telegram Notification Convention".
|
||||
|
||||
- [ ] Push new dashboard-detected warnings to the correct Telegram (Vijay → root chat, Bheem → Uma chat), reusing the watchdog delivery path; silent on healthy. *(Design captured in the brief: `lib/dashboard-alerts.ts` writes new warnings to a tag-prefixed log; both watchdogs tail it. Implementation gated on Phase 4 (Uma watchdog must exist first) and on bot tokens.)*
|
||||
- [ ] Validate the Telegram approval-prompt flow and media/file delivery end-to-end (the two unchecked v1 items). *(Brief item 3.)*
|
||||
- [x] Push new dashboard-detected warnings to the correct Telegram (Vijay → root chat, Bheem → Uma chat), reusing the watchdog delivery path; silent on healthy. *(Validated 2026-05-31: `instance=bheem` warning was consumed only by Uma watchdog and delivered to Telegram; `instance=vijay` only by root; `instance=all` by both. Follow-up healthy pass sent Telegram recovery messages for both instances.)*
|
||||
- [x] Validate the Telegram approval-prompt flow and media/file delivery end-to-end (the two unchecked v1 items). *(Validated 2026-05-31: root and Uma bots returned Telegram `200` for harmless inline-button approval prompt delivery and small document upload. Existing live gateway logs also prove real inline approval callback execution: root recorded multiple `Telegram button resolved 1 approval(s)` entries through 2026-05-30, including `choice=deny`; Uma recorded `Telegram button resolved 1 approval(s)` entries on 2026-05-25. Bot API cannot synthesize user callback clicks, so this status is based on live receiver logs plus source review of the callback handler.)*
|
||||
- [x] Preserve the numbered-emoji progress convention (`1️⃣`, `2️⃣`, …) for completion updates. *(Codified in `docs/hermes-operations.md` under a new "Telegram Notification Convention" section, alongside the routing-per-instance, silent-on-healthy, and never-paste-secrets rules. The brief references this as the source of truth so VM-side implementers stay consistent.)*
|
||||
|
||||
---
|
||||
@ -182,25 +182,25 @@ export interface HermesInstanceRef {
|
||||
This roadmap is complete when:
|
||||
|
||||
- [ ] The overview, ledger, agents, and history panes render **real data for both Vijay and Bheem**, filterable by instance; only panes without a real source remain (clearly labeled) seed data.
|
||||
- [ ] `hermes-ops` is cached, uses robust Uma user-systemd checks, distinguishes unknown vs down, and has unit tests.
|
||||
- [x] `hermes-ops` is cached, uses robust Uma user-systemd checks, distinguishes unknown vs down, and has unit tests.
|
||||
- [ ] Bheem has a persistent backup repo + timer, a watchdog, and one completed restore rehearsal — and the dashboard shows **2/2 healthy** with zero standing Bheem warnings.
|
||||
- [ ] CI is green on the correct path, lint is real, and coverage includes auth/csrf/orchestrator/health/hermes-ops.
|
||||
- [ ] Hermes routes require auth and remain private-only; redact policies are decided and documented.
|
||||
- [ ] Dashboard warnings reach the correct Telegram chat per instance.
|
||||
- [ ] Hermes routes require auth and remain private-only; redact policies are decided and documented. *(Auth/private-only/redaction are complete; still open only because the GitHub/Gitea least-privilege token audit remains tied to Phase 4.)*
|
||||
- [x] Dashboard warnings reach the correct Telegram chat per instance.
|
||||
|
||||
## Implementation Status Checklist
|
||||
|
||||
Update only with evidence (source review, tests, build output, or browser/VM verification).
|
||||
|
||||
- [x] Phase 0 — Guardrails reconfirmed (2026-05-30 pass; remains "must hold throughout")
|
||||
- [x] Phase 1 — `hermes-ops` hardened + tested
|
||||
- [x] Phase 1 — `hermes-ops` hardened + tested, including sanitized ops-export support
|
||||
- [x] Phase 2 — Instance dimension + switcher
|
||||
- [x] Phase 3 — Real telemetry ingestion + Products pane converted (Task Ledger / Agents / History are partially live from sanitized session JSONL telemetry; the remaining seed-data portions are listed in the Phase 3 notes)
|
||||
- [ ] Phase 4 — Bheem/Uma parity (backup, watchdog, restore drill)
|
||||
- [x] Phase 5 — App/CI hardening (P0/P1/P2 done; P2 follow-ups in DEPLOYMENT.md mitigation roadmap remain)
|
||||
- [x] Phase 6 — UX polish (severity tags + deep links + per-instance actions; trend cards + theme toggle deferred)
|
||||
- [x] Phase 7 — Security & access (auth on hermes routes + privacy stance documented; redact_secrets/redact_pii decision deferred)
|
||||
- [ ] Phase 8 — Notifications & Telegram (convention codified; delivery loop is VM ops, see brief)
|
||||
- [x] Phase 6 — UX polish (severity tags + deep links + per-instance actions; trend cards + theme toggle + unified live alerts)
|
||||
- [x] Phase 7 — Security & access (auth on hermes routes + privacy stance documented; token audit remains tied to Phase 4)
|
||||
- [x] Phase 8 — Notifications & Telegram (warning routing, recovery messages, media delivery, and approval callback evidence validated 2026-05-31)
|
||||
|
||||
## Decisions (resolved 2026-05-30)
|
||||
|
||||
|
||||
@ -4,6 +4,11 @@ Common operational paths for the team.
|
||||
|
||||
Use this file as the routing guide. For the exact support boundary, cross-check `docs/supported-scripts.md`.
|
||||
|
||||
For app/dashboard bookmarks and deployment URL references, use
|
||||
[`docs/app-url-bookmarks.md`](app-url-bookmarks.md). Keep that file updated
|
||||
whenever an app URL, dashboard URL, or API route is added or changed, or a last deploy timestamp
|
||||
changes.
|
||||
|
||||
---
|
||||
|
||||
## Hostinger VM Maintenance
|
||||
|
||||
@ -12,12 +12,21 @@ import subprocess
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlencode
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
DISK_WARN_PERCENT = int(os.getenv("HERMES_WATCHDOG_DISK_WARN_PERCENT", "85"))
|
||||
MEMORY_WARN_PERCENT = int(os.getenv("HERMES_WATCHDOG_MEMORY_WARN_PERCENT", "90"))
|
||||
BACKUP_STALE_MINUTES = int(os.getenv("HERMES_WATCHDOG_BACKUP_STALE_MINUTES", "90"))
|
||||
BACKUP_JOB_NAME = os.getenv("HERMES_WATCHDOG_BACKUP_JOB_NAME", "Sync Hermes persistent-data backup to GitHub")
|
||||
GATEWAY_SERVICE = os.getenv("HERMES_WATCHDOG_GATEWAY_SERVICE", "hermes-gateway.service")
|
||||
SYSTEMD_SCOPE = os.getenv("HERMES_WATCHDOG_SYSTEMD_SCOPE", "system")
|
||||
INSTANCE_ID = os.getenv("HERMES_WATCHDOG_INSTANCE", "vijay")
|
||||
TELEGRAM_CONFIG = Path(os.getenv("HERMES_WATCHDOG_TELEGRAM_CONFIG", str(Path.home() / ".config/hermes/telegram")))
|
||||
WATCHDOG_LOG = Path(os.getenv("HERMES_WATCHDOG_LOG_PATH", str(Path.home() / ".hermes/logs/hermes-health-watchdog.log")))
|
||||
DASHBOARD_ALERT_LOG = Path(os.getenv("HERMES_DASHBOARD_ALERT_LOG", "/var/log/hermes-dashboard-warnings.log"))
|
||||
DASHBOARD_ALERT_STATE = Path(os.getenv("HERMES_DASHBOARD_ALERT_STATE", str(Path.home() / ".hermes/logs/dashboard-alerts.offset")))
|
||||
ALERT_STATE = Path(os.getenv("HERMES_WATCHDOG_ALERT_STATE", str(Path.home() / ".hermes/logs/watchdog-alert-active")))
|
||||
DOCKER_CONTAINERS = [
|
||||
item.strip()
|
||||
for item in os.getenv("HERMES_WATCHDOG_DOCKER_CONTAINERS", "caddy,gitea-npm-registry").split(",")
|
||||
@ -30,13 +39,99 @@ def run(cmd: list[str], timeout: int = 20) -> subprocess.CompletedProcess[str]:
|
||||
return subprocess.run(cmd, text=True, capture_output=True, timeout=timeout, check=False)
|
||||
|
||||
|
||||
def utc_now() -> str:
    """Return the current UTC time as an ISO-8601 string with second precision."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat(timespec="seconds")
|
||||
|
||||
|
||||
def append_watchdog_log(severity: str, message: str) -> None:
    """Append one timestamped entry to the watchdog log file.

    The log's parent directory is created on demand. Each entry has the
    form ``<UTC timestamp> <SEVERITY> <message>`` followed by a newline,
    with the severity upper-cased.
    """
    log_dir = WATCHDOG_LOG.parent
    log_dir.mkdir(parents=True, exist_ok=True)
    with WATCHDOG_LOG.open("a", encoding="utf-8") as log_file:
        level = severity.upper()
        log_file.write(f"{utc_now()} {level} {message}\n")
|
||||
|
||||
|
||||
def read_key_file(path: Path) -> dict[str, str]:
    """Parse a simple ``KEY=value`` file into a dictionary.

    Each line is split at the first ``=``. Keys and values are stripped of
    surrounding whitespace; lines without ``=``, with an empty key, or with
    an empty value are ignored.

    Returns an empty dict when the file cannot be read at all (missing,
    unreadable, a directory, or not valid UTF-8) so that a broken config
    degrades to "no credentials" instead of crashing the caller.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # FileNotFoundError, PermissionError and IsADirectoryError are all
        # OSError subclasses; treat every one of them as "nothing configured".
        return {}

    values: dict[str, str] = {}
    for line in text.splitlines():
        key, sep, value = line.partition("=")
        key, value = key.strip(), value.strip()
        if sep and key and value:
            values[key] = value
    return values
|
||||
|
||||
|
||||
def telegram_credentials() -> tuple[str | None, str | None]:
    """Look up the Telegram bot token and chat id from the key file at ``TELEGRAM_CONFIG``.

    The short key names (``BOT_TOKEN`` / ``CHAT_ID``) take precedence over
    the ``TELEGRAM_``-prefixed ones. Either element of the returned pair is
    ``None`` when neither spelling is present.
    """
    config = read_key_file(TELEGRAM_CONFIG)

    token = config.get("BOT_TOKEN")
    if not token:
        token = config.get("TELEGRAM_BOT_TOKEN")

    chat_id = config.get("CHAT_ID")
    if not chat_id:
        chat_id = config.get("TELEGRAM_CHAT_ID")

    return token, chat_id
|
||||
|
||||
|
||||
def send_telegram(message: str) -> bool:
    """Send ``message`` to the configured Telegram chat; best effort.

    Returns ``True`` only when the Bot API answered with a 2xx status.
    Returns ``False`` when credentials are missing or when the request fails
    for any reason, so a delivery problem never aborts the watchdog run.
    """
    token, chat_id = telegram_credentials()
    if not token or not chat_id:
        return False

    # The Bot API rejects sendMessage texts longer than 4096 characters, which
    # would drop the whole alert. Truncate with some margin so an oversized
    # batch of alerts is still delivered (shortened) instead of not at all.
    max_chars = 4000
    if len(message) > max_chars:
        suffix = "\n… (truncated)"
        message = message[: max_chars - len(suffix)] + suffix

    data = urlencode({"chat_id": chat_id, "text": message}).encode("utf-8")
    req = Request(f"https://api.telegram.org/bot{token}/sendMessage", data=data, method="POST")
    try:
        with urlopen(req, timeout=10) as response:  # noqa: S310 - token-protected Telegram API endpoint.
            return 200 <= response.status < 300
    except Exception:
        # Deliberately swallowed: the caller logs "skipped or failed" based on
        # the boolean result.
        return False
|
||||
|
||||
|
||||
def mark_alert_active() -> None:
    """Write the current UTC timestamp to the ``ALERT_STATE`` marker file.

    The marker's parent directory is created if it does not exist yet.
    """
    marker = ALERT_STATE
    marker.parent.mkdir(parents=True, exist_ok=True)
    marker.write_text(utc_now(), encoding="utf-8")
|
||||
|
||||
|
||||
def clear_alert_active() -> bool:
    """Delete the ``ALERT_STATE`` marker file.

    Returns ``True`` only when a marker existed and was removed by this
    call; ``False`` when there was nothing to remove.
    """
    if ALERT_STATE.exists():
        try:
            ALERT_STATE.unlink()
        except FileNotFoundError:
            # The marker vanished between the existence check and the unlink.
            pass
        else:
            return True
    return False
|
||||
|
||||
|
||||
def read_dashboard_alerts() -> list[str]:
    """Return dashboard alert lines appended since the last call that target this instance.

    The read position is persisted as a byte offset in ``DASHBOARD_ALERT_STATE``.
    A stored offset that is negative or larger than the current log size
    (e.g. after log rotation/truncation) restarts from the beginning.

    Only lines containing ``instance=<INSTANCE_ID>`` or ``instance=all`` are
    returned. Any I/O failure yields an empty list so the watchdog keeps
    running.
    """
    if not DASHBOARD_ALERT_LOG.exists():
        return []

    try:
        previous = int(DASHBOARD_ALERT_STATE.read_text(encoding="utf-8").strip() or "0")
    except Exception:
        # Missing or corrupt state file: start from the top of the log.
        previous = 0

    try:
        size = DASHBOARD_ALERT_LOG.stat().st_size
        start = previous if 0 <= previous <= size else 0
        # Read raw bytes so the persisted offset is a true byte position that
        # is directly comparable with st_size.
        with DASHBOARD_ALERT_LOG.open("rb") as fh:
            fh.seek(start)
            chunk = fh.read()
        # errors="replace" keeps one undecodable byte from blocking every
        # later alert (an exception here would leave the offset unchanged).
        text = chunk.decode("utf-8", errors="replace")
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        DASHBOARD_ALERT_STATE.parent.mkdir(parents=True, exist_ok=True)
        DASHBOARD_ALERT_STATE.write_text(str(start + len(chunk)), encoding="utf-8")
    except Exception:
        return []

    own_tag = f"instance={INSTANCE_ID}"
    return [line for line in lines if own_tag in line or "instance=all" in line]
|
||||
|
||||
|
||||
def check_gateway(alerts: list[str]) -> None:
|
||||
result = run(["systemctl", "is-active", GATEWAY_SERVICE])
|
||||
cmd = ["systemctl", "--user", "is-active", GATEWAY_SERVICE] if SYSTEMD_SCOPE == "user" else ["systemctl", "is-active", GATEWAY_SERVICE]
|
||||
result = run(cmd)
|
||||
if result.stdout.strip() != "active":
|
||||
alerts.append(f"gateway service `{GATEWAY_SERVICE}` is not active: `{result.stdout.strip() or result.stderr.strip() or 'unknown'}`")
|
||||
|
||||
|
||||
def check_backup_cron(alerts: list[str]) -> None:
|
||||
if not BACKUP_JOB_NAME:
|
||||
return
|
||||
result = run(["hermes", "cron", "list"], timeout=30)
|
||||
out = result.stdout + result.stderr
|
||||
if result.returncode != 0:
|
||||
@ -126,16 +221,32 @@ def main() -> int:
|
||||
check(alerts)
|
||||
except Exception as exc: # noqa: BLE001 - watchdog should alert, not crash silently
|
||||
alerts.append(f"{check.__name__} errored: {exc}")
|
||||
alerts.extend(f"dashboard alert: {line}" for line in read_dashboard_alerts())
|
||||
|
||||
if alerts:
|
||||
print("🚨 ByteLyst Hermes watchdog alert")
|
||||
header = f"ByteLyst Hermes watchdog alert ({INSTANCE_ID})"
|
||||
append_watchdog_log("WARNING", header)
|
||||
print("🚨 " + header)
|
||||
for item in alerts:
|
||||
append_watchdog_log("WARNING", item)
|
||||
print(f"- {item}")
|
||||
print(
|
||||
footer = (
|
||||
"\nSuggested first checks: `systemctl status hermes-gateway --no-pager`, "
|
||||
"`hermes cron list`, `df -h /`, `free -h`, `docker ps`."
|
||||
)
|
||||
print(footer)
|
||||
sent = send_telegram("🚨 " + header + "\n" + "\n".join(f"- {item}" for item in alerts) + footer)
|
||||
append_watchdog_log("INFO" if sent else "WARNING", "Telegram delivery succeeded" if sent else "Telegram delivery skipped or failed")
|
||||
mark_alert_active()
|
||||
return 0
|
||||
recovered = clear_alert_active()
|
||||
if recovered:
|
||||
message = f"✅ ByteLyst Hermes watchdog recovery ({INSTANCE_ID})\nBack to healthy."
|
||||
sent = send_telegram(message)
|
||||
append_watchdog_log("INFO", "recovery: back to healthy")
|
||||
append_watchdog_log("INFO" if sent else "WARNING", "Telegram recovery delivery succeeded" if sent else "Telegram recovery delivery skipped or failed")
|
||||
else:
|
||||
append_watchdog_log("INFO", "healthy")
|
||||
return 0
|
||||
|
||||
|
||||
|
||||
143
scripts/hermes-ops-exporter.py
Executable file
143
scripts/hermes-ops-exporter.py
Executable file
@ -0,0 +1,143 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Write a sanitized Hermes ops snapshot for the unified dashboard.
|
||||
|
||||
Run this as the Hermes instance owner (root for Vijay, uma for Bheem). It
|
||||
writes booleans, counts, timestamps, and short Git metadata only. It never
|
||||
copies tokens, state.db, logs, prompts, session content, or environment files.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Exporter settings. Every value can be overridden through the environment;
# the defaults below apply when the corresponding variable is unset.
HERMES_HOME = Path(os.environ.get("HERMES_HOME", str(Path.home() / ".hermes")))
OUTPUT_PATH = Path(os.environ.get("HERMES_OPS_EXPORT_PATH", str(HERMES_HOME / "ops-export.json")))

# systemd unit names that get probed.
GATEWAY_SERVICE = os.environ.get("HERMES_GATEWAY_SERVICE", "hermes-gateway.service")
DASHBOARD_SERVICE = os.environ.get("HERMES_DASHBOARD_SERVICE", "hermes-root-dashboard.service")
BACKUP_TIMER = os.environ.get("HERMES_BACKUP_TIMER", "hermes-root-backup.timer")

# Git checkout of the persistent backup repository.
BACKUP_REPO = Path(
    os.environ.get("HERMES_BACKUP_REPO", str(Path.home() / "repos" / "bytelyst_hostinger_hermes_vm"))
)
|
||||
|
||||
|
||||
def run(cmd: list[str], cwd: Path | None = None, timeout: int = 10) -> tuple[bool, str]:
    """Run ``cmd`` and return ``(ran, stdout)``.

    ``ran`` is ``True`` whenever the process was started and finished within
    ``timeout`` seconds; the exit status is intentionally not inspected, so
    callers interpret the (stripped) stdout themselves. ``(False, "")`` is
    returned when the command could not be executed at all or timed out.
    """
    try:
        result = subprocess.run(cmd, cwd=cwd, text=True, capture_output=True, timeout=timeout, check=False)
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers a missing binary as well as PermissionError and
        # NotADirectoryError (e.g. an unusable ``cwd``), none of which should
        # abort the whole export.
        return False, ""
    return True, result.stdout.strip()
|
||||
|
||||
|
||||
def probe_active(unit: str) -> dict[str, Any]:
    """Report whether systemd unit ``unit`` is active in user or system scope.

    The user manager is queried first, then the system manager. The unit is
    ``up`` as soon as either scope answers ``active``. It is ``down`` when at
    least one scope produced an answer but none was ``active``, and
    ``unknown`` when no scope produced any answer (systemctl missing, timed
    out, or printed nothing).
    """
    answered = False
    for scope_args in (["--user"], []):
        ran, out = run(["systemctl", *scope_args, "is-active", unit])
        if ran and out == "active":
            return {"active": True, "status": "up"}
        # An empty stdout means this scope gave no usable answer, so it must
        # not be counted as evidence that the unit is down.
        if ran and out:
            answered = True
    return {"active": False, "status": "down" if answered else "unknown"}
|
||||
|
||||
|
||||
def probe_enabled(unit: str) -> bool:
    """Return ``True`` when ``unit`` is reported ``enabled`` in user or system scope.

    The user manager is asked first; the system manager is consulted
    whenever the user scope did not answer ``enabled``, not only when
    systemctl itself could not be executed.
    """
    for scope_args in (["--user"], []):
        ran, out = run(["systemctl", *scope_args, "is-enabled", unit])
        if ran and out == "enabled":
            return True
    return False
|
||||
|
||||
|
||||
def probe_timer(name: str) -> dict[str, Any]:
    """Collect activity and schedule information for systemd timer ``name``.

    Returns a dict with ``name``, ``active``, ``status`` (from
    ``probe_active``) and the raw ``NextElapseUSecRealtime`` /
    ``LastTriggerUSec`` property values as ``nextRun`` / ``lastRun``
    (``None`` when systemd reports no value).

    Properties are read from the user manager first; the system manager is
    used when the user scope could not be queried or returned no values.
    """
    active = probe_active(name)

    show_args = ["show", name, "-p", "NextElapseUSecRealtime", "-p", "LastTriggerUSec", "--no-pager"]
    props: dict[str, str | None] = {}
    for scope_args in (["--user"], []):
        ran, out = run(["systemctl", *scope_args, *show_args])
        if not ran:
            continue
        props = {}
        for line in out.splitlines():
            key, _, value = line.partition("=")
            props[key] = value or None
        if any(props.values()):
            # This scope has real schedule data; no need to look further.
            break

    return {
        "name": name,
        "active": active["active"],
        "status": active["status"],
        "nextRun": props.get("NextElapseUSecRealtime"),
        "lastRun": props.get("LastTriggerUSec"),
    }
|
||||
|
||||
|
||||
def probe_repo(path: Path) -> dict[str, Any]:
    """Summarize the git checkout at ``path`` using short, non-sensitive metadata.

    ``status`` is ``up`` only when ``git rev-parse`` actually produced a HEAD;
    otherwise it is ``unknown``. ``clean`` is ``True`` only for a readable
    repository whose ``git status --porcelain`` output is empty. ``size`` is
    always ``None`` here.
    """
    ran_head, head = run(["git", "rev-parse", "--short", "HEAD"], cwd=path)
    ran_branch, branch = run(["git", "branch", "--show-current"], cwd=path)
    ran_status, status = run(["git", "status", "--porcelain"], cwd=path)
    ran_commit, last_commit = run(["git", "log", "-1", "--format=%cI"], cwd=path)

    # run() does not look at exit codes, so a directory that is not a git
    # repository still "ran" but printed nothing. Require a HEAD before
    # claiming the repo is readable or clean.
    readable = ran_head and bool(head)

    return {
        "path": str(path),
        "branch": branch if ran_branch and branch else None,
        "clean": readable and ran_status and status == "",
        "head": head if readable else None,
        "lastCommitAt": last_commit if ran_commit and last_commit else None,
        "size": None,
        "status": "up" if readable else "unknown",
    }
|
||||
|
||||
|
||||
def restore_stats(path: Path) -> dict[str, int | None]:
    """Count files and cron jobs recorded in the backup under ``path``.

    ``restoredFileCount`` is the length of the ``files`` list in
    ``hermes_persistent_backup/MANIFEST.json``. ``restoredCronJobs`` is the
    length of the job list in ``hermes_persistent_backup/cron/jobs.json``,
    which may be either a bare list or a dict with a ``jobs`` list. Either
    value is ``None`` when its file is missing, unparsable, or not shaped as
    expected.
    """
    file_count: int | None = None
    try:
        manifest_file = path / "hermes_persistent_backup" / "MANIFEST.json"
        entries = json.loads(manifest_file.read_text(encoding="utf-8")).get("files")
        if isinstance(entries, list):
            file_count = len(entries)
    except Exception:
        file_count = None

    cron_count: int | None = None
    try:
        jobs_file = path / "hermes_persistent_backup" / "cron" / "jobs.json"
        parsed = json.loads(jobs_file.read_text(encoding="utf-8"))
        if isinstance(parsed, dict):
            parsed = parsed.get("jobs")
        if isinstance(parsed, list):
            cron_count = len(parsed)
    except Exception:
        cron_count = None

    return {"restoredFileCount": file_count, "restoredCronJobs": cron_count}
|
||||
|
||||
|
||||
def write_atomic(path: Path, payload: dict[str, Any]) -> None:
    """Write ``payload`` as pretty-printed, key-sorted JSON to ``path`` atomically.

    The data is written to a temporary file in the destination directory,
    made world-readable (0644), and then renamed over ``path``, so readers
    never see a partially written or unreadable file. If anything fails
    before the rename, the temporary file is removed and the error is
    re-raised; an existing ``path`` is left untouched.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path: Path | None = None
    try:
        with tempfile.NamedTemporaryFile("w", encoding="utf-8", dir=path.parent, delete=False) as tmp:
            tmp_path = Path(tmp.name)
            json.dump(payload, tmp, indent=2, sort_keys=True)
            tmp.write("\n")
        # NamedTemporaryFile creates the file with mode 0600; fix the mode
        # before the rename so the published file is readable from the start.
        tmp_path.chmod(0o644)
        tmp_path.replace(path)
    except BaseException:
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except FileNotFoundError:
                pass
        raise
|
||||
|
||||
|
||||
def main() -> int:
    """Probe the Hermes instance and write the sanitized ops snapshot to ``OUTPUT_PATH``.

    The snapshot contains the generation time (UTC, ``YYYY-MM-DDTHH:MM:SSZ``),
    gateway and dashboard service state, backup timer state, backup repo
    metadata, whether a Google Workspace token file exists, and the restore
    counters from ``restore_stats``. Returns 0 as the process exit code.
    """
    # Local import: the timestamp is produced in-process instead of spawning
    # the external ``date`` command, which added a failure path for no benefit.
    from datetime import datetime, timezone

    payload: dict[str, Any] = {
        "generatedAt": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "gateway": {**probe_active(GATEWAY_SERVICE), "enabled": probe_enabled(GATEWAY_SERVICE)},
        "dashboard": probe_active(DASHBOARD_SERVICE),
        "backupTimer": probe_timer(BACKUP_TIMER),
        "repo": probe_repo(BACKUP_REPO),
        # Only the presence of the token file is exported, never its content.
        "googleWorkspaceToken": (HERMES_HOME / "google_token.json").is_file(),
    }
    payload.update(restore_stats(BACKUP_REPO))
    write_atomic(OUTPUT_PATH, payload)
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
15
systemd/hermes-health-watchdog.service
Normal file
15
systemd/hermes-health-watchdog.service
Normal file
@ -0,0 +1,15 @@
|
||||
[Unit]
|
||||
Description=Run Vijay Hermes health watchdog
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
Environment=HERMES_HOME=/root/.hermes
|
||||
Environment=HERMES_WATCHDOG_INSTANCE=vijay
|
||||
Environment=HERMES_WATCHDOG_GATEWAY_SERVICE=hermes-gateway.service
|
||||
Environment=HERMES_WATCHDOG_BACKUP_REPO=/root/repos/bytelyst_hostinger_hermes_vm
|
||||
Environment=HERMES_WATCHDOG_LOG_PATH=/root/.hermes/logs/hermes-health-watchdog.log
|
||||
Environment=HERMES_WATCHDOG_TELEGRAM_CONFIG=/root/.config/hermes/telegram
|
||||
Environment=HERMES_DASHBOARD_ALERT_LOG=/var/log/hermes-dashboard-warnings.log
|
||||
Environment=HERMES_DASHBOARD_ALERT_STATE=/root/.hermes/logs/dashboard-alerts.offset
|
||||
Environment=HERMES_WATCHDOG_ALERT_STATE=/root/.hermes/logs/watchdog-alert-active
|
||||
ExecStart=/root/.hermes/scripts/hermes_health_watchdog.py
|
||||
11
systemd/hermes-health-watchdog.timer
Normal file
11
systemd/hermes-health-watchdog.timer
Normal file
@ -0,0 +1,11 @@
|
||||
[Unit]
|
||||
Description=Run Vijay Hermes health watchdog every 5 minutes
|
||||
|
||||
[Timer]
|
||||
OnBootSec=2min
|
||||
OnUnitActiveSec=5min
|
||||
AccuracySec=30s
|
||||
Unit=hermes-health-watchdog.service
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
12
systemd/hermes-ops-exporter.service
Normal file
12
systemd/hermes-ops-exporter.service
Normal file
@ -0,0 +1,12 @@
|
||||
[Unit]
|
||||
Description=Export sanitized Hermes ops state for Mission Control
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
Environment=HERMES_HOME=/root/.hermes
|
||||
Environment=HERMES_OPS_EXPORT_PATH=/root/.hermes/ops-export.json
|
||||
Environment=HERMES_GATEWAY_SERVICE=hermes-gateway.service
|
||||
Environment=HERMES_DASHBOARD_SERVICE=hermes-root-dashboard.service
|
||||
Environment=HERMES_BACKUP_TIMER=hermes-root-backup.timer
|
||||
Environment=HERMES_BACKUP_REPO=/root/repos/bytelyst_hostinger_hermes_vm
|
||||
ExecStart=/opt/bytelyst/learning_ai_devops_tools/scripts/hermes-ops-exporter.py
|
||||
11
systemd/hermes-ops-exporter.timer
Normal file
11
systemd/hermes-ops-exporter.timer
Normal file
@ -0,0 +1,11 @@
|
||||
[Unit]
|
||||
Description=Refresh sanitized Hermes ops export every minute
|
||||
|
||||
[Timer]
|
||||
OnBootSec=1min
|
||||
OnUnitActiveSec=1min
|
||||
AccuracySec=15s
|
||||
Unit=hermes-ops-exporter.service
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
18
systemd/uma-hermes-health-watchdog.service
Normal file
18
systemd/uma-hermes-health-watchdog.service
Normal file
@ -0,0 +1,18 @@
|
||||
[Unit]
|
||||
Description=Run Bheem/Uma Hermes health watchdog
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
Environment=HERMES_HOME=/home/uma/.hermes
|
||||
Environment=HERMES_WATCHDOG_INSTANCE=bheem
|
||||
Environment=HERMES_WATCHDOG_GATEWAY_SERVICE=uma-hermes-gateway.service
|
||||
Environment=HERMES_WATCHDOG_SYSTEMD_SCOPE=user
|
||||
Environment=HERMES_WATCHDOG_BACKUP_JOB_NAME=
|
||||
Environment=HERMES_WATCHDOG_BACKUP_REPO=/home/uma/repos/uma_hostinger_hermes_vm
|
||||
Environment=HERMES_WATCHDOG_LOG_PATH=/home/uma/.hermes/logs/hermes-health-watchdog.log
|
||||
Environment=HERMES_WATCHDOG_TELEGRAM_CONFIG=/home/uma/.config/hermes/telegram
|
||||
Environment=HERMES_DASHBOARD_ALERT_LOG=/var/log/hermes-dashboard-warnings.log
|
||||
Environment=HERMES_DASHBOARD_ALERT_STATE=/home/uma/.hermes/logs/dashboard-alerts.offset
|
||||
Environment=HERMES_WATCHDOG_ALERT_STATE=/home/uma/.hermes/logs/watchdog-alert-active
|
||||
Environment=HERMES_WATCHDOG_DOCKER_CONTAINERS=
|
||||
ExecStart=/home/uma/.hermes/scripts/hermes_health_watchdog.py
|
||||
11
systemd/uma-hermes-health-watchdog.timer
Normal file
11
systemd/uma-hermes-health-watchdog.timer
Normal file
@ -0,0 +1,11 @@
|
||||
[Unit]
|
||||
Description=Run Bheem/Uma Hermes health watchdog every 5 minutes
|
||||
|
||||
[Timer]
|
||||
OnBootSec=2min
|
||||
OnUnitActiveSec=5min
|
||||
AccuracySec=30s
|
||||
Unit=uma-hermes-health-watchdog.service
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
12
systemd/uma-hermes-ops-exporter.service
Normal file
12
systemd/uma-hermes-ops-exporter.service
Normal file
@ -0,0 +1,12 @@
|
||||
[Unit]
|
||||
Description=Export sanitized Uma Hermes ops state for Mission Control
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
Environment=HERMES_HOME=/home/uma/.hermes
|
||||
Environment=HERMES_OPS_EXPORT_PATH=/home/uma/.hermes/ops-export.json
|
||||
Environment=HERMES_GATEWAY_SERVICE=uma-hermes-gateway.service
|
||||
Environment=HERMES_DASHBOARD_SERVICE=uma-hermes-dashboard.service
|
||||
Environment=HERMES_BACKUP_TIMER=uma-hermes-backup.timer
|
||||
Environment=HERMES_BACKUP_REPO=/home/uma/repos/uma_hostinger_hermes_vm
|
||||
ExecStart=/opt/bytelyst/learning_ai_devops_tools/scripts/hermes-ops-exporter.py
|
||||
11
systemd/uma-hermes-ops-exporter.timer
Normal file
11
systemd/uma-hermes-ops-exporter.timer
Normal file
@ -0,0 +1,11 @@
|
||||
[Unit]
|
||||
Description=Refresh sanitized Uma Hermes ops export every minute
|
||||
|
||||
[Timer]
|
||||
OnBootSec=1min
|
||||
OnUnitActiveSec=1min
|
||||
AccuracySec=15s
|
||||
Unit=uma-hermes-ops-exporter.service
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
Loading…
Reference in New Issue
Block a user