Compare commits

...

3 Commits

Author SHA1 Message Date
Hermes VM
5a2d92f519 docs: record VM container health fix
Some checks failed
pre-commit / pre-commit (push) Failing after 33s
2026-05-27 21:12:45 +00:00
e2db92f3b1 Add Hermes snapshot diff view 2026-05-27 21:05:57 +00:00
8f522e3505 Add Hermes dashboard improvement backlog 2026-05-27 21:02:23 +00:00
6 changed files with 160 additions and 6 deletions

View File

@ -1,8 +1,8 @@
'use client';
import { useEffect, useMemo, useState } from 'react';
import { useEffect, useMemo, useRef, useState } from 'react';
import Link from 'next/link';
import { AlertTriangle, CheckCircle2, Cloud, DatabaseBackup, ExternalLink, Gauge, HardDrive, RefreshCw, ShieldCheck, Timer, Wifi, Activity, CalendarClock, Link2 } from 'lucide-react';
import { AlertTriangle, CheckCircle2, Cloud, Copy, DatabaseBackup, ExternalLink, Gauge, HardDrive, RefreshCw, ShieldCheck, Timer, Wifi, Activity, CalendarClock, Link2 } from 'lucide-react';
import { Badge, Button } from '@/components/ui/Primitives';
import { SectionCard } from '@/components/hermes-shell';
import { api, type HermesOpsInstance, type HermesOpsSnapshot } from '@/lib/api';
@ -94,6 +94,10 @@ function InstanceCard({ instance }: { instance: HermesOpsInstance }) {
Open dashboard <ExternalLink className="ml-2 h-4 w-4" />
</a>
</Button>
<Button variant="ghost" size="sm" onClick={() => void navigator.clipboard.writeText(instance.dashboard.url)}>
<Copy className="mr-2 h-4 w-4" />
Copy URL
</Button>
</div>
</article>
);
@ -101,14 +105,19 @@ function InstanceCard({ instance }: { instance: HermesOpsInstance }) {
export function HermesOpsPanel() {
const [snapshot, setSnapshot] = useState<HermesOpsSnapshot | null>(null);
const [previousSnapshot, setPreviousSnapshot] = useState<HermesOpsSnapshot | null>(null);
const [loading, setLoading] = useState(true);
const [error, setError] = useState<string | null>(null);
const latestSnapshotRef = useRef<HermesOpsSnapshot | null>(null);
const load = async () => {
setLoading(true);
setError(null);
try {
setSnapshot(await api.getHermesOps());
const nextSnapshot = await api.getHermesOps();
setPreviousSnapshot(latestSnapshotRef.current);
latestSnapshotRef.current = nextSnapshot;
setSnapshot(nextSnapshot);
} catch (err) {
setError(err instanceof Error ? err.message : 'Unable to load Hermes operations status');
} finally {
@ -123,6 +132,44 @@ export function HermesOpsPanel() {
}, []);
const allHealthy = useMemo(() => snapshot ? snapshot.warnings.length === 0 : false, [snapshot]);
// Per-metric change between the previously loaded snapshot and the current one.
// Evaluates to null until both snapshots exist, so there is nothing to compare
// against right after the first successful load.
const snapshotDiff = useMemo(() => {
  if (!previousSnapshot || !snapshot) return null;

  const before = previousSnapshot.instances;
  const after = snapshot.instances;

  // An instance counts as healthy only when its gateway, dashboard, and backup
  // timer are active, its backup repo is clean, and google.workspaceToken is truthy.
  const healthyBefore = before.filter(
    ({ gateway, dashboard, backup, google }) =>
      gateway.active &&
      dashboard.active &&
      backup.timer.active &&
      backup.repo.clean &&
      google.workspaceToken
  ).length;
  const healthyAfter = after.filter(
    ({ gateway, dashboard, backup, google }) =>
      gateway.active &&
      dashboard.active &&
      backup.timer.active &&
      backup.repo.clean &&
      google.workspaceToken
  ).length;

  const dashboardsBefore = before.filter(({ dashboard }) => dashboard.active).length;
  const dashboardsAfter = after.filter(({ dashboard }) => dashboard.active).length;

  const backupTimersBefore = before.filter(({ backup }) => backup.timer.active).length;
  const backupTimersAfter = after.filter(({ backup }) => backup.timer.active).length;

  // Every field is "current minus previous": a positive value means the count rose.
  return {
    healthyInstances: healthyAfter - healthyBefore,
    warnings: snapshot.warnings.length - previousSnapshot.warnings.length,
    activeSessions: snapshot.activeSessions.active - previousSnapshot.activeSessions.active,
    activeDashboards: dashboardsAfter - dashboardsBefore,
    activeBackupTimers: backupTimersAfter - backupTimersBefore,
  };
}, [previousSnapshot, snapshot]);
// Headline counts for the current snapshot. Each one falls back to 0 while no
// snapshot is available.

// Instances whose gateway, dashboard, and backup timer are active, whose backup
// repo is clean, and whose google.workspaceToken is truthy.
const healthyInstances =
  snapshot?.instances.filter(
    ({ gateway, dashboard, backup, google }) =>
      gateway.active &&
      dashboard.active &&
      backup.timer.active &&
      backup.repo.clean &&
      google.workspaceToken
  ).length ?? 0;

// Instances reporting an active dashboard.
const activeDashboards =
  snapshot?.instances.filter(({ dashboard }) => dashboard.active).length ?? 0;

// Instances reporting an active backup timer.
const activeBackupTimers =
  snapshot?.instances.filter(({ backup }) => backup.timer.active).length ?? 0;
return (
<SectionCard
@ -146,6 +193,65 @@ export function HermesOpsPanel() {
{snapshot ? (
<div className="space-y-5">
{snapshotDiff ? (
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
<div className="flex items-center justify-between gap-3">
<div>
<p className="text-sm font-medium text-[var(--bl-text-primary)]">Since previous refresh</p>
<p className="text-xs text-[var(--bl-text-secondary)]">Snapshot movement compared with the last poll.</p>
</div>
<Badge variant="neutral">Delta view</Badge>
</div>
<div className="mt-3 grid gap-3 md:grid-cols-5">
{[
{ label: 'Healthy instances', value: snapshotDiff.healthyInstances },
{ label: 'Active dashboards', value: snapshotDiff.activeDashboards },
{ label: 'Active backups', value: snapshotDiff.activeBackupTimers },
{ label: 'Active sessions', value: snapshotDiff.activeSessions },
{ label: 'Warnings', value: snapshotDiff.warnings },
].map((item) => (
<div key={item.label} className="rounded-xl border border-[var(--bl-border)] bg-[var(--bl-surface-card)] p-3">
<p className="text-xs uppercase tracking-[0.2em] text-[var(--bl-text-tertiary)]">{item.label}</p>
<p className={`mt-2 text-2xl font-semibold ${item.value > 0 ? 'text-[var(--bl-success)]' : item.value < 0 ? 'text-[var(--bl-danger)]' : 'text-[var(--bl-text-primary)]'}`}>
{item.value > 0 ? '+' : ''}{item.value}
</p>
</div>
))}
</div>
</div>
) : null}
<div className="grid gap-3 md:grid-cols-4">
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
<div className="flex items-center gap-2 text-sm text-[var(--bl-text-secondary)]">
<ShieldCheck className="h-4 w-4" />
Healthy instances
</div>
<p className="mt-2 text-2xl font-semibold text-[var(--bl-text-primary)]">{healthyInstances}/2</p>
</div>
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
<div className="flex items-center gap-2 text-sm text-[var(--bl-text-secondary)]">
<Activity className="h-4 w-4" />
Active dashboards
</div>
<p className="mt-2 text-2xl font-semibold text-[var(--bl-text-primary)]">{activeDashboards}/2</p>
</div>
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
<div className="flex items-center gap-2 text-sm text-[var(--bl-text-secondary)]">
<CalendarClock className="h-4 w-4" />
Active backup timers
</div>
<p className="mt-2 text-2xl font-semibold text-[var(--bl-text-primary)]">{activeBackupTimers}/2</p>
</div>
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
<div className="flex items-center gap-2 text-sm text-[var(--bl-text-secondary)]">
<AlertTriangle className="h-4 w-4" />
Open warnings
</div>
<p className="mt-2 text-2xl font-semibold text-[var(--bl-text-primary)]">{snapshot.warnings.length}</p>
</div>
</div>
<div className="grid gap-3 md:grid-cols-4">
<div className="rounded-2xl border border-[var(--bl-border)] bg-[var(--bl-surface-muted)] p-4">
<div className="flex items-center gap-2 text-sm text-[var(--bl-text-secondary)]">

View File

@ -31,7 +31,7 @@ Observed on 2026-05-27:
- Private dashboards:
- Root: `http://100.87.53.10:9119/`, `hermes-root-dashboard.service`
- Uma: `http://100.87.53.10:9120/`, `uma-hermes-dashboard.service`
- Live ops panel shows gateway state, active sessions, cron state, backup freshness, sanitized alerts, and runbook links for both instances.
- Live ops panel shows gateway state, active sessions, refresh delta, cron state, backup freshness, sanitized alerts, and runbook links for both instances.
## Safety guardrail: no public Hermes dashboard/API

View File

@ -21,6 +21,16 @@
- **Needs manual UX validation:** dashboard feature-by-feature checks, Telegram approval prompt flow, and Telegram media/file delivery.
- **Needs future workflow adoption:** practicing `delegate_task`, spawned/tmux sessions, worktrees, and Kanban on real tasks before checking them as completed.
## Next To-Dos
The remaining work is now mostly hardening rather than feature delivery:
- finish the GitHub/Gitea least-privilege audit for the root-managed push path
- decide whether `security.redact_secrets` should be enabled by default
- document the gateway-session `privacy.redact_pii` policy
- rotate any credentials that were migrated or exposed during the setup work
- tighten least-privilege token scopes for GitHub/Gitea, web APIs, and provider keys
## Purpose
Turn the Hermes setup ideas from the referenced video into a practical ByteLyst upgrade checklist for this VM-backed, Telegram-driven Hermes installation.

View File

@ -665,6 +665,18 @@ Known roadmap assumptions to handle safely during implementation:
---
## Next Dashboard Improvements
Potential follow-up work for Hermes Mission Control:
- warning severity filters for the live ops panel
- compact trend cards for recent alert volume and backup freshness over several refreshes
- task-ledger deep links from the ops panel into the most recent Hermes work
- per-instance action row improvements beyond copy-link/open-dashboard, such as open-runbook shortcuts
- optional dark/light theme toggle if the broader dashboard shell eventually supports it
---
# Git workflow
Commit incrementally:

View File

@ -64,7 +64,7 @@ These listeners were bound on `0.0.0.0` and/or `[::]` during review.
| `3040` | `flowmonk-web` | `/opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml` | none found in Caddy | `needs-decision` | Unhealthy; classify as private/admin or retire |
| `3049` | `devops-web` | `/opt/bytelyst/bytelyst-devops-tools/dashboard/docker-compose.yml` | `devops.bytelyst.com` | `private-admin` with direct bypass | Fix old repo path drift, then bind loopback/private |
| `3050` | `mindlyst-web` | `/opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml` | none found in Caddy | `needs-decision` | Unhealthy; classify as private/admin or retire |
| `3055` | `nomgap-web` | `/opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml` | none found in Caddy | `needs-decision` | Unhealthy; classify as private/admin or retire |
| `3055` | `nomgap-web` | orphan from older `/opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml` | none found in Caddy | `retire` | Retired on 2026-05-27; current Compose says Nomgap web is deployed to Vercel |
| `3060` | `actiontrail-web` | `/opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml` | none found in Caddy | `needs-decision` | Unhealthy; classify as private/admin or retire |
| `3070` | `localmemgpt-web` | `/opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml` | none found in Caddy | `needs-decision` | Unhealthy; classify as private/admin or retire |
| `3075` | `llmlab-dashboard` | `/opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml` | `llmlab.bytelyst.com` | `private-admin` with direct bypass | Dashboard unhealthy; gate or retire |
@ -113,6 +113,7 @@ These listeners were bound on `0.0.0.0` and/or `[::]` during review.
## Drift / Follow-Up Findings
- `nomgap-web` was an orphan from an older Compose revision, had no Caddy route, and was retired on 2026-05-27.
- `devops-backend` runs from `/opt/bytelyst/learning_ai_devops_tools/dashboard/docker-compose.yml`.
- `devops-web` runs from `/opt/bytelyst/bytelyst-devops-tools/dashboard/docker-compose.yml`, an older path. Align this before changing devops dashboard port bindings.
- `gitea-npm-registry` has no Compose labels in Docker inspect output. Find its systemd/compose owner before changing `3300`.

View File

@ -397,7 +397,7 @@ Effective `sshd -T` settings showed:
### Phase 2 — Operational correctness
- [ ] Fix/retire unhealthy containers.
- [x] Fix/retire unhealthy containers.
- [x] Resolve `hermes-root-backup.service` failed state.
- [x] Decide and document Gitea runner active/disabled state.
- [x] Add missing-script checks. Stale root cron path was fixed on 2026-05-27.
@ -515,6 +515,31 @@ Minimum post-checks for Phase 1:
- The detector currently covers root crontab and failed systemd units. Full ownership inventory still needs `/etc/cron.d`, user crontabs, Hermes cron, Gitea schedules, owners, outputs, and alert channels.
### 2026-05-27 — Phase 2 unhealthy containers
**Changed:**
- Added `HOSTNAME=0.0.0.0` to six managed Next.js web services in `/opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml`: `jarvisjr-web`, `flowmonk-web`, `mindlyst-web`, `actiontrail-web`, `localmemgpt-web`, and `llmlab-dashboard`.
- Recreated those six services from existing images with `docker compose ... up -d --no-build`.
- Retired the orphan `learning_ai_common_plat-nomgap-web-1` container. The current Compose file already documents `nomgap-web` as deployed to Vercel rather than as part of the Docker stack.
**Verified:**
- `docker compose -f docker-compose.ecosystem.yml --env-file .env.ecosystem config --quiet` passed.
- The six recreated web containers report Docker health `healthy`.
- `docker ps --filter health=unhealthy` returns no containers.
- Host-level smoke checks returned HTTP `200` for `3035`, `3040`, `3050`, `3060`, `3070`, and `3075`; retired orphan port `3055` is closed.
- `vm-health-check.sh --json`, run with host permissions, reports `container_health=OK`, `container_loops=OK`, `failed_units=OK`, and `cron_missing_paths=OK`.
**Committed/pushed:**
- `learning_ai_common_plat`: `af035e7d` (`fix: bind ecosystem Next apps on all interfaces`) pushed to GitHub.
**Residual risk:**
- The local Gitea mirror push for `learning_ai_common_plat` failed at the Git HTTP transport layer even though fetch and health checks work; retry or fix the mirror push separately.
- This change fixed container health state only; it did not reduce public exposure. Several directly published ports still need to be loopback-bound or blocked as part of Phase 1.
## Do Not Start With
- Rootless Docker migration.