Add Hermes token analytics dashboard

chore: ignore local graphify outputs
feat(agent-queue): resilient lease renewal + graceful drain
2026-06-06 03:34:49 +00:00 · 2026-06-06 02:56:45 +00:00 · 2026-06-01 12:24:45 -07:00 · 2026-06-01 11:51:56 -07:00 · 2026-06-01 02:30:38 -07:00 · 2026-06-01 02:19:52 -07:00
253 changed files with 35264 additions and 8326 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -0,0 +1,6 @@
+# Default for every path: let Git auto-detect text files and use LF for them
+* text=auto eol=lf
+
+# Always treat shell, PowerShell, and Markdown files as text with LF endings
+*.sh text eol=lf
+*.ps1 text eol=lf
+*.md text eol=lf
--- a/.gitea/workflows/shell-ci.yml
+++ b/.gitea/workflows/shell-ci.yml
@ -0,0 +1,71 @@
+# Lints and smoke-tests the agent-queue scripts and the ByteLyst CLI script.
+name: Shell CI — agent-queue + CLI
+
+# Run for pushes to main and PRs targeting main, but only when the shell
+# tooling or this workflow file itself changes.
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "agent-queue/**"
+      - "bytelyst-cli.sh"
+      - ".gitea/workflows/shell-ci.yml"
+  pull_request:
+    branches:
+      - main
+    paths:
+      - "agent-queue/**"
+      - "bytelyst-cli.sh"
+      - ".gitea/workflows/shell-ci.yml"
+
+# One active run per ref; a newer run cancels the one still in progress.
+concurrency:
+  group: "shell-ci-${{ github.ref }}"
+  cancel-in-progress: true
+
+jobs:
+  lint:
+    name: shellcheck + syntax
+    # The runner that picks this job up is expected to carry all of these labels.
+    runs-on:
+      - ubuntu-latest
+      - bytelyst
+      - hostinger
+    container:
+      image: "node:20-bookworm"
+    timeout-minutes: 10
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          # Clone from the self-hosted Gitea instance instead of github.com.
+          github-server-url: "https://gitea.bytelyst.com"
+
+      # shellcheck is installed at run time (presumably absent from the node
+      # image); the version is printed so it shows up in the job log.
+      - name: Install shellcheck
+        run: |
+          apt-get update -qq
+          apt-get install -y -qq shellcheck
+          shellcheck --version
+
+      - name: shellcheck (errors fail the build)
+        run: |
+          # Only error-severity findings gate the build; every script is linted as bash.
+          set -- agent-queue/agent-queue.sh agent-queue/selftest.sh bytelyst-cli.sh
+          shellcheck --severity=error --shell=bash "$@"
+
+      - name: bash syntax check (gating, all scripts)
+        run: |
+          # Parse-only pass (`bash -n`); none of the scripts is executed here.
+          for script in agent-queue/agent-queue.sh agent-queue/selftest.sh bytelyst-cli.sh; do
+            bash -n "$script"
+          done
+
+      # Executed directly (no interpreter prefix), so selftest.sh must be
+      # committed with its executable bit set.
+      - name: agent-queue self-test (no-op engine cycle)
+        run: ./agent-queue/selftest.sh
+
+      # `node --check` only parses the file; the dashboard itself is not started.
+      - name: node syntax check (dashboard)
+        run: node --check agent-queue/dashboard.mjs
+
+      # End-to-end smoke test against a throwaway queue root: init the queue,
+      # enqueue one deliberately broken task, drain once, and assert the task
+      # was routed to failed/.
+      - name: smoke test (init + add + drain, no real agent)
+        run: |
+          set -euo pipefail
+          export AGENT_QUEUE_ROOT="$PWD/.ci-queue"
+          ./agent-queue/agent-queue.sh init
+          # task with an invalid cwd lands in failed/ without launching any agent
+          printf '%s\n' '---' 'engine: devin' 'cwd: /no/such/dir' 'yolo: true' '---' '# ci' \
+            > /tmp/ci-task.md
+          ./agent-queue/agent-queue.sh add /tmp/ci-task.md
+          ./agent-queue/agent-queue.sh run --once
+          # Require at least one regular *.md file in failed/. `test -f <glob>` is
+          # avoided on purpose: it errors out ("too many arguments") as soon as
+          # the glob expands to more than one path.
+          [ -n "$(find "$AGENT_QUEUE_ROOT/failed" -maxdepth 1 -type f -name '*.md' -print -quit)" ]
+          echo "smoke OK: task routed to failed/ as expected"
+          rm -rf "$AGENT_QUEUE_ROOT"
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -0,0 +1,63 @@
+# Repo-wide shell/PowerShell hygiene checks; runs on every push and pull request.
+name: CI
+
+on:
+  push:
+  pull_request:
+
+# The jobs below only check out and inspect the repository, so restrict the
+# workflow token to read-only access instead of the default scope.
+permissions:
+  contents: read
+
+jobs:
+  # Lints every tracked *.sh file; any shellcheck finding fails the job.
+  shellcheck:
+    name: Shellcheck
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install shellcheck
+        run: sudo apt-get update && sudo apt-get install -y shellcheck
+      - name: Run shellcheck on shell scripts
+        run: |
+          # Nothing tracked (or git failed to list anything): succeed quietly.
+          if [ -z "$(git ls-files '*.sh')" ]; then
+            echo "No shell scripts to check"
+            exit 0
+          fi
+          # Human-readable list for the job log.
+          git ls-files '*.sh'
+          # NUL-delimited hand-off: an unquoted `shellcheck $files` would
+          # word-split and glob-expand any path containing spaces or wildcards.
+          git ls-files -z '*.sh' | xargs -0 shellcheck
+
+  # Rejects carriage returns in tracked shell scripts and parse-checks each one.
+  syntax:
+    name: Syntax & EOL checks
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Fail on CRLF in scripts
+        run: |
+          # -z / -0 keep paths with spaces intact. grep exits 1 when nothing
+          # matches, which is the success case here, hence the `|| true`.
+          CRLF_FILES=$(git ls-files -z '*.sh' | xargs -0 -r grep -Il $'\r' || true)
+          if [ -n "$CRLF_FILES" ]; then
+            echo "CRLF found in the following files:"; echo "$CRLF_FILES";
+            exit 1
+          fi
+          echo "No CRLF in shell scripts"
+      - name: Bash syntax-check
+        run: |
+          # Parse-check every tracked script (paths read NUL-delimited so spaces
+          # survive) and report all failures instead of stopping at the first.
+          status=0
+          while IFS= read -r -d '' f; do
+            echo "Checking $f"
+            bash -n "$f" || status=1
+          done < <(git ls-files -z '*.sh')
+          exit "$status"
+
+  # Parse-checks the installer wrapper, then runs it with --preview.
+  preview-runner:
+    name: Preview installer scripts
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Bash syntax-check run_installers
+        run: bash -n run_installers.sh
+      # Invoked through bash so the step does not depend on the executable bit
+      # having been committed for run_installers.sh.
+      - name: Preview run_installers (safe)
+        run: bash ./run_installers.sh --preview
+
+  # Runs the PowerShell wrapper with -Preview on a Windows runner.
+  windows-preview:
+    name: PowerShell preview
+    runs-on: windows-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Preview PowerShell wrapper
+        shell: pwsh
+        run: ./run_installers.ps1 -Preview
--- a/.gitignore
+++ b/.gitignore
@ -10,6 +10,8 @@ __pycache__/
 venv/
 env/
 ENV/
+# Keep the backend `env` module under src/ tracked despite the `env/` rule above
+!dashboard/backend/src/modules/env/
+!dashboard/backend/src/modules/env/**

 # IDE files
 .vscode/
@ -36,6 +38,7 @@ accounts.json
 .azure/

 # Generated outputs and local data caches
+graphify-out/
 supabase monitor/output/
 youtube/captions/
 github_repo_scanners/contributor_repos/
@ -46,3 +49,19 @@ bytelyst-ai.json
 saravanakumardb.json
 saravanakumardb1.json
 list_repos_contributors_by_user_saravanakumardb.json
+
+# Agent-queue transient runtime state (jobs move through these dirs at runtime;
+# keep the dirs via .gitkeep but never track the per-job lifecycle files).
+# Each rule ignores the directory's contents (`dir/*`), not the directory
+# itself, so the `.gitkeep` negation below can still re-include that file.
+agent-queue/queue/.state/*
+agent-queue/queue/inbox/*
+agent-queue/queue/building/*
+agent-queue/queue/testing/*
+agent-queue/queue/review/*
+agent-queue/queue/failed/*
+agent-queue/queue/shipped/*
+agent-queue/queue/logs/*
+!agent-queue/queue/*/.gitkeep
+
+# gigafactory deploy script runtime pids
+scripts/.gigafactory-platform-service.pid
+scripts/.gigafactory-tracker-web.pid
--- a/AGENTS.md
+++ b/AGENTS.md
@ -32,6 +32,7 @@ Read these first:
 - `remove_user_guided.sh`
 - `remove_user_from_repos.sh`
 - `scripts/`
+- `scripts/tracker-seed/` - file work items into the ByteLyst tracker (see "Cutting Tracker Items")
 - `git-work-safety-tools/`
 - `github_access_scripts/`

@ -75,6 +76,37 @@ These may contain secrets, usernames, or operational snapshots. Avoid printing c
 4. Make the smallest coherent change set.
 5. If docs or discoverability changed, update the canonical docs listed above.

+## Cutting Tracker Items (work tracking)
+
+When the user asks to **"cut items to track"** (file feature/bug/task tickets for
+some work — e.g. the findings in `ENGINEERING_REVIEW_SCORECARD.md`), use the
+seed tooling in `scripts/tracker-seed/`. Do **not** hand-roll API calls.
+
+How the tracker works:
+- Items live in the ByteLyst tracker, served by **platform-service**
+  (`POST /api/items`, in `learning_ai_common_plat/services/platform-service`),
+  and viewed in **tracker-web** (`learning_ai_common_plat/dashboards/tracker-web`, `:3003`).
+- Item schema: `{ productId, type: bug|feature|task, priority: critical|high|medium|low,
+  title, description, labels[], source, visibility, ... }`. Items are scoped per `productId`.
+- Auth is an HS256 JWT signed with the shared `JWT_SECRET` (verified offline by
+  platform-service); the seed script mints one itself.
+
+Standard procedure:
+1. **Add payloads** to `scripts/tracker-seed/engineering-review-items.json`
+   (or a new payload file): one entry per item, scoped to the right `productId`.
+   Use the `repoToProductId` map in that file for repo → product slugs
+   (e.g. `learning_ai_notes` → `notelett`, common-plat/infra → `platform`).
+2. **Preview** with no side effects: `node scripts/tracker-seed/seed-tracker-items.mjs --dry-run`.
+3. **Create for real only when the platform stack is up** (it writes real records):
+   `JWT_SECRET=<secret> PLATFORM_API_URL=http://localhost:4003 node scripts/tracker-seed/seed-tracker-items.mjs`.
+   The script dedupes by title per product, so re-running is safe (`--force` to bypass).
+4. If the stack is **not running** (no Docker / nothing on `:4003`), do **not**
+   stand up Cosmos just to seed — commit the payloads + run instructions and tell
+   the user to run the script when the stack is next up. Confirm before any live
+   write (creating items is a side-effecting datastore operation).
+
+See `scripts/tracker-seed/README.md` for full details.
+
 ## Good First Checks

 ```bash
--- a/CLAUDE.md
+++ b/CLAUDE.md
@ -34,6 +34,12 @@ Do not assume a single dependency graph or runtime model across the whole repo.
 - `remove_user_from_repos.sh`
 - `git-work-safety-tools/*.sh`

+## Cutting Tracker Items
+
+When asked to "cut items to track", use `scripts/tracker-seed/` (seeds the
+ByteLyst tracker via platform-service `POST /api/items`). Do not hand-roll API
+calls. Full procedure is in the "Cutting Tracker Items" section of `AGENTS.md`.
+
 ## Safety Notes

 - Treat `accounts.json`, account snapshot JSON files, `.env` files, and generated contributor/output data as sensitive.
--- a/ENGINEERING_REVIEW_SCORECARD.md
+++ b/ENGINEERING_REVIEW_SCORECARD.md
@ -0,0 +1,335 @@
+# Engineering Review & Scorecard
+
+> Evidence-based, read-only review of the entire `~/code/mygh` workspace (~38 git
+> repos) per `docs/prompts/engineering-review-scorecard.md`. Generated 2026-05-30.
+>
+> **Method:** static inspection only — file reads, `grep`, and read-only `git`.
+> No builds, installs, or test runs were executed (that would mutate the trees),
+> so dynamic results (pass/fail, coverage %) are inferred from config + test
+> counts, not measured. See §9 for limits. Per-repo evidence was gathered by
+> parallel read-only agents and spot-verified.
+
+---
+
+## 1. Executive Summary
+
+**What this is:** a single developer running a surprisingly coherent *product
+ecosystem* — ~12 product apps (clock, notes, fastgap, peakpulse, flowmonk,
+efforise, jarvis_jr, trails, talk2obsidian, local-memory-gpt, voice-ai-agent,
+multimodal/mindlyst) sharing one platform monorepo (`learning_ai_common_plat`,
+36 `@bytelyst/*` packages, auth/Cosmos/design-tokens), orchestrated by a single
+`docker-compose.ecosystem.yml` (~20 services) and driven heavily by AI agents
+through a homegrown `agent-queue`. This is far more disciplined than a typical
+"learning" folder.
+
+**Overall maturity:** **Beta-quality ecosystem.** A core of genuinely
+production-grade repos (`learning_ai_notes`, `learning_ai_trails`,
+`oss/claw-code`/`claw-cowork`, `learning_ai_clock`, `learning_ai_fastgap`)
+surrounded by a long tail of MVP/prototype repos with thin or zero tests and no
+CI.
+
+**Biggest strengths (top 3)**
+1. **Strong platform discipline.** Shared `@bytelyst/*` packages, a repeated
+   `types.ts → repository.ts → routes.ts` backend pattern, Cosmos partition-key
+   conventions (`/userId`, `productId` on every doc), per-repo `AGENTS.md`,
+   conventional commits, and field-level encryption (`field-encrypt.ts`) recur
+   across the best repos.
+2. **Clean security posture for a personal workspace.** Secret scans across all
+   repos surfaced **no real committed production secrets** — only `.env.example`
+   placeholders, the public Azure Cosmos emulator key, dev `JWT_SECRET=dev-...`
+   values, and Azure Key Vault *references*. `.gitignore` is present nearly
+   everywhere.
+3. **Top repos are legitimately good.** `notes`, `trails`, and the two Rust
+   `claw-*` repos show modular architecture, real test suites (28–80+ files),
+   CI, multi-stage Docker, and strict typing (`0` `as any` in several backends).
+
+**Biggest risks (top 3)**
+1. **CI is the weak link.** GitHub Actions is **disabled (billing)** on the
+   platform monorepo `learning_ai_common_plat` and on `voice_ai_agent`
+   (`*.disabled` workflows); ~15 repos have **no CI at all**. The shared
+   platform that everything depends on has no automated gate.
+2. **Process churn dirties the repos.** A live `agent-queue` daemon + `devin`
+   agents in `--permission-mode dangerous` were actively writing to repos; ~14
+   repos were found dirty with uncommitted work, several behind `origin`. Work
+   is at risk of being lost or silently diverging.
+3. **Testing is bimodal.** Excellent in the flagship repos, **zero** in many
+   others (`productivity_web`, `webui_copilot`, `pytorch_todo_predictor`,
+   `server-survival`, `sidecar_setup`, `mac_tooling`). No portfolio-wide
+   coverage signal.
+
+**Is the dev style helping or hurting velocity?** **Net helping, but fraying at
+the edges.** The platform/agent approach clearly lets one person ship a dozen
+apps — that's the upside. The drag is operational: disabled CI, constantly-dirty
+working trees, abandoned worktrees, and "AI-generated scaffolding smell" in a
+few repos (e.g. `magic_clipboard_mgr`'s 50+ service files + phase-named test
+buckets). Tightening the commit/CI loop would convert a lot of that churn back
+into velocity.
+
+---
+
+## 2. Overall Score Sheet
+
+Scores are 1–10 (1 = critical/broken, 10 = production-grade), aggregated across
+the ~30 code repos (pure docs/usage repos excluded from category math).
+
+| Category | Score | Justification (evidence) |
+|---|---|---|
+| A. Repository organization | **8** | Consistent `@bytelyst/*` + `types/repository/routes` pattern, per-repo `AGENTS.md`, clear monorepos; minus for ~14 dirty trees, stray worktrees, a few unstructured repos. |
+| B. Code quality | **7** | Flagships: strict TS, `0` `as any`, no `console.log`, Zod validation. Tail: `print()`-heavy (`2nd_brain` 60+, `mac_tooling` 200+), `any` leaks, AI-scaffold smell (`magic_clipboard_mgr`). |
+| C. Architecture | **8** | Genuinely strong: shared platform, datastore abstraction, deterministic engines (`flowmonk` scheduler), risk-scoring (`trails`), MCP integrations, clean native/web boundaries. |
+| D. DevOps & deployment | **6** | Ecosystem compose orchestrates ~20 services, multi-stage Dockerfiles common — but **CI disabled on the platform repo**, ~15 repos with no CI, and **0 healthchecks** in `docker-compose.ecosystem.yml`. |
+| E. Testing | **6** | Bimodal: `notes`/`fastgap`/`clock`/`trails`/`claw-*` have 28–700+ tests; many repos have 0. E2E frequently `continue-on-error: true`. No measured coverage. |
+| F. Security | **8** | No real committed secrets anywhere; field encryption + Key Vault refs in the mature repos; `.gitignore`/`.env.example` discipline. Minus for `NODE_TLS_REJECT_UNAUTHORIZED=0` in some Docker, thin input-validation in prototypes. |
+| G. Product readiness | **7** | Several apps runnable end-to-end (web+backend); mobile/native surfaces often partial; CI-disabled + flaky E2E hold back true "launchable". |
+| H. AI-agent practices | **6** | Impressive tooling (`agent-queue`, profiles, job briefs, `AGENTS.md`), but guardrails are weak: `--permission-mode dangerous`, agents dirtying live repos, duplicate work landing upstream, no enforced test-before-commit. |
+| I. Personal workflow | **6** | Good: conventional commits, auto `backup-main-*` branches, `AGENTS.md`. Bad: ~14 dirty repos, branches behind `origin`, abandoned worktrees, no unified release/issue discipline. |
+| **Weighted overall** | **≈ 7.0** | Beta-quality. See weighting below. |
+
+**Weighting & rationale:** Security (F) and Product readiness (G) weighted ~1.5×,
+Testing (E) and DevOps (D) ~1.25× (these gate real-world reliability);
+A/B/C/H/I at 1.0×. The strong architecture/security pull the number up; the
+weak CI/testing pull it back to a solid-but-not-shippable **~7.0**.
+
+---
+
+## 3. Per-Product / Per-Repo Breakdown
+
+Maturity legend: **PROD** = production-grade, **BETA**, **MVP**, **PROTO** =
+prototype/learning, **REF** = docs/reference (not code).
+
+### Flagship products (platform-integrated)
+| Repo | Stack | Tests | CI | Docker | Maturity |
+|---|---|---|---|---|---|
+| `learning_ai_notes` | Fastify5 + Next16 + Expo, Cosmos | 80+ files | ✓ gitea | ✓ | **BETA→PROD** |
+| `learning_ai_trails` | Fastify5 + Next16 + SDK, Cosmos | 28 files | ✓ gitea | ✓ | **PROD** |
+| `learning_ai_clock` | Next16 PWA + iOS/Android, Fastify | 662 total | ✓ gitea | ✓ | **BETA** |
+| `learning_ai_fastgap` | Expo + Next16 + Fastify | 700+ total | ✓ gitea (7 jobs) | ✓ | **BETA** |
+| `learning_ai_peakpulse` | SwiftUI + Fastify | 26 files | ✓ (backend) | ✓ | **BETA→PROD** |
+| `learning_ai_flowmonk` | Next16 + Fastify + Expo | 102 backend | ✓ gitea | ✓ | **BETA** |
+| `learning_ai_efforise` | React/Vite + Fastify + RN | ~9 backend | ✓ gitea | ✓ | **MVP** |
+| `learning_ai_dev_intelli` | Fastify + Next16, GitHub API | 52 backend | ✓ gitea | ✓ | **MVP** |
+| `learning_ai_local_memory_gpt` | Fastify + Next16, SQLite/Ollama | 122 | ✓ gitea | ✓ | **MVP** |
+| `learning_ai_talk2obsidian` | Fastify + Vite, SQLite/Ollama | 8 | ✗ | ✓ | **BETA** |
+| `learning_voice_ai_agent` | Python + Fastify + Next + KMP | 463+ | ⚠ disabled | ✓ | **BETA** |
+| `learning_multimodal_memory_agents` (MindLyst) | KMP + Next + Fastify | 33 | ⚠ disabled | ✓ | **MVP** |
+| `learning_ai_jarvis_jr` | SwiftUI + Next + Android | ~13 web | ✓ gitea | ✓ | **ALPHA/BETA** |
+| `learning_ai_auth_app` | iOS/watchOS/Android (spec+UI) | 0 (here) | ✗ | ✗ | **MVP (spec)** |
+
+### Platform & infra
+| Repo | Stack | Notes | Maturity |
+|---|---|---|---|
+| `learning_ai_common_plat` | pnpm monorepo, 36 `@bytelyst/*`, Fastify, Cosmos | ~466k LOC; full auth (OAuth/MFA/passkeys/SAML); **GH Actions disabled (billing)**, gitea CI active | **PROD** |
+| `learning_ai_devops_tools` | Bash + Python + Node (this repo) | GitHub admin scripts, `agent-queue`, Hermes dashboard; thin tests | **PROD (scripts) / MVP (dash)** |
+| `learning_ai_k8s_streaming` | Python FastAPI + Helm | Use-case registry, HPA/probes, load tools | **BETA→PROD** |
+| `learning_ai_local_llms` | Next16 dashboard + Python TTS | Ollama mission-control; 57 tests | **BETA** |
+
+### Tools / OSS / native
+| Repo | Stack | Notes | Maturity |
+|---|---|---|---|
+| `oss/learning_ai_claw-code-oss` | Rust workspace (10+ crates) | `unsafe forbid`, clippy pedantic, 40+ test files | **PROD** |
+| `oss/learning_ai_claw-cowork` | Rust + Tauri + Python | 65+ test files, E2E, Docker | **PROD** |
+| `learning_magic_terminal` | **Rust** | README+CI+many tests; command-blocks v2; dirty(5) | **BETA** |
+| `learning_notif_scanr` | **Swift** (Package.swift) | tests present, **no CI**, no Docker | **MVP** |
+| `ios/learning_swift_hourglass` | Swift/SwiftUI macOS | MVVM, 2 test files, no CI | **MVP** |
+| `learning_ai_magic_clipboard_mgr` | Swift/macOS, GRDB | 24 tests but 50+ services + phase-named tests (AI-scaffold smell) | **MVP** |
+| `learning_ai_mac_tooling` | Python FastAPI + React | forensics toolkit; **0 tests**, 200+ `print()`, 3k-line files | **PROTO** |
+| `copilot/learning_ai_uxui_web` | Next16 + MSW + Playwright | component showcase, Lighthouse CI | **MVP** |
+| `learning_ai_productivity_web` | Next15, client-only | clean registry pattern, **0 tests** | **MVP** |
+| `learning_ai_webui_copilot` | Python FastAPI + LangChain | rules/policy engines, **0 tests, no Docker/CI** | **MVP** |
+| `learning_agent_monitoring_fx` | npm monorepo + KMP | agent/ingest/web work, native WIP, 54 `console.log`, TODOs | **BETA** |
+| `learning_agentic_tools_portal` | Python Flask + uv | minimal (1 endpoint, 1 test), has CI | **PROTO** |
+| `learning_server-survival-devops-web` | Vanilla JS + Three.js | playable game, **0 tests** | **MVP** |
+| `learning_pytorch_todo_predictor` | Python + PyTorch | educational, **0 tests**, **no upstream** | **PROTO** |
+| `learning_sidecar_setup` | Next16 scaffold + py stub | scaffolding only, **no upstream**, dirty(8) | **PROTO** |
+| `learning_claude_code_setup` | Bash + markdown | setup notes/scripts; dirty(1) | **REF** |
+| `learning_github_copilot` | Markdown (CLI/SDK docs) | reference only | **REF** |
+| `learning_python_sandbox` | Python | LeetCode/learning; dirty(1) | **PROTO** |
+| `learning_ai_materials` | Docs | NBA handover package | **REF** |
+| `learning_windsurf_setup` | Usage logs | not a codebase | **N/A** |
+
+---
+
+## 4. Findings by Dimension
+
+### A. Repository organization
+- **Fact:** Strong, repeated conventions — `AGENTS.md`/`CLAUDE.md` per repo, pnpm
+  workspaces, `types→repository→routes` backend modules, `docs/` with PRD/ROADMAP.
+- **Fact:** ~14 repos dirty at audit time; abandoned `worktrees/` (now cleaned);
+  some repos behind `origin`. Two repos (`pytorch_todo_predictor`,
+  `sidecar_setup`) have **no git upstream**.
+- **Reco:** Adopt a "clean tree or it doesn't exist" rule (see §8). Add upstreams
+  for the two orphan repos or mark them clearly local.
+
+### B. Code quality
+- **Fact:** Best repos enforce strict TS (`0` `as any` in `notes`, `trails`,
+  `local_memory_gpt` backends), no `console.log` (Fastify logger), Zod validation.
+- **Fact:** `learning_ai_2nd_brain` has 60+ `print()`; `mac_tooling` 200+ and
+  3k+-line files (`network_transfer_audit.py` 3521 lines); `magic_clipboard_mgr`
+  shows AI-scaffold smell (50+ service files, `Phase5–8`/`RemainingQATests`).
+- **Reco:** Lint-gate `print()`/`console.log` in the Python/TS repos; split the
+  3k-line files; audit `magic_clipboard_mgr` for stubbed vs real services.
+
+### C. Architecture
+- **Fact:** Clear separation and reuse: shared auth/datastore/design-tokens,
+  deterministic scheduler (`flowmonk`), risk engine (`trails`), use-case registry
+  (`k8s_streaming`), MCP tool servers, Rust crate boundaries (`claw-*`).
+- **Reco:** This is the strongest dimension — protect it by keeping product
+  domains out of `common_plat` and vice-versa.
+
+### D. DevOps & deployment
+- **Fact:** `docker-compose.ecosystem.yml` wires ~20 services (10 backends + 10
+  webs) + infra (Cosmos emulator, Azurite, Traefik, Loki, Grafana, MCP); 30
+  `restart:` policies, 24 `build:` contexts, but **0 `healthcheck:` blocks**.
+- **Fact:** GH Actions disabled on `common_plat` + `voice_ai_agent`; ~15 repos no CI.
+- **Reco (P1):** Add healthchecks + `depends_on: condition: service_healthy` to
+  the ecosystem compose; re-enable or fully migrate CI to gitea self-hosted.
+
+### E. Testing
+- **Fact:** `fastgap` (~700), `clock` (662), `notes` (80+ files), `voice_ai_agent`
+  (463+), `claw-cowork` (65+ files) are excellent; ~8 repos have 0 tests.
+- **Fact:** E2E often `continue-on-error: true` (`fastgap`, `flowmonk`,
+  `jarvis_jr`, `local_memory_gpt`) — i.e. not actually gating.
+- **Reco:** Set a per-repo minimum (smoke + happy-path) and stop masking E2E
+  failures with `continue-on-error` once stabilized.
+
+### F. Security
+- **Fact:** No real committed secrets across all repos. Matches were
+  `.env.example` placeholders, the public Cosmos emulator key
+  (`C2y6yDjf5/R...`), `dev-*` JWT secrets, and Azure Key Vault references.
+- **Fact:** Field encryption (AES-256-GCM) in `clock`/`notes`/`dev_intelli`;
+  `unsafe_code = "forbid"` in the Rust repos.
+- **Watch:** `NODE_TLS_REJECT_UNAUTHORIZED=0` seen in some Docker setups; thin
+  input validation / no rate-limiting in the prototype Python apps.
+
+### G. Product readiness
+- **Fact:** Web+backend pairs generally run end-to-end; native/mobile surfaces
+  (iOS/Android/KMP) are frequently partial or scaffolded.
+- **Reco:** Pick 2–3 flagships (`notes`, `trails`, `clock`) and drive them to a
+  true launch checklist; treat the rest explicitly as experiments.
+
+### H. AI-agent practices
+- **Fact:** Sophisticated `agent-queue` (profiles, job briefs, lifecycle dirs,
+  Node dashboard) — genuinely advanced for a solo setup.
+- **Fact:** Guardrails weak: agents run `--permission-mode dangerous`, write to
+  live working trees (caused the dirty-repo churn), and **landed duplicate work**
+  (during this session a rebase auto-dropped 2 commits already pushed upstream).
+- **Reco:** Standardize the agent task contract (§8): one task = one branch =
+  clean tree → tests → commit → push; ignore runtime/queue state in git (already
+  fixed in this repo this session).
+
+### I. Personal engineering workflow
+- **Fact:** Conventional commits, auto `backup-main-*` branches (nice safety net),
+  `AGENTS.md` discipline.
+- **Fact:** Too many long-lived dirty trees and behind-`origin` branches; no
+  visible issue tracker or release cadence.
+- **Reco:** A weekly "sync sweep" (rebase+push all clean repos, list dirty) — you
+  effectively did this manually this session; automate it.
+
+---
+
+## 5. Prioritized Action Plan
+
+**P0 — now (correctness / risk)**
+1. **Re-establish a working CI gate on `learning_ai_common_plat`** (everything
+   depends on it). Either fix GH Actions billing or make gitea CI the enforced
+   gate. *(M, common_plat)*
+2. **Resolve the ~14 dirty repos**: review + commit or discard intentionally;
+   add upstreams for `pytorch_todo_predictor` & `sidecar_setup`. *(M, workspace)*
+3. **Decide the agent-queue daemon policy** so it doesn't write to live trees
+   uncontrolled (it was running in `dangerous` mode). *(S, devops_tools)*
+
+**P1 — this week**
+4. Add **healthchecks** to `docker-compose.ecosystem.yml` (0 today) + ordered
+   `depends_on`. *(M, common_plat/ecosystem)*
+5. Stop masking E2E with `continue-on-error: true` once stabilized; make at least
+   smoke E2E gating. *(M, fastgap/flowmonk/jarvis_jr)*
+6. Replace `print()` with logging in `2nd_brain` (60+) and `mac_tooling` (200+).
+   *(S–M)*
+
+**P2 — this month**
+7. Add minimum test suites to the 0-test repos that matter (`productivity_web`,
+   `webui_copilot`, `agent_monitoring_fx`). *(M)*
+8. Audit `magic_clipboard_mgr` for dead/stubbed services (50+ files). *(M)*
+9. Split 3k-line files in `mac_tooling`. *(M)*
+10. Remove `NODE_TLS_REJECT_UNAUTHORIZED=0` from Docker; add rate-limiting to the
+    Python prototypes. *(S–M)*
+
+**P3 — nice to have**
+11. Portfolio-wide coverage reporting + dependency audit (`npm audit`/`pip-audit`)
+    in CI. *(M)*
+12. A lightweight issue/release cadence for the 2–3 flagships. *(S)*
+
+---
+
+## 6. Safe Auto-Fix Candidates
+*(Low-risk; listed only — not applied. Each needs your approval.)*
+- **Ecosystem compose healthchecks** — add `healthcheck:` to each backend/web
+  service in `docker-compose.ecosystem.yml`. Safe: additive.
+- **Add upstreams** for `learning_pytorch_todo_predictor` and
+  `learning_sidecar_setup` (`git remote add origin … && git push -u`). Safe once
+  remote exists.
+- **Lint rule to ban `print()`** in `learning_ai_2nd_brain` (ruff `T20`) — flags
+  only; you fix incrementally.
+- **Drop `NODE_TLS_REJECT_UNAUTHORIZED=0`** from Docker envs where a real CA/host
+  override is available. (Verify per service first.)
+- **`.gitignore` audit** for the few repos still tracking runtime artifacts
+  (pattern already fixed in `devops_tools` this session).
+
+## 7. Delegate-to-Agent Queue
+Ready-to-paste briefs (each self-contained, one branch, clean-tree rule):
+1. **"Add healthchecks to ecosystem compose"** — repo `common_plat`; read
+   `docker-compose.ecosystem.yml`; add `healthcheck` + ordered `depends_on` to
+   all `*-backend`/`*-web` services; `docker compose config` must pass; no app
+   code changes.
+2. **"De-`print()` 2nd_brain"** — repo `learning_ai_2nd_brain`; replace `print()`
+   with `typer.echo`/logging in `src/brain/**`; keep behavior identical; run
+   `pytest`.
+3. **"Bootstrap tests for webui_copilot"** — repo `learning_ai_webui_copilot`;
+   add `pytest` smoke tests for `site_backend` rules/policy engines + a copilot
+   happy-path; wire a `.github`/gitea CI job.
+4. **"Service audit: magic_clipboard_mgr"** — repo `learning_ai_magic_clipboard_mgr`;
+   produce a report of which of the 50+ services are wired vs stubbed; no code
+   changes.
+5. **"Stabilize E2E"** — repos `fastgap`/`flowmonk`; make smoke E2E reliable, then
+   remove `continue-on-error: true` for that job only.
+
+## 8. Recommended Standard Operating Procedure (for every agent task)
+1. **One task = one branch** off latest `origin/main`; never work on a dirty tree.
+2. **Scope it** with a job brief (you already do this in `agent-queue/docs/jobs/`).
+3. **Test before commit**: typecheck + lint + unit must pass locally.
+4. **Commit small**, conventional messages; **push the branch**, open a PR — don't
+   let agents push straight to `main` of the shared platform.
+5. **Never track runtime/queue state** (ignore `agent-queue/queue/*` lifecycle —
+   fixed here this session).
+6. **Prefer least-privilege** over `--permission-mode dangerous`; reserve dangerous
+   mode for sandboxed/disposable checkouts.
+7. **Weekly sync sweep**: rebase+push all clean repos, list dirty ones for review.
+
+## 9. What I Could Not Inspect
+- **No dynamic results.** I did not run `npm/pnpm install`, builds, `pytest`,
+  `vitest`, Playwright, `cargo test`, or `docker compose up` (those mutate trees /
+  need services). Test counts and CI configs are evidence of *intended* coverage,
+  not measured pass/coverage.
+- **No live `git` per-repo ahead/behind** inside the read-only agents (they lacked
+  shell git); branch/dirty facts come from the orchestrator's own checks and may
+  have shifted as the agent-queue daemon ran.
+- **One agent batch misfired**: it reported 5 repos as "missing"
+  (`claude_code_setup`, `github_copilot`, `magic_terminal`, `notif_scanr`,
+  `python_sandbox`) due to a read-access issue; I re-scanned them directly —
+  they exist (notably `magic_terminal` = Rust, `notif_scanr` = Swift).
+- **Mobile/native depth** (iOS/Android/KMP/Tauri runtime behavior) and **secret
+  *values*** were not executed/decrypted — only presence/format was checked.
+- **`.env.ecosystem`** holds dev-only values; production secret management
+  (Key Vault wiring) was inferred from references, not verified live.
+
+---
+
+### TL;DR
+- Coherent **beta-grade product ecosystem** (~38 repos) — far beyond "learning".
+- **Architecture & security are strong; CI & testing are the weak links.**
+- **P0:** restore a CI gate on `common_plat`, clean the ~14 dirty repos, and rein
+  in the `dangerous`-mode agent-queue.
+- A handful of flagships (`notes`, `trails`, `claw-*`, `clock`, `fastgap`) are
+  genuinely production-grade; the long tail is MVP/prototype.
+- Tighten the agent commit/CI loop (§8) and most of the operational churn
+  converts back into velocity.
--- a/README.md
+++ b/README.md
@ -42,6 +42,14 @@ If you are new to the repo, read these in order:

 These are for scanning many repositories, checking dirty state, and performing safer batch git workflows.

+### Work Tracking ("cut items to track")
+
+- `scripts/tracker-seed/seed-tracker-items.mjs`
+  - Files feature/bug/task items into the ByteLyst tracker (platform-service `POST /api/items`, viewed in tracker-web), scoped per `productId`.
+  - Preview safely: `node scripts/tracker-seed/seed-tracker-items.mjs --dry-run`
+  - Create (stack up): `JWT_SECRET=<secret> PLATFORM_API_URL=http://localhost:4003 node scripts/tracker-seed/seed-tracker-items.mjs`
+  - See [scripts/tracker-seed/README.md](scripts/tracker-seed/README.md) and the "Cutting Tracker Items" section in [AGENTS.md](AGENTS.md).
+
 ### Deployment Operations

 - `./deployment-status.sh`
--- a/README_INSTALL.md
+++ b/README_INSTALL.md
@ -0,0 +1,46 @@
+Installation guide — learning_ai_devops_tools
+
+Purpose
+
+This repository contains interactive, safe installers and helpers to install CLI tools (Claude Code, OpenAI Codex, Antigravity agy, Devin, GitHub Copilot) on WSL/Ubuntu, macOS, and Windows.
+
+Files of interest
+
+- install_clis_wsl.sh  — interactive installer for WSL/Ubuntu. Previews each remote installer and asks for confirmation before running it.
+- make_symlinks_wsl.sh — creates /usr/local/bin symlinks (requires sudo)
+- run_installers.sh    — cross-platform wrapper to run installers from WSL or show instructions
+- run_installers.ps1   — Windows PowerShell wrapper to run WSL or show Windows-native steps
+- cli-install-report.md — generated report of installs (example)
+
+Quick start (WSL/Ubuntu)
+
+1. Open WSL (Ubuntu) shell.
+2. cd /mnt/d/SANDBOX/mygh/learning_ai_devops_tools
+3. Ensure scripts use LF and are executable:
+   sudo apt-get update && sudo apt-get install -y dos2unix
+   dos2unix install_clis_wsl.sh run_installers.sh make_symlinks_wsl.sh || true
+   chmod +x install_clis_wsl.sh run_installers.sh make_symlinks_wsl.sh
+4. Run the interactive installer (will preview each remote installer and ask confirmation):
+   bash -i ./install_clis_wsl.sh
+
+Quick start (Windows PowerShell with WSL)
+
+- From PowerShell run (recommended):
+  wsl bash -ic "cd /mnt/d/SANDBOX/mygh/learning_ai_devops_tools && dos2unix install_clis_wsl.sh || true && bash -i ./install_clis_wsl.sh"
+
+Quick start (macOS)
+
+- Inspect installers first. macOS support is similar to Linux; use the run_installers.sh wrapper to list commands. Do NOT pipe unknown scripts to shell without review.
+
+Security and safety
+
+- All remote installers are previewed before execution.
+- No secrets or API keys are written to shell profiles.
+- Auth steps are left interactive (use the tool's login commands).
+
+Developer notes
+
+- Use .gitattributes to enforce LF endings on shell scripts across platforms.
+- To reproduce: run the scripts from a fresh WSL Ubuntu session and follow interactive prompts.
+
+If you want, run './run_installers.sh' to get an interactive cross-platform flow.
--- a/agent-queue/.gitignore
+++ b/agent-queue/.gitignore
@ -0,0 +1,3 @@
+# Queue contents are tracked in-repo by request (prompts, logs, state) so no data is lost.
+# NOTE: daemon.pid + .state heartbeats are pure runtime and will churn/conflict in git —
+# remove them from tracking (re-add a narrow ignore) if the noise becomes a problem.
--- a/agent-queue/README.md
+++ b/agent-queue/README.md
@ -0,0 +1,556 @@
+# agent-queue
+
+A zero-dependency **folder "kanban" runner** for headless coding-agent CLIs —
+**Devin**, **Claude Code**, **OpenAI Codex**, and (best-effort) **GitHub Copilot**. Drop prompt
+`.md` files into a folder and they run (auto-approved by default), up to `--max` at a time, moving through
+`inbox → building → review → testing → shipped` (plus `failed`) with live status.
+
+> **Vision & roadmap:** where this is headed — a distributed multi-machine "gigafactory"
+> (fleet of factories × tools × profiles, scheduler-routed, built on platform-service +
+> tracker-web) — is specified as a checklist-driven implementation roadmap in
+> [`docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md`](docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md).
+> A full architecture overview, diagrams, code map and onboarding live alongside it in
+> [`docs/GIGAFACTORY/`](docs/GIGAFACTORY/).
+
+> **Run safety:** how the daemon and its agents must operate (isolated worktrees,
+> branch-per-task, least-privilege instead of blanket `--yolo`/dangerous on live
+> checkouts) is defined in [`docs/RUN_POLICY.md`](docs/RUN_POLICY.md). Read it
+> before enabling `yolo: true`.
+
+**Build/ship lifecycle — auto-QA, manual ship:**
+
+```
+inbox ─▶ building ─▶ review ─▶ testing ─▶ shipped
+  (queued)  (agent     (rc=0;    (verify    (you ran
+            running)   awaiting  passed —    `ship`)
+                       verify)   QA gate)
+                          │
+        agent rc≠0 /      │ verify fails
+        timeout ──────────┴──────────────▶ failed
+```
+
+- **Auto:** agent exits 0 → `review/`. If a `verify:` command is configured it runs
+  automatically: **pass → `testing/` (QA)**, **fail → `failed/`**. No `verify:` →
+  the job parks in `review/` for a manual `promote`.
+- **Manual:** you `ship` a `testing/` job → `shipped/` (the human gate). Shipping is
+  never automatic.
+
+> **Why this exists:** the agent CLIs ship a minimal local interface (no built-in
+> batch/queue/dashboard — that lives in their *cloud* products). This is the
+> zero-dependency bash glue that turns "run one prompt interactively" into
+> "queue many and walk away."
+
+---
+
+## Quick start
+
+```bash
+cd learning_ai_devops_tools/agent-queue
+chmod +x agent-queue.sh
+./agent-queue.sh init
+
+# queue a roadmap for Devin, running in the tracker-web repo, auto-approving everything
+./agent-queue.sh add ~/roadmaps/UX-2.md \
+  --engine devin \
+  --cwd /Users/sd9235/code/mygh/learning_ai_common_plat/dashboards/tracker-web \
+  --yolo
+
+# start processing (foreground; Ctrl-C to stop). Run up to 3 agents at once (default).
+./agent-queue.sh run --max 3
+```
+
+In a **second terminal**, watch progress:
+
+```bash
+./agent-queue.sh watch
+```
+
+```
+  AGENT QUEUE  /…/agent-queue/queue
+  inbox 3   building 2   review 1   testing 2   shipped 5   failed 0   running 2/2
+
+  RUNNING
+    20260528-2130__UX-2        devin     4m12s  pid 51234  ⏺ Edited src/app/dashboard/items/page.tsx
+    20260528-2131__UX-3        claude    1m02s  pid 51290  Running: pnpm typecheck
+```
+
+---
+
+## How a task is configured
+
+Each `.md` carries optional **frontmatter** telling the runner which engine to use,
+which directory to run in, and whether to auto-approve:
+
+```md
+---
+engine: devin          # devin | claude | codex | copilot  (default: $AGENT_QUEUE_ENGINE)
+cwd: /abs/path/to/repo # where the agent executes   (default: cwd when added)
+yolo: true             # auto-approve ALL tools      (default: true)
+lock: my-repo          # optional mutex key (default: cwd). Jobs sharing a key run serially
+timeout: 45m           # optional. 90s|45m|2h|1d. On expiry → failed (result=timeout)
+verify: pnpm -s test   # optional auto-QA gate. Runs in cwd after rc=0:
+                       #   pass → testing/ (QA),  fail → failed/
+                       #   (omit to park in review/ for manual promote)
+---
+
+# Your task / roadmap goes here
+...
+```
+
+`add --engine/--cwd/--yolo` will inject this frontmatter for you if the file doesn't
+already have a `---` block.
+
+### Manifest fields (Gigafactory Phase 1)
+
+The runner parses the richer [gigafactory manifest](docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md#5-the-evolved-job-manifest-feature)
+**backward-compatibly** — a legacy `engine`/`cwd`/`yolo`-only `.md` behaves exactly as before.
+Fields marked **RESERVED** are parsed, stored in `.state/<job>.meta`, and shown in `status`, but
+are otherwise **no-ops until a later phase** (they do not yet affect execution).
+
+| Field | Status | Default | Meaning |
+| ----- | ------ | ------- | ------- |
+| `engine` | active | `$AGENT_QUEUE_ENGINE` | explicit engine (`devin\|claude\|codex\|copilot`) — always wins over `engine-class` |
+| `cwd` / `yolo` / `lock` / `timeout` / `verify` | active | see above | Phase-0 behavior, unchanged |
+| `priority` | **active** | `medium` | `critical\|high\|medium\|low`. Inbox is picked **highest-priority first, then oldest** (was pure FIFO) |
+| `engine-class` | **active** | _(none)_ | used only when `engine` is unset: `agentic-coder`→`devin,claude,codex`; `chat-coder`→`copilot`. Picks the first **available** engine. No engine available → job fails `result=no_engine` |
+| `prefers-engine` | **active** | _(none)_ | optional order hint for `engine-class` resolution, e.g. `[claude, devin]` |
+| `capabilities` | **active** | _(none)_ | hard host requirements, e.g. `[os:any, node>=20, has:git]`. If the host can't satisfy them the job is sent to `failed/` with `result=capability_mismatch` **and the agent is never launched** (grammar below) |
+| `idempotency-key` | **active** | _(none)_ | dedupe on `add` (semantics below) |
+| `profile` | **active** | _(none)_ | inherit persona + verify/caps/engine-class/prefers-engine/allowed-scope/review-policy from `profiles/<name>.md` (job fields override — see **Profiles**) |
+| `prefers` | RESERVED | _(none)_ | soft routing/affinity hints (e.g. `[factory:mac-2]`) |
+| `budget` | RESERVED | _(none)_ | `{ usd, tokens, wall }` ceilings (`wall` enforcement is a later slice) |
+| `deps` / `deps-mode` | **active** | _(none)_ | block until each referenced `idempotency-key` is in `shipped/` (or `testing/` when `deps-mode: soft`). Submit-time cycle detection (see **Profiles & deps**) |
+| `retry` | **active** | _(none)_ | `{ max: N, backoff: 5m, on: [timeout, verify_failed, crash] }` — requeue failures with backoff up to `max`, then `retries_exhausted` (see **Resilience**) |
+| `review-policy` | RESERVED | _(none)_ | `auto\|manual\|reviewers:[…]` |
+| `artifacts` | RESERVED | _(none)_ | extra outputs to capture (coverage, screenshots) |
+| `tracker-item` | RESERVED | _(none)_ | link back to the originating tracker task |
+
+**Capability grammar** (a job matches a host iff **every** required token is satisfied):
+
+| Token form | Example | Satisfied when |
+| ---------- | ------- | -------------- |
+| `key` (bare presence) | `gpu` | the host advertises `key` in any form |
+| `key:value` (exact) | `os:mac`, `engine:devin`, `has:git` | the host advertises that exact token |
+| `key:any` (wildcard) | `os:any` | the host advertises any `key:*` (so `os:any` matches every host) |
+| `key<op>version` (`>=` `>` `=` `<=` `<`) | `node>=20` | numeric/semver-major compare vs the host's `key:<n>` |
+
+The host advertises (via `detect_capabilities`): `os:<mac\|linux>`, `engine:<each available engine>`,
+`node:<major>`, and `has:<git\|pnpm\|docker>` when present.
+
+**`idempotency-key` semantics** (on `add`, hashing the frontmatter-stripped body):
+
+- same key **+ same body** → **no-op** (logged `duplicate, skipped`).
+- same key **+ different body**, prior job still in `inbox/` → **supersedes** it (replaces the queued file).
+- same key **+ different body**, prior job already past `inbox/` (building/review/testing/shipped) →
+  **rejected** with a clear error (use a new key, or requeue the existing job).
+
+## Engine mapping
+
+| `engine:` | Command run | Auto-approve flag (`yolo: true`) |
+| --------- | ----------- | -------------------------------- |
+| `devin`   | `devin -p --prompt-file <body>` | `--permission-mode dangerous` |
+| `claude`  | `claude -p` (body on **stdin**) | `--dangerously-skip-permissions` |
+| `codex`   | `codex exec` (body on **stdin**) | `--dangerously-bypass-approvals-and-sandbox` |
+| `copilot` | `copilot -p` (body on **stdin**) | `--allow-all-tools` _(best-effort; chat-coder class target)_ |
+
+The frontmatter is **stripped** before the body reaches the agent, and
+claude/codex receive it on **stdin** so a body starting with `--` is never
+misparsed as a flag.
+
+> Flags drift between CLI versions — if one changes, edit `build_agent_cmd()` in
+> `agent-queue.sh` (it's the single place each engine is mapped).
+
+## Commands
+
+| Command | What it does |
+| ------- | ------------ |
+| `init` | create the `queue/` folders |
+| `add <file> [--engine E] [--cwd P] [--yolo\|--no-yolo]` | queue a prompt into `inbox/` |
+| `run [--max N] [--engine E] [--once]` | process the inbox (foreground loop) |
+| `status` | kanban counts + running-worker table (marks `⚠ stalled`; per-job insights sub-line) |
+| `watch [interval]` | live `status` (bash), redrawn every N seconds (default 2) |
+| `insights [job]` | per-job metrics, or a recent-jobs table + per-engine token/cost/success rollup (see **Insights**) |
+| `recover` | reclaim orphaned `building/` jobs (dead worker) back to `inbox/` (see **Resilience**) |
+| `dash [--interval N]` | **interactive Node dashboard** — navigable numbered job list with single-key actions (see below) |
+| `stop` | kill running workers + the run loop |
+| `logs <job> [-f]` | print / follow a job's log |
+| `promote <job>` | advance one stage forward: `review → testing → shipped` |
+| `ship <job>` | **manual gate:** move a `testing/` (QA) job → `shipped/` |
+| `reject <job>` | send a `review/` or `testing/` job → `failed/` |
+| `requeue <job>` | move a `failed`/`review`/`testing` job back to `inbox/` for a fresh run |
+| `clean [--keep N]` | archive finished logs+meta beyond the newest N (default 50) into `queue/.archive/` |
+
+Only one `run` loop may be active per queue — a second `run` against the same
+queue is refused while the first is alive (a stale `daemon.pid` is cleared).
+
+### Interactive dashboard (`dash`)
+
+`dash` is a single-script, menu-driven control panel (think a tiny "glassbox").
+It shows the kanban counts, live RUNNING workers (engine, elapsed, last log
+line, stall), a **navigable numbered JOBS list**, and RECENT finished jobs — and
+lets you act on jobs without leaving the screen. Every action shells out to
+`agent-queue.sh`, so the script stays the single source of truth.
+
+| Key | Action |
+| --- | ------ |
+| `↑`/`↓`, `j`/`k`, `1`–`9` | select a job in the JOBS list |
+| `enter` / `l` | view the selected job's log (live, auto-refreshing) |
+| `p` | promote (`review → testing → shipped`) |
+| `s` | ship (`testing`/QA → `shipped`, the manual gate) |
+| `x` | reject (`review`/`testing` → `failed`) — asks `y/n` |
+| `u` | requeue (`failed`/`review`/`testing` → `inbox`) — asks `y/n` |
+| `r` | start the `run` loop (detached → `logs/run-loop.log`) |
+| `S` | stop the run loop + running workers |
+| `g` | refresh now · `?`/`h` help · `q`/`Ctrl-C` quit |
+
+The header shows a `● run loop pid N` / `○ run loop stopped` indicator. Run `dash` in a TTY
+for interactive mode; when piped or run without a TTY, it falls back to a read-only live view.
+
+## Via `bytelyst-cli.sh`
+
+Wired into the repo's unified CLI (no GitHub token required for this subcommand):
+
+```bash
+./bytelyst-cli.sh agent-queue run --max 3     # full passthrough
+./bytelyst-cli.sh aq status                   # short alias
+```
+
+## Boot-persistence (auto-start on login)
+
+To run the worker non-stop and survive **reboot / crash / logout** (not just a
+closed terminal), install the macOS LaunchAgent — it auto-starts `agent-queue run`
+on login under `caffeinate` and restarts it via `KeepAlive`:
+
+```bash
+bash launchd/install.sh             # install + start now
+bash launchd/install.sh --uninstall # stop + remove
+```
+
+Override engine/concurrency/secrets in `~/.agent-queue.env` (e.g.
+`AGENT_QUEUE_ENGINE=codex`, `AGENT_QUEUE_MAX=1`). See [`launchd/README.md`](launchd/README.md)
+for the full layer comparison (tmux/caffeinate vs LaunchAgent) and gotchas.
+
+## Folder layout
+
+```
+queue/
+  inbox/    # drop / queued .md files (highest-priority eligible first, then oldest)
+  building/ # currently executing (agent running)
+  review/   # agent exited 0 — awaiting the auto-QA verify gate (or manual promote)
+  testing/  # verify passed (QA) — awaiting manual `ship`
+  shipped/  # manually shipped — the terminal success stage
+  failed/   # non-zero exit, bad cwd, timeout, verify failure, or manual reject
+  logs/     # <job>.log — full agent + verify output
+  locks/    # per-key flock files (Linux hardening; unused on macOS)
+  .state/   # <job>.meta heartbeats + daemon.pid (runtime only)
+  .archive/ # <ts>/ — logs+meta moved here by `clean`
+```
+
+**`result=` values** written to `<job>.meta`: `review`, `testing`, `shipped`,
+`failed`, `timeout`, `verify_failed`, `rejected`, `requeued`, `capability_mismatch`
+(host missing a required capability — agent never launched), `no_engine`
+(an `engine-class` had no available engine), `retries_exhausted` (failed after
+`retry.max` attempts — single-host dead-letter stand-in), `retry_scheduled`
+(transient: requeued for another attempt), `recovered` (transient: an orphan was reclaimed
+to `inbox/`), and — fleet mode only — `fenced_quarantine` (stale lease; see **Fleet integration**).
+
+## Profiles & deps
+
+### Profiles (roadmap §6)
+
+A **profile** is a reusable role preset in `profiles/<name>.md`. A job opts in with
+`profile: <name>` and inherits any of these fields it does **not** set itself:
+`verify` (from the profile's `default-verify`), `capabilities`, `engine-class`,
+`prefers-engine`, `allowed-scope`, `review-policy`. The profile's `persona` block is
+**prepended** to the body sent to the engine (the job `.md` on disk is unchanged;
+secrets are never logged). Resolution runs **before** the capability gate and engine
+resolution, so inherited caps / engine-class take effect.
+
+**Precedence:** `job field > profile field > built-in default`. Set `AGENT_QUEUE_PROFILES`
+to point at a different catalog directory (defaults to `./profiles`).
+
+Starter catalog: `developer`, `backend-engineer`, `frontend-engineer`, `ux-designer`,
+`ui-designer`, `qa`, `reviewer`, `docs-writer`, and a reserved `planner`. Each profile presets
+`name`, `persona`, `capabilities`, `default-verify`, `engine-class`, `prefers-engine`,
+`allowed-scope`, and `review-policy`.
+
+**allowed-scope (warn-only this phase).** After a run on a git `cwd`, changed paths
+outside the profile/job `allowed-scope` globs (`dir/**` matches the whole subtree) are
+logged as a `WARNING` and recorded as `scope_warning=` in the meta — **non-blocking**
+(the job is not failed). `path_in_scope` is exposed as a unit-testable function.
+
+### deps / DAG, single host (roadmap §5)
+
+`deps: [keyA, keyB]` references other jobs by their author-controlled
+`idempotency-key`. A dep is **satisfied** when a job with that key is in `shipped/`
+(default), or in `shipped/` **or** `testing/` when the dependent job sets
+`deps-mode: soft`. A job with unmet deps is **blocked**: it is skipped in inbox
+selection (never launched, never failed) and surfaced in `status` as
+`blocked (waiting on: <keys>)`, then re-evaluated every loop until its deps are met.
+`add` performs **submit-time cycle detection** over the inbox + active-stage dep graph
+and rejects (nonzero exit) a job that would create a cycle. Cross-machine deps are P2.
+
+## Resilience (crash recovery & work preservation)
+
+Single-host implementations of the durability model (roadmap §25):
+
+- **Orphan recovery.** A job left in `building/` whose worker process is dead (no
+  live `pid`, PID-reuse-guarded by `pidstart`) is an orphan from a previous
+  crash/power-off. On `run` startup and on every loop iteration (or on demand via
+  `agent-queue.sh recover`) it is moved back to `inbox/` with `attempts`
+  incremented. Recovery is **idempotent** — once moved out of `building/` it is
+  never recovered twice.
+- **WIP checkpointing.** When a job's `cwd` is a git repo, the worker creates/checks
+  out a dedicated branch **`aq/wip/<job>`** at start and commits any changes to it
+  on **every** exit path — success, failure, timeout, and SIGTERM/SIGINT (via a
+  trap). It **never** commits to `main`/your current branch. Non-git `cwd` is
+  skipped cleanly. `wip_branch` / `wip_base` / `wip_commit` are recorded in the meta.
+- **Resume.** When an orphaned or retried job is relaunched and its `aq/wip/<job>`
+  branch already exists, that branch is checked out first so the agent **continues
+  from the checkpoint** instead of from zero.
+- **Retry policy** (`retry` frontmatter, now active). On a failure whose class is in
+  `on` (`crash`/`agent_error` for a non-zero agent exit, `timeout`, `verify_failed`)
+  the job is requeued to `inbox/` honoring `backoff` (selection skips it until
+  `next_eligible`) up to `max` attempts; on exhaustion it lands in `failed/` with
+  `result=retries_exhausted`, preserving the WIP branch + full log. No `retry` =
+  no retry (Phase-0 behavior).
+
+All bookkeeping (`attempts`, `next_eligible`, `wip_*`) is append-only in the meta
+and re-derivable from the meta + folder location, so recovery is crash-safe.
+
+## Insights (metrics & token accounting)
+
+Each finished run records into `<job>.meta`: `duration_s`, `exit`, `result`,
+`attempts`, and — for a git `cwd` — `files_changed` / `lines_added` /
+`lines_deleted` (diffed `wip_base..HEAD`). A single `parse_usage <engine> <log>`
+adapter extracts `model` / `tokens_in` / `tokens_out` / `tokens_cached` /
+`cost_usd` / `turns` / `tool_calls` when the engine exposes them.
+
+```bash
+agent-queue.sh insights <job>   # full metrics for one job
+agent-queue.sh insights         # recent-jobs table + per-engine rollup
+```
+
+> **Token caveat (honest):** real usage is captured only where the engine surfaces
+> it. A cooperating wrapper may emit a machine-readable `AQ_USAGE key=value …` line;
+> otherwise per-engine heuristics apply (Claude/Codex token fields parsed; Devin
+> session metrics + Copilot are API-only and currently TODO in `parse_usage`). When
+> a value is not provider-reported it is **omitted or flagged `usage_estimated`** —
+> numbers are never fabricated. The per-engine rollup marks totals that include any
+> estimated value with `*`.
+
+## Tracker integration (§10)
+
+Closes the task ↔ job round-trip against the platform-service **items API**: a
+tracker Item can become a job, and a job's outcome echoes back to the Item.
+
+```bash
+agent-queue.sh from-tracker <ITEM_ID>   # pull an Item -> materialize a job in inbox/
+agent-queue.sh to-tracker <job>         # echo the job's current outcome to its Item
+```
+
+All HTTP goes through one curl wrapper (`tracker_api`); there are no other network
+calls. Real use needs **platform-service running and a bearer token**.
+
+### Config (env)
+
+| Var | Default | Meaning |
+| --- | ------- | ------- |
+| `AQ_TRACKER_API` | `http://localhost:4003` | base URL of the items API (routes live under `/api`) |
+| `AQ_TRACKER_TOKEN` | _(none)_ | bearer token — **required** for real calls; never hardcode |
+| `AQ_PRODUCT_ID` | _(none)_ | productId (sent as `X-Product-Id`; every Item has one) |
+| `AQ_TRACKER_CWD` | `$PWD` | cwd a tracker-derived job runs in (Items carry no cwd) |
+| `AQ_TRACKER_AUTO` | `0` | `1` = auto-echo on each transition (default OFF — echo is manual) |
+| `AQ_TRACKER_STATUS_INPROGRESS` / `_DONE` / `_FAILED` | `in_progress` / `done` / `wont_fix` | Item status per bucket (the API has no blocked/failed status) |
+| `AQ_TRACKER_API_CMD` | _(none)_ | test seam: a stub that replaces the curl HTTP entirely (selftest uses it) |
+
+### `from-tracker` — Item → job
+
+`GET /api/items/<id>`, then maps fields to job frontmatter:
+
+| Item | Job |
+| ---- | --- |
+| `title` + `description` | job body (verbatim instruction markdown) |
+| `id` | `tracker-item: <id>` and `idempotency-key: tracker-<id>` (stable) |
+| `priority` | `priority:` (label overrides; else Item priority; else `medium`) |
+| label `engine-class:<x>` | `engine-class: <x>` |
+| label `profile:<x>` | `profile: <x>` |
+| label `priority:<x>` | `priority: <x>` |
+| label `cap:<token>` | a `capabilities: [...]` entry |
+
+Idempotent on the derived `idempotency-key` (Slice 1 dedupe) — pulling the same
+Item twice never enqueues a duplicate.
+
+### `to-tracker` — job → Item (one-way echo, §24.5)
+
+Only if the job's meta has a `tracker-item`. Maps the job's stage/result to an Item
+status and `PATCH /api/items/<id>/status`, then `POST /api/items/<id>/comments`
+with a **metrics-only** summary (result, attempts, duration, tokens/cost, +/- lines —
+**never prompt content or secrets**):
+
+| job result/stage | Item status |
+| ---------------- | ----------- |
+| building / review / testing / recovered | `in_progress` |
+| shipped | `done` |
+| failed / timeout / verify_failed / retries_exhausted / capability_mismatch / no_engine / rejected | `wont_fix` (override via `AQ_TRACKER_STATUS_FAILED`) |
+
+Idempotent via `tracker_echoed` in the meta (re-echoing an unchanged outcome is a
+no-op). The echo is **one-way** (child → tracker) and **never authoritative for
+execution**: an echo failure is logged and the job continues unchanged. With
+`AQ_TRACKER_AUTO=1` the worker echoes automatically on each transition; otherwise
+echo is manual. `status` / `insights` surface the `tracker-item` and last echoed status.
+
+## Fleet integration (Phase 2)
+
+Behind the `AQ_FLEET` flag, the runner becomes a **factory** that registers,
+heartbeats, claims, and reports against the platform-service `fleet` coordinator —
+so coordinator jobs run alongside local `.md` files on the same host. All
+coordinator logic lives in [`lib/fleet-client.sh`](lib/fleet-client.sh) (curl-only +
+POSIX awk, sourced by `agent-queue.sh`); the few hook points in the runner are all
+gated on `fleet_enabled`.
+
+> **Offline vs fleet mode.** With `AQ_FLEET` unset/`0` (the default) the runner is
+> the pure offline git-queue described above — **zero** coordinator calls, behavior
+> byte-for-byte unchanged. With `AQ_FLEET=1` the run loop also registers + claims
+> from the coordinator, reports fenced stage transitions, renews leases, and
+> routes the outcome echo through the coordinator's `fleet_events`
+> instead of the direct tracker echo. The tracker echo remains the offline path.
+
+```bash
+AQ_FLEET=1 AQ_FLEET_TOKEN=… AQ_PRODUCT_ID=… agent-queue.sh fleet-status   # register + show identity
+AQ_FLEET=1 AQ_FLEET_TOKEN=… AQ_PRODUCT_ID=… agent-queue.sh run            # claim + execute coordinator jobs
+```
+
+### Config (env)
+
+| Var | Default | Meaning |
+| --- | ------- | ------- |
+| `AQ_FLEET` | `0` | master switch — `1` enables coordinator integration; `0`/unset = offline git-queue (zero coordinator calls) |
+| `AQ_FLEET_ROUTE` | `1` | `route_via_service`: `1` = coordinator is authoritative for claim (P2-S3 behavior); `0` = local inbox authoritative (coordinator not used to source work) |
+| `AQ_FLEET_AUTOSHIP` | `0` | `1` = when the local verify gate passes, advance the coordinator job `testing → shipped` (the factory's verify **is** the test phase); `0` = report `testing` and rest for the human review gate |
+| `AQ_FLEET_PR` | `0` | `1` = for a job carrying a `repo`, run the agent in an isolated checkout on branch `aq/job/<id>`, then commit/push and `gh pr create`; the PR URL is reported back and recorded on the run |
+| `AQ_FLEET_REPOS_DIR` | `.state/repos` | cache dir for PR-mode repo checkouts (one per repo) |
+| `AQ_FLEET_REPO_BASE` | _(none)_ | base dir of existing local repos; a job `repo` matching `<base>/<repo>` is cloned from there (fast, no network) and PRs are pushed to its GitHub origin (embedded creds stripped) |
+| `GH_BIN` | `gh` | GitHub CLI used to open PRs in PR mode |
+| `AQ_FLEET_SHADOW` | `0` | shadow/dual-run: `1` (requires `AQ_FLEET=1` + `AQ_FLEET_ROUTE=0`) queries the coordinator in parallel and records divergence, **never acting on it** |
+| `AQ_FLEET_SHADOW_FACTORY_ID` | `<factory>-shadow` | isolated id used for the read-only shadow claim (never the real factory id) |
+| `AQ_FLEET_SHADOW_LOG` | `.state/fleet-shadow.log` | structured shadow-divergence log (`ts⇥localJob⇥coordJob⇥verdict`) |
+| `AQ_FLEET_API` | `http://localhost:4003/api` | coordinator base URL (already includes `/api`) |
+| `AQ_FLEET_TOKEN` | _(none)_ | bearer token — never hardcode |
+| `AQ_PRODUCT_ID` | _(none)_ | productId (sent as `X-Product-Id`; shared with the tracker config) |
+| `AQ_FACTORY_ID` | `<hostname>-<pid>` | stable factory identity for this process |
+| `AQ_FLEET_LEASE_RENEW_SEC` | `300` | heartbeat / lease-renew cadence |
+| `AQ_FLEET_CAPS` | _(auto)_ | override the auto-detected capability tokens (comma/space list) |
+| `AQ_FLEET_CWD` | `$PWD` | cwd a claimed coordinator job runs in |
+| `AQ_FLEET_API_CMD` | _(none)_ | test seam: a stub that replaces the curl HTTP entirely (selftest uses it) |
+
+### Protocol (claim / heartbeat / report / fence / renew)
+
+- **register / heartbeat:** `POST /fleet/factories/heartbeat {factoryId, capabilities[], health, load}` — registration *is* the first heartbeat; re-sent on `AQ_FLEET_LEASE_RENEW_SEC` cadence.
+- **claim:** `POST /fleet/claim {factoryId, capabilities[], leaseSeconds}`. A returned job (`id`, `bodyMd`, `leaseEpoch`) is materialized as a transient local `.md` (frontmatter `fleet-job-id` + `fleet-lease-epoch`) so the existing runner executes it unchanged, interleaved with local files.
+- **report (fenced):** each stage transition (`building`/`review`/`testing`/`shipped`/`failed`) is `PATCH /fleet/jobs/:id {stage, leaseEpoch, checkpoint?}`. The coordinator writes `fleet_events` server-side. The payload carries only stage/epoch/checkpoint — **never** the prompt/`bodyMd` or token.
+- **fencing (§18):** if a report/renew returns **conflict/409** (stale `leaseEpoch` → the coordinator reclaimed us), the worker **self-aborts**: it stops, does **not** ship/merge, and **quarantines** the local result to `failed/` (`result=fenced_quarantine`) for human triage. A reclaimed zombie can never corrupt coordinator state.
+- **lease renew / release:** `POST /fleet/jobs/:id/lease/renew` while building (fenced); `…/lease/release` on terminal stages.
+- **checkpoint:** the WIP `{wipBranch, wipCommit}` is sent with the building report so a reclaim can resume (§25).
+
+### Offline-degrade + quarantine (§9)
+
+If the coordinator is **unreachable** mid-job (5xx / connection error), the report
+is treated as *degraded* (logged, `fleet_degraded=1`): the in-flight job **finishes
+locally** rather than being abandoned. On the next reachable call the worker
+presents its `leaseEpoch`; if the coordinator now reports it **stale** (it was
+reclaimed during the outage), the local result is **quarantined** (marked, not
+auto-shipped) and surfaced for human triage — split-brain is resolved in favor of
+the coordinator without losing the work. `status` shows the factory id + per-job
+`fleet=<id>@e<epoch>`; `insights` lists the `fleet_*` fields.
+
+### Feature flags + shadow / dual-run (Slice 4, §16/§27)
+
+Three explicit, independently-toggleable levels gate the coordinator — a safe,
+reversible path to validate the fleet coordinator against the proven single-host
+(P1) behavior **before** any real cutover:
+
+| Flag | Effect |
+| ---- | ------ |
+| `AQ_FLEET=0` | **Pure offline.** Zero coordinator calls (including shadow). Offline git-queue path is byte-for-byte unchanged. |
+| `AQ_FLEET_ROUTE=1` (default) | **route_via_service** — the coordinator is *authoritative* for claim/assignment (today's P2-S3 behavior). |
+| `AQ_FLEET_ROUTE=0` | **Local inbox authoritative** — the coordinator is *not* used to source work (the pre-cutover state). |
+| `AQ_FLEET_AUTOSHIP=1` | **Autonomous ship.** On a passing local verify, advance the coordinator job `testing → shipped` (closing the `testing → shipped` gap). Default `0` reports `testing` and leaves the job for the human review gate / `ship` operator action. |
+| `AQ_FLEET_SHADOW=1` | **Shadow / dual-run** (requires `AQ_FLEET=1` **and** `AQ_FLEET_ROUTE=0`): run the offline path as authoritative **and** query the coordinator in parallel, recording divergence **without acting on it**. |
+
+**Precedence.** Shadow is only meaningful when `ROUTE=0`. If both `AQ_FLEET_ROUTE=1`
+and `AQ_FLEET_SHADOW=1` are set, **ROUTE wins** and shadow is disabled (a one-shot
+warning is logged) — you never route *and* shadow at the same time.
+
+**Side-effect-free by construction.** Shadow **never** ships, quarantines, or
+mutates real job state. `fleet_shadow_claim` asks the coordinator what it *would*
+assign using an **isolated `-shadow` factoryId** + `"dryRun":true,"shadow":true`;
+if a coordinator without dry-run support actually assigned, the lease is **released
+immediately** so no real assignment persists. The would-be job is never
+materialized, run, or shipped locally. `fleet_shadow_report` mirrors the local
+stage as a shadow event (`"shadow":true`) purely to exercise reporting — the
+coordinator response is logged but **never acted on** (no fence/quarantine).
+
+Each iteration `fleet_shadow_compare` classifies the local (authoritative) decision
+vs the coordinator's would-be decision as **AGREE / DIVERGE / COORD_EMPTY /
+LOCAL_EMPTY** and appends a line to the shadow log. Summarize it any time:
+
+```bash
+agent-queue.sh fleet-shadow-report        # per-verdict counts + agreement rate + recent divergences
+agent-queue.sh fleet-shadow-report 25     # last 25 divergence/error events
+agent-queue.sh status                     # surfaces the three flags' resolved state
+```
+
+**Cutover ladder (rollback at any step):**
+
+1. **Observe (zero risk):** `AQ_FLEET=1 AQ_FLEET_ROUTE=0 AQ_FLEET_SHADOW=1 run` —
+   the local path stays authoritative; the coordinator is only shadowed.
+2. **Inspect agreement:** `fleet-shadow-report` — drive `AGREEMENT` toward 100%,
+   investigating each `DIVERGE`.
+3. **Cut over:** once agreement is high, flip `AQ_FLEET_ROUTE=1` (coordinator
+   becomes authoritative).
+4. **Rollback:** set `AQ_FLEET_ROUTE=0` (and/or `AQ_FLEET=0`) at any time — instant
+   return to the local/offline path, no data migration.
+
+## Config (env overrides)
+
+| Var | Default | Meaning |
+| --- | ------- | ------- |
+| `AGENT_QUEUE_ROOT` | `./queue` | where the kanban folders live |
+| `AGENT_QUEUE_MAX` | `3` | max concurrent agents (override per-run with `run --max N`) |
+| `AGENT_QUEUE_ENGINE` | `devin` | default engine when none in frontmatter |
+| `AGENT_QUEUE_POLL` | `3` | inbox poll interval (seconds) |
+| `AGENT_QUEUE_VERIFY` | _(empty)_ | default auto-QA verify command; per-job `verify:` overrides it |
+| `AGENT_QUEUE_STALL_MIN` | `10` | minutes of unchanged log before a worker is `⚠ stalled` |
+| `DEVIN_BIN` / `CLAUDE_BIN` / `CODEX_BIN` / `COPILOT_BIN` | autodetected | override CLI binary paths |
+| `FLOCK_BIN` / `TIMEOUT_BIN` | autodetected | `flock` (lock hardening) and `timeout`/`gtimeout` (hard timeouts); absent on stock macOS — see notes |
+
+## ⚠️ Safety
+
+Running agents with `yolo: true` means **no approval prompts** — they will edit files,
+run shell commands, and commit unattended. Mitigate:
+
+- Prefer **scope-locked** prompt files (e.g. "edit only under `dashboards/tracker-web/`").
+- Instruct agents (in the prompt) **not to `git push`** — review commits before they leave your machine.
+- **Same-repo safety is automatic:** jobs sharing a `cwd` (or `lock:` key) are
+  serialized, so two agents never run in one repo at once — even at `--max 2+`.
+- Set a `timeout:` on long jobs so a wedged agent can't run forever.
+- Watch cost: each job is a full agent session.
+
+### Portability notes
+
+- **macOS** has no `flock`/`timeout`; locking relies on the single run-loop
+  (enforced by the second-run refusal) and timeouts use a pure-bash watchdog.
+  Install coreutils (`gtimeout`) for hard process-tree kills.
+- **Linux** (incl. Gitea CI) uses `flock` + `timeout` for cross-process hardening.
+
+## Roadmap / nice-to-haves
+
+- [x] Per-repo lock to serialize same-repo jobs automatically (`lock:` / cwd).
+- [x] Per-job `timeout:` with hard kill (or bash watchdog fallback).
+- [x] Stall detection in `status`/`dash`.
+- [x] `requeue` failed jobs + `clean`/archive old runs.
+- [x] Build/ship lifecycle: `review → testing → shipped` with auto-QA `verify:` gate + manual `ship`.
+- [ ] `--push` opt-in policy + commit review gate.
+- [ ] Optional notifications (Slack/desktop) on done/failed/stall.
+- [ ] Persisted run-loop as a daemon/service with auto-restart (macOS is already covered by the LaunchAgent — see **Boot-persistence**).
--- a/agent-queue/agent-queue-boot.sh
+++ b/agent-queue/agent-queue-boot.sh
@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+#
+# agent-queue-boot.sh — boot/login entrypoint for the agent-queue run loop.
+#
+# Launched by the macOS LaunchAgent (see launchd/) so the folder-kanban worker
+# auto-starts on login AND survives reboot/crash (LaunchAgent KeepAlive). This is
+# the reboot-persistence layer that tmux + caffeinate alone cannot provide.
+#
+# It does three things launchd's minimal environment needs:
+#   1. Repairs PATH so the agent CLIs (codex/devin/claude) + caffeinate are found.
+#   2. Loads optional overrides from ~/.agent-queue.env (auto-exported so they
+#      reach the run loop even when the file uses plain VAR=value lines).
+#   3. Wraps `agent-queue run` in caffeinate (macOS) so the Mac won't sleep while
+#      a job is running. NOTE: because the run loop is long-lived, this keeps the
+#      machine awake for as long as the LaunchAgent runs — intended for a dedicated
+#      overnight runner. Set AGENT_QUEUE_NO_CAFFEINATE=1 to allow idle sleep.
+#
+set -uo pipefail
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd -P)"
+# There is no `set -e`, so a failed cd above would leave SCRIPT_DIR empty and
+# the final exec would target "/agent-queue.sh". Fail loudly instead.
+if [ -z "$SCRIPT_DIR" ]; then
+  echo "[agent-queue-boot] cannot resolve the directory of ${BASH_SOURCE[0]}" >&2
+  exit 1
+fi
+
+# launchd hands processes a bare PATH — prepend the usual CLI install locations
+# (Homebrew arm64/intel, ~/.local/bin for devin, system dirs) ahead of it.
+export PATH="$HOME/.local/bin:/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:${PATH:-}"
+
+# Optional per-machine overrides (engine, concurrency, tokens, NETWORK, etc.).
+# This file is NOT tracked — keep secrets/host-specific config here.
+# `set -a` marks every variable the file assigns for export; without it a plain
+# `AGENT_QUEUE_MAX=5` line would be visible to this script's log line below but
+# not to the exec'd agent-queue.sh. Files that already use `export` are unaffected.
+if [ -f "$HOME/.agent-queue.env" ]; then
+  set -a
+  # shellcheck disable=SC1091
+  . "$HOME/.agent-queue.env"
+  set +a
+fi
+
+# Recommended default for a local monorepo overnight runner (see long-running-jobs
+# cheat sheet): codex runs in-repo so @bytelyst/* workspace links resolve locally.
+: "${AGENT_QUEUE_ENGINE:=codex}"
+export AGENT_QUEUE_ENGINE
+
+echo "[agent-queue-boot] $(date '+%Y-%m-%d %H:%M:%S') starting run loop" \
+     "(engine=$AGENT_QUEUE_ENGINE, max=${AGENT_QUEUE_MAX:-3})"
+
+# Keep the Mac awake for the lifetime of the loop unless explicitly opted out.
+# Kept as a plain string (not an array): expanding an empty array under `set -u`
+# is an error on the bash 3.2 that stock macOS ships.
+keep=""
+if [ "${AGENT_QUEUE_NO_CAFFEINATE:-0}" != "1" ] && command -v caffeinate >/dev/null 2>&1; then
+  keep="caffeinate -dimsu"
+fi
+
+# exec so the LaunchAgent tracks the real worker PID (clean KeepAlive restarts).
+# $keep is intentionally unquoted so it splits into command + flags (or vanishes).
+# shellcheck disable=SC2086
+exec $keep "$SCRIPT_DIR/agent-queue.sh" run
--- a/agent-queue/agent-queue.sh
+++ b/agent-queue/agent-queue.sh
--- a/agent-queue/dashboard.mjs
+++ b/agent-queue/dashboard.mjs
@ -0,0 +1,769 @@
+#!/usr/bin/env node
+// agent-queue dashboard — a zero-dependency, INTERACTIVE TUI for the folder queue.
+//
+// Reads the same queue/ state written by agent-queue.sh and re-renders a board
+// every interval: kanban counts, running workers (engine, elapsed, last log line),
+// and a navigable, numbered job list you can act on without leaving the screen.
+//
+// Lifecycle: inbox → building → review → testing → shipped   (+ failed)
+//
+// Interactive keys (when run in a TTY):
+//   ↑/↓ or j/k or 1-9   select a job        enter / l   view its log
+//   p promote           s ship (testing→shipped)        x reject
+//   u requeue           r run loop           S stop      g refresh now
+//   ? help              q / Ctrl-C quit
+// All actions shell out to agent-queue.sh — it stays the single source of truth.
+//
+// Usage:  node dashboard.mjs [--interval 2] [--root /path/to/queue]
+//         AGENT_QUEUE_ROOT=/path node dashboard.mjs
+//         AQ_TRACKER_WEB=https://tracker.example.com node dashboard.mjs
+//           (makes job tracker-item tags clickable terminal hyperlinks)
+
+import fs from 'node:fs';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { execFileSync, spawn } from 'node:child_process';
+import { fleetConfig, fetchBoard, fetchEvents, jobAction } from './lib/fleet-dash.mjs';
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+// ── args / config ───────────────────────────────────────────────────
+const argv = process.argv.slice(2);
+const getArg = (flag, def) => {
+  const i = argv.indexOf(flag);
+  return i !== -1 && argv[i + 1] ? argv[i + 1] : def;
+};
+const ROOT = path.resolve(getArg('--root', process.env.AGENT_QUEUE_ROOT || path.join(__dirname, 'queue')));
+const INTERVAL = Math.max(1, parseInt(getArg('--interval', '2'), 10)) * 1000;
+// A running worker is flagged stalled if its log has not changed in this many minutes.
+const STALL_MIN = Math.max(1, parseInt(process.env.AGENT_QUEUE_STALL_MIN || '10', 10));
+
+// Absolute paths of the six stage folders plus the logs and .state folders,
+// all resolved directly under ROOT.
+const DIRS = {
+  inbox: path.join(ROOT, 'inbox'),
+  building: path.join(ROOT, 'building'),
+  review: path.join(ROOT, 'review'),
+  testing: path.join(ROOT, 'testing'),
+  shipped: path.join(ROOT, 'shipped'),
+  failed: path.join(ROOT, 'failed'),
+  logs: path.join(ROOT, 'logs'),
+  state: path.join(ROOT, '.state'),
+};
+
+// ── ansi ────────────────────────────────────────────────────────────
+// SGR escape sequences keyed by style name; `reset` clears all attributes.
+const C = {
+  reset: '\x1b[0m', dim: '\x1b[2m', bold: '\x1b[1m',
+  red: '\x1b[31m', green: '\x1b[32m', yellow: '\x1b[33m',
+  blue: '\x1b[34m', cyan: '\x1b[36m', gray: '\x1b[90m',
+};
+// c(col, s) — wrap `s` in the escape for style `col` (a key of C), then reset.
+const c = (col, s) => `${C[col]}${s}${C.reset}`;
+
+// ── helpers ─────────────────────────────────────────────────────────
+const listMd = (dir) => {
+  try { return fs.readdirSync(dir).filter((f) => f.endsWith('.md')); }
+  catch { return []; }
+};
+const count = (dir) => listMd(dir).length;
+
+const parseMeta = (file) => {
+  const out = {};
+  try {
+    for (const line of fs.readFileSync(file, 'utf8').split('\n')) {
+      const i = line.indexOf('=');
+      if (i > 0) out[line.slice(0, i)] = line.slice(i + 1);
+    }
+  } catch { /* ignore */ }
+  return out;
+};
+
+// Compact per-job insights (read-only from meta; agent-queue.sh is the source of
+// truth). Surfaces tokens or cost + attempts + line deltas for finished jobs.
+const insightsTag = (m) => {
+  const parts = [];
+  if (m.attempts && m.attempts !== '1') parts.push(`x${m.attempts}`);
+  if (m.cost_usd) parts.push(`$${m.cost_usd}${m.usage_estimated ? '~' : ''}`);
+  else if (m.tokens_in || m.tokens_out) parts.push(`tok ${m.tokens_in || 0}/${m.tokens_out || 0}`);
+  if (m.lines_added || m.lines_deleted) parts.push(`+${m.lines_added || 0}/-${m.lines_deleted || 0}`);
+  if (m.duration_s) parts.push(`${m.duration_s}s`);
+  return parts.join(' ');
+};
+
+// Manifest tags (read-only): the routing inputs an operator cares about when
+// scanning the board — priority, profile, capabilities, and a tracker-item
+// reference. Rendered from a job's meta (launched jobs) or, for never-launched
+// inbox jobs, parsed from the .md frontmatter (see readManifest). The
+// tracker-item becomes a real terminal hyperlink when AQ_TRACKER_WEB is set.
+const TRACKER_WEB = (process.env.AQ_TRACKER_WEB || '').replace(/\/+$/, '');
+const osc8 = (url, label) => `\x1b]8;;${url}\x07${label}\x1b]8;;\x07`;
+const trackerTag = (id) => {
+  if (!id) return '';
+  const label = `⎘ ${id}`;
+  return TRACKER_WEB ? osc8(`${TRACKER_WEB}/${encodeURIComponent(id)}`, label) : label;
+};
+const PRIORITY_COLOR = { critical: 'red', high: 'yellow', medium: 'gray', low: 'gray' };
+const manifestTags = (m) => {
+  if (!m) return '';
+  const parts = [];
+  if (m.priority && m.priority !== 'medium') {
+    parts.push(c(PRIORITY_COLOR[m.priority] || 'gray', `⚑${m.priority}`));
+  }
+  if (m.profile) parts.push(c('blue', `◆${m.profile}`));
+  if (m.capabilities) {
+    const caps = String(m.capabilities).replace(/^\[|\]$/g, '').trim();
+    if (caps) parts.push(c('gray', `caps ${trunc(caps, 36)}`));
+  }
+  if (m.tracker_item) parts.push(c('cyan', trackerTag(m.tracker_item)));
+  return parts.join('  ');
+};
+
+const pidAlive = (pid) => {
+  if (!pid) return false;
+  try { process.kill(Number(pid), 0); return true; } catch { return false; }
+};
+
+const lastLogLine = (job) => {
+  try {
+    const txt = fs.readFileSync(path.join(DIRS.logs, `${job}.log`), 'utf8');
+    const lines = txt.split('\n').map((l) => l.trim()).filter(Boolean);
+    return lines.length ? lines[lines.length - 1] : '';
+  } catch { return ''; }
+};
+
+// seconds since a job's log was last modified (no new agent output); null if no log
+const logAgeSec = (job) => {
+  try {
+    const mt = fs.statSync(path.join(DIRS.logs, `${job}.log`)).mtimeMs;
+    return Math.max(0, Math.floor((Date.now() - mt) / 1000));
+  } catch { return null; }
+};
+
+const fmtElapsed = (startSec) => {
+  if (!startSec) return '  --  ';
+  const s = Math.max(0, Math.floor(Date.now() / 1000) - Number(startSec));
+  const m = Math.floor(s / 60);
+  const h = Math.floor(m / 60);
+  if (h > 0) return `${h}h${String(m % 60).padStart(2, '0')}m`;
+  return `${m}m${String(s % 60).padStart(2, '0')}s`;
+};
+
+const trunc = (s, n) => (s.length > n ? s.slice(0, n - 1) + '…' : s);
+const shortPath = (p) => (p || '').replace(process.env.HOME || '~', '~');
+
+const readMetas = () => {
+  let files = [];
+  try { files = fs.readdirSync(DIRS.state).filter((f) => f.endsWith('.meta')); }
+  catch { /* ignore */ }
+  return files.map((f) => parseMeta(path.join(DIRS.state, f)));
+};
+
+// readManifest(stage, job) — manifest tags for a job that has no launched meta
+// yet (e.g. queued in inbox/). Parses the leading --- frontmatter block of the
+// job's .md and maps the few fields manifestTags renders. Never throws.
+const FM_TAG_KEYS = {
+  priority: 'priority', profile: 'profile',
+  capabilities: 'capabilities', 'tracker-item': 'tracker_item',
+};
+const readManifest = (stage, job) => {
+  const out = {};
+  try {
+    const lines = fs.readFileSync(path.join(DIRS[stage], `${job}.md`), 'utf8').split('\n');
+    if ((lines[0] || '').trim() !== '---') return out;
+    for (let i = 1; i < lines.length; i++) {
+      if (lines[i].trim() === '---') break;
+      const line = lines[i].replace(/^\s+/, '');
+      const ci = line.indexOf(':');
+      if (ci <= 0) continue;
+      const key = line.slice(0, ci).trim();
+      if (!FM_TAG_KEYS[key]) continue;
+      out[FM_TAG_KEYS[key]] = line.slice(ci + 1).trim().replace(/^["']|["']$/g, '');
+    }
+  } catch { /* ignore */ }
+  return out;
+};
+
+// ── agent-queue.sh control (single source of truth) ─────────────────
+const AQ = path.join(__dirname, 'agent-queue.sh');
+const stripAnsi = (s) => (s || '').replace(/\x1b\[[0-9;]*m/g, '');
+const lastLine = (s) => {
+  const lines = stripAnsi(s).split('\n').map((l) => l.trim()).filter(Boolean);
+  return lines.length ? lines[lines.length - 1] : '';
+};
+
+// aq(args) — run an agent-queue.sh subcommand, capturing output (never throws).
+const aq = (args) => {
+  try {
+    const out = execFileSync('bash', [AQ, ...args], {
+      encoding: 'utf8', stdio: ['ignore', 'pipe', 'pipe'],
+      env: { ...process.env, AGENT_QUEUE_ROOT: ROOT },
+    });
+    return { ok: true, out };
+  } catch (e) {
+    return { ok: false, out: ((e.stdout || '') + (e.stderr || '') || e.message || '').toString() };
+  }
+};
+
+// ── fleet mode (Phase 3: TUI re-pointed at /fleet) ──────────────────
+// Opt-in via AQ_FLEET_DASH=1. When ON, the board is sourced from the
+// platform-service /fleet REST API instead of the local queue; when OFF, every
+// fleet code path below is skipped and the dashboard is byte-for-byte the local
+// tool. All fleet I/O lives in lib/fleet-dash.mjs (injectable + unit-tested).
+// Fleet configuration from lib/fleet-dash.mjs. This file reads its `enabled`,
+// `ok`, `missing`, `api` and `productId` fields and passes the whole object to
+// the fetch/action helpers.
+const FLEET = fleetConfig();
+// Latest fleet board snapshot. On a refresh failure we KEEP the last good board
+// (no destructive flicker) and surface a staleness banner instead.
+// `lastOk` is the Date.now() of the last successful refresh (0 = never).
+let fleetState = { board: null, error: null, loading: FLEET.enabled, lastOk: 0 };
+let fleetRefreshing = false;       // single-flight guard (no overlapping fetches)
+// Event stream backing the fleet-mode log view: which job it belongs to, its
+// rendered lines, and the load status shown by drawLog.
+let fleetEvents = { jobId: null, lines: [], error: null, loading: false };
+
+// refreshFleet() — single-flight board refresh. Applies a successful board;
+// on failure preserves the previous board and records the error.
+const refreshFleet = async () => {
+  if (fleetRefreshing) return;
+  fleetRefreshing = true;
+  try {
+    const r = await fetchBoard(FLEET);
+    if (r.ok) fleetState = { board: r.board, error: null, loading: false, lastOk: Date.now() };
+    else fleetState = { ...fleetState, error: r.error, loading: false };
+  } catch (e) {
+    fleetState = { ...fleetState, error: (e && e.message) || 'refresh failed', loading: false };
+  } finally {
+    fleetRefreshing = false;
+  }
+};
+
+// refreshFleetEvents(job) — load a job's event stream into the log view state.
+const refreshFleetEvents = (job) => {
+  fleetEvents = { jobId: job, lines: [], error: null, loading: true };
+  fetchEvents(FLEET, job).then((r) => {
+    fleetEvents = { jobId: job, lines: r.lines || [], error: r.ok ? null : r.error, loading: false };
+    if (mode === 'log' && logJob === job) draw();
+  });
+};
+
+
+// daemonPid() — pid of a live `run` loop, or null.
+const daemonPid = () => {
+  try {
+    const pid = fs.readFileSync(path.join(DIRS.state, 'daemon.pid'), 'utf8').trim();
+    return pid && pidAlive(pid) ? pid : null;
+  } catch { return null; }
+};
+
+// startRun() — spawn a detached `run` loop writing to logs/run-loop.log.
+const startRun = () => {
+  if (daemonPid()) { setFlash(c('yellow', 'run loop already active')); return; }
+  try {
+    const fd = fs.openSync(path.join(DIRS.logs, 'run-loop.log'), 'a');
+    const child = spawn('bash', [AQ, 'run'], {
+      detached: true, stdio: ['ignore', fd, fd],
+      env: { ...process.env, AGENT_QUEUE_ROOT: ROOT },
+    });
+    child.unref();
+    setFlash(c('green', `▶ run loop started (max ${process.env.AGENT_QUEUE_MAX || 3})`));
+  } catch (e) { setFlash(c('red', `run failed: ${e.message}`)); }
+};
+
+// ── interactive state ───────────────────────────────────────────────
+// True when stdin is a TTY; key handling is only wired up in that case.
+const INTERACTIVE = !!process.stdin.isTTY;
+// Local stage folders whose jobs appear in the selectable JOBS list, in display order.
+const ACTION_STAGES = ['review', 'testing', 'failed', 'inbox'];
+let mode = 'board';        // 'board' | 'log' | 'help' | 'confirm'
+let items = [];            // actionable jobs, rebuilt each draw
+let selIdx = 0;            // selected index into items
+let selJob = null;         // selected job name (stable across refreshes)
+let flash = '';            // transient status message
+let flashUntil = 0;
+let logJob = null;         // job whose log is being viewed
+let confirmAction = null;  // { verb, job, run }
+
+// setFlash(msg, ms) — show `msg` as the status line for the next `ms` milliseconds.
+const setFlash = (msg, ms = 4000) => { flash = msg; flashUntil = Date.now() + ms; };
+// flashLine() — the current flash message, or '' once it has expired.
+const flashLine = () => (flash && Date.now() < flashUntil ? flash : '');
+
+const buildItems = () => {
+  if (FLEET.enabled) {
+    const b = fleetState.board;
+    if (!b) return [];
+    return b.items.map((it) => ({ stage: it.stage, job: it.id, fleet: it }));
+  }
+  const list = [];
+  for (const st of ACTION_STAGES) {
+    for (const f of listMd(DIRS[st]).sort()) list.push({ stage: st, job: f.replace(/\.md$/, '') });
+  }
+  return list;
+};
+
+const syncSelection = () => {
+  if (selJob) {
+    const i = items.findIndex((it) => it.job === selJob);
+    if (i >= 0) { selIdx = i; return; }
+  }
+  selIdx = Math.max(0, Math.min(selIdx, items.length - 1));
+  selJob = items[selIdx]?.job ?? null;
+};
+
+// Fixed-width, coloured stage badge per actionable stage. Each entry is a
+// function returning the badge; callers fall back to a plain `[stage]` label
+// for stages not listed here.
+const STAGE_TAG = {
+  review:  () => c('cyan',  '[review ]'),
+  testing: () => c('cyan',  '[testing]'),
+  failed:  () => c('red',   '[failed ]'),
+  inbox:   () => c('blue',  '[inbox  ]'),
+};
+
+// gate(verb, stage) — is this action valid for a job in this stage?
+const gate = (verb, stage) => ({
+  promote: stage === 'review' || stage === 'testing',
+  ship:    stage === 'testing',
+  reject:  stage === 'review' || stage === 'testing',
+  requeue: stage === 'failed' || stage === 'review' || stage === 'testing',
+  logs:    true,
+}[verb]);
+
+// doAction(verb) — run the gated action on the selected job. In fleet mode it
+// calls the /fleet API (lib/fleet-dash.mjs); otherwise it shells out to
+// agent-queue.sh. promote is unavailable in fleet mode (no safe server contract).
+const doAction = (verb) => {
+  const it = items[selIdx];
+  if (!it) { setFlash(c('gray', 'no job selected')); return; }
+  if (FLEET.enabled && verb === 'promote') {
+    setFlash(c('gray', 'promote n/a in fleet mode (use ship/requeue/reject)'));
+    return;
+  }
+  if (!gate(verb, it.stage)) { setFlash(c('gray', `${verb} not valid for a ${it.stage} job`)); return; }
+  if ((verb === 'reject' || verb === 'requeue') && mode !== 'confirm') {
+    confirmAction = { verb, job: it.job, run: () => doAction(verb) };
+    mode = 'confirm';
+    return;
+  }
+  if (FLEET.enabled) {
+    setFlash(c('gray', `${verb}…`));
+    mode = 'board'; confirmAction = null;
+    jobAction(FLEET, it.fleet, verb).then((r) => {
+      setFlash((r.ok ? c('green', '✓ ') : c('red', '✗ ')) + (r.message || `${verb} ${it.job}`));
+      refreshFleet().then(draw);
+    });
+    return;
+  }
+  const r = aq([verb, it.job]);
+  setFlash((r.ok ? c('green', '✓ ') : c('red', '✗ ')) + (lastLine(r.out) || `${verb} ${it.job}`));
+  mode = 'board'; confirmAction = null;
+};
+
+// ── render ──────────────────────────────────────────────────────────
+// Colour per agent engine in the local RUNNING table (unknown engines render gray).
+const ENGINE_COLOR = { devin: 'cyan', claude: 'yellow', codex: 'green' };
+
+// Colour per fleet-side stage name in the fleet RUNNING table (unknown stages
+// render gray).
+const FLEET_STAGE_COLOR = {
+  queued: 'blue', assigned: 'yellow', building: 'yellow',
+  review: 'cyan', testing: 'cyan', shipped: 'green',
+  failed: 'red', dead_letter: 'red',
+};
+
+// drawFleetBoard() — the board sourced from the /fleet API (AQ_FLEET_DASH=1).
+// Mirrors the local board layout; running rows reflect lease/factory status
+// (there is no local PID/liveness in fleet mode).
+// Renders one full frame from `fleetState.board` (or placeholders while no
+// board has been fetched yet): header + status, stage counts, FACTORIES,
+// RUNNING, the selectable JOBS list, RECENT, then flash/footer. Rebuilds
+// `items` and re-syncs the selection as a side effect, and writes the frame to
+// stdout after clearing the screen.
+function drawFleetBoard() {
+  items = buildItems();
+  syncSelection();
+  const board = fleetState.board;
+  const counts = board ? board.counts : { inbox: 0, building: 0, review: 0, testing: 0, shipped: 0, failed: 0 };
+  const running = board ? board.running : [];
+  const recent = board ? board.recent : [];
+
+  const out = [];
+  out.push('');
+  out.push(`  ${C.bold}AGENT QUEUE${C.reset}  ${c('cyan', 'fleet')} ${c('gray', FLEET.api)}`);
+  // Seconds since the last successful refresh; null until one has succeeded.
+  const staleSec = fleetState.lastOk ? Math.floor((Date.now() - fleetState.lastOk) / 1000) : null;
+  // Status precedence: first load → error (with staleness if a board is kept) → live.
+  let statusBit;
+  if (fleetState.loading && !board) statusBit = c('gray', '◌ loading…');
+  else if (fleetState.error) statusBit = c('red', `⚠ ${trunc(fleetState.error, 40)}${board ? ` (stale ${staleSec}s)` : ''}`);
+  else statusBit = c('green', `● live${staleSec !== null ? ` (${staleSec}s ago)` : ''}`);
+  out.push(
+    `  ${c('gray', new Date().toLocaleTimeString())}   refresh ${INTERVAL / 1000}s   ${statusBit}` +
+    `   ${c('gray', `product ${FLEET.productId}`)}   ${c('gray', INTERACTIVE ? 'press ? for help' : 'read-only')}`
+  );
+  out.push('');
+  out.push(
+    `  ${c('blue', '▢ inbox')} ${String(counts.inbox).padEnd(3)}` +
+    `  ${c('yellow', '◧ building')} ${String(counts.building).padEnd(3)}` +
+    `  ${c('cyan', '◔ review')} ${String(counts.review).padEnd(3)}` +
+    `  ${c('cyan', '◕ testing')} ${String(counts.testing).padEnd(3)}` +
+    `  ${c('green', '▣ shipped')} ${String(counts.shipped).padEnd(3)}` +
+    `  ${c('red', '✕ failed')} ${String(counts.failed).padEnd(3)}` +
+    `  ${C.bold}running ${running.length}${C.reset}`
+  );
+  out.push('');
+
+  // factories (per-factory rows when /fleet/factories exists; else metrics aggregate)
+  out.push(`  ${C.bold}FACTORIES${C.reset}`);
+  // NOTE(review): assumes a fetched board always carries a `factories` array — confirm in lib/fleet-dash.mjs.
+  const factories = board ? board.factories : [];
+  const metrics = board ? board.metrics : null;
+  if (factories.length > 0) {
+    for (const f of factories) {
+      const health = String(f.health || 'ok');
+      // ok → green, degraded → yellow, anything else → red
+      const hc = health === 'ok' ? 'green' : health === 'degraded' ? 'yellow' : 'red';
+      out.push(
+        `    ${c('bold', trunc(f.factoryId || f.id || '?', 24).padEnd(24))} ` +
+        `${c(hc, health.padEnd(9))} ` +
+        `${c('gray', `load ${f.load ?? '?'}/${f.seatLimit ?? '?'}`)}  ` +
+        `${c('gray', trunc((Array.isArray(f.capabilities) ? f.capabilities.join(', ') : f.capabilities) || '', 36))}`
+      );
+    }
+  } else if (metrics && metrics.factory) {
+    const fm = metrics.factory;
+    const bh = fm.byHealth || {};
+    out.push(
+      `    ${c('green', `ok ${bh.ok ?? 0}`)}  ${c('yellow', `degraded ${bh.degraded ?? 0}`)}  ${c('red', `down ${bh.down ?? 0}`)}` +
+      `   ${c('gray', `live ${fm.live ?? '?'} · stale ${fm.stale ?? '?'}`)}` +
+      `   ${c('gray', `seats ${fm.seatsUsed ?? '?'}/${fm.seatsTotal ?? '?'}`)}` +
+      `   ${c('gray', `util ${metrics.utilizationPct ?? '?'}%`)}`
+    );
+  } else {
+    out.push(`    ${c('dim', 'no factory data')}`);
+  }
+  // alerts (from metrics)
+  if (metrics && Array.isArray(metrics.alerts) && metrics.alerts.length > 0) {
+    for (const a of metrics.alerts) {
+      const sev = a.severity === 'critical' ? 'red' : 'yellow';
+      out.push(`    ${c(sev, '⚠ ')}${c(sev, a.kind || 'alert')}${a.message ? c('gray', ` — ${trunc(a.message, 50)}`) : ''}`);
+    }
+  }
+  out.push('');
+
+  // running (lease/factory status — no local pid)
+  out.push(`  ${C.bold}RUNNING${C.reset}`);
+  if (running.length === 0) {
+    out.push(`    ${c('dim', 'no jobs in flight')}`);
+  } else {
+    for (const r of running) {
+      const sc = FLEET_STAGE_COLOR[r.fleetStage] || 'gray';
+      out.push(
+        `    ${c('bold', trunc(r.id, 30).padEnd(30))} ` +
+        `${c(sc, String(r.fleetStage).padEnd(9))} ` +
+        `${c('gray', r.factoryId ? `@${trunc(r.factoryId, 18)}` : 'unassigned')}`
+      );
+      const mtags = manifestTags(r);
+      if (mtags) out.push(`      ${mtags}`);
+    }
+  }
+  out.push('');
+
+  // actionable jobs (numbered + selectable) — reuses STAGE_TAG buckets
+  out.push(`  ${C.bold}JOBS${C.reset}  ${c('gray', '(review · testing · failed · inbox)')}`);
+  if (items.length === 0) {
+    out.push(`    ${c('dim', board ? 'no actionable jobs' : 'waiting for fleet…')}`);
+  } else {
+    items.forEach((it, i) => {
+      const sel = i === selIdx;
+      const ptr = sel ? c('cyan', '▶') : ' ';
+      const num = c('gray', String(i + 1).padStart(2) + '.');
+      // stages without a STAGE_TAG entry fall back to a plain [stage] label
+      const tag = (STAGE_TAG[it.stage] || (() => `[${it.stage}]`))();
+      const name = sel ? `${C.bold}${trunc(it.job, 46)}${C.reset}` : trunc(it.job, 46);
+      out.push(`  ${ptr} ${num} ${tag} ${name}`);
+      const jtags = manifestTags(it.fleet);
+      if (jtags) out.push(`        ${jtags}`);
+    });
+  }
+  out.push('');
+
+  // recent (shipped + failed)
+  out.push(`  ${C.bold}RECENT${C.reset}`);
+  if (recent.length === 0) {
+    out.push(`    ${c('dim', 'nothing finished yet')}`);
+  } else {
+    for (const r of recent) {
+      // colour is chosen from the board stage; the label text is the fleet-side stage
+      const failedRes = r.stage === 'failed';
+      const mark = failedRes ? c('red', '✕') : c('green', '▣');
+      const label = failedRes ? c('red', r.fleetStage) : c('green', r.fleetStage);
+      const when = r.updatedAt ? new Date(r.updatedAt).toLocaleTimeString() : '';
+      out.push(`    ${mark} ${trunc(r.id, 34).padEnd(34)} ${label}  ${c('gray', when)}`);
+    }
+  }
+  out.push('');
+
+  // flash + footer
+  const fl = flashLine();
+  if (fl) out.push(`  ${fl}`);
+  if (mode === 'confirm' && confirmAction) {
+    out.push(`  ${c('yellow', `${confirmAction.verb} "${confirmAction.job}" ? `)}${C.bold}y${C.reset}${c('gray', '/')}${C.bold}n${C.reset}`);
+  } else if (INTERACTIVE) {
+    out.push(c('gray', '  ↑/↓ select · enter events · s ship · x reject · u requeue'));
+    out.push(c('gray', '  g refresh · ? help · q quit'));
+  }
+  // clear screen + home cursor, then emit the frame in a single write
+  process.stdout.write('\x1b[2J\x1b[H' + out.join('\n') + '\n');
+}
+
+// drawBoard() — render one full board frame. Delegates to drawFleetBoard() in
+// fleet mode; otherwise builds the frame from the local queue: header with
+// run-loop status, stage counts, RUNNING workers, the selectable JOBS list,
+// the five most recently finished jobs, then flash/footer. Rebuilds `items`
+// and re-syncs the selection as a side effect.
+function drawBoard() {
+  if (FLEET.enabled) return drawFleetBoard();
+  const metas = readMetas();
+  const metaByJob = Object.fromEntries(metas.filter((m) => m.job).map((m) => [m.job, m]));
+  // running = not yet ended AND its pid is still alive
+  const running = metas.filter((m) => !m.ended && pidAlive(m.pid));
+  // finished, newest first
+  const finished = metas
+    .filter((m) => m.ended)
+    .sort((a, b) => Number(b.ended) - Number(a.ended));
+
+  const counts = {
+    inbox: count(DIRS.inbox), building: count(DIRS.building),
+    review: count(DIRS.review), testing: count(DIRS.testing),
+    shipped: count(DIRS.shipped), failed: count(DIRS.failed),
+  };
+
+  // rebuild actionable list + keep selection stable
+  items = buildItems();
+  syncSelection();
+
+  const loop = daemonPid();
+  const out = [];
+  out.push('');
+  out.push(`  ${C.bold}AGENT QUEUE${C.reset}  ${c('gray', ROOT)}`);
+  out.push(
+    `  ${c('gray', new Date().toLocaleTimeString())}   refresh ${INTERVAL / 1000}s   ` +
+    (loop ? c('green', `● run loop pid ${loop}`) : c('gray', '○ run loop stopped')) +
+    `   ${c('gray', INTERACTIVE ? 'press ? for help' : 'read-only')}`
+  );
+  out.push('');
+  out.push(
+    `  ${c('blue', '▢ inbox')} ${String(counts.inbox).padEnd(3)}` +
+    `  ${c('yellow', '◧ building')} ${String(counts.building).padEnd(3)}` +
+    `  ${c('cyan', '◔ review')} ${String(counts.review).padEnd(3)}` +
+    `  ${c('cyan', '◕ testing')} ${String(counts.testing).padEnd(3)}` +
+    `  ${c('green', '▣ shipped')} ${String(counts.shipped).padEnd(3)}` +
+    `  ${c('red', '✕ failed')} ${String(counts.failed).padEnd(3)}` +
+    `  ${C.bold}running ${running.length}${C.reset}`
+  );
+  out.push('');
+
+  // running table
+  out.push(`  ${C.bold}RUNNING${C.reset}`);
+  if (running.length === 0) {
+    out.push(`    ${c('dim', 'no workers running')}`);
+  } else {
+    for (const m of running) {
+      const eng = m.engine || '?';
+      const engC = ENGINE_COLOR[eng] || 'gray';
+      // stalled = a log exists and has been untouched for more than STALL_MIN minutes
+      const age = logAgeSec(m.job);
+      const stalled = age !== null && age > STALL_MIN * 60;
+      out.push(
+        `    ${c('bold', trunc(m.job || '?', 30).padEnd(30))} ` +
+        `${c(engC, eng.padEnd(7))} ` +
+        `${fmtElapsed(m.started).padStart(7)}  ` +
+        `${c('gray', 'pid ' + (m.pid || '?'))}` +
+        `${stalled ? '  ' + c('red', '⚠ stalled') : ''}`
+      );
+      out.push(`      ${c('dim', trunc(shortPath(m.cwd || ''), 70))}`);
+      const mtags = manifestTags(m);
+      if (mtags) out.push(`      ${mtags}`);
+      const last = lastLogLine(m.job);
+      if (last) out.push(`      ${c('cyan', '› ')}${c('dim', trunc(last, 70))}`);
+    }
+  }
+  out.push('');
+
+  // actionable job list (numbered + selectable)
+  out.push(`  ${C.bold}JOBS${C.reset}  ${c('gray', '(review · testing · failed · inbox)')}`);
+  if (items.length === 0) {
+    out.push(`    ${c('dim', 'no actionable jobs')}`);
+  } else {
+    items.forEach((it, i) => {
+      const sel = i === selIdx;
+      const ptr = sel ? c('cyan', '▶') : ' ';
+      const num = c('gray', String(i + 1).padStart(2) + '.');
+      const tag = (STAGE_TAG[it.stage] || (() => `[${it.stage}]`))();
+      const name = sel ? `${C.bold}${trunc(it.job, 46)}${C.reset}` : trunc(it.job, 46);
+      out.push(`  ${ptr} ${num} ${tag} ${name}`);
+      // prefer the launched job's meta; never-launched jobs fall back to .md frontmatter
+      const jtags = manifestTags(metaByJob[it.job] || readManifest(it.stage, it.job));
+      if (jtags) out.push(`        ${jtags}`);
+    });
+  }
+  out.push('');
+
+  // recent finished
+  out.push(`  ${C.bold}RECENT${C.reset}`);
+  const recent = finished.slice(0, 5);
+  if (recent.length === 0) {
+    out.push(`    ${c('dim', 'nothing finished yet')}`);
+  } else {
+    for (const m of recent) {
+      const res = m.result || '';
+      // results that get the red ✕ mark; everything else gets the green ▣
+      const failedRes = res === 'failed' || res === 'timeout' || res === 'verify_failed' ||
+        res === 'rejected' || res === 'retries_exhausted' || res === 'capability_mismatch' ||
+        res === 'budget_exceeded' || res === 'no_engine';
+      const mark = failedRes ? c('red', '✕') : c('green', '▣');
+      // `ended` is epoch seconds; Date wants milliseconds
+      const when = m.ended ? new Date(Number(m.ended) * 1000).toLocaleTimeString() : '';
+      // NOTE(review): capability_mismatch and no_engine have no dedicated label,
+      // so they render as the raw result string in gray next to a red mark.
+      let label;
+      if (res === 'shipped') label = c('green', 'shipped');
+      else if (res === 'testing') label = c('cyan', 'testing (QA)');
+      else if (res === 'review') label = c('cyan', 'review');
+      else if (res === 'verify_failed') label = c('red', 'verify failed');
+      else if (res === 'timeout') label = c('red', 'timeout');
+      else if (res === 'budget_exceeded') label = c('red', 'budget exceeded');
+      else if (res === 'rejected') label = c('red', 'rejected');
+      else if (res === 'retries_exhausted') label = c('red', 'retries exhausted');
+      else if (res === 'failed') label = c('red', 'failed rc=' + (m.exit || '?'));
+      else label = c('gray', res || '?');
+      out.push(
+        `    ${mark} ${trunc(m.job || '?', 34).padEnd(34)} ` +
+        `${c('gray', (m.engine || '').padEnd(7))} ` +
+        `${label}  ${c('gray', when)}  ${c('cyan', insightsTag(m))}`
+      );
+    }
+  }
+  out.push('');
+
+  // flash + footer
+  const fl = flashLine();
+  if (fl) out.push(`  ${fl}`);
+  if (mode === 'confirm' && confirmAction) {
+    out.push(`  ${c('yellow', `${confirmAction.verb} "${confirmAction.job}" ? `)}${C.bold}y${C.reset}${c('gray', '/')}${C.bold}n${C.reset}`);
+  } else if (INTERACTIVE) {
+    out.push(c('gray', '  ↑/↓ select · enter logs · p promote · s ship · x reject · u requeue'));
+    out.push(c('gray', '  r run · S stop · g refresh · ? help · q quit'));
+  }
+
+  // clear screen + home cursor, then emit the frame in a single write
+  process.stdout.write('\x1b[2J\x1b[H' + out.join('\n') + '\n');
+}
+
+function drawLog() {
+  const rows = (process.stdout.rows || 30) - 6;
+  if (FLEET.enabled) {
+    let body;
+    if (fleetEvents.loading) body = c('gray', '  loading events…');
+    else if (fleetEvents.error) body = c('red', `  ${fleetEvents.error}`);
+    else if (!fleetEvents.lines.length) body = c('gray', '  no events for this job');
+    else body = fleetEvents.lines.slice(-rows).join('\n');
+    const head = `  ${C.bold}EVENTS${C.reset} ${c('cyan', logJob)}   ${c('gray', 'q/esc back · g refresh')}`;
+    process.stdout.write('\x1b[2J\x1b[H' + head + '\n' + c('gray', '  ' + '─'.repeat(60)) + '\n' + body + '\n');
+    return;
+  }
+  let body = `no log for ${logJob}`;
+  try {
+    const txt = fs.readFileSync(path.join(DIRS.logs, `${logJob}.log`), 'utf8');
+    body = txt.split('\n').slice(-rows).join('\n');
+  } catch { /* keep default */ }
+  const head = `  ${C.bold}LOG${C.reset} ${c('cyan', logJob)}   ${c('gray', 'q/esc back · g refresh')}`;
+  process.stdout.write('\x1b[2J\x1b[H' + head + '\n' + c('gray', '  ' + '─'.repeat(60)) + '\n' + body + '\n');
+}
+
+// drawHelp() — clear the screen and print the static key reference. In fleet
+// mode an extra banner line is shown and the enter/l description changes to
+// "events"; the remaining lines are the same in both modes.
+function drawHelp() {
+  const L = [
+    '', `  ${C.bold}AGENT QUEUE — keys${C.reset}`,
+    FLEET.enabled ? `  ${c('cyan', 'fleet mode')} ${c('gray', '— board sourced from /fleet API; run/stop/promote disabled')}` : '', '',
+    `  ${c('cyan', '↑/↓, j/k, 1-9')}   select a job in the JOBS list`,
+    `  ${c('cyan', 'enter / l')}       ${FLEET.enabled ? "view the selected job's events" : "view the selected job's log (live)"}`,
+    `  ${c('cyan', 'p')}               promote (review → testing → shipped)`,
+    `  ${c('cyan', 's')}               ship  (testing/QA → shipped, the manual gate)`,
+    `  ${c('cyan', 'x')}               reject (review/testing → failed)   ${c('gray', '[confirm]')}`,
+    `  ${c('cyan', 'u')}               requeue (failed/review/testing → inbox)   ${c('gray', '[confirm]')}`,
+    '',
+    `  ${c('cyan', 'r')}               start the run loop (detached, max ${process.env.AGENT_QUEUE_MAX || 3})`,
+    `  ${c('cyan', 'S')}               stop the run loop + running workers`,
+    `  ${c('cyan', 'g')}               refresh now`,
+    `  ${c('cyan', '? / h')}           toggle this help`,
+    `  ${c('cyan', 'q / Ctrl-C')}      quit`,
+    '',
+    `  ${c('gray', 'Lifecycle: inbox → building → review → testing → shipped  (+ failed)')}`,
+    `  ${c('gray', 'auto: rc=0 → review; verify pass → testing; verify fail → failed')}`,
+    `  ${c('gray', 'manual: ship (testing → shipped)')}`,
+    '', `  ${c('gray', 'press any key to return')}`, '',
+  ];
+  process.stdout.write('\x1b[2J\x1b[H' + L.join('\n') + '\n');
+}
+
+const draw = () => {
+  if (mode === 'log') drawLog();
+  else if (mode === 'help') drawHelp();
+  else drawBoard();
+};
+
+// ── main loop + key handling ────────────────────────────────────────
+// Fleet mode (AQ_FLEET_DASH=1) sources the board from the /fleet API on an async,
+// single-flight tick loop. Local mode keeps the original synchronous setInterval
+// path byte-for-byte. `timer` holds whichever timer is live so quit() can clear it.
+// Handle of the live refresh timer: a setTimeout id in fleet mode, a
+// setInterval id in local mode.
+let timer = null;
+
+if (FLEET.enabled) {
+  // Fleet mode cannot run without its connection settings — report and exit.
+  if (!FLEET.ok) {
+    process.stdout.write(
+      `agent-queue: fleet dashboard enabled (AQ_FLEET_DASH=1) but missing config:\n` +
+      `  ${FLEET.missing.join(', ')}\n` +
+      `Set AQ_FLEET_API, AQ_FLEET_TOKEN and AQ_PRODUCT_ID, or unset AQ_FLEET_DASH.\n`
+    );
+    process.exit(1);
+  }
+  draw(); // initial frame (loading…)
+  // tick — refresh the board, redraw unless the log/help view is open, then
+  // re-arm itself.
+  const tick = async () => {
+    await refreshFleet();
+    if (mode !== 'log' && mode !== 'help') draw();
+    timer = setTimeout(tick, INTERVAL); // single-flight: schedule only after the await
+  };
+  tick();
+} else {
+  // Local mode needs an initialised queue root on disk.
+  if (!fs.existsSync(ROOT)) {
+    process.stdout.write(`agent-queue: queue root not found: ${ROOT}\nRun \`agent-queue.sh init\` first.\n`);
+    process.exit(1);
+  }
+  draw();
+  // Redraws whatever view is current (board, log or help) every INTERVAL ms.
+  timer = setInterval(draw, INTERVAL);
+}
+
+const quit = () => {
+  if (timer) { clearTimeout(timer); clearInterval(timer); }
+  try { if (process.stdin.isTTY) process.stdin.setRawMode(false); } catch { /* noop */ }
+  process.stdout.write(C.reset + '\n');
+  process.exit(0);
+};
+
+const moveSel = (delta) => {
+  if (items.length === 0) return;
+  selIdx = (selIdx + delta + items.length) % items.length;
+  selJob = items[selIdx]?.job ?? null;
+};
+
+// onKey(key) — handle one chunk of raw stdin input. Ctrl-C always quits; after
+// that the key is interpreted by the current mode (help → log → confirm →
+// board), and the screen is redrawn unless the key was unrecognised in board
+// mode. `key` is compared as a whole string, so a chunk holding several
+// characters only matches the multi-character arrow sequences.
+function onKey(key) {
+  // global quit
+  if (key === '\u0003') return quit(); // Ctrl-C always quits
+
+  // help: any key returns to the board
+  if (mode === 'help') { mode = 'board'; return draw(); }
+  if (mode === 'log') {
+    // q / esc / enter close the log; g reloads events (fleet mode only)
+    if (key === 'q' || key === '\u001b' || key === '\r' || key === '\n') { mode = 'board'; logJob = null; }
+    else if (key === 'g') { if (FLEET.enabled && logJob) refreshFleetEvents(logJob); }
+    return draw();
+  }
+  if (mode === 'confirm') {
+    // y runs the pending action; every other key cancels it
+    if (key === 'y' || key === 'Y') confirmAction?.run();
+    else { mode = 'board'; confirmAction = null; setFlash(c('gray', 'cancelled')); }
+    return draw();
+  }
+
+  // board mode
+  switch (key) {
+    case 'q': return quit();
+    case '?': case 'h': mode = 'help'; break;
+    case 'g': break; // just redraw
+    case 'j': case '\u001b[B': moveSel(1); break;   // down arrow
+    case 'k': case '\u001b[A': moveSel(-1); break;  // up arrow
+    case '\r': case '\n': case 'l':
+      if (items[selIdx]) {
+        logJob = items[selIdx].job; mode = 'log';
+        if (FLEET.enabled) refreshFleetEvents(logJob);
+      }
+      break;
+    case 'p': doAction('promote'); break;
+    case 's': doAction('ship'); break;
+    case 'x': doAction('reject'); break;
+    case 'u': doAction('requeue'); break;
+    case 'r': if (FLEET.enabled) setFlash(c('gray', 'run loop n/a in fleet mode')); else startRun(); break;
+    case 'S':
+      if (FLEET.enabled) { setFlash(c('gray', 'stop n/a in fleet mode')); break; }
+      { const res = aq(['stop']); setFlash(c('red', '■ ') + (lastLine(res.out) || 'stopped')); break; }
+    default:
+      // digits 1-9 jump straight to that row of the JOBS list
+      if (/^[1-9]$/.test(key)) {
+        const i = parseInt(key, 10) - 1;
+        if (i < items.length) { selIdx = i; selJob = items[i].job; }
+      } else { return; } // ignore unknown keys (no redraw)
+  }
+  draw();
+}
+
+// With a TTY on stdin, switch it to raw mode so each keypress is delivered
+// immediately (as a UTF-8 string) to onKey. Without a TTY the dashboard only
+// redraws on its timer.
+if (INTERACTIVE) {
+  process.stdin.setRawMode(true);
+  process.stdin.resume();
+  process.stdin.setEncoding('utf8');
+  process.stdin.on('data', onKey);
+}
+// Route termination signals through quit() so raw mode and colours are reset.
+process.on('SIGINT', quit);
+process.on('SIGTERM', quit);
--- a/agent-queue/demo/README.md
+++ b/agent-queue/demo/README.md
@ -0,0 +1,83 @@
+# Two-Factory Parallel Demo (Phase-2 Exit Criteria, §14)
+
+This demo closes the final Phase-2 exit-criteria box: **≥2 factories executing jobs in
+parallel through one coordinator**, proving the concurrency guarantees end-to-end. It is a
+**harness over the existing runtime** — it does *not* change `agent-queue.sh` or
+`lib/fleet-client.sh`; it starts two real `agent-queue.sh run` daemons (distinct
+factoryIds, separate queues/cwds) that compete **only** through the coordinator, then
+observes and asserts.
+
+## The three guarantees it proves
+
+| # | Guarantee | How it's shown |
+|---|-----------|----------------|
+| **(a)** | **No double-assign** | Each of the 3 jobs is claimed/executed by exactly **one** factory. The coordinator's atomic claim (lock-guarded; only a `queued` job is claimable) means two concurrent claimers never get the same job version. |
+| **(b)** | **Fencing + reclaim** | One factory is **killed mid-job**. The reaper returns its in-flight job to `queued` with a **bumped lease epoch**; the surviving factory **reclaims and completes** it. The dead worker's late/zombie report (stale epoch) is **fenced (HTTP 409)** and never ships. |
+| **(c)** | **Parallelism** | Both factories hold an active job **simultaneously** (observed in coordinator state) — work is concurrent, not serialized. |
+
+## Run it
+
+### Stub mode (default, zero dependencies, CI-safe)
+
+```bash
+bash demo/two-factory-demo.sh
+```
+
+Drives [`coordinator-stub.sh`](coordinator-stub.sh) — a stateful, lock-guarded, file-backed
+coordinator that implements the same claim / lease / fence / reaper contract as
+platform-service, via the existing `AQ_FLEET_API_CMD` test seam. No platform-service, no
+Cosmos, no network. This is exactly what `selftest.sh` runs headlessly.
+
+### Real-coordinator mode (against a live platform-service)
+
+```bash
+DEMO_MODE=real \
+  AQ_FLEET_API=http://localhost:4003/api \
+  AQ_FLEET_TOKEN=<bearer> \
+  AQ_PRODUCT_ID=<product> \
+  bash demo/two-factory-demo.sh
+```
+
+In real mode the demo submits via the platform-service fleet API and relies on the
+coordinator's **own lease reaper** to reclaim the killed factory's job (it waits
+`DEMO_REAP_WAIT` seconds; pair with a short `AQ_FLEET_LEASE_SECONDS` so the lease expires
+quickly). Submit endpoint is overridable via `DEMO_SUBMIT_PATH` (default `/fleet/jobs`).
+Real mode is observational/best-effort — the machine-checked assertions run in stub mode
+(and in `selftest.sh`).
+
+## Env knobs
+
+| Var | Default | Meaning |
+|-----|---------|---------|
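+| `DEMO_MODE` | `stub` | `stub` or `real`. Unset defaults to `stub` even when `AQ_FLEET_API`+`AQ_FLEET_TOKEN` are exported, so real mode must be requested explicitly; any value other than `stub` is normalized to `real` when both are set |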
+| `DEMO_JOB_SLEEP` | `2` | per-job engine seconds — the window during which the victim is killed mid-job |
+| `DEMO_TIMEOUT` | `60` | max seconds to wait for the survivor to drain all 3 jobs |
+| `DEMO_POLL` | `0.2` | coordinator-state poll interval |
+| `DEMO_FACTORY_1` / `DEMO_FACTORY_2` | `mac-1` / `ubuntu-1` | factory ids (F1 is the victim) |
+| `DEMO_KEEP` | `0` | `1` keeps the temp dir (queues, logs, coordinator state) for inspection |
+| `DEMO_REAP_WAIT` / `DEMO_DRAIN_WAIT` | `20` / `30` | real-mode waits for the coordinator reaper / drain |
+
+## What to watch
+
+The demo prints a step-by-step trace and a final `RESULTS` block. The key lines:
+
+- `PARALLELISM observed: mac-1 and ubuntu-1 both holding active jobs concurrently` — guarantee (c).
+- `victim=mac-1 holds job <job> (epoch N) — killing it mid-job`, `killed factory mac-1 (pid <pid>)`, then `reaper reclaimed mac-1's lease(s)` — the crash + reclaim.
+- `zombie report for <job> @epoch=N was FENCED (HTTP 409)` — guarantee (b) fencing.
+- `RESULTS` shows each job's winning factory; the reclaimed job's winner is the **survivor**.
+
+With `DEMO_KEEP=1`, inspect under the printed temp dir:
+
+- `coord/events.log` — the coordinator's audit trail: `CLAIM` / `PATCH:<stage>` / `RECLAIM` / `FENCE` events (factory + epoch on each).
+- `coord/jobs/<id>.job` — final per-job `stage` / `holder` / `epoch`.
+- `log-mac-1.txt`, `log-ubuntu-1.txt` — each factory's run-loop log (claims, the `▶ launching`, the fenced/quarantine path on the killed worker).
+
+## Files
+
+- `two-factory-demo.sh` — the orchestrator (start factories, kill/reclaim/fence, assert).
+- `coordinator-stub.sh` — the stateful coordinator stub (claim/patch/fence/renew/release/reap, mkdir-locked).
+- `start-fleet.example.sh` — reference launcher for a **real** multi-product local
+  fleet against a live platform-service (one `agent-queue.sh run` daemon per
+  product). Parameterized via env; ships the two settings you must get right —
+  `AQ_FLEET_GATE=1` (M0 RU gate) and `AQ_FLEET_LEASE_RENEW_SEC=30` (heartbeat
+  cadence < the 90s stale threshold). Copy + adjust for your sandbox.
--- a/agent-queue/demo/coordinator-stub.sh
+++ b/agent-queue/demo/coordinator-stub.sh
@ -0,0 +1,151 @@
+#!/usr/bin/env bash
+#
+# coordinator-stub.sh — a STATEFUL, concurrency-safe fleet-coordinator stub for the
+# two-factory demo + its selftest. It is the same "AQ_FLEET_API_CMD responder" pattern
+# the existing fleet selftests use (invoked as `<METHOD> <PATH> <BODY>`, prints the
+# response body then a final HTTP-code line), EXTENDED with file-backed shared state +
+# an mkdir lock so >=2 competing factory processes coordinate through ONE coordinator —
+# exactly modeling platform-service's claim / lease / fence / reaper contract
+# (../../learning_ai_common_plat/services/platform-service/src/modules/fleet/coordinator.ts).
+#
+# It is curl-free + dependency-free (bash + POSIX awk/sed/grep) so the demo runs in CI
+# with zero external services. Real-coordinator mode bypasses this entirely (the demo
+# talks to platform-service over HTTP when AQ_FLEET_API/AQ_FLEET_TOKEN are set).
+#
+# Contract implemented (paths under the caller's AQ_FLEET_API base, which includes /api):
+#   POST  /fleet/factories/heartbeat        -> {"ok":true} 200
+#   POST  /fleet/claim                      -> {"claimed":true,"job":{id,bodyMd,leaseEpoch},"lease":{leaseEpoch}} | {"claimed":false}
+#   PATCH /fleet/jobs/:id                    -> 200 | 409 (stale leaseEpoch => FENCED)
+#   POST  /fleet/jobs/:id/lease/renew        -> 200 | 409 (fenced)
+#   POST  /fleet/jobs/:id/lease/release      -> 200
+#   POST  /fleet/_reap                        -> {"reaped":N} 200  (DEMO-only admin: models the
+#                                               coordinator reaper reclaiming a dead factory's
+#                                               leases — returns its in-flight jobs to `queued`
+#                                               and BUMPS the epoch so the zombie is fenced)
+#
+# Atomicity: every state mutation runs inside an mkdir spin-lock, so under true
+# concurrency EXACTLY ONE claimer wins a given job version (no double-assign), and a
+# report carrying an epoch older than the stored epoch is rejected (409) — the same
+# guarantees the real rev/_etag compare-and-swap provides.
+#
+# State (under $COORD_STATE, set by the demo):
+#   order            submit-ordered job ids (one per line)
+#   jobs/<id>.job    key=val lines: stage, holder, epoch, body
+#   events.log       append-only audit: "<ts> <EVENT> job=<id> factory=<f> epoch=<n>"
+#   lock/            the mkdir lock dir
+#
+# Stages: queued -> assigned -> building -> review|testing -> shipped (terminal);
+#         failed/dead_letter terminal. Reclaimable (active) = assigned|building|review|testing.
+
+set -uo pipefail
+
+METHOD="${1:-}"; RPATH="${2:-}"; BODY="${3:-}"
+: "${COORD_STATE:?coordinator-stub.sh requires COORD_STATE}"
+JOBS_DIR="$COORD_STATE/jobs"
+EVENTS="$COORD_STATE/events.log"
+LOCK="$COORD_STATE/lock"
+
+# ── JSON field extraction (no jq) ───────────────────────────────────────────
+# _str_field <key>: print the string value of "<key>" found in $BODY (first output
+# line only). The value is captured up to the next '"', so values containing
+# escaped quotes are truncated. Prints nothing when the key is absent.
+_str_field() { printf '%s' "$BODY" | sed -n 's/.*"'"$1"'"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' | head -1; }
+# _num_field <key>: print the first (optionally negative) integer value of
+# "<key>" in $BODY; prints nothing when the key is absent or not an integer.
+_num_field() { printf '%s' "$BODY" | grep -oE "\"$1\"[[:space:]]*:[[:space:]]*-?[0-9]+" | grep -oE -- '-?[0-9]+$' | head -1; }
+# job id from /fleet/jobs/<id> or /fleet/jobs/<id>/lease/<op>
+# (strips the "/fleet/jobs/" prefix and any "/lease/..." suffix from $RPATH)
+_job_id_from_path() { printf '%s' "$RPATH" | sed -e 's#^/fleet/jobs/##' -e 's#/lease/.*$##'; }
+
+# ── lock (mkdir is atomic on POSIX filesystems) ─────────────────────────────
+_lock() { local n=0; until mkdir "$LOCK" 2>/dev/null; do sleep 0.02; n=$((n+1)); [ "$n" -gt 5000 ] && break; done; }
+_unlock() { rmdir "$LOCK" 2>/dev/null || true; }
+
+_jobfile() { printf '%s/%s.job\n' "$JOBS_DIR" "$1"; }
+_get() { grep -E "^$2=" "$1" 2>/dev/null | head -1 | cut -d= -f2-; }
+_set() { # <file> <key> <val> : replace or append key=val
+  local f=$1 k=$2 v=$3 tmp; tmp="$f.tmp.$$"
+  if grep -qE "^$k=" "$f" 2>/dev/null; then
+    sed "s#^$k=.*#$k=$v#" "$f" > "$tmp" && mv "$tmp" "$f"
+  else
+    printf '%s=%s\n' "$k" "$v" >> "$f"
+  fi
+}
+_event() { printf '%s %s\n' "$(date +%s)" "$*" >> "$EVENTS"; }
+_is_active() { case "$1" in assigned|building|review|testing) return 0;; *) return 1;; esac; }
+
+_emit() { printf '%s\n%s\n' "$1" "$2"; }   # <json-body> <http-code>
+
+# Route on "<METHOD> <PATH>". Every arm ends by emitting a body + HTTP code; arms
+# that touch job state take the lock first and release it before emitting.
+case "$METHOD $RPATH" in
+  # Heartbeat: stateless acknowledgement.
+  "POST /fleet/factories/heartbeat")
+    _emit '{"ok":true}' 200 ;;
+
+  # Claim: under the lock, walk the job ids in submit order and take the first one
+  # whose stage is `queued`; mark it assigned to the caller and bump its epoch.
+  "POST /fleet/claim")
+    factory=$(_str_field factoryId)
+    _lock
+    claimed_id=""
+    if [ -f "$COORD_STATE/order" ]; then
+      while IFS= read -r jid; do
+        [ -n "$jid" ] || continue
+        jf=$(_jobfile "$jid")
+        [ -f "$jf" ] || continue
+        if [ "$(_get "$jf" stage)" = "queued" ]; then claimed_id="$jid"; break; fi
+      done < "$COORD_STATE/order"
+    fi
+    if [ -n "$claimed_id" ]; then
+      jf=$(_jobfile "$claimed_id")
+      epoch=$(( $(_get "$jf" epoch) + 1 ))
+      _set "$jf" stage assigned; _set "$jf" holder "$factory"; _set "$jf" epoch "$epoch"
+      body=$(_get "$jf" body)
+      _event "CLAIM job=$claimed_id factory=$factory epoch=$epoch"
+      _unlock
+      # NOTE(review): $body is interpolated into the JSON without escaping, so a
+      # body containing '"' or '\' would produce malformed JSON.
+      _emit "{\"claimed\":true,\"job\":{\"id\":\"$claimed_id\",\"bodyMd\":\"$body\",\"leaseEpoch\":$epoch},\"lease\":{\"leaseEpoch\":$epoch}}" 200
+    else
+      _unlock
+      _emit '{"claimed":false}' 200
+    fi ;;
+
+  # Stage report: 404 for an unknown job, 409 when the reported leaseEpoch is lower
+  # than the stored epoch, otherwise apply the stage (if one was sent) and return 200.
+  # A report with no leaseEpoch is not fenced, and the holder is not compared.
+  PATCH\ /fleet/jobs/*)
+    jid=$(_job_id_from_path); stage=$(_str_field stage); rep_epoch=$(_num_field leaseEpoch)
+    jf=$(_jobfile "$jid")
+    _lock
+    if [ ! -f "$jf" ]; then _unlock; _emit '{}' 404
+    else
+      cur_epoch=$(_get "$jf" epoch)
+      if [ -n "$rep_epoch" ] && [ "$rep_epoch" -lt "$cur_epoch" ]; then
+        _event "FENCE job=$jid factory=$(_get "$jf" holder) epoch=$rep_epoch<stored=$cur_epoch"
+        _unlock; _emit '{}' 409   # stale leaseEpoch -> fenced (zombie rejected)
+      else
+        [ -n "$stage" ] && _set "$jf" stage "$stage"
+        _event "PATCH:$stage job=$jid factory=$(_get "$jf" holder) epoch=$rep_epoch"
+        _unlock; _emit '{}' 200
+      fi
+    fi ;;
+
+  # Lease renew: read-only epoch comparison; 409 when the reported epoch is stale.
+  # A missing job file leaves cur_epoch empty, which falls through to 200.
+  POST\ /fleet/jobs/*/lease/renew)
+    jid=$(_job_id_from_path); rep_epoch=$(_num_field leaseEpoch); jf=$(_jobfile "$jid")
+    _lock
+    cur_epoch=$(_get "$jf" epoch 2>/dev/null)
+    if [ -n "$rep_epoch" ] && [ -n "$cur_epoch" ] && [ "$rep_epoch" -lt "$cur_epoch" ]; then
+      _event "RENEW_FENCE job=$jid epoch=$rep_epoch<stored=$cur_epoch"; _unlock; _emit '{}' 409
+    else
+      _unlock; _emit '{}' 200
+    fi ;;
+
+  # Lease release: only records an audit event; job state is not modified and the
+  # lock is not taken.
+  POST\ /fleet/jobs/*/lease/release)
+    jid=$(_job_id_from_path); _event "RELEASE job=$jid"; _emit '{}' 200 ;;
+
+  "POST /fleet/_reap")
+    # DEMO admin: model the coordinator reaper reclaiming a dead factory's leases.
+    # Every active job held by <factoryId> goes back to `queued` with no holder.
+    factory=$(_str_field factoryId)
+    _lock
+    n=0
+    for jf in "$JOBS_DIR"/*.job; do
+      [ -f "$jf" ] || continue   # skips the literal pattern when no job files exist
+      if [ "$(_get "$jf" holder)" = "$factory" ] && _is_active "$(_get "$jf" stage)"; then
+        jid=$(basename "$jf" .job)
+        epoch=$(( $(_get "$jf" epoch) + 1 ))   # bump => the dead worker's old epoch is now stale (fenced)
+        _set "$jf" stage queued; _set "$jf" holder ""; _set "$jf" epoch "$epoch"
+        _event "RECLAIM job=$jid factory=$factory epoch=$epoch"
+        n=$((n+1))
+      fi
+    done
+    _unlock
+    _emit "{\"reaped\":$n}" 200 ;;
+
+  # Any other method/path is acknowledged with an empty object and 200.
+  *) _emit '{}' 200 ;;
+esac
--- a/agent-queue/demo/start-fleet.example.sh
+++ b/agent-queue/demo/start-fleet.example.sh
@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+#
+# start-fleet.example.sh — reference launcher for a multi-product local fleet.
+#
+# Starts one detached `agent-queue.sh run` daemon (a "factory") per product, each
+# routing work through the platform-service fleet coordinator. This is the tracked,
+# parameterized version of the operational `_start_fleet.sh` people keep in their
+# local sandbox — copy it, adjust the env vars, and run.
+#
+# Prereqs:
+#   - platform-service running on $AQ_FLEET_API (see scripts/deploy-gigafactory.sh)
+#   - a factory token in $FLEET_TOKEN_FILE (an admin/factory JWT for the fleet API)
+#   - tmux + the `longrun` helper (sourced below) for detached, logged daemons
+#
+# Env overrides (all optional):
+#   SB                 sandbox/state root (per-product queues live in $SB/q_<product>)
+#   AQ                 path to agent-queue.sh
+#   AQ_FLEET_API       coordinator base URL (default http://localhost:4003/api)
+#   FLEET_TOKEN_FILE   file holding the bearer token (default $SB/.token)
+#   PRODUCTS           space-separated product ids (default: the ecosystem set)
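+#   AGENT_QUEUE_MAX    per-factory concurrency (default 3)
+#   LONGRUN_ALIAS      path of the `longrun` helper to source (default ../../aliases/_longrun.alias, relative to this script)
+#   FLEET_PR           1 = PR mode, 0 = plain (no PR) jobs in $SB (default 1)
+#   REPO_BASE          directory holding the product repo checkouts (default: three levels above this script)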
+#
+# Docs: ../docs/GIGAFACTORY/GIGAFACTORY_SYSTEM_OVERVIEW.md (§9 API, §14 gotchas) and
+#       ../docs/GIGAFACTORY/FLEET_DISPATCH_REDESIGN.md (the M0 RU gate).
+# -u / pipefail only: -e is not set, so failures must be checked explicitly below.
+set -uo pipefail
+
+# Resolve defaults; every value can be overridden from the environment.
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SB="${SB:-$PWD/.fleet-sandbox}"
+AQ="${AQ:-$HERE/../agent-queue.sh}"
+AQ_FLEET_API="${AQ_FLEET_API:-http://localhost:4003/api}"
+FLEET_TOKEN_FILE="${FLEET_TOKEN_FILE:-$SB/.token}"
+PRODUCTS="${PRODUCTS:-lysnrai chronomind mindlyst nomgap}"
+AGENT_QUEUE_MAX="${AGENT_QUEUE_MAX:-3}"
+LONGRUN_ALIAS="${LONGRUN_ALIAS:-$HERE/../../aliases/_longrun.alias}"
+# PR mode: where the product repos are checked out, so a job's `repo` resolves to
+# a local checkout and the factory opens a PR off a git worktree. Set FLEET_PR=0
+# to run plain (no PR) jobs in $SB instead.
+FLEET_PR="${FLEET_PR:-1}"
+REPO_BASE="${REPO_BASE:-$(cd "$HERE/../../.." && pwd)}"
+
+# Preflight: fail fast with an actionable message instead of launching broken daemons.
+[ -f "$AQ" ] || { echo "agent-queue.sh not found at $AQ (set AQ=)"; exit 1; }
+[ -s "$FLEET_TOKEN_FILE" ] || { echo "fleet token not found at $FLEET_TOKEN_FILE (set FLEET_TOKEN_FILE=)"; exit 1; }
+# Without -e a failed `source` is ignored, and every `longrun` call in the launch
+# loop would then die with "command not found" — check the helper up front.
+[ -f "$LONGRUN_ALIAS" ] || { echo "longrun helper not found at $LONGRUN_ALIAS (set LONGRUN_ALIAS=)"; exit 1; }
+TOK="$(cat "$FLEET_TOKEN_FILE")"
+
+mkdir -p "$SB"
+export LONGRUN_LOG_DIR="$SB"
+# shellcheck disable=SC1090
+source "$LONGRUN_ALIAS"
+# The helper file must actually provide `longrun`.
+type longrun >/dev/null 2>&1 || { echo "$LONGRUN_ALIAS did not define 'longrun'"; exit 1; }
+
+# Launch one factory per product. $PRODUCTS is intentionally unquoted so it splits
+# on whitespace into individual product ids.
+# Each factory gets its own queue root ($SB/q_<product>) and factory id (mac-<product>);
+# the settings are handed to agent-queue.sh through `env` so they apply to that
+# daemon only.
+# NOTE(review): the bearer token is passed as an `env` argument, so it presumably
+# shows up in the process list while the command starts — confirm this is
+# acceptable for the sandbox, or whether `longrun` can inherit it from the
+# environment instead.
+for p in $PRODUCTS; do
+  ROOT="$SB/q_$p"
+  longrun "gigafactory-$p" env \
+    AGENT_QUEUE_ROOT="$ROOT" \
+    AGENT_QUEUE_ENGINE=devin \
+    AGENT_QUEUE_MAX="$AGENT_QUEUE_MAX" \
+    AQ_FLEET=1 AQ_FLEET_ROUTE=1 \
+    AQ_FLEET_API="$AQ_FLEET_API" \
+    AQ_FLEET_TOKEN="$TOK" \
+    AQ_PRODUCT_ID="$p" \
+    AQ_FACTORY_ID="mac-$p" \
+    AQ_FLEET_GATE=1 \
+    AQ_FLEET_LEASE_RENEW_SEC=30 \
+    AQ_FLEET_PR="$FLEET_PR" \
+    AQ_FLEET_REPO_BASE="$REPO_BASE" \
+    "$AQ" run
+  echo "----"
+done
+
+# Why these settings matter (GATE and LEASE_RENEW both verified on a live fleet):
+#   AQ_FLEET_GATE=1            §M0 RU gate — the run loop point-reads the cheap
+#                             per-product queue version (GET /fleet/queue-state) and
+#                             SKIPS the claim while nothing changed, slashing idle
+#                             Cosmos RU. Default OFF; safe (fails open). See
+#                             FLEET_DISPATCH_REDESIGN.md §8/§12.
+#   AQ_FLEET_LEASE_RENEW_SEC=30  heartbeat/renew cadence. MUST stay well under the
+#                             coordinator's 90s stale threshold, or a healthy
+#                             factory flaps to "stale"/"no live factory" between
+#                             beats (the 300s default caused exactly that).
+#   AQ_FLEET_PR=1 + AQ_FLEET_REPO_BASE  WITHOUT these a job's `repo` is ignored and
+#                             Devin just runs the prompt in the sandbox cwd (no PR).
+#                             With them, the factory checks out a worktree of
+#                             $REPO_BASE/<repo>, commits, pushes, and opens a PR.
+#
+# Subset restart (leave a busy factory running):
+#   PRODUCTS="lysnrai mindlyst" bash start-fleet.example.sh
+#
+# Stop a factory:  tmux kill-session -t gigafactory-<product>
+# Tail a factory:  tail -f "$SB"/longrun-gigafactory-<product>-*.log
--- a/agent-queue/demo/two-factory-demo.sh
+++ b/agent-queue/demo/two-factory-demo.sh
@ -0,0 +1,248 @@
+#!/usr/bin/env bash
+#
+# two-factory-demo.sh — Phase-2 EXIT-CRITERIA demo (§14): >=2 factories executing jobs
+# in PARALLEL through ONE coordinator, proving the Phase-2 guarantees end-to-end:
+#
+#   (a) NO DOUBLE-ASSIGN  — each job is claimed/executed by exactly ONE factory.
+#   (b) FENCING + RECLAIM — kill a factory MID-JOB; the reaper returns its job; the OTHER
+#                           factory reclaims + completes it; the dead worker's late/zombie
+#                           report is FENCED (409, never shipped).
+#   (c) PARALLELISM       — both factories make progress concurrently (not serialized).
+#
+# This is a DEMO HARNESS over the EXISTING runtime — it does NOT change agent-queue.sh or
+# lib/fleet-client.sh; it starts two real `agent-queue.sh run` daemons (distinct factoryIds,
+# separate queues/cwds) that compete ONLY through the coordinator, then observes + asserts.
+#
+# DUAL MODE:
+#   STUB  (default / CI-safe): drives demo/coordinator-stub.sh — a stateful, lock-guarded
+#         file-backed coordinator. Zero external services. Used by selftest.sh.
+#   REAL  : set AQ_FLEET_API + AQ_FLEET_TOKEN (and DEMO_MODE=real) to run against a live
+#         platform-service fleet coordinator. Submit + reaper-reclaim use its HTTP API.
+#
+# Usage:
+#   bash demo/two-factory-demo.sh                 # stub mode (default)
+#   DEMO_MODE=real AQ_FLEET_API=http://host:4003/api AQ_FLEET_TOKEN=... \
+#     AQ_PRODUCT_ID=notelett bash demo/two-factory-demo.sh
+#
+# Env knobs: DEMO_JOB_SLEEP (per-job engine seconds, default 2), DEMO_TIMEOUT (drain
+#   seconds, default 60), DEMO_POLL (poll seconds, default 0.2), DEMO_KEEP=1 (keep temp).
+#
+# Exit 0 = all three guarantees PASS; non-zero = FAIL. bash 3.2+ (no assoc arrays);
+# awk/sed/grep/pgrep only; mac+linux safe.
+
+set -uo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+AQ="$HERE/../agent-queue.sh"
+STUB="$HERE/coordinator-stub.sh"
+
+# Mode selection: an unset DEMO_MODE means stub, even when the API variables are
+# exported; any other non-stub value is normalized to `real` when both are set.
+DEMO_MODE="${DEMO_MODE:-stub}"
+if [ -n "${AQ_FLEET_API:-}" ] && [ -n "${AQ_FLEET_TOKEN:-}" ] && [ "${DEMO_MODE}" != "stub" ]; then DEMO_MODE=real; fi
+# Non-stub mode dereferences AQ_FLEET_API / AQ_FLEET_TOKEN unguarded in the curl
+# calls; under `set -u` a missing one would abort mid-run with a bare "unbound
+# variable". Fail fast here, before any temp dir or daemon exists.
+if [ "$DEMO_MODE" != stub ] && { [ -z "${AQ_FLEET_API:-}" ] || [ -z "${AQ_FLEET_TOKEN:-}" ]; }; then
+  printf '[demo] DEMO_MODE=%s requires AQ_FLEET_API and AQ_FLEET_TOKEN (or use DEMO_MODE=stub)\n' "$DEMO_MODE" >&2
+  exit 2
+fi
+DEMO_JOB_SLEEP="${DEMO_JOB_SLEEP:-2}"
+DEMO_TIMEOUT="${DEMO_TIMEOUT:-60}"
+DEMO_POLL="${DEMO_POLL:-0.2}"
+F1="${DEMO_FACTORY_1:-mac-1}"      # victim (killed mid-job)
+F2="${DEMO_FACTORY_2:-ubuntu-1}"  # survivor (reclaims)
+
+# ANSI styling: bold, green, red, cyan, reset.
+c_b=$'\033[1m'; c_g=$'\033[32m'; c_r=$'\033[31m'; c_c=$'\033[36m'; c_0=$'\033[0m'
+# log: cyan "[demo]" trace line; ok: green "+" bullet; bad: red "-" bullet on stderr.
+log()  { printf '%s[demo]%s %s\n' "$c_c" "$c_0" "$*"; }
+ok()   { printf '  %s+%s %s\n' "$c_g" "$c_0" "$*"; }
+bad()  { printf '  %s- %s%s\n' "$c_r" "$*" "$c_0" >&2; }
+
+# Scratch area: per-factory queues/cwds, logs, and (stub mode) coordinator state.
+TMP="$(mktemp -d "${TMPDIR:-/tmp}/aq-2factory.XXXXXX")"
+COORD_STATE="$TMP/coord"; export COORD_STATE
+DAEMON_PIDS=()
+
+# kill a process AND its descendants (mac+linux; pgrep -P is portable)
+# Children are killed depth-first before the parent itself gets SIGKILL.
+kill_tree() {
+  local p=$1 c
+  for c in $(pgrep -P "$p" 2>/dev/null); do kill_tree "$c"; done
+  kill -9 "$p" 2>/dev/null || true
+}
+# cleanup: kill every still-tracked daemon (empty slots are skipped) and remove the
+# temp dir unless DEMO_KEEP=1. Safe to run more than once.
+cleanup() {
+  local p
+  if [ "${#DAEMON_PIDS[@]}" -gt 0 ]; then
+    for p in "${DAEMON_PIDS[@]}"; do [ -n "$p" ] && kill_tree "$p"; done
+  fi
+  [ "${DEMO_KEEP:-0}" = "1" ] || rm -rf "$TMP"
+}
+# Cleanup runs from the EXIT trap only. INT/TERM must terminate the script
+# explicitly: a signal handler that merely returns lets bash resume the script,
+# which would then keep polling a temp dir that cleanup just deleted. `exit`
+# inside the handler fires the EXIT trap, so cleanup still runs exactly once.
+trap cleanup EXIT
+trap 'exit 130' INT    # 128 + SIGINT(2)
+trap 'exit 143' TERM   # 128 + SIGTERM(15)
+
+# In stub mode every coordinator HTTP call is routed to the stateful stub via the
+# existing AQ_FLEET_API_CMD seam; in real mode it is unset so curl talks to the service.
+if [ "$DEMO_MODE" = stub ]; then export AQ_FLEET_API_CMD="$STUB"; else unset AQ_FLEET_API_CMD 2>/dev/null || true; fi
+
+# ── coordinator primitives (mode-branched) ─────────────────────────────────
+coord_init() {
+  if [ "$DEMO_MODE" = stub ]; then mkdir -p "$COORD_STATE/jobs"; : > "$COORD_STATE/order"; : > "$COORD_STATE/events.log"; fi
+}
+coord_submit() {  # <jobid> <bodyMd>
+  if [ "$DEMO_MODE" = stub ]; then
+    printf '%s\n' "stage=queued" "holder=" "epoch=0" "body=$2" > "$COORD_STATE/jobs/$1.job"
+    printf '%s\n' "$1" >> "$COORD_STATE/order"
+  else
+    curl -sS -m 30 -X POST -H "Content-Type: application/json" \
+      -H "Authorization: Bearer ${AQ_FLEET_TOKEN}" ${AQ_PRODUCT_ID:+-H "X-Product-Id: $AQ_PRODUCT_ID"} \
+      --data "{\"idempotencyKey\":\"$1\",\"bodyMd\":\"$2\",\"priority\":\"medium\"}" \
+      "${AQ_FLEET_API}${DEMO_SUBMIT_PATH:-/fleet/jobs}" >/dev/null 2>&1 || true
+  fi
+}
+coord_reap() {  # <factoryId> : model the reaper reclaiming a dead factory's leases
+  if [ "$DEMO_MODE" = stub ]; then
+    "$STUB" POST /fleet/_reap "{\"factoryId\":\"$1\"}" >/dev/null 2>&1 || true
+  else
+    log "real mode: waiting ${DEMO_REAP_WAIT:-20}s for the coordinator reaper to reclaim $1's lease"
+    sleep "${DEMO_REAP_WAIT:-20}"
+  fi
+}
+coord_zombie_report() {  # <jobid> <staleEpoch> -> echoes the HTTP code (expect 409)
+  if [ "$DEMO_MODE" = stub ]; then
+    "$STUB" PATCH "/fleet/jobs/$1" "{\"stage\":\"building\",\"leaseEpoch\":$2}" | tail -n1
+  else
+    curl -sS -m 30 -o /dev/null -w '%{http_code}' -X PATCH -H "Content-Type: application/json" \
+      -H "Authorization: Bearer ${AQ_FLEET_TOKEN}" ${AQ_PRODUCT_ID:+-H "X-Product-Id: $AQ_PRODUCT_ID"} \
+      --data "{\"stage\":\"building\",\"leaseEpoch\":$2}" "${AQ_FLEET_API}/fleet/jobs/$1"
+  fi
+}
+# stub-only state readers (assertions in stub mode read authoritative coordinator state)
+jget()   { grep -E "^$2=" "$COORD_STATE/jobs/$1.job" 2>/dev/null | head -1 | cut -d= -f2-; }
+# emit (one per line) the factoryId of every factory currently holding an ACTIVE job
+active_holders() {
+  local jf st ho
+  for jf in "$COORD_STATE"/jobs/*.job; do
+    [ -f "$jf" ] || continue
+    st=$(grep -E '^stage=' "$jf" | cut -d= -f2-); ho=$(grep -E '^holder=' "$jf" | cut -d= -f2-)
+    case "$st" in assigned|building|review|testing) [ -n "$ho" ] && printf '%s\n' "$ho";; esac
+  done
+}
+
+# ── engine + factory launch ─────────────────────────────────────────────────
+# The demo "engine" is a generated script that sleeps DEMO_JOB_SLEEP seconds and
+# exits 0 — the sleep is the window in which the victim factory gets killed.
+engine="$TMP/engine.sh"
+{
+  printf '%s\n' '#!/usr/bin/env bash'
+  printf '%s\n' '# demo engine: sleep then succeed (gives a window to kill mid-job)'
+  printf 'sleep %s\n' "$DEMO_JOB_SLEEP"
+  printf '%s\n' 'exit 0'
+} > "$engine"
+chmod +x "$engine"
+
+# start_factory <factoryId>: initialise a private queue and launch one background
+# `agent-queue.sh run` daemon for it, recording its pid in DAEMON_PIDS.
+start_factory() {
+  local fid=$1
+  local root="$TMP/q-$fid" work="$TMP/w-$fid" daemon_pid
+  mkdir -p "$work"
+  AGENT_QUEUE_ROOT="$root" "$AQ" init >/dev/null 2>&1
+  # Own queue + cwd per factory; AQ_FLEET=1 / AQ_FLEET_ROUTE=1 make the coordinator
+  # authoritative; MAX=1 keeps it to one job at a time; POLL=1 polls quickly. The
+  # only shared channel is the coordinator (AQ_FLEET_API_CMD / AQ_FLEET_API come
+  # from the surrounding environment).
+  AGENT_QUEUE_ROOT="$root" \
+    AGENT_QUEUE_MAX=1 \
+    AGENT_QUEUE_POLL=1 \
+    AQ_FLEET=1 \
+    AQ_FLEET_ROUTE=1 \
+    AQ_FACTORY_ID="$fid" \
+    AQ_FLEET_CWD="$work" \
+    AQ_FLEET_API="${AQ_FLEET_API:-http://stub.local/api}" \
+    DEVIN_BIN="$engine" \
+    "$AQ" run >"$TMP/log-$fid.txt" 2>&1 &
+  daemon_pid=$!
+  DAEMON_PIDS+=("$daemon_pid")
+  # drop the job from job control so the later SIGKILL prints no "Killed" notice
+  disown 2>/dev/null || true
+  log "started factory $c_b$fid$c_0 (pid $daemon_pid, queue q-$fid)"
+}
+
+# ════════════════════════════════════════════════════════════════════════════
+log "Phase-2 two-factory parallel demo — mode=$c_b$DEMO_MODE$c_0 (job-sleep=${DEMO_JOB_SLEEP}s)"
+coord_init
+
+# 1) submit 3 jobs
+for n in 1 2 3; do coord_submit "demo-job-$n" "two-factory demo job $n"; done
+log "submitted 3 jobs to the coordinator"
+
+# 2) start two factories competing through the coordinator
+#    (F1 is started first, so its pid lands in DAEMON_PIDS[0])
+start_factory "$F1"
+start_factory "$F2"
+
+# 3) PARALLELISM: wait until BOTH factories simultaneously hold an active job, and the
+#    victim (F1) holds one we can kill mid-job.
+PARALLELISM_OK=0; VICTIM_JOB=""; VICTIM_EPOCH=""
+if [ "$DEMO_MODE" = stub ]; then
+  # Poll the stub's state files for up to 30 seconds.
+  deadline=$(( $(date +%s) + 30 ))
+  while [ "$(date +%s)" -lt "$deadline" ]; do
+    holders=$(active_holders | sort -u | tr '\n' ' ')
+    if printf '%s' "$holders" | grep -qw "$F1" && printf '%s' "$holders" | grep -qw "$F2"; then
+      PARALLELISM_OK=1
+      # Pick the first active job held by F1 and remember the epoch it was claimed
+      # at — that epoch is replayed later as the zombie report.
+      for jf in "$COORD_STATE"/jobs/*.job; do
+        [ -f "$jf" ] || continue
+        if [ "$(grep -E '^holder=' "$jf" | cut -d= -f2-)" = "$F1" ]; then
+          case "$(grep -E '^stage=' "$jf" | cut -d= -f2-)" in
+            assigned|building|review|testing)
+              VICTIM_JOB=$(basename "$jf" .job); VICTIM_EPOCH=$(jget "$VICTIM_JOB" epoch); break;;
+          esac
+        fi
+      done
+      [ -n "$VICTIM_JOB" ] && break
+    fi
+    sleep "$DEMO_POLL"
+  done
+else
+  # Real mode does not inspect coordinator state: it waits DEMO_SETTLE seconds
+  # (default 5), assumes parallelism, and takes the victim job/epoch from
+  # DEMO_VICTIM_JOB / DEMO_VICTIM_EPOCH (defaults demo-job-1 / 1).
+  sleep "${DEMO_SETTLE:-5}"; PARALLELISM_OK=1; VICTIM_JOB="${DEMO_VICTIM_JOB:-demo-job-1}"; VICTIM_EPOCH="${DEMO_VICTIM_EPOCH:-1}"
+fi
+if [ "$PARALLELISM_OK" = 1 ]; then log "PARALLELISM observed: $F1 and $F2 both holding active jobs concurrently"; else log "WARN: did not observe simultaneous holders"; fi
+log "victim=$c_b$F1$c_0 holds job $c_b${VICTIM_JOB:-?}$c_0 (epoch ${VICTIM_EPOCH:-?}) — killing it mid-job"
+
+# 4) KILL the victim factory mid-job (hard crash, no graceful drain)
+#    The kill happens even when step 3 found no victim job.
+victim_pid="${DAEMON_PIDS[0]}"
+kill_tree "$victim_pid"
+DAEMON_PIDS[0]=""   # blank the slot so cleanup skips the already-killed pid
+log "killed factory $F1 (pid $victim_pid)"
+
+# 5) RECLAIM: the reaper returns the victim's in-flight job to the queue (epoch bumped)
+#    (the log line below is printed unconditionally; the RECLAIM event count is
+#    what the assertions check later)
+coord_reap "$F1"
+log "reaper reclaimed $F1's lease(s)"
+
+# 6) FENCE the zombie: the dead worker's LATE report (stale epoch) must be rejected (409)
+#    Skipped (FENCE_OK stays 0) when no victim job/epoch was captured.
+FENCE_OK=0
+if [ -n "$VICTIM_JOB" ] && [ -n "$VICTIM_EPOCH" ]; then
+  zcode=$(coord_zombie_report "$VICTIM_JOB" "$VICTIM_EPOCH")
+  if [ "$zcode" = 409 ]; then FENCE_OK=1; ok "zombie report for $VICTIM_JOB @epoch=$VICTIM_EPOCH was FENCED (HTTP 409)"; else bad "zombie report not fenced (HTTP $zcode)"; fi
+fi
+
+# 7) DRAIN: the survivor (F2) finishes everything, including the reclaimed job
+log "draining remaining work on the survivor ($F2)..."
+DONE=0
+if [ "$DEMO_MODE" = stub ]; then
+  # Poll until all 3 jobs are in review|testing|shipped, or DEMO_TIMEOUT elapses.
+  deadline=$(( $(date +%s) + DEMO_TIMEOUT ))
+  while [ "$(date +%s)" -lt "$deadline" ]; do
+    d=0
+    for jf in "$COORD_STATE"/jobs/*.job; do
+      case "$(grep -E '^stage=' "$jf" | cut -d= -f2-)" in review|testing|shipped) d=$((d+1));; esac
+    done
+    [ "$d" -ge 3 ] && { DONE=1; break; }
+    sleep "$DEMO_POLL"
+  done
+else
+  # Real mode: fixed wait (DEMO_DRAIN_WAIT, default 30s), then assume drained.
+  sleep "${DEMO_DRAIN_WAIT:-30}"; DONE=1
+fi
+
+# ── ASSERT the three guarantees (stub mode reads authoritative coordinator state) ──
+echo
+log "${c_b}RESULTS${c_0}"
+PASS=1
+if [ "$DEMO_MODE" = stub ]; then
+  # count_events <EVENT>: number of events.log lines containing " <EVENT> ".
+  # `grep -c` already prints 0 (and exits 1) when nothing matches, so appending
+  # `|| echo 0` would yield the two-line value "0\n0" and break the integer tests
+  # below. Only a missing/unreadable log leaves the output empty, hence the :-0.
+  count_events() {
+    local n
+    n=$(grep -c " $1 " "$COORD_STATE/events.log" 2>/dev/null)
+    printf '%s\n' "${n:-0}"
+  }
+
+  # Per-job summary: a job counts as done once it reached review|testing|shipped.
+  reviewed=0
+  for jf in "$COORD_STATE"/jobs/*.job; do
+    [ -f "$jf" ] || continue   # no job files -> skip the unexpanded glob pattern
+    jid=$(basename "$jf" .job); st=$(jget "$jid" stage); ho=$(jget "$jid" holder)
+    case "$st" in
+      review|testing|shipped) reviewed=$((reviewed+1)); printf '    job %-12s -> %s (stage=%s)\n' "$jid" "$ho" "$st";;
+      *) printf '    job %-12s -> INCOMPLETE (stage=%s)\n' "$jid" "$st";;
+    esac
+  done
+  claims=$(count_events CLAIM)
+  distinct_claimers=$(grep ' CLAIM ' "$COORD_STATE/events.log" 2>/dev/null | sed -n 's/.*factory=\([^ ]*\).*/\1/p' | sort -u | tr '\n' ' ')
+  reclaims=$(count_events RECLAIM)
+  fences=$(count_events FENCE)
+  # final holder of the job the victim was killed on (empty when none was captured)
+  victim_winner=$(jget "${VICTIM_JOB:-_none_}" holder)
+
+  if [ "$reviewed" -eq 3 ]; then ok "(a) no double-assign: all 3 jobs executed to terminal, one winner each"; else bad "(a) only $reviewed/3 jobs reached terminal"; PASS=0; fi
+  if [ -n "$VICTIM_JOB" ] && [ "$victim_winner" = "$F2" ]; then ok "    reclaimed job $VICTIM_JOB completed by survivor $F2 (not the killed $F1)"; elif [ -n "$VICTIM_JOB" ]; then bad "    reclaimed job $VICTIM_JOB winner='$victim_winner' (expected $F2)"; PASS=0; fi
+  if [ "$reclaims" -ge 1 ]; then ok "(b) reclaim: $reclaims RECLAIM event(s) (reaper returned the dead factory's job)"; else bad "(b) no RECLAIM event"; PASS=0; fi
+  if [ "$FENCE_OK" = 1 ] && [ "$fences" -ge 1 ]; then ok "(b) fencing: zombie report rejected (409); $fences FENCE event(s)"; else bad "(b) zombie was not fenced (fence_ok=$FENCE_OK events=$fences)"; PASS=0; fi
+  if [ "$PARALLELISM_OK" = 1 ] && printf '%s' "$distinct_claimers" | grep -qw "$F1" && printf '%s' "$distinct_claimers" | grep -qw "$F2"; then ok "(c) parallelism: both factories claimed concurrently (claimers: ${distinct_claimers}; $claims claims)"; else bad "(c) parallelism not observed (claimers: ${distinct_claimers})"; PASS=0; fi
+else
+  # Real mode asserts nothing: PASS stays 1 and the run is observational only.
+  if [ "$DONE" = 1 ]; then ok "real mode: drain window elapsed — inspect the coordinator + factory logs in $TMP"; fi
+  ok "real mode is best-effort/observational; the asserted guarantees are validated in stub mode (and selftest)."
+fi
+
+echo
+if [ "$PASS" = 1 ]; then
+  printf '%s[demo] PASS%s — Phase-2 exit guarantees demonstrated (no double-assign + reclaim/fence + parallelism)\n' "$c_g" "$c_0"; exit 0
+else
+  printf '%s[demo] FAIL%s\n' "$c_r" "$c_0"; exit 1
+fi
--- a/agent-queue/docs/GIGAFACTORY/FLEET_DISPATCH_REDESIGN.md
+++ b/agent-queue/docs/GIGAFACTORY/FLEET_DISPATCH_REDESIGN.md
@ -0,0 +1,515 @@
+# Fleet Dispatch Redesign — Broker-Backed, On-Demand Factories
+
+> Design proposal (no code yet). Companion to `GIGAFACTORY_SYSTEM_OVERVIEW.md`
+> (what exists today) and `GIGAFACTORY_ROADMAP.md` (source-of-truth spec). This
+> doc realizes roadmap **Phase 4** ("Message bus + autoscaling") and the
+> routing-model cleanup that comes with it. Last reviewed: **2026-05-31**.
+>
+> **Review log**
+> - v1 (2026-05-31): initial proposal.
+> - v2 (2026-05-31): self-review pass — reconciled the routing model
+>   (coordinator-targeted as primary), fixed the Cosmos outbox transactionality
+>   claim (change feed *is* the log), constrained message size (jobId + routing
+>   props only), addressed long-job vs Service Bus 5-min lock, corrected the
+>   idempotency key (`MessageId = jobId`), renamed migration steps `M0–M3` to
+>   avoid collision with roadmap phases, fixed the Phase-0 RU figure, and added a
+>   ticked roadmap checklist + auth/observability notes.
+> - v3 (2026-05-31): added **§5.5 Error handling & cleanup** (current behavior +
+>   lease-release-on-failure, branch/worktree GC, same-repo worktree clobber).
+>   Review fixes: unified the field name to `targetFactoryId` (§5.1), reconciled
+>   §5.3 with the complete-on-claim model (broker is not the redelivery path),
+>   aligned §6 token scoping with per-factory subscriptions, and added the GC /
+>   `POST /fleet/fail` checklist block to §12.
+> - v4 (2026-05-31): **coverage audit** — roadmap now maps 1:1 to the design via a
+>   coverage matrix. Closed plan gaps: **M-prep** (decisions/§10 + schema +
+>   containers + RBAC), correlation filter + dispatcher budget enforcement (M1),
+>   small-messages/body-from-Cosmos + token re-check + alerting (M2), and new
+>   **Testing** and **Rollback & flags** blocks. No design element is now without
+>   an implementation step.
+> - v5 (2026-05-31): **M0 implemented + shipped** (`fleet_queue_state` + bump
+>   hooks + `GET /fleet/queue-state` in common_plat; `AQ_FLEET_GATE` gate-skip in
+>   agent-queue). Reconciled M0 to the as-built approach (gate the *claim*; keep
+>   `POLL_SECONDS` for local responsiveness rather than raising it globally) and
+>   ticked the M0 checklist. Backend vitest + gate logic verified.
+
+---
+
+## 1. Why this doc exists (the two smells)
+
+Two structural problems surfaced while running the local fleet against
+`tracker-web` + `platform-service`:
+
+### 1.1 Product-as-queue is conflated with repo-as-work-target
+
+- `fleet_jobs` is partitioned by **`/productId`**, and a factory is bound to a
+  single product via `AQ_PRODUCT_ID`. The job's **`repo`** is just a payload
+  field (the PR target). Routing uses `productId`; the repo is orthogonal.
+- Consequence observed: a `learning_ai_notes` job submitted via the form was
+  filed under **`chronomind`** (because the form's Factory dropdown maps
+  `mac-2 → chronomind`), and would have opened a PR to the notes repo from a
+  "chronomind" factory. Nothing ties the product to the repo, and nothing
+  guarantees the chosen factory even has that repo checked out.
+- The form (`dashboards/tracker-web/.../fleet/jobs/page.tsx`) hardcodes
+  `FLEET_FACTORIES = [mac-1→lysnrai, mac-2→chronomind]` and defaults
+  `capabilities = "build"` — a capability **no agent-queue factory ever
+  advertises** (`detect_capabilities` only emits `os:*`, `engine:*`, `node:*`,
+  `has:*`). So default UI submissions are unroutable to live factories.
+
+### 1.2 Pull-poll daemons burn Cosmos RU to stay "ready"
+
+- The run loop iterates every **`POLL_SECONDS=3`**; with `AQ_FLEET_ROUTE=1`
+  (default) each iteration calls `POST /fleet/claim`.
+- `claimNextJob` runs `repo.listJobs({ productId })` — **reads every job doc in
+  the product partition, no stage filter, no limit** — on every claim, plus a
+  `getLease` point-read per active job when preemption is on.
+- One process **per product** (`_start_fleet.sh` spawns 4) ⇒ ~`4 × (1/3s)` ≈
+  **115k claim queries/day at idle**, each scaling with partition size, billed
+  continuously whether or not work exists. The machine must also stay up running
+  the loop.
+
+> **Root cause:** `productId` is doing double duty as *tenant/billing scope* and
+> *work-routing queue*, and work discovery is a busy-poll against the state store.
+
+---
+
+## 2. Goals, non-goals, constraints
+
+**Goals**
+- Eliminate idle-poll RU cost; pay (near) zero when there is no work.
+- Make a factory a **generic build worker** (host + capabilities + engines +
+  checked-out repos), not a product-bound process.
+- Route work by what actually matters (**capabilities + repo**), while keeping
+  per-product **billing, budgets, visibility, and token scoping**.
+- Preserve the existing **weighted scheduler** and **leaseEpoch fencing**
+  (exactly-once assignment, zombie-writer protection).
+- Enable later **on-demand spawn** (scale-to-zero) without re-architecting.
+
+**Non-goals (this phase)**
+- Replacing Cosmos as the system of record for job/lease/event/budget **state**.
+- Rewriting the scheduler's scoring math.
+- Multi-region / cross-cloud dispatch.
+
+**Hard constraints (ecosystem rules)**
+- Every Cosmos doc keeps a `productId` (platform rule) — product stays a
+  first-class **tag**, even when it is no longer the routing key.
+- Per-product budgets (`fleet_budgets /productId`), enrollment tokens (roadmap
+  §12), and the `tracker-web` per-product views must keep working.
+- Changes must be flag-gated and reversible (match the existing
+  `AQ_FLEET` / `AQ_FLEET_ROUTE` / `AQ_FLEET_SHADOW` cutover discipline).
+
+---
+
+## 3. Decision summary
+
+1. **Do NOT build A3 ("single shared queue") inside Cosmos.** A single logical
+   queue tempts a hot partition; scaling it forces a synthetic partition key and
+   a **cross-partition "find next job" query**, which *increases* RU — the
+   opposite of the goal. It also dissolves the per-product isolation the
+   platform's tenancy/budget/token model depends on.
+2. **Get the shared-queue behavior from a real broker (B3), not from Cosmos.**
+   Adopt **Azure Service Bus** as the dispatch substrate. Cosmos remains
+   product-partitioned for **state**; the broker owns **delivery**.
+3. **Keep the scheduler.** Use a **coordinator-owns-scheduling /
+   broker-owns-delivery hybrid** (B2 ⊕ B3): the coordinator decides *which
+   factory* should run a job and pushes a **targeted** message; the broker
+   handles transport, visibility timeout, retries, and dead-lettering.
+4. **Ship the cheap RU win first (B1) as step M0** — it is reversible, needs no
+   new infra, and de-risks the broker migration by removing the bleed while the
+   bigger change is built and shadowed.
+
+> Net: the shared-queue *experience* (generic workers, one work stream) comes
+> from Service Bus topics/subscriptions; Cosmos stays `/productId`-partitioned
+> for state, budgets, and visibility.
+
+---
+
+## 4. Target architecture
+
+### 4.1 Components & ownership
+
+| Concern | Owner (target) | Notes |
+| --- | --- | --- |
+| Job/lease/event/budget **state** | **Cosmos** (`/productId`, `/jobId` as today) | unchanged system of record |
+| **Scheduling** (which factory) | **Coordinator** (platform-service) | existing weighted scorer + preemption |
+| **Dispatch / delivery** | **Service Bus** | competing consumers, visibility timeout, DLQ |
+| **Fencing** (zombie writers) | **Cosmos `leaseEpoch`** | broker visibility ≠ correctness boundary |
+| Per-product billing/budgets/tokens | **Cosmos + coordinator** | enforced at submit + assign, not by partition |
+| Control planes | `tracker-web`, `agent-queue` dashboard | unchanged REST surface |
+
+### 4.2 Service Bus topology
+
+- One **topic** `fleet-dispatch`.
+- **Primary model — coordinator-targeted (preserves the scheduler):** the
+  coordinator picks the factory, then publishes a message stamped with
+  `targetFactoryId`. Each factory has its **own subscription** with a
+  **correlation filter** `targetFactoryId = '<me>'`. The broker does no policy —
+  it just delivers the scorer's decision. **This is the model the rest of this
+  doc assumes.**
+- **Fallback model — self-select (only if the scheduler is disabled):**
+  capability/repo **SQL filters** on message application properties let consumers
+  self-match. Multi-valued `capabilities` do **not** filter cleanly as one
+  string, so encode each as a boolean property (`cap_os_mac=true`,
+  `repo_learning_ai_notes=true`) rather than `LIKE '%…%'`. Subscription filters
+  are why Service Bus beats Storage Queue / SQS (which can't filter → a
+  queue-per-class sprawl).
+- **Messages stay small.** A message carries only
+  `{ jobId, productId, repo, caps, priority, targetFactoryId }` — **not**
+  `bodyMd`/manifest. The consumer reads the full job from Cosmos by `jobId`.
+  (Service Bus max message is **256 KB** on Standard; Premium defaults to 1 MB,
+  configurable up to 100 MB. Job bodies can approach the Standard limit —
+  reinforcing "broker = transport, Cosmos = state".)
+- **DLQ** per subscription ⇒ maps onto `failed` / `retries_exhausted`.
+- **Sessions** (optional) keyed by `repo` to serialize same-repo work and avoid
+  worktree/branch contention on one host.
+
+### 4.3 Why this keeps the scheduler
+
+A vanilla broker is FIFO competing-consumers and does **no** weighted scoring.
+To preserve the existing scorer (`capabilityFit / affinity / load / costFit /
+health / starvation`) + preemption + seat limits, the coordinator stays in the
+decision path: it **selects the target factory** and publishes a message whose
+`targetFactoryId` property matches the correlation filter on *that* factory's
+subscription (§4.2). The broker is transport, not policy.
+
+---
+
+## 5. Key flows
+
+### 5.1 Submit → dispatch (consistency)
+
+The **Cosmos change feed on `fleet_jobs` is the durable, ordered event log**, so
+no separate outbox container is needed for the primary design:
+
+1. `submitJob` writes the `fleet_jobs` doc (`stage: queued`). That write *is* the
+   event.
+2. A single **dispatcher** (coordinator process) tails the `fleet_jobs` change
+   feed (via a lease container), runs the scheduler for each new/`queued` job,
+   stamps `targetFactoryId` on the job (CAS), and **publishes** the targeted
+   Service Bus message.
+3. **Crash-safe & idempotent:** the change feed redelivers from the last
+   checkpoint on dispatcher restart; Service Bus **duplicate detection** keyed on
+   **`MessageId = jobId`** collapses re-publishes. The consumer is idempotent
+   because the authoritative claim is a Cosmos CAS on `leaseEpoch` — a second
+   delivery is simply fenced (`leaseEpoch` is assigned *at claim*, so it is **not**
+   a valid dedup key for the message itself).
+
+> A separate **transactional outbox** is only needed if you ever publish *inline*
+> at submit instead of via the change feed. Cross-container writes are **not
+> atomic** in Cosmos, so an outbox row would have to live in the **same container
+> + same partition** as the job and be written with a **Cosmos transactional
+> batch** — or, simpler, carried as an `outboxState` field on the job doc itself.
+> The change-feed design avoids this entirely.
+
+> Net effect: the per-factory busy-poll is replaced by one change-feed-driven
+> dispatcher. Idle cost is event-driven, not a per-3s full-partition scan.
+
+### 5.2 Deliver → claim → fence
+
+1. Factory **receives** a message (long-poll/`receiveMessages`, no RU).
+2. Factory calls `POST /fleet/claim` (or a lighter `/fleet/accept`) with
+   `{ jobId, factoryId }`. Coordinator does the **CAS lease** in Cosmos exactly
+   as today (`revUpdateJob` + `leaseEpoch` bump) and returns the new epoch.
+   409 ⇒ fenced ⇒ factory abandons the message (it goes back / to DLQ).
+3. The **broker lock** governs redelivery (a dead consumer's message reappears);
+   the **Cosmos `leaseEpoch`** governs *correctness* (a zombie writer is rejected
+   on PATCH). Two distinct mechanisms — do not collapse them.
+4. **Long-running jobs vs the broker lock.** Service Bus message lock max is
+   **5 minutes**; a coding job runs far longer. Two viable patterns:
+   - **(recommended) complete-on-claim:** complete the message immediately after
+     a successful Cosmos claim. The **Cosmos lease + reaper** then own liveness —
+     on crash the reaper sets the job back to `queued`, which is a change-feed
+     event that **re-dispatches** (§5.1). This decouples job runtime from the
+     5-min lock entirely.
+   - **renew-lock:** keep the message locked and call `renewMessageLock` on a
+     timer, reusing the existing `AQ_FLEET_LEASE_RENEW_SEC` cadence to renew
+     *both* the Cosmos lease and the broker lock. Simpler delivery semantics, but
+     couples runtime to the broker and risks redelivery storms on long jobs.
+
+### 5.3 Failure / retry / DLQ
+
+Assumes the recommended **complete-on-claim** model (§5.2): the broker message is
+completed at claim, so the broker is **not** the redelivery path — re-dispatch is
+driven by Cosmos stage changes through the change feed (§5.1).
+
+- **Logical failure** (engine error / verify-fail) ⇒ coordinator transitions the
+  job to `failed` and **releases the lease immediately** (new `/fleet/fail`, see
+  §5.5); no redelivery (a logical failure is terminal unless a retry policy
+  applies).
+- **Retryable failure** ⇒ coordinator sets the job back to `queued` (attempts++,
+  backoff) ⇒ change-feed re-dispatch to the next best factory.
+- **Crash / lease-expiry** ⇒ the **reaper** reclaims the Cosmos lease (bumps
+  `leaseEpoch`, fencing the dead holder) and returns the job to `queued` ⇒
+  change-feed re-dispatch. (With the alternative *renew-lock* model, broker
+  redelivery is the trigger instead — pick one, not both.)
+- **Exhausted retries** ⇒ Cosmos `retries_exhausted`; mirror to the broker DLQ
+  for visibility.
+
+### 5.4 Routing model (the §1.1 fix)
+
+- Job carries `repo` + required `capabilities` (real tokens: `os:*`, `engine:*`,
+  `has:git`, plus a new `repo:<name>` token).
+- The **scheduler** does the matching: it picks among factories that advertise
+  those caps **and** have the repo locally (or can clone it), then targets the
+  winner (§4.2 primary model: message stamped `targetFactoryId`, delivered via
+  that factory's correlation-filtered subscription).
+- **Product is a property/tag** used for billing/visibility and budget checks —
+  **not** the routing key. (In the self-select fallback, product/caps/repo become
+  subscription SQL filters instead.)
+- Fix the `tracker-web` form in lockstep: derive factories/repos from live data,
+  drop the bogus default `capabilities = "build"`, and stop hardcoding
+  `mac-1/mac-2`.
+
+### 5.5 Error handling & cleanup (worktrees, branches, leases)
+
+**Today (single-host, agent-queue.sh).** The worker already handles errors well:
+the stage machine routes `timeout`/`budget_exceeded`/`crash`/`verify_failed`/
+`capability_mismatch`/`no_engine` through `_finish_failure` (→ `failed/`, with a
+retry policy that requeues to `inbox/` with backoff); a `trap` writes a WIP
+checkpoint to `aq/wip/<job>` on **every** exit path; `recover_orphans` requeues
+dead-worker `building/` jobs; and a **FENCED** report (stale `leaseEpoch`)
+triggers `fleet_quarantine` → `failed/` that **never ships or merges**
+(split-brain guard). PR/merge cleanup: `.aq_pr.md` is removed before commit; the
+PR branch `aq/job/<jid>` is deleted on auto-merge (`--delete-branch`); the repo
+worktree is force-recreated at the next job for that repo.
+
+**Gaps this redesign must close.** These are real loose ends in the current code:
+
+1. **No client-side lease release on failure.** `_finish_failure` is
+   fleet-agnostic, so a failed fleet job's lease only frees on **expiry** via the
+   reaper — slow recovery. Target: a `POST /fleet/fail` (stage=`failed`/`queued`
+   + release lease) so failure is reflected and the lease freed **immediately**.
+2. **Unbounded git artifacts.** `aq/wip/<job>` branches are never GC'd; worktrees
+   are cleaned only on reuse; unmerged `aq/job/<jid>` branches accumulate on
+   origin when auto-merge is off or blocked by branch protection. Target: a
+   periodic **GC** sweep — delete merged `aq/job/*`, prune stale worktrees, and
+   sweep `aq/wip/*` after a job reaches a terminal/shipped stage.
+3. **Same-repo concurrency can clobber a worktree.** The per-repo worktree is
+   force-recreated, so two same-repo jobs on one host collide. Target: **Service
+   Bus sessions keyed by `repo`** (serialize same-repo work) plus a per-`(host,
+   repo)` lock as a local backstop.
+
+**Target invariants.**
+- Terminal failure ⇒ Cosmos `failed` + lease released now (no expiry wait); DLQ
+  mirrors `retries_exhausted` for visibility.
+- Crash / fence ⇒ reaper bumps `leaseEpoch` (fences zombie) ⇒ `queued` ⇒
+  change-feed re-dispatch (§5.3).
+- Cleanup is **explicit and idempotent** — safe to re-run, never deletes a branch
+  with unmerged work or a worktree with an in-flight job. (Checklist in §12.)
+
+---
+
+## 6. Per-product tenancy without product-partitioned queues
+
+- **Budgets:** checked by the coordinator at **assign time** (it already reads
+  `fleet_budgets /productId` in `claimNextJob`); unchanged, just moved to the
+  dispatcher.
+- **Tokens (roadmap §12):** the factory token still scopes `productId + capabilities +
+  factoryId`. In the primary (coordinator-targeted) model the dispatcher only
+  ever targets a factory the scheduler deemed eligible, and the coordinator
+  **re-checks the token on `/fleet/claim`** — so least-privilege holds without
+  relying on the subscription topology. (In the self-select fallback, scope it
+  with per-product/per-token subscription filters instead.)
+- **Visibility:** `tracker-web` keeps querying per product (state is still
+  product-partitioned), so the UX is unchanged.
+
+---
+
+## 7. Alternatives considered
+
+| Option | Verdict | Reason |
+| --- | --- | --- |
+| **A3 shared queue in Cosmos** | ✗ | hot partition; cross-partition claim = more RU; loses tenancy isolation |
+| **A1 validate ownership only** | partial | fixes "wrong factory" but not the RU/poll model or process-per-product |
+| **Storage Queue / SQS broker** | ✗ (for now) | no subscription filters ⇒ queue-per-capability sprawl; weaker DLQ/visibility ergonomics |
+| **B2 change feed, no broker** | viable | good for dispatch signal, but still needs a transport to *reach* factories; pairs naturally with B3 |
+| **Plain competing-consumers (drop scheduler)** | ✗ | throws away weighted scoring + preemption + cost/affinity routing |
+| **B3 Service Bus + coordinator hybrid** | ✓ chosen | zero idle RU, keeps scheduler + fencing, filters give capability/repo routing, paves path to B4 |
+
+---
+
+## 8. Phased migration
+
+> Steps are labelled **M0–M3** to avoid collision with the roadmap's Phase 0–5
+> numbering; all of M0–M3 sit *inside* roadmap **Phase 4**. The ticked checklist
+> is in §12.
+
+### M0 — RU quick win (no new infra, fully reversible) — *IMPLEMENTED*
+- Per-product `fleet_queue_state` doc holds a monotonic `version`, bumped on job
+  create + every stage change (centralized in the repo layer, best-effort).
+- The factory run loop does a **~1-RU point-read** (`GET /fleet/queue-state`) and
+  **skips the expensive claim** while the version is unchanged and it is not
+  mid-drain — rather than raising `POLL_SECONDS` globally (which would slow local,
+  non-fleet job pickup). A periodic safety backstop + fail-open-on-read-error
+  guarantee work is never stranded.
+- Gated behind **`AQ_FLEET_GATE=1`** (default OFF ⇒ byte-for-byte prior behavior).
+- Expected: **~10–50× fewer claim queries at idle**, local responsiveness
+  unchanged.
+- Code: common_plat `services/platform-service/src/modules/fleet/{types,repository,routes}.ts`
+  + `lib/cosmos-init.ts`; `agent-queue/lib/fleet-client.sh` (`fleet_gate_*`) + the
+  run-loop hook in `agent-queue.sh`. Tests: fleet vitest (repo bump + endpoint) +
+  selftest `39b` (gate decisions).
+
+### M1 — Stand up the broker in **shadow**
+- Provision Service Bus (`fleet-dispatch` topic + subscriptions) with
+  **managed-identity** auth (no connection-string keys in env/`.env`). Coordinator
+  publishes messages **in parallel** with the existing claim path but factories
+  still source work from Cosmos. Use the existing `AQ_FLEET_SHADOW` discipline:
+  record divergence (did the broker route match the scorer's pick?) without
+  acting on it.
+
+### M2 — Cutover delivery to the broker
+- Flip a flag so factories source work from Service Bus + `/fleet/claim` for
+  fencing; Cosmos poll path becomes the fallback only. Keep the reaper + lease
+  fencing untouched. Validate exactly-once + crash recovery on multi-host.
+
+### M3 — On-demand factories (B4)
+- KEDA / Container Apps scale-to-zero on subscription depth: spin a factory only
+  when depth > 0; idle ⇒ **zero** running workers and zero RU. Warm-pool a single
+  small instance if cold-start latency matters.
+
+---
+
+## 9. Risks & mitigations
+
+| Risk | Mitigation |
+| --- | --- |
+| Dual source-of-truth (broker + Cosmos) drift | change-feed *is* the log (no separate outbox); SB duplicate-detection on `MessageId=jobId`; claim is a Cosmos CAS on `leaseEpoch` |
+| Broker lock vs `leaseEpoch` confusion | explicit rule: broker lock = *delivery*, `leaseEpoch` = *correctness*; never merge (§5.2) |
+| Long job > 5-min broker lock | **complete-on-claim** (reaper + change feed re-dispatch) or `renewMessageLock` on the lease cadence (§5.2) |
+| Message > 256 KB | message carries `jobId` + routing props only; consumer reads body from Cosmos (§4.2) |
+| Same-repo worktree contention on one host | Service Bus **sessions** keyed by `repo` to serialize same-repo jobs, plus a per-`(host, repo)` local lock as a backstop (§5.5) |
+| Lost scheduler features under FIFO | coordinator keeps assignment; broker only transports targeted messages |
+| Token scope leak in shared subscriptions | per-factory subscription + correlation filter; coordinator re-checks the factory token (roadmap §12) on claim |
+| Secrets in env (`.env` keys) | **managed identity** for Service Bus + Cosmos; no connection-string keys committed |
+| Blind operation | emit metrics: subscription depth, dispatch lag, claim-conflict (409) rate, DLQ count, change-feed lag — wire to existing monitoring |
+| Migration regressions | M1 shadow measures divergence before any cutover; all flag-gated |
+
+---
+
+## 10. Open questions
+
+1. **Per-factory subscription scale.** The chosen coordinator-targeted model uses
+   one subscription per factory (correlation filter on `targetFactoryId`). Service
+   Bus allows up to **2,000 subscriptions/topic**, so this scales for realistic
+   fleets. If factory churn is high, fall back to a single subscription with a
+   per-consumer `targetFactoryId` SQL filter.
+2. **Where does the dispatcher run?** A new lightweight loop in platform-service
+   vs a separate worker. A change-feed lease container is required either way; a
+   single active dispatcher (leader-elected) avoids double-publish.
+3. **Cost envelope:** Service Bus tier (Standard vs Premium). Standard likely
+   sufficient — it already supports sessions and duplicate detection; Premium
+   only if large messages (> 256 KB) or VNet isolation are needed. Confirm
+   against expected message volume.
+4. **Do we keep the Cosmos poll path permanently** as an offline/degraded
+   fallback (like today's `AQ_FLEET_ROUTE=0`)? Recommend yes.
+5. **Repo advertisement.** How does a factory tell the coordinator which repos it
+   has locally (for the `repo:<name>` capability)? Extend the heartbeat payload
+   with a `repos[]` list, or derive from `AQ_FLEET_REPO_BASE`.
+
+---
+
+## 11. Appendix — idle RU cost sketch (today vs M0 vs target)
+
+| Model | Claim/work-find ops at idle (4 factories) | Notes |
+| --- | --- | --- |
+| **Today** (poll 3s) | ~115k/day full-partition `listJobs` | scales with partition size; ~`4 × 28.8k` |
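+| **M0** (poll 3s + claim gate, as built) | ~115k/day **1-RU point-reads** + ~2–12k/day full scans (safety backstop) | full scan only when the gate doc changes or the safety backstop fires; ~10–50× fewer claims than today (§8) |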
+| **Target (B3)** | ~0 | long-poll receive, no RU; full scan never on the hot path |
+
+> Figures are order-of-magnitude to frame the decision, not a billing estimate.
+> A full-partition `listJobs` costs many RU and grows with partition size; a
+> point-read is ~1 RU and flat. The point: idle cost goes from "linear in
+> partition size, forever" to "≈ zero".
+
+---
+
+## 12. Roadmap & checklist (roadmap Phase 4)
+
+Acceptance gate for the whole effort: **idle work-find RU ≈ 0**, the
+"wrong-factory / ineligible-capability" stranding is gone, exactly-once
+assignment + crash recovery still hold on multi-host, and every step is
+flag-gated + reversible.
+
+### Coverage matrix (design → plan)
+
+Every design element maps to a checklist block below — no design decision is left
+without an implementation step.
+
+| Design element | §ref | Plan block |
+| --- | --- | --- |
+| Idle-poll RU bleed | §1.2 | M0 |
+| Product-as-queue / wrong-factory | §1.1, §5.4 | Routing-model fix |
+| Open questions / decisions | §10 | M-prep |
+| Schema, containers, RBAC | §4, §5, §6 | M-prep |
+| Service Bus topic + subscriptions + filters | §4.2 | M-prep, M1 |
+| Change-feed dispatcher + scheduler + targeting | §4.3, §5.1 | M1 |
+| Budget enforcement at assign | §6 | M1 |
+| Claim/fence + complete-on-claim | §5.2 | M2 |
+| Small messages (body from Cosmos) | §4.2 | M2 |
+| Token re-check at claim | §6 | M2 |
+| Metrics + alerting | §9 | M2 |
+| Failure→lease release, GC, same-repo clobber | §5.5 | Error handling & cleanup |
+| Scale-to-zero on-demand | §2, §8 | M3 |
+| Tests (dispatcher, CAS/fencing, GC, shadow) | §9 | Testing |
+| Rollback / flags per phase | §3 | Rollback & flags |
+| Doc updates | — | Docs to update |
+
+### M-prep — Decisions & schema (closes §10; before M1)
+- [ ] Lock dispatcher placement (platform-service loop vs separate worker) + **leader election** so a single active dispatcher avoids double-publish (§10 Q2).
+- [ ] Lock Service Bus tier (Standard default — it already supports sessions + duplicate detection; Premium only for large messages / VNet) (§10 Q3).
+- [ ] Lock subscription model (per-factory correlation filter default; single-subscription SQL filter if factory churn is high) (§10 Q1).
+- [ ] Confirm the Cosmos poll path stays as a **permanent** flag-gated fallback (`AQ_FLEET_ROUTE=0`) (§10 Q4).
+- [ ] Confirm repo-advertisement source: `repos[]` in the heartbeat, derived from `AQ_FLEET_REPO_BASE` (§10 Q5).
+- [ ] Schema: add `targetFactoryId` to `FleetJobDoc`, `repos[]` to `FleetFactoryDoc`; register a new `fleet_queue_state` doc (`/productId`) for the M0 gate; provision the change-feed **lease container**; update the container registry / `COSMOS_AUTO_INIT`.
+- [ ] RBAC via managed identity: dispatcher = Service Bus **Sender**, factories = **Listener** on their own subscription; no shared keys committed.
+
+### M0 — RU quick win (no new infra) — ✅ DONE
+- [x] Add per-product `fleet_queue_state` doc; bump on create + every stage change (repo layer).
+- [x] Factory loop point-reads the gate each tick; run the claim only when it changed / mid-drain / safety interval.
+- [x] Keep `POLL_SECONDS` for local responsiveness; gate the *claim*, with a periodic safety backstop + fail-open (instead of raising the global poll interval).
+- [x] Flag-gate `AQ_FLEET_GATE=1` (default OFF) with a clean off-switch.
+- [x] Tests: fleet vitest (repo bump + `GET /fleet/queue-state`) + selftest `39b` (gate decisions) green; gate logic verified standalone.
+
+### Routing-model fix (lands with M0/M1)
+- [ ] Add `repo:<name>` capability token; factories advertise local repos via heartbeat (`repos[]`).
+- [ ] Scheduler matches on caps **+ repo**; product becomes a tag, not the routing key.
+- [ ] Fix `tracker-web` New-Job form: drop default `capabilities="build"`, stop hardcoding `mac-1/mac-2`, derive factories/repos from live data.
+- [ ] Add product→repo ownership validation (reject/route mismatches) — the A1 safety net.
+
+### M1 — Broker in shadow
+- [ ] Provision Service Bus `fleet-dispatch` topic + per-factory subscriptions, each with a **correlation filter** `targetFactoryId='<id>'` (managed identity, no keys).
+- [ ] Change-feed dispatcher (leader-elected) tails `fleet_jobs`, runs scheduler, stamps `targetFactoryId` (CAS), publishes targeted messages (`MessageId=jobId`, dup-detection on).
+- [ ] Dispatcher enforces per-product **budget** (paused / ceiling) before publishing (relocates the `claimNextJob` budget check, §6).
+- [ ] Publish in **shadow** alongside the Cosmos claim path; record route divergence (no action taken).
+- [ ] Verify: ≥ N hours shadow with broker-route == scorer-pick within tolerance.
+
+### M2 — Cutover delivery
+- [ ] Factories consume from Service Bus; `/fleet/accept` does the Cosmos CAS claim + returns `leaseEpoch`.
+- [ ] Messages carry `{jobId, productId, repo, caps, priority, targetFactoryId}` only; the consumer **reads the full job body from Cosmos** by `jobId` (256 KB limit, §4.2).
+- [ ] `/fleet/accept` (and `/fleet/claim`) **re-checks the factory token** (roadmap §12; productId + caps + factoryId) before granting the lease.
+- [ ] Implement **complete-on-claim** (reaper + change-feed re-dispatch owns liveness).
+- [ ] Cosmos poll path retained as flag-gated fallback (`AQ_FLEET_ROUTE=0`).
+- [ ] Emit metrics: subscription depth, dispatch lag, 409 claim-conflict rate, DLQ count, change-feed lag — **and wire alerts** (DLQ depth > 0, dispatch lag > threshold) into existing monitoring.
+- [ ] Verify exactly-once + crash recovery on a real multi-host run; DLQ ↔ `failed`/`retries_exhausted` mapping correct.
+
+### Error handling & cleanup (lands with M2) — see §5.5
+- [ ] Add `POST /fleet/fail` so a failed job sets the coordinator stage + **releases the lease immediately** (no expiry wait); wire it into `_finish_failure` / `fleet_quarantine`.
+- [ ] GC sweep (idempotent): delete merged `aq/job/*` branches, prune stale worktrees, sweep `aq/wip/*` after a job reaches a terminal/shipped stage.
+- [ ] Prevent same-repo worktree clobber: Service Bus **sessions keyed by `repo`** + a per-`(host, repo)` local lock.
+- [ ] Verify: failed jobs free their lease promptly; no orphaned worktrees/branches after N jobs; GC never deletes unmerged work or an in-flight worktree.
+
+### M3 — On-demand factories (scale-to-zero)
+- [ ] KEDA / Container Apps scaler on subscription depth; idle ⇒ zero running workers.
+- [ ] Optional warm-pool (1 small instance) if cold-start latency matters.
+- [ ] Verify: zero idle workers + zero idle RU; cold-start latency within target.
+
+### Testing (every phase — tests are sacred)
+- [ ] Unit: dispatcher scheduling + publish, claim CAS + `leaseEpoch` fencing, `/fleet/fail`, GC idempotency, the M0 gate read/skip logic.
+- [ ] Integration: shadow-divergence harness (M1), exactly-once + crash recovery (M2), scale-to-zero behavior (M3).
+- [ ] Extend `agent-queue/selftest.sh` + platform-service `vitest`; **CI green is the gate** to advance each phase.
+
+### Rollback & flags (per phase)
+- [ ] Each phase ships behind a flag with a documented one-line rollback: M0 `AQ_FLEET_GATE`, M1 shadow (publishes but never acts), M2 `AQ_FLEET_ROUTE` / broker-source toggle, M3 scaler disable.
+- [ ] Verify each rollback returns to the prior working path with **no data loss** and no stranded leases/messages.
+
+### Docs to update on completion
+- [ ] `GIGAFACTORY_ROADMAP.md` — tick Phase 4; correct the stale §0 progress table.
+- [ ] `GIGAFACTORY_SYSTEM_OVERVIEW.md` — add the broker/dispatcher to the architecture + code map.
+- [ ] common_plat `docs/GIGAFACTORY/` — mirror the backend/dispatcher changes.
--- a/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md
+++ b/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md
@ -0,0 +1,684 @@
+# Agent Gigafactory — Vision & Implementation Roadmap
+
+> **One-liner:** Evolve today's single-host `agent-queue` bash runner into a distributed **gigafactory** — a fleet of heterogeneous machines (Mac/Ubuntu/Windows), each running different coding-agent CLIs (Devin/Codex/Claude/Copilot/…), where a scheduler **auto-picks jobs from a shared inbox and routes each `.md` to the best factory × tool × profile** — built service-side on `platform-service` + `tracker-web`, with the bash runner surviving as the offline edge agent.
+
+> **How to use this doc:** It is both a PRD and an execution checklist. Every feature is a `- [ ]` checkbox with **acceptance criteria** and a **verify gate**. A phase is "100% done" only when every box is checked, its gate passes, and the phase **Definition of Done** rubric (§16) is green. Update the progress table (§0) as you go.
+
+---
+
+## 0. Progress tracker
+
+| Phase | Theme | Status | % | Gate |
+| ----- | ----- | ------ | - | ---- |
+| **0** | Baseline (today) | ✅ shipped | 100% | `selftest.sh` green |
+| **1** | Manifest + profiles + capabilities + tracker adapter (single host) | ✅ done | ~98% | adapter e2e + selftest |
+| **2** | Coordinator as platform-service module + Cosmos + multi-factory leasing | ✅ done | ~98% | fleet e2e + module tests |
+| **3** | Fleet control plane in tracker-web + DAG deps + budgets + scoring router | ✅ done | 100% | web e2e + router tests |
+| **4** | Message bus + autoscaling + cross-OS capability marketplace | ◐ in progress | ~10% | load/chaos suite — **M0 RU gate shipped** (`fleet_queue_state` + `GET /fleet/queue-state` + `AQ_FLEET_GATE`); broker/M1+ per `FLEET_DISPATCH_REDESIGN.md` |
+| **5** | Self-optimizing / learned routing | ☐ not started | 0% | offline eval + A/B |
+
+Legend: ☐ not started · ◐ in progress · ✅ done. Keep per-phase checklists below as the source of truth; this table is the summary. **Owners per phase: §23 · rollout/rollback: §21 · capacity & SLOs: §22/§19.** For the full current-state architecture, diagrams, code map, next steps and known gaps see **`GIGAFACTORY_SYSTEM_OVERVIEW.md`** (companion doc).
+
+---
+
+## 1. Vision & metaphor
+
+A **gigafactory** turns raw intent (`.md` task files / tracker items) into shipped software with minimal human touch. The mental model is a physical factory network:
+
+| Term | Meaning |
+| ---- | ------- |
+| **Fleet** | The whole network of machines under one control plane. |
+| **Factory** | One physical/virtual machine (a Mac, an Ubuntu box, a Windows host). Has an OS, installed tools, creds, capacity. |
+| **Station** | A tool/engine slot inside a factory (a Devin seat, a Codex CLI, a Claude Code session, a Copilot agent). |
+| **Worker** | A single running agent process executing one job at a station. |
+| **Job** | A unit of work: a prompt/`.md` + manifest (profile, scope, gates, budget). |
+| **Profile** | The *role* doing the work (developer, backend engineer, UX/UI designer, QA, reviewer) = persona prompt **+** capability requirements. |
+| **Capability** | A tag a factory advertises and a job requires (`os:mac`, `has:xcode`, `has:figma`, `gpu`, `engine:devin`). |
+| **Lease** | A time-boxed claim of a job by a worker; expires → job is reclaimable (crash recovery). |
+| **Gate** | A checkpoint a job must pass: auto-QA `verify`, human review, ship approval. |
+| **Artifact** | Any captured output: commits/PRs, logs, screenshots, reports, build outputs. |
+
+**North star:** drop work into one inbox (or file a tracker task), and the fleet figures out *where* (factory), *with what* (tool/engine), *as whom* (profile), runs it in parallel, self-heals on crash, gates quality automatically, and surfaces everything in one live control plane — while a human only approves the final ship.
+
+```
+                         ┌──────────────────────── CONTROL PLANE (tracker-web) ────────────────────────┐
+                         │  plan/intake · roadmap · Fleet map · live logs · cost · approvals           │
+                         └───────────────▲───────────────────────────────────┬─────────────────────────┘
+                                         │ REST/SSE                           │
+            ┌────────────────────────────┴─────── COORDINATOR (platform-service module) ───────────────┐
+            │  queue · scheduler/router · leases · profiles · capabilities · events · budgets (Cosmos)  │
+            └───▲───────────────────────▲───────────────────────▲───────────────────────▲───────────────┘
+                │ claim/lease/report     │                       │                       │
+        ┌───────┴───────┐       ┌────────┴───────┐       ┌────────┴───────┐       ┌───────┴────────┐
+        │  FACTORY: mac │       │ FACTORY: ubuntu│       │FACTORY: windows│       │ FACTORY: mac-2 │
+        │ devin, claude │       │ codex, claude  │       │ copilot, codex │       │ devin (xcode)  │
+        │ [agent-queue] │       │ [agent-queue]  │       │ [agent-queue]  │       │ [agent-queue]  │
+        └───────────────┘       └────────────────┘       └────────────────┘       └────────────────┘
+```
+
+---
+
+## 2. Current state (Phase 0 baseline — already shipped)
+
+Today's `agent-queue.sh` + `dashboard.mjs` (single host, zero-dep bash + Node):
+
+- **Folder kanban lifecycle:** `inbox → building → review → testing → shipped` (+ `failed`).
+- **Auto-QA gate:** agent rc=0 → `review/`; optional `verify:` runs in `cwd` → pass `testing/`, fail `failed/`; no verify → parks in `review/`. Manual `ship` = the human gate.
+- **Per-job frontmatter:** `engine` (devin/claude/codex), `cwd`, `yolo` (→ dangerous/auto-approve), `lock` (per-repo serialization), `timeout`, `verify`.
+- **Concurrency:** `AGENT_QUEUE_MAX` (default 3), per-`lock` serialization so same-repo jobs never collide.
+- **State & logs:** `.state/<job>.meta` heartbeats + `logs/<job>.log`; git-tracked queue (audit-by-commit).
+- **Interactive dashboard:** numbered selectable job list, single-key actions (promote/ship/reject/requeue), live log viewer, run/stop, all shelling out to `agent-queue.sh`.
+
+**Carries forward:** the `.md`-in-`inbox` UX, frontmatter contract, lifecycle stage names, `verify` gate, lock/affinity concept, the bash runner itself (becomes the factory agent).
+**Must change for the fleet:** single-host run loop → distributed leasing; file-only state → service + Cosmos; one engine choice → capability/profile routing; local dashboard → shared control plane.
+
+- [x] Phase 0 complete — baseline shipped and self-tested. *(reference, not a work item)*
+
+---
+
+## 3. Goals & non-goals
+
+**Goals**
+- One intake, many machines: parallel execution across heterogeneous OS/tools.
+- Automatic routing to the best `factory × tool × profile` with affinity, fairness, budget, and health awareness.
+- Self-healing (lease expiry/requeue), quality gates, and full observability.
+- Reuse the ByteLyst stack (`platform-service`, Cosmos, `@bytelyst/*`, tracker-web) — no parallel infra.
+- Preserve offline/zero-dep edge operation via the bash runner.
+
+**Non-goals**
+- Not a CI/CD replacement (it *triggers* CI; CI still gates merges).
+- Not a general-purpose workflow engine (scoped to coding-agent execution).
+- Not a model/inference host (it orchestrates agent CLIs, doesn't serve models).
+- Not abandoning the simple `.md` mental model — humans still drop files / file tasks.
+
+---
+
+## 4. Core concepts contract (must hold across all phases)
+
+- [ ] Every job has a stable **id**, an immutable **manifest**, and an append-only **event log**.
+- [ ] Every Cosmos document carries `productId` (ByteLyst rule).
+- [x] A job in flight is always covered by exactly one **lease**; no live lease → reclaimable.
+- [x] **Atomic claim:** a job is assigned to exactly one worker via optimistic concurrency (Cosmos `_etag`/`If-Match` or a conditional `fleet_leases` insert keyed by `jobId`). Concurrent claimers — exactly one wins; losers retry the next candidate.
+- [x] **Fencing token:** every lease carries a monotonic `leaseEpoch`. Every report/commit/ship carries its epoch; the coordinator **rejects writes from a stale epoch**, so a partitioned or zombie worker cannot corrupt state after its lease was reclaimed.
+- [ ] **Coordinator-authoritative time:** all lease/TTL/SLA math uses server timestamps, never factory clocks (clock-skew safety).
+- [ ] Lifecycle stages are canonical and shared: `queued → assigned → building → review → testing → shipped` (+ `blocked`, `failed`, `dead_letter`).
+- [ ] The bash runner and the service speak the **same manifest + event vocabulary** (one schema, two transports).
+
+> **Implementation status (2026-05-29) — Phase 2 Foundation merged** (common-plat PR #28, `platform-service/src/modules/fleet/`): all 7 `fleet_*` containers (§13) ✓; repositories + coordinator (claim/lease/fence/heartbeat/reaper) ✓; idempotency + deps + submit-time cycle detection ✓; 50 module tests green.
+> **✓ P0 hardening landed (2026-05-29, common-plat PR #29) — atomic claim is now truly concurrency-safe.** Added `updateIfMatch` to `@bytelyst/datastore`: Cosmos conditions the replace on `_etag` via `accessCondition {type:'IfMatch'}` (412 → conflict) plus a rev compare for the pre-read window; the Memory provider does `get→compare→set` in one synchronous block (no `await` between), so concurrent callers cannot interleave. `fleet` `revUpdate*` now write conditionally. Proven by `Promise.all` 2-contender + N-claimer stress + concurrent `claimNextJob`/lease-renew tests (these **fail** on the old read-check-write, pass now). datastore 48 + fleet 53 green; full workspace build/test clean; no consumer regressed. **P2-S3 (factory integration) is now unblocked.**
+
+---
+
+## 5. The evolved Job manifest (feature)
+
+Extend today's frontmatter into a richer, **backward-compatible** manifest. Old `.md` files keep working (new fields optional with sane defaults).
+
+```yaml
+---
+# --- existing (unchanged) ---
+engine: devin            # explicit engine; overrides profile/engine-class
+cwd: /abs/path/repo
+yolo: true
+lock: my-repo
+timeout: 45m
+verify: pnpm -s test
+# --- new ---
+profile: backend-engineer        # role: persona + capability requirements
+engine-class: agentic-coder      # abstract; scheduler picks a concrete engine if `engine` unset
+capabilities: [os:any, node>=20] # hard requirements a factory MUST satisfy
+prefers: [factory:mac-2]         # soft routing hints (affinity)
+priority: high                   # critical|high|medium|low → SLA + preemption
+budget: { usd: 5, tokens: 2M, wall: 4h }   # wall = HARD ceiling (always enforceable). usd/tokens = best-effort
+                                           # caps: enforced only where the engine/provider exposes live metering;
+                                           # otherwise estimated from provider usage APIs post-hoc + alerted.
+deps: [job-123, job-456]         # DAG: don't start until these reach `shipped` (or `testing` with `deps-mode: soft`)
+idempotency-key: nomgap-ux-2     # dedupe: a second identical submit is a no-op
+retry: { max: 2, backoff: 5m, on: [timeout, verify_failed] }
+review-policy: manual            # auto|manual|reviewers:[@alice]
+artifacts: [coverage, screenshots]   # what to capture beyond commits
+tracker-item: ITEM-789           # link back to the originating tracker task
+---
+```
+
+- [ ] Define the manifest schema (Zod in the service; documented YAML for `.md`).
+- [x] Backward-compat: a Phase-0 `.md` (only `engine/cwd/yolo`) parses with all new fields defaulted. *(P1-S1: bash runner; Zod schema still P2. selftest backward-compat case green.)*
+- [x] **Capability grammar** defined: tokens are `key` (presence, e.g. `has:xcode`), `key:value` (e.g. `os:mac`, `engine:devin`), or `key<op>version` with `op ∈ {>=,>,=,<=,<}` (e.g. `node>=20`). `os:any` is a wildcard that matches every factory. A job matches a factory iff every required token is satisfied by the factory descriptor. *(P1-S1: `caps_match`/`detect_capabilities` in `agent-queue.sh`.)*
+- [x] **`engine-class` taxonomy** defined as an enum (`agentic-coder`, `chat-coder`, `review-only`) with a documented engine→class map (`devin,claude,codex → agentic-coder`; `copilot → chat-coder`). If `engine` is set it wins; else the scheduler picks any free engine in the class honoring `prefers-engine`. *(P1-S1: `resolve_engine`; `review-only` mapping reserved.)*
+- [x] **`idempotency-key` semantics:** `key + content-hash` identical ⇒ no-op (returns existing job). Same `key`, **different** content ⇒ **rejected with 409** unless the prior job is still `queued`/`blocked` (then it is superseded). A re-`run`/`retry` of an existing job is **not** a new submit and never trips dedupe. *(P1-S1: add-time dedupe; bash maps "409" → clear error, `queued` → still in `inbox/` ⇒ superseded.)*
+- [x] **`deps` semantics:** a dep is satisfied when it reaches `shipped` (default) or `testing` if `deps-mode: soft`. Submit-time **cycle detection** rejects cyclic graphs; unmet deps put the job in `blocked` (not `queued`). Cross-factory deps require the coordinator (P2); single-host deps work in P1. *(P1-S2: `deps_unmet` skip-with-reason in selection + `status` surfacing; `deps_would_cycle` on `add`. Cross-machine deps remain P2.)*
+- **Acceptance:** a manifest fixture suite parses/validates; invalid manifests fail with precise errors; capability-grammar + dep-cycle + idempotency-conflict cases covered.
+- **Verify gate:** schema unit tests (≥ 1 per field incl. defaults + 5 invalid cases + grammar/cycle/409 cases).
+
+---
+
+## 6. Profiles — persona + capability (feature)
+
+A **profile** = a versioned file combining a persona (system-prompt overlay), required capabilities, default gates, preferred engine/model, and allowed repo scopes. Stored as `profiles/<name>.md` (Phase 1) → Cosmos `fleet_profiles` container (Phase 2, §13).
+
+```yaml
+# profiles/backend-engineer.md
+---
+name: backend-engineer
+persona: |
+  You are a senior backend engineer. Favor minimal, well-tested changes...
+capabilities: [node>=20, has:pnpm]
+default-verify: pnpm -s typecheck && pnpm -s test
+engine-class: agentic-coder
+prefers-engine: [devin, claude]
+allowed-scope: ["backend/**", "packages/**"]   # blast-radius guardrail
+review-policy: manual
+---
+```
+
+- [x] Author starter catalog: `developer`, `backend-engineer`, `frontend-engineer`, `ux-designer`, `ui-designer`, `qa`, `reviewer`, `docs-writer`. *(P1-S2: `profiles/*.md` + a reserved `planner`.)*
+- [x] Persona overlay is **prepended** to the job body before the agent runs; secrets are never written to logs or the event stream (redaction at the source). *(P1-S2: `profile_persona` prepended to the stripped body file.)*
+- [x] Profile supplies default `verify`, `capabilities`, `engine-class`, `allowed-scope` when the job omits them. *(P1-S2: `fm_eff` — also `prefers-engine` + `review-policy`; job fields always override.)*
+- [ ] Profile versioning: changing a profile doesn't mutate in-flight jobs (snapshot at assign time). *(P2 — needs Cosmos snapshot at assign time.)*
+- [x] `allowed-scope` enforced as a guardrail (warn in P1, enforce/deny in P2 via pre-flight diff check). *(P1-S2: `scope_check` post-run WARN-only + `scope_warning=` in meta; `path_in_scope` unit-testable.)*
+- **Acceptance:** a job with `profile: backend-engineer` and no `verify` inherits the profile's verify + persona.
+- **Verify gate:** profile-resolution unit tests; persona-injection golden test.
+
+---
+
+## 7. The scheduler / router (the heart) (feature)
+
+Given a `queued` job and the current fleet, choose `(factory, station/engine, profile)` and issue a lease.
+
+**Inputs:** job manifest (capabilities, priority, budget, deps, prefers, lock), profile requirements, live factory descriptors (capabilities, load, health, cost class), lock/affinity table, fairness counters.
+
+**Algorithm (deterministic, explainable):**
+1. **Filter** factories by **hard capability match** (job ∪ profile capabilities ⊆ factory capabilities) and free station for a compatible engine.
+2. **Block** if `deps` unmet or `lock` already held → leave `queued`/`blocked`.
+3. **Score** each candidate factory:
+   `score = w1·capabilityFit + w2·affinity(prefers, repo-stickiness) + w3·(1/load) + w4·costFit(budget) + w5·health − w6·starvationPenalty`
+4. **Tie-break:** highest priority job first; then oldest; then lowest cost class.
+5. **Assign atomically** → create the lease under an optimistic-concurrency guard (`_etag`/`If-Match` or conditional insert keyed by `jobId`) **with a fresh `leaseEpoch`**; on conflict another factory won → retry the next candidate. Set job `assigned`, decrement station/seat capacity, bump fairness counter. Use **coordinator-authoritative timestamps** only.
+6. **Preemption (P3+):** a `critical` job may pause a `low` job at a needed station (checkpoint + requeue, bumping the preempted job's `leaseEpoch`).
+
+> **Phasing:** Phase 2 ships the deterministic **filter + atomic-assign core** (fixed weights). Phase 3 adds **tunable weights, preemption, and the explainability UI**. Phase 5 learns the weights (§14).
+
+- [ ] Implement pure, unit-testable scoring function (no I/O) with configurable weights.
+- [ ] Hard-filter correctness: never assign a job to a factory missing a required capability.
+- [ ] Affinity/stickiness: same-repo jobs prefer the factory that has the warm checkout (lock-aware).
+- [ ] Fairness: no factory or product starves under sustained load (counter + penalty).
+- [ ] Explainability: every assignment records *why* (matched caps, score breakdown) in the event log.
+- [ ] Determinism: same inputs → same decision (seeded tie-breaks) for testability.
+- [ ] Define **factory health** ∈ [0,1] = f(heartbeat freshness, recent run failure-rate, resource pressure); factories below a health floor are **filtered out**, not merely down-weighted.
+- [ ] **Station/seat capacity:** a factory's free stations = `min(host slots, per-engine seat limits)` (e.g. licensed Devin/Claude seats); the scheduler never over-subscribes a seat-limited engine.
+- [ ] **Distributed lock:** the Phase-0 local `lock` becomes a **coordinator-held lock** so same-`lock` jobs serialize across the whole fleet (prevents two factories pushing the same repo concurrently).
+- **Acceptance:** scenario fixtures (10+) produce expected assignments incl. starvation, capability-miss, seat-exhaustion, unhealthy-factory exclusion, and budget-exceed; a concurrent-claim race test proves exactly one winner.
+- **Verify gate:** router unit suite ≥ 95% branch coverage on the scoring/filter core; atomic-claim race test.
+
+---
+
+## 8. Factory model & registration (feature)
+
+Each machine runs a **factory agent** (the evolved `agent-queue` runner) that registers, heartbeats, claims jobs, and reports events.
+
+- [ ] **Capability auto-detection** at boot: OS, installed engines (devin/claude/codex/copilot), tool probes (xcode, figma-cli, docker, gpu), node/pnpm versions, available creds (presence only, never values).
+- [ ] **Enrollment / bootstrap trust**: first registration authenticates with a one-time enrollment secret (or an operator-issued platform JWT). The factory then receives a **scoped, rotatable factory token** (`jose` JWT); decommission = revoke. No standing shared secret in the queue.
+- [ ] **Registration**: `POST /fleet/factories/enroll` with descriptor → receives a factory id + the scoped, rotatable factory token (built as: registration == first heartbeat; enroll mints the scoped token).
+- [ ] **Heartbeat**: periodic `POST /fleet/factories/heartbeat` (load, free stations, health). A **coordinator lease reaper** (not Cosmos TTL) sweeps `expiresAt < now` and reclaims, **bumping `leaseEpoch`** so the dead/zombie worker is fenced; a factory missing N heartbeats is marked `offline` and all its leases reclaimed. **Cadence must be < the 90s stale threshold** (`AQ_FLEET_LEASE_RENEW_SEC`; fleet launcher uses 30s).
+- [ ] **Claim loop**: `POST /fleet/claim` advertising capabilities/free stations; atomic (exactly one winner, §4); receives a job + lease TTL + `leaseEpoch`. Use **claim backoff / long-poll** to bound Cosmos RU under many idle factories (see §22); **Phase-4 M0 adds the `AQ_FLEET_GATE` skip** (`GET /fleet/queue-state`), and broker push replaces polling in M1+.
+
+> The endpoint paths above are the **as-built** API (`/fleet/factories/enroll`,
+> `/fleet/factories/heartbeat`, `/fleet/claim`) — see `GIGAFACTORY_SYSTEM_OVERVIEW.md`
+> §9 and the fleet module README for the authoritative list.
+- [ ] **Report**: stream stage/log/event back (`POST /fleet/runs/:id/events`), **echoing `leaseEpoch`** (stale epoch → 409, worker self-aborts); renew lease while alive.
+- [ ] **Environment prep**: before `verify`, the factory ensures deps are installed (cold checkout → `pnpm install`); prep time counts against `budget.wall`.
+- [ ] **Graceful drain**: factory can stop claiming, finish in-flight, deregister.
+- **Acceptance:** a factory enrolls, claims a matching job, heartbeats, completes; a killed factory's job is reclaimed by another within the lease TTL and the killed worker's late report is **rejected by fencing**.
+- **Verify gate:** factory-agent integration test against a mock coordinator; crash-recovery + fencing-rejection test.
+
+---
+
+## 9. Coordination architecture (decision + path)
+
+Three transports were evaluated. **Decision: platform-service-native coordinator is the spine; git-queue stays for the offline edge; broker added only at scale.**
+
+| Option | Pros | Cons | Verdict |
+| ------ | ---- | ---- | ------- |
+| (a) **Git-synced queue** (evolve folders) | zero infra, audit-by-commit, offline | weak/racy leasing, latency, merge churn | **Edge/offline only** |
+| (b) **Coordinator service** (platform-service module) | real leases, fairness, observability, reuses auth/Cosmos/productId | a service to run | **Chosen spine (P2)** |
+| (c) **Message broker** (NATS/Redis/SQS) | scale, backpressure, push dispatch | most moving parts/ops | **P4 when throughput demands** |
+
+- [ ] Document the decision + rationale in-repo (this section is the canonical record).
+- [ ] Define the **claim/lease protocol** once; both git-queue (poll) and service (API) implement it.
+- [ ] **Split-brain / network-partition safety:** a partitioned factory may keep running and even `git push`. `idempotency-key` dedupes *submits* but cannot undo *side-effects*. Mitigation: **fencing** — the coordinator rejects `ship`/merge reports from a stale `leaseEpoch`, and the distributed `lock` (§7) prevents a reclaimed-job's twin from pushing the same repo. Residual risk (a stale push to a feature branch) is contained by the PR-merge ship gate (§10) and surfaced for human triage.
+- [ ] **Offline-degrade**: a factory cut off from the coordinator falls back to its local git-queue and reconciles on reconnect; on reconnect it presents its `leaseEpoch` — if reclaimed, its results are quarantined, not auto-merged.
+- [ ] **Poll cost**: bound claim-loop RU via long-poll/backoff (§22); migrate to broker push at P4.
+- **Acceptance:** the same job manifest runs identically through the bash/git path and the service path; a simulated partition does not double-merge (fencing test).
+- **Verify gate:** contract test asserting protocol parity (git vs service) + partition/fencing test.
+
+---
+
+## 10. tracker-web / platform-service integration (committed path)
+
+**Layering:** tracker = *WHAT/WHY* (plan, intake, prioritize, roadmap, votes) · gigafactory = *HOW* (execute) · platform-service = shared brain · agent-queue runner = offline edge. Grounded in the real `tracker-service` model (`Item`: `type` bug/feature/**task**, `status` open/in_progress/done/closed/wont_fix, priority, labels, assignee, `source` incl. **auto_detected**, votes, comments, public roadmap) and the `tracker-web` `/api/tracker/[...path]` proxy pattern.
+
+### Phase 1 — Adapter (no new infra)
+- [x] **task → job**: a tracker `Item` of `type: task` (e.g. `assignee: @agent` or label `agent:run`) is exported to a job `.md` (manifest mapped: title/description → body, priority → priority, labels → capabilities/profile hints). *(P1-S4: `aq from-tracker`; labels `engine-class:`/`profile:`/`priority:`/`cap:` → frontmatter.)*
+- [x] **job → tracker**: lifecycle events post back as **status updates + comments** — `building` → status `in_progress` + comment "started on factory X"; `shipped` → `done` + comment with commit SHAs / PR link / verify results; `failed` → comment with reason (status stays `in_progress` for human triage). *(P1-S4: `aq to-tracker` PATCHes status + posts a metrics-only comment; one-way echo §24.5; never fatal. The items API has no blocked/failed status, so failures map to `wont_fix` by default — override via `AQ_TRACKER_STATUS_FAILED`.)*
+- [x] Idempotency: re-running the adapter for the same item doesn't create duplicate jobs (idempotency-key = item id + content hash). *(P1-S4: derived `idempotency-key: tracker-<id>` reuses Slice 1 dedupe; `to-tracker` is idempotent via `tracker_echoed`.)*
+- [x] Adapter is a thin script/CLI (`aq from-tracker ITEM-789`) + optional poller. *(P1-S4: `from-tracker`/`to-tracker` + opt-in `AQ_TRACKER_AUTO` auto-echo; a standalone poller is deferred.)*
+- **Acceptance:** filing a tracker task, marking it `agent:run`, results in a queued job; on ship, the item flips to `done` with a SHA comment.
+- **Verify gate:** adapter e2e against a tracker-service test instance (or mock); round-trip assertion.
+
+**Stage → tracker status mapping** (tracker's enum is coarser than the fleet's; keep fine-grained stage in a label + comment so no detail is lost):
+
+| Fleet stage | Tracker `status` | Extra |
+| ----------- | ---------------- | ----- |
+| `queued` / `assigned` / `blocked` | `in_progress` | label `fleet:<stage>` |
+| `building` / `review` / `testing` | `in_progress` | label `fleet:<stage>` + progress comment |
+| `shipped` | `done` | comment with SHA(s)/PR link/verify result |
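+| `failed` / `dead_letter` | `in_progress` + label `needs-triage` | never auto-`closed`/`wont_fix` (humans decide) — target behavior; the P1 `aq to-tracker` adapter currently defaults failures to `wont_fix` (override via `AQ_TRACKER_STATUS_FAILED`) |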
+
+**Ship semantics (PR flow):** `shipped` = change **merged to target branch with CI green** (default), OR `pr-opened` when `review-policy` defers merge to humans/CI — configurable per profile. This honors the non-goal that CI still gates merges (§3); the agent never bypasses branch protection.
+
+### Phase 2 — Native spine
+- [ ] Stand up a `fleet` (a.k.a. `orchestrator`) module **inside platform-service**, sibling to `tracker-service`: pattern `types.ts → repository.ts → routes.ts`, ESM, Cosmos, `productId`, `req.log`.
+- [ ] Endpoints: jobs CRUD, claim/lease, events/report, factories register/heartbeat, profiles, stats.
+- [ ] Runners (bash + any) become API clients of this module; tracker adapter calls it directly.
+- **Acceptance:** a job submitted via the module is claimed by a real factory and shipped, with all state in Cosmos.
+- **Verify gate:** module test suite (repository + routes) using the shared `@bytelyst/testing` inject helpers.
+
+### Phase 3 — Unified control plane
+- [ ] Add a **Fleet** surface to `tracker-web` reusing auth/Primitives/DataTable/product switcher: fleet map (factories + load/health), job table, job DAG, **live log streaming**, lease/heartbeat status, cost burndown, approve/ship buttons.
+- [ ] **Streaming caveat (correctness):** live logs **must not** use the existing buffering catch-all proxy `/api/tracker/[...path]` — it does `res.text()` and would never stream. Use a **dedicated Next.js Route Handler returning a `ReadableStream` (SSE)** or a direct SSE/WebSocket to platform-service. Full logs are shipped to blob storage (§17); the endpoint serves stored tail + live append.
+- [x] The Node TUI dashboard becomes a thin client of the same `/fleet` API (parity with web). *(devops-tools `agent-queue/dashboard.mjs` + `lib/fleet-dash.mjs`, `AQ_FLEET_DASH=1`.)*
+- **Acceptance:** an operator can watch all factories + tail any job log + ship from the browser.
+- **Verify gate:** web e2e (Playwright) covering fleet map render, live log, and a ship action.
+
+---
+
+## 11. Lifecycle & gates at scale (feature)
+
+- [ ] Canonical stages enforced server-side: `queued → assigned → building → review → testing → shipped` (+ `blocked`, `failed`, `dead_letter`); transitions validated (illegal transition → 409).
+- [ ] Per-profile default `verify`; per-job override; verify runs at the factory, result reported as an event.
+- [ ] Human gates: `review-policy` routes to reviewers; multi-reviewer support (P3).
+- [x] **Dead-letter**: after `retry.max` exhausted, job → `dead_letter` with full diagnostics; never silently dropped. *(P1-S3 single-host stand-in: `failed/` `result=retries_exhausted`, WIP branch + full log preserved.)*
+- [ ] **Backpressure**: when no factory can take more, jobs stay `queued` (no thrash); SLA timers visible.
+- [ ] **Ship semantics** are profile-configurable (merged+green vs `pr-opened`, §10); `shipped` is terminal-success, `dead_letter` terminal-failure; `blocked` (unmet deps) is distinct from `queued`.
+- [x] **Retry vs idempotency**: a retry creates a new `fleet_runs` attempt under the same job/`idempotency-key` (never a duplicate job); backoff honored; `retry.on` filters which failure classes retry. *(P1-S3 single-host: `attempts` counter survives requeue; `backoff`→`next_eligible` gates selection; `on` filters timeout/verify_failed/crash.)*
+- **Acceptance:** a perpetually-failing job lands in `dead_letter` after configured retries; a passing one auto-advances to `testing` then waits for human `ship`; an illegal transition is rejected.
+- **Verify gate:** lifecycle state-machine unit tests (all transitions + illegal-transition rejection + retry/dead-letter path).
+
+---
+
+## 12. Security, safety & governance (feature — critical with `yolo`/dangerous)
+
+- [ ] **Secret isolation**: creds live on each factory (env/keychain), **never** in the queue, manifest, logs, or Cosmos. Factory advertises *presence* of a cred capability, not the value.
+- [ ] **Scoped git tokens** per factory/repo; least-privilege; rotation documented.
+- [ ] **Push policy**: protected branches; agents push to feature branches + open PRs by default; direct-to-main gated by profile/flag.
+- [ ] **Blast-radius guardrail**: enforce `allowed-scope` — pre-flight + post-run diff check; out-of-scope changes block the ship gate.
+- [ ] **Budget kill-switch**: exceed `budget` (usd/tokens/wall) → pause worker, alert, require human resume.
+- [ ] **Supply-chain safety**: edits to shared `@bytelyst/*` packages require `reviewer` profile + human gate (never auto-ship).
+- [ ] **Audit trail**: append-only event log per job (who/what/when/where/cost); immutable.
+- [ ] **Corp network/proxy**: honor `NETWORK`/proxy + truststore conventions on factories that need them.
+- [ ] **Kill switch (global)**: one command/flag halts all claiming fleet-wide (incident response).
+- **Acceptance:** a job attempting an out-of-scope edit is blocked at the gate; a budget overrun pauses and alerts; no secret ever appears in any persisted artifact (scanner test).
+- **Verify gate:** security test suite incl. a secret-leak scanner over logs/meta + scope-enforcement test.
+
+---
+
+## 13. Data model (Cosmos containers, P2+)
+
+Each container partitioned sensibly; every doc has `productId`.
+
+- [x] `fleet_jobs` (pk `/productId`) — manifest snapshot **+ the full instruction body verbatim as markdown (`bodyMd`)**, current stage, idempotency-key, tracker-item link, `checkpoint` pointer (WIP branch/commit). This is the **durable source of truth for instructions** — a factory holds only a transient materialized copy, so a machine going down loses nothing (§25).
+- [x] `fleet_runs` (pk `/jobId`) — one per execution attempt: factory, engine, profile snapshot, timings, exit, verify result, **and execution insights: model, tokensIn/Out (+cached), cost (`estimated` flag), turns, tool-call counts, filesChanged, linesAdded/Deleted, attempt number** (§26).
+- [x] `fleet_leases` (pk `/jobId`) — holder factory, `expiresAt`, **`leaseEpoch` (fencing)**, renewals. **Reclaim via a coordinator reaper** that scans `expiresAt < now` — Cosmos TTL only garbage-collects stale rows, it **cannot trigger reclaim logic**. Claim guarded by `_etag`/`If-Match`.
+- [x] `fleet_factories` (pk `/productId`) — descriptor, capabilities, health, load, last heartbeat, seat limits.
+- [x] `fleet_profiles` (pk `/productId`) — versioned profile snapshots (immutable per version).
+- [x] `fleet_events` (pk `/jobId`) — append-only audit/event stream (stage changes, log pointers, cost ticks, scheduler decisions).
+- [ ] `fleet_artifacts` (pk `/jobId`) — pointers to **blob-stored** logs + artifacts (coverage, screenshots, build output). Large logs live in `@bytelyst/blob`, **never** inline in Cosmos (doc-size + RU limits).
+- [ ] Relate to existing tracker `Item` via `tracker-item` (no duplication of planning data).
+- [x] **Optimistic concurrency** (`_etag`) on every job stage transition + lease claim to prevent lost updates / double-assignment. *(PR #29: `updateIfMatch`.)*
+- [ ] **Indexing/RU**: the claim query is hot — index `stage`, `priority`, `capabilities`; avoid cross-partition fan-out; provision RU/s per §22.
+- **Acceptance:** repository CRUD + query tests per container; **atomic-claim race test (N concurrent claimers → exactly one wins)**; reaper-reclaim + fencing-rejection test; lease-expiry verified via reaper (not TTL).
+- **Verify gate:** repository unit/integration tests (memory + Cosmos provider via `DB_PROVIDER`).
+
+---
+
+## 14. Phased build roadmap (checklists)
+
+Each phase: **Goal → checklist → Exit criteria**. Don't start a phase until the prior phase's Exit criteria are green. Tick boxes here — this checklist is the canonical progress record.
+
+### Phase 1 — Manifest + profiles + capabilities + tracker adapter (single host)
+**Goal:** a richer single-host runner that understands profiles/capabilities and bridges to the tracker — no distributed infra yet.
+
+> **Slice progress — P1-S1:** manifest parsing (all §5 fields, defaulted + backward-compatible), `priority` ordering, capability detection+match gate, `engine-class` resolution, and `idempotency-key` dedupe are **done** on the bash runner.
+>
+> **Slice progress — P1-S3 (resilience & insights, single host):** crash recovery (`recover_orphans` + `aq recover`), git WIP checkpoint/resume (`aq/wip/<job>`), functional `retry` policy (backoff + `retries_exhausted`), and execution insights (`parse_usage`, per-run metrics in meta, `aq insights`, `status`/`dash` insights) are **done** — see §11/§25/§26.
+>
+> **Slice progress — P1-S2 (profiles + deps/DAG, single host):** the `profiles/` catalog + resolution (`fm_eff` inheritance with job>profile>default precedence, persona injection), the warn-only `allowed-scope` guardrail (`scope_check`/`path_in_scope`), and single-host `deps` (block-with-reason in selection, `status` surfacing, submit-time cycle detection) are **done** — see §5/§6.
+>
+> **Slice progress — P1-S4 (tracker adapter, single host):** the task ↔ job round-trip is **done** (§10) — `aq from-tracker` materializes a job from a tracker Item (idempotent on `tracker-<id>`, label→manifest mapping), `aq to-tracker` echoes status + a metrics-only comment one-way (idempotent via `tracker_echoed`, never fatal), and opt-in `AQ_TRACKER_AUTO` auto-echoes on transitions. All HTTP is curl-only through one wrapper (test seam `AQ_TRACKER_API_CMD`). **This closes the Phase-1 §14 tracker-adapter item.** Remaining P1 extras: Node-`dash` surfacing of the new fields. *(`budget.wall` is now enforced — see §11 and the `retry`/`budget.wall` checklist item below.)*
+
+- [x] Extend `agent-queue.sh` frontmatter parsing for all new manifest fields (§5), defaulted + backward-compatible. *(P1-S1)*
+- [x] Add `profiles/` directory + profile resolution (persona injection, default verify/caps/scope) (§6). *(P1-S2)*
+- [x] Local capability detection + a job/factory capability match check before launch (§8 subset). *(P1-S1: `detect_capabilities` + `caps_match`; mismatch ⇒ `failed/` `result=capability_mismatch`, agent never launched.)*
+- [x] `priority` ordering in the inbox pick (replace pure FIFO with priority-then-age). *(P1-S1: `inbox_sorted`; per-lock serialization preserved.)*
+- [x] `deps` (DAG) blocking on a single host; `idempotency-key` dedupe on `add`. *(P1-S1 idempotency dedupe + P1-S2 `deps` blocking/cycle detection.)*
+- [x] `retry` with backoff into `failed`/requeue; `budget.wall` enforced (extends `timeout`). *(P1-S3: `retry` with backoff + `retries_exhausted` DONE. `budget.wall` DONE: parsed from `budget: { wall: <dur> }`, armed as a HARD wall-clock ceiling alongside `timeout` (whichever fires first binds), expiry → `failed` result=`budget_exceeded`, non-retryable by default.)*
+- [x] `allowed-scope` guardrail (warn-only this phase) + post-run diff report. *(P1-S2: `scope_check` WARN-only + `scope_warning=`.)*
+- [x] **Tracker adapter** `aq from-tracker <ITEM>` + `aq to-tracker` event poster (§10 P1). *(P1-S4: curl-only `tracker_api`; from-tracker materializes a job (idempotent), to-tracker echoes status+metrics one-way; opt-in `AQ_TRACKER_AUTO`. A standalone background poller is deferred to P2.)*
+- [ ] Dashboard shows profile + priority + capability tags + tracker-item link. *(P1-S1: `status` shows priority/profile/caps/tracker-item; P1-S4: status/insights also show last echoed tracker status; Node `dash` surfacing pending.)*
+- [x] Update `selftest.sh` with: manifest parse fixtures, profile resolution, priority order, dep-block, idempotency, adapter round-trip (mock). *(P1-S1 manifest/priority/idempotency + P1-S2 profile/persona/scope/dep-block/cycle + P1-S3 resilience/insights + P1-S4 tracker from/to round-trip via stub.)*
+- [x] Update README + this doc's progress table. *(P1-S1)*
+- **Exit criteria:** all boxes ✅; `selftest.sh` green; a tracker task → executed → tracker `done` with SHA comment, fully on one host; no regression to Phase-0 `.md` files.
+
+### Phase 2 — Coordinator as platform-service module + Cosmos + multi-factory leasing
+**Goal:** the service spine; ≥2 real factories executing in parallel via leases.
+
+> **Slice progress — P2-S3 (factory-agent integration, single host):** the bash runner
+> is now a coordinator **factory** behind `AQ_FLEET` — `lib/fleet-client.sh` (curl-only,
+> sourced) registers via heartbeat, claims jobs into inbox (interleaved with local `.md`),
+> reports **fenced** stage transitions with WIP checkpoints, renews/releases leases, and on
+> a stale `leaseEpoch` (reclaimed) **self-aborts + quarantines** the local result. Coordinator
+> 5xx/connection errors **degrade** (finish locally) rather than abandon work. When `AQ_FLEET`
+> is off, the offline git-queue path is byte-for-byte unchanged. The remaining P2 items —
+> scheduler/router core, direct tracker→module calls, factory enrollment + scoped tokens,
+> `fleet.*` feature flags + shadow/dual-run, and the two-factory parallel demo — have now all
+> landed: the first three in common-plat (`scheduler.ts`, `tracker-bridge.ts`, `enrollment.ts`),
+> the flags/shadow run and the demo in the agent-queue runner.
+
+- [x] Scaffold `fleet`/`orchestrator` module in `platform-service` (`types/repository/routes`, Zod, ESM, `productId`). *(PR #28)*
+- [x] Cosmos containers (§13) + repository layer (memory + Cosmos providers). *(PR #28; `fleet_artifacts` blob wiring still pending.)*
+- [x] **Atomic claim** (optimistic concurrency / `_etag`) + **lease reaper** + **fencing (`leaseEpoch`)** endpoints (§4/§8/§9) — *not* Cosmos-TTL-driven reclaim. *(common-plat PR #28 + #29; truly atomic via `updateIfMatch`.)*
+- [x] Port `agent-queue` runner to a **factory agent** API client (enroll/register/heartbeat/claim/report, fencing-aware) while keeping git-queue fallback. *(P2-S3: `lib/fleet-client.sh` behind `AQ_FLEET`; registers via heartbeat, claims into inbox, reports fenced stage transitions, renews leases, quarantines on stale-epoch; offline git-queue unchanged when the flag is off.)*
+- [x] Scheduler/router core (§7) as a pure module (fixed weights) + wired into atomic assignment. *(common-plat `fleet/scheduler.ts` pure `selectJob`/`scoreCandidate`/`selectPreemptionVictim`; `coordinator.ts` `claimNextJob` ranks candidates via `selectJob` after the capability hard-filter.)*
+- [x] Tracker adapter calls the module directly (not just file export). *(common-plat `fleet/tracker-bridge.ts` + `POST /fleet/tracker/ingest` / `/fleet/tracker/echo`: idempotent ingest of a tracker item → job and one-way status echo, in-module.)*
+- [x] Auth: factory enrollment + scoped rotatable tokens; secret isolation enforced (§12 subset). *(common-plat `fleet/enrollment.ts`: `enrollFactory`/`rotateToken`/`revokeToken` issue a plaintext token once, store it hashed, scope it to `{productId, factoryId, capabilities}`; `enforceFactoryToken` gates `claim`/`heartbeat` in `routes.ts`.)*
+- [x] **Feature flags** (`fleet.enabled`, `fleet.route_via_service`) + **shadow/dual-run** vs P1 before cutover (§21). *(agent-queue runner: `AQ_FLEET` / `AQ_FLEET_ROUTE` / `AQ_FLEET_SHADOW` with documented precedence; shadow claim/compare/report is side-effect-free (isolated `-shadow` factoryId + dryRun, never materializes/ships); `fleet-shadow-report` summarizes AGREE/DIVERGE/COORD_EMPTY/LOCAL_EMPTY + agreement; 60→68 selftest checks.)*
+- [x] Module test suite (repository + routes via `@bytelyst/testing`); **atomic-claim race**, crash-recovery, fencing-rejection, reaper-reclaim tests. *(PR #28 + #29: 53 fleet + 48 datastore tests, incl. true-concurrency claim.)*
+- [x] Two-factory demo (e.g. mac + ubuntu) running 3 parallel jobs end-to-end. *(`agent-queue/demo/two-factory-demo.sh` + `coordinator-stub.sh`: two real `run` daemons (mac-1 + ubuntu-1, separate queues/cwds) compete through one coordinator; asserts (a) no double-assign, (b) kill-mid-job → reaper reclaim → survivor completes → zombie report fenced (409), (c) concurrent parallelism. Dual-mode: CI-safe stateful stub by default, live platform-service when `AQ_FLEET_API`/`AQ_FLEET_TOKEN` set. Headless checks in `selftest.sh` → 68→71 green.)*
+- **Exit criteria:** all boxes ✅; `pnpm --filter @lysnrai/platform-service test` green; killing a factory mid-job → another reclaims and completes **and the dead worker's late report is fenced**; concurrent claimers never double-assign; all state in Cosmos with `productId`; **flag-off rollback verified** (§21). — _Runtime exit guarantees **demonstrated** by the two-factory demo (no double-assign + reclaim/fence + parallelism) and flag-off rollback verified (§21). Scheduler/router core, tracker-module direct calls, and factory enrollment + scoped tokens are now all wired in (see boxes above) — Phase 2 is effectively complete. **Remaining for a hard 100%:** validate the Cosmos `_etag` CAS path under true production contention + live blob-backed `fleet_artifacts`._
+
+### Phase 3 — Fleet control plane in tracker-web + DAG + budgets + scoring router
+**Goal:** one browser control plane; smart routing + budgets live.
+
+- [x] `fleet` API client in `tracker-web` (reuse `/api/tracker`-style proxy → `/fleet`). *(common-plat `dashboards/tracker-web/src/lib/fleet-client.ts`: typed client over `/api/fleet`.)*
+- [x] Fleet map page (factories, load, health, capabilities) on `@bytelyst/*` components. *(common-plat `app/dashboard/fleet/page.tsx`: health badges, load, capabilities, fleet metrics + alerts.)*
+- [x] Job table + job detail + **DAG view**; live log via **SSE**; approve/ship/reject/requeue actions. *(common-plat `app/dashboard/fleet/jobs/page.tsx` + `jobs/[id]/page.tsx`: stage-filtered table, DAG via `getJobDag`, SSE event stream, ship/requeue/reject/requestReview.)*
+- [x] Cost burndown + budget kill-switch UI; multi-reviewer routing. *(common-plat `app/dashboard/fleet/budget/page.tsx` burndown + pause/resume; `ReviewGateCard` multi-reviewer quorum gate via `requestReview`/`submitReview`.)*
+- [x] Scoring router with configurable weights + explainability surfaced in UI. *(common-plat `fleet/scheduler.ts` tunable weights + `GET /fleet/jobs/:id/explain`; `ExplainPanel` breakdown in job detail.)*
+- [x] Preemption of low-priority by critical jobs (checkpoint + requeue). *(common-plat `fleet/scheduler.ts` `selectPreemptionVictim` + coordinator eviction under `FLEET_PREEMPTION`; victim requeued with checkpoint + bumped epoch, `preempted` event.)*
+- [x] TUI dashboard re-pointed at `/fleet` API (parity). *(devops-tools `agent-queue/lib/fleet-dash.mjs` adapter + `dashboard.mjs` fleet mode under `AQ_FLEET_DASH=1`: board/factories/metrics/alerts, job actions ship/requeue/reject via `/fleet`, per-job events log; opt-in so local mode is byte-for-byte unchanged. Verified by `lib/fleet-dash.test.mjs` (22 assertions) wired into `selftest.sh` + live non-TTY render smoke.)*
+- [x] Web e2e (Playwright): fleet map, live log, ship, budget-pause. *(common-plat `dashboards/tracker-web/e2e/fleet.spec.ts`: fleet overview, metrics, job detail, ship, budget-pause, review-gate specs green.)*
+- **Exit criteria:** all boxes ✅; web `verify` (typecheck+lint+test+e2e) green; an operator runs the whole 3-repo parallel workload from the browser, including a budget pause + resume.
+
+### Phase 4 — Message bus + autoscaling + cross-OS capability marketplace
+**Goal:** scale-out and elasticity.
+
+- [ ] Introduce broker (NATS/Redis) for push dispatch + backpressure; coordinator publishes, factories subscribe by capability.
+- [ ] Autoscaling hooks (spin ephemeral factories: cloud VM / container) keyed to queue depth + SLA.
+- [ ] Capability "marketplace": jobs requiring rare caps (xcode/figma/gpu) routed to the few factories that have them; queueing fairness across products.
+- [ ] Load + chaos test suite (factory churn, broker outage, thundering herd).
+- **Exit criteria:** all boxes ✅; sustained N×throughput vs Phase 3 under load test; graceful degradation on broker outage (fallback to poll).
+
+### Phase 5 — Self-optimizing / learned routing
+**Goal:** the scheduler learns from history to cut time/cost and raise first-pass success.
+
+- [ ] Capture outcome features per run (engine, profile, repo, duration, cost, verify pass, human-edit rate).
+- [ ] Offline eval harness comparing learned vs heuristic routing on historical data.
+- [ ] Shadow/A-B rollout with guardrails; auto-tune scoring weights.
+- [ ] Recommendations surfaced ("route NomGap UX jobs to claude on mac-2: 23% faster, 11% cheaper").
+- **Exit criteria:** all boxes ✅; learned router beats heuristic on the eval set without regressing safety gates; A/B shows measurable improvement on a target metric.
+
+---
+
+## 15. Cross-cutting feature catalog (quick index)
+
+| Feature | First phase | Section |
+| ------- | ----------- | ------- |
+| Evolved job manifest | P1 | §5 |
+| Profiles (persona + capability) | P1 | §6 |
+| Capability matching | P1→P2 | §6/§8 |
+| Priority + SLA | P1 | §5/§7 |
+| DAG dependencies | P1→P3 | §5/§11 |
+| Idempotency / dedupe | P1 | §5 |
+| Retry + dead-letter | P1→P2 | §11 |
+| Budgets + kill-switch | P1(wall)→P3 | §5/§12 |
+| Scheduler/router scoring | P2→P3 | §7 |
+| Factory registration/heartbeat/lease | P2 | §8 |
+| Coordinator (platform-service module) | P2 | §9/§10 |
+| Cosmos data model | P2 | §13 |
+| Tracker bi-directional sync | P1→P2 | §10 |
+| Web control plane + SSE logs | P3 | §10/§17 |
+| Security/scope/secret isolation | P1→P2 | §12 |
+| Broker + autoscaling | P4 | §14 |
+| Learned routing | P5 | §14 |
+| Atomic claim + fencing + distributed lock | P2 | §4/§7/§9 |
+| Rollout / rollback / feature flags | P2→ | §21 |
+| Capacity planning & RU/cost | P2→ | §22 |
+| Ownership & RACI / on-call | all | §23 |
+| Work hierarchy & composite delegation (roadmap/epic) | P3 (manual) → P5 (planner) | §24 |
+| Durability, crash recovery & work preservation | P1 (orphan/retry/WIP) → P2 (lease/resume) | §25 |
+| Execution insights & token accounting | P1 (capture) → P3 (rollup UI) | §26 |
+
+---
+
+## 16. Definition of Done — the "100% accuracy" rubric
+
+A feature/phase is **not done** until **every** item below is true (this is the bar for "100% end-to-end"):
+
+- [ ] **Functionality**: acceptance criteria met; happy path + documented edge cases handled.
+- [ ] **Tests**: unit + integration written *first or alongside*, all green; no weakened/deleted tests; coverage targets met (≥95% on the router core).
+- [ ] **Verify gate**: the phase's named gate command passes locally (and in CI where applicable).
+- [ ] **Idempotency & recovery**: re-runs are safe; crash mid-step recovers (lease/idempotency).
+- [ ] **Security review**: secret-leak scan clean; scope guardrail honored; least-privilege tokens.
+- [ ] **Observability**: events/logs/metrics emitted; failures are diagnosable from the control plane.
+- [ ] **Docs**: this roadmap's checkboxes ticked; README/AGENTS updated; manifest/profile docs current.
+- [ ] **Backward-compat**: existing `.md`/Phase-0 behavior unbroken (regression check).
+- [ ] **Drift checks**: shared-infra templates (`.npmrc`, `docker-prep`) untouched/synced; conventional commits.
+- [ ] **No `console.log`/`print`** in service code; `req.log`/`os.Logger` used; ESM `.js` imports.
+
+---
+
+## 17. Observability & control plane details
+
+- [ ] **Log transport/storage**: factory ships logs to blob (`@bytelyst/blob`); `fleet_events` carries pointers + a recent-tail buffer. The control plane serves stored tail + live append (via the streaming route, **not** the buffering proxy — §10).
+- [ ] **Live logs** via SSE (single stream contract) from the streaming endpoint to web/TUI.
+- [ ] **Metrics**: queue depth, `blocked` count, assign latency, claim-loop RU/s, run duration, verify pass-rate, cost, factory utilization, fairness, reclaim/fencing-rejection counts.
+- [ ] **Alerting**: stall (no log N min), failure spikes, budget breach, factory offline, dead-letter, **claim-race anomalies**, RU throttling (Cosmos 429s).
+- [ ] **Tracing**: a job's full timeline (queued→…→shipped) reconstructable from `fleet_events` (immutable, ordered).
+- [ ] **Cost burndown** per job/product/day with budget overlays.
+- [ ] **SLOs defined + dashboarded** (see §19 targets); error budget tracked per SLO.
+
+---
+
+## 18. Risks & gaps explicitly tracked (expert call-outs)
+
+- [ ] **Duplicate execution** across transports (git fallback + service) — `idempotency-key` (submit) + atomic lease (assign) + **fencing token** (side-effect) + distributed `lock` (push).
+- [ ] **Crash recovery** — coordinator **lease reaper + fencing** (not Cosmos TTL); checkpoint long jobs where engines allow.
+- [ ] **Split-brain / partition** — fencing rejects stale `leaseEpoch` writes; reclaimed-job results quarantined, not auto-merged (§9).
+- [ ] **Shared-package conflicts** — two jobs editing `@bytelyst/*` simultaneously → fleet-wide `lock` + reviewer gate.
+- [ ] **Starvation/fairness** — per-product + per-factory counters with penalty.
+- [ ] **Cost runaway** — `budget.wall` hard ceiling everywhere; `usd`/`tokens` best-effort (provider metering) + global kill switch.
+- [ ] **Cosmos RU throttling (429)** — hot claim path; bound via long-poll/backoff + indexing (§13/§22); broker offload at P4.
+- [ ] **Clock skew** — coordinator-authoritative timestamps for all lease/SLA math (§4).
+- [ ] **Tool-version drift / reproducibility** — record engine + tool versions per run; pin where possible.
+- [ ] **Windows quirks** — path/shell differences in the factory agent; capability-gate Windows-only work.
+- [ ] **Human-review bottleneck** — auto-verify as much as possible; batch review UI; reviewer routing.
+- [ ] **Result capture beyond commits** — artifacts (coverage, screenshots, build logs) attached to runs.
+- [ ] **Secret sprawl** — never in queue/manifest/logs/Cosmos; presence-only capabilities.
+- [ ] **Data retention** — event/log retention + archival policy (extend today's `clean`).
+- [ ] **Engine API churn** — engines mapped in one place (`build_agent_cmd`); capability matrix versioned.
+
+---
+
+## 19. Success metrics
+
+Each metric has a **provisional SLO target** (tune with real data; tracked with an error budget):
+
+| Dimension | Metric | Provisional SLO target |
+| --------- | ------ | ---------------------- |
+| Throughput | jobs shipped/day; parallel utilization | utilization ≥ 60% under backlog |
+| Quality | % auto-verified; first-pass success; escaped-defect; post-agent human-edit rate | first-pass ≥ 70%; escaped-defect < 2% |
+| Speed | assign latency; time queued→shipped (excl. human gate) | assign p95 < 5s; queue-wait p95 < 2m at target load |
+| Cost | $/shipped job; budget-breach rate | budget-breach < 1% of jobs |
+| Reliability | lease-reclaim success; dead-letter rate; factory uptime; double-execution incidents | reclaim success ≥ 99.9%; **double-merge = 0**; dead-letter < 5% |
+| Fairness | max/min product wait-time ratio | ratio < 3× |
+| Correctness | atomic-claim violations; fencing rejections functioning | claim violations = 0 |
+
+> Targets are starting points; the §0 owners ratify per-phase SLOs before that phase's exit.
+
+---
+
+## 20. Open questions
+
+- [ ] Copilot headless feasibility as an engine/station (CLI/automation surface?).
+- [ ] Who owns merge/push authority — agents open PRs only, or auto-merge on green for low-risk profiles?
+- [ ] Multi-user/tenant: per-user queues + RBAC in the control plane?
+- [ ] On-call/ownership for the fleet (alerts routing, runbooks)?
+- [ ] Cloud factory provisioning (Phase 4) — which provider/runtime, cost guardrails?
+- [ ] Profile authorship/governance — who can create/edit profiles, and review of persona prompts?
+
+---
+
+## 21. Rollout, rollback & data migration
+
+Each phase ships behind controls so it can be turned off without losing work.
+
+- [ ] **Feature-flagged rollout**: gate each phase's new path behind a platform feature flag (`fleet.enabled`, `fleet.route_via_service`, `fleet.tracker_sync`); default off; enable per-product first.
+- [x] **Dual-run / shadow**: P2 coordinator runs in shadow (assign decisions logged, not executed) alongside the P0/P1 path before cutover; compare decisions. *(agent-queue `AQ_FLEET_SHADOW=1`: offline path stays authoritative, coordinator queried in parallel, decisions classified AGREE/DIVERGE/COORD_EMPTY/LOCAL_EMPTY into `.state/fleet-shadow.log`; strictly side-effect-free — never ships/quarantines/mutates real job state.)*
+- [x] **Cutover is reversible**: a factory can fall back from service-claim to git-queue via flag; no schema-destructive step on the rollback path. *(rollback = `AQ_FLEET_ROUTE=0` and/or `AQ_FLEET=0` at any time → instant return to the local/offline path; no data migration.)*
+- [ ] **Data migration**: introducing Cosmos containers (P2) is **additive** — no migration of existing tracker data; backfill is read-only (link `tracker-item`, don't mutate). Container creation is idempotent (registered in `cosmos-init`).
+- [ ] **Backward-compat gate**: every phase re-runs Phase-0 `selftest.sh` + a corpus of legacy `.md` files (regression).
+- [ ] **Rollback drill**: each phase's exit includes a tested rollback (flag off → prior behavior, in-flight jobs drain or requeue cleanly).
+- **Acceptance:** flipping `fleet.*` flags off returns the system to the prior phase's behavior with zero data loss; in-flight jobs either complete or requeue.
+- **Verify gate:** rollout/rollback drill documented + a flag-off regression run is green.
+
+---
+
+## 22. Capacity planning & cost
+
+- [ ] **Concurrency model**: fleet throughput = Σ factory free-stations, bounded by per-engine **seat limits** (e.g. N Devin seats) — document seat inventory per engine before P2.
+- [ ] **Cosmos RU budgeting**: the claim/heartbeat paths are the hot loops. Estimate RU/s = (factories × claim-poll rate × query RU) + (factories × heartbeat rate × upsert RU); pick a **long-poll interval** to keep steady-state RU within a provisioned budget; enable autoscale RU with a ceiling + 429 alerting.
+- [ ] **Polling vs push**: at F factories the poll RU grows linearly — define the F threshold that triggers the P4 broker migration.
+- [ ] **Blob storage**: logs/artifacts sizing + lifecycle (hot → cool → delete) per retention policy (§18).
+- [ ] **Factory sizing**: per-OS resource baseline (CPU/RAM/disk for N concurrent agent sessions + warm checkouts); disk pressure as a health input.
+- [ ] **Cost guardrails**: per-product spend caps + alerts; ties to `budget` and the global kill-switch.
+- **Acceptance:** a documented capacity sheet (seats, RU/s, blob GB, factory specs) sized for the target steady-state + 2× burst.
+- **Verify gate:** load test sustains target throughput within the RU/cost budget (no 429 storms).
+
+---
+
+## 23. Ownership & RACI
+
+Owners are roles, not names — assign before each phase starts (this removes the "undefined owner" gap).
+
+| Area | Responsible (R) | Accountable (A) | Consulted (C) | Informed (I) |
+| ---- | --------------- | --------------- | ------------- | ------------ |
+| Runner / factory agent (bash) | DevOps eng | Platform lead | — | All |
+| Coordinator module (platform-service) | Backend eng | Platform lead | Security | All |
+| Scheduler/router | Distributed-systems eng | Platform lead | Backend | All |
+| Control plane (tracker-web Fleet) | Frontend eng | Platform lead | UX | All |
+| Security/governance | Security eng | Security lead | Platform | All |
+| Capacity/cost & SLOs | SRE | Platform lead | Finance | All |
+| Profiles & persona governance | Eng leads | Platform lead | — | All |
+
+- [ ] Each phase names its R/A before kickoff; SLOs (§19) ratified by A.
+- [ ] On-call + runbooks established before the fleet runs unattended `yolo` workloads (Phase 2+).
+
+---
+
+## 24. Work hierarchy & composite delegation (roadmap / epic)
+
+**Goal:** delegate work at *any* granularity — a single bug/feature/task, **or an entire roadmap** — and let the fleet decompose + orchestrate rather than hand a multi-day roadmap to one agent session (which is long-horizon, low first-pass-success, and high blast-radius under `yolo`).
+
+### 24.1 Two delegation modes
+- **Atomic** (today's model): one leaf item (`bug`/`feature`/`task`) → one job → one agent at one station.
+- **Composite** (new): a `roadmap`/`epic` → a **planner** profile expands it into child jobs → the scheduler runs them as a **DAG across factories/agents/profiles**, honoring `deps` + phase gates. "Delegate the whole roadmap" = hand it to the **orchestrator**, which fans out — never one agent grinding for hours.
+
+### 24.2 Job `kind` — the one genuinely new concept
+A new axis, **orthogonal to tracker `type`**:
+- **`kind: leaf`** — runs an engine at a station (everything Phase 1–2 already does).
+- **`kind: composite`** — runs the **planner/orchestrator** that emits child `leaf` jobs and a dependency graph; it never itself edits a repo.
+
+The scheduler (§7) routes by `kind`: `leaf` → station/engine; `composite` → planner. This keeps execution and planning cleanly separated.
+
+### 24.3 Hierarchy & relationships
+- [ ] `parentId` links a child job/item to its roadmap/epic; `deps` (§5) expresses ordering within it (DAG, submit-time cycle detection).
+- [ ] A roadmap is, mechanically, a **named DAG of jobs + a rollup** — it reuses `deps`, profiles (§6), the scheduler (§7), and the lifecycle (§11); the only additions are `kind`, `parentId`, and rollup logic.
+- [ ] Add a **`planner`/`architect`/`tech-lead` profile** (§6 catalog) for decomposition + orchestration; leaf work still uses `backend-engineer`, `ux-designer`, etc.
+
+### 24.4 Rollup semantics (composite-level)
+- [ ] **Status rollup:** roadmap `status` is derived from children — `in_progress` once any child starts; `shipped`/`done` only when **all** children reach `shipped`; surfaces `blocked`/`failed` children for triage.
+- [ ] **Budget rollup:** roadmap `budget` = Σ child budgets with an explicit **ceiling**; breaching the ceiling pauses fan-out (ties to §12 kill-switch).
+- [ ] **Verify rollup:** each leaf runs its own `verify`; the roadmap's acceptance gate runs **after** all leaves pass (e.g. an integration/e2e gate).
+- [ ] **Phase gates:** the roadmap's own phase Exit-criteria become **runtime gates** — fan-out of phase N+1 is blocked until phase N's children ship; human approval between phases is the default for `yolo` safety.
+- [ ] **Idempotent re-run:** re-running a roadmap **skips already-`shipped` children** (content-hash dedupe, §5); only unfinished/changed children re-queue.
+
+### 24.5 Source-of-truth & sync (no drift)
+Composite work obeys the same SoT discipline as the core contract (§4 immutable manifest) and the tracker echo (§10): a roadmap/epic is **one record referenced by many**, never duplicated.
+- [ ] The **roadmap/epic** is the SoT for *what/why + rollup status*; each **leaf job/run** is the SoT for *its* execution.
+- [ ] Children reference the parent by `parentId`; the planner writes the child set **once** at decomposition (immutable manifest snapshot). Re-planning creates a new revision; it does not mutate in-flight children.
+- [ ] Status flows **one way, child → parent → tracker** (the §10 echo); humans never hand-edit rollup state.
+
+### 24.6 Decision — **Hybrid** (recorded)
+> Model composite delegation in the **fleet layer now**; defer the shared-platform enum change until proven.
+
+- **Now (fleet-owned):** add `kind` (`leaf`/`composite`), `parentId`, and rollup to the `fleet_jobs` schema (§13). The fleet owns this schema outright — no cross-product risk.
+- **Tracker stays `bug`/`feature`/`task`** (the shared `ITEM_TYPES` used by all 9 products is unchanged). A roadmap is represented by a **parent item + label `kind:roadmap`** + `parentId` on children — zero platform migration, no sign-off needed.
+- **Later (optional, gated on proven value):** promote `kind:roadmap` → a first-class `epic` tracker `type` via an **additive migration** (backfill items where `labels` contains `kind:roadmap` into `type: epic`, keep the label as an alias during transition). Low-risk because the behavior already works fleet-side.
+- **Rationale:** avoids a speculative 9-product platform change (UI/filters/stats/tests) before the orchestration model is validated; if the model is wrong, only fleet code is refactored, not a platform enum every product depends on.
+
+### 24.7 Phasing & gates
+- **P1–P2:** leaf-only (no composite); `kind` defaults to `leaf`.
+- **P3:** composite scheduling + rollup + DAG view in the control plane, with **manual decomposition** (a human/author defines the child set).
+- **P3→P5:** the **auto-decomposition planner agent** (itself a `composite` job run by the `planner` profile) — start manual, automate once trustworthy.
+- **Acceptance:** a roadmap with N child jobs fans out across ≥2 factories, respects `deps` + phase gates, rolls up status/budget correctly, and a re-run skips shipped children; tracker shows the parent moving `in_progress → done` via the one-way echo.
+- **Verify gate:** composite-orchestration tests — DAG expansion, rollup status/budget, phase-gate blocking, idempotent re-run; control-plane e2e for the roadmap DAG view.
+
+---
+
+## 25. Durability, crash recovery & work preservation
+
+**Goal:** a machine power-off, daemon/agent crash, or network partition **never loses the job, its instructions, or in-progress work**, and never corrupts state. Recovery is automatic and idempotent.
+
+### 25.1 Instructions are durable (markdown in Cosmos)
+- [ ] The **full job instruction body is persisted verbatim as markdown** in `fleet_jobs.bodyMd` (§13), alongside the structured manifest. The originating tracker `Item.description` also retains the human instruction text; the two are linked by `tracker-item`, never duplicated as competing truth (§24.5).
+- [ ] A factory only ever holds a **transient materialized copy** (temp prompt file) fetched from the API — losing the factory loses nothing. On the offline edge, the `.md` file on disk is the durable copy and reconciles on reconnect (§9).
+
+### 25.2 Work-in-progress is preserved (checkpointing)
+- [x] For a git-repo `cwd`, the worker commits **WIP to a dedicated branch `aq/wip/<jobId>`** at start and on every exit path (success, failure, timeout, signal) — partial work is never lost to a crash. Never commits to `main`/protected branches (§12 push policy). *(P1-S3: `_wip_start`/`_wip_checkpoint` + EXIT/INT/TERM trap; non-git cwd skipped.)*
+- [ ] `fleet_jobs.checkpoint` records the WIP branch + last commit so any worker can find it. *(P2 Cosmos; single-host records `wip_branch`/`wip_base`/`wip_commit` in `<job>.meta`.)*
+- [x] Long agents checkpoint periodically where the engine supports it; otherwise the start/exit commits bound the loss window. *(P1-S3: start + every-exit-path commits bound the loss window.)*
+
+### 25.3 Recovery is automatic, resumable, and fenced
+- [x] **Orphan detection:** on coordinator/runner startup (and continuously), a job in `building/assigned` whose worker is dead (no live lease / dead pid) is an **orphan**; it is recovered, not stranded. *(P1-S3: `recover_orphans` on `run` startup + each loop, and `agent-queue.sh recover`; dead-pid + `pidstart` reuse guard.)*
+- [x] **Resume vs restart:** recovery starts a **new `fleet_runs` attempt**; if `aq/wip/<jobId>` exists, the new worker **resumes from the checkpoint** instead of restarting from zero. *(P1-S3: relaunch checks out `aq/wip/<job>`; `attempts` incremented.)*
+- [ ] **Fencing (§4):** the reclaimed run gets a higher `leaseEpoch`; the dead/zombie worker's late commits/ship reports are rejected — no double-execution of *visible* outcomes. *(P2 — distributed leasing; out of single-host scope.)*
+- [x] **Retry policy** (`retry.max/backoff/on`): agent `rc≠0` / `timeout` / `verify_failed` requeue with backoff up to `max`; on exhaustion → `dead_letter` (P2) / `failed` (P1 stand-in) with full diagnostics — never silently dropped. *(P1-S3 single-host.)*
+- [x] **State integrity:** all run state is **append-only / optimistic-concurrency guarded** (§13); recovery is idempotent (running it twice yields one recovery). *(P1-S3 single-host: meta is append-only + re-derivable from folder location; `_etag` guard is P2.)*
+
+### 25.4 Crash taxonomy (all handled)
+| Failure | Detection | Recovery |
+| ------- | --------- | -------- |
+| Agent process crash (`rc≠0`) | exit code | retry policy → requeue or `failed`/`dead_letter` |
+| Daemon/runner crash | lease not renewed | reaper reclaims → resume from checkpoint |
+| Machine power-off / partition | missed heartbeats + lease expiry | reaper + fencing + WIP resume elsewhere |
+| Coordinator restart | state in Cosmos | leases survive; in-flight reconciled on boot |
+
+- **Acceptance:** SIGKILL an agent and power-off a factory mid-run → another worker **resumes from the last checkpoint (not from zero)** and ships; instructions intact (read back from Cosmos `bodyMd`); **zero duplicate commits/merges**; a retry-exhausted job lands in `dead_letter`/`failed` with diagnostics.
+- **Verify gate:** chaos tests — kill agent, kill runner, simulate partition; assert resume-from-checkpoint, fencing rejection of the stale worker, instruction integrity, and no double-merge.
+
+---
+
+## 26. Execution insights & token accounting
+
+**Goal:** per-job/run visibility into **token usage, cost, model, latency, and tool activity** — to drive budgets (§5/§12), cost burndown (§17), and learned routing (§14 P5).
+
+- [x] **Per-run telemetry record** (in `fleet_runs`, streamed as `fleet_events`): engine, model, **tokensIn/Out (+cached)**, **cost USD** (`estimated:true` when not provider-reported), wall + CPU time, **turn count, tool-call counts**, verify pass/fail, **filesChanged, linesAdded/Deleted**, attempt number, retries. *(P1-S3 single-host: recorded in `<job>.meta` — `duration_s`, `files_changed`/`lines_added`/`lines_deleted`, tokens/cost/turns/tool_calls, `attempts`; CPU time not captured.)*
+- [x] **Token source (honest feasibility):** capture real usage where the engine/provider exposes it (Claude/Codex/OpenAI usage in responses; Devin session metrics); otherwise **estimate** from log heuristics and mark `estimated` — same caveat as `budget.usd/tokens` (§5). A single `parse_usage(engine, log)` adapter centralizes per-engine extraction. *(P1-S3: `parse_usage` adapter; generic `AQ_USAGE` line + Claude/Codex heuristics; Devin/Copilot TODO; `usage_estimated` flag, never fabricated.)*
+- [ ] **Aggregation/rollups:** per job, roadmap (§24), product, factory, engine, profile, and day. Powers cost burndown (§17) and the learned-routing eval (§14). *(P1-S3 partial: `aq insights` does per-job + per-engine rollup; product/factory/profile/day are P2/P3.)*
+- [ ] **Surfacing:** control-plane panels (tokens, cost, success/first-pass/human-edit rates) + a CLI insights summary at the edge; reuse the platform-service telemetry module where present. *(P1-S3 partial: edge CLI `aq insights` + `status`/`dash` insights line done; web control-plane panels are P3.)*
+- [x] **Privacy:** telemetry carries metrics + pointers only — **never prompt content or secrets** (redaction §12). *(P1-S3: insights/meta record only metrics; no prompt body or secrets added.)*
+- **Acceptance:** after a run, its `fleet_runs` carries token/cost/duration/tool/diff metrics (real where metered, flagged `estimated` otherwise); dashboards show per-engine and per-profile cost + token totals; a budget breach is detectable from telemetry alone.
+- **Verify gate:** telemetry unit tests (capture + rollup); a metered-engine run records real tokens; an unmetered run records estimated + flagged; aggregation totals verified.
+
+---
+
+*This document is the single source of truth for the gigafactory build. Keep the §0 table and per-phase checkboxes updated; a phase ships only when its Exit criteria and the §16 Definition-of-Done rubric are fully green.*
+
--- a/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_SYSTEM_OVERVIEW.md
+++ b/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_SYSTEM_OVERVIEW.md
@ -0,0 +1,451 @@
+# Agent Gigafactory — System Overview (current picture)
+
+> Companion to `GIGAFACTORY_ROADMAP.md` (the source-of-truth spec & checklists).
+> This document describes **what is actually built today**, how the pieces fit
+> together, the architecture diagrams, the code map across both repos, the next
+> steps, and the known bugs/gaps. Last reviewed: **2026-05-31**.
+>
+> The **Phase-4 plan + the as-built M0 RU gate** live in
+> [`FLEET_DISPATCH_REDESIGN.md`](FLEET_DISPATCH_REDESIGN.md) — read it for the
+> broker-backed dispatch design and the migration checklist.
+
+---
+
+## 1. What it is (in one paragraph)
+
+The **Agent Gigafactory** turns a single-host "folder queue" agent runner into a
+**distributed fleet** of agent "factories" (machines: mac/ubuntu/windows) that
+claim and execute coding jobs in parallel, coordinated by a durable,
+product-agnostic service. A job is a markdown manifest (persona + capabilities +
+budget + deps); the **coordinator** assigns each job to the best-fit factory via a
+deterministic scoring router, guarantees **exactly-once assignment** through
+optimistic-concurrency claims + **leases with epoch fencing**, recovers crashed
+work automatically (reaper + WIP checkpoints), enforces **per-product budgets**,
+supports **DAG decomposition** (composite → child jobs), and exposes the whole
+fleet through **two control planes**: a browser UI (`tracker-web`) and a terminal
+TUI (`agent-queue` dashboard). Both control planes talk to the same `/fleet` REST
+API.
+
+---
+
+## 2. Completion snapshot (reality, not the stale table)
+
+| Phase | Theme | Real status | Notes |
+| ----- | ----- | ----------- | ----- |
+| **0** | Single-host baseline | ✅ 100% | `agent-queue.sh` folder queue, selftest green |
+| **1** | Manifest + profiles + capabilities + tracker adapter | ✅ ~98% | Only leftover: Node `dash` field surfacing — **now also done** via fleet-dash tags. Effectively complete |
+| **2** | Coordinator module + Cosmos + multi-factory leasing | ✅ ~98% | Scheduler wiring, enrollment+tokens, tracker-bridge are **done in code** but boxes 384/386 unticked in roadmap (see §14 Bugs, gaps & risks) |
+| **3** | Fleet control plane (web + TUI) + DAG + budgets + scoring | ✅ 100% (all boxes ticked) | Pending: Playwright e2e wired into CI; live multi-host operator run |
+| **4** | Message bus + autoscaling + capability marketplace | 🟡 in progress | **M0 (RU gate) shipped** — see below. Broker (M1+) not started. Plan: [`FLEET_DISPATCH_REDESIGN.md`](FLEET_DISPATCH_REDESIGN.md) |
+| **5** | Self-optimizing / learned routing | ☐ 0% | Not started |
+
+> **Phase-4 M0 (RU gate) is live (2026-05-31):** a per-product `fleet_queue_state`
+> doc holds a monotonic `version` (bumped on job create + every stage change);
+> factories with `AQ_FLEET_GATE=1` point-read `GET /fleet/queue-state` (~1 RU) and
+> skip the expensive claim while nothing changed — cutting idle Cosmos RU without
+> raising the local poll interval. Default OFF; the live fleet runs it on.
+
+---
+
+## 3. System architecture
+
+```mermaid
+graph TB
+  subgraph CP["Control planes (operators)"]
+    WEB["tracker-web Fleet UI<br/>(Next.js, /dashboard/fleet/*)"]
+    TUI["agent-queue TUI<br/>(dashboard.mjs, AQ_FLEET_DASH=1)"]
+  end
+
+  subgraph SVC["platform-service — fleet module (the spine)"]
+    ROUTES["routes.ts<br/>/fleet REST + SSE"]
+    COORD["coordinator.ts<br/>claim · lease · fence · reaper<br/>preemption · budgets · DAG · review"]
+    SCHED["scheduler.ts<br/>pure scoring router (§7)"]
+    ENROLL["enrollment.ts<br/>factory tokens (scoped, rotatable)"]
+    BRIDGE["tracker-bridge.ts<br/>job ↔ tracker item"]
+    ARTIF["artifacts.ts / artifacts-blob.ts<br/>pointer + blob bytes"]
+    REPO["repository.ts<br/>CAS (rev/_etag) CRUD"]
+  end
+
+  subgraph DATA["@bytelyst/datastore (Cosmos / memory)"]
+    JOBS[("fleet_jobs")]
+    RUNS[("fleet_runs")]
+    LEASES[("fleet_leases")]
+    FAC[("fleet_factories")]
+    PROF[("fleet_profiles")]
+    EVENTS[("fleet_events")]
+    ARTDOCS[("fleet_artifacts")]
+  end
+
+  subgraph FLEET["Factory agents (workers, N hosts)"]
+    F1["agent-queue.sh + lib/fleet-client.sh<br/>(AQ_FLEET=1) — mac-1"]
+    F2["agent-queue.sh + lib/fleet-client.sh<br/>ubuntu-1"]
+    ENGINES["engines: claude · codex · devin"]
+  end
+
+  WEB -->|/api/fleet proxy| ROUTES
+  TUI -->|lib/fleet-dash.mjs| ROUTES
+  ROUTES --> COORD
+  COORD --> SCHED
+  ROUTES --> ENROLL
+  ROUTES --> BRIDGE
+  ROUTES --> ARTIF
+  COORD --> REPO
+  ENROLL --> REPO
+  BRIDGE --> REPO
+  ARTIF --> ARTDOCS
+  REPO --> JOBS & RUNS & LEASES & FAC & PROF & EVENTS
+
+  F1 -->|heartbeat · claim · patch fenced · renew| ROUTES
+  F2 -->|heartbeat · claim · patch fenced · renew| ROUTES
+  F1 --> ENGINES
+  F2 --> ENGINES
+```
+
+**Layering principle:** `scheduler.ts` is **pure** (no I/O — all inputs passed
+in), `coordinator.ts` is the orchestration core, `repository.ts` is the only thing
+that touches the datastore, and `routes.ts` is the only thing that touches HTTP.
+Factories never touch the DB directly — they only call REST.
+
+---
+
+## 4. Job lifecycle (stages)
+
+```mermaid
+stateDiagram-v2
+  [*] --> queued: submitJob
+  queued --> blocked: unmet deps
+  blocked --> queued: deps satisfied (reaper/unblock)
+  queued --> assigned: claimNextJob (CAS win + lease)
+  assigned --> building: factory starts (patch fenced)
+  building --> review: rc=0 → review gate
+  building --> testing: verify-pass (auto)
+  review --> testing: approve / requestReview quorum
+  testing --> shipped: ship (manual gate)
+  building --> failed: verify-fail / budget_exceeded / timeout
+  review --> failed: reject
+  assigned --> queued: lease expired (reaper, +epoch, keep checkpoint)
+  building --> queued: preempted (critical job, checkpoint + epoch bump)
+  failed --> queued: requeue (operator)
+  failed --> dead_letter: retries exhausted
+  shipped --> [*]
+  dead_letter --> [*]
+```
+
+Stages (`types.ts`): `queued · blocked · assigned · building · review · testing ·
+shipped · failed · dead_letter`. The TUI/local board collapse these onto kanban
+buckets (`inbox/building/review/testing/shipped/failed`) for parity.
+
+---
+
+## 5. The core guarantee — atomic claim + lease fencing
+
+This is the heart of "no double-assignment, ever" and "a dead worker can never
+corrupt a reassigned job."
+
+```mermaid
+sequenceDiagram
+  participant FA as Factory A
+  participant FB as Factory B
+  participant CO as coordinator
+  participant DB as fleet_jobs / fleet_leases
+
+  FA->>CO: POST /fleet/claim (caps)
+  FB->>CO: POST /fleet/claim (caps)
+  CO->>DB: selectJob() → job J (rev=5)
+  CO->>DB: revUpdate J: queued→assigned IF rev==5 (CAS)
+  DB-->>CO: A wins (rev→6, leaseEpoch=1)
+  CO->>DB: revUpdate J IF rev==5 (B's CAS)
+  DB-->>CO: conflict (B re-selects)
+  CO-->>FA: assigned J (leaseEpoch=1)
+  CO-->>FB: conflict → next job
+
+  Note over FA: A crashes mid-build
+  CO->>DB: reapExpiredLeases(): lease expired → J back to queued,<br/>leaseEpoch=2, checkpoint preserved
+  FB->>CO: claim → J (leaseEpoch=2)
+  FA-->>CO: (zombie) PATCH J stage=shipped leaseEpoch=1
+  CO-->>FA: 409 fenced (1 < 2) — rejected
+```
+
+- **CAS:** `repository.revUpdateJob/revUpdateLease` write only if stored `rev`
+  matches (Cosmos `_etag`/`If-Match`; memory provider re-reads `rev`).
+- **Fencing:** every worker mutation carries `leaseEpoch`; epoch `< job.leaseEpoch`
+  ⇒ `fenced` (409).
+- **Reaper:** `reapExpiredLeases(now)` requeues expired-lease jobs, **bumps the
+  epoch**, and **keeps the `checkpoint`** (WIP git branch pointer) so work resumes
+  rather than restarts. Cosmos TTL cannot do this — the reaper owns recovery.
+
+---
+
+## 6. Data model (Cosmos containers)
+
+| Container | PK | Purpose |
+| --------- | -- | ------- |
+| `fleet_jobs` | `/productId` | durable job: `manifestSnapshot`, verbatim `bodyMd`, `stage`, `idempotencyKey`, `deps`, `depsMode`, `checkpoint`, `priority`, `rev`, `leaseEpoch`, `kind`, `parentId` |
+| `fleet_runs` | `/jobId` | one execution attempt: engine, timings, `result`, `insights` (tokens/cost/diff) |
+| `fleet_leases` | `/jobId` | single-holder lease: `holderFactoryId`, `expiresAt`, `leaseEpoch`, `status` |
+| `fleet_factories` | `/productId` | worker host: `capabilities[]`, `health`, `load`, `seatLimit`, `lastHeartbeatAt` |
+| `fleet_profiles` | `/productId` | immutable, versioned persona/capability profile snapshot |
+| `fleet_events` | `/jobId` | append-only audit stream (monotonic `seq`) — powers SSE |
+| `fleet_artifacts` | `/jobId` | **pointers** to blob-stored artifacts (no inline logs) |
+| `fleet_queue_state` | `/productId` | **Phase-4 M0 RU gate**: monotonic `version` bumped on job create + every stage change; read via `GET /fleet/queue-state` so a factory can cheaply detect "work changed" |
+
+Every document carries `productId`. Containers registered in `lib/cosmos-init.ts`.
+
+---
+
+## 7. The scheduler / scoring router (`scheduler.ts`)
+
+Pure, deterministic, fixed-weight (tunable per-product in Phase 3, learned in
+Phase 5). Filter → score → rank:
+
+```
+score = w1·capabilityFit + w2·affinity + w3·(1/(1+load))
+      + w4·costFit(budget) + w5·health − w6·starvationPenalty(age)
+```
+
+Default weights (`DEFAULT_WEIGHTS`): `capabilityFit 1.0 · affinity 0.5 · load 1.0
+· costFit 0.75 · health 1.0 · starvation 1.5`. Capability is a **hard filter**
+(subset check); `down` factories are filtered out, not scored; aging fully
+de-penalises after ~30 min (anti-starvation). `scoreCandidate` returns a per-term
+breakdown that powers the **explainability** panel (`GET /fleet/jobs/:id/explain`
+→ `ExplainPanel`). `selectPreemptionVictim` picks the lowest-priority running job a
+critical job may evict (under `FLEET_PREEMPTION`).
+
+---
+
+## 8. Subsystems at a glance
+
+| Subsystem | File(s) | What it does | Flag |
+| --------- | ------- | ------------ | ---- |
+| Claim / lease / fence / reaper | `coordinator.ts` | exactly-once assignment, recovery | — |
+| Scoring router + preemption | `scheduler.ts`, `coordinator.ts` | best-fit assignment, evict low-pri for critical | `FLEET_PREEMPTION` |
+| Per-product budgets | `coordinator.ts` (`accrueSpend`, `pause/resume`) | ceiling + auto-pause kill-switch; burndown | `FLEET_BUDGETS` |
+| DAG decomposition | `coordinator.ts` (`submitChildren`, `getDagSubtree`, `maybeUnblockParent`) | composite job fans out to children; deps gate parent | — |
+| Review gate | `coordinator.ts` (`requestReview`, `submitReview`) | multi-reviewer quorum before ship | — |
+| Factory enrollment | `enrollment.ts` | scoped, rotatable, hashed tokens; auth on claim/heartbeat | — |
+| Tracker bridge | `tracker-bridge.ts` | idempotent ingest of tracker item → job; one-way status echo | — |
+| Artifacts | `artifacts.ts`, `artifacts-blob.ts` | pointer docs in Cosmos, bytes in blob (SAS) | — |
+| Live events | `routes.ts` SSE + `fleet_events` | `GET /fleet/jobs/:id/events/stream` | — |
+| Metrics / alerts | `coordinator.ts` (`fleetMetrics`) | utilization, health rollup, starvation alerts | — |
+
+---
+
+## 9. REST API surface (`/fleet`, under `/api`, auth + `x-product-id`)
+
+```
+Jobs       POST /fleet/jobs · GET /fleet/jobs · GET /fleet/jobs/:id
+           PATCH /fleet/jobs/:id (fenced) · POST /fleet/jobs/:id/actions/:action
+Claim      POST /fleet/claim
+Lease      POST /fleet/jobs/:id/lease/renew · /lease/release
+Factories  POST /fleet/factories/heartbeat · /enroll
+           POST /fleet/factories/:id/token/rotate · /token/revoke
+Runs/Events GET /fleet/jobs/:id/runs · /events · /events/stream (SSE) · /explain
+Review     POST /fleet/jobs/:id/review/request · /review
+Budgets    GET /fleet/budgets/:productId · /burndown
+           PUT /fleet/budgets/:productId · POST /pause · /resume
+DAG        POST /fleet/jobs/:id/children · GET /fleet/jobs/:id/dag
+Artifacts  POST /fleet/jobs/:id/artifacts · GET (list) · GET/DELETE /fleet/artifacts/:id
+Tracker    POST /fleet/tracker/ingest · /fleet/tracker/echo
+Metrics    GET /fleet/metrics · GET /fleet/queue-state (Phase-4 M0 RU gate)
+```
+
+---
+
+## 10. The two control planes & feature flags
+
+**Browser (`tracker-web`)** — `dashboards/tracker-web/src/`:
+- `app/dashboard/fleet/page.tsx` — fleet map (factory cards, health/load/caps, metrics + alerts)
+- `app/dashboard/fleet/jobs/page.tsx` — stage-filtered job table
+- `app/dashboard/fleet/jobs/[id]/page.tsx` — job detail: SSE event timeline, runs, artifacts, **DAG view**, **ExplainPanel**, **ReviewGateCard**, ship/requeue/reject
+- `app/dashboard/fleet/budget/page.tsx` — burndown chart + pause/resume kill-switch
+- `lib/fleet-client.ts` — typed client; `subscribeJobEvents` (fetch-based SSE w/ auth + `Last-Event-ID` resume + poll fallback); graceful 404 → null
+- `app/api/fleet/[...path]/route.ts` — proxy to platform-service
+
+**Terminal (`agent-queue`)** — `learning_ai_devops_tools/agent-queue/`:
+- `dashboard.mjs` (`AQ_FLEET_DASH=1`) → `lib/fleet-dash.mjs` adapter: board counts, factories (per-factory rows or metrics aggregate), alerts, running, actionable JOBS w/ tags, recent, per-job events log; ship/requeue/reject via `/fleet`. Local folder-queue mode byte-for-byte unchanged when the flag is off.
+
+**Feature flags**
+
+| Flag | Where | Effect |
+| ---- | ----- | ------ |
+| `FLEET_PREEMPTION` | platform-service | enable critical-job preemption + seat limits |
+| `FLEET_BUDGETS` | platform-service | enable budget enforcement + auto-pause |
+| `AQ_FLEET` | factory runner | runner becomes a coordinator factory (claim/report) |
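+| `AQ_FLEET_ROUTE` / `AQ_FLEET_SHADOW` | factory runner | route via service / side-effect-free shadow compare |
+| `AQ_FLEET_GATE` | factory runner | Phase-4 M0 RU gate: point-read `GET /fleet/queue-state` and skip the claim while the queue version is unchanged |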
+| `AQ_FLEET_DASH` | TUI | dashboard sources board from `/fleet` API |
+| `AQ_FLEET_API` / `AQ_FLEET_TOKEN` / `AQ_PRODUCT_ID` | both | base URL / bearer / `x-product-id` |
+
+All flags default **off** → the system is byte-for-byte the prior single-host tool.
+
+---
+
+## 11. Code map (where everything lives)
+
+**`learning_ai_common_plat` (the durable spine):**
+```
+services/platform-service/src/modules/fleet/
+  types.ts            Zod schemas + canonical model (stages, lease, budget, DAG, events)
+  repository.ts       per-container CRUD + revUpdate CAS, appendEvent, listChildrenByParent
+  coordinator.ts      submit/claim/lease/fence/reaper, preemption, budgets, DAG, review, metrics
+  scheduler.ts        pure scoring router + selectPreemptionVictim + scoreCandidate (explain)
+  enrollment.ts       factory enroll / rotate / revoke / enforceFactoryToken
+  tracker-bridge.ts   ingest tracker item → job; one-way status echo
+  artifacts.ts        artifact pointer mgmt
+  artifacts-blob.ts   blob upload/download/delete (SAS)
+  routes.ts           all /fleet REST + SSE
+  *.test.ts           coordinator/scheduler/repository/routes/enrollment/tracker/artifacts/types
+dashboards/tracker-web/src/
+  app/dashboard/fleet/**          the browser control plane (pages above)
+  lib/fleet-client.ts             typed client + SSE
+  app/api/fleet/[...path]/route.ts proxy
+  e2e/fleet.spec.ts               Playwright specs
+lib/cosmos-init.ts                container registration
+docs/GIGAFACTORY/gigafactory-phase3-progress.md / docs/GIGAFACTORY/FLEET_CONTROL_PLANE.md
+```
+
+**`learning_ai_devops_tools` (the factory agent + TUI + spec):**
+```
+agent-queue/
+  agent-queue.sh      single-host runner + factory agent (AQ_FLEET); budget.wall, retry, recover
+  lib/fleet-client.sh curl-only coordinator client (register/claim/report/renew, fencing-aware)
+  lib/fleet-dash.mjs  TUI fleet-mode adapter over /fleet (+ fleet-dash.test.mjs, 22 assertions)
+  dashboard.mjs       the TUI (local + fleet modes)
+  profiles/*.md       persona+capability catalog
+  demo/two-factory-demo.sh + coordinator-stub.sh  parallel-fleet demo
+  selftest.sh         ~75 dependency-light checks
+  docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md   source-of-truth spec & checklists
+  docs/GIGAFACTORY/GIGAFACTORY_SYSTEM_OVERVIEW.md   (this file)
+```
+
+---
+
+## 12. Test coverage (what's verified)
+
+- **platform-service fleet** (~134+ tests): atomic-claim race (true concurrency, no
+  double-assign), fencing rejection, reaper reclaim + checkpoint, scheduler scoring
+  / tie-breaks / starvation / preemption-victim, DAG fan-out/unblock/subtree,
+  budgets + burndown + auto-pause, review-gate quorum, enrollment/token lifecycle +
+  auth enforcement, tracker ingest/echo idempotency, routes (incl. SSE + explain),
+  schema validation.
+- **tracker-web** (~198 tests): fleet-client unit tests + page render; SSE
+  parse/resume/fallback; graceful 404 degradation.
+- **tracker-web e2e** (`e2e/fleet.spec.ts`): fleet map, live log, ship, budget-pause,
+  review-gate (Playwright — needs CI wiring).
+- **agent-queue** (`selftest.sh`, ~75 checks): manifest/profiles/caps/priority/deps/
+  idempotency, retry/recover/insights, tracker round-trip, `AQ_FLEET` register/claim/
+  fenced-patch/reaper-reclaim/quarantine, shadow AGREE/DIVERGE, two-factory demo,
+  **budget.wall enforcement**, **fleet-dash adapter (22 assertions)**.
+
+---
+
+## 13. Next steps
+
+**Immediate (close Phase 1–3 to a clean 100%):**
+1. **Validate the Cosmos `_etag`/`If-Match` CAS path under true contention** and
+   **live blob-backed `fleet_artifacts`** — the two items the roadmap marks as
+   "remaining for a hard 100%" on Phase 2/3 (tests today use the memory provider +
+   pointer-only artifacts).
+2. **Wire `e2e/fleet.spec.ts` into CI** (Playwright install + a `verify` job) so the
+   Phase-3 exit criterion ("web verify incl. e2e green") is enforced, not just
+   present.
+3. **Live multi-host operator run** end-to-end (the Phase-3 acceptance: drive the
+   3-repo parallel workload from the browser, including a budget pause + resume
+   against a real platform-service, not the stub).
+
+**Phase 4 (scale-out) — in progress; see [`FLEET_DISPATCH_REDESIGN.md`](FLEET_DISPATCH_REDESIGN.md):**
+- ✅ **M0 (done)** — RU gate: `fleet_queue_state` + `GET /fleet/queue-state` +
+  `AQ_FLEET_GATE`; factories skip the claim while the queue version is unchanged.
+
+4. **M1+: broker** (the redesign picks **Azure Service Bus**, not NATS/Redis, for
+   subscription filters + DLQ) for push dispatch + backpressure in a
+   coordinator-owns-scheduling / broker-owns-delivery hybrid (keeps the scorer).
+5. **M3: autoscaling** — scale-to-zero ephemeral factories (KEDA/Container Apps)
+   keyed to subscription depth.
+6. **Capability marketplace** — route rare-capability jobs (xcode/figma/gpu) to the
+   few factories that have them; cross-product queueing fairness.
+7. **Load + chaos suite** — factory churn, broker outage, thundering herd.
+
+**Phase 5 (learned routing):**
+
+8. Capture per-run outcome features → offline eval harness (learned vs heuristic) →
+   shadow/A-B with guardrails → surface recommendations ("route NomGap UX jobs to
+   claude on mac-2: 23% faster").
+
+---
+
+## 14. Bugs, gaps & risks (be honest)
+
+**Documentation status (reconciled 2026-05-31):**
+- `GIGAFACTORY_ROADMAP.md` §0 now reads Phase 0 ✅100% · 1 ✅~98% · 2 ✅~98% ·
+  3 ✅100% · **4 ◐ in progress (~10%, M0 shipped)** · 5 ☐. Phase-2 boxes for the
+  scheduler core and factory enrollment/scoped tokens are ticked (`scheduler.ts`
+  `selectJob`/`selectPreemptionVictim` wired into `claimNextJob`; `enrollment.ts`
+  `enforceFactoryToken` gating claim/heartbeat). The earlier "stale §0 table"
+  warning no longer applies.
+
+**Runtime / correctness gaps:**
+- **SSE is poll-fallback based, not a push-only contract.** `subscribeJobEvents`
+  falls back to `getJobEvents()` polling on stream error — fine for resilience, but
+  "live" can silently degrade to polling without a visible operator signal.
+- **UI pages degrade silently on some errors** (empty states / `null`), which can
+  mask a real backend outage as "nothing happening."
+- **Budget page assumes `ceilingUsd` exists** when rendering the spend bar — a
+  budget doc without a ceiling could render a broken/NaN bar. Guard it.
+- **Dashboard `patchJob` only sends `{stage, leaseEpoch}`** — other fenced-transition
+  fields (e.g. `checkpoint`) aren't exposed in the web UI, so operator-driven
+  transitions can't carry a checkpoint.
+- **`rev` CAS on the memory provider** is exact only for the sequential calls the
+  coordinator/tests make (re-read `rev` before write). Real concurrency safety
+  depends on Cosmos `_etag`/`If-Match` in production — verify the Cosmos path under
+  true contention before relying on it at scale.
+
+**TUI-specific (this repo):**
+- Fleet **utilization %** only renders in the metrics-aggregate fallback branch, not
+  when per-factory rows are present — a minor inconsistency in the TUI board.
+- The **budget.wall live selftest is timing-sensitive** (races a 2s wall ceiling) and
+  can flake under heavy disk/CPU load; the code is correct but the test could be made
+  more robust (e.g. inject the clock).
+- TUI fleet mode has **no write path for budgets/preemption** — it's read + job
+  actions only; budget pause/resume is web-only.
+
+**Operational gotchas (verified on the live fleet — get these right):**
+- **Heartbeat cadence MUST be < the 90s stale threshold.** `fleetMetrics` marks a
+  factory stale after `DEFAULT_STALE_FACTORY_MS = 90_000`, but the factory only
+  heartbeats every `AQ_FLEET_LEASE_RENEW_SEC` (**default 300s**). Left at the
+  default, a healthy factory flaps to "stale"/"no live factory" between beats. The
+  fleet launcher sets `AQ_FLEET_LEASE_RENEW_SEC=30` to stay well inside the window.
+- **The tracker-web New-Job form is misconfigured:** it hardcodes factories
+  `mac-1`/`mac-2` and defaults `capabilities=["build"]` — a capability **no agent-queue
+  factory advertises** (`detect_capabilities` emits `os:*`/`engine:*`/`node:*`/`has:*`).
+  So a default UI submission is unroutable (queues forever → `queue_starvation`).
+  Fix tracked in the redesign doc's routing-model section.
+- **No factory deregister API.** Only heartbeat/enroll/rotate/revoke exist, so a
+  dead factory's doc lingers and shows as `stale` until pruned out-of-band
+  (currently a manual Cosmos delete). A prune/deregister path is a Phase-4 item.
+
+**Not-yet-built (expected, Phase 4+):**
+- **No message bus yet** — dispatch is still poll-based, but the **M0 RU gate now
+  skips the claim while idle** (so idle Cosmos RU is near-flat). Broker push/
+  backpressure is M1+.
+- **No autoscaling** — factory fleet is static/manually run (M3 target).
+- **No capability marketplace / cross-product fairness** under contention.
+- **No load/chaos test suite** — resilience is unit-proven, not load-proven.
+- **Artifacts blob wiring** (`fleet_artifacts` → real blob storage) should be
+  validated against a live storage account (tests use memory/pointer only).
+
+**Recently fixed (2026-05-31):**
+- **`run --once` could return before a backgrounded worker finished the PR/report.**
+  `_meta_end` (which writes `ended=`) was called right after the `testing/` move,
+  *before* PR open/merge + coordinator reports, so the slot freed early and `--once`
+  could exit (and a caller could observe completion) mid-PR. Now `ended=` is written
+  last; the selftest PR-mode case is deterministic again.
+
+---
+
+## 15. TL;DR
+
+Phases 0–3 are functionally **complete and well-tested**: a durable coordinator with
+exactly-once leasing + fencing + crash recovery, a deterministic scoring router with
+preemption + explainability, per-product budgets, DAG decomposition, a multi-reviewer
+gate, factory enrollment with scoped tokens, and **two** control planes (browser +
+TUI) over one `/fleet` API. The remaining work is (a) trivial doc corrections, (b)
+CI-enforcing the existing e2e, and (c) the genuinely new Phase-4 scale-out frontier
+(broker, autoscaling, marketplace, chaos) and Phase-5 learned routing.
--- a/agent-queue/docs/GIGAFACTORY/README.md
+++ b/agent-queue/docs/GIGAFACTORY/README.md
@ -0,0 +1,20 @@
+# Gigafactory — Agent-Queue Docs
+
+Source-of-truth specs and the system overview for **Agent Gigafactory**, the
+fleet-coordination layer that turns the single-host `agent-queue` runner into a
+multi-host factory of autonomous coding agents.
+
+## Contents
+
+| Doc | What it is |
+| --- | --- |
+| [`GIGAFACTORY_ROADMAP.md`](GIGAFACTORY_ROADMAP.md) | The canonical source-of-truth spec: architecture, the evolved job manifest, scoring formula, lifecycle/retry, enrollment, and the phased checklists (§0–§26). Job specs in `../jobs/` point here. |
+| [`GIGAFACTORY_SYSTEM_OVERVIEW.md`](GIGAFACTORY_SYSTEM_OVERVIEW.md) | A narrative overview of how the pieces fit together end-to-end, with a code-map of the relevant files across both repos. |
+| [`FLEET_DISPATCH_REDESIGN.md`](FLEET_DISPATCH_REDESIGN.md) | Phase-4 design proposal (no code): broker-backed (Azure Service Bus) dispatch + on-demand factories that fixes the product-as-queue routing smell and the idle-poll Cosmos RU cost. Phased migration starting with a zero-infra RU quick win. |
+
+## Related docs in the other repo
+
+The platform-service backend and the tracker-web UI live in
+`learning_ai_common_plat`. Its Gigafactory docs (roadmap-completion audit,
+remaining-task checklist, Phase-3 progress, and the fleet control-plane guide)
+are under `docs/GIGAFACTORY/` there.
--- a/agent-queue/docs/RUN_POLICY.md
+++ b/agent-queue/docs/RUN_POLICY.md
@ -0,0 +1,79 @@
+# Agent-Queue Run Policy
+
+How the agent-queue daemon and the agents it launches must operate. Written
+after a live review found jobs running in `--yolo` (dangerous) mode directly
+against **live working trees**, which dirtied repos, produced duplicate/competing
+commits, and risked leaking secrets.
+
+## Observed behavior (the problem)
+
+`agent-queue.sh` launches the chosen CLI with `cwd` taken from the job
+front-matter (default `$PWD`) and, when `yolo: true` (the default), with
+full-autonomy flags:
+
+| Engine | yolo flag |
+| --- | --- |
+| devin  | `--permission-mode dangerous` |
+| claude | `--dangerously-skip-permissions` |
+| codex  | `--dangerously-bypass-approvals-and-sandbox` |
+| (other) | `--allow-all-tools` |
+
+With `cwd` pointing at a canonical checkout (e.g. `…/learning_ai_fastgap`), a
+dangerous-mode agent edits, commits, and pushes in the repo you also work in.
+
+## Policy
+
+1. **Isolation — never run in the canonical checkout.**
+   Each job MUST run in a dedicated **git worktree** (or fresh clone) created off
+   `origin/main`, not the live working directory. Set the job's `cwd` to that
+   worktree. The canonical checkout must be left untouched.
+
+2. **One job = one branch.**
+   Create/checkout a dedicated branch (e.g. `aq/<job-id>`) off the latest
+   `origin/main`. Agents push that branch and open a PR. **Never push straight to
+   the shared `main`** of platform/shared repos.
+
+3. **Least privilege by default.**
+   Default `yolo: false`. Reserve the dangerous/`--allow-all-tools` flags for
+   **disposable sandboxes only** (throwaway worktree/clone or container). Never
+   run dangerous mode against a directory whose changes you care about.
+
+4. **Clean-tree contract.**
+   A job starts only from a clean tree and verifies the canonical checkout is
+   unchanged when it finishes. If a worktree is dirty at pickup, fail fast.
+
+5. **Test before ship.**
+   Run typecheck + lint + the repo's test suite before committing. Make small
+   commits with conventional messages. Open a PR for review instead of force-merging.
+
+6. **Never track runtime/queue state.**
+   The `queue/{.state,inbox,building,testing,review,failed,shipped,logs}` lifecycle
+   dirs are runtime state and are git-ignored (see repo `.gitignore`). Jobs must
+   not commit them.
+
+7. **One writer per repo.**
+   At most one job per target repo at a time (use the existing per-repo lock) so
+   two agents never compete on the same working tree.
+
+8. **Secrets stay out of git.**
+   Jobs must not write real secrets into tracked files. Use `.env` (gitignored);
+   the pre-push secret scan is a backstop, not a license.
+
+## Applying this with the current runner
+
+- Add a **worktree-prep step** before launch: `git -C <repo> worktree add
+  <tmp>/<job-id> -b aq/<job-id> origin/main`, then set the job `cwd: <tmp>/<job-id>`.
+- Set `yolo: false` in job front-matter unless the `cwd` is a disposable
+  sandbox.
+- On completion, push `aq/<job-id>` and open a PR; remove the worktree
+  (`git worktree remove`) once merged.
+
+## Pre-flight checklist (per job)
+
+- [ ] `cwd` is a dedicated worktree/clone, not a canonical checkout
+- [ ] dedicated branch off latest `origin/main`
+- [ ] `yolo: false` unless sandboxed/disposable
+- [ ] starts from a clean tree
+- [ ] tests/lint/typecheck run before commit
+- [ ] pushes a branch + PR (no direct shared-`main` pushes)
+- [ ] no runtime/queue state or secrets committed
--- a/agent-queue/docs/jobs/dependabot-triage.md
+++ b/agent-queue/docs/jobs/dependabot-triage.md
@ -0,0 +1,86 @@
+---
+engine: devin
+cwd: /Users/sd9235/code/mygh/learning_ai_common_plat
+yolo: true
+lock: common-plat-dependabot
+timeout: 4h
+---
+
+ROLE: Senior platform engineer. TRIAGE the open Dependabot dependency-update PRs in
+`learning_ai_common_plat`, verify each one builds + tests green against CURRENT main,
+and MERGE only the safe ones. This is a maintenance sweep — be conservative: a green
+verify gate is the bar for merging; anything that fails, conflicts, or is a risky major
+bump gets left open with a clear note. NEVER weaken or skip a test to make a PR pass.
+
+PARALLEL-SAFETY: Other Devins may be running in this repo and in learning_ai_devops_tools
+on gigafactory `fleet` work. You touch ONLY dependency manifests + lockfile (plus workflow
+YAML for the actions bumps) as Dependabot already changed them — do NOT edit application
+source. If a Dependabot branch conflicts with main on anything other than package.json /
+pnpm-lock.yaml, SKIP it (leave open, note why) rather than hand-resolving source conflicts.
+
+THE BRANCHES (each is one open PR, ahead of main by ~1 commit):
+- dependabot/npm_and_yarn/azure/cosmos-4.9.2
+- dependabot/npm_and_yarn/fastify/cors-11.2.0
+- dependabot/npm_and_yarn/happy-dom-20.8.4
+- dependabot/npm_and_yarn/jose-6.2.2
+- dependabot/npm_and_yarn/lint-staged-16.4.0
+- dependabot/npm_and_yarn/multi-6d7db9f379   (a grouped multi-package bump)
+- dependabot/npm_and_yarn/react-dom-19.2.4
+- dependabot/npm_and_yarn/stripe-20.4.1
+- dependabot/npm_and_yarn/types/node-25.5.0
+- dependabot/npm_and_yarn/typescript-eslint/parser-8.57.1
+- dependabot/github_actions/actions/checkout-6
+- dependabot/github_actions/actions/setup-node-6
+- dependabot/github_actions/actions/setup-python-6
+(Re-list with `git branch -r | grep dependabot` in case the set changed.)
+
+PER-PR PROCEDURE (do each in an ISOLATED worktree off CURRENT origin/main so the main
+checkout + other Devins are never disturbed):
+1. `git fetch origin --prune`; create a temp worktree at origin/main; merge the dependabot
+   branch into it (`--no-commit --no-ff`).
+   - If the merge touches ANY file other than package.json / pnpm-lock.yaml /
+     .github/workflows/* -> ABORT, classify SKIP (unexpected scope), note it.
+   - If it conflicts -> ABORT, classify SKIP (conflicts main), note it.
+2. Identify the bump TYPE from the version delta (semver): patch / minor / major.
+3. Run the VERIFY GATE in the merged worktree:
+   - `pnpm install --frozen-lockfile` (must succeed with the bumped lockfile)
+   - `pnpm build`
+   - `pnpm test`
+   - For react-dom: also run the dashboards' web tests if they have their own suite.
+   - GitHub-actions bumps (checkout/setup-node/setup-python): no pnpm gate; just confirm
+     the workflow YAML still parses and the action major is supported by our runners.
+4. CLASSIFY:
+   - MERGE if: scope is only manifests/lockfile/workflow, no conflicts, verify gate fully
+     green. (Patch/minor with green gate = merge. A MAJOR bump may merge ONLY if the gate
+     is green AND nothing in our code uses a removed/changed API — if unsure, HOLD.)
+   - HOLD (leave open) if: gate fails, major bump with any ambiguity, or behavioral risk
+     (e.g. stripe / jose / react-dom majors that need a human eye).
+   - SKIP if: conflicts main or touches unexpected files.
+5. To MERGE: merge the branch into main with `--no-ff` (first parent = main), message
+   `chore(deps): <package> <old> -> <new> (#<pr>)`, push origin HEAD:main, then delete the
+   dependabot branch. Re-fetch main before the NEXT PR so each builds on the latest (avoids
+   lockfile churn between merges). Do the LOW-RISK ones first (types/node, lint-staged,
+   happy-dom, the actions bumps), majors last.
+
+CONSTRAINTS: no app-source edits; never modify/skip tests; ESM repo conventions; conventional
+commits (chore(deps): ...); do not touch the gigafactory `fleet` modules; do not delete
+backup/* branches; leave the gigafactory + hermes branches alone. Stay entirely in isolated
+worktrees; clean every worktree up afterward (`git worktree remove --force` + `prune`).
+
+VERIFY GATE (per merged PR, must be green to merge):
+- pnpm install --frozen-lockfile && pnpm build && pnpm test  (no regression)
+
+FINAL OUTPUT — report in EXACTLY this format:
+## Dependency Triage Report — common-plat Dependabot
+### Summary table
+| PR / package | old -> new | bump | verify gate | decision |
+(one row per branch: MERGE / HOLD / SKIP)
+### Merged (pushed to main)
+- <package> <old->new> (#pr) — commit <sha>
+### Held open (with reason)
+- <package> — <why: failing gate / major risk / needs human>
+### Skipped (with reason)
+- <package> — <conflicts main / unexpected scope>
+### Verify gate results (build/test summary per merged PR)
+### Branches deleted
+### Anything that needs a human decision
--- a/agent-queue/docs/jobs/phase1-slice1.md
+++ b/agent-queue/docs/jobs/phase1-slice1.md
@ -0,0 +1,101 @@
+---
+engine: devin
+cwd: /Users/sd9235/code/mygh/learning_ai_devops_tools
+yolo: true
+lock: devops-tools
+timeout: 3h
+---
+
+ROLE: Senior engineer. Implement Phase 1 — Slice 1 of the Agent Gigafactory roadmap.
+
+SOURCE OF TRUTH: agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md (read §4, §5, §6, §7, §14 Phase 1
+first). This slice implements ONLY the items listed below.
+
+STRICT SCOPE:
+- Edit ONLY files under agent-queue/ (primarily agent-queue.sh, selftest.sh, README.md,
+  docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md). DO NOT touch any other repo.
+- DO NOT modify, move, or delete anything under agent-queue/queue/ — there are LIVE jobs
+  running there. DO NOT run `agent-queue.sh run`. selftest.sh uses its own temp queue
+  (AGENT_QUEUE_ROOT) — that is the only execution allowed.
+- This is bash (single host). No service/Cosmos/Zod work in this slice (that is Phase 2).
+
+DELIVERABLES (in agent-queue.sh, backward-compatible — legacy .md files with only
+engine/cwd/yolo MUST behave exactly as today):
+
+1. MANIFEST PARSING: recognize these new frontmatter keys with safe defaults via the existing
+   fm_get pattern: profile, engine-class, capabilities, prefers, priority, budget, deps,
+   deps-mode, idempotency-key, retry, review-policy, artifacts, tracker-item. In THIS slice
+   only items 2–5 are functional; the rest must be parsed + stored in the job .meta and shown
+   in `status`, but otherwise inert (document as "reserved, no-op until later phase").
+
+2. PRIORITY ORDERING: replace pure-FIFO inbox selection with priority-then-age.
+   priority in {critical,high,medium,low} (default medium). Higher priority picked first; ties
+   broken by oldest timestamp. Must not break per-lock serialization.
+
+3. CAPABILITY GRAMMAR + MATCH (single-host):
+   - detect_capabilities(): emit tokens for this host — os:<mac|linux>,
+     engine:<devin|claude|codex|copilot present>, node>=<major>, has:<tool> for a small probe
+     set (git, pnpm, docker if present).
+   - caps_match(required[], available[]) honoring §5 grammar: bare `key` = presence;
+     `key:value` exact; `key<op>version` with op in {>=,>,=,<=,<} (numeric/semver-major
+     compare); `os:any` = wildcard match-all. A job matches iff EVERY required token is satisfied.
+   - At run time, if a job declares `capabilities` the host does not satisfy, move the job to
+     failed/ with result=capability_mismatch and a clear log line (do NOT launch the agent).
+
+4. ENGINE-CLASS RESOLUTION: if `engine` is unset but `engine-class` is set, pick a concrete
+   engine from a documented class map honoring `prefers-engine` then availability:
+   agentic-coder -> [devin, claude, codex]; chat-coder -> [copilot]. Explicit `engine` always
+   wins. If neither yields an available binary, fail the job with result=no_engine.
+
+5. IDEMPOTENCY-KEY DEDUPE (on `add`): compute a content hash of the stripped body. If an
+   existing job in ANY stage (inbox/building/review/testing/shipped) has the same
+   idempotency-key AND same hash -> no-op (log "duplicate, skipped"). Same key + DIFFERENT hash
+   -> reject with a clear error UNLESS the prior job is still in inbox/ (then replace it).
+
+TESTS (selftest.sh — tests are sacred; only ADD, never weaken existing ones). Add cases:
+- backward-compat: a legacy engine/cwd/yolo-only .md still completes and lands in review/.
+- priority: with max=1, a `critical` job queued after a `low` job runs first.
+- capability mismatch: a job requiring `has:definitely-not-installed` -> failed/
+  result=capability_mismatch (agent never launched; use the existing no-op engine stub).
+- engine-class: a job with `engine-class: agentic-coder` and no `engine`, DEVIN_BIN stubbed,
+  runs and lands in review/.
+- idempotency: adding the same key+body twice yields exactly one inbox file; same key +
+  different body is rejected.
+
+DOCS:
+- README.md frontmatter table: add the new fields, clearly marking ACTIVE (Phase 1) vs RESERVED.
+- docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md: tick ONLY the Phase 1 checklist boxes you fully completed and
+  update the §0 progress % for Phase 1 (do not tick incomplete items).
+
+CONSTRAINTS:
+- bash style consistent with the existing script; no new runtime dependencies; macOS + Linux
+  safe (no GNU-only flags without fallback). No emojis in code. No leftover debug echo noise.
+- Work on a NEW branch: feat/gigafactory-p1-slice1. Commit in logical steps with conventional
+  commit messages. Push the branch and open a PR. DO NOT merge to main (human gate).
+
+VERIFY GATE (must pass before finishing):
+- bash agent-queue/selftest.sh  -> MUST be fully green (existing + new cases).
+- bash -n agent-queue/agent-queue.sh  and  node --check agent-queue/dashboard.mjs.
+
+FINAL OUTPUT — print an implementation report in EXACTLY this format:
+
+## Implementation Report — Phase 1 Slice 1
+### Branch & commits
+- branch: <name>
+- commits: <sha> <message> (one per line)
+- PR: <url or "opened, not merged">
+### Files changed
+- <path>: <one-line summary>
+### What was implemented (per deliverable 1-5)
+- <deliverable>: <how, key functions added/changed>
+### Tests added
+- <test name>: <what it asserts>  (plus selftest.sh PASS/FAIL summary)
+### Verify gate results
+- selftest.sh: <PASS/FAIL + counts>
+- bash -n / node --check: <result>
+### Deviations / assumptions
+- <anything changed from spec and why>
+### Reserved (parsed-but-inert) fields
+- <list fields parsed but no-op this slice>
+### Suggested next slice
+- <what should come next>
--- a/agent-queue/docs/jobs/phase1-slice2.md
+++ b/agent-queue/docs/jobs/phase1-slice2.md
@ -0,0 +1,109 @@
+---
+engine: devin
+cwd: /Users/sd9235/code/mygh/learning_ai_devops_tools
+yolo: true
+lock: devops-tools
+timeout: 3h
+---
+
+ROLE: Senior engineer. Implement Phase 1 — Slice 2 (Profiles + deps/DAG, single host).
+
+SOURCE OF TRUTH: agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md (read §5 deps, §6 profiles,
+§14 Phase 1). This slice implements ONLY the items below.
+
+PREREQUISITE / BRANCHING:
+- Slice 1 (manifest/priority/capabilities/engine-class/idempotency) AND Slice 3
+  (resilience: orphan recovery, WIP checkpoint/resume, retry, insights) are BOTH
+  already merged into `main`. Branch off the CURRENT `main`.
+- Do NOT duplicate, revert, or break any Slice 1 or Slice 3 code or tests — the
+  existing selftest cases (34 checks) MUST stay green (regression).
+- New branch: feat/gigafactory-p1-slice2. Push + open a PR. DO NOT merge.
+
+STRICT SCOPE:
+- Edit ONLY under agent-queue/ (agent-queue.sh, selftest.sh, README.md, new
+  profiles/ dir, docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md). No other repo.
+- DO NOT modify/delete anything under agent-queue/queue/ (live jobs). DO NOT run
+  `agent-queue.sh run`. selftest.sh uses its own temp AGENT_QUEUE_ROOT only.
+- bash, single host. No service/Cosmos work (that is Phase 2).
+
+A. PROFILES (§6)
+1. Create agent-queue/profiles/ with a starter catalog as profiles/<name>.md:
+   developer, backend-engineer, frontend-engineer, ux-designer, ui-designer, qa,
+   reviewer, docs-writer (and a reserved `planner`). Each has frontmatter:
+   name, persona (multi-line block), capabilities, default-verify, engine-class,
+   prefers-engine, allowed-scope, review-policy.
+2. Profile resolution: when a job sets `profile: X`, inherit any of
+   {verify, capabilities, engine-class, prefers-engine, allowed-scope,
+   review-policy} that the job OMITS. Job-level fields ALWAYS override the profile.
+   Resolution runs BEFORE the capability gate + engine resolution so inherited
+   caps/engine-class take effect.
+3. Persona injection: prepend the profile's persona to the job body in the
+   stripped body file fed to the engine. Never write secrets to logs.
+4. allowed-scope guardrail — WARN-ONLY this phase: after the run, if cwd is a git
+   repo, compute changed paths and log a WARNING for any path outside the
+   allowed-scope globs. Non-blocking (do NOT fail the job). Expose the scope-check
+   as a unit-testable function.
+5. Document the resolution precedence (job > profile > built-in default).
+
+B. DEPS / DAG — single host (§5)
+6. deps reference other jobs by `idempotency-key` (stable, author-controlled). A
+   dep is satisfied when a job with that key is in shipped/ (default), or in
+   shipped/ OR testing/ when the dependent job sets `deps-mode: soft`.
+7. A job with unmet deps is BLOCKED: not selected to run, surfaced in `status` as
+   "blocked (waiting on <keys>)". Implement as a skip-with-reason in inbox
+   selection (like the busy-lock skip) — do NOT launch, do NOT move to failed.
+   Re-evaluated every run loop; becomes runnable once deps are satisfied.
+8. Submit-time cycle detection on `add`: build the dep graph from idempotency-keys
+   across inbox + active stages; reject (die, nonzero) if the new job would create
+   a cycle.
+9. No cross-machine deps (that is P2).
+
+TESTS (selftest.sh — tests are sacred; only ADD):
+- profile inherit verify: a profile whose default-verify is `false` → a job using
+  it (no own verify) routes to failed/; a profile with default-verify `true` →
+  testing/.
+- persona injection (golden): the body fed to the engine begins with the profile
+  persona (capture via a stub that copies its --prompt-file to a sentinel).
+- profile caps inheritance: job omitting capabilities inherits the profile's →
+  unmet → failed/ result=capability_mismatch.
+- allowed-scope warn: an out-of-scope change logs a WARNING and the job still
+  succeeds (or assert the scope-check function directly).
+- deps block→run: job B deps:[keyA] stays blocked while A is unshipped; once A is
+  in shipped/, B becomes runnable and completes.
+- deps-mode soft: dep satisfied when the dependency is in testing/.
+- cycle detection: adding A deps:[keyB] while B deps:[keyA] is rejected.
+
+DOCS:
+- README: profiles section (catalog + resolution precedence) + deps/blocked
+  semantics.
+- docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md: tick the §6 boxes you fully completed and the §5
+  `deps` box; bump §0 Phase 1 %.
+
+CONSTRAINTS: bash style consistent with the existing script; no new runtime deps;
+macOS + Linux safe; no emojis in code; no leftover debug noise; conventional
+commits.
+
+VERIFY GATE (must pass):
+- bash agent-queue/selftest.sh → fully green (existing + new).
+- bash -n agent-queue/agent-queue.sh ; node --check agent-queue/dashboard.mjs.
+
+FINAL OUTPUT — print the implementation report in EXACTLY this format:
+
+## Implementation Report — Phase 1 Slice 2
+### Branch & commits
+- branch / based-on: <name> (based on main | feat/gigafactory-p1-slice1)
+- commits: <sha> <message> (one per line)
+- PR: <url or "opened, not merged">
+### Files changed
+- <path>: <one-line summary>
+### What was implemented (A1-5, B6-9)
+- <item>: <how, key functions added/changed>
+### Tests added
+- <test name>: <what it asserts>  (plus selftest.sh PASS/FAIL summary)
+### Verify gate results
+- selftest.sh: <PASS/FAIL + counts>
+- bash -n / node --check: <result>
+### Deviations / assumptions
+- <anything changed from spec and why>
+### Suggested next slice
+- <what should come next (likely: tracker adapter aq from-tracker/to-tracker)>
--- a/agent-queue/docs/jobs/phase1-slice3.md
+++ b/agent-queue/docs/jobs/phase1-slice3.md
@ -0,0 +1,168 @@
+---
+engine: devin
+cwd: /Users/sd9235/code/mygh/learning_ai_devops_tools
+yolo: true
+lock: devops-tools
+timeout: 4h
+---
+
+ROLE: Senior engineer. Implement Phase 1 — Slice 3: RESILIENCE & INSIGHTS (single host).
+This is a LARGE, fully self-contained slice (git + log parsing only — NO network,
+NO external service, NO credentials) so it runs end-to-end without blockers.
+
+SOURCE OF TRUTH: agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md (read §11 lifecycle/retry,
+§25 durability/crash-recovery, §26 execution insights, §17 observability, §14 Phase 1).
+Implement the SINGLE-HOST bash equivalents of §25 and §26.
+
+PREREQUISITE / BRANCHING:
+- Builds on Slice 1 (PR #1, branch feat/gigafactory-p1-slice1).
+- Base on `main` IF PR #1 (and PR #2 if present) are merged; otherwise branch off
+  feat/gigafactory-p1-slice1. Do NOT revert or duplicate earlier slice code.
+- This slice is INDEPENDENT of Slice 2 (profiles/deps) — do not depend on it.
+- New branch: feat/gigafactory-p1-slice3. Commit in logical steps, push, open a PR.
+  DO NOT merge (human gate).
+
+STRICT SCOPE:
+- Edit ONLY under agent-queue/ (agent-queue.sh, selftest.sh, README.md,
+  docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md). No other repo.
+- DO NOT modify/delete anything under agent-queue/queue/ (live jobs). DO NOT run
+  `agent-queue.sh run` against the real queue. selftest.sh uses its own temp
+  AGENT_QUEUE_ROOT and temp git repos only.
+- bash, single host, macOS + Linux safe, zero new runtime deps.
+
+==================================================================
+A. CRASH RECOVERY & WORK PRESERVATION (single-host §25)
+==================================================================
+A1. ORPHAN RECOVERY: On `run` startup (and at the top of each run loop), detect
+    jobs stuck in building/ whose worker is no longer alive — i.e. the meta has a
+    `pid=` whose process is dead (a `pidstart` mismatch guards against PID reuse), or no
+    live pid at all. Such a job is an ORPHAN from a previous crash/power-off.
+    Recover it deterministically (never lose or strand it):
+      - increment an `attempts=` counter in the meta,
+      - log a clear recovery line,
+      - move it back to inbox/ for re-selection (subject to retry policy A3),
+      - recovery MUST be idempotent (running it twice recovers once).
+
+A2. WIP CHECKPOINTING (work preservation): when a job's `cwd` is inside a git repo,
+    the worker preserves partial work on a dedicated branch so a crash never loses it:
+      - at START: ensure/create branch `aq/wip/<job>` (from current HEAD), record
+        `wip_branch=` + `wip_base=` in meta. NEVER touch main/protected branches.
+      - on EVERY exit path (success, failure, timeout, signal/trap): commit any
+        changes in cwd to `aq/wip/<job>` with a message like
+        "aq wip: <job> (<stage/exit>)" and record `wip_commit=` in meta.
+      - use a trap so even SIGTERM/SIGINT/timeout still checkpoints.
+      - if cwd is NOT a git repo: skip cleanly (log "wip: cwd not a git repo").
+    RESUME: when an orphan/retry of a job whose `aq/wip/<job>` branch exists is
+    relaunched, check out / fast-forward that branch first so the agent continues
+    from the checkpoint instead of from zero. Document the resume behavior.
+
+A3. RETRY POLICY (make the reserved `retry` field FUNCTIONAL):
+    parse `retry: { max: N, backoff: 5m, on: [timeout, verify_failed, crash] }`.
+    On a failure whose class is in `on` (agent rc!=0 => crash/agent_error,
+    timeout => timeout, verify fail => verify_failed), requeue to inbox/ with the
+    backoff delay honored (record `next_eligible=` epoch; selection skips until
+    then) up to `max` attempts. On exhaustion → failed/ with
+    result=retries_exhausted (single-host stand-in for dead_letter), preserving the
+    wip branch + full diagnostics in the log. Default when `retry` absent = no
+    retry (current behavior).
+
+A4. STATE INTEGRITY: keep all meta writes append-only (as today); never truncate a
+    live meta. Recovery/retry/backoff bookkeeping must be crash-safe (re-derivable
+    from meta + folder location).
+
+==================================================================
+B. EXECUTION INSIGHTS & TOKEN ACCOUNTING (single-host §26)
+==================================================================
+B1. PER-RUN METRICS: on completion, record into the job meta:
+      duration_s, exit, result, attempts, and repo deltas for the run —
+      files_changed, lines_added, lines_deleted (from `git -C <cwd> diff --numstat`
+      against wip_base, or against HEAD~ if applicable).
+B2. TOKEN/COST CAPTURE (best-effort, honest): add a single extensible adapter
+      `parse_usage <engine> <logfile>` that extracts, when present in the engine's
+      output: model, tokens_in, tokens_out, tokens_cached, cost_usd, turns,
+      tool_calls. Where the engine does not expose usage, omit the field or set an
+      `estimated=true` marker — DO NOT fabricate precise numbers. Centralize all
+      per-engine patterns in this one function (devin/claude/codex/copilot stubs;
+      real patterns where known, TODO-commented otherwise).
+B3. SURFACE in `status`: add an insights sub-line per finished/running job
+      (duration, attempts, tokens/cost if known, +/- lines).
+B4. NEW COMMAND `aq insights [job]`:
+      - with a job id: print that job's full metrics.
+      - without: print a table of recent finished jobs + an AGGREGATE rollup by
+        engine (total tokens, total cost (mark if any estimated), job count,
+        success rate, avg duration).
+B5. dashboard.mjs: surface a compact insights column/panel (tokens or cost +
+      attempts) for finished jobs. Keep it read-only from meta (agent-queue.sh
+      stays the single source of truth).
+B6. PRIVACY: never write prompt content or secrets into meta/insights/logs beyond
+      what already exists.
+
+==================================================================
+TESTS (selftest.sh — tests are sacred; only ADD; use temp git repos + stubs)
+==================================================================
+- orphan recovery: craft a building/ job whose meta pid is a dead PID → a `run`
+  startup recovers it to inbox/ with attempts incremented; running recovery twice
+  recovers exactly once.
+- wip checkpoint (git): job with a git-repo cwd that creates a file → after the
+  run, branch aq/wip/<job> exists and contains a commit with the change; main
+  branch untouched. Non-git cwd → skipped cleanly (no error).
+- wip resume: a recovered job whose aq/wip/<job> has a prior commit → the relaunch
+  checks out that branch (assert HEAD is on aq/wip/<job> when the agent runs).
+- retry policy: verify-fail job with retry.max=1 on=[verify_failed] → requeued once
+  (attempts=2) then → failed/ result=retries_exhausted; backoff next_eligible
+  respected (job not picked before its delay — use a tiny backoff like 1s).
+- retry on crash: agent rc!=0 with on=[crash] retries; without `crash` in `on`,
+  it goes straight to failed/ (no retry).
+- insights parse: feed a stub engine log containing a known usage line →
+  parse_usage extracts tokens/cost into meta; `aq insights <job>` prints them;
+  a no-usage log → fields omitted/estimated, no crash.
+- insights aggregate: two finished jobs → `aq insights` prints a per-engine rollup
+  with correct totals + success rate.
+- numstat deltas: a run that adds N lines → lines_added recorded.
+- REGRESSION: all existing selftest cases (Slice 0 + Slice 1) still green.
+
+==================================================================
+DOCS
+==================================================================
+- README: new "Resilience" section (orphan recovery, WIP checkpoint/resume, retry)
+  and "Insights" section (metrics, `aq insights`, token caveat) + document the
+  `retry` frontmatter (now active) and the new result= values
+  (retries_exhausted). Update the manifest table: move `retry` from RESERVED to ACTIVE.
+- docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md: tick the single-host items you fully completed in
+  §11 (retry/dead-letter stand-in), §25 (orphan/WIP/retry — note "single-host
+  subset"), §26 (capture/insights — single-host subset); bump §0 Phase 1 %.
+
+==================================================================
+CONSTRAINTS
+==================================================================
+- bash style consistent with the existing script; no new runtime deps; mac+linux
+  safe (no GNU-only flags without a fallback — note macOS has BSD date/stat);
+  no emojis in code; no leftover debug noise; conventional commits.
+- Be careful with `set -euo pipefail` + traps so the WIP-on-exit checkpoint always
+  runs even on failure/timeout.
+
+VERIFY GATE (must pass before finishing):
+- bash agent-queue/selftest.sh → fully green (existing + all new cases).
+- bash -n agent-queue/agent-queue.sh ; node --check agent-queue/dashboard.mjs.
+- shellcheck --severity=error agent-queue/agent-queue.sh (if available) → clean.
+
+FINAL OUTPUT — print the implementation report in EXACTLY this format:
+
+## Implementation Report — Phase 1 Slice 3
+### Branch & commits
+- branch / based-on: <name> (based on main | feat/gigafactory-p1-slice1)
+- commits: <sha> <message> (one per line)
+- PR: <url or "opened, not merged">
+### Files changed
+- <path>: <one-line summary>
+### What was implemented (A1-A4, B1-B6)
+- <item>: <how, key functions added/changed>
+### Tests added
+- <test name>: <what it asserts>  (plus selftest.sh PASS/FAIL summary)
+### Verify gate results
+- selftest.sh: <PASS/FAIL + counts>
+- bash -n / node --check / shellcheck: <result>
+### Deviations / assumptions
+- <anything changed from spec and why; which engines have real token parsing vs TODO>
+### Suggested next slice
+- <what should come next (likely: tracker adapter aq from-tracker/to-tracker, P2)>
--- a/agent-queue/docs/jobs/phase1-slice4.md
+++ b/agent-queue/docs/jobs/phase1-slice4.md
@ -0,0 +1,125 @@
+---
+engine: devin
+cwd: /Users/sd9235/code/mygh/learning_ai_devops_tools
+yolo: true
+lock: devops-tools
+timeout: 3h
+---
+
+ROLE: Senior engineer. Implement Phase 1 — Slice 4: TRACKER ADAPTER (single host).
+This CLOSES Phase 1: a task in the tracker can become a job, and job outcomes echo
+back to the tracker — the task<->job round-trip (§10, the last Phase-1 §14 item).
+
+SOURCE OF TRUTH: agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md (read §10 tracker
+integration, §5 manifest incl. tracker-item + idempotency-key, §24.5 one-way echo).
+
+PREREQUISITE / BRANCHING:
+- Requires Slice 1, Slice 3, AND Slice 2 (profiles/deps) to be merged into `main`.
+  Branch off the CURRENT `main`. This slice MUST run AFTER Slice 2 is merged (it
+  shares agent-queue.sh) — do not start it until then.
+- New branch: feat/gigafactory-p1-slice4. Push + open a PR. DO NOT merge.
+- Keep ALL existing selftest checks green (regression).
+
+STRICT SCOPE:
+- Edit ONLY under agent-queue/ (agent-queue.sh, selftest.sh, README.md,
+  docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md). No other repo is modified.
+- You MAY READ (not edit) ../learning_ai_common_plat/services/platform-service/
+  src/modules/items/{types,routes}.ts to match the real Item API contract
+  (paths, fields, auth header). Do not change that repo.
+- bash, single host, mac+linux safe, zero new runtime deps (curl only).
+
+CONFIG (all via env; document in README; never hardcode URLs/tokens/secrets):
+- AQ_TRACKER_API   : base URL of the items API (default http://localhost:4003).
+- AQ_TRACKER_TOKEN : bearer token for auth (required for real calls).
+- AQ_PRODUCT_ID    : productId to stamp/filter (every tracker Item has productId).
+- A single `tracker_api <method> <path> [json]` wrapper does ALL HTTP via curl
+  (bearer header, content-type, base URL). It MUST be overridable for tests via
+  AQ_TRACKER_API_CMD (a stub script path) so selftest needs NO live service.
+
+DELIVERABLES
+
+1. `aq from-tracker <ITEM_ID>` — pull a tracker Item and materialize a job in inbox/:
+   - GET the item via tracker_api; map fields → job frontmatter:
+       title/description     -> job body (the instruction markdown, verbatim)
+       item type/labels      -> engine-class/profile/capabilities/priority where
+                                labels carry them (e.g. label `engine-class:agentic-coder`,
+                                `profile:backend-engineer`, `priority:high`,
+                                `cap:os:mac`); otherwise sane defaults.
+       item id               -> `tracker-item: <ITEM_ID>` and
+                                `idempotency-key: tracker-<ITEM_ID>` (stable).
+   - IDEMPOTENT: if a job for this tracker-item already exists in any stage
+     (reuse Slice 1 idempotency on the derived key) → no duplicate enqueue.
+   - On success print the created inbox filename; on missing item → clear error, nonzero.
+
+2. `aq to-tracker <job>` — push a job's CURRENT outcome to its tracker Item
+   (one-way echo, child -> tracker; §24.5). Only if the job meta has tracker-item.
+   - Map stage/result -> item status PATCH:
+       building/review/testing -> in_progress
+       shipped                 -> done
+       failed                  -> blocked (or the API's failure status) + note
+   - Post a comment/note with result, attempts, and insights summary
+     (duration, tokens/cost if present) — reuse Slice 3 metrics. Metrics only,
+     NEVER prompt content or secrets.
+   - IDEMPOTENT: re-running to-tracker for an unchanged outcome is a no-op
+     (track last-echoed state in meta, e.g. `tracker_echoed=<status>`).
+
+3. Auto-echo hook (opt-in, default OFF): an env flag (e.g. AQ_TRACKER_AUTO=1)
+   makes the worker call `to-tracker` automatically on each stage transition it
+   already performs (enqueue→building→review/testing/failed/shipped). When OFF,
+   echo is manual via the command. Never block/fail a job because an echo failed —
+   log the echo error and continue (the tracker is downstream, not authoritative
+   for execution).
+
+4. `status` / `aq insights`: show the tracker-item id and last echoed status where
+   present (you already surface tracker-item in status from Slice 1 — extend it).
+
+TESTS (selftest.sh — only ADD; NO live service — use AQ_TRACKER_API_CMD stub that
+returns canned JSON and records the calls it received):
+- from-tracker creates an inbox job: stub returns an item JSON →
+  `aq from-tracker T-1` creates one inbox/*.md whose frontmatter has
+  tracker-item: T-1 and idempotency-key: tracker-T-1, body = item description.
+- from-tracker label mapping: item with labels [engine-class:agentic-coder,
+  priority:high] → frontmatter reflects them.
+- from-tracker idempotent: calling it twice for T-1 → exactly one job (dedupe).
+- to-tracker status echo: a shipped job → stub receives a PATCH to status=done and
+  a comment with the insights summary; assert no prompt body is sent.
+- to-tracker idempotent: second call with unchanged outcome → no duplicate
+  PATCH/comment (tracker_echoed honored).
+- echo failure is non-fatal: stub returns HTTP 500 → `to-tracker` logs the error,
+  exits without corrupting job state; the job's stage is unchanged.
+- REGRESSION: all existing checks (Slice 0/1/2/3) still green.
+
+DOCS:
+- README: "Tracker integration" section — from-tracker/to-tracker, the env config,
+  label→manifest mapping table, the one-way-echo rule, AQ_TRACKER_AUTO, and a note
+  that real use needs platform-service running + a token.
+- docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md: tick the §10 single-host items + the §14 Phase-1
+  "tracker adapter" item; set §0 Phase 1 → complete (or note the exact remaining %).
+
+CONSTRAINTS: bash style consistent with the script; curl-only HTTP through the one
+wrapper; mac+linux safe; no emojis; conventional commits; tests sacred.
+
+VERIFY GATE: bash agent-queue/selftest.sh fully green; bash -n agent-queue/agent-queue.sh;
+node --check agent-queue/dashboard.mjs; shellcheck --severity=error agent-queue/agent-queue.sh clean.
+
+FINAL OUTPUT — print the report in EXACTLY this format:
+
+## Implementation Report — Phase 1 Slice 4
+### Branch & commits
+- branch / based-on: <name>
+- commits: <sha> <message>
+- PR: <url or "opened, not merged">
+### Files changed
+- <path>: <one-line summary>
+### What was implemented (1-4)
+- <item>: <how, key functions; the Item API contract you matched>
+### Tests added
+- <test name>: <what it asserts>  (+ selftest PASS/FAIL summary)
+### Verify gate results
+- selftest / bash -n / node --check / shellcheck: <results>
+### Deviations / assumptions
+- <API path/field/status mapping choices; anything stubbed>
+### Phase 1 status
+- <which §14 items now complete; what (if anything) remains>
+### Suggested next slice
+- Phase 2 Slice 1 (fleet data model + repositories in platform-service)
--- a/agent-queue/docs/jobs/phase2-artifacts.md
+++ b/agent-queue/docs/jobs/phase2-artifacts.md
@ -0,0 +1,86 @@
+---
+engine: devin
+cwd: /Users/sd9235/code/mygh/learning_ai_common_plat
+yolo: true
+lock: common-plat-artifacts
+timeout: 4h
+---
+
+ROLE: Senior backend engineer. Implement FLEET ARTIFACTS + BLOB WIRING (§13 leftover):
+large run outputs (logs, coverage, screenshots, build output) are stored in blob
+storage and only POINTERS (with size/content-type/SAS) live in the `fleet_artifacts`
+Cosmos container — NEVER inline in Cosmos (doc-size + RU limits).
+
+PARALLEL-SAFETY (two other Devins are running — DO NOT collide):
+- You OWN the fleet_artifacts surface: types.ts (artifact schema only), repository.ts
+  (artifact repo only), routes.ts (artifact endpoints only), cosmos-init.ts (only if the
+  fleet_artifacts container needs registration), and a NEW artifacts.test.ts.
+- You MUST NOT touch: coordinator.ts, coordinator.test.ts, scheduler.ts (another Devin owns
+  the scheduler + claim ranking). Keep your edits to types/repository/routes additive and
+  localized to the artifact pieces — do not refactor the job/lease/claim code.
+- A third Devin is in a different repo (agent-queue) — no overlap.
+
+READ FIRST:
+- services/platform-service/src/modules/fleet/types.ts — find FleetArtifactDoc (the
+  foundation may already declare it, pk /jobId). repository.ts — see if an artifacts repo
+  already exists; extend, don't duplicate. cosmos-init.ts — see if fleet_artifacts is
+  already registered.
+- packages/blob (@bytelyst/blob) — the Azure Blob client + SAS token helpers. Learn the
+  exact API (upload, container/key conventions, SAS generation, the memory/dev fallback).
+  Use it the same way other consumers do (grep for existing @bytelyst/blob usage).
+- ../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §13 (fleet_artifacts
+  bullet) + §26 (insights/artifacts).
+
+PREREQUISITE / BRANCHING: branch off CURRENT main → feat/gigafactory-p2-artifacts.
+Push + open PR. DO NOT merge.
+
+DELIVERABLES
+1. FleetArtifactDoc (in types.ts — confirm/extend): { id, productId, jobId, runId?, kind
+   ('log'|'coverage'|'screenshot'|'build'|'other'), blobKey, contentType, sizeBytes,
+   sha256?, createdAt }. Zod schema → inferred type. productId on the doc.
+2. repository.ts — artifacts repo: createArtifact, listArtifactsByJob(jobId),
+   getArtifact(id, productId), deleteArtifact. Single-partition (pk /jobId). Do not touch
+   the job/lease/run repos beyond importing shared helpers.
+3. Blob integration (a small artifacts service fn, e.g. in a NEW
+   modules/fleet/artifacts-blob.ts): uploadArtifact(jobId, kind, bytes/stream, contentType)
+   → stores in @bytelyst/blob under a deterministic key
+   (`fleet/<productId>/<jobId>/<id>-<kind>`), returns the persisted FleetArtifactDoc with a
+   short-lived SAS read URL. getArtifactDownload(id) → re-issues a SAS URL. Large content
+   NEVER goes into Cosmos.
+4. routes.ts — guarded endpoints (auth + productId, Zod-validated), additive only:
+   POST   /fleet/jobs/:id/artifacts        (multipart or base64 body → upload + pointer)
+   GET    /fleet/jobs/:id/artifacts        (list pointers)
+   GET    /fleet/artifacts/:artifactId     (pointer + fresh SAS download URL)
+   DELETE /fleet/artifacts/:artifactId
+   Register exactly like the existing fleet routes (do not reorder/rewrite the others).
+
+TESTS (artifacts.test.ts — memory blob + memory datastore; tests are sacred):
+- upload → a fleet_artifacts pointer doc is created with productId, blobKey, sizeBytes,
+  contentType; the bytes live in blob, NOT in the Cosmos doc (assert the doc has no inline
+  payload field).
+- list by job returns only that job's artifacts (partition isolation).
+- get returns a (fresh) SAS download URL; a large payload (> a Cosmos-safe threshold) still
+  succeeds (proves blob offload).
+- delete removes the pointer (and blob if your helper does so).
+- routes via fastify inject: upload/list/get/delete; auth + productId enforced; invalid body
+  → 400; unknown id → 404.
+- existing fleet tests (jobs/leases/claim/events) remain green and untouched.
+
+VERIFY GATE:
+- pnpm --filter @lysnrai/platform-service exec vitest run src/modules/fleet  (all green)
+- pnpm --filter @lysnrai/platform-service build
+- pnpm build && pnpm test  (no consumer regressed)
+
+CONSTRAINTS: ESM .js imports; no any; no console.log; productId on every doc; large logs in
+blob never Cosmos; conventional commits (feat(platform-service): ...); do not touch the files
+reserved for the other Devins; do not edit the agent-queue repo.
+
+FINAL OUTPUT — report in EXACTLY this format:
+## Implementation Report — Fleet Artifacts + Blob Wiring (§13)
+### Branch & commits / PR
+### Files changed
+### What was implemented (artifact schema, blob key scheme, SAS, routes)
+### Tests added (+ pnpm test summary; esp. the "bytes in blob not Cosmos" assertion)
+### Verify gate results
+### Deviations / assumptions (blob API used, dev/memory fallback, SAS TTL)
+### Suggested next slice
--- a/agent-queue/docs/jobs/phase2-atomic-claim-hardening.md
+++ b/agent-queue/docs/jobs/phase2-atomic-claim-hardening.md
@ -0,0 +1,108 @@
+---
+engine: devin
+cwd: /Users/sd9235/code/mygh/learning_ai_common_plat
+yolo: true
+lock: common-plat
+timeout: 3h
+---
+
+ROLE: Senior distributed-systems engineer. P0 HARDENING: make the fleet
+coordinator's job claim TRULY atomic. The Phase 2 Foundation (merged) implements
+the claim as an in-module "rev compare-and-swap" layered over an UNCONDITIONAL
+datastore read-then-replace. Because there are `await` points between the read,
+the rev check, and the write, two CONCURRENT claims can both read the same rev,
+both pass the check, and both write — a DOUBLE-ASSIGNMENT. The existing race test
+only drives the claims SEQUENTIALLY, so it does not catch this. Fix the root cause.
+
+CONTEXT TO READ FIRST:
+- services/platform-service/src/modules/fleet/repository.ts — revUpdateJob /
+  revUpdateLease (the non-atomic read-check-write).
+- services/platform-service/src/modules/fleet/coordinator.ts — tryClaimJob.
+- services/platform-service/src/modules/fleet/coordinator.test.ts — the current
+  (sequential) "atomic claim race" test.
+- packages/datastore — the shared datastore abstraction + its Memory and Cosmos
+  providers. Find the update/replace method and how (if at all) it exposes
+  optimistic concurrency.
+- ../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §4 (atomic
+  claim is THE core contract) + §13 (maps to Cosmos _etag / If-Match).
+
+PREREQUISITE / BRANCHING:
+- Branch off CURRENT `main` (the foundation is already merged). New branch:
+  feat/gigafactory-p2-atomic-claim. Push + open a PR. DO NOT merge.
+
+GOAL: a single-winner claim that holds under TRUE concurrency, backed by a
+server-side conditional write (Cosmos If-Match/_etag) and a process-atomic memory
+implementation — not a best-effort in-module check.
+
+DELIVERABLES
+
+1. ADD an optimistic-concurrency update to the shared datastore (@bytelyst/datastore).
+   This is a legitimate ADDITIVE shared-package feature (NOT a template-managed infra
+   file) — it MUST be backward-compatible and fully tested so existing consumers are
+   unaffected. Suggested API (match the package's existing style/naming):
+     updateIfMatch(id, partitionKey, expected: { etag?: string; rev?: number }, patch)
+       -> { ok: true, doc } | { ok: false, reason: 'conflict' | 'not_found' }
+   - COSMOS provider: perform a conditional replace using the document `_etag` with
+     `accessCondition: { type: 'IfMatch', condition: etag }`; translate Cosmos 412
+     (precondition failed) → { ok:false, reason:'conflict' }. Surface `_etag` on reads
+     so callers can pass it back.
+   - MEMORY provider: implement the get → compare → set with NO `await`/yield between
+     the compare and the set (do it in one synchronous block inside the method) so two
+     concurrent callers CANNOT interleave within the single-threaded event loop. This
+     gives true in-process atomicity. Keep a monotonic rev (or reuse the existing one)
+     as the compare token for parity with Cosmos `_etag`.
+   - Do NOT change existing method signatures; only ADD. Update the provider interface
+     + both providers + the package's index exports.
+
+2. REWIRE the fleet repository to use it: revUpdateJob / revUpdateLease must perform
+   the compare-and-write through the new conditional update (no read-check-write with
+   an intervening await). The coordinator's tryClaimJob keeps the same external
+   behavior (returns ok / conflict) but is now genuinely atomic.
+
+3. UPGRADE the tests to actually prove atomicity (these are the point of the slice):
+   - In datastore: unit tests for updateIfMatch on BOTH providers — match → writes +
+     bumps token; stale token → conflict, NO write; missing → not_found.
+   - In fleet coordinator: replace/extend the race test to drive TRUE concurrency:
+       (a) `await Promise.all([tryClaimJob(jobA), tryClaimJob(jobB)])` where jobA and
+           jobB are two claim attempts on the SAME freshly-read job → exactly one ok,
+           one conflict; job assigned once; exactly one run; one lease; leaseEpoch == 1.
+       (b) an N-claimer stress test: fire N (>=10) concurrent claims for one job via
+           Promise.all → exactly one ok, N-1 conflicts; no double-assignment.
+       (c) the same for lease renew under contention (optional but preferred).
+   - These concurrent tests MUST fail against the OLD read-check-write and pass after
+     the fix (sanity-check that you are testing the right thing; mention it in the report).
+
+4. Keep ALL existing platform-service tests green (the 50 fleet + the rest). Do not
+   weaken any test.
+
+VERIFY GATE (must pass):
+- pnpm --filter @bytelyst/datastore test         (new conditional-update tests)
+- pnpm --filter @bytelyst/datastore build
+- pnpm --filter @lysnrai/platform-service exec vitest run src/modules/fleet
+- pnpm --filter @lysnrai/platform-service build
+- pnpm build && pnpm test   (full workspace — confirm no consumer of @bytelyst/datastore regressed)
+
+CONSTRAINTS: ESM .js imports; no any; no console.log; additive + backward-compatible
+datastore change with tests; conventional commits (feat(datastore): ... /
+fix(platform-service): ...); never edit template-managed infra (.npmrc, docker-prep,
+tsconfig.base, pnpm-workspace). Tests are sacred.
+
+FINAL OUTPUT — print the report in EXACTLY this format:
+
+## Implementation Report — Phase 2 Atomic-Claim Hardening
+### Branch & commits / PR
+### Files changed
+- <path>: <summary>
+### The fix
+- datastore conditional update: <API, Cosmos If-Match mapping, memory atomicity approach>
+- fleet rewire: <how revUpdate* now writes conditionally>
+### Tests added (the proof)
+- concurrent claim (Promise.all) + N-claimer stress: <results>
+- did the new concurrent test FAIL on the old code? <yes/no + brief note>
+- datastore conditional-update unit tests: <results>
+### Verify gate results
+- datastore test/build · fleet test · platform build · full pnpm build && test: <results>
+### Deviations / assumptions
+### Suggested next slice
+- Phase 2 Slice 3: factory-agent integration (agent-queue.sh ↔ coordinator) now that
+  the claim is genuinely atomic.
--- a/agent-queue/docs/jobs/phase2-enrollment-tokens.md
+++ b/agent-queue/docs/jobs/phase2-enrollment-tokens.md
@ -0,0 +1,97 @@
+---
+engine: devin
+cwd: /Users/sd9235/code/mygh/learning_ai_common_plat
+yolo: true
+lock: common-plat-enrollment
+timeout: 4h
+---
+
+ROLE: Senior backend + security engineer. Implement PHASE 2 — FACTORY ENROLLMENT +
+SCOPED ROTATABLE TOKENS (§12) for the fleet coordinator in platform-service, plus two
+small artifact-route hardening fixes found in review.
+
+PARALLEL-SAFETY (another Devin is running in a DIFFERENT repo — agent-queue/devops-tools —
+on feature flags; no file overlap with you. Stay within platform-service):
+- You OWN: a NEW modules/fleet/enrollment.ts, modules/fleet/tokens.ts (or one
+  enrollment.ts), enrollment.test.ts, and ADDITIVE edits to types.ts, repository.ts,
+  routes.ts, cosmos-init.ts (factory token fields + enrollment endpoints + token-auth
+  middleware). You MAY edit artifacts-blob.ts/routes.ts ONLY for the two review fixes below.
+- You MUST NOT change the scheduler.ts scoring, coordinator.ts claim/lease/fence CAS, or
+  the heartbeat/claim PAYLOAD shape (only ADD an optional auth check around them, behind a
+  flag — see below). Do not break any of the existing 79 fleet tests / 1591 platform tests.
+
+READ FIRST:
+- modules/fleet/types.ts — FleetFactoryDoc (id, productId, capabilities, health, load,
+  lastHeartbeatAt...). repository.ts — factory upsert (heartbeat). routes.ts — POST
+  /fleet/factories/heartbeat, POST /fleet/claim (these will optionally require a token).
+- modules/auth/** in platform-service AND ../../packages/auth — reuse the EXISTING token/
+  hashing primitives (bcrypt/sha-256 recovery-code pattern). Do NOT invent new crypto.
+  Tokens are stored HASHED at rest; the plaintext is returned exactly once at enroll/rotate.
+- ../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §12 (enrollment,
+  scoped tokens, rotation, revocation) + §18 (trust boundary).
+
+PREREQUISITE / BRANCHING: branch off CURRENT main → feat/gigafactory-p2-enrollment.
+Push + open PR. DO NOT merge.
+
+DELIVERABLES
+1. Factory enrollment + token lifecycle (enrollment.ts):
+   - enrollFactory({productId, capabilities, label?}) → creates/links a FleetFactoryDoc and
+     issues a SCOPED token: scope = {productId, factoryId, capabilities[]}. Persist only the
+     HASH (+ tokenId, createdAt, lastUsedAt, status). Return plaintext token ONCE.
+   - rotateToken(factoryId, productId) → issue a new token, invalidate the previous (grace:
+     mark old `rotating` with a short overlap TTL so an in-flight worker isn't cut off).
+   - revokeToken(tokenId|factoryId, productId) → status=revoked; immediately rejected.
+   - verifyToken(plaintext) → resolves {factoryId, productId, capabilities, status} or null;
+     constant-time hash compare; updates lastUsedAt. Revoked/expired ⇒ null.
+2. Token-auth on the fleet endpoints — GATED so existing tests keep passing:
+   - Add a `requireFactoryToken` check to POST /fleet/factories/heartbeat and POST
+     /fleet/claim that is ENFORCED only when enforcement is on (env/flag
+     FLEET_REQUIRE_FACTORY_TOKEN, default OFF so the 79 existing tests are unaffected). When
+     on: missing/invalid/revoked token ⇒ 401; a valid token whose scope does not cover the
+     requested productId + the claim's capabilities ⇒ 403. When off: behaves exactly as today.
+   - The claim's effective capabilities/productId must be taken from the VERIFIED token scope
+     when enforcement is on (a factory cannot claim outside its scope).
+3. Routes (additive): POST /fleet/factories/enroll, POST /fleet/factories/:id/token/rotate,
+   POST /fleet/factories/:id/token/revoke — all auth + productId + Zod validated, registered
+   like the existing fleet routes (do not reorder others).
+4. REVIEW FIXES (small, same module):
+   - listArtifactsByJob must be productId-scoped: thread `productId` through
+     repo.listArtifactsByJob + the GET /fleet/jobs/:id/artifacts handler (use the request
+     productId), so a caller can only list artifacts for their own product.
+   - Upload must prefer the request/auth productId over body.productId (drop the
+     `body.productId ||` precedence; use getRequestProductId(req), body value only as a
+     non-overriding hint or removed).
+
+TESTS (enrollment.test.ts + targeted additions; tests are sacred, all prior green):
+- enroll returns a plaintext token once; the stored doc holds only a hash (assert no
+  plaintext persisted) + scope (productId, capabilities).
+- verifyToken: valid → scope; tampered/unknown → null; revoked → null.
+- rotate: old token still works during the overlap TTL, then is rejected; new token works.
+- revoke: immediate rejection.
+- enforcement OFF (default): heartbeat/claim behave exactly as the existing tests expect
+  (re-assert claim works with NO token).
+- enforcement ON: no token → 401; out-of-scope productId or capability → 403; in-scope → ok,
+  and claim is constrained to the token's scope.
+- artifact fixes: list is productId-scoped (a different product cannot see the pointers);
+  upload ignores a spoofed body.productId.
+
+VERIFY GATE:
+- pnpm --filter @lysnrai/platform-service exec vitest run src/modules/fleet  (all green;
+  count grows from 79)
+- pnpm --filter @lysnrai/platform-service build
+- pnpm build && pnpm test  (no regression across consumers)
+
+CONSTRAINTS: ESM .js imports; no any; no console.log; productId on every doc; tokens HASHED
+at rest, plaintext shown once; reuse existing auth/crypto primitives (no new schemes);
+enforcement default OFF; conventional commits (feat(platform-service): ...); do not touch
+scheduler scoring or the claim CAS; do not edit the agent-queue repo.
+
+FINAL OUTPUT — report in EXACTLY this format:
+## Implementation Report — Phase 2 Factory Enrollment + Scoped Tokens (§12)
+### Branch & commits / PR
+### Files changed
+### What was implemented (enroll/rotate/revoke/verify, scope model, gated auth, artifact fixes)
+### Tests added (+ pnpm test summary; esp. hashed-at-rest, scope 401/403, enforcement-off no-op)
+### Verify gate results
+### Deviations / assumptions (which crypto primitive, rotation overlap TTL, flag name)
+### Suggested next slice
--- a/agent-queue/docs/jobs/phase2-feature-flags-shadow.md
+++ b/agent-queue/docs/jobs/phase2-feature-flags-shadow.md
@ -0,0 +1,105 @@
+---
+engine: devin
+cwd: /Users/sd9235/code/mygh/learning_ai_devops_tools
+yolo: true
+lock: agent-queue
+timeout: 4h
+---
+
+ROLE: Senior bash + distributed-systems engineer. Implement PHASE 2 — FLEET FEATURE FLAGS +
+SHADOW / DUAL-RUN for the agent-queue runner: a safe, reversible path to validate the
+fleet coordinator against the proven single-host (P1) behavior BEFORE any real cutover.
+
+PARALLEL-SAFETY (another Devin is running in a DIFFERENT repo — learning_ai_common_plat —
+on enrollment/tokens; no file overlap with you. Stay within the agent-queue repo):
+- You OWN: agent-queue/lib/fleet-client.sh, agent-queue/agent-queue.sh (the fleet hook
+  points only), agent-queue/selftest.sh, agent-queue/README.md,
+  agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md.
+- Keep the offline git-queue path unchanged when fleet is off. All 60 existing selftest
+  checks MUST stay green.
+
+READ FIRST:
+- agent-queue/lib/fleet-client.sh — the P2-S3 client: fleet_enabled, fleet_api,
+  fleet_claim, fleet_report, lease renew/release, fleet_quarantine. You EXTEND this.
+- agent-queue/agent-queue.sh — the run loop + the existing fleet hook points + the offline
+  path (cmd_add/run_worker/ship). Study how AQ_FLEET gates everything today.
+- agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §9 (split-brain / offline degrade), §16/§17
+  (feature flags fleet.enabled / fleet.route_via_service), §27 (cutover & rollback).
+
+PREREQUISITE / BRANCHING: branch off CURRENT main → feat/gigafactory-p2-flags-shadow.
+Push + open PR. DO NOT merge.
+
+FLAG MODEL (three explicit, independently-toggleable levels; document precedence):
+- AQ_FLEET=0|1            master switch (exists). 0 ⇒ pure offline, zero coordinator calls.
+- AQ_FLEET_ROUTE=0|1      route_via_service: when 1 (and AQ_FLEET=1) the coordinator is
+                          AUTHORITATIVE for claim/assignment (today's P2-S3 behavior).
+                          When 0, the LOCAL inbox is authoritative (coordinator not used to
+                          source work) — this is the pre-cutover state.
+- AQ_FLEET_SHADOW=0|1     shadow/dual-run: when 1 (requires AQ_FLEET=1, AQ_FLEET_ROUTE=0)
+                          the runner does its normal OFFLINE/local processing as the
+                          authoritative path, and IN PARALLEL queries the coordinator
+                          (shadow claim + shadow report) WITHOUT acting on its responses —
+                          purely to compare decisions and record divergence. Shadow NEVER
+                          ships, quarantines, or mutates real job state.
+
+DELIVERABLES
+1. fleet-client.sh additions (all guarded; no-ops unless their flag is on):
+   - fleet_route_enabled / fleet_shadow_enabled helpers (precedence: SHADOW only meaningful
+     when ROUTE=0; if both ROUTE=1 and SHADOW=1, ROUTE wins and a warning is logged).
+   - fleet_shadow_claim — asks the coordinator what it WOULD assign for this factory's caps,
+     without claiming a lease for real (read-only / dry-run; if the API has no dry-run, claim
+     then immediately release the lease, or use a shadow factoryId — pick the least-invasive and
+     document it). Returns the would-be job id (or none).
+   - fleet_shadow_compare — given the LOCAL decision (the job the offline path actually ran)
+     and the coordinator's would-be decision, classify AGREE / DIVERGE / COORD_EMPTY /
+     LOCAL_EMPTY and append a structured line to a shadow log
+     (agent-queue/queue/.state/fleet-shadow.log: ts, localJob, coordJob, verdict).
+   - fleet_shadow_report — mirrors stage transitions to the coordinator as shadow events
+     (clearly flagged shadow=1) so reporting is exercised, but divergence in the coordinator
+     response is logged, never acted on.
+2. agent-queue.sh wiring (minimal, flag-gated):
+   - run loop: if SHADOW on, after the local authoritative decision each iteration, call
+     fleet_shadow_claim + fleet_shadow_compare (best-effort, error-swallowed — shadow must
+     NEVER fail a real job).
+   - ROUTE flag: thread it so claim sourcing honors it (ROUTE=1 ⇒ coordinator-sourced as
+     today; ROUTE=0 ⇒ local inbox authoritative even when AQ_FLEET=1).
+   - new subcommand `aq fleet-shadow-report` — summarize the shadow log (counts of
+     AGREE/DIVERGE/…, last N divergences). Add to dispatch + help.
+   - surface the three flags' resolved state in `aq status` / `aq fleet-status`.
+3. Cutover safety: document the recommended rollout ladder in README — (1) AQ_FLEET=1,
+   ROUTE=0, SHADOW=1 (observe, zero risk) → (2) inspect agreement rate → (3) flip ROUTE=1
+   once agreement is high → rollback = set ROUTE=0 (and/or AQ_FLEET=0) at any time.
+
+TESTS — extend selftest.sh (stub the coordinator like the P2-S3 fleet stub; all 60 prior
+checks stay green):
+- flags off: AQ_FLEET=0 ⇒ zero coordinator calls (incl. shadow); offline flow identical.
+- shadow agree: stub returns the same job the local path runs ⇒ shadow log records AGREE;
+  the real job still ships via the offline/local path; coordinator state NOT mutated for real.
+- shadow diverge: stub returns a different/empty job ⇒ DIVERGE/COORD_EMPTY logged; real job
+  still completes; nothing quarantined.
+- shadow is non-fatal: coordinator 5xx/timeout during shadow ⇒ real job still completes,
+  exit 0, a shadow-error noted.
+- ROUTE precedence: ROUTE=1 + SHADOW=1 ⇒ ROUTE path taken, warning logged, no shadow compare.
+- ROUTE=0 + AQ_FLEET=1 ⇒ local inbox is authoritative (coordinator not used to source work).
+- fleet-shadow-report summarizes the log counts correctly.
+
+VERIFY GATE:
+- bash agent-queue/selftest.sh   (60 prior + new shadow/flag cases; none weakened)
+- bash -n agent-queue/agent-queue.sh && bash -n agent-queue/lib/fleet-client.sh
+- shellcheck --severity=error agent-queue/agent-queue.sh agent-queue/lib/fleet-client.sh
+- node --check agent-queue/dashboard.mjs (if unchanged)
+
+CONSTRAINTS: bash + curl + POSIX awk only (no jq/new deps); reuse P2-S3 helpers; shadow must
+be strictly side-effect-free on real job state; offline path unchanged when AQ_FLEET=0;
+never hardcode tokens; conventional commits (feat(agent-queue): ...); never weaken a test;
+do not edit the common-plat repo.
+
+FINAL OUTPUT — report in EXACTLY this format:
+## Implementation Report — Phase 2 Feature Flags + Shadow/Dual-run
+### Branch & commits / PR
+### Files changed
+### What was implemented (flag model + precedence, shadow claim/compare/report, cutover ladder)
+### Tests added (+ selftest summary = 60 prior + N new; esp. flags-off no-op, shadow non-fatal, ROUTE precedence)
+### Verify gate results
+### Deviations / assumptions (how shadow claim avoids real lease mutation)
+### Suggested next slice
--- a/agent-queue/docs/jobs/phase2-foundation.md
+++ b/agent-queue/docs/jobs/phase2-foundation.md
@ -0,0 +1,179 @@
+---
+engine: devin
+cwd: /Users/sd9235/code/mygh/learning_ai_common_plat
+yolo: true
+lock: common-plat
+timeout: 5h
+---
+
+ROLE: Senior backend / distributed-systems engineer. Implement the PHASE 2
+FOUNDATION of the agent gigafactory: a new `fleet` module in platform-service
+covering (S1) the durable data model + repositories AND (S2) the CONCURRENCY CORE
+— atomic claim, leases, fencing, heartbeat, and a reaper. This is one long,
+self-contained backend slice. It supersedes the single-host stand-ins built in the
+agent-queue (devops-tools) repo.
+
+WHY THIS IS A SAFE LONG (UNATTENDED) RUN: everything is in ONE repo
+(learning_ai_common_plat), all logic is TypeScript, and ALL tests run on the
+in-memory datastore provider (DB_PROVIDER=memory) — NO live platform-service, NO
+Cosmos, NO network calls, NO tokens required. There are no external blockers.
+
+READ FIRST (this platform-service may NOT match what you assume — verify its conventions):
+- services/platform-service/src/modules/items/{types,repository,routes}.ts — copy
+  this module pattern EXACTLY: types.ts -> repository.ts -> routes.ts, Zod schemas,
+  the cloud-agnostic datastore, productId on every doc, req.log/app.log, ESM with
+  .js import suffixes, no `any`, no console.log.
+- packages/datastore (or the existing datastore abstraction) — how repositories are
+  built, how optimistic concurrency (_etag / If-Match) is exposed, and how the
+  memory vs cosmos provider is selected (DB_PROVIDER).
+- packages/cosmos container registry — how containers are registered.
+- The fleet spec lives in the sibling devops-tools repo (read-only):
+  ../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md
+  §4 (core contract: idempotency/atomic-claim/fencing/lease), §7 (scheduler/claim),
+  §8 (factory/lease/heartbeat), §13 (containers + fields), §18 (failure model),
+  §25 (durability/recovery), §26 (insights). Match these field names + semantics.
+
+PREREQUISITE / SETUP / BRANCHING:
+- Branch off CURRENT `main` of learning_ai_common_plat.
+- New branch: feat/gigafactory-p2-foundation. Commit in logical steps (data model,
+  repos, coordinator, routes, docs). Push + open a PR. DO NOT merge (human gate).
+- If node_modules is missing, run `pnpm install` once at the repo root. All tests
+  must pass with DB_PROVIDER=memory (set it in the test setup if not already).
+
+STRICT SCOPE:
+- Add ONE new module: services/platform-service/src/modules/fleet/ (+ tests there).
+- Register the new fleet_* Cosmos containers via the existing registration path.
+- Do NOT modify unrelated modules. Do NOT hand-edit template-managed infra
+  (.npmrc, docker-prep.sh*, tsconfig.base.json, pnpm-workspace.yaml) — they drift.
+- Every Cosmos doc MUST include productId. ESM everywhere. No `any` (Zod inference
+  or explicit types). No console.log (use req.log / app.log). Tests are sacred —
+  never weaken or delete a test to go green; fix the code.
+
+=================================================================
+PART S1 — DATA MODEL + REPOSITORIES
+=================================================================
+1. types.ts — Zod schemas + inferred types, each doc carrying productId:
+   - FleetJobDoc (pk /productId): manifestSnapshot, bodyMd (verbatim instructions),
+     stage (enum matching the agent-queue lifecycle:
+     queued|blocked|assigned|building|review|testing|shipped|failed|dead_letter),
+     idempotencyKey, trackerItemId?, parentId?, kind ('leaf'|'composite' default
+     'leaf'), checkpoint? {wipBranch,wipBase,wipCommit}, priority
+     (critical|high|medium|low), capabilities[], engineClass?, profile?, deps[],
+     depsMode?, budget? {usd?,tokens?,wall?}, retry? {max,backoff,on[]}, timestamps.
+   - FleetRunDoc (pk /jobId): jobId, attempt, factoryId?, engine, profileSnapshot?,
+     startedAt, endedAt?, exit?, verifyResult?, result?, insights {model?,tokensIn?,
+     tokensOut?,tokensCached?,costUsd?,estimated?,turns?,toolCalls?,filesChanged?,
+     linesAdded?,linesDeleted?}.
+   - FleetLeaseDoc (pk /jobId): jobId, holderFactoryId?, expiresAt?, leaseEpoch
+     (number, default 0), renewals (number), status (held|expired|released).
+   - FleetFactoryDoc (pk /productId): factoryId, descriptor, capabilities[],
+     health (ok|degraded|down), load, seatLimit, lastHeartbeatAt.
+   - FleetProfileDoc (pk /productId): name, version, immutable snapshot.
+   - FleetEventDoc (pk /jobId): append-only {type, at, actor?, data}.
+   - FleetArtifactDoc (pk /jobId): pointers to blob-stored artifacts (no inline logs).
+2. repository.ts — one repo per container on the datastore abstraction (memory +
+   cosmos): create, getById, list (by productId; jobs also by stage + by
+   idempotencyKey), update (returning/honoring _etag), delete where sensible,
+   appendEvent(jobId,event). Partition-aware; no cross-partition fan-out in hot paths.
+3. Register all fleet_* containers with correct partition keys.
+
+=================================================================
+PART S2 — CONCURRENCY CORE (claim / lease / fencing / heartbeat / reaper)
+=================================================================
+4. ATOMIC CLAIM (the heart): `claimNextJob(factory)` selects the highest-priority,
+   oldest eligible job whose stage is `queued` AND whose deps are satisfied AND
+   whose capabilities are a subset of the factory's, then atomically transitions it
+   to `assigned` and creates/acquires its lease — guarded by _etag / If-Match so
+   that under contention EXACTLY ONE factory wins; losers get a conflict and retry
+   the selection. No double-assignment, ever.
+5. LEASES + FENCING: acquiring a lease increments `leaseEpoch`. `renewLease`,
+   `releaseLease`. Every state-mutating call from a worker carries its leaseEpoch;
+   a call whose epoch is < the current epoch is REJECTED (fencing) — a stale/zombie
+   worker can never overwrite a reassigned job's state.
+6. HEARTBEAT: `heartbeat(factoryId)` updates lastHeartbeatAt + load/health.
+7. REAPER: `reapExpiredLeases(now)` scans leases with expiresAt < now, marks them
+   expired, bumps leaseEpoch, and returns the job to `queued` (or `blocked` if deps
+   now unmet) for re-claim — resume-from-checkpoint friendly (checkpoint pointer
+   preserved on the job). Reaper is idempotent. (Cosmos TTL does NOT do this — the
+   reaper must; document why.)
+8. IDEMPOTENCY: submit with an existing idempotencyKey + identical content => returns
+   the existing job (no dup); same key + different content while still queued =>
+   supersede; otherwise 409. (Mirror the agent-queue Slice 1 semantics.)
+9. DEPS: a job is `blocked` until each dep reaches shipped (or testing when
+   depsMode:soft); submit-time cycle detection rejects cyclic graphs.
+
+10. routes.ts — guarded REST under the existing auth + productId middleware:
+    POST /fleet/jobs (submit, idempotent), GET /fleet/jobs (list by stage),
+    GET /fleet/jobs/:id, PATCH /fleet/jobs/:id (fenced state transition),
+    POST /fleet/claim (atomic claim for a factory),
+    POST /fleet/jobs/:id/lease/renew, POST /fleet/jobs/:id/lease/release,
+    POST /fleet/factories/heartbeat, GET /fleet/jobs/:id/runs,
+    GET /fleet/jobs/:id/events. Validate every body with the Zod schemas. Register
+    the module in the app exactly as items is registered.
+
+=================================================================
+TESTS (Vitest — write alongside; memory provider; tests are sacred)
+=================================================================
+- schema validation: valid docs pass; missing productId / bad enum fail precisely
+  (>=1 invalid case per container).
+- repo CRUD round-trip per container; list filters by productId, by stage, by
+  idempotencyKey; appendEvent yields an ordered append-only stream.
+- ATOMIC CLAIM RACE: two claims contending for the SAME job version (same _etag) =>
+  exactly one succeeds, the other gets a conflict; assert no double-assignment.
+  (Deterministic: drive via the conditional/If-Match update, not real threads.)
+- priority+age selection: among eligible queued jobs, claim returns the
+  highest-priority then oldest.
+- deps gating: a job with unmet deps is `blocked` and NOT claimable; becomes
+  claimable once deps reach shipped; depsMode:soft satisfied at testing; cycle
+  rejected at submit.
+- FENCING: a state-mutating call with a stale leaseEpoch is rejected; the current
+  epoch succeeds.
+- REAPER: an expired lease => job back to queued, leaseEpoch bumped, checkpoint
+  preserved; running the reaper twice is idempotent.
+- HEARTBEAT updates lastHeartbeatAt/health; a stale factory is detectable.
+- IDEMPOTENT submit: same key+content => 1 job; key+changed content while queued =>
+  superseded; otherwise 409.
+- routes: submit+claim+renew+release+heartbeat+patch via fastify inject (shared
+  testing helpers); auth + productId enforced; invalid body rejected.
+
+VERIFY GATE (must all pass before finishing):
+- pnpm --filter @lysnrai/platform-service typecheck
+- pnpm --filter @lysnrai/platform-service test     (all new tests green; none weakened)
+- pnpm --filter @lysnrai/platform-service build
+Run the full repo gate too if quick: `pnpm build && pnpm test && pnpm typecheck`.
+
+DOCS:
+- A module README (or docblock) describing each container, the claim/lease/fence
+  protocol, and the reaper. In your REPORT, list which roadmap §4/§7/§8/§13/§18
+  items are now satisfied (I will tick them in the devops-tools repo — you must NOT
+  edit that repo).
+
+CONSTRAINTS: follow items-module conventions precisely; ESM .js imports; no any; no
+console.log; productId on every doc; conventional commits
+(feat(platform-service): ...); do not touch template-managed infra.
+
+FINAL OUTPUT — print the report in EXACTLY this format:
+
+## Implementation Report — Phase 2 Foundation (fleet module + coordinator)
+### Branch & commits
+- branch / based-on / PR
+- commits: <sha> <message> (one per line)
+### Files changed
+- <path>: <one-line summary>
+### What was implemented
+- S1 data model: <containers, partition keys, etag handling>
+- S2 concurrency: <claim algorithm, lease/fencing via leaseEpoch, reaper, heartbeat>
+- idempotency + deps + cycle detection: <how>
+### Tests added
+- <test name>: <assertion>   (esp. the atomic-claim race, fencing, reaper tests)
+- pnpm test summary: <N passed>
+### Verify gate results
+- typecheck / test / build (+ full-repo gate if run): <results>
+### Roadmap items now satisfied
+- §4: <...>  §7: <...>  §8: <...>  §13: <...>  §18: <...>
+### Deviations / assumptions
+- <datastore concurrency model, how the race test is made deterministic, anything stubbed>
+### Suggested next slice
+- Phase 2 Slice 3: factory-agent integration — agent-queue.sh registers/heartbeats/
+  claims/reports against this coordinator behind a flag, preserving offline mode;
+  plus the tracker echo wired through fleet_events.
--- a/agent-queue/docs/jobs/phase2-scheduler.md
+++ b/agent-queue/docs/jobs/phase2-scheduler.md
@ -0,0 +1,84 @@
+---
+engine: devin
+cwd: /Users/sd9235/code/mygh/learning_ai_common_plat
+yolo: true
+lock: common-plat-scheduler
+timeout: 4h
+---
+
+ROLE: Senior backend engineer. Implement the PHASE 2 SCHEDULER / ROUTER CORE (§7)
+for the fleet coordinator: a deterministic, fixed-weight scoring engine that picks
+WHICH job a claiming factory gets, and wire it into the atomic claim.
+
+PARALLEL-SAFETY (two other Devins are running — DO NOT collide):
+- You OWN: services/platform-service/src/modules/fleet/scheduler.ts (NEW),
+  scheduler.test.ts (NEW), and the candidate-ranking section of coordinator.ts +
+  coordinator.test.ts.
+- You MUST NOT touch: types.ts, repository.ts, routes.ts, cosmos-init.ts, server.ts
+  (another Devin is editing those for fleet_artifacts). If you need a new type, define
+  it inside scheduler.ts. If wiring truly requires a types.ts change, instead re-export
+  from scheduler.ts. Import existing FleetJobDoc/FleetFactoryDoc from types.ts (read-only).
+- A third Devin is in a different repo (agent-queue) — no overlap.
+
+READ FIRST:
+- services/platform-service/src/modules/fleet/coordinator.ts — claimNextJob /
+  tryClaimJob: today it selects "highest-priority, oldest, deps-satisfied, capability-
+  subset". You will replace the SELECTION step with the scoring engine (keep the atomic
+  tryClaimJob CAS exactly as-is).
+- types.ts (read-only) — FleetJobDoc (priority, capabilities, budget, createdAt, deps,
+  stage), FleetFactoryDoc (capabilities, health, load, seatLimit).
+- ../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §7 (the formula
+  + tie-breaks + phasing note: Phase 2 = fixed weights; Phase 3 = tunable + preemption).
+
+PREREQUISITE / BRANCHING: branch off CURRENT main → feat/gigafactory-p2-scheduler.
+Push + open PR. DO NOT merge.
+
+DELIVERABLES
+1. scheduler.ts (pure, no I/O, fully unit-testable):
+   - Weight config (fixed defaults, overridable via a passed-in object — NOT env here):
+     score = w1·capabilityFit + w2·affinity(prefersEngine/repo-stickiness)
+           + w3·(1/(1+load)) + w4·costFit(budget) + w5·health − w6·starvationPenalty(age)
+   - `scoreCandidate(job, factory, ctx, weights?) → { score, breakdown }` — return the
+     per-term breakdown for explainability (§7/Phase-3 readiness).
+   - `selectJob(candidates: FleetJobDoc[], factory, ctx, weights?) → FleetJobDoc | null` —
+     filter to deps-satisfied + capability-subset (reuse the coordinator's existing
+     predicates; if they're inline, extract pure helpers INTO scheduler.ts), then rank by
+     score; deterministic tie-break: higher priority → older createdAt → lower cost class.
+   - Pure, synchronous, no datastore calls. Health/load come from the factory doc; age
+     from job.createdAt vs ctx.now (coordinator-authoritative time, passed in).
+2. Wire into coordinator.claimNextJob: replace the ad-hoc selection with
+   `selectJob(...)`, passing the existing candidate set + the claiming factory + ctx.now.
+   Keep tryClaimJob's rev/updateIfMatch CAS and lease/fence logic byte-for-byte unchanged.
+   If the claim has no factory capabilities/health context today, thread the minimal fields
+   through ClaimContext (additive, in coordinator.ts only).
+
+TESTS (scheduler.test.ts + additions to coordinator.test.ts — tests are sacred):
+- capabilityFit: a factory missing a required cap → candidate filtered out (never selected).
+- priority dominates when all else equal; age breaks ties deterministically.
+- load: higher-load factory lowers score (1/(1+load)); health: degraded < ok.
+- starvation: an old low-priority job eventually outranks a fresh low-priority one (i.e. with w6 > 0, starvationPenalty must decrease as age grows).
+- costFit: a job exceeding the factory/budget cost class is penalized/last.
+- breakdown: scoreCandidate returns each weighted term (sums to score).
+- selectJob determinism: same inputs → same pick across runs; empty/no-eligible → null.
+- coordinator integration: claimNextJob still returns exactly one winner under the existing
+  concurrency tests (all prior fleet tests stay green); selection now follows the score.
+
+VERIFY GATE:
+- pnpm --filter @lysnrai/platform-service exec vitest run src/modules/fleet  (all green)
+- pnpm --filter @lysnrai/platform-service build
+- pnpm build && pnpm test  (no regression)
+
+CONSTRAINTS: ESM .js imports; no any; no console.log; fixed weights this phase (tunable +
+preemption are Phase 3 — do NOT build them); pure scheduler (no I/O); conventional commits
+(feat(platform-service): ...); do not touch the files reserved above; do not edit the
+agent-queue repo.
+
+FINAL OUTPUT — report in EXACTLY this format:
+## Implementation Report — Phase 2 Scheduler/Router Core (§7)
+### Branch & commits / PR
+### Files changed
+### What was implemented (scoring terms, tie-breaks, coordinator wiring)
+### Tests added (+ pnpm test summary)
+### Verify gate results
+### Deviations / assumptions (what ctx fields were threaded, weight defaults chosen)
+### Suggested next slice (Phase 3 tunable weights + preemption + explainability UI)
--- a/agent-queue/docs/jobs/phase2-slice1.md
+++ b/agent-queue/docs/jobs/phase2-slice1.md
@ -0,0 +1,125 @@
+---
+engine: devin
+cwd: /Users/sd9235/code/mygh/learning_ai_common_plat
+yolo: true
+lock: common-plat
+timeout: 4h
+---
+
+ROLE: Senior backend/distributed-systems engineer. Implement Phase 2 — Slice 1:
+the FLEET DATA MODEL + REPOSITORIES as a new platform-service module. This is the
+durable backbone (§13) that supersedes the single-host stand-ins. NO atomic
+claim/lease/fencing logic yet — that is Phase 2 Slice 2. This slice is schemas,
+repositories, container registration, basic guarded CRUD, and tests.
+
+NOTE: This runs in a DIFFERENT repo (learning_ai_common_plat), so it does NOT
+conflict with the agent-queue (devops-tools) slices and can run independently.
+
+READ FIRST (this platform-service may NOT match what you assume — verify its conventions):
+- services/platform-service/src/modules/items/{types,repository,routes}.ts — copy
+  this module pattern EXACTLY (types.ts -> repository.ts -> routes.ts, Zod schemas,
+  the cloud-agnostic datastore, productId on every doc, req.log/app.log).
+- packages/cosmos (container registry) + how existing modules register containers.
+- The fleet container spec is §13 of the roadmap in the sibling devops-tools repo
+  (../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md).
+  Read §13 for the field lists (fleet_jobs incl. bodyMd + checkpoint; fleet_runs incl.
+  token/cost/tool/diff insights; fleet_leases incl. leaseEpoch; fleet_factories;
+  fleet_profiles; fleet_events; fleet_artifacts), and also read §25/§26.
+
+PREREQUISITE / BRANCHING:
+- Branch off CURRENT `main` of learning_ai_common_plat.
+- New branch: feat/gigafactory-p2-slice1. Push + open a PR. DO NOT merge.
+
+STRICT SCOPE:
+- Add a NEW module: services/platform-service/src/modules/fleet/ (+ its tests).
+- Register the new Cosmos containers via the existing registration path.
+- Do NOT modify unrelated modules. Do NOT hand-edit shared infra (.npmrc,
+  docker-prep.sh, tsconfig.base, pnpm-workspace) — those are template-managed.
+- ESM everywhere ("type": "module", .js import suffixes). No `any` (Zod inference
+  or explicit types). No console.log (use req.log/app.log). Every Cosmos doc has
+  productId. Tests are sacred.
+
+DELIVERABLES
+
+1. types.ts — Zod schemas + inferred types for each container, each with productId:
+   - FleetJobDoc (pk /productId): manifestSnapshot, bodyMd (verbatim instructions),
+     stage, idempotencyKey, trackerItemId?, parentId?, kind ('leaf'|'composite',
+     default 'leaf'), checkpoint? { wipBranch, wipBase, wipCommit }, priority,
+     capabilities[], engineClass?, profile?, deps[], depsMode?, timestamps.
+   - FleetRunDoc (pk /jobId): jobId, attempt, factoryId?, engine, profileSnapshot?,
+     startedAt, endedAt?, exit?, verifyResult?, result?, and insights: model?,
+     tokensIn?, tokensOut?, tokensCached?, costUsd?, estimated?, turns?, toolCalls?,
+     filesChanged?, linesAdded?, linesDeleted?.
+   - FleetLeaseDoc (pk /jobId): jobId, holderFactoryId?, expiresAt?, leaseEpoch
+     (number, default 0), renewals, status. (Fields only — reclaim/claim logic is S2.)
+   - FleetFactoryDoc (pk /productId): factoryId, descriptor, capabilities[], health,
+     load, lastHeartbeatAt, seatLimit.
+   - FleetProfileDoc (pk /productId): name, version, immutable snapshot (persona,
+     defaults). FleetEventDoc (pk /jobId): append-only event { type, at, data }.
+     FleetArtifactDoc (pk /jobId): pointers to blob-stored artifacts (no inline logs).
+   - Define enums for stage and result that MATCH the agent-queue lifecycle.
+
+2. repository.ts — one repository per container using the existing datastore
+   abstraction (so DB_PROVIDER=memory works in tests, cosmos in prod):
+   - CRUD: create, getById, list (by productId; jobs also by stage), update
+     (optimistic via _etag where the datastore supports it — expose the etag,
+     even though the ATOMIC claim flow is S2), delete where sensible.
+   - appendEvent(jobId, event) for the append-only fleet_events stream.
+   - All queries partition-aware; no cross-partition fan-out in hot paths.
+
+3. container registration — register all fleet_* containers with correct partition
+   keys via the existing cosmos container registry; memory provider auto-handles.
+
+4. routes.ts — minimal guarded REST under the existing auth + productId middleware:
+   - POST /fleet/jobs (create), GET /fleet/jobs (list by stage/productId),
+     GET /fleet/jobs/:id, PATCH /fleet/jobs/:id (stage/fields), and read endpoints
+     for runs (GET /fleet/jobs/:id/runs) + events. Keep it thin — claim/lease
+     endpoints are S2. Validate all bodies with the Zod schemas.
+   - Register the route module in the platform-service app the same way items does.
+
+TESTS (Vitest — write alongside; memory provider; tests sacred):
+- schema validation: valid docs pass; missing productId / bad enum fail with
+  precise errors; at least one invalid case per container.
+- repository CRUD round-trip per container (create→get→list→update→delete) on the
+  memory provider; list filters by productId and by stage (jobs).
+- appendEvent produces an ordered, append-only stream for a jobId.
+- routes: create+get+list+patch a job via fastify inject (use the shared testing
+  helpers); auth/productId enforced; invalid body rejected.
+- _etag surfaced on update (lost-update guard groundwork) — assert the etag flows.
+
+VERIFY GATE (must pass):
+- pnpm --filter @lysnrai/platform-service typecheck
+- pnpm --filter @lysnrai/platform-service test   (new tests green; none weakened)
+- pnpm --filter @lysnrai/platform-service build
+
+DOCS:
+- Short module README or header docblock describing the containers + that
+  claim/lease/fencing is Phase 2 Slice 2.
+- You may NOT edit the roadmap in ../learning_ai_devops_tools (different repo) —
+  instead, note in your report which §13 items are now satisfied so I can tick them.
+
+CONSTRAINTS: follow the items-module conventions precisely; ESM .js imports; no any;
+no console.log; productId everywhere; conventional commits (feat(platform-service):
+...); do not touch template-managed infra files.
+
+FINAL OUTPUT — print the report in EXACTLY this format:
+
+## Implementation Report — Phase 2 Slice 1
+### Branch & commits
+- branch / based-on / PR
+- commits: <sha> <message>
+### Files changed
+- <path>: <one-line summary>
+### What was implemented (1-4)
+- containers + schemas + repos + routes; partition keys; etag handling
+### Tests added
+- <test name>: <assertion>  (+ pnpm test summary: N passed)
+### Verify gate results
+- typecheck / test / build: <results>
+### §13 items now satisfied
+- <list which roadmap §13 boxes are done so the human can tick them>
+### Deviations / assumptions
+- <datastore/etag/provider choices>
+### Suggested next slice
+- Phase 2 Slice 2: atomic claim (_etag/If-Match) + lease renew/release + heartbeat
+  + reaper + fencing (leaseEpoch) — the concurrency core.
--- a/agent-queue/docs/jobs/phase2-slice3.md
+++ b/agent-queue/docs/jobs/phase2-slice3.md
@ -0,0 +1,156 @@
+---
+engine: devin
+cwd: /Users/sd9235/code/mygh/learning_ai_devops_tools
+yolo: true
+lock: agent-queue
+timeout: 4h
+---
+
+ROLE: Senior bash + distributed-systems engineer. Implement PHASE 2 SLICE 3 —
+FACTORY-AGENT INTEGRATION: make the single-host `agent-queue.sh` runner act as a
+"factory" that registers / heartbeats / claims / reports against the already-merged
+`fleet` coordinator in platform-service, **behind a feature flag**, while keeping
+the existing offline git-queue path 100% intact when the flag is off.
+
+NON-NEGOTIABLE DESIGN RULE (prevents merge churn + regressions):
+- Put ALL coordinator-client logic in a NEW separate file `agent-queue/lib/fleet-client.sh`
+  that `agent-queue.sh` sources. Touch `agent-queue.sh` only at a few well-defined hook
+  points (claim source, stage-transition reporting, dispatch/help). The offline git-queue
+  code path MUST be byte-for-byte behaviorally unchanged when `AQ_FLEET` is unset/0.
+- Gate every coordinator interaction on `AQ_FLEET=1`. Default (unset) = today's offline
+  behavior. All 53 existing selftest checks MUST still pass unchanged.
+
+READ FIRST (verify the real contract — do not guess):
+- agent-queue/agent-queue.sh — the runner. Study: the manifest/lifecycle stages
+  (queued→assigned→building→review→testing→shipped + blocked/failed/dead_letter),
+  `run_worker`/`cmd_run`/`ship`/`promote`, the Slice-4 `tracker_api` curl wrapper +
+  `_api_call` + awk JSON helpers (REUSE these patterns — POSIX awk, curl-only, no jq),
+  and the Slice-4 auto-echo hooks. Mirror that style exactly.
+- agent-queue/selftest.sh — how stub-driven HTTP tests work (the tracker stub overrides
+  the curl wrapper). Build the fleet stub the same way.
+- THE COORDINATOR CONTRACT (read-only, in the sibling repo
+  ../learning_ai_common_plat/services/platform-service/src/modules/fleet/routes.ts):
+  all routes are registered under the `/api` prefix. Exact endpoints:
+    POST   /api/fleet/factories/heartbeat   {factoryId, capabilities[], health, load}
+    POST   /api/fleet/claim                 {factoryId, capabilities[]} -> job + leaseEpoch + lease expiry (or empty)
+    GET    /api/fleet/jobs/:id
+    PATCH  /api/fleet/jobs/:id               fenced stage transition: {stage, checkpoint?, leaseEpoch}
+    POST   /api/fleet/jobs/:id/lease/renew   {leaseEpoch}
+    POST   /api/fleet/jobs/:id/lease/release {leaseEpoch}
+    GET    /api/fleet/jobs/:id/runs
+    GET    /api/fleet/jobs/:id/events
+  Note: there is NO client-side "register factory" or "append event" endpoint — registration
+  is the heartbeat upsert, and `fleet_events` are written SERVER-SIDE by the coordinator on
+  each PATCH/claim. The coordinator owns `leaseEpoch` fencing: a PATCH/renew carrying a stale
+  epoch is rejected (409/conflict).
+- ../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §7 (claim loop),
+  §8 (factory/heartbeat/claim/report/drain), §9 (split-brain/offline-degrade), §18 (fencing).
+
+PREREQUISITE / BRANCHING:
+- Branch off CURRENT `main` (Phase 1 complete; foundation + hardening merged).
+  New branch: feat/gigafactory-p2-slice3. Commit in logical steps. Push + open a PR.
+  DO NOT merge.
+
+CONFIG BLOCK (env, in fleet-client.sh; document in README):
+- AQ_FLEET            (0/1, default 0 — master switch; 0 = pure offline git-queue)
+- AQ_FLEET_API        (default http://localhost:4003/api)
+- AQ_FLEET_TOKEN      (bearer; never hardcode)
+- AQ_PRODUCT_ID       (reuse the Slice-4 var; X-Product-Id header)
+- AQ_FACTORY_ID       (default: hostname + short rand; stable per process)
+- AQ_FLEET_LEASE_RENEW_SEC (default 300), AQ_FLEET_CAPS (optional; overrides the auto-detected caps)
+
+DELIVERABLES
+
+1. `agent-queue/lib/fleet-client.sh` (new) — a sourced library, curl-only + POSIX awk
+   (reuse Slice-4 helpers; do not add deps):
+   - `fleet_enabled` — returns true iff AQ_FLEET=1 (guard for every other fn).
+   - `fleet_api METHOD PATH [json]` — curl wrapper adding bearer + X-Product-Id; returns
+     body; captures HTTP code; non-2xx is logged and surfaced (never crashes the runner).
+   - `fleet_detect_caps` — reuse the runner's existing capability auto-detection (os, engines,
+     tools) to build the capabilities array.
+   - `fleet_heartbeat` — POST factories/heartbeat (registration == first heartbeat); call at
+     loop start + every AQ_FLEET_LEASE_RENEW_SEC during long runs.
+   - `fleet_claim` — POST /fleet/claim with caps; parse job id + bodyMd + leaseEpoch + lease
+     expiry; materialize a transient local job file (reuse the Slice-4 from-tracker
+     materialization) so the existing runner executes it unchanged. Store leaseEpoch in the
+     job meta.
+   - `fleet_report STAGE [checkpoint]` — PATCH /fleet/jobs/:id with {stage, checkpoint?,
+     leaseEpoch}. **Fencing-aware:** if the coordinator returns conflict/409 (stale epoch),
+     the worker MUST self-abort the job (stop work, do NOT ship/merge) and log a fenced-abort
+     event — a reclaimed/zombie worker can never corrupt coordinator state.
+   - `fleet_lease_renew` / `fleet_lease_release` — fenced; renew on a timer while building;
+     release on terminal stages.
+   - `fleet_checkpoint` — capture {wipBranch, wipBase, wipCommit} and send via fleet_report
+     so a reclaim can resume (durability, §25).
+
+2. Wire `agent-queue.sh` at MINIMAL hook points (all guarded by `fleet_enabled`):
+   - source `lib/fleet-client.sh` near the top.
+   - claim: when AQ_FLEET=1 and the local inbox is empty, try `fleet_claim` before idling
+     (coordinator jobs interleave with local `.md` files; local files still work).
+   - stage transitions (building/review/testing/shipped/failed): call `fleet_report` +
+     checkpoint. When AQ_FLEET=1 this REPLACES the Slice-4 direct tracker echo
+     (the coordinator records `fleet_events`, becoming the audit source of truth → "tracker
+     echo routed through fleet_events"); keep the direct tracker echo as the offline path.
+   - heartbeat timer in the run loop; lease renew while a fleet job is building; release on done.
+   - new subcommands: `aq fleet-status` (heartbeat + show claimable count) and surface
+     factoryId/leaseEpoch in `status`; add to dispatch + help.
+
+3. OFFLINE-DEGRADE + SPLIT-BRAIN (§9/§18): if the coordinator is unreachable mid-job, the
+   runner finishes the in-flight job locally and reconciles on the next reachable call; on
+   reconnect it presents its leaseEpoch — if the coordinator reports it stale (reclaimed),
+   the local result is quarantined (marked, NOT auto-shipped) and surfaced for human triage.
+
+TESTS — extend `agent-queue/selftest.sh` (stub the fleet API exactly like the tracker stub;
+tests are sacred, all 53 prior checks stay green):
+- flag off (default): AQ_FLEET unset → ZERO fleet API calls; existing offline flow identical
+  (re-assert a couple of the offline cases under flag-off).
+- heartbeat/register: AQ_FLEET=1 loop start → stub receives POST factories/heartbeat with caps.
+- claim: stub returns a job → runner materializes a local job (bodyMd + leaseEpoch in meta)
+  and executes it to review/.
+- report + checkpoint: building/review/testing → stub receives PATCH /fleet/jobs/:id with the
+  correct stage + leaseEpoch (+ checkpoint on building).
+- FENCING: stub returns conflict on PATCH (stale epoch) → worker self-aborts, job NOT shipped,
+  a fenced-abort is logged/surfaced.
+- lease renew: long-running stub → at least one renew call with current leaseEpoch.
+- offline-degrade: stub returns connection error mid-job → job still completes locally; on
+  next call presenting a now-stale epoch → result quarantined (not auto-shipped).
+- no-leak: assert the prompt/bodyMd and the token never appear in any report/comment
+  payload where they do not belong (reuse the Slice-4 sentinel check).
+
+VERIFY GATE (must all pass):
+- bash agent-queue/selftest.sh   (all prior 53 + new fleet cases green; none weakened)
+- bash -n agent-queue/agent-queue.sh && bash -n agent-queue/lib/fleet-client.sh
+- node --check agent-queue/dashboard.mjs   (if present/unchanged)
+- shellcheck --severity=error agent-queue/agent-queue.sh agent-queue/lib/fleet-client.sh
+
+DOCS:
+- README: a "Fleet integration (Phase 2)" section — the AQ_FLEET flag, env table, the
+  claim/heartbeat/report/fence/renew protocol, offline-degrade + quarantine behavior, and a
+  one-paragraph "offline vs fleet mode" explainer.
+- Tick the relevant §8/§9/§14 Phase-2 boxes in GIGAFACTORY_ROADMAP.md with a P2-S3 slice note.
+
+CONSTRAINTS: bash + curl + POSIX awk only (no jq, no new deps); reuse Slice-4 helpers; never
+hardcode tokens/secrets; offline path unchanged when AQ_FLEET unset; conventional commits
+(feat(agent-queue): ...); never weaken a test; do not edit the sibling common-plat repo.
+
+FINAL OUTPUT — print the report in EXACTLY this format:
+
+## Implementation Report — Phase 2 Slice 3 (factory-agent integration)
+### Branch & commits / PR
+### Files changed
+- <path>: <summary>
+### What was implemented
+- fleet-client.sh: <functions + flag gating>
+- agent-queue.sh hook points: <the few places touched + why minimal>
+- fencing + offline-degrade + quarantine: <how>
+- tracker echo via fleet_events: <how>
+### Tests added
+- <name>: <assertion>   (esp. flag-off no-op, claim, fenced self-abort, offline quarantine)
+- selftest summary: <N checks = 53 prior + M new>
+### Verify gate results
+- selftest / bash -n / node --check / shellcheck: <results>
+### Deviations / assumptions
+- <claim/lease contract details, anything stubbed, how registration maps to heartbeat>
+### Suggested next slice
+- Phase 2 remaining: scheduler/router wiring, factory enrollment + scoped tokens, feature-flag
+  shadow/dual-run, and the two-factory parallel demo (Phase 2 exit criteria).
--- a/agent-queue/docs/jobs/phase2-tracker-wiring.md
+++ b/agent-queue/docs/jobs/phase2-tracker-wiring.md
@ -0,0 +1,120 @@
+---
+engine: devin
+cwd: /Users/sd9235/code/mygh/learning_ai_common_plat
+yolo: true
+lock: common-plat-tracker
+timeout: 4h
+---
+
+ROLE: Senior backend engineer. Implement the PHASE 2 DIRECT TRACKER -> MODULE WIRING
+(§10) for the fleet coordinator: a service-side bridge that turns a tracker Item into
+a fleet job (submitted through the coordinator, so it is routed by the §7 scheduler),
+and echoes the job's lifecycle back onto the Item — the full task<->job ROUND-TRIP,
+in-process, with no shell hop. This closes the §10 "direct tracker->module calls" box.
+
+PARALLEL-SAFETY: One other Devin is running in a DIFFERENT repo (learning_ai_devops_tools,
+the two-factory demo). There is NO other Devin in this repo, so you may edit any fleet
+file you need. Do NOT edit the agent-queue repo.
+
+READ FIRST (understand the contracts before writing):
+- services/platform-service/src/modules/fleet/coordinator.ts
+    - submitJob(productId, SubmitJobInput) -> { job, outcome } : idempotent submit; the
+      job already has a `trackerItemId` field (types.ts) — reuse it, do NOT add a new one.
+    - claimNextJob(ctx) already routes candidates through the §7 scheduler (selectJob).
+      You do NOT change claim/scheduler — tracker jobs flow through the SAME path.
+    - patchJobFenced / stage transitions — the lifecycle you will mirror to the tracker.
+- services/platform-service/src/modules/fleet/types.ts — FleetJobDoc.stage values,
+  SubmitJobSchema (trackerItemId, idempotencyKey, priority, capabilities, budget, kind).
+- services/platform-service/src/modules/fleet/routes.ts — existing fleet route patterns
+  (auth, getRequestProductId(req), Zod parse, productId enforcement). Add new routes here
+  in the SAME style.
+- services/platform-service/src/modules/items/{types,routes,repository}.ts — the Item
+  API contract you mirror to: Item fields (id, productId, title/description, status,
+  labels[]), the status vocabulary, and the comment/note mechanism. Call the items
+  repository DIRECTLY in-process (no HTTP/curl) — this is the whole point of "direct wiring".
+- ../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §10 (tracker
+  integration), §24.5 (echo rule), §14 Phase-2 checklist (the §10 box you will tick).
+
+PREREQUISITE / BRANCHING: branch off CURRENT main -> feat/gigafactory-p2-tracker-wiring.
+Push + open a PR. DO NOT merge.
+
+DELIVERABLES
+
+1. tracker-bridge.ts (NEW) — pure-ish service module (it may call the items + fleet
+   repositories, but no HTTP, no Fastify types inside it):
+   - `ingestItemAsJob(productId, itemId, opts?) -> { job, outcome }`:
+       * read the Item via the items repository (404 -> NotFoundError).
+       * map Item -> SubmitJobInput: title/description -> bodyMd (verbatim instruction);
+         labels carry manifest hints where present (engine-class:*, profile:*,
+         priority:*, cap:* -> capabilities[]); otherwise sane defaults.
+       * set trackerItemId = itemId and a STABLE idempotency-key (e.g. `tracker-<itemId>`),
+         then call coordinator.submitJob — so re-ingest of the same Item dedupes (no
+         duplicate job) and the job is scheduled by the §7 router like any other.
+   - `echoJobToItem(productId, jobId) -> { echoed: status | null, error? }`:
+       * load the job; if it has no trackerItemId -> no-op (return null).
+       * map stage -> Item status (FULL round-trip, both directions of the lifecycle):
+           queued/assigned/building/review/testing -> in_progress
+           shipped                                 -> done
+           failed                                  -> blocked (+ note)
+       * append a comment/note with metrics ONLY (attempts, duration, cost/tokens if
+         present) — NEVER the prompt body / secrets.
+       * IDEMPOTENT: persist the last-echoed status (on the job doc or a small bridge
+         record) and make a re-echo of an unchanged outcome a no-op.
+   - Echo is BEST-EFFORT and downstream: an items-write failure NEVER fails the job —
+     surface it as a logged error / a `{ echoed: null, error }` shape, never throw into
+     the job lifecycle.
+
+2. Wire echo into stage transitions (server-side, opt-in, additive):
+   - When the coordinator/route performs a stage transition for a job that has a
+     trackerItemId, call echoJobToItem (guarded by a config flag, default OFF, e.g.
+     FLEET_TRACKER_ECHO; OFF => behavior byte-for-byte unchanged). Do not block or fail
+     the transition on echo error.
+
+3. Routes (routes.ts, additive — match existing auth/productId style):
+   - POST /fleet/tracker/ingest        { itemId }            -> ingestItemAsJob
+   - POST /fleet/tracker/echo          { jobId }             -> echoJobToItem (manual echo)
+   - All productId-scoped via getRequestProductId(req); a foreign productId cannot ingest
+     or echo another product's Item/job.
+
+TESTS (tracker-bridge.test.ts + route additions — tests are sacred; use @bytelyst/testing and
+the in-memory providers; NO live HTTP):
+- ingest creates exactly one job: Item -> job with trackerItemId set, bodyMd = description,
+  idempotency-key = tracker-<id>; the job is claimable via the normal claimNextJob path.
+- ingest label mapping: labels [engine-class:agentic-coder, priority:high, cap:os:mac]
+  -> job priority/capabilities reflect them.
+- ingest idempotent: ingesting the same Item twice -> one job (dedupe), outcome reflects it.
+- echo round-trip: a job advancing queued->building->shipped drives the Item
+  in_progress -> done, and a metrics-only comment is written (assert NO bodyMd/secret leaks).
+- echo failed -> Item blocked (+ note).
+- echo idempotent: re-echo of an unchanged stage -> no duplicate Item write.
+- echo non-fatal: items-write throws -> echoJobToItem returns { echoed:null,error }, the
+  job state is untouched, the transition still succeeds.
+- echo OFF (default flag): a stage transition performs ZERO items writes.
+- productId isolation: ingest/echo for a foreign productId -> not found / rejected.
+- REGRESSION: every existing fleet + items test stays green.
+
+VERIFY GATE:
+- pnpm --filter @lysnrai/platform-service exec vitest run src/modules/fleet src/modules/items
+- pnpm --filter @lysnrai/platform-service build
+- pnpm build && pnpm test   (no consumer regression)
+
+CONSTRAINTS: ESM `.js` import specifiers; no `any` (Zod inference / explicit types); no
+console.log (use app.log / req.log); every Cosmos doc keeps `productId`; reuse the
+existing `trackerItemId` field and items contract — do NOT fork a parallel schema;
+do NOT change claimNextJob or the scheduler; conventional commits
+(feat(platform-service): ...); do not edit the agent-queue repo.
+
+DOCS: tick §10 "direct tracker->module calls" in
+../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §14 Phase-2 (note the
+flag name + that it is the in-process round-trip; the agent-queue shell adapter remains the
+single-host path).
+
+FINAL OUTPUT — report in EXACTLY this format:
+## Implementation Report — Phase 2 Direct Tracker -> Module Wiring (§10)
+### Branch & commits / PR
+### Files changed
+### What was implemented (ingest mapping, round-trip status map, echo idempotency + flag)
+### Tests added (+ pnpm test summary)
+### Verify gate results
+### Deviations / assumptions (Item status vocabulary matched, flag name, where last-echoed is stored)
+### Suggested next slice (Phase 3 tracker-web fleet control plane)
--- a/agent-queue/docs/jobs/phase2-two-factory-demo.md
+++ b/agent-queue/docs/jobs/phase2-two-factory-demo.md
@ -0,0 +1,91 @@
+---
+engine: devin
+cwd: /Users/sd9235/code/mygh/learning_ai_devops_tools
+yolo: true
+lock: devops-tools-demo
+timeout: 4h
+---
+
+ROLE: Senior engineer. Build the PHASE 2 TWO-FACTORY PARALLEL DEMO — the final
+Phase-2 EXIT-CRITERIA box (§14): >=2 factories executing jobs in parallel via the
+coordinator, proving conflict-free atomic claims, lease fencing, and reaper-reclaim
+end-to-end. This is a DEMO HARNESS + DOCS, not new runtime behavior — agent-queue.sh
+and lib/fleet-client.sh already implement everything; you orchestrate + observe them.
+
+PARALLEL-SAFETY: One other Devin is running in a DIFFERENT repo (learning_ai_common_plat,
+the tracker-wiring slice) — no overlap. In THIS repo you OWN a NEW demo directory and the
+additive selftest/docs only:
+- You OWN (create/edit): agent-queue/demo/two-factory-demo.sh (NEW),
+  agent-queue/demo/README.md (NEW), additive checks in agent-queue/selftest.sh,
+  and the §14 Phase-2 demo/exit-criteria ticks in agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md.
+- You MUST NOT change the behavior of agent-queue.sh or lib/fleet-client.sh. You may READ
+  them and CALL them; if a tiny additive hook is unavoidable, keep it flag-gated and prove
+  all 68 existing selftest checks still pass byte-for-byte.
+- Leave the runtime agent-queue/queue/* working-tree artifacts ALONE (live-daemon state,
+  not yours) — never stage or commit them.
+
+READ FIRST:
+- agent-queue/agent-queue.sh — the run loop, AQ_FLEET / AQ_FLEET_ROUTE flags, claim path,
+  fencing/quarantine, offline-degrade.
+- agent-queue/lib/fleet-client.sh — fleet_register/heartbeat, claim, lease renew, fenced
+  PATCH, the coordinator HTTP wrappers and their env (AQ_FLEET_API, AQ_FLEET_TOKEN, factory id).
+- agent-queue/selftest.sh — how the EXISTING fleet tests STUB the coordinator (the canned
+  responder pattern). Reuse that exact stub style so the demo's selftest needs NO live service.
+- ../learning_ai_common_plat/services/platform-service/src/modules/fleet/coordinator.ts —
+  the claim/lease/fence/reaper contract you are demonstrating (read-only; do not edit).
+- agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §14 Phase-2 "Two-factory demo" + "Exit criteria".
+
+DELIVERABLES
+
+1. agent-queue/demo/two-factory-demo.sh — an orchestration script that:
+   - Starts >=2 factories (distinct factoryIds, e.g. mac-1 + ubuntu-1) against ONE
+     coordinator with AQ_FLEET=1 AQ_FLEET_ROUTE=1, each in its own working dir/queue so
+     they do not share local inbox state — they compete ONLY through the coordinator.
+   - Submits 3 jobs and lets the two factories drain them in parallel.
+   - DEMONSTRATES + ASSERTS the Phase-2 exit guarantees:
+       (a) no double-assign: each job is claimed/executed by exactly ONE factory.
+       (b) fencing: kill a factory MID-JOB -> the reaper returns the job -> the OTHER
+           factory reclaims and completes it AND the dead worker's late/zombie report is
+           FENCED (rejected, never shipped).
+       (c) parallelism: both factories make progress concurrently (not serialized).
+   - Prints a clear PASS/FAIL summary (per-job winner, reclaim event, fence event).
+   - DUAL MODE: works against a real coordinator when AQ_FLEET_API/AQ_FLEET_TOKEN are set;
+     otherwise drives the SAME selftest coordinator STUB so the demo is runnable + CI-safe
+     with zero external deps. Document both invocations.
+   - bash, mac+linux safe, curl-only, no new runtime deps; style consistent with the repo.
+
+2. agent-queue/demo/README.md — how to run the demo (stub mode + real-coordinator mode),
+   the env vars, what each asserted guarantee proves, and a short "what to watch" guide
+   (the kanban/log lines that show the reclaim + fence).
+
+3. selftest.sh — ADD a small number of checks (do NOT modify the existing 68) that run the
+   demo in STUB mode headlessly and assert: 3 jobs all reach a terminal state across the 2
+   factories with no double-assignment; the kill -> reclaim -> fenced-zombie path fires;
+   exit 0. Keep them fast + deterministic (seeded, no real sleeps where avoidable).
+
+TESTS / VERIFY GATE:
+- bash agent-queue/selftest.sh  -> all prior 68 + the new demo checks green, exit 0.
+- bash -n agent-queue/demo/two-factory-demo.sh && bash -n agent-queue/agent-queue.sh
+  && bash -n agent-queue/lib/fleet-client.sh  -> OK.
+- shellcheck --severity=error on the new script + the two core scripts -> clean.
+- node --check agent-queue/dashboard.mjs -> OK (must remain unchanged).
+
+CONSTRAINTS: do NOT alter agent-queue.sh / fleet-client.sh runtime behavior; reuse the
+existing coordinator stub pattern; never commit queue/* runtime artifacts; mac+linux safe;
+no emojis; conventional commits (feat(agent-queue): ...); tests sacred (the 68 stay green).
+
+DOCS: tick the §14 Phase-2 "Two-factory demo" box and, once the demo asserts all three
+guarantees, the Phase-2 "Exit criteria" line in GIGAFACTORY_ROADMAP.md — set §0 Phase 2 ->
+complete (or note the exact remaining %). This is the box that closes Phase 2.
+
+FINAL OUTPUT — report in EXACTLY this format:
+## Implementation Report — Phase 2 Two-Factory Parallel Demo (Exit Criteria)
+### Branch & commits / PR
+- branch / based-on: feat/gigafactory-p2-two-factory-demo off current main
+### Files changed
+### What was implemented (orchestration, the 3 asserted guarantees, stub vs real mode)
+### Tests added (+ selftest PASS/FAIL summary: prior 68 + new)
+### Verify gate results (selftest / bash -n / shellcheck / node --check)
+### Deviations / assumptions (how factories are isolated, how kill/reclaim is simulated in stub)
+### Phase 2 status (which §14 boxes now complete; exit criteria met Y/N; what (if anything) remains)
+### Suggested next slice (Phase 3 — tracker-web fleet control plane + DAG + budgets)
--- a/agent-queue/docs/jobs/phase3-overnight.md
+++ b/agent-queue/docs/jobs/phase3-overnight.md
@ -0,0 +1,162 @@
+---
+engine: devin
+cwd: /Users/sd9235/code/mygh/learning_ai_common_plat
+yolo: true
+lock: common-plat-phase3
+timeout: 10h
+---
+
+ROLE: Senior full-stack engineer. Implement PHASE 3 of the Agent Gigafactory END-TO-END
+in `learning_ai_common_plat`, SEQUENTIALLY, over a long unattended run: smart routing
+(tunable weights + preemption), DAG job decomposition, per-product budgets, and the
+tracker-web fleet control plane. Work SLICE BY SLICE; each slice is self-contained,
+fully tested, and pushed before the next begins. This is an overnight run — favor
+correctness, small verifiable steps, and never leaving main/PR in a broken state.
+
+================================================================================
+PREREQUISITE (the operator guarantees this before starting): Phase 2 is COMPLETE and
+merged to origin/main — fleet foundation, atomic claim, scheduler/router core, artifacts,
+enrollment+tokens, feature-flags/shadow, the in-process tracker->module wiring, and the
+two-factory demo are ALL on main. You branch off CURRENT origin/main.
+================================================================================
+
+GLOBAL GUARDRAILS (unattended danger mode — obey strictly):
+- Branch: feat/gigafactory-phase3 off CURRENT origin/main. ONE long-lived branch; ONE
+  commit per slice (conventional commits). Push after EVERY slice. Open ONE PR after
+  Slice 1 and keep pushing to it. DO NOT MERGE anything. DO NOT touch origin/main.
+- Tests are SACRED: never delete, weaken, skip, or `.skip`/`.only` a test to go green.
+  If you cannot make a slice pass honestly, see the FAILURE PROTOCOL below.
+- A slice is "done" only when its VERIFY GATE is fully green. Never start slice N+1 with
+  slice N red.
+- Reserved / DO NOT TOUCH: the agent-queue repo (different repo), unrelated services
+  (cowork-service, extraction-service), packages/* internals (consume, don't edit),
+  and any backup/* or dependabot/* branches. Stay in services/platform-service +
+  dashboards/tracker-web (plus the docs/ files named under CHECKPOINTING and Slice 5).
+- Conventions: ESM `.js` import specifiers; no `any`; no console.log (use app.log/req.log,
+  and the tracker-web logger/telemetry pattern); every Cosmos doc carries `productId`;
+  reuse @bytelyst/* packages and existing module patterns (types.ts -> repository.ts ->
+  routes.ts). Do NOT hardcode colors/URLs/secrets.
+- CHECKPOINTING: maintain docs/GIGAFACTORY/gigafactory-phase3-progress.md on the branch. After each
+  slice, record: slice name, status (DONE/WIP/FAILED), commit sha, verify-gate result,
+  and any follow-ups. Commit it WITH the slice. If you resume after an interruption, read
+  it first and continue from the first not-DONE slice.
+
+FAILURE PROTOCOL (per slice): attempt the verify gate up to 3 times, fixing the ROOT
+cause each time (not the test). If still red after 3 honest attempts: commit the WIP with
+message `wip(<scope>): <slice> — BLOCKED: <one-line reason>`, mark it FAILED in
+progress.md with the exact failing output, and MOVE ON to the next slice that does NOT
+depend on it (dependencies noted per slice). Never thrash; never fake green.
+
+READ FIRST:
+- ../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md — §7 (scoring;
+  Phase-3 = tunable weights + preemption), §5/§6 (DAG/deps), §11/§13 (budgets), §14
+  Phase-3 checklist + Exit criteria, §16 Definition-of-Done.
+- services/platform-service/src/modules/fleet/{scheduler,coordinator,repository,routes,
+  types}.ts — the engine you extend (read the existing claim/lease/fence/selectJob).
+- dashboards/tracker-web/ — match its App-Router structure (src/app, src/app/api),
+  data-fetching/auth pattern, @bytelyst/ui + design-tokens usage, vitest + Playwright
+  (e2e/, playwright.config.ts) setup. The existing fleet HTTP API you consume:
+  POST/GET /fleet/jobs, GET /fleet/jobs/:id, PATCH /fleet/jobs/:id, POST /fleet/claim,
+  lease renew/release, POST /fleet/factories/heartbeat|enroll, token rotate/revoke,
+  GET /fleet/jobs/:id/runs, GET /fleet/jobs/:id/events, artifacts upload/list/get/delete.
+
+--------------------------------------------------------------------------------
+SLICE 1 — Tunable scoring weights + preemption (backend; depends on: nothing)
+--------------------------------------------------------------------------------
+Extend the PURE scheduler (scheduler.ts) without breaking §7 Phase-2 behavior:
+- Weights become configurable per-product/per-request (passed in; fixed defaults preserved
+  so existing tests stay green). Add a small typed FleetWeightConfig resolver (defaults ->
+  optional product override). NO env reads inside the pure module.
+- Preemption: a `selectWithPreemption(candidates, runningJobs, factory, ctx, weights?)`
+  that, when a CRITICAL job cannot be placed and only lower-priority jobs are running,
+  returns a preemption decision { evict: jobId, reason, breakdown } — PURE, no I/O.
+- Wire preemption into the coordinator behind a flag (FLEET_PREEMPTION, default OFF; OFF =
+  byte-for-byte current behavior). Eviction must checkpoint + requeue the victim via the
+  EXISTING fenced-requeue path (bump leaseEpoch; the zombie's late report is fenced).
+TESTS: weight override changes ranking; defaults reproduce all prior picks; preemption
+evicts only a strictly-lower-priority running job, never an equal/higher; victim is
+requeued with checkpoint + bumped epoch and its stale report is fenced; flag OFF = no
+preemption. VERIFY GATE: pnpm --filter @lysnrai/platform-service exec vitest run
+src/modules/fleet && pnpm --filter @lysnrai/platform-service build && (pnpm build && pnpm test).
+
+--------------------------------------------------------------------------------
+SLICE 2 — DAG job decomposition (backend; depends on: nothing; independent of S1)
+--------------------------------------------------------------------------------
+Parent/child jobs with dependency-gated execution (§5/§6):
+- types: a job may declare children (subtasks) and dependsOn[] (sibling/child ids). Reuse
+  existing kind ('leaf'|...) + parentId; add child submission + a DAG edge model. Cycle
+  detection at submit (extend the existing submit-time cycle check).
+- coordinator: a parent is not claimable until its children reach a terminal state (or its
+  declared deps are satisfied); completing the last child unblocks the parent. claimNextJob
+  only returns deps-satisfied jobs (extend the existing predicate). Fan-out: submitting a
+  parent with children atomically creates the children.
+- routes (additive): POST /fleet/jobs/:id/children (submit children), GET /fleet/jobs/:id/dag
+  (return the subtree + per-node stage). productId-scoped.
+TESTS: parent blocked until children done; last child completion unblocks parent; cycle at
+submit -> rejected; capability/priority still respected per node; DAG endpoint returns the
+correct subtree; all prior fleet tests green. VERIFY GATE as in Slice 1 (+ items unaffected).
+
+--------------------------------------------------------------------------------
+SLICE 3 — Per-product budgets + pause/resume (backend; depends on: nothing)
+--------------------------------------------------------------------------------
+Cost ceilings that pause routing (§11/§13):
+- A FleetBudgetDoc per productId (ceilingUsd, window, spentUsd, status active|paused).
+  Spend accrues from job run cost (reuse run/insights cost if present; else estimate from
+  budget.usd at completion). Container partitioned by /productId.
+- Enforcement in claimNextJob: if the product's budget is paused or the next job would
+  exceed the ceiling, that product's jobs are NOT claimed (other products unaffected).
+  Behind FLEET_BUDGETS (default OFF = unchanged).
+- routes (additive): GET/PUT /fleet/budgets/:productId, POST /fleet/budgets/:productId/pause,
+  POST /fleet/budgets/:productId/resume.
+TESTS: under ceiling -> claims proceed; crossing ceiling -> that product pauses, others
+still claim; manual pause blocks claims; resume restores; flag OFF = no enforcement;
+spend accounting is monotonic + idempotent per run. VERIFY GATE as above.
+
+--------------------------------------------------------------------------------
+SLICE 4 — tracker-web Fleet Control Plane UI (frontend; depends on: S1-S3 endpoints,
+          but build defensively — feature-detect/degrade if an endpoint is absent)
+--------------------------------------------------------------------------------
+A new `/fleet` section in dashboards/tracker-web (App Router), matching existing patterns:
+- Typed fleet API client (src/lib or src/app/api proxy as the repo does it) wrapping the
+  fleet endpoints with auth token injection (reuse the existing auth/client pattern).
+- Pages/components (use @bytelyst/ui + --*-tokens; every interactive element has an
+  aria-label or visible label):
+    * Fleet map: factories (id, caps, health, load, lease state) as live cards.
+    * Job table: filter by product/stage/priority; submit-job modal; row -> job detail.
+    * Job detail: stage timeline from /events, runs from /runs, artifacts list, a SHIP
+      action (PATCH stage), and the DAG subtree (from /dag) when present.
+    * Budget panel: per-product ceiling + spent + pause/resume controls.
+- Live updates via polling (simple, robust) unless an SSE/stream endpoint exists.
+TESTS: vitest component/unit tests for the client + key components (render, actions call
+the right endpoint, error/empty/degraded states); Playwright e2e for the core flow
+(see fleet map -> open a job -> ship; pause a budget -> resume). VERIFY GATE:
+the tracker-web `verify` script (typecheck + lint + test + e2e) green — run exactly what
+its package.json defines (e.g. pnpm --filter <tracker-web> run verify, or the documented
+equivalent). Do not weaken its lint/e2e config.
+
+--------------------------------------------------------------------------------
+SLICE 5 — Docs + roadmap + Phase-3 exit criteria (depends on: S1-S4 outcomes)
+--------------------------------------------------------------------------------
+- Update ../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §14 Phase-3
+  checkboxes for every box you actually completed, with a one-line note + the flag names
+  (FLEET_PREEMPTION/FLEET_BUDGETS) and which are default-OFF. Tick the Phase-3 Exit-criteria
+  line ONLY if its conditions are genuinely met; otherwise note the exact remaining %.
+  (This is a docs edit in the OTHER repo — make it as a separate small commit/PR in
+  learning_ai_devops_tools, OR include the roadmap delta as a patch file under
+  docs/ in THIS branch and note it for the operator — do NOT entangle the two repos'
+  git history. Prefer the patch-file note if a clean cross-repo PR isn't trivial.)
+- Update dashboards/tracker-web/README + a short docs/GIGAFACTORY/FLEET_CONTROL_PLANE.md (how to use
+  the new UI, the flags, the endpoints consumed).
+- Finalize docs/GIGAFACTORY/gigafactory-phase3-progress.md with the end-state of every slice.
+
+FINAL OUTPUT — print ONE consolidated report in EXACTLY this format:
+## Implementation Report — Phase 3 (overnight)
+### Branch & PR
+### Per-slice results
+| slice | status (DONE/WIP/FAILED) | commit | verify gate | notes |
+### What was implemented (per slice: key files, flags, endpoints, UI surfaces)
+### Tests added (counts per area + the final verify-gate output per slice)
+### Deviations / assumptions (weight defaults, budget accounting source, polling vs SSE, any degraded UI paths)
+### Phase 3 status (which §14 boxes now complete; exit criteria met Y/N; remaining %)
+### Anything that needs a human decision (esp. risky majors, cross-repo roadmap tick)
+### Suggested next phase (Phase 4 — message bus + autoscaling + capability marketplace)
--- a/agent-queue/launchd/README.md
+++ b/agent-queue/launchd/README.md
@ -0,0 +1,70 @@
+# Boot-persistence: agent-queue as a macOS LaunchAgent
+
+Auto-start the `agent-queue` run loop on login and bring it back after a
+**crash** (restarted automatically) or a **reboot / logout** (restarted at your
+next login) — the failure modes that `tmux` + `caffeinate` alone can't cover.
+
+| Layer | Survives terminal close | Survives sleep | Survives reboot |
+| ----- | :---------------------: | :------------: | :-------------: |
+| plain shell | no | no | no |
+| `tmux` | yes | no | no |
+| `caffeinate` | n/a | yes | no |
+| **LaunchAgent (this)** | yes | yes (via caffeinate) | **yes** |
+
+## Install
+
+```bash
+bash launchd/install.sh             # render plist, load, start now (RunAtLoad + KeepAlive)
+tail -f ~/Library/Logs/agent-queue/agent-queue.out.log
+```
+
+It renders `~/Library/LaunchAgents/com.bytelyst.agent-queue.plist` from the
+resolved repo path (works on any clone) and bootstraps it into your GUI session.
+
+## Use
+
+The LaunchAgent runs `agent-queue-boot.sh`, which wraps `agent-queue run` in
+`caffeinate`. Just drop prompt `.md` files into `queue/inbox/` — they get picked
+up automatically, now or after the next reboot.
+
+```bash
+aq add ~/jobs/phase3-overnight.md --engine codex   # or drop the file in queue/inbox/
+aqs                                                 # status
+```
+
+## Configure (no need to edit the plist)
+
+Put overrides in `~/.agent-queue.env` (untracked — also the place for tokens):
+
+```bash
+AGENT_QUEUE_ENGINE=codex   # codex (recommended: local repo) | devin | claude
+AGENT_QUEUE_MAX=1          # concurrent jobs on this host (default 3)
+# AGENT_QUEUE_NO_CAFFEINATE=1   # allow the Mac to idle-sleep (NOT for overnight runs)
+# DEVIN_BIN=/custom/path/devin  # if a CLI isn't on the default PATH
+```
+
+## Stop / uninstall
+
+```bash
+bash launchd/install.sh --uninstall   # bootout + remove plist (queued jobs stay put)
+```
+
+## Notes & gotchas
+
+- **codex vs devin:** for a local monorepo overnight runner, **codex** is the
+  default — it runs in-repo so `@bytelyst/*` workspace links resolve locally and
+  logs/token-usage parsing already work. Use **devin** when you want a cloud
+  sandbox doing the heavy lifting (and ACUs/network aren't a concern).
+- **Power:** caffeinate wraps the long-lived loop, so the Mac stays awake the
+  whole time the LaunchAgent runs. That's intended for a dedicated runner. Set
+  `AGENT_QUEUE_NO_CAFFEINATE=1` if you'd rather let it idle-sleep when no job is
+  active. Keep it plugged in with the lid open for true overnight runs.
+- **PATH:** launchd starts processes with a minimal `PATH`. Both the plist
+  (`EnvironmentVariables`) and the wrapper repair it, but if a CLI lives
+  somewhere unusual, point at it explicitly via `~/.agent-queue.env`.
+- **Dangerous mode:** jobs run `--yolo` (auto-approve) by default. The safety net
+  is the agent-queue lifecycle itself — jobs land in `review/` → `testing/` and
+  **shipping is always a manual human gate**. Never let an unattended run touch
+  `main`; push to a branch and open one PR.
+- **Auth:** cache `gh auth login` / git credentials and the agent CLI's auth
+  before relying on it overnight, or the first `push` will block forever.
--- a/agent-queue/launchd/install.sh
+++ b/agent-queue/launchd/install.sh
@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+#
+# install.sh — install (or remove) the macOS LaunchAgent that auto-starts the
+# agent-queue run loop on login and keeps it alive across reboot/crash.
+#
+#   bash launchd/install.sh             # render plist, load, and start now
+#   bash launchd/install.sh --uninstall # stop, unload, and remove the plist
+#
+# The plist is generated from the resolved repo path so it works on any clone.
+# Logs land in ~/Library/Logs/agent-queue/.
+#
+# Strict mode: abort on a failing command, an unset variable, or a failed
+# pipeline stage.
+set -euo pipefail
+
+# Resolve this script's directory and its parent (the agent-queue directory) as
+# physical paths (pwd -P), independent of the caller's working directory.
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd -P)"
+AQ_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd -P)"
+# Wrapper script that the generated plist runs via /bin/bash (ProgramArguments).
+WRAPPER="$AQ_DIR/agent-queue-boot.sh"
+
+# launchd identifiers: service label, per-user plist path, log directory, and
+# the caller's GUI domain target (gui/<uid>) passed to the launchctl calls below.
+LABEL="com.bytelyst.agent-queue"
+PLIST="$HOME/Library/LaunchAgents/$LABEL.plist"
+LOG_DIR="$HOME/Library/Logs/agent-queue"
+UID_NUM="$(id -u)"
+DOMAIN="gui/$UID_NUM"
+
+# Platform guard: refuse to run on anything but macOS (uname -s == Darwin).
+if [ "$(uname -s)" != "Darwin" ]; then
+  echo "install.sh: macOS only (LaunchAgents). On Linux use a systemd --user unit." >&2
+  exit 1
+fi
+
+# uninstall — boot the service out of the GUI domain and delete its plist.
+# Reads the globals LABEL, DOMAIN and PLIST; prints progress to stdout.
+uninstall() {
+  printf '%s\n' "[launchd] booting out $LABEL ..."
+  # A bootout failure is tolerated (e.g. the service is not currently loaded).
+  launchctl bootout "$DOMAIN/$LABEL" 2>/dev/null || true
+  rm -f "$PLIST"
+  printf '%s\n' \
+    "[launchd] removed $PLIST" \
+    "[launchd] (the run loop is stopped; queued jobs stay in queue/inbox/)"
+}
+
+if [ "${1:-}" = "--uninstall" ] || [ "${1:-}" = "-u" ]; then
+  uninstall
+  exit 0
+fi
+
+[ -f "$WRAPPER" ] || { echo "install.sh: missing $WRAPPER" >&2; exit 1; }
+chmod +x "$WRAPPER" "$AQ_DIR/agent-queue.sh" 2>/dev/null || true
+mkdir -p "$HOME/Library/LaunchAgents" "$LOG_DIR"
+
+echo "[launchd] writing $PLIST"
+cat > "$PLIST" <<EOF
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>Label</key>
+    <string>$LABEL</string>
+
+    <key>ProgramArguments</key>
+    <array>
+        <string>/bin/bash</string>
+        <string>$WRAPPER</string>
+    </array>
+
+    <!-- Start on login and restart if it ever exits non-zero (crash/reboot). -->
+    <key>RunAtLoad</key>
+    <true/>
+    <key>KeepAlive</key>
+    <dict>
+        <key>SuccessfulExit</key>
+        <false/>
+    </dict>
+    <!-- Guard against tight crash loops. -->
+    <key>ThrottleInterval</key>
+    <integer>30</integer>
+
+    <key>WorkingDirectory</key>
+    <string>$AQ_DIR</string>
+
+    <key>StandardOutPath</key>
+    <string>$LOG_DIR/agent-queue.out.log</string>
+    <key>StandardErrorPath</key>
+    <string>$LOG_DIR/agent-queue.err.log</string>
+
+    <!-- launchd's PATH is minimal; the wrapper also repairs PATH defensively. -->
+    <key>EnvironmentVariables</key>
+    <dict>
+        <key>PATH</key>
+        <string>$HOME/.local/bin:/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin</string>
+        <key>AGENT_QUEUE_ENGINE</key>
+        <string>codex</string>
+    </dict>
+</dict>
+</plist>
+EOF
+
+# Reload cleanly. bootout first so a re-run picks up plist changes (its failure
+# is ignored — typically nothing was loaded yet). enable BEFORE bootstrap:
+# launchd refuses to bootstrap a label that is marked disabled, and under
+# `set -e` that failure aborts the script, so enabling afterwards could never
+# repair a disabled service.
+launchctl bootout "$DOMAIN/$LABEL" 2>/dev/null || true
+launchctl enable "$DOMAIN/$LABEL"
+launchctl bootstrap "$DOMAIN" "$PLIST"
+# -k: kill any running instance so the freshly loaded definition starts now.
+launchctl kickstart -k "$DOMAIN/$LABEL"
+
+# Post-install summary: how to inspect, tail and stop the service, and where to
+# drop jobs. The unquoted heredoc expands the same variables the echo form did.
+cat <<SUMMARY
+[launchd] installed + started: $LABEL
+[launchd] status : launchctl print $DOMAIN/$LABEL | sed -n '1,20p'
+[launchd] logs   : tail -f $LOG_DIR/agent-queue.out.log
+[launchd] stop   : bash $SCRIPT_DIR/install.sh --uninstall
+
+Drop prompt .md files into: $AQ_DIR/queue/inbox/
+Override engine/concurrency/secrets in ~/.agent-queue.env (e.g. AGENT_QUEUE_MAX=1).
+SUMMARY
--- a/agent-queue/lib/fleet-client.sh
+++ b/agent-queue/lib/fleet-client.sh
@ -0,0 +1,578 @@
+# shellcheck shell=bash
+# ── Fleet coordinator client (Phase 2, §7/§8/§9/§18) ────────────────
+#
+# Sourced by agent-queue.sh. Lets the single-host runner act as a "factory" that
+# registers / heartbeats / claims / reports against the platform-service `fleet`
+# coordinator — BEHIND the AQ_FLEET flag. When AQ_FLEET is unset/0, every function
+# here is an immediate no-op and the offline git-queue path is byte-for-byte
+# unchanged. curl-only + POSIX awk (reuses agent-queue.sh helpers: log/err,
+# _meta_val, _json_str, _json_escape, detect_capabilities, active_workers, CURL_BIN).
+#
+# Contract (routes under AQ_FLEET_API, which already includes /api):
+#   POST /fleet/factories/heartbeat   {factoryId, capabilities[], health, load}
+#   POST /fleet/claim                 {factoryId, capabilities[], leaseSeconds}
+#                                       -> {claimed, job{id,bodyMd,leaseEpoch}, lease{...}}
+#   PATCH /fleet/jobs/:id             {stage, leaseEpoch, checkpoint?}  (409 = fenced)
+#   POST  /fleet/jobs/:id/lease/renew {leaseEpoch, leaseSeconds}        (409 = fenced)
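+#   POST  /fleet/jobs/:id/lease/release {leaseEpoch, stage?}
+#                                       (fleet_report_insights also sends insights?,
+#                                        result?, prUrl?, branch?, prState?)
+#   GET   /fleet/queue-state          -> numeric "version" (cheap poll for the §M0 RU gate)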
+# The coordinator owns leaseEpoch fencing + writes fleet_events server-side; there
+# is no client-side "register" or "append event" call (register == first heartbeat).
+
+# ── Config (env-overridable) ────────────────────────────────────────
+# Each knob keeps a caller-supplied value and only falls back to the default
+# shown when the variable is unset or empty.
+: "${AQ_FLEET:=0}"                                # master switch (0 = offline)
+: "${AQ_FLEET_API:=http://localhost:4003/api}"    # base URL incl. /api
+# Normalize: platform-service mounts the fleet routes under /api. Drop one
+# trailing slash, then append /api unless the URL already ends with it, so the
+# natural form AQ_FLEET_API=http://host:4003 works instead of 404ing every call.
+AQ_FLEET_API="${AQ_FLEET_API%/}"
+case "$AQ_FLEET_API" in
+  */api) ;;
+  *) AQ_FLEET_API="${AQ_FLEET_API}/api" ;;
+esac
+: "${AQ_FLEET_TOKEN:=}"                           # bearer; never hardcode
+# AQ_PRODUCT_ID is shared with the Slice-4 tracker config (X-Product-Id header).
+# Default factory id: sanitized short hostname (or "factory") plus this shell's PID.
+if [[ -z "${AQ_FACTORY_ID:-}" ]]; then
+  AQ_FACTORY_ID="$( (hostname -s 2>/dev/null || hostname 2>/dev/null || echo factory) | tr -cd 'A-Za-z0-9._-')-$$"
+fi
+: "${AQ_FLEET_LEASE_RENEW_SEC:=300}"              # heartbeat/renew cadence
+: "${AQ_FLEET_LEASE_SECONDS:=900}"                # requested lease duration
+: "${AQ_FLEET_CAPS:=}"                            # override caps (comma/space list)
+: "${AQ_FLEET_CWD:=$PWD}"                         # cwd for claimed fleet jobs
+: "${AQ_FLEET_API_CMD:=}"                         # test seam (stub script)
+AQ_FLEET_HB_TS=0                                  # last heartbeat epoch (mutable)
+
+# ── Slice 4: feature-flag levels (three explicit, independently-toggleable) ──
+# Precedence (documented in README §Cutover):
+#   AQ_FLEET=0          ⇒ pure offline, ZERO coordinator calls (master switch).
+#   AQ_FLEET_ROUTE=1    ⇒ route_via_service: coordinator is AUTHORITATIVE for claim
+#     (default; preserves the P2-S3 behavior).
+#   AQ_FLEET_ROUTE=0    ⇒ LOCAL inbox is authoritative (coordinator not used to
+#     source work) — the pre-cutover state.
+#   AQ_FLEET_SHADOW=1   ⇒ shadow/dual-run (requires AQ_FLEET=1 AND AQ_FLEET_ROUTE=0):
+#     run the normal offline path as authoritative AND query the coordinator in
+#     parallel WITHOUT acting on its responses, purely to record divergence.
+#   AQ_FLEET_ROUTE=1 together with AQ_FLEET_SHADOW=1: ROUTE WINS and shadow is
+#   disabled (a one-shot warning is logged) — never shadow and route at once.
+: "${AQ_FLEET_ROUTE:=1}"
+# AQ_FLEET_AUTOSHIP=1 ⇒ when the factory's local verify gate passes, advance the
+# coordinator job testing -> shipped (the factory's verify IS the test phase).
+# Default 0 keeps the human review gate authoritative (job rests at testing).
+: "${AQ_FLEET_AUTOSHIP:=0}"
+# AQ_FLEET_PR=1 ⇒ for jobs that carry a `repo`, run the agent in an isolated
+# checkout on branch aq/job/<id>, then commit/push and open a PR; the PR URL is
+# reported back and recorded on the run. Checkouts are cached under AQ_FLEET_REPOS_DIR.
+: "${AQ_FLEET_PR:=0}"
+: "${AQ_FLEET_REPOS_DIR:=}"   # default resolved to $STATE/repos at call time
+: "${AQ_FLEET_SHADOW:=0}"
+# Isolated factory id for the read-only shadow claim (never the real factory id).
+: "${AQ_FLEET_SHADOW_FACTORY_ID:=${AQ_FACTORY_ID}-shadow}"
+# Shadow divergence log (default resolved to $STATE/fleet-shadow.log at call time).
+: "${AQ_FLEET_SHADOW_LOG:=}"
+# Per-process mutable state (always reset when this file is sourced).
+_AQ_FLEET_SHADOW_WARNED=0   # one-shot ROUTE>SHADOW precedence warning (per process)
+SHADOW_COORD_JOB=""         # set by fleet_shadow_claim: would-be coordinator job id
+
+# ── §M0 RU gate (docs/GIGAFACTORY/FLEET_DISPATCH_REDESIGN.md §8/§12) ──
+# When ON, the run loop point-reads a cheap per-product queue version
+# (GET /fleet/queue-state, ~1 RU) and SKIPS the expensive claim while nothing has
+# changed and we are not mid-drain — slashing idle Cosmos RU. Opt-in (default OFF):
+# nothing changes unless AQ_FLEET_GATE=1, and the gate always FAILS OPEN (claims)
+# on any read error so work is never stranded.
+: "${AQ_FLEET_GATE:=0}"
+# Force a full claim at least this often even when the gate is unchanged (backstops
+# a missed/raced version bump). 0 disables the periodic backstop.
+: "${AQ_FLEET_GATE_SAFETY_SEC:=300}"
+# Per-process mutable gate state (always reset when this file is sourced).
+AQ_FLEET_GATE_SEEN=""        # last-seen queue version
+AQ_FLEET_GATE_TS=0           # epoch of the last full (drained) claim attempt
+AQ_FLEET_GATE_DRAINING=1     # 1 = keep claiming (last claim got a job / startup)
+
+# fleet_enabled — true iff the coordinator integration is switched on.
+fleet_enabled() { [[ "${AQ_FLEET:-0}" == 1 ]]; }
+
+# fleet_route_enabled — coordinator is authoritative for claim/assignment (ROUTE=1).
+fleet_route_enabled() { fleet_enabled && [[ "${AQ_FLEET_ROUTE:-1}" == 1 ]]; }
+
+# fleet_shadow_enabled — succeeds only for the exact shadow/dual-run combination:
+# AQ_FLEET=1, AQ_FLEET_ROUTE=0 and AQ_FLEET_SHADOW=1. Pure predicate (no logging);
+# with ROUTE=1 it fails (ROUTE wins) and fleet_flags_warn_once owns the warning.
+fleet_shadow_enabled() {
+  if fleet_enabled && [[ "${AQ_FLEET_ROUTE:-1}" == 0 && "${AQ_FLEET_SHADOW:-0}" == 1 ]]; then
+    return 0
+  fi
+  return 1
+}
+
+# fleet_flags_warn_once — tell the operator (at most once per process) that
+# AQ_FLEET_ROUTE=1 combined with AQ_FLEET_SHADOW=1 suppresses shadow mode.
+# Silent for every other flag combination and when fleet is off. Always returns 0.
+fleet_flags_warn_once() {
+  fleet_enabled || return 0
+  [[ "${AQ_FLEET_ROUTE:-1}" == 1 ]] || return 0
+  [[ "${AQ_FLEET_SHADOW:-0}" == 1 ]] || return 0
+  # Already warned in this process -> stay quiet.
+  [[ "${_AQ_FLEET_SHADOW_WARNED:-0}" == 1 ]] && return 0
+  err "fleet: AQ_FLEET_ROUTE=1 and AQ_FLEET_SHADOW=1 — ROUTE wins; shadow/dual-run is DISABLED. Set AQ_FLEET_ROUTE=0 to shadow."
+  _AQ_FLEET_SHADOW_WARNED=1
+  return 0
+}
+
+# fleet_flags_state — one-line resolved flag summary (for `status` / `fleet-status`).
+# Prints (no trailing newline) the master switch, the routing mode and the
+# effective shadow state, each alongside the raw variable it was derived from.
+# The master switch is reported from the real AQ_FLEET value rather than a
+# hard-coded 1, so the summary stays truthful if it is printed while fleet is off;
+# with AQ_FLEET=1 the output is unchanged.
+fleet_flags_state() {
+  local route shadow
+  if [[ "${AQ_FLEET_ROUTE:-1}" == 1 ]]; then route="route_via_service"; else route="local-authoritative"; fi
+  # fleet_shadow_enabled already folds in AQ_FLEET and the ROUTE>SHADOW precedence.
+  if fleet_shadow_enabled; then shadow="shadow=ON"; else shadow="shadow=off"; fi
+  printf 'AQ_FLEET=%s route=%s(AQ_FLEET_ROUTE=%s) %s(AQ_FLEET_SHADOW=%s)' \
+    "${AQ_FLEET:-0}" "$route" "${AQ_FLEET_ROUTE:-1}" "$shadow" "${AQ_FLEET_SHADOW:-0}"
+}
+
+# ── HTTP (curl only; same output contract as the Slice-4 tracker_api) ──
+# fleet_api <METHOD> <PATH> [JSON] -> response body, then a final HTTP-code line.
+#   METHOD  HTTP verb passed to curl -X.
+#   PATH    route appended verbatim to AQ_FLEET_API.
+#   JSON    optional request body (sent with --data when non-empty).
+# When AQ_FLEET_API_CMD is set, the call is delegated to that stub (test seam) and
+# its exit status is returned. Otherwise the last output line is the HTTP status,
+# or 000 when curl itself failed (timeout, connection refused, missing binary…).
+# Optional variables are read with ${VAR:-} so the function is safe under `set -u`.
+# NOTE(review): the bearer token travels on curl's argv and is therefore visible
+# in the process list to other local users — consider a curl config file/stdin.
+fleet_api() {
+  local method=$1 path=$2 body=${3:-}
+  if [[ -n "${AQ_FLEET_API_CMD:-}" ]]; then
+    "$AQ_FLEET_API_CMD" "$method" "$path" "$body"
+    return $?
+  fi
+  local url="${AQ_FLEET_API}${path}"
+  local -a args=(-sS -m "${AQ_FLEET_TIMEOUT:-30}" -X "$method"
+    -H "Content-Type: application/json" -w '\n%{http_code}')
+  [[ -n "${AQ_FLEET_TOKEN:-}" ]] && args+=(-H "Authorization: Bearer $AQ_FLEET_TOKEN")
+  [[ -n "${AQ_PRODUCT_ID:-}" ]] && args+=(-H "X-Product-Id: $AQ_PRODUCT_ID")
+  [[ -n "$body" ]] && args+=(--data "$body")
+  local out rc
+  out=$("$CURL_BIN" "${args[@]}" "$url" 2>/dev/null); rc=$?
+  if [[ $rc -ne 0 ]]; then
+    # curl still emits its -w line on a failed transfer; drop it so the forced
+    # 000 is the ONLY status line and no stray code ends up inside the body.
+    out=${out%$'\n'*}
+    printf '%s\n000\n' "$out"
+  else
+    printf '%s\n' "$out"
+  fi
+}
+
+# _fleet_call <METHOD> <PATH> [JSON] — run fleet_api and split its output into two
+# globals: FLEET_CODE (the final line) and FLEET_BODY (everything before it).
+_fleet_call() {
+  local raw
+  raw=$(fleet_api "$@")
+  FLEET_CODE=$(tail -n1 <<< "$raw")
+  FLEET_BODY=$(sed '$d' <<< "$raw")
+}
+
+# _fleet_json_num <key> (reads JSON on stdin) -> first numeric value for key.
+_fleet_json_num() {
+  grep -oE "\"$1\"[[:space:]]*:[[:space:]]*-?[0-9]+" | head -1 | grep -oE -- '-?[0-9]+$'
+}
+
+# _fleet_is_job <job> -> 0 if this job was claimed from the coordinator.
+_fleet_is_job() { [[ -n "$(_meta_val "$STATE/$1.meta" fleet_job_id)" ]]; }
+
+# fleet_detect_caps — print a JSON array of capability tokens. Tokens come from
+# AQ_FLEET_CAPS when set (comma- and/or space-separated), otherwise from
+# detect_capabilities (one token per line). Empty tokens are dropped and every
+# token is passed through _json_escape.
+fleet_detect_caps() {
+  local tokens tok json=""
+  if [[ -z "$AQ_FLEET_CAPS" ]]; then
+    tokens=$(detect_capabilities)
+  else
+    tokens=$(printf '%s' "$AQ_FLEET_CAPS" | tr ', ' '\n\n')
+  fi
+  while IFS= read -r tok; do
+    [[ -z "$tok" ]] && continue
+    # ${json:+,} inserts the separator only once the list is non-empty.
+    json+="${json:+,}\"$(_json_escape "$tok")\""
+  done <<< "$tokens"
+  printf '[%s]' "$json"
+}
+
+# ── Heartbeat (registration == first heartbeat) ─────────────────────
+# fleet_heartbeat — POST this factory's id, capabilities, health ("ok") and load
+# (active_workers, 0 if that fails) to /fleet/factories/heartbeat.
+# Returns 0 on a 2xx (and stamps AQ_FLEET_HB_TS), 1 otherwise; no-op (0) when
+# fleet is off.
+fleet_heartbeat() {
+  fleet_enabled || return 0
+  local caps load fid payload
+  caps=$(fleet_detect_caps)
+  load=$(active_workers 2>/dev/null || echo 0)
+  fid=$(_json_escape "$AQ_FACTORY_ID")
+  printf -v payload '{"factoryId":"%s","capabilities":%s,"health":"ok","load":%s}' \
+    "$fid" "$caps" "${load:-0}"
+  _fleet_call POST "/fleet/factories/heartbeat" "$payload"
+  if [[ "$FLEET_CODE" == 2* ]]; then
+    AQ_FLEET_HB_TS=$(date +%s)
+    return 0
+  fi
+  err "fleet: heartbeat failed (HTTP ${FLEET_CODE:-error}) — running degraded"
+  return 1
+}
+
+# fleet_heartbeat_maybe — heartbeat only when the cadence interval has elapsed.
+fleet_heartbeat_maybe() {
+  fleet_enabled || return 0
+  local now; now=$(date +%s)
+  [[ $(( now - ${AQ_FLEET_HB_TS:-0} )) -ge "${AQ_FLEET_LEASE_RENEW_SEC:-300}" ]] && fleet_heartbeat
+  return 0
+}
+
+# ── §M0 RU gate helpers ─────────────────────────────────────────────
+# fleet_gate_enabled — true iff the cheap-poll gate is switched on.
+fleet_gate_enabled() { fleet_enabled && [[ "${AQ_FLEET_GATE:-0}" == 1 ]]; }
+
+# fleet_queue_version — GET /fleet/queue-state and print the integer "version"
+# from the response. Returns non-zero on a non-2xx reply or when no integer
+# version is present, so callers can fail open.
+fleet_queue_version() {
+  _fleet_call GET "/fleet/queue-state"
+  [[ "$FLEET_CODE" == 2* ]] || return 1
+  _fleet_json_num version <<< "$FLEET_BODY"
+}
+
+# fleet_gate_should_claim — decide whether this tick should run the (expensive)
+# claim: returns 0 = claim, 1 = skip. Does not modify gate state.
+# Claims whenever anything is uncertain (fail open): gate off, still draining,
+# safety interval elapsed, version unreadable/empty, or version changed.
+fleet_gate_should_claim() {
+  if ! fleet_gate_enabled; then return 0; fi                 # gate off -> always claim
+  if [[ "${AQ_FLEET_GATE_DRAINING:-1}" == 1 ]]; then return 0; fi   # mid-drain
+  local now backstop="${AQ_FLEET_GATE_SAFETY_SEC:-0}"
+  now=$(date +%s)
+  if [[ "$backstop" -gt 0 ]]; then
+    # Periodic safety backstop: force a claim once the interval has elapsed.
+    if [[ $(( now - ${AQ_FLEET_GATE_TS:-0} )) -ge "$backstop" ]]; then
+      return 0
+    fi
+  fi
+  local current
+  current=$(fleet_queue_version) || return 0                # read failed -> fail open
+  if [[ -z "$current" || "$current" != "${AQ_FLEET_GATE_SEEN:-}" ]]; then
+    return 0                                                # unknown or changed -> claim
+  fi
+  return 1                                                  # unchanged + within backstop
+}
+
+# fleet_gate_note_claim <claim_rc> — fold the outcome of a claim into gate state.
+#   rc 0 (a job was claimed)        -> keep draining; more work may be waiting.
+#   any other rc (2 empty / 1 error) -> arm the gate: stop draining, stamp the
+#   time, and remember the current queue version (when it can be read) so later
+#   ticks skip until that version changes. No-op when the gate is off.
+fleet_gate_note_claim() {
+  fleet_gate_enabled || return 0
+  local rc=${1:-1} seen
+  if [[ "$rc" == 0 ]]; then
+    AQ_FLEET_GATE_DRAINING=1
+    return 0
+  fi
+  AQ_FLEET_GATE_DRAINING=0
+  AQ_FLEET_GATE_TS=$(date +%s)
+  if seen=$(fleet_queue_version) && [[ -n "$seen" ]]; then
+    AQ_FLEET_GATE_SEEN="$seen"
+  fi
+  return 0
+}
+
+# ── Claim — pull one job and materialize it as a local inbox .md ────
+# Returns 0 = claimed + materialized, 2 = nothing claimable, 1 = API error.
+fleet_claim() {
+  fleet_enabled || return 2
+  local caps body; caps=$(fleet_detect_caps)
+  body="{\"factoryId\":\"$(_json_escape "$AQ_FACTORY_ID")\",\"capabilities\":$caps,\"leaseSeconds\":${AQ_FLEET_LEASE_SECONDS:-900}}"
+  _fleet_call POST "/fleet/claim" "$body"
+  case "$FLEET_CODE" in 2*) :;; *) err "fleet: claim failed (HTTP ${FLEET_CODE:-error})"; return 1;; esac
+  printf '%s' "$FLEET_BODY" | grep -q '"claimed"[[:space:]]*:[[:space:]]*true' || return 2
+
+  local jid body_md epoch repo base_branch verify automerge="" engine_pick
+  jid=$(printf '%s' "$FLEET_BODY" | _json_str id)
+  body_md=$(printf '%s' "$FLEET_BODY" | _json_str bodyMd)
+  epoch=$(printf '%s' "$FLEET_BODY" | _fleet_json_num leaseEpoch)
+  repo=$(printf '%s' "$FLEET_BODY" | _json_str repo)
+  base_branch=$(printf '%s' "$FLEET_BODY" | _json_str baseBranch)
+  verify=$(printf '%s' "$FLEET_BODY" | _json_str verify)
+  printf '%s' "$FLEET_BODY" | grep -q '"autoMerge"[[:space:]]*:[[:space:]]*true' && automerge=true
+  # Concrete engine the submitter picked (job.engine wins over engineClass via
+  # resolve_engine). Only honor a KNOWN engine — never the run's 'unknown'/class
+  # placeholder — so an engineless job still falls back to the factory default.
+  engine_pick=$(printf '%s' "$FLEET_BODY" | _json_str engine)
+  case "$engine_pick" in devin | claude | codex | copilot) ;; *) engine_pick="" ;; esac
+  [[ -n "$jid" ]] || { err "fleet: claim returned no job id"; return 1; }
+
+  # Materialize a transient local job .md (same approach as from-tracker) so the
+  # existing runner executes a coordinator job unchanged. fleet-job-id +
+  # fleet-lease-epoch travel in frontmatter -> the job meta (see cmd_run).
+  local safe tmpdir tmp
+  safe=$(printf '%s' "$jid" | tr -c 'A-Za-z0-9._-' '_')
+  tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/aq-fleet.XXXXXX")
+  tmp="$tmpdir/fleet-$safe.md"
+  {
+    echo "---"
+    echo "cwd: $AQ_FLEET_CWD"
+    echo "yolo: true"
+    echo "fleet-job-id: $jid"
+    echo "fleet-lease-epoch: ${epoch:-0}"
+    [[ -n "$engine_pick" ]] && echo "engine: $engine_pick"
+    [[ -n "$repo" ]] && echo "fleet-repo: $repo"
+    [[ -n "$base_branch" ]] && echo "fleet-base-branch: $base_branch"
+    # Per-repo verify command (drives the existing verify gate) + auto-merge flag.
+    [[ -n "$verify" ]] && echo "verify: $verify"
+    [[ -n "$automerge" ]] && echo "fleet-automerge: true"
+    echo "idempotency-key: fleet-$jid"
+    echo "---"
+    echo
+    printf '%s\n' "$body_md"
+  } > "$tmp"
+  cmd_add "$tmp" >/dev/null 2>&1
+  rm -rf "$tmpdir"
+  log "fleet: claimed job $C_BOLD$jid$C_RESET (leaseEpoch=${epoch:-0})"
+  return 0
+}
+
+# ── Report a fenced stage transition ────────────────────────────────
+# fleet_report <job> <stage> [with-checkpoint] -> 0 ok, 2 FENCED (stale epoch:
+# caller must self-abort), 1 degraded (coordinator unreachable: continue locally).
+#   job              local job name; its meta file is $STATE/<job>.meta.
+#   stage            coordinator stage to report.
+#   with-checkpoint  any non-empty value attaches wip_branch/wip_commit from the meta.
+# No-op (0) when fleet is off or the job has no fleet_job_id. Each outcome is also
+# appended to the meta file (fleet_reported= / fleet_fenced= / fleet_degraded=).
+fleet_report() {
+  fleet_enabled || return 0
+  local job=$1 stage=$2 with_ckpt=${3:-} metaf jid epoch
+  metaf="$STATE/$job.meta"
+  jid=$(_meta_val "$metaf" fleet_job_id); epoch=$(_meta_val "$metaf" fleet_lease_epoch)
+  [[ -n "$jid" ]] || return 0
+  local ckpt=""
+  if [[ -n "$with_ckpt" ]]; then
+    local wb wc; wb=$(_meta_val "$metaf" wip_branch); wc=$(_meta_val "$metaf" wip_commit)
+    if [[ -n "$wb" ]]; then
+      ckpt=",\"checkpoint\":{\"wipBranch\":\"$(_json_escape "$wb")\""
+      [[ -n "$wc" ]] && ckpt+=",\"wipCommit\":\"$(_json_escape "$wc")\""
+      ckpt+="}"
+    fi
+  fi
+  # payload carries ONLY {stage, leaseEpoch, checkpoint} — never bodyMd/prompt/token.
+  # stage is JSON-escaped like every other string field (matches fleet_shadow_report),
+  # so an unexpected character can never produce a malformed payload.
+  _fleet_call PATCH "/fleet/jobs/$jid" "{\"stage\":\"$(_json_escape "$stage")\",\"leaseEpoch\":${epoch:-0}$ckpt}"
+  case "$FLEET_CODE" in
+    2*)      echo "fleet_reported=$stage" >> "$metaf"; return 0;;
+    409|412) err "fleet: FENCED reporting stage=$stage (stale leaseEpoch=$epoch) — self-aborting $job"
+             echo "fleet_fenced=1" >> "$metaf"; return 2;;
+    *)       err "fleet: report stage=$stage failed (HTTP ${FLEET_CODE:-error}) — offline-degrade, continuing locally"
+             echo "fleet_degraded=1" >> "$metaf"; return 1;;
+  esac
+}
+
+# fleet_lease_renew <job> -> extend the lease; 0 ok, 2 fenced, 1 degraded.
+# A renewal lost to a transient blip (timeout / 5xx / proxy) would let the lease
+# expire and the coordinator reclaim a job that is still running, wasting the work.
+# So retry a TRANSIENT failure a few times with a short backoff (well within the
+# lease window); a 409/412 FENCE is terminal and never retried. Tunables:
+#   AQ_FLEET_RENEW_RETRIES (default 2 extra attempts), AQ_FLEET_RENEW_BACKOFF_SEC (2).
+# Both tunables are validated; a malformed value falls back to its default. An
+# invalid backoff would otherwise make `sleep` fail at once and turn the retries
+# into back-to-back requests with no pause at all.
+# No-op (0) when fleet is off or the job has no fleet_job_id.
+fleet_lease_renew() {
+  fleet_enabled || return 0
+  local job=$1 metaf jid epoch
+  metaf="$STATE/$job.meta"
+  jid=$(_meta_val "$metaf" fleet_job_id); epoch=$(_meta_val "$metaf" fleet_lease_epoch)
+  [[ -n "$jid" ]] || return 0
+  local retries="${AQ_FLEET_RENEW_RETRIES:-2}" backoff="${AQ_FLEET_RENEW_BACKOFF_SEC:-2}" i=0
+  [[ "$retries" =~ ^[0-9]+$ ]] || retries=2
+  [[ "$backoff" =~ ^[0-9]+(\.[0-9]+)?$ ]] || backoff=2
+  while :; do
+    _fleet_call POST "/fleet/jobs/$jid/lease/renew" "{\"leaseEpoch\":${epoch:-0},\"leaseSeconds\":${AQ_FLEET_LEASE_SECONDS:-900}}"
+    case "$FLEET_CODE" in
+      2*) return 0;;
+      409|412) echo "fleet_fenced=1" >> "$metaf"; return 2;;
+    esac
+    # transient: retry up to $retries extra times before giving up degraded
+    [[ "$i" -ge "$retries" ]] && return 1
+    i=$((i + 1))
+    sleep "$backoff"
+  done
+}
+
+# fleet_lease_release <job> [stage] -> best-effort release on a terminal stage.
+#   job    local job name; fleet_job_id / fleet_lease_epoch come from its meta file.
+#   stage  optional stage to send along with the release.
+# The coordinator response is ignored and the function always returns 0. No-op
+# when fleet is off or the job has no fleet_job_id.
+fleet_lease_release() {
+  fleet_enabled || return 0
+  local job=$1 stage=${2:-} metaf jid epoch body
+  metaf="$STATE/$job.meta"
+  jid=$(_meta_val "$metaf" fleet_job_id); epoch=$(_meta_val "$metaf" fleet_lease_epoch)
+  [[ -n "$jid" ]] || return 0
+  body="{\"leaseEpoch\":${epoch:-0}"
+  # stage is JSON-escaped like the other string fields this client sends.
+  [[ -n "$stage" ]] && body+=",\"stage\":\"$(_json_escape "$stage")\""
+  body+="}"
+  _fleet_call POST "/fleet/jobs/$jid/lease/release" "$body"
+  return 0
+}
+
+# fleet_report_insights <job> [result] — report the run's cost/token/effort metrics
+# (parsed by parse_usage into the job meta) to the coordinator, recorded on the
+# current run. Also releases the held lease (the agent has finished its work unit).
+# Best-effort: never blocks the loop. Engines that don't expose usage locally
+# (e.g. devin) simply omit token/cost fields; `result` + endedAt still land.
+fleet_report_insights() {
+  fleet_enabled || return 0
+  local job=$1 result=${2:-} metaf jid epoch
+  metaf="$STATE/$job.meta"
+  jid=$(_meta_val "$metaf" fleet_job_id); epoch=$(_meta_val "$metaf" fleet_lease_epoch)
+  [[ -n "$jid" ]] || return 0
+  local model engine session ti to tc cost turns tools est ins=""
+  model=$(_meta_val "$metaf" model)
+  engine=$(_meta_val "$metaf" engine);    session=$(_meta_val "$metaf" session_id)
+  ti=$(_meta_val "$metaf" tokens_in);     to=$(_meta_val "$metaf" tokens_out)
+  tc=$(_meta_val "$metaf" tokens_cached); cost=$(_meta_val "$metaf" cost_usd)
+  turns=$(_meta_val "$metaf" turns);      tools=$(_meta_val "$metaf" tool_calls)
+  est=$(_meta_val "$metaf" usage_estimated)
+  [[ -n "$model" ]] && ins+=",\"model\":\"$(_json_escape "$model")\""
+  [[ -n "$engine" ]] && ins+=",\"engine\":\"$(_json_escape "$engine")\""
+  [[ -n "$session" ]] && ins+=",\"sessionId\":\"$(_json_escape "$session")\""
+  [[ "$ti"   =~ ^[0-9]+$ ]] && ins+=",\"tokensIn\":$ti"
+  [[ "$to"   =~ ^[0-9]+$ ]] && ins+=",\"tokensOut\":$to"
+  [[ "$tc"   =~ ^[0-9]+$ ]] && ins+=",\"tokensCached\":$tc"
+  [[ "$cost" =~ ^[0-9]+(\.[0-9]+)?$ ]] && ins+=",\"costUsd\":$cost"
+  [[ "$turns" =~ ^[0-9]+$ ]] && ins+=",\"turns\":$turns"
+  [[ "$tools" =~ ^[0-9]+$ ]] && ins+=",\"toolCalls\":$tools"
+  [[ "$est" == "true" || "$est" == "1" ]] && ins+=",\"estimated\":true"
+  local pr_url pr_branch pr_state
+  pr_url=$(_meta_val "$metaf" pr_url); pr_branch=$(_meta_val "$metaf" pr_branch)
+  pr_state=$(_meta_val "$metaf" pr_state)
+  local body="{\"leaseEpoch\":${epoch:-0}"
+  [[ -n "$ins" ]] && body+=",\"insights\":{${ins#,}}"
+  [[ -n "$result" ]] && body+=",\"result\":\"$(_json_escape "$result")\""
+  [[ -n "$pr_url" ]] && body+=",\"prUrl\":\"$(_json_escape "$pr_url")\""
+  [[ -n "$pr_branch" ]] && body+=",\"branch\":\"$(_json_escape "$pr_branch")\""
+  [[ -n "$pr_state" ]] && body+=",\"prState\":\"$(_json_escape "$pr_state")\""
+  body+="}"
+  _fleet_call POST "/fleet/jobs/$jid/lease/release" "$body"
+  return 0
+}
+
+# fleet_renew_active — renew leases for all in-flight (building/) fleet jobs.
+fleet_renew_active() {
+  fleet_enabled || return 0
+  local f job
+  for f in "$BUILDING"/*.md; do
+    [[ -e "$f" ]] || continue
+    job=$(basename "$f"); job=${job%.md}
+    _fleet_is_job "$job" && { fleet_lease_renew "$job" >/dev/null 2>&1 || true; }
+  done
+  return 0
+}
+
+# fleet_release_all_active — best-effort release leases for all in-flight (building/)
+# fleet jobs, e.g. on daemon shutdown, so the coordinator can reclaim them
+# immediately instead of waiting out the lease TTL (~900s). Never blocks shutdown;
+# no-op when fleet is disabled. Mirrors fleet_renew_active (release vs renew).
+fleet_release_all_active() {
+  fleet_enabled || return 0
+  local f job
+  for f in "$BUILDING"/*.md; do
+    [[ -e "$f" ]] || continue
+    job=$(basename "$f"); job=${job%.md}
+    _fleet_is_job "$job" && { fleet_lease_release "$job" >/dev/null 2>&1 || true; }
+  done
+  return 0
+}
+
+# fleet_quarantine <job> <file> <metaf> <logf> — a fenced (reclaimed) worker must
+# NOT ship (§9 split-brain). Appends an explanation to <logf>, moves <file> into
+# $FAILED (if it still exists), records the quarantine in <metaf> and reports it
+# through err for human triage.
+fleet_quarantine() {
+  local job=$1 file=$2 metaf=$3 logf=$4
+  printf '%s\n' \
+    "FLEET FENCED — the coordinator reclaimed this job (stale leaseEpoch)." \
+    "Quarantining the local result — NOT shipping/merging. Needs human triage. ($(date))" \
+    >> "$logf"
+  if [[ -e "$file" ]]; then
+    mv "$file" "$FAILED/" 2>/dev/null
+  fi
+  printf '%s\n' "result=fenced_quarantine" "fleet_quarantined=1" "ended=$(date +%s)" >> "$metaf"
+  err "fleet: quarantined $job (fenced/reclaimed) — surfaced for human triage"
+}
+
+# _fleet_stage_for <result> — map a local job result/stage to a coordinator stage.
+# shipped/testing/review map to themselves, the listed failure results map to
+# "failed", and anything else maps to "building".
+_fleet_stage_for() {
+  local stage=building
+  case "$1" in
+    shipped | testing | review)
+      stage=$1 ;;
+    failed | timeout | verify_failed | retries_exhausted | capability_mismatch | no_engine | rejected)
+      stage=failed ;;
+  esac
+  echo "$stage"
+}
+
+# ── Slice 4: shadow / dual-run (strictly side-effect-free on real job state) ──
+
+# _fleet_shadow_log -> path to the structured shadow-divergence log.
+_fleet_shadow_log() { printf '%s\n' "${AQ_FLEET_SHADOW_LOG:-$STATE/fleet-shadow.log}"; }
+
+# fleet_shadow_claim — ask the coordinator what it WOULD assign for this factory's
+# capabilities, without touching real job state. Safeguards, as implemented here:
+#   * the request uses the ISOLATED shadow factory id (AQ_FLEET_SHADOW_FACTORY_ID),
+#     never the real one;
+#   * the request carries "dryRun":true,"shadow":true so a coordinator that honors
+#     those flags only returns the would-be job;
+#   * if a job id comes back, a lease release is sent straight away so a coordinator
+#     without dry-run support keeps no real assignment;
+#   * the would-be job is NEVER materialized / run / reported / shipped locally.
+# Sets SHADOW_COORD_JOB to the would-be job id ("" = none). Best-effort: a failed
+# claim is logged as SHADOW_ERROR and swallowed. Always returns 0.
+fleet_shadow_claim() {
+  SHADOW_COORD_JOB=""
+  fleet_shadow_enabled || return 0
+  local caps body
+  caps=$(fleet_detect_caps)
+  body="{\"factoryId\":\"$(_json_escape "$AQ_FLEET_SHADOW_FACTORY_ID")\",\"capabilities\":$caps,\"leaseSeconds\":${AQ_FLEET_LEASE_SECONDS:-900},\"dryRun\":true,\"shadow\":true}"
+  _fleet_call POST "/fleet/claim" "$body"
+  if [[ "$FLEET_CODE" != 2* ]]; then
+    printf '%s\t%s\t%s\t%s\n' "$(date +%s)" "<none>" "<none>" "SHADOW_ERROR(claim:HTTP_${FLEET_CODE:-error})" \
+      >> "$(_fleet_shadow_log)" 2>/dev/null || true
+    return 0
+  fi
+  if ! printf '%s' "$FLEET_BODY" | grep -q '"claimed"[[:space:]]*:[[:space:]]*true'; then
+    return 0
+  fi
+  local jid epoch
+  jid=$(printf '%s' "$FLEET_BODY" | _json_str id)
+  epoch=$(printf '%s' "$FLEET_BODY" | _fleet_json_num leaseEpoch)
+  SHADOW_COORD_JOB="$jid"
+  # Undo any REAL lease the coordinator may have created (no dry-run support) so
+  # the shadow probe leaves zero residue. Best-effort, response ignored.
+  [[ -z "$jid" ]] && return 0
+  _fleet_call POST "/fleet/jobs/$jid/lease/release" "{\"leaseEpoch\":${epoch:-0},\"shadow\":true}" >/dev/null 2>&1 || true
+  return 0
+}
+
+# fleet_shadow_compare <localJobId> <coordJobId> — classify the local
+# (authoritative) pick against the coordinator's would-be pick and append one
+# line (ts<TAB>localJob<TAB>coordJob<TAB>verdict) to the shadow log. Verdicts:
+# AGREE | DIVERGE | COORD_EMPTY | LOCAL_EMPTY. Nothing is logged when both ids
+# are empty. No-op unless shadow mode is active; always returns 0.
+fleet_shadow_compare() {
+  fleet_shadow_enabled || return 0
+  local lj=${1:-} cj=${2:-} verdict
+  # Encode which side is present: "" none, L local only, C coordinator only, LC both.
+  case "${lj:+L}${cj:+C}" in
+    "") return 0 ;;
+    L)  verdict=COORD_EMPTY ;;
+    C)  verdict=LOCAL_EMPTY ;;
+    LC) if [[ "$lj" == "$cj" ]]; then verdict=AGREE; else verdict=DIVERGE; fi ;;
+  esac
+  printf '%s\t%s\t%s\t%s\n' "$(date +%s)" "${lj:-<none>}" "${cj:-<none>}" "$verdict" \
+    >> "$(_fleet_shadow_log)" 2>/dev/null || true
+  return 0
+}
+
+# fleet_shadow_report <localJobId> <coordJobId> [stage] — mirror a stage
+# transition (default stage: building) to the coordinator as a SHADOW event
+# ("shadow":true,"dryRun":true) so the report path is exercised. The response is
+# never acted on: a non-2xx reply is only appended to the shadow log as
+# SHADOW_REPORT_DIVERGE. Targets the would-be coordinator job id and does nothing
+# when there is none. Always returns 0.
+fleet_shadow_report() {
+  fleet_shadow_enabled || return 0
+  local lj=${1:-} cj=${2:-} stage=${3:-building}
+  if [[ -z "$cj" ]]; then
+    return 0
+  fi
+  _fleet_call PATCH "/fleet/jobs/$cj" "{\"stage\":\"$(_json_escape "$stage")\",\"shadow\":true,\"dryRun\":true}"
+  if [[ "${FLEET_CODE:-}" != 2* ]]; then
+    printf '%s\t%s\t%s\t%s\n' "$(date +%s)" "${lj:-<none>}" "${cj:-<none>}" "SHADOW_REPORT_DIVERGE(HTTP_${FLEET_CODE:-error})" \
+      >> "$(_fleet_shadow_log)" 2>/dev/null || true
+  fi
+  return 0
+}
+
+# fleet-shadow-report — summarize the shadow log: per-verdict counts, agreement
+# rate, and the last N divergences. Read-only; safe regardless of the flags.
+cmd_fleet_shadow_report() {
+  ensure_dirs
+  local n=10 logf; logf=$(_fleet_shadow_log)
+  [[ "${1:-}" =~ ^[0-9]+$ ]] && n=$1
+  if [[ ! -s "$logf" ]]; then
+    log "fleet shadow: no shadow log yet ($logf)."
+    log "fleet shadow: run with AQ_FLEET=1 AQ_FLEET_ROUTE=0 AQ_FLEET_SHADOW=1 to record divergence."
+    return 0
+  fi
+  log "fleet shadow report ($logf):"
+  awk -F'\t' '
+    { v=$4; sub(/\(.*/, "", v); c[v]++; tot++
+      if (v=="AGREE") ag++
+      if (v=="AGREE"||v=="DIVERGE"||v=="COORD_EMPTY"||v=="LOCAL_EMPTY") dec++ }
+    END {
+      split("AGREE DIVERGE COORD_EMPTY LOCAL_EMPTY SHADOW_ERROR SHADOW_REPORT_DIVERGE", ord, " ")
+      for (i=1; i<=6; i++) printf "  %-22s %d\n", ord[i], c[ord[i]]+0
+      printf "  %-22s %d\n", "TOTAL", tot+0
+      if (dec>0) printf "  %-22s %d%%\n", "AGREEMENT", int(100*ag/dec)
+    }' "$logf"
+  log "last $n divergence/error events:"
+  grep -E "$(printf '\t')(DIVERGE|COORD_EMPTY|LOCAL_EMPTY|SHADOW_ERROR|SHADOW_REPORT_DIVERGE)" "$logf" 2>/dev/null \
+    | tail -n "$n" \
+    | awk -F'\t' '{ printf "  ts=%s local=%s coord=%s verdict=%s\n", $1, $2, $3, $4 }' || true
+  return 0
+}
+
+# fleet-status — heartbeat (register) + print this factory's identity/caps + flags.
+cmd_fleet_status() {
+  ensure_dirs
+  if ! fleet_enabled; then
+    log "fleet: AQ_FLEET is off — running in offline git-queue mode (no coordinator)."
+    return 0
+  fi
+  log "fleet: factory=$C_BOLD$AQ_FACTORY_ID$C_RESET api=$AQ_FLEET_API"
+  log "fleet: flags=$(fleet_flags_state)"
+  fleet_flags_warn_once
+  log "fleet: capabilities=$(fleet_detect_caps)"
+  if fleet_shadow_enabled; then
+    log "fleet: SHADOW/dual-run mode — local inbox is authoritative; coordinator queried for comparison only (never acted on)."
+  elif ! fleet_route_enabled; then
+    log "fleet: ROUTE off — local inbox is authoritative; coordinator not used to source work."
+  fi
+  if fleet_heartbeat; then
+    log "fleet: heartbeat OK (registered)."
+  else
+    err "fleet: coordinator unreachable — would run in offline-degrade mode."
+  fi
+}
--- a/agent-queue/lib/fleet-dash.mjs
+++ b/agent-queue/lib/fleet-dash.mjs
@ -0,0 +1,218 @@
+// fleet-dash.mjs — read/act adapter that re-points the agent-queue TUI dashboard
+// at the platform-service `/fleet` REST API (roadmap Phase 3: "TUI dashboard
+// re-pointed at /fleet API (parity)").
+//
+// This module is intentionally pure-ish and dependency-injectable (the HTTP
+// `fetchImpl` is a parameter) so it is unit-testable WITHOUT a live service.
+// dashboard.mjs uses it ONLY when AQ_FLEET_DASH=1; otherwise the dashboard's
+// local-queue behavior is byte-for-byte unchanged.
+//
+// Auth + scoping mirror agent-queue/lib/fleet-client.sh:
+//   base URL  AQ_FLEET_API   (already includes /api)
+//   bearer    AQ_FLEET_TOKEN
+//   product   X-Product-Id: AQ_PRODUCT_ID  (sent on every request)
+
+// Per-request timeout in milliseconds; fleetFetch falls back to this when the
+// caller does not pass opts.timeoutMs.
+const DEFAULT_TIMEOUT_MS = 8000;
+
+// fleetConfig(env) — derive the dashboard's fleet settings from environment
+// variables. Returns { enabled, api, token, productId, ok, missing }:
+//   enabled  true only when AQ_FLEET_DASH is exactly '1' (explicit opt-in)
+//   api      AQ_FLEET_API with any trailing slashes removed
+//   missing  names of required variables that are empty (only when enabled)
+//   ok       enabled and nothing missing
+// Reporting `missing` lets the dashboard fail visibly rather than do nothing.
+export function fleetConfig(env = process.env) {
+  const read = (key) => String(env[key] || '');
+
+  const enabled = read('AQ_FLEET_DASH') === '1';
+  const api = read('AQ_FLEET_API').replace(/\/+$/, '');
+  const token = read('AQ_FLEET_TOKEN');
+  const productId = read('AQ_PRODUCT_ID');
+
+  // Completeness is only checked in fleet mode; a disabled config lists nothing.
+  const required = [
+    ['AQ_FLEET_API', api],
+    ['AQ_FLEET_TOKEN', token],
+    ['AQ_PRODUCT_ID', productId],
+  ];
+  const missing = enabled
+    ? required.filter(([, value]) => !value).map(([name]) => name)
+    : [];
+
+  return { enabled, api, token, productId, ok: enabled && missing.length === 0, missing };
+}
+
+// ── stage mapping ───────────────────────────────────────────────────────────
+// Fleet stages collapse onto the local board's kanban buckets so the dashboard's
+// existing layout/gating/STAGE_TAG logic can be reused unchanged.
+const STAGE_BUCKET = {
+  queued: 'inbox',
+  assigned: 'building',
+  building: 'building',
+  review: 'review',
+  testing: 'testing',
+  shipped: 'shipped',
+  failed: 'failed',
+  dead_letter: 'failed',
+};
+// mapStage(s) — bucket name for fleet stage `s`; any stage not listed above
+// (including undefined) falls back to 'inbox'. Only OWN keys of STAGE_BUCKET
+// count: a plain `STAGE_BUCKET[s]` lookup would also resolve inherited members
+// such as 'constructor' or 'toString' and return a function instead of a bucket.
+export const mapStage = (s) => (
+  Object.prototype.hasOwnProperty.call(STAGE_BUCKET, s) ? STAGE_BUCKET[s] : 'inbox'
+);
+
+// Stages an operator can act on from the dashboard (parity with the local
+// ACTION_STAGES = review · testing · failed · inbox).
+const ACTIONABLE = new Set(['review', 'testing', 'failed', 'inbox']);
+// Raw fleet stages (not buckets) that put a job in the board's running list.
+const RUNNING_STAGES = new Set(['assigned', 'building']);
+// Sort rank of each actionable bucket; lower ranks come first in the item list.
+const ITEM_ORDER = { review: 0, testing: 1, failed: 2, inbox: 3 };
+
+// trackerOf(j) — first truthy tracker reference among the field spellings a job
+// may carry (trackerItemId, trackerItem, data.trackerItem); '' when none is set.
+const trackerOf = (j) => j.trackerItemId || j.trackerItem || (j.data && j.data.trackerItem) || '';
+// capsOf(j) — capabilities as display text: an array is joined with ', ', any
+// other value is stringified, and a missing/falsy value becomes ''.
+const capsOf = (j) => (Array.isArray(j.capabilities) ? j.capabilities.join(', ') : String(j.capabilities || ''));
+
+// toBoard({jobs, factories, metrics}) — convert raw API payloads into the board
+// model the dashboard draws. Performs no I/O. Returns:
+//   counts   number of jobs per kanban bucket
+//   items    actionable jobs, ordered review < testing < failed < inbox, then id
+//   running  jobs whose server stage is assigned/building
+//   recent   up to five shipped/failed jobs, newest updatedAt first
+//   factories, metrics   passed through untouched
+export function toBoard({ jobs = [], factories = [], metrics = null } = {}) {
+  const counts = { inbox: 0, building: 0, review: 0, testing: 0, shipped: 0, failed: 0 };
+  const items = [];
+  const running = [];
+  const finished = [];
+
+  // One normalized row per job. `stage` carries the bucket so the dashboard's
+  // gate()/STAGE_TAG logic applies as-is; `fleetStage` keeps the server's value.
+  const normalize = (job, bucket) => ({
+    stage: bucket,
+    fleetStage: job.stage,
+    id: job.id,
+    priority: job.priority || 'medium',
+    profile: job.profile || '',
+    capabilities: capsOf(job),
+    tracker_item: trackerOf(job),
+    leaseEpoch: job.leaseEpoch,
+    factoryId: job.factoryId || job.leaseFactoryId || '',
+    attempts: job.attempts,
+    updatedAt: job.updatedAt || job.createdAt || '',
+    raw: job,
+  });
+
+  for (const job of jobs) {
+    const bucket = mapStage(job.stage);
+    const seen = counts[bucket];
+    if (seen !== undefined) counts[bucket] = seen + 1;
+
+    const row = normalize(job, bucket);
+    // "Running" is decided on the raw server stage; the other lists use the bucket.
+    if (RUNNING_STAGES.has(job.stage)) running.push(row);
+    if (ACTIONABLE.has(bucket)) items.push(row);
+    if (bucket === 'shipped' || bucket === 'failed') finished.push(row);
+  }
+
+  const byStageThenId = (a, b) => (ITEM_ORDER[a.stage] - ITEM_ORDER[b.stage]) || cmp(a.id, b.id);
+  const newestFirst = (a, b) => cmp(String(b.updatedAt), String(a.updatedAt));
+  items.sort(byStageThenId);
+  finished.sort(newestFirst);
+
+  return { counts, items, running, recent: finished.slice(0, 5), factories, metrics };
+}
+
+// cmp(a, b) — three-way comparison using `<` / `>`: -1, 1, or 0.
+const cmp = (a, b) => {
+  if (a < b) return -1;
+  if (a > b) return 1;
+  return 0;
+};
+
+// ── HTTP ────────────────────────────────────────────────────────────────────
+// fleetFetch(cfg, pathname, opts, fetchImpl) — one request to the coordinator.
+//   cfg        { api, token, productId } (see fleetConfig)
+//   pathname   path appended verbatim to cfg.api
+//   opts       { method = 'GET', body (JSON-encoded when present), timeoutMs }
+//   fetchImpl  injectable fetch, so callers and tests can stub the network
+// Resolves to { ok, status, json } plus `error` when something went wrong.
+// NEVER throws: network errors / timeouts / non-JSON bodies come back as a
+// structured result so the TUI stays responsive. The timeout is enforced with
+// an AbortController and also covers reading the response body.
+export async function fleetFetch(cfg, pathname, opts = {}, fetchImpl = globalThis.fetch) {
+  const url = `${cfg.api}${pathname}`;
+  const headers = {
+    Authorization: `Bearer ${cfg.token}`,
+    'X-Product-Id': cfg.productId,
+    Accept: 'application/json',
+  };
+  const hasBody = opts.body !== undefined;
+  if (hasBody) headers['Content-Type'] = 'application/json';
+  const ac = new AbortController();
+  const timer = setTimeout(() => ac.abort(), opts.timeoutMs || DEFAULT_TIMEOUT_MS);
+  // An abort (our timeout) is reported as 'timeout'; anything else by its message.
+  const describe = (e) => {
+    const timedOut = e && (e.name === 'AbortError' || e.code === 'ABORT_ERR');
+    return timedOut ? 'timeout' : (e && e.message) || 'network error';
+  };
+  try {
+    const res = await fetchImpl(url, {
+      method: opts.method || 'GET',
+      headers,
+      body: hasBody ? JSON.stringify(opts.body) : undefined,
+      signal: ac.signal,
+    });
+    let json = null;
+    let text = '';
+    let bodyError;
+    // A failed body read does not change ok/status (the status line did arrive),
+    // but it is surfaced as `error` so a timeout that fires mid-body is not
+    // mistaken for a clean response that simply carried no JSON.
+    try { text = await res.text(); } catch (e) { bodyError = describe(e); }
+    if (text) { try { json = JSON.parse(text); } catch { json = null; } }
+    const result = { ok: !!res.ok, status: res.status, json };
+    if (bodyError !== undefined) result.error = bodyError;
+    return result;
+  } catch (e) {
+    return { ok: false, status: 0, json: null, error: describe(e) };
+  } finally {
+    clearTimeout(timer);
+  }
+}
+
+// fetchBoard(cfg, fetchImpl) — assemble the board model from three endpoints.
+// Resolves to { ok: true, board } or { ok: false, error }.
+// Jobs are REQUIRED (failure ⇒ board fails) and are fetched first, so a dead
+// coordinator fails fast after a single request. Metrics + factories are
+// best-effort: a missing/404/501 factories endpoint degrades to [] (it is
+// optional server-side), and absent metrics simply omits the aggregate panel.
+export async function fetchBoard(cfg, fetchImpl = globalThis.fetch) {
+  const jobsRes = await fleetFetch(cfg, '/fleet/jobs', {}, fetchImpl);
+  if (!jobsRes.ok || !jobsRes.json) {
+    return { ok: false, error: jobsRes.error || `jobs HTTP ${jobsRes.status}` };
+  }
+  const jobs = Array.isArray(jobsRes.json.jobs) ? jobsRes.json.jobs : [];
+
+  // The two optional requests are independent, so they run concurrently: a slow
+  // or hung endpoint then costs one timeout instead of two stacked ones.
+  // fleetFetch reports failures as results, so Promise.all is not expected to reject.
+  const [metricsRes, facRes] = await Promise.all([
+    fleetFetch(cfg, '/fleet/metrics', {}, fetchImpl),
+    fleetFetch(cfg, '/fleet/factories', {}, fetchImpl),
+  ]);
+  const metrics = metricsRes.ok && metricsRes.json ? metricsRes.json : null;
+  const factories = facRes.ok && facRes.json && Array.isArray(facRes.json.factories)
+    ? facRes.json.factories
+    : [];
+
+  return { ok: true, board: toBoard({ jobs, factories, metrics }) };
+}
+
+// formatEvent(e) — render one fleet event as a single TUI log line:
+// "<time>  <type>[ <actor>][ <data as JSON>]", trimmed. A missing type prints as
+// '?'; data is included only when it is a non-empty object.
+export function formatEvent(e) {
+  const when = e.at ? safeTime(e.at) : '';
+  const who = e.actor ? ` ${e.actor}` : '';
+
+  let payload = '';
+  if (e.data && typeof e.data === 'object' && Object.keys(e.data).length) {
+    payload = ` ${JSON.stringify(e.data)}`;
+  }
+
+  return [when, '  ', e.type || '?', who, payload].join('').trim();
+}
+
+// safeTime(iso) — locale time-of-day for a parseable timestamp; an unparseable
+// value is echoed back as a string instead of printing "Invalid Date".
+const safeTime = (iso) => {
+  const parsed = new Date(iso);
+  if (Number.isNaN(parsed.getTime())) return String(iso);
+  return parsed.toLocaleTimeString();
+};
+
+// fetchEvents(cfg, jobId, fetchImpl) — load a job's event stream and render it
+// as log lines. Resolves to { ok: true, lines } or { ok: false, error, lines: [] }.
+export async function fetchEvents(cfg, jobId, fetchImpl = globalThis.fetch) {
+  const path = `/fleet/jobs/${encodeURIComponent(jobId)}/events`;
+  const { ok, json, error, status } = await fleetFetch(cfg, path, {}, fetchImpl);
+
+  if (ok && json) {
+    // A response without an `events` array is treated as an empty stream.
+    const events = Array.isArray(json.events) ? json.events : [];
+    return { ok: true, lines: events.map(formatEvent) };
+  }
+  return { ok: false, error: error || `events HTTP ${status}`, lines: [] };
+}
+
+// Operator verbs the dashboard supports in fleet mode. `promote` has no safe
+// server contract (client-inferred stage transitions could violate workflow
+// invariants), so it is explicitly unavailable here.
+const FLEET_VERBS = new Set(['ship', 'requeue', 'reject']);
+
+// jobAction(cfg, item, verb, fetchImpl) — run an operator verb on `item.id`.
+//  ship    → GET the job again to obtain a FRESH leaseEpoch, then PATCH
+//            stage=shipped with it (a stale snapshot epoch is fenced with 409).
+//  requeue → POST /actions/requeue   (lease-free operator action)
+//  reject  → POST /actions/reject
+// Resolves to {ok, message}; failures are reported in the result, not thrown.
+export async function jobAction(cfg, item, verb, fetchImpl = globalThis.fetch) {
+  const fail = (message) => ({ ok: false, message });
+
+  if (verb === 'promote') return fail('promote is not available in fleet mode');
+  if (!FLEET_VERBS.has(verb)) return fail(`${verb} not supported in fleet mode`);
+  const id = item && item.id;
+  if (!id) return fail('no job selected');
+
+  const jobPath = `/fleet/jobs/${encodeURIComponent(id)}`;
+
+  // requeue / reject: a single lease-free POST to the matching action endpoint.
+  if (verb !== 'ship') {
+    const acted = await fleetFetch(
+      cfg,
+      `${jobPath}/actions/${verb}`,
+      { method: 'POST', body: {} },
+      fetchImpl,
+    );
+    if (acted.status === 409) return fail('job conflict/terminal — refresh and retry');
+    if (!acted.ok) return fail(acted.error || `${verb} HTTP ${acted.status}`);
+    return { ok: true, message: `${verb} ${id}` };
+  }
+
+  // ship: read the current lease epoch first so the PATCH is not fenced.
+  const cur = await fleetFetch(cfg, jobPath, {}, fetchImpl);
+  if (!cur.ok || !cur.json) return fail(cur.error || `job HTTP ${cur.status}`);
+
+  const shipped = await fleetFetch(
+    cfg,
+    jobPath,
+    { method: 'PATCH', body: { stage: 'shipped', leaseEpoch: cur.json.leaseEpoch } },
+    fetchImpl,
+  );
+  if (shipped.status === 409) return fail('job changed (fenced) — refresh and retry');
+  if (!shipped.ok) return fail(shipped.error || `ship HTTP ${shipped.status}`);
+  return { ok: true, message: `shipped ${id}` };
+}
--- a/agent-queue/lib/fleet-dash.test.mjs
+++ b/agent-queue/lib/fleet-dash.test.mjs
@ -0,0 +1,287 @@
+// fleet-dash.test.mjs — dependency-light unit tests for the fleet-mode dashboard
+// adapter. Uses node:assert only (no test framework), matching the repo style.
+// Run: `node fleet-dash.test.mjs`  (also wired into selftest.sh).
+//
+// These tests prove the dashboard's CONTRACT ASSUMPTIONS against the /fleet API
+// (request shaping, response mapping, graceful degradation, action semantics)
+// via an injected fetch stub. They do NOT prove live server compatibility.
+
+import assert from 'node:assert/strict';
+import {
+  fleetConfig,
+  mapStage,
+  toBoard,
+  fleetFetch,
+  fetchBoard,
+  fetchEvents,
+  formatEvent,
+  jobAction,
+} from './fleet-dash.mjs';
+
+// Number of test cases that completed without throwing/rejecting.
+let passed = 0;
+// t(name, fn) — run one test case. A sync `fn` is counted immediately and t
+// returns undefined; when `fn` returns a thenable, t returns a promise that
+// settles after the case has been counted. A failure prints the case name and
+// message and sets process.exitCode = 1 without aborting the remaining cases.
+const t = (name, fn) => {
+  const pass = () => { passed += 1; };
+  const fail = (e) => {
+    console.error(`  ✗ ${name}\n    ${e && e.message}`);
+    process.exitCode = 1;
+  };
+
+  try {
+    const outcome = fn();
+    const isThenable = Boolean(outcome) && typeof outcome.then === 'function';
+    if (isThenable) return outcome.then(pass, fail);
+    pass();
+  } catch (e) {
+    fail(e);
+  }
+  return undefined;
+};
+
+// makeFetch(routes) — a recording fetch stub. `routes` maps a full URL, or a URL
+// suffix, to {status, body} / {throw, name}, or to a function returning one.
+// Every call is appended to `fetchImpl.calls`; an unmatched URL answers 404 '{}'.
+function makeFetch(routes) {
+  const calls = [];
+
+  // An exact URL match wins; otherwise the first route key the URL ends with.
+  const lookup = (url) => {
+    const exact = routes[url];
+    if (exact) return exact;
+    const suffix = Object.keys(routes).find((k) => url.endsWith(k));
+    return suffix ? routes[suffix] : undefined;
+  };
+
+  async function fetchImpl(url, opts = {}) {
+    calls.push({ url, opts, headers: opts.headers || {}, method: opts.method || 'GET' });
+    const route = lookup(url);
+    const entry = typeof route === 'function' ? route({ url, opts }) : route;
+    if (entry === undefined) return mkRes(404, '{}');
+    if (entry.throw) {
+      const failure = new Error(entry.throw);
+      failure.name = entry.name || 'Error';
+      throw failure;
+    }
+    return mkRes(entry.status ?? 200, entry.body ?? '');
+  }
+
+  fetchImpl.calls = calls;
+  return fetchImpl;
+}
+// mkRes(status, body) — minimal Response stand-in exposing ok/status/text().
+// A non-string body is JSON-encoded when text() is called.
+const mkRes = (status, body) => {
+  const ok = status >= 200 && status < 300;
+  return {
+    ok,
+    status,
+    async text() {
+      return typeof body === 'string' ? body : JSON.stringify(body);
+    },
+  };
+};
+
+// Complete fleet config shared by every request-level test below.
+const CFG = { enabled: true, ok: true, api: 'http://svc/api', token: 'tok', productId: 'prodX', missing: [] };
+
+await (async () => {
+  // ── fleetConfig ──
+  // Fleet mode is strictly opt-in: with no AQ_FLEET_DASH the config is neither
+  // enabled nor ok.
+  t('fleetConfig: AQ_FLEET_DASH unset ⇒ disabled', () => {
+    const c = fleetConfig({});
+    assert.equal(c.enabled, false);
+    assert.equal(c.ok, false);
+  });
+  // Opting in without api/token/product must name every absent variable.
+  t('fleetConfig: enabled but missing config ⇒ not ok, lists missing', () => {
+    const c = fleetConfig({ AQ_FLEET_DASH: '1' });
+    assert.equal(c.enabled, true);
+    assert.equal(c.ok, false);
+    // Both sides are sorted so the check does not depend on reporting order.
+    assert.deepEqual(c.missing.sort(), ['AQ_FLEET_API', 'AQ_FLEET_TOKEN', 'AQ_PRODUCT_ID'].sort());
+  });
+  // A complete config is ok, and the base URL loses its trailing slash.
+  t('fleetConfig: enabled + complete ⇒ ok, trims trailing slash', () => {
+    const c = fleetConfig({ AQ_FLEET_DASH: '1', AQ_FLEET_API: 'http://svc/api/', AQ_FLEET_TOKEN: 'k', AQ_PRODUCT_ID: 'p' });
+    assert.equal(c.ok, true);
+    assert.equal(c.api, 'http://svc/api');
+  });
+
+  // ── mapStage ──
+  // Every known fleet stage maps to its board bucket; the last assertion pins
+  // the fallback for an unrecognized stage.
+  t('mapStage: fleet stages collapse to board buckets', () => {
+    assert.equal(mapStage('queued'), 'inbox');
+    assert.equal(mapStage('assigned'), 'building');
+    assert.equal(mapStage('building'), 'building');
+    assert.equal(mapStage('review'), 'review');
+    assert.equal(mapStage('testing'), 'testing');
+    assert.equal(mapStage('shipped'), 'shipped');
+    assert.equal(mapStage('failed'), 'failed');
+    assert.equal(mapStage('dead_letter'), 'failed');
+    assert.equal(mapStage('weird'), 'inbox');
+  });
+
+  // ── toBoard ──
+  // One job per fleet stage (all but `assigned`), so each bucket count and each
+  // derived list (running / items / recent) is exercised in a single board.
+  t('toBoard: counts, actionable items, running, recent', () => {
+    const jobs = [
+      { id: 'a', stage: 'queued', priority: 'high', capabilities: ['os:any'] },
+      { id: 'b', stage: 'building', priority: 'critical', factoryId: 'mac-1', leaseEpoch: 3 },
+      { id: 'c', stage: 'review', updatedAt: '2026-01-01T00:00:02Z' },
+      { id: 'd', stage: 'testing' },
+      { id: 'e', stage: 'shipped', updatedAt: '2026-01-01T00:00:09Z' },
+      { id: 'f', stage: 'failed', updatedAt: '2026-01-01T00:00:05Z' },
+      { id: 'g', stage: 'dead_letter', updatedAt: '2026-01-01T00:00:01Z' },
+    ];
+    const b = toBoard({ jobs });
+    assert.equal(b.counts.inbox, 1);
+    assert.equal(b.counts.building, 1);
+    assert.equal(b.counts.review, 1);
+    assert.equal(b.counts.testing, 1);
+    assert.equal(b.counts.shipped, 1);
+    assert.equal(b.counts.failed, 2); // failed + dead_letter
+    // running = assigned/building only
+    assert.deepEqual(b.running.map((x) => x.id), ['b']);
+    assert.equal(b.running[0].fleetStage, 'building');
+    assert.equal(b.running[0].factoryId, 'mac-1');
+    // actionable items exclude building/shipped, ordered review<testing<failed<inbox
+    assert.deepEqual(b.items.map((x) => x.id), ['c', 'd', 'f', 'g', 'a']);
+    // item.stage is the bucket (so dashboard gate()/STAGE_TAG reuse works)
+    assert.equal(b.items[0].stage, 'review');
+    assert.equal(b.items[4].stage, 'inbox');
+    // recent = shipped+failed, newest first, capped at 5
+    assert.deepEqual(b.recent.map((x) => x.id), ['e', 'f', 'g']);
+  });
+
+  // ── fleetFetch: headers + product scoping on every request ──
+  // Happy path: auth and product headers are sent, the URL is api + pathname,
+  // and the JSON body is parsed.
+  await t('fleetFetch: sends bearer + X-Product-Id; parses JSON', async () => {
+    const f = makeFetch({ '/fleet/jobs': { status: 200, body: { jobs: [] } } });
+    const r = await fleetFetch(CFG, '/fleet/jobs', {}, f);
+    assert.equal(r.ok, true);
+    assert.deepEqual(r.json, { jobs: [] });
+    const h = f.calls[0].headers;
+    assert.equal(h.Authorization, 'Bearer tok');
+    assert.equal(h['X-Product-Id'], 'prodX');
+    assert.equal(f.calls[0].url, 'http://svc/api/fleet/jobs');
+  });
+  // A rejected fetch becomes a result with status 0 carrying the error message.
+  await t('fleetFetch: network error ⇒ ok:false with message (no throw)', async () => {
+    const f = makeFetch({ '/fleet/jobs': { throw: 'boom' } });
+    const r = await fleetFetch(CFG, '/fleet/jobs', {}, f);
+    assert.equal(r.ok, false);
+    assert.equal(r.status, 0);
+    assert.match(r.error, /boom/);
+  });
+  // The stub throws an error named AbortError to stand in for a fired timeout.
+  await t('fleetFetch: abort ⇒ timeout error', async () => {
+    const f = makeFetch({ '/fleet/jobs': { throw: 'aborted', name: 'AbortError' } });
+    const r = await fleetFetch(CFG, '/fleet/jobs', {}, f);
+    assert.equal(r.ok, false);
+    assert.equal(r.error, 'timeout');
+  });
+  // An unparseable body keeps the HTTP status and leaves json as null.
+  await t('fleetFetch: non-JSON 500 body ⇒ ok:false, json null', async () => {
+    const f = makeFetch({ '/fleet/jobs': { status: 500, body: '<html>err</html>' } });
+    const r = await fleetFetch(CFG, '/fleet/jobs', {}, f);
+    assert.equal(r.ok, false);
+    assert.equal(r.status, 500);
+    assert.equal(r.json, null);
+  });
+
+  // ── fetchBoard: assembly + degradation ──
+  // All three endpoints answer: the board carries items, metrics and factories.
+  await t('fetchBoard: jobs + metrics + factories assembled', async () => {
+    const f = makeFetch({
+      '/fleet/jobs': { body: { jobs: [{ id: 'a', stage: 'review' }] } },
+      '/fleet/metrics': { body: { utilizationPct: 50, alerts: [] } },
+      '/fleet/factories': { body: { factories: [{ factoryId: 'mac-1', health: 'ok' }] } },
+    });
+    const r = await fetchBoard(CFG, f);
+    assert.equal(r.ok, true);
+    assert.equal(r.board.items.length, 1);
+    assert.equal(r.board.metrics.utilizationPct, 50);
+    assert.equal(r.board.factories.length, 1);
+  });
+  // The factories endpoint is optional: an error status must not fail the board.
+  await t('fetchBoard: factories 404 ⇒ degrades to []', async () => {
+    const f = makeFetch({
+      '/fleet/jobs': { body: { jobs: [] } },
+      '/fleet/metrics': { body: {} },
+      '/fleet/factories': { status: 404, body: {} },
+    });
+    const r = await fetchBoard(CFG, f);
+    assert.equal(r.ok, true);
+    assert.deepEqual(r.board.factories, []);
+  });
+  // Same degradation when the server reports the endpoint as not implemented.
+  await t('fetchBoard: factories 501 ⇒ degrades to []', async () => {
+    const f = makeFetch({
+      '/fleet/jobs': { body: { jobs: [] } },
+      '/fleet/metrics': { body: {} },
+      '/fleet/factories': { status: 501, body: {} },
+    });
+    const r = await fetchBoard(CFG, f);
+    assert.equal(r.ok, true);
+    assert.deepEqual(r.board.factories, []);
+  });
+  // Metrics are best-effort as well: a failure yields metrics === null.
+  await t('fetchBoard: metrics failure ⇒ board still ok, metrics null', async () => {
+    const f = makeFetch({
+      '/fleet/jobs': { body: { jobs: [] } },
+      '/fleet/metrics': { status: 500, body: 'oops' },
+      '/fleet/factories': { body: { factories: [] } },
+    });
+    const r = await fetchBoard(CFG, f);
+    assert.equal(r.ok, true);
+    assert.equal(r.board.metrics, null);
+  });
+  // Jobs are required: their failure fails the board and reports the status.
+  await t('fetchBoard: jobs failure ⇒ board fails with error', async () => {
+    const f = makeFetch({ '/fleet/jobs': { status: 503, body: '{}' } });
+    const r = await fetchBoard(CFG, f);
+    assert.equal(r.ok, false);
+    assert.match(r.error, /503/);
+  });
+
+  // ── events ──
+  // Only substrings are matched because the time portion is locale-dependent.
+  t('formatEvent: renders type + actor + data', () => {
+    const line = formatEvent({ type: 'claimed', actor: 'mac-1', at: '2026-01-01T00:00:00Z', data: { leaseEpoch: 2 } });
+    assert.match(line, /claimed/);
+    assert.match(line, /mac-1/);
+    assert.match(line, /leaseEpoch/);
+  });
+  // The '/events' route key matches by URL suffix; the last assertion pins the
+  // full request path.
+  await t('fetchEvents: maps events to lines', async () => {
+    const f = makeFetch({
+      '/events': { body: { events: [{ type: 'queued', at: '2026-01-01T00:00:00Z', data: {} }, { type: 'claimed', data: {} }] } },
+    });
+    const r = await fetchEvents(CFG, 'job-1', f);
+    assert.equal(r.ok, true);
+    assert.equal(r.lines.length, 2);
+    assert.match(r.lines[1], /claimed/);
+    assert.match(f.calls[0].url, /\/fleet\/jobs\/job-1\/events$/);
+  });
+  // A failed request still returns a `lines` array, just an empty one.
+  await t('fetchEvents: failure ⇒ ok:false, empty lines', async () => {
+    const f = makeFetch({ '/events': { status: 500, body: 'x' } });
+    const r = await fetchEvents(CFG, 'job-1', f);
+    assert.equal(r.ok, false);
+    assert.deepEqual(r.lines, []);
+  });
+
+  // ── jobAction ──
+  // The route is a function so one URL can answer the GET (fresh epoch 7) and
+  // capture the PATCH body; the item passed in carries a stale epoch of 2.
+  await t('jobAction: ship re-GETs fresh leaseEpoch then PATCHes shipped', async () => {
+    let patchBody = null;
+    const f = makeFetch({
+      'http://svc/api/fleet/jobs/j1': ({ opts }) => {
+        if ((opts.method || 'GET') === 'PATCH') { patchBody = JSON.parse(opts.body); return { status: 200, body: { id: 'j1', stage: 'shipped' } }; }
+        return { status: 200, body: { id: 'j1', stage: 'testing', leaseEpoch: 7 } }; // fresh epoch
+      },
+    });
+    const r = await jobAction(CFG, { id: 'j1', leaseEpoch: 2 /* stale */ }, 'ship', f);
+    assert.equal(r.ok, true);
+    assert.equal(patchBody.stage, 'shipped');
+    assert.equal(patchBody.leaseEpoch, 7); // used the freshly-fetched epoch, not the stale 2
+  });
+  // A 409 on the PATCH must surface as a "refresh and retry" style message.
+  await t('jobAction: ship 409 ⇒ actionable fenced message', async () => {
+    const f = makeFetch({
+      'http://svc/api/fleet/jobs/j1': ({ opts }) => (opts.method === 'PATCH'
+        ? { status: 409, body: '{}' }
+        : { status: 200, body: { id: 'j1', leaseEpoch: 7 } }),
+    });
+    const r = await jobAction(CFG, { id: 'j1' }, 'ship', f);
+    assert.equal(r.ok, false);
+    assert.match(r.message, /fenced|refresh/i);
+  });
+  // requeue is a single POST to the action endpoint (no preliminary GET).
+  await t('jobAction: requeue ⇒ POST /actions/requeue', async () => {
+    const f = makeFetch({ '/fleet/jobs/j1/actions/requeue': { status: 200, body: { id: 'j1', stage: 'queued' } } });
+    const r = await jobAction(CFG, { id: 'j1' }, 'requeue', f);
+    assert.equal(r.ok, true);
+    assert.equal(f.calls[0].method, 'POST');
+    assert.match(f.calls[0].url, /\/actions\/requeue$/);
+  });
+  // A 409 from an action endpoint is reported as a conflict, not a generic error.
+  await t('jobAction: reject 409 ⇒ conflict message', async () => {
+    const f = makeFetch({ '/fleet/jobs/j1/actions/reject': { status: 409, body: '{}' } });
+    const r = await jobAction(CFG, { id: 'j1' }, 'reject', f);
+    assert.equal(r.ok, false);
+    assert.match(r.message, /conflict|terminal|refresh/i);
+  });
+  // promote has no fleet-mode contract, so it must be refused with a message
+  // that names the verb. Awaited like every other async case, so the suite
+  // does not finish while this case is still pending.
+  await t('jobAction: promote ⇒ explicitly unavailable in fleet mode', async () => {
+    const r = await jobAction(CFG, { id: 'j1' }, 'promote', makeFetch({}));
+    assert.equal(r.ok, false);
+    assert.match(r.message, /promote/i);
+  });
+})();
+
+// Summary line (selftest greps for PASS). Printed from the 'exit' hook so it
+// reflects the final process.exitCode after all cases have settled.
+process.on('exit', () => {
+  const failed = Boolean(process.exitCode);
+  if (!failed) {
+    console.log(`fleet-dash.test PASS (${passed} assertions)`);
+    return;
+  }
+  console.error('fleet-dash.test FAIL');
+});
--- a/agent-queue/profiles/backend-engineer.md
+++ b/agent-queue/profiles/backend-engineer.md
@ -0,0 +1,19 @@
+---
+name: backend-engineer
+persona: |
+  You are a senior backend engineer. Favor minimal, well-tested changes. Respect
+  service boundaries, validate inputs, handle errors explicitly, and never log
+  secrets. Prefer existing libraries and patterns over new dependencies. Keep
+  migrations and API changes backward-compatible unless the task says otherwise.
+capabilities: [os:any, node>=20, has:pnpm]
+default-verify: pnpm -s typecheck && pnpm -s test
+engine-class: agentic-coder
+prefers-engine: [devin, claude]
+allowed-scope: ["backend/**", "services/**", "packages/**"]
+review-policy: manual
+---
+
+# backend-engineer
+
+Server-side work. Inherits a typecheck+test verify gate and a scope limited to
+backend/service/package code.
--- a/agent-queue/profiles/developer.md
+++ b/agent-queue/profiles/developer.md
@ -0,0 +1,20 @@
+---
+name: developer
+persona: |
+  You are a pragmatic senior software engineer. Make the smallest correct change
+  that satisfies the task. Match the surrounding code style and existing patterns,
+  keep diffs focused, and never commit secrets. Add or update tests when you change
+  behavior, and explain non-obvious decisions briefly in the commit message.
+capabilities: [os:any, has:git]
+default-verify:
+engine-class: agentic-coder
+prefers-engine: [devin, claude, codex]
+allowed-scope: ["**"]
+review-policy: manual
+---
+
+# developer
+
+General-purpose engineering profile. No default verify (parks in review for a
+human gate) and an unrestricted scope — pick a more specific profile when you
+want a tighter blast radius or an automatic QA gate.
--- a/agent-queue/profiles/docs-writer.md
+++ b/agent-queue/profiles/docs-writer.md
@ -0,0 +1,18 @@
+---
+name: docs-writer
+persona: |
+  You are a technical writer. Produce clear, accurate documentation that matches
+  the repository's existing voice and structure. Update READMEs, guides, and
+  references; keep examples runnable and links valid. Do not change source code
+  beyond doc comments. Never include secrets in examples.
+capabilities: [os:any]
+default-verify:
+engine-class: agentic-coder
+prefers-engine: [claude, devin]
+allowed-scope: ["docs/**", "**/*.md", "**/*.mdx"]
+review-policy: manual
+---
+
+# docs-writer
+
+Documentation profile. Scoped to docs + markdown; parks in review for a human read.
--- a/agent-queue/profiles/frontend-engineer.md
+++ b/agent-queue/profiles/frontend-engineer.md
@ -0,0 +1,18 @@
+---
+name: frontend-engineer
+persona: |
+  You are a senior frontend engineer. Build accessible, responsive UI that matches
+  the existing component library and design tokens. Keep state management simple,
+  avoid unnecessary dependencies, and ensure type-safety. Verify the build and
+  tests pass before finishing; never hardcode secrets or API keys.
+capabilities: [os:any, node>=20, has:pnpm]
+default-verify: pnpm -s typecheck && pnpm -s build
+engine-class: agentic-coder
+prefers-engine: [claude, devin]
+allowed-scope: ["dashboards/**", "apps/**", "packages/ui/**", "src/**"]
+review-policy: manual
+---
+
+# frontend-engineer
+
+Client/UI work. Inherits a typecheck+build gate and a UI-oriented scope.
--- a/agent-queue/profiles/planner.md
+++ b/agent-queue/profiles/planner.md
@ -0,0 +1,19 @@
+---
+name: planner
+persona: |
+  You are a planning agent. Break an objective into a dependency-ordered set of
+  small, well-scoped tasks, each mappable to a job .md (with a profile, scope, and
+  verify). Output the plan as markdown; do not implement the tasks yourself.
+capabilities: [os:any]
+default-verify:
+engine-class: agentic-coder
+prefers-engine: [claude]
+allowed-scope: ["docs/**", "**/*.md"]
+review-policy: manual
+---
+
+# planner (reserved)
+
+Reserved for a future planning/decomposition flow that emits child jobs with
+`deps:` wiring. Usable today as a docs-scoped persona; automatic job emission is
+a later slice.
--- a/agent-queue/profiles/qa.md
+++ b/agent-queue/profiles/qa.md
@ -0,0 +1,18 @@
+---
+name: qa
+persona: |
+  You are a QA engineer. Write and strengthen tests; reproduce bugs with a failing
+  test first, then confirm the fix. Cover edge cases, error paths, and regressions.
+  Do not weaken or delete existing tests to make a suite pass — fix the cause.
+  Keep tests deterministic and fast.
+capabilities: [os:any, node>=20, has:pnpm]
+default-verify: pnpm -s test
+engine-class: agentic-coder
+prefers-engine: [codex, claude]
+allowed-scope: ["**/*.test.*", "**/*.spec.*", "test/**", "tests/**", "e2e/**"]
+review-policy: manual
+---
+
+# qa
+
+Test-focused profile. Inherits a `pnpm -s test` gate and a test-files scope.
--- a/agent-queue/profiles/reviewer.md
+++ b/agent-queue/profiles/reviewer.md
@ -0,0 +1,19 @@
+---
+name: reviewer
+persona: |
+  You are a code reviewer. Do NOT modify code. Read the diff/changes and produce a
+  concise review: correctness, security, tests, readability, and scope adherence.
+  Flag risky or out-of-scope changes and supply-chain concerns (edits to shared
+  packages). Output findings as markdown with severity labels.
+capabilities: [os:any, has:git]
+default-verify:
+engine-class: review-only
+prefers-engine: [claude]
+allowed-scope: ["docs/**", "**/*.md"]
+review-policy: manual
+---
+
+# reviewer
+
+Read-only review profile. `engine-class: review-only` has no concrete runner
+mapping yet (reserved) — use an explicit `engine:` until a review engine lands.
--- a/agent-queue/profiles/ui-designer.md
+++ b/agent-queue/profiles/ui-designer.md
@ -0,0 +1,19 @@
+---
+name: ui-designer
+persona: |
+  You are a UI/visual designer. Focus on visual hierarchy, spacing, color, and
+  typography using the existing design tokens and component library. Keep changes
+  consistent with the design system, ensure sufficient contrast, and respect
+  light/dark themes. Prefer token references over hardcoded values.
+capabilities: [os:any, node>=20]
+default-verify:
+engine-class: agentic-coder
+prefers-engine: [claude, devin]
+allowed-scope: ["packages/ui/**", "packages/design-tokens/**", "**/*.css", "design/**"]
+review-policy: manual
+---
+
+# ui-designer
+
+Visual/design-system work scoped to UI + tokens + styles. Parks in review for a
+human visual check.
--- a/agent-queue/profiles/ux-designer.md
+++ b/agent-queue/profiles/ux-designer.md
@ -0,0 +1,19 @@
+---
+name: ux-designer
+persona: |
+  You are a UX designer. Focus on user flows, information architecture, and
+  interaction states (empty, loading, error, success). Produce wireframes,
+  flow descriptions, and copy as markdown/specs. Justify decisions with usability
+  heuristics and accessibility (WCAG) considerations. Do not change production code.
+capabilities: [os:any]
+default-verify:
+engine-class: agentic-coder
+prefers-engine: [claude]
+allowed-scope: ["docs/**", "design/**", "**/*.md"]
+review-policy: manual
+---
+
+# ux-designer
+
+Flows, IA, and interaction specs. Documentation-scoped; parks in review for human
+sign-off (no automatic verify gate).
--- a/agent-queue/queue/.state/.gitkeep
+++ b/agent-queue/queue/.state/.gitkeep
--- a/agent-queue/queue/building/.gitkeep
+++ b/agent-queue/queue/building/.gitkeep
--- a/agent-queue/queue/failed/.gitkeep
+++ b/agent-queue/queue/failed/.gitkeep
--- a/agent-queue/queue/inbox/.gitkeep
+++ b/agent-queue/queue/inbox/.gitkeep
--- a/agent-queue/queue/logs/.gitkeep
+++ b/agent-queue/queue/logs/.gitkeep
--- a/agent-queue/queue/review/.gitkeep
+++ b/agent-queue/queue/review/.gitkeep
--- a/agent-queue/queue/shipped/.gitkeep
+++ b/agent-queue/queue/shipped/.gitkeep
--- a/agent-queue/queue/testing/.gitkeep
+++ b/agent-queue/queue/testing/.gitkeep
--- a/agent-queue/selftest.sh
+++ b/agent-queue/selftest.sh
--- a/aliases/README.md
+++ b/aliases/README.md
@ -39,7 +39,7 @@ The loader can be sourced from any directory. It discovers the `aliases/` folder
 ## Requirements

 - Supported shells: Bash and Zsh
- Optional commands used by aliases: `git`, `tmux`, `tree`, and `vim` or `$EDITOR`
+- Optional commands used by aliases: `git`, `tmux`, `tree`, `vim` or `$EDITOR`, and `caffeinate` (macOS, for `awake`/`longrun`)

 ## Examples

@ -49,8 +49,17 @@ gd          # git diff
 tl          # tmux list-sessions
 tn work     # tmux new-session -s work
 ta work     # tmux attach-session -t work
+aq <cmd>    # agent-queue runner (init|add|run|status|watch|dash|stop|logs)
+aqs         # agent-queue status
+aqd         # agent-queue Node live dashboard
+awake <cmd> # macOS: run <cmd> while keeping the machine awake (caffeinate -dimsu)
+longrun phase3 codex --full-auto "<prompt>"   # detached+awake+logged overnight run
+ta phase3   # reattach to the run;  tail -f ~/longrun-phase3-*.log  to follow output
 ```

+See [`AI.dev/CHEATSHEETS/long-running-jobs.md`](../../learning_ai_common_plat/AI.dev/CHEATSHEETS/long-running-jobs.md)
+(in `learning_ai_common_plat`) for the full overnight-run guide and best practices.
+
 ## Local Aliases

 Keep machine- or org-specific shortcuts out of the portable default files. Start from `_local.example.alias` if you want private local aliases such as branch-specific git commands.
--- a/aliases/_agent.alias
+++ b/aliases/_agent.alias
@ -0,0 +1,5 @@
+# agent-queue — folder kanban runner for devin/claude/codex CLIs
+# Resolved relative to the aliases dir so it works on any machine/clone.
+# $BYTELYST_ALIAS_DIR is expanded now, when the alias is defined; the escaped
+# inner quotes keep the script path a single word when the alias is later
+# expanded, so a clone path that contains spaces still works.
+alias aq="\"$BYTELYST_ALIAS_DIR/../agent-queue/agent-queue.sh\""
+alias aqs="\"$BYTELYST_ALIAS_DIR/../agent-queue/agent-queue.sh\" status"
+alias aqd="\"$BYTELYST_ALIAS_DIR/../agent-queue/agent-queue.sh\" dash"
--- a/aliases/_longrun.alias
+++ b/aliases/_longrun.alias
@ -0,0 +1,90 @@
+# Long-running / overnight agent runs — keep-awake + detachable tmux + logged output.
+# Full guide: AI.dev/CHEATSHEETS/long-running-jobs.md (in learning_ai_common_plat).
+
+# macOS: keep the machine awake while a command runs (prevents sleep stalling the job).
+# `caffeinate` is macOS-only, so on Linux this alias fails with "command not found"; use `systemd-inhibit` instead.
+alias awake='caffeinate -dimsu'
+
+# longrun <session> <command> [args...]
+#   Runs <command> in a DETACHED tmux session, wrapped in caffeinate (macOS) so the
+#   machine won't sleep, teeing all output to ~/longrun-<session>-<timestamp>.log.
+#   Survives closing the terminal; reattach with `ta <session>`, stop with `tmux kill-session -t <session>`.
+#   Returns 0 once launched, 2 on bad usage, 127 if <command> is not on PATH, 1 on any other failure.
+#   e.g.  longrun phase3 codex --dangerously-bypass-approvals-and-sandbox "Read ... and execute it"
+longrun() {
+  # --- usage ---
+  if [ "$#" -lt 2 ]; then
+    echo "usage: longrun <session> <command> [args...]" >&2
+    echo "  e.g. longrun phase3 codex --full-auto \"<the overnight prompt>\"" >&2
+    echo "  env: LONGRUN_LOG_DIR overrides the log directory (default: \$HOME)" >&2
+    return 2
+  fi
+
+  # --- required dependency: tmux ---
+  if ! command -v tmux >/dev/null 2>&1; then
+    echo "longrun: 'tmux' is required but not installed." >&2
+    echo "  install: macOS 'brew install tmux'  |  Debian/Ubuntu 'sudo apt-get install -y tmux'" >&2
+    return 1
+  fi
+
+  local sess="$1"; shift
+
+  # --- session name must be free ('=' forces an exact match; a bare -t also matches name prefixes) ---
+  if tmux has-session -t "=$sess" 2>/dev/null; then
+    echo "longrun: a tmux session named '$sess' already exists." >&2
+    echo "  attach: ta $sess  |  stop: tmux kill-session -t $sess  |  or choose another name" >&2
+    return 1
+  fi
+
+  # --- the command must be runnable ---
+  if ! command -v "$1" >/dev/null 2>&1; then
+    echo "longrun: command not found on PATH: '$1'" >&2
+    echo "  make sure the agent CLI is installed and your PATH is set for non-login shells." >&2
+    return 127
+  fi
+
+  # --- log file must be writable (created empty here so a bad path fails before launch) ---
+  local ts log dir
+  dir="${LONGRUN_LOG_DIR:-$HOME}"
+  ts="$(date +%Y%m%d-%H%M%S)"
+  log="$dir/longrun-${sess}-${ts}.log"
+  if ! mkdir -p "$dir" 2>/dev/null || ! ( : > "$log" ) 2>/dev/null; then
+    echo "longrun: cannot write log file: $log" >&2
+    echo "  set LONGRUN_LOG_DIR to a writable directory and retry." >&2
+    return 1
+  fi
+
+  # --- optional dependency: caffeinate (keep-awake) ---
+  local keep=""
+  if command -v caffeinate >/dev/null 2>&1; then
+    keep="caffeinate -dimsu "
+  elif [ "$(uname -s 2>/dev/null)" = "Darwin" ]; then
+    echo "longrun: WARNING — 'caffeinate' not found on macOS; the machine may sleep mid-run." >&2
+  else
+    echo "longrun: note — no 'caffeinate' (non-macOS). To prevent sleep, wrap with 'systemd-inhibit'." >&2
+  fi
+
+  # --- launch (detached), capturing any tmux startup error ---
+  local cmd inner errf
+  cmd="$(printf '%q ' "$@")"   # shell-quote each argument so the shell tmux spawns re-parses them intact
+  inner="${keep}${cmd}2>&1 | tee $(printf '%q' "$log")"   # log path is %q-quoted too: quotes/$ in LONGRUN_LOG_DIR cannot break the pipeline
+  errf="$(mktemp 2>/dev/null || echo "/tmp/longrun-err.$$")"
+  if ! tmux new-session -d -s "$sess" "$inner" 2>"$errf"; then
+    echo "longrun: failed to start tmux session '$sess':" >&2
+    [ -s "$errf" ] && sed 's/^/  /' "$errf" >&2
+    rm -f "$errf" "$log"
+    return 1
+  fi
+  rm -f "$errf"
+
+  # --- confirm it is actually running (quick-exit detection; exact match so a similarly named session cannot mask an exit) ---
+  if ! tmux has-session -t "=$sess" 2>/dev/null; then
+    echo "longrun: WARNING — session '$sess' is not running; the command may have exited immediately." >&2
+    echo "  check the log: $log" >&2
+    return 1
+  fi
+
+  echo "[longrun] session=$sess"
+  echo "[longrun] log=$log"
+  echo "[longrun] attach: ta $sess   |   tail: tail -f \"$log\"   |   stop: tmux kill-session -t $sess"
+}
--- a/aliases/_source_all.alias
+++ b/aliases/_source_all.alias
@ -15,3 +15,5 @@ source "$BYTELYST_ALIAS_DIR/_cd.alias"
 source "$BYTELYST_ALIAS_DIR/_ls.alias"
 source "$BYTELYST_ALIAS_DIR/_general.alias"
 source "$BYTELYST_ALIAS_DIR/_shell.alias"
+source "$BYTELYST_ALIAS_DIR/_agent.alias"
+source "$BYTELYST_ALIAS_DIR/_longrun.alias"
--- a/bytelyst-cli.sh
+++ b/bytelyst-cli.sh
@ -16,6 +16,15 @@ YELLOW=$(tput setaf 3)
 BLUE=$(tput setaf 4)
 RESET=$(tput sgr0)

+CLI_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# agent-queue delegates to the standalone tool (no GitHub token / jq / curl needed),
+# so handle it BEFORE the GITHUB_TOKEN + required-tools gates below.
+if [[ "${1:-}" == "agent-queue" || "${1:-}" == "aq" ]]; then
+    shift
+    exec "$CLI_DIR/agent-queue/agent-queue.sh" "$@"
+fi
+
 REQUIRED_TOOLS=(jq curl)

 # Check for required tools
@ -26,17 +35,49 @@ for tool in "${REQUIRED_TOOLS[@]}"; do
    fi
 done

-# Load .env if present
+# Load .env if present. `set -a` exports everything sourced; this safely handles
+# quoted values and spaces, unlike `export $(grep ... | xargs)`.
 if [[ -f .env ]]; then
-    export $(grep -v '^#' .env | xargs)
+    set -a
+    # shellcheck disable=SC1091
+    . ./.env
+    set +a
 fi

-# Validate GITHUB_TOKEN
-if [[ -z "$GITHUB_TOKEN" ]]; then
-    echo "${RED}❌ Error: GITHUB_TOKEN is not set.\nSet it in your environment (e.g., export GITHUB_TOKEN=... in ~/.zshrc, ~/.bashrc, or .env).${RESET}"
+# Validate GITHUB_TOKEN (printf so the newline renders, unlike echo "...\n...")
+if [[ -z "${GITHUB_TOKEN:-}" ]]; then
+    printf '%s❌ Error: GITHUB_TOKEN is not set.\nSet it in your environment (e.g. export GITHUB_TOKEN=... in ~/.zshrc, ~/.bashrc, or .env).%s\n' "$RED" "$RESET" >&2
    exit 1
 fi

+# gh_get_all <url>: fetch EVERY page of a GitHub list endpoint (per_page=100) and print
+# the pages merged into one JSON array on stdout. Any page that is not HTTP 200 aborts with status 1.
+gh_get_all() {
+    local url="$1" pg=1 acc="[]"
+    local sep='?'; [[ "$url" != *'?'* ]] || sep='&'   # URL already carries a query string -> append with '&'
+    while true; do
+        local raw status payload count
+        # -w appends the HTTP status as a final line, so one request yields body + code.
+        raw=$(curl -sS -w $'\n%{http_code}' \
+            -H "Authorization: token $GITHUB_TOKEN" \
+            -H "Accept: application/vnd.github+json" \
+            "${url}${sep}per_page=100&page=${pg}")
+        status="${raw##*$'\n'}"; payload="${raw%$'\n'*}"
+        if [[ "$status" != "200" ]]; then
+            printf '%s❌ GitHub API error (HTTP %s) for %s%s\n' "$RED" "$status" "$url" "$RESET" >&2
+            printf '%s' "$payload" | jq -r '.message? // empty' >&2 2>/dev/null || true
+            return 1
+        fi
+        count=$(printf '%s' "$payload" | jq 'length' 2>/dev/null || echo 0)
+        if [[ "$count" -eq 0 ]]; then break; fi      # empty page: nothing left to fetch
+        acc=$(jq -s 'add' <(printf '%s' "$acc") <(printf '%s' "$payload"))
+        if [[ "$count" -lt 100 ]]; then break; fi    # short page: this was the last one
+        pg=$((pg+1))
+        if [[ "$pg" -gt 100 ]]; then break; fi       # hard stop after 100 pages
+    done
+    printf '%s' "$acc"
+}
+
 usage() {
    echo "${BLUE}Bytelyst CLI - Unified GitHub DevOps Tool${RESET}"
    echo ""
@ -47,6 +88,7 @@ usage() {
    echo "  check-collaborators --input <input.json>"
    echo "  export              --type <repos|users> --output <file.json>"
    echo "  remove-user-from-all-repos --user <username> [--input <file.json>]"
+    echo "  agent-queue (aq)    <init|add|run|status|watch|dash|stop|logs|requeue|clean> — agent prompt queue runner"
    echo "  help                Show this help message"
    echo ""
    echo "If no command is given, an interactive menu will be shown."
@ -65,8 +107,9 @@ list_public_repos() {
        echo "${RED}❌ Please provide --user <username>.${RESET}"; exit 1
    fi
    echo "${BLUE}🔍 Fetching all public repositories for user: $user...${RESET}"
-    local response=$(curl -s -H "Authorization: token $GITHUB_TOKEN" "https://api.github.com/users/$user/repos?per_page=100&type=public")
-    local repos=$(echo "$response" | jq -r '.[].full_name')
+    local json repos
+    json=$(gh_get_all "https://api.github.com/users/$user/repos?type=public") || exit 1
+    repos=$(printf '%s' "$json" | jq -r '.[].full_name')
    if [[ -z "$repos" ]]; then
        echo "${YELLOW}🚫 No public repositories found for user.${RESET}"
    else
@ -87,8 +130,9 @@ list_private_repos() {
        echo "${RED}❌ Please provide --org <orgname>.${RESET}"; exit 1
    fi
    echo "${BLUE}🔍 Fetching all private repositories for org: $org...${RESET}"
-    local response=$(curl -s -H "Authorization: token $GITHUB_TOKEN" "https://api.github.com/orgs/$org/repos?per_page=100&type=private")
-    local repos=$(echo "$response" | jq -r '.[].full_name')
+    local json repos
+    json=$(gh_get_all "https://api.github.com/orgs/$org/repos?type=private") || exit 1
+    repos=$(printf '%s' "$json" | jq -r '.[].full_name')
    if [[ -z "$repos" ]]; then
        echo "${YELLOW}🚫 No private repositories found for org.${RESET}"
    else
@ -117,12 +161,19 @@ check_collaborators() {
    fi
    for repo in "${repos[@]}"; do
        echo "${BLUE}🔍 Checking repo: $repo${RESET}"
-        local collaborators=$(curl -s -H "Authorization: token $GITHUB_TOKEN" "https://api.github.com/repos/$org/$repo/collaborators" | jq -r '.[].login')
+        local cjson collaborators
+        cjson=$(gh_get_all "https://api.github.com/repos/$org/$repo/collaborators") \
+            || { echo "${YELLOW}⚠️ Skipping $repo (API error).${RESET}"; continue; }
+        collaborators=$(printf '%s' "$cjson" | jq -r '.[].login')
        local non_whitelisted=()
        for collab in $collaborators; do
-            if [[ ! " ${whitelist[@]} " =~ " ${collab} " ]]; then
-                non_whitelisted+=("$collab")
-            fi
+            # explicit membership test (avoids the array-concatenation pitfall of
+            # [[ " ${whitelist[@]} " =~ " $collab " ]], which false-matches substrings)
+            local is_white=false w
+            for w in "${whitelist[@]}"; do
+                [[ "$w" == "$collab" ]] && { is_white=true; break; }
+            done
+            $is_white || non_whitelisted+=("$collab")
        done
        if [[ ${#non_whitelisted[@]} -gt 0 ]]; then
            echo "${YELLOW}🚨 Repository: $repo${RESET}"
@ -209,14 +260,15 @@ remove_user_from_all_repos() {

 interactive_menu() {
    echo "${BLUE}Bytelyst CLI Interactive Menu${RESET}"
-    select opt in "List Public Repos" "List Private Repos" "Check Collaborators" "Export JSON" "Remove User from All Repos" "Exit"; do
+    select opt in "List Public Repos" "List Private Repos" "Check Collaborators" "Export JSON" "Remove User from All Repos" "Agent Queue Status" "Exit"; do
        case $REPLY in
            1) read -p "Enter GitHub username: " user; list_public_repos --user "$user";;
            2) read -p "Enter GitHub org: " org; list_private_repos --org "$org";;
            3) read -p "Enter path to input.json: " input; check_collaborators --input "$input";;
            4) read -p "Export type (repos/users): " type; read -p "Output file: " output; export_json --type "$type" --output "$output";;
            5) read -p "Enter GitHub username: " user; remove_user_from_all_repos --user "$user";;
-            6) exit 0;;
+            6) "$CLI_DIR/agent-queue/agent-queue.sh" status;;
+            7) exit 0;;
            *) echo "Invalid option.";;
        esac
    done
@ -234,6 +286,7 @@ case $1 in
    check-collaborators) shift; check_collaborators "$@";;
    export) shift; export_json "$@";;
    remove-user-from-all-repos) shift; remove_user_from_all_repos "$@";;
+    agent-queue|aq) shift; exec "$CLI_DIR/agent-queue/agent-queue.sh" "$@";;
    help|--help|-h) usage;;
    *) echo "${RED}Unknown command: $1${RESET}"; usage; exit 1;;
 esac 
--- a/cli-install-report.md
+++ b/cli-install-report.md
@ -0,0 +1,49 @@
+# CLI install report (WSL) — final
+
+Generated: 2026-05-29T21:20:00-07:00
+
+System: WSL Ubuntu (user v-sadhandapa)
+
+Installed CLIs (verified):
+
+- Claude Code
+  - Path: /home/v-sadhandapa/.local/bin/claude
+  - Version: 2.1.158
+  - Install method: official installer (https://claude.ai/install.sh)
+  - Auth: claude auth login   (or claude auth setup-token; ANTHROPIC_API_KEY env)
+
+- OpenAI Codex
+  - Path: /home/v-sadhandapa/.npm-global/bin/codex
+  - Version: codex-cli 0.135.0
+  - Install method: npm i -g @openai/codex (fallback installer used if necessary)
+  - Auth: codex login  (or: printenv OPENAI_API_KEY | codex login --with-api-key)
+
+- Devin
+  - Path: /home/v-sadhandapa/.local/bin/devin
+  - Version: devin 2026.5.26-2
+  - Install method: official installer (https://cli.devin.ai/install.sh)
+  - Auth: devin auth login
+
+- Antigravity (agy)
+  - Path: /home/v-sadhandapa/.local/bin/agy
+  - Version: 1.0.3
+  - Install method: official installer (https://antigravity.google/cli/install.sh)
+  - Auth: agy login
+
+- GitHub Copilot CLI
+  - Path: /snap/bin/copilot
+  - Version: 1.0.56
+  - Install method: sudo snap install copilot-cli
+  - Auth: copilot auth login
+
+PATH changes made:
+- ~/.npm-global/bin and ~/.local/bin were added to ~/.profile and ~/.bashrc (persisted)
+
+Symlinks:
+- A helper script was added to the repo: ./make_symlinks_wsl.sh — run it to create /usr/local/bin symlinks (requires sudo).
+
+Notes:
+- No API keys or credentials were added to any shell profiles.
+- For interactive logins, run the auth commands listed above; they may prompt or open device-flow URLs.
+- Logs: ~/cli-install-wsl.log
+
--- a/dashboard/.gitea/workflows/ci.yml
+++ b/dashboard/.gitea/workflows/ci.yml
@ -11,74 +11,86 @@ on:
      - 'pnpm-lock.yaml'
      - 'pnpm-workspace.yaml'
      - '.pnpmfile.cjs'
+      - '.gitea/workflows/ci.yml'
+  pull_request:
+    paths:
+      - 'backend/**'
+      - 'web/**'
+      - 'shared/**'
+      - 'package.json'
+      - 'pnpm-lock.yaml'
+      - 'pnpm-workspace.yaml'
+      - '.pnpmfile.cjs'
+      - '.gitea/workflows/ci.yml'

 concurrency:
  group: ci-devops-dashboard-${{ github.ref }}
  cancel-in-progress: true

+env:
+  # Self-contained CI: resolve @bytelyst/* deps from the local Gitea registry
+  # rather than a sibling learning_ai_common_plat checkout on the runner.
+  BYTELYST_PACKAGE_SOURCE: gitea
+
 jobs:
  build-and-test:
    name: Build, Test & Typecheck
    runs-on: ubuntu-latest
    timeout-minutes: 15
    steps:
-      - name: Pull latest
+      # Check out into the runner workspace (${{ gitea.workspace }}) instead of
+      # cd-ing into a hard-coded host path and `git reset --hard` on the live
+      # checkout. CI must never mutate an operator's working tree.
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Node
+        uses: actions/setup-node@v4
+        with:
+          node-version: 22
+
+      - name: Enable pnpm
        run: |
-          cd /opt/bytelyst/bytelyst-devops-tools/dashboard
-          git fetch origin main
-          git checkout main
-          git reset --hard origin/main
+          corepack enable
+          corepack prepare pnpm@10.6.5 --activate

      - name: Secret scan
-        run: |
-          cd /opt/bytelyst/bytelyst-devops-tools/dashboard
-          pnpm secret-scan
+        run: pnpm secret-scan

      - name: Install dependencies
-        run: |
-          cd /opt/bytelyst/bytelyst-devops-tools/dashboard
-          pnpm install:common-plat
-
-      - name: Build backend
-        run: |
-          cd /opt/bytelyst/bytelyst-devops-tools/dashboard
-          pnpm --filter @bytelyst/devops-backend build
-
-      - name: Build web
-        run: |
-          cd /opt/bytelyst/bytelyst-devops-tools/dashboard
-          pnpm --filter @bytelyst/devops-web build
-
-      - name: Typecheck backend
-        run: |
-          cd /opt/bytelyst/bytelyst-devops-tools/dashboard
-          pnpm --filter @bytelyst/devops-backend typecheck
-
-      - name: Typecheck web
-        run: |
-          cd /opt/bytelyst/bytelyst-devops-tools/dashboard
-          pnpm --filter @bytelyst/devops-web typecheck
-
-      - name: Test backend
-        run: |
-          cd /opt/bytelyst/bytelyst-devops-tools/dashboard
-          pnpm --filter @bytelyst/devops-backend test:run
-
-      - name: Test web
-        run: |
-          cd /opt/bytelyst/bytelyst-devops-tools/dashboard
-          pnpm --filter @bytelyst/devops-web test:run
+        run: pnpm install:gitea

      - name: Lint
-        run: |
-          cd /opt/bytelyst/bytelyst-devops-tools/dashboard
-          pnpm --filter @bytelyst/devops-backend lint
-          pnpm --filter @bytelyst/devops-web lint
+        run: pnpm lint
+
+      - name: Typecheck
+        run: pnpm typecheck
+
+      - name: Build
+        run: pnpm build
+
+      - name: Unit tests
+        run: pnpm test:run
+
+      # Coverage gate for the backend's tested modules (auth, csrf, health,
+      # hermes-ops, deployments/orchestrator, services). Thresholds live in
+      # `backend/vitest.config.ts`. Add files there as they gain real tests
+      # — ratchet up, never relax.
+      - name: Coverage gate (backend)
+        run: pnpm --filter @bytelyst/devops-backend test:coverage
+
+      # Playwright browsers are pulled per-CI-run. The web suite (`pnpm
+      # test:e2e`) starts its own Next dev server via Playwright's
+      # `webServer` config; the backend is intentionally NOT started — the
+      # hermes spec intercepts `/api/hermes/ops` (which would otherwise
+      # need to shell out to systemctl/git/ps on a live VM) and the
+      # dashboard spec mocks every other backend route via `page.route`.
+      # See `docs/prompts/ci-e2e-hardening.md` for the design.
+      - name: Install Playwright browsers
+        run: pnpm --filter @bytelyst/devops-web exec playwright install --with-deps chromium

      - name: E2E tests
-        run: |
-          cd /opt/bytelyst/bytelyst-devops-tools/dashboard
-          pnpm --filter @bytelyst/devops-web test:e2e
+        run: pnpm --filter @bytelyst/devops-web test:e2e

  docker-build:
    name: Build Docker Images
@ -86,26 +98,17 @@ jobs:
    needs: [build-and-test]
    timeout-minutes: 20
    steps:
-      - name: Pull latest
-        run: |
-          cd /opt/bytelyst/bytelyst-devops-tools/dashboard
-          git fetch origin main
-          git checkout main
-          git reset --hard origin/main
+      - name: Checkout
+        uses: actions/checkout@v4

      - name: Build backend Docker image
-        run: |
-          cd /opt/bytelyst/bytelyst-devops-tools/dashboard
-          docker build -f backend/Dockerfile -t devops-backend:latest .
+        run: docker build -f backend/Dockerfile -t devops-backend:latest .

      - name: Build web Docker image
-        run: |
-          cd /opt/bytelyst/bytelyst-devops-tools/dashboard
-          docker build -f web/Dockerfile -t devops-web:latest .
+        run: docker build -f web/Dockerfile -t devops-web:latest .

      - name: Test Docker Compose
        run: |
-          cd /opt/bytelyst/bytelyst-devops-tools/dashboard
          docker compose up -d
          sleep 10
          docker compose down
--- a/dashboard/.npmrc
+++ b/dashboard/.npmrc
@ -0,0 +1 @@
+@bytelyst:registry=http://localhost:3300/api/packages/learning_ai_user/npm/
--- a/dashboard/DEPLOYMENT.md
+++ b/dashboard/DEPLOYMENT.md
@ -1,111 +1,533 @@
-# DevOps Dashboard Deployment Guide
+# DevOps & Admin Dashboard Deployment Guide

-## Current Status
+> Canonical deployment doc for `dashboard/`. The previous `DEPLOYMENT_GUIDE.md`
+> has been folded into this file; it remains as a one-line redirect for
+> backwards compatibility with `deploy.sh` and external links.

-The DevOps dashboard has been significantly enhanced with production-ready features, but deployment requires resolving workspace dependencies.
+## Overview

-## Dependency Issues
+This guide covers deploying both the DevOps Dashboard and Platform Admin Dashboard using the existing Traefik gateway infrastructure, following the same pattern as the trading dashboard (https://invttrdg.bytelyst.com).

-The dashboard currently depends on workspace packages from `learning_ai_common_plat`:
- `@bytelyst/config` - Configuration management
- `@bytelyst/auth` - Authentication utilities
- `@bytelyst/cosmos` - Cosmos DB client
- `@bytelyst/errors` - Error handling
- `@bytelyst/react-auth` - React auth context
- `@bytelyst/telemetry-client` - Telemetry
+## Public URLs

-## Deployment Options
+For the full living bookmark list across all ByteLyst apps, APIs, Hermes
+dashboards, and last deploy timestamps, see
+[`../docs/app-url-bookmarks.md`](../docs/app-url-bookmarks.md).

-### Option 1: Deploy with Common Platform (Recommended)
+- **DevOps Dashboard**: `https://devops.bytelyst.com`
+- **Admin Dashboard**: `https://admin.bytelyst.com`
+- **API Gateway**: `https://api.bytelyst.com`
+  - Platform API: `https://api.bytelyst.com/platform/api`
+  - DevOps API: `https://api.bytelyst.com/api/devops`

-**Prerequisites:**
-1. Ensure `learning_ai_common_plat` packages are built and available
-2. Configure npm registry to point to local package registry
-3. Use the provided install scripts
+## Ports — quick reference
+
+The web container always listens on **3000** internally; what changes is what
+the host exposes. Memorize the column for the deployment mode you're in:
+
+| Mode                                | Web (host)         | Backend (host)    | Notes                                                              |
+|-------------------------------------|--------------------|-------------------|--------------------------------------------------------------------|
+| Local dev (`pnpm dev`)              | `localhost:3000`   | `localhost:4004`  | Next listens directly on 3000.                                     |
+| Docker Compose (this repo)          | `localhost:3049`   | `localhost:4004`  | `docker-compose.yml` maps `127.0.0.1:3049:3000` (loopback only).   |
+| Production (Traefik)                | `https://devops.bytelyst.com` | `https://api.bytelyst.com/api/devops` | Traefik label `loadbalancer.server.port=3000` targets the container port. |
+
+Whenever a doc says "the dashboard runs on port 3000", it means the **container
+port** seen by Traefik / Next dev mode — not the host port for the deployed
+stack. Use the table above instead of relying on prose.
+
+## Architecture
+
+```
+Internet → Traefik Gateway → Services
+                              ├─ DevOps Web      (container :3000, host :3049)
+                              ├─ DevOps Backend  (:4004)
+                              ├─ Admin Web       (:3001)
+                              ├─ Platform Service (:4003)
+                              └─ Trading Dashboard (:3085)
+```
+
+- **Traefik**: API gateway and reverse proxy.
+- **Docker network**: All services connect via `learning_ai_common_plat_default`.
+- **Domain routing**: Traefik routes by host header.
+- **SSL/TLS**: Managed by Traefik with Let's Encrypt.
+
+## Prerequisites
+
+1. Platform stack running with Traefik gateway.
+2. Docker and Docker Compose installed.
+3. Domain names configured with DNS pointing to your server.
+4. Azure Cosmos DB account (shared with platform-service).
+5. Platform Service running and accessible.
+
+## Quick Start
+
+### 1. Start the platform stack (if not running)

-**Steps:**
 ```bash
-cd /opt/bytelyst/bytelyst-devops-tools/dashboard
-
-# Install dependencies with common platform
-pnpm install:common-plat
-
-# Build both backend and web
-pnpm build
-
-# Deploy with Docker Compose
+cd /opt/bytelyst/learning_ai_common_plat
 docker-compose up -d
 ```

-### Option 2: Deploy Standalone (Simplified)
+### 2. Deploy the dashboards

-**Prerequisites:**
-1. Remove workspace dependencies
-2. Implement simplified auth/config/cosmos layers
-3. Set up environment variables
+```bash
+cd /opt/bytelyst/learning_ai_devops_tools/dashboard
+./deploy.sh
+```

-**Environment Variables Required:**
-```env
+This will:
+- Deploy the DevOps Dashboard (backend + web)
+- Deploy the Admin Dashboard via the platform stack
+- Run health checks
+- Print deployment information
+
+## Local development
+
+If you only need a non-containerized iteration loop (no Traefik, no Docker):
+
+```bash
+cd /opt/bytelyst/learning_ai_devops_tools/dashboard
+
+# Resolve workspace deps
+pnpm install:common-plat   # uses sibling learning_ai_common_plat checkout
+# or
+pnpm install:gitea         # uses local Gitea registry at localhost:3300
+
+pnpm dev                   # backend on 4004, web on 3000 (NOT 3049)
+```
+
+Required env vars are documented under **Environment Configuration** below; for
+local dev a minimal `.env` with `JWT_SECRET`, `COSMOS_*`, and
+`PLATFORM_SERVICE_URL` is enough.
+
+## Manual Docker deployment
+
+### Deploy DevOps Dashboard
+
+```bash
+cd /opt/bytelyst/learning_ai_devops_tools/dashboard
+docker-compose up -d --build
+```
+
+### Deploy Admin Dashboard
+
+```bash
+cd /opt/bytelyst/learning_ai_common_plat
+docker-compose up -d admin-web
+```
+
+## Environment Configuration
+
+### DevOps Dashboard (`.env`)
+
+```bash
+# Backend
 PORT=4004
-PLATFORM_SERVICE_URL=http://localhost:4003
+PLATFORM_SERVICE_URL=http://platform-service:4003
 COSMOS_ENDPOINT=https://your-cosmos-account.documents.azure.com:443/
 COSMOS_KEY=your-cosmos-primary-key
 COSMOS_DATABASE=bytelyst-platform
-JWT_SECRET=your-jwt-signing-secret
-CSRF_SECRET=your-csrf-secret-change-in-production
+JWT_SECRET=your-production-jwt-secret
+CSRF_SECRET=your-production-csrf-secret
+ENCRYPTION_KEY=your-production-encryption-key
+PRODUCT_ID=bytelyst-devops
+PRODUCT_NAME=ByteLyst DevOps Dashboard
+
+# Azure Key Vault (optional)
+AZURE_TENANT_ID=your-tenant-id
+AZURE_CLIENT_ID=your-client-id
+AZURE_CLIENT_SECRET=your-client-secret
+AZURE_KEY_VAULT_URL=https://your-keyvault.vault.azure.net/
+
+# Frontend
+NEXT_PUBLIC_DEVOPS_API_URL=https://api.bytelyst.com/devops
+NEXT_PUBLIC_PLATFORM_URL=https://api.bytelyst.com/platform/api
+NEXT_PUBLIC_ADMIN_WEB_URL=https://admin.bytelyst.com
+NEXT_PUBLIC_PRODUCT_ID=bytelyst-devops
+NEXT_PUBLIC_PRODUCT_NAME=ByteLyst DevOps Dashboard
 ```

-**Steps:**
+### Platform Dashboard (`.env`)
+
+Add to your platform `.env`:
+
 ```bash
-cd /opt/bytelyst/bytelyst-devops-tools/dashboard/backend
-npm install
-npm run build
-npm start
-
-# In another terminal:
-cd /opt/bytelyst/bytelyst-devops-tools/dashboard/web
-npm install
-npm run build
-npm start
+# Admin Web Dashboard
+NEXT_PUBLIC_PLATFORM_URL=https://api.bytelyst.com/platform/api
+NEXT_PUBLIC_DEVOPS_WEB_URL=https://devops.bytelyst.com
 ```

-### Option 3: Deploy to Production Server
+## Traefik Configuration

-**Prerequisites:**
-1. Production server with Node.js 22+
-2. Azure Cosmos DB account
-3. Platform service instance
-4. Docker installed
+Both dashboards use Traefik labels for routing.

-**Steps:**
+### DevOps Web
+
+```yaml
+labels:
+  - 'traefik.enable=true'
+  - 'traefik.http.routers.devops-web.rule=Host(`devops.bytelyst.com`)'
+  - 'traefik.http.services.devops-web.loadbalancer.server.port=3000'   # container port
+```
+
+### DevOps Backend API
+
+```yaml
+labels:
+  - 'traefik.enable=true'
+  - 'traefik.http.routers.devops-api.rule=PathPrefix(`/api/devops`)'
+  - 'traefik.http.services.devops-api.loadbalancer.server.port=4004'
+```
+
+### Admin Web
+
+```yaml
+labels:
+  - 'traefik.enable=true'
+  - 'traefik.http.routers.admin-web.rule=Host(`admin.bytelyst.com`)'
+  - 'traefik.http.services.admin-web.loadbalancer.server.port=3001'
+```
+
+## DNS Configuration
+
+Add DNS records pointing to your Traefik gateway server:
+
+```
+devops.bytelyst.com      A  <your-server-ip>
+admin.bytelyst.com       A  <your-server-ip>
+api.bytelyst.com         A  <your-server-ip>
+```
+
+## SSL/TLS Configuration
+
+Traefik can automatically handle SSL certificates with Let's Encrypt:
+
+```yaml
+command:
+  - '--certificatesresolvers.myresolver.acme.tlschallenge=true'
+  - '--certificatesresolvers.myresolver.acme.email=admin@bytelyst.com'
+  - '--certificatesresolvers.myresolver.acme.storage=/letsencrypt/acme.json'
+```
+
+Then update router labels:
+
+```yaml
+labels:
+  - 'traefik.http.routers.devops-web.tls=true'
+  - 'traefik.http.routers.devops-web.tls.certresolver=myresolver'
+```
+
+## Cross-Navigation
+
+### DevOps Dashboard → Admin Dashboard
+- Header includes a "Platform Admin" link with Shield icon.
+- Opens admin dashboard in a new tab.
+- Uses `NEXT_PUBLIC_ADMIN_WEB_URL`.
+
+### Admin Dashboard → DevOps Dashboard
+- Sidebar includes a "DevOps Dashboard" link with Server icon.
+- Opens devops dashboard in a new tab.
+- Uses `NEXT_PUBLIC_DEVOPS_WEB_URL`.
+
+## Shared Authentication
+
+1. **Platform Service Auth**: Both authenticate against platform-service.
+2. **JWT Tokens**: Same `JWT_SECRET` validates tokens across services.
+3. **Per-Product Access**: Admin access is checked per-product via membership roles.
+4. **Single Sign-On**: Users stay logged in across both dashboards.
+
+### Granting Access
+
+To grant a user access to both dashboards:
+
+1. Ensure user exists in platform-service.
+2. Add admin membership for both products:
+
+```json
+{
+  "memberships": [
+    { "productId": "bytelyst-devops",   "role": "admin", "plan": "pro" },
+    { "productId": "bytelyst-platform", "role": "admin", "plan": "pro" }
+  ]
+}
+```
+
+## Health Checks
+
+- DevOps Backend: `http://localhost:4004/health`
+- DevOps Web: `http://localhost:3049` (Docker Compose host port; container :3000)
+- Admin Web: `http://localhost:3001`
+- Traefik Dashboard: `http://localhost:8080`
+
+## Troubleshooting
+
+### Network issues
 ```bash
-# Build Docker images
-docker-compose build
+# Check if the platform network exists
+docker network inspect learning_ai_common_plat_default

-# Tag and push to registry
-docker tag devops-backend:latest your-registry/devops-backend:latest
-docker tag devops-web:latest your-registry/devops-web:latest
-docker push your-registry/devops-backend:latest
-docker push your-registry/devops-web:latest
-
-# On production server:
-docker pull your-registry/devops-backend:latest
-docker pull your-registry/devops-web:latest
-docker-compose -f docker-compose.prod.yml up -d
+# Check container connectivity
+docker network inspect learning_ai_common_plat_default | grep devops
 ```

+### Traefik routing
+```bash
+# Traefik dashboard — open this URL in a browser (it is not a shell command):
+#   http://localhost:8080
+
+# Traefik logs
+docker logs $(docker ps -q -f name=gateway)
+
+# Router config for the devops web container
+docker inspect devops-web | grep -A 10 Labels
+```
+
+### Authentication failures
+- Verify `JWT_SECRET` matches across all services.
+- Check platform-service is accessible: `curl http://localhost:4003/health`.
+- Ensure the user has the right product memberships.
+
+### Service not starting
+```bash
+docker logs devops-backend
+docker logs devops-web
+docker logs admin-web
+docker ps
+docker inspect devops-backend | grep -A 5 Health
+```
+
+### Workspace dependency errors
+```bash
+pnpm install:common-plat   # local sibling checkout
+pnpm install:gitea         # local Gitea registry
+```
+
+## Service Management
+
+### Stop services
+```bash
+cd /opt/bytelyst/learning_ai_devops_tools/dashboard
+docker-compose down
+
+cd /opt/bytelyst/learning_ai_common_plat
+docker-compose stop admin-web
+```
+
+### Restart services
+```bash
+cd /opt/bytelyst/learning_ai_devops_tools/dashboard
+docker-compose restart
+
+cd /opt/bytelyst/learning_ai_common_plat
+docker-compose restart admin-web
+```
+
+### View logs
+```bash
+# DevOps
+docker logs -f devops-backend
+docker logs -f devops-web
+
+# Admin
+docker logs -f admin-web
+
+# Traefik
+docker logs -f gateway
+```
+
+## Comparison with Trading Dashboard
+
+| Feature      | Trading              | DevOps                  | Admin                  |
+|--------------|----------------------|-------------------------|------------------------|
+| Domain       | invttrdg.bytelyst.com| devops.bytelyst.com     | admin.bytelyst.com     |
+| Web Port     | 3085 (host)          | 3049 (host) / 3000 (ctr)| 3001 (host)            |
+| Backend Port | 4018                 | 4004                    | N/A                    |
+| Network      | platform_net         | platform_net            | default                |
+| Traefik      | Yes                  | Yes                     | Yes                    |
+| Auth         | Platform             | Platform                | Platform               |
+
+## Privilege Surface (Docker socket + host mounts)
+
+The `devops-backend` container has root-equivalent access to the host. This
+section documents exactly what is mounted, which routes use each mount, and
+what the blast radius looks like if an admin token leaks. It exists so reviewers
+don't have to reverse-engineer this from `docker-compose.yml` and the route
+handlers — and so any future change to the mount set is reviewed against this
+list rather than slipped in.
+
+### Mounts (from `docker-compose.yml`)
+
+| Host path                          | Container path                    | Mode | Purpose                                                                 |
+|------------------------------------|-----------------------------------|------|-------------------------------------------------------------------------|
+| `/var/run/docker.sock`             | `/var/run/docker.sock`            | rw   | Allows `docker` CLI inside the container to control the host daemon. Used by the `system` and `vm` modules. **Equivalent to root on the host.** |
+| `/opt/bytelyst/learning_ai_devops_tools/scripts` | `/vm-scripts`        | ro   | Bash scripts the `vm` module shells out to (`HostingerVM/*.sh`). Read-only mount; the container cannot modify the script set. |
+| `/var/log/vm-cleanup.log`          | `/host-logs/vm-cleanup.log`       | rw   | The `vm` cleanup script appends here; backend reads it via `/api/vm/cleanup-log`. |
+| `/var/log/vm-health-check.log`     | `/host-logs/vm-health-check.log`  | rw   | Health-check probe output; backend reads it via `/api/vm/health`.       |
+| `/var/log/docker-watchdog.log`     | `/host-logs/docker-watchdog.log`  | rw   | Watchdog tail used by the VM panel.                                     |
+| `extra_hosts: host-gateway`        | `host.docker.internal`-equivalent | —    | Lets the container reach `host:11434` (Ollama) and other host services. Not a filesystem mount, but a privilege-relevant capability — on Linux the container can talk to any host service listening on the Docker bridge gateway address or on `0.0.0.0` (services bound only to `127.0.0.1` are not reachable through `host-gateway`). |
+
+The container's listening port (`4004`) is bound to `127.0.0.1` only, so the
+API is **not** exposed to the public internet by this compose file — access is
+expected via Tailscale or an SSH tunnel. Any reverse proxy in front of it
+(Traefik in production) is responsible for its own auth + TLS.
+
+### What shells out + which routes (auth column = effective gate)
+
+| Route                                            | Handler module                | What it executes                                                                  | Auth        |
+|--------------------------------------------------|-------------------------------|-----------------------------------------------------------------------------------|-------------|
+| `GET /system/metrics`                            | `system/repository.ts`        | `df -h ...`                                                                       | `requireAdmin` |
+| `GET /docker/stats`                              | `system/repository.ts`        | `docker images / ps / volume ls / system df` (read-only)                          | `requireAdmin` |
+| `POST /docker/cleanup`                           | `system/repository.ts`        | `docker container prune -f`, `docker image prune -a -f`, `docker volume prune -f`, `docker builder prune -f` (a fixed allow-list — request body picks one of the four "types") | `requireAdmin` |
+| `GET /vm/health`                                 | `vm/repository.ts`            | `bash $VM_SCRIPTS_PATH/vm-health-check.sh --json`                                 | `requireAdmin` |
+| `GET /vm/cleanup-log`                            | `vm/repository.ts`            | reads `/host-logs/vm-cleanup.log`                                                 | `requireAdmin` |
+| `GET /vm/cron-status`                            | `vm/repository.ts`            | `crontab -l`                                                                      | `requireAdmin` |
+| `POST /vm/cleanup`                               | `vm/repository.ts`            | `bash $VM_SCRIPTS_PATH/vm-cleanup.sh`                                             | `requireAdmin` |
+| `GET /vm/containers`, `.../unhealthy`, `.../:name/logs` | `vm/repository.ts`     | `docker ps`, `docker inspect`, `docker stats`, `docker logs`                      | `requireAdmin` |
+| `POST /vm/containers/:name/restart`              | `vm/repository.ts`            | `docker restart "<name>"` (name is a path param — see "Known sharp edges" below)  | `requireAdmin` |
+| `GET /vm/ollama/models`, `DELETE /vm/ollama/models/:name` | `vm/repository.ts`    | HTTP-only (talks to host Ollama via `host-gateway`). No shell-out.                | `requireAdmin` |
+| `POST /code-quality/check`                       | `code-quality/repository.ts`  | `npm run typecheck`, `npm run lint`, `npm run build`, `npm run test:run` in the request-supplied `projectPath`. | `requireAdmin` *(added concurrently with this doc; previously unauthenticated — see the Phase 5 P1 commit)* |
+| `POST /deployments/trigger/:serviceId`           | `deployments/orchestrator.ts` | `bash <service.scriptPath>` from the registered service registry (paths are stored at create-time, not request-time). | `requireAdmin` |
+| `/hermes/ops` (snapshot)                         | `hermes-ops/repository.ts`    | Read-only probes: `systemctl is-active/is-enabled`, `git status`, `du -sh`, `ps`, `tailscale ip`, `runuser -u uma -- systemctl --user ...`. No state-changing commands. | `requireAdmin` *(Phase 7 — private-only)* |
+| `/hermes/telemetry/:instance`                    | `hermes-telemetry/repository.ts` | Read-only: `runuser -u <user> -- hermes sessions/cron/memory/skills list --json`, `git -C <backup-repo> log`, tail of the watchdog log. No state-changing commands. | `requireAdmin` |
+
+### Blast radius if an admin token is leaked
+
+Anyone holding a valid admin JWT for this product can, today:
+
+- Run any of the four pre-defined `docker prune` commands (data loss for
+  containers/images/volumes), restart any container, read any container's logs.
+- Trigger the host VM cleanup script and crontab listing.
+- Trigger any deployment script registered in the service registry.
+- Run `npm run` lifecycle scripts in any directory the container can read
+  (since `code-quality/check` takes a caller-supplied `projectPath`).
+- Read the three host logs that are mounted in.
+
+In other words, an admin token is **equivalent to a host shell**, modulo the
+specific commands the codebase chooses to wrap. Only call sites migrated to the
+`lib/shell.ts` allow-list wrapper (see the mitigation roadmap below) are
+constrained; the rest still build `docker ...` shell strings via `execAsync`.
+
+### Known sharp edges (track and shrink)
+
+1. **Container name is interpolated into a shell string.** `docker restart
+   "${name}"` and similar paths in `vm/repository.ts` use `execAsync` with a
+   template literal. The `:name` path parameter is admin-only but is not
+   validated against a `^[a-zA-Z0-9._-]+$` allow-list. Lock this down before
+   exposing the dashboard to a wider admin pool.
+2. **`projectPath` for `/code-quality/check` is unvalidated.** The handler
+   passes the caller-supplied path straight into `execAsync({ cwd })`. Even
+   with `requireAdmin` added, this should be constrained to a known set of
+   project roots (or rejected if it escapes the workspace).
+3. **No per-route audit-log on shell-outs.** `audit/repository.ts` records
+   deployment triggers but not `/docker/cleanup` or `/vm/cleanup`. A leaked
+   token's actions are reconstructable only from container stdout + host logs.
+4. **The container runs as root.** Both the backend `Dockerfile` and the
+   bind-mounts assume root. A non-root user with `docker` group membership would
+   shrink the in-container blast radius without losing functionality (the
+   socket is still root on the host); revisit when ready.
+5. **`fastify-rate-limit` is global, not per-route.** A leaked admin token
+   currently isn't slowed down on the destructive endpoints any more than it
+   is on read-only ones.
+
+### Mitigation roadmap (incremental, not all at once)
+
+- [x] **P1:** Allow-list wrapper around shell-outs. *(`lib/shell.ts` ships with
+      `execAllowed` (no shell, just `execFile` with an explicit argv) plus
+      per-command helpers — `dockerRestart(name)` validates against
+      `[a-zA-Z0-9][a-zA-Z0-9._-]{0,127}`, `dockerPrune(kind, {all?})` validates
+      kind ∈ {container,image,volume,builder} and rejects `--all` on non-image,
+      `runBashScript(path, args, {allowedRoots})` and `runNpmScript(script,
+      {cwd, allowedRoots})` lock both the script path and cwd to a configured
+      set of roots. 17 unit tests cover the rejection paths; `vm/restartContainer`
+      and `system/dockerCleanup` migrated. Module covered by the test:coverage
+      gate (≥95% lines).)*
+- [x] **P1:** Validate `/code-quality/check`'s `projectPath` against a
+      configured set of allowed roots. *(`runCodeQualityCheck` now calls
+      `assertPathInAllowedRoots(projectPath, getAllowedRoots())` before any
+      lifecycle script runs; `getAllowedRoots()` reads
+      `CODE_QUALITY_ALLOWED_ROOTS` (colon-separated) with a default of
+      `/opt/bytelyst`. The path is also re-resolved (normalised, `..`
+      collapsed) before being passed to `runNpmScript`, which lifts it to its
+      own argv slot — no shell interpolation.)*
+- [x] **P2:** Audit-log every shell-out (command + arg vector + actor + result).
+      *(Audit schema extended with `action: 'shell-exec'` + `entityType: 'host'`.
+      `POST /docker/cleanup`, `POST /vm/cleanup`, `POST /vm/containers/:name/restart`
+      now write a Cosmos audit row including the actor (`authUserId`/`authRole`),
+      entity id (`docker-cleanup:<type>` etc.), and a sanitized details payload.
+      Audit writes are best-effort — a Cosmos hiccup logs a warn but never
+      fails the request.)*
+- [x] **P2:** Run the backend container as a non-root user with `docker` group
+      membership; rebuild the Dockerfile accordingly. *(Dockerfile scaffolds
+      a non-root `app` user (uid 1001) with `docker` group membership at a
+      build-arg-configurable GID. Default `BACKEND_USER=root` preserves the
+      current behaviour so existing deployments don't break; set
+      `BACKEND_USER=app` and `DOCKER_GID=$(getent group docker | cut -d: -f3)`
+      to flip it on. Requires host-side prep on the bind-mounted log files —
+      see "Running non-root" below for the exact `chmod`/`chgrp` recipe.)*
+- [ ] **P3:** Move from `docker.sock` to a thin daemon (`docker-proxy`-style)
+      that exposes only the verbs the dashboard actually needs (`stats`,
+      `restart`, `logs`, the four `prune` variants).
+
+### Running non-root
+
+Concrete recipe to flip the backend off root:
+
+```bash
+# 1. Find the host's docker group GID
+DOCKER_GID=$(getent group docker | cut -d: -f3)
+
+# 2. Make the bind-mounted log files group-owned by docker and group-writable
+#    so the in-container `app` user (gid=$DOCKER_GID) can read/write them.
+sudo chgrp docker /var/log/vm-cleanup.log /var/log/vm-health-check.log /var/log/docker-watchdog.log
+sudo chmod g+rw /var/log/vm-cleanup.log /var/log/vm-health-check.log /var/log/docker-watchdog.log
+
+# 3. Make the VM scripts mount world-readable (it's read-only inside
+#    the container, so mode 0755 on the directories is enough).
+sudo chmod -R o+rX /opt/bytelyst/learning_ai_devops_tools/scripts
+
+# 4. Rebuild the backend image with BACKEND_USER=app and the host's GID.
+cd /opt/bytelyst/learning_ai_devops_tools/dashboard
+docker compose build --build-arg BACKEND_USER=app --build-arg DOCKER_GID=$DOCKER_GID backend
+
+# 5. Restart and verify
+docker compose up -d backend
+docker exec devops-backend whoami   # → app
+docker exec devops-backend id       # uid=1001(app) gid=$DOCKER_GID(docker)
+curl -fsS http://localhost:4004/health
+```
+
+If the backend can't reach the docker socket after the flip, double-check
+that the `docker` group GID reported by the in-container `id` matches the GID
+from `getent group docker` on the host. The `docker.sock` bind-mount carries
+its host ownership into the container, so the in-container GID must match.
+
+Operators deciding whether to grant admin access to a new user should read this
+whole "Privilege Surface" section first. Adding a new shell-out path in code is a
+**privilege change** and must update the routes table above in the same commit.
+
+## Production Checklist
+
+- [ ] Platform stack running with Traefik.
+- [ ] DNS records configured.
+- [ ] SSL/TLS certificates configured in Traefik.
+- [ ] Environment variables set for production.
+- [ ] Cosmos DB connection configured.
+- [ ] `JWT_SECRET` matches across all services.
+- [ ] User memberships configured for access.
+- [ ] Health checks passing.
+- [ ] Cross-navigation links working.
+- [ ] Monitoring and logging configured.
+
 ## Features Implemented

-The dashboard includes these production-ready features:
-
-### Backend (Port 4004)
+### Backend (port 4004)
 - ✅ CI/CD pipeline with Gitea Actions
- ✅ E2E tests with Playwright
+- ✅ E2E tests with Playwright (gated; see `.gitea/workflows/ci.yml`)
 - ✅ Telemetry integration
 - ✅ Error boundary
 - ✅ CSRF protection with token refresh
 - ✅ Service CRUD operations
- ✅ Real-time log streaming (SSE)
+- ✅ Deployment log retrieval (JSON polling — no SSE; see backend README)
 - ✅ Audit logging
 - ✅ Structured logging
 - ✅ Database migrations
@ -115,58 +537,13 @@ The dashboard includes these production-ready features:
 - ✅ Docker cleanup endpoints
 - ✅ OpenAPI/Swagger documentation at `/docs`

-### Frontend (Port 3000)
+### Frontend (container :3000, host :3049 under Compose)
 - ✅ Service management UI
 - ✅ Deployment monitoring
 - ✅ Health dashboard
 - ✅ Metrics/charts page
 - ✅ System management page
- ✅ Real-time log viewer
+- ✅ Log viewer (poll-based)
 - ✅ Accessibility features (ARIA, keyboard nav)
 - ✅ PWA manifest
 - ✅ Responsive design
-
-## Services Configured
-
-The dashboard can deploy:
-1. **Investment Trading** (`learning_ai_invt_trdg`)
-2. **Agentic Notes** (`learning_ai_notes`)
-3. **AI Clock** (`learning_ai_clock`)
-4. **Platform Services** (`learning_ai_common_plat`) - can be added
-
-## Next Steps for Production Deployment
-
-1. **Resolve Workspace Dependencies**: Ensure common platform packages are accessible
-2. **Configure Environment Variables**: Set production values for Cosmos, JWT, etc.
-3. **Set Up Infrastructure**: Azure Cosmos DB, platform service instance
-4. **Configure CI/CD**: Update Gitea Actions with production registry
-5. **Test Deployments**: Verify all deployment scripts work in production
-6. **Set Up Monitoring**: Configure logging, metrics, and alerting
-
-## Access
-
- **Dashboard**: http://localhost:3000 (or production URL)
- **API**: http://localhost:4004 (or production URL)
- **API Docs**: http://localhost:4004/docs
- **System Management**: Navigate to System page in dashboard
-
-## Troubleshooting
-
-**Workspace dependency errors:**
-```bash
-# Use the install scripts provided
-pnpm install:common-plat  # For local development
-pnpm install:gitea       # For Gitea environment
-```
-
-**Docker build failures:**
- Ensure Dockerfiles reference correct lock files
- Check that all dependencies are in registry
- Verify context paths in docker-compose.yml
-
-**Port conflicts:**
- Backend uses port 4004
- Web uses port 3000
- Ensure these ports are available
-
-The dashboard is feature-complete and ready for production deployment once the dependency infrastructure is resolved.
--- a/dashboard/DEPLOYMENT_GUIDE.md
+++ b/dashboard/DEPLOYMENT_GUIDE.md
@ -1,339 +1,5 @@
 # DevOps & Admin Dashboard Deployment Guide

-## Overview
-
-This guide covers deploying both the DevOps Dashboard and Platform Admin Dashboard using the existing Traefik gateway infrastructure, following the same pattern as the trading dashboard (https://invttrdg.bytelyst.com).
-
-## URLs
-
- **DevOps Dashboard**: `https://devops.bytelyst.com`
- **Admin Dashboard**: `https://admin.bytelyst.com`
- **API Gateway**: `https://api.bytelyst.com`
-  - Platform API: `https://api.bytelyst.com/platform/api`
-  - DevOps API: `https://api.bytelyst.com/api/devops`
-
-## Architecture
-
-Both dashboards follow the same pattern as the trading dashboard:
-
-```
-Internet → Traefik Gateway → Services
-                              ├─ DevOps Web (port 3049)
-                              ├─ DevOps Backend (port 4004)
-                              ├─ Admin Web (port 3001)
-                              ├─ Platform Service (port 4003)
-                              └─ Trading Dashboard (port 3085)
-```
-
- **Traefik**: Acts as API gateway and reverse proxy
- **Docker Network**: All services connect via `learning_ai_common_plat_default`
- **Domain Routing**: Traefik routes based on host headers
- **SSL/TLS**: Managed by Traefik with Let's Encrypt
-
-## Prerequisites
-
-1. Platform stack running with Traefik gateway
-2. Docker and Docker Compose installed
-3. Domain names configured with DNS pointing to your server
-4. Azure Cosmos DB account (shared with platform-service)
-5. Platform Service running and accessible
-
-## Quick Start
-
-### 1. Start Platform Stack (if not running)
-
-```bash
-cd /opt/bytelyst/learning_ai_common_plat
-docker-compose up -d
-```
-
-### 2. Deploy Dashboards
-
-```bash
-cd /opt/bytelyst/bytelyst-devops-tools/dashboard
-./deploy.sh
-```
-
-This will:
- Deploy DevOps Dashboard (backend + web)
- Deploy Admin Dashboard via platform stack
- Run health checks
- Show deployment information
-
-## Manual Deployment
-
-### Deploy DevOps Dashboard
-
-```bash
-cd /opt/bytelyst/bytelyst-devops-tools/dashboard
-docker-compose up -d --build
-```
-
-### Deploy Admin Dashboard
-
-```bash
-cd /opt/bytelyst/learning_ai_common_plat
-docker-compose up -d admin-web
-```
-
-## Environment Configuration
-
-### DevOps Dashboard (.env)
-
-```bash
-# Backend
-PORT=4004
-PLATFORM_SERVICE_URL=http://platform-service:4003
-COSMOS_ENDPOINT=https://your-cosmos-account.documents.azure.com:443/
-COSMOS_KEY=your-cosmos-primary-key
-COSMOS_DATABASE=bytelyst-platform
-JWT_SECRET=your-production-jwt-secret
-CSRF_SECRET=your-production-csrf-secret
-ENCRYPTION_KEY=your-production-encryption-key
-PRODUCT_ID=bytelyst-devops
-PRODUCT_NAME=ByteLyst DevOps Dashboard
-
-# Azure Key Vault (optional)
-AZURE_TENANT_ID=your-tenant-id
-AZURE_CLIENT_ID=your-client-id
-AZURE_CLIENT_SECRET=your-client-secret
-AZURE_KEY_VAULT_URL=https://your-keyvault.vault.azure.net/
-
-# Frontend
-NEXT_PUBLIC_DEVOPS_API_URL=https://api.bytelyst.com/devops
-NEXT_PUBLIC_PLATFORM_URL=https://api.bytelyst.com/platform/api
-NEXT_PUBLIC_ADMIN_WEB_URL=https://admin.bytelyst.com
-NEXT_PUBLIC_PRODUCT_ID=bytelyst-devops
-NEXT_PUBLIC_PRODUCT_NAME=ByteLyst DevOps Dashboard
-```
-
-### Platform Dashboard (.env)
-
-Add to your platform `.env`:
-
-```bash
-# Admin Web Dashboard
-NEXT_PUBLIC_PLATFORM_URL=https://api.bytelyst.com/platform/api
-NEXT_PUBLIC_DEVOPS_WEB_URL=https://devops.bytelyst.com
-```
-
-## Traefik Configuration
-
-Both dashboards use Traefik labels for routing:
-
-### DevOps Web
-```yaml
-labels:
-  - 'traefik.enable=true'
-  - 'traefik.http.routers.devops-web.rule=Host(`devops.bytelyst.com`)'
-  - 'traefik.http.services.devops-web.loadbalancer.server.port=3000'
-```
-
-### DevOps Backend API
-```yaml
-labels:
-  - 'traefik.enable=true'
-  - 'traefik.http.routers.devops-api.rule=PathPrefix(`/api/devops`)'
-  - 'traefik.http.services.devops-api.loadbalancer.server.port=4004'
-```
-
-### Admin Web
-```yaml
-labels:
-  - 'traefik.enable=true'
-  - 'traefik.http.routers.admin-web.rule=Host(`admin.bytelyst.com`)'
-  - 'traefik.http.services.admin-web.loadbalancer.server.port=3001'
-```
-
-## DNS Configuration
-
-Add DNS records pointing to your Traefik gateway server:
-
-```
-devops.bytelyst.com      A  <your-server-ip>
-admin.bytelyst.com       A  <your-server-ip>
-api.bytelyst.com         A  <your-server-ip>
-```
-
-## SSL/TLS Configuration
-
-Traefik can automatically handle SSL certificates with Let's Encrypt. Add to your Traefik configuration:
-
-```yaml
-command:
-  - '--certificatesresolvers.myresolver.acme.tlschallenge=true'
-  - '--certificatesresolvers.myresolver.acme.email=admin@bytelyst.com'
-  - '--certificatesresolvers.myresolver.acme.storage=/letsencrypt/acme.json'
-```
-
-Then update router labels:
-
-```yaml
-labels:
-  - 'traefik.http.routers.devops-web.tls=true'
-  - 'traefik.http.routers.devops-web.tls.certresolver=myresolver'
-```
-
-## Cross-Navigation Features
-
-Both dashboards include cross-navigation links:
-
-### DevOps Dashboard → Admin Dashboard
- Header includes "Platform Admin" link with Shield icon
- Opens admin dashboard in new tab
- Uses configured `NEXT_PUBLIC_ADMIN_WEB_URL`
-
-### Admin Dashboard → DevOps Dashboard
- Sidebar includes "DevOps Dashboard" link with Server icon
- Opens devops dashboard in new tab
- Uses configured `NEXT_PUBLIC_DEVOPS_WEB_URL`
-
-## Shared Authentication
-
-Both dashboards use the same authentication system:
-
-1. **Platform Service Auth**: Both authenticate against platform-service
-2. **JWT Tokens**: Same JWT secret validates tokens across services
-3. **Per-Product Access**: Admin access is checked per-product via membership roles
-4. **Single Sign-On**: Users stay logged in across both dashboards
-
-### Granting Access
-
-To grant a user access to both dashboards:
-
-1. Ensure user exists in platform-service
-2. Add admin membership for both products:
-
-```json
-{
-  "memberships": [
-    {
-      "productId": "bytelyst-devops",
-      "role": "admin",
-      "plan": "pro"
-    },
-    {
-      "productId": "bytelyst-platform",
-      "role": "admin",
-      "plan": "pro"
-    }
-  ]
-}
-```
-
-## Health Checks
-
- DevOps Backend: `http://localhost:4004/health`
- DevOps Web: `http://localhost:3049`
- Admin Web: `http://localhost:3001`
- Traefik Dashboard: `http://localhost:8080`
-
-## Troubleshooting
-
-### Network Issues
-```bash
-# Check if platform network exists
-docker network inspect learning_ai_common_plat_default
-
-# Check container connectivity
-docker network inspect learning_ai_common_plat_default | grep devops
-```
-
-### Traefik Routing
-```bash
-# Check Traefik dashboard
-http://localhost:8080
-
-# Check Traefik logs
-docker logs $(docker ps -q -f name=gateway)
-
-# Check router configuration
-docker inspect devops-web | grep -A 10 Labels
-```
-
-### Authentication Failures
- Verify JWT_SECRET matches across all services
- Check platform-service is accessible: `curl http://localhost:4003/health`
- Ensure user has proper product memberships
-
-### Service Not Starting
-```bash
-# Check service logs
-docker logs devops-backend
-docker logs devops-web
-docker logs admin-web
-
-# Check health status
-docker ps
-docker inspect devops-backend | grep -A 5 Health
-```
-
-## Monitoring
-
-Both dashboards include:
- Performance monitoring hooks
- Audit logging
- Health check endpoints
- Error tracking
-
-Monitor these through:
- Traefik Dashboard: `http://localhost:8080`
- Grafana (if configured): `http://localhost:3000`
- Loki logs (if configured): `http://localhost:3100`
-
-## Comparison with Trading Dashboard
-
-| Feature | Trading | DevOps | Admin |
-|---------|---------|--------|-------|
-| Domain | invttrdg.bytelyst.com | devops.bytelyst.com | admin.bytelyst.com |
-| Web Port | 3085 | 3049 | 3001 |
-| Backend Port | 4018 | 4004 | N/A |
-| Network | platform_net | platform_net | default |
-| Traefik | Yes | Yes | Yes |
-| Auth | Platform | Platform | Platform |
-
-## Service Management
-
-### Stop Services
-```bash
-cd /opt/bytelyst/bytelyst-devops-tools/dashboard
-docker-compose down
-
-cd /opt/bytelyst/learning_ai_common_plat
-docker-compose stop admin-web
-```
-
-### Restart Services
-```bash
-cd /opt/bytelyst/bytelyst-devops-tools/dashboard
-docker-compose restart
-
-cd /opt/bytelyst/learning_ai_common_plat
-docker-compose restart admin-web
-```
-
-### View Logs
-```bash
-# DevOps
-docker logs -f devops-backend
-docker logs -f devops-web
-
-# Admin
-docker logs -f admin-web
-
-# Traefik
-docker logs -f gateway
-```
-
-## Production Checklist
-
- [ ] Platform stack running with Traefik
- [ ] DNS records configured
- [ ] SSL/TLS certificates configured in Traefik
- [ ] Environment variables set for production
- [ ] Cosmos DB connection configured
- [ ] JWT_SECRET matches across all services
- [ ] User memberships configured for access
- [ ] Health checks passing
- [ ] Cross-navigation links working
- [ ] Monitoring and logging configured
+This file is a redirect stub kept so existing references (e.g. in `deploy.sh`)
+keep working. The canonical deployment guide is now [`DEPLOYMENT.md`](./DEPLOYMENT.md).
+Open that file for the current content; do not edit this stub.
--- a/dashboard/ENDPOINTS.md
+++ b/dashboard/ENDPOINTS.md
@ -4,6 +4,14 @@ Canonical URL reference for the ByteLyst DevOps dashboard workspace.

 Use this document when you need the dashboard website URL, browser routes, backend API endpoints, health checks, or the related integration URLs referenced by the dashboard.

+> **Local port note:** every `http://localhost:3000` URL in this file refers to
+> the **`pnpm dev`** workflow, where Next listens directly on 3000. Under the
+> Docker Compose deployment in this repo, the same web container is exposed on
+> the host as **`http://localhost:3049`** (compose maps `127.0.0.1:3049:3000`).
+> Substitute `:3049` for `:3000` whenever you're hitting the dockerized stack.
+> Production traffic goes through Traefik on `https://devops.bytelyst.com` and
+> doesn't expose either port. See `DEPLOYMENT.md` for the full port table.
+
 ## Canonical Bases

 | Surface | Local | Production | Notes |
--- a/dashboard/README.md
+++ b/dashboard/README.md
@ -23,7 +23,7 @@ dashboard/
 - **Service Registry**: Manage all ByteLyst services (trading, notes, clock, etc.)
 - **Deployment Orchestration**: Trigger deployments via existing bash scripts
 - **Health Monitoring**: Real-time health checks for all services with caching
- **Deployment History**: Audit trail of all deployments with log streaming
+- **Deployment History**: Audit trail of all deployments with captured logs (JSON-polled by the web client; no SSE)
 - **Cross-Navigation**: One-click link to Platform Admin dashboard
 - **Hermes Mission Control**: Read-only mock dashboard for portfolio-wide execution, task ledger, product health, history, agents, and settings
 - **Testing**: Vitest for backend, React Testing Library for frontend
@ -50,11 +50,9 @@ dashboard/
 - Validated path parameters, query parameters, and request bodies
 - Strict validation on update operations to prevent accidental field changes

-### Deployment Log Streaming
- Added SSE endpoint for real-time log streaming (`GET /api/deployments/:id/logs`)
- Frontend EventSource integration with cleanup function
- Automatic polling for running deployments (1-second interval)
- Proper connection cleanup on client disconnect
+### Deployment Logs
+- Endpoint `GET /api/deployments/:id/logs` returns the full captured stdout/stderr + current status as a single JSON payload (admin only).
+- The web client polls this endpoint while a deployment is `running`. There is intentionally no SSE/WebSocket stream — the previous attempt with `fastify-sse-v2` was incompatible with Fastify 5 and was removed. If a real-time stream is needed later, implement it explicitly via `reply.raw` and update this section in the same change.

 ### Security Enhancements
 - Added rate limiting: 100 requests per minute per IP
@ -104,7 +102,7 @@ pnpm dev              # Runs on port 4004
 ```bash
 cd web
 cp .env.local.example .env.local  # Add your URLs
-pnpm dev              # Runs on port 3000
+pnpm dev              # Next dev server on http://localhost:3000 (no Docker)
 ```

 ### Running Both
@ -163,7 +161,7 @@ Production deployments use `https://api.bytelyst.com/devops` for `NEXT_PUBLIC_DE
 - `GET /api/deployments` - Recent deployments (with `?limit=` query param)
 - `GET /api/deployments/service/:serviceId` - Deployments for specific service
 - `GET /api/deployments/:id` - Single deployment
- `GET /api/deployments/:id/logs` - Stream deployment logs via SSE
+- `GET /api/deployments/:id/logs` - Get captured deployment logs as JSON (web client polls this; no SSE)
 - `POST /api/deployments/trigger/:serviceId` - Trigger deployment (admin only)

 ### Health
@ -199,8 +197,8 @@ See [DEPLOYMENT.md](./DEPLOYMENT.md) for detailed deployment instructions.

 Deploy as a ByteLyst product:
 - Product ID: `devops-internal`
- Backend port: 4004
- Web port: 3000
+- Backend port: 4004 (host) / 4004 (container)
+- Web port: 3000 (container) — exposed on host as **`localhost:3049`** under Docker Compose; dev mode (`pnpm dev`) listens directly on `localhost:3000`. See [`DEPLOYMENT.md`](./DEPLOYMENT.md) for the full port table.
 - Use existing deployment scripts in parent directory
 - Public API base: `https://api.bytelyst.com/devops`

--- a/dashboard/REVIEW_ACTIONS.md
+++ b/dashboard/REVIEW_ACTIONS.md
@ -0,0 +1,91 @@
+# Dashboard Repo Review — Top Actions
+
+Reviewed: 2026-05-27. Scope: `/opt/bytelyst/learning_ai_devops_tools/dashboard` (the ByteLyst DevOps Dashboard pnpm workspace: `backend/` Fastify 5 + `web/` Next.js 16).
+
+Baseline state (verified during review):
+- `pnpm typecheck` — passes for both backend and web.
+- `pnpm test:run` — passes (backend 9 tests / 1 file, web 11 tests / 2 files).
+- `pnpm secret-scan` — clean.
+- `.env` is gitignored; only `.env.example` files are tracked.
+
+The dashboard is functional and well-structured, but several issues block CI, hide regressions, and create operational risk. Actions are ordered by priority.
+
+---
+
+## P0 — Broken / Urgent
+
+### 1. CI workflow points at a non-existent path
+`.gitea/workflows/ci.yml` runs everything from `/opt/bytelyst/bytelyst-devops-tools/dashboard`, but the actual checkout lives at `/opt/bytelyst/learning_ai_devops_tools/dashboard`. The same wrong path is hard-coded in `DEPLOYMENT.md` and `scripts/deploy-hotcopy.sh`.
+
+- Action: replace the hard-coded path with `${{ gitea.workspace }}` (or a single `WORKDIR` env var) in <ref_file file="/opt/bytelyst/learning_ai_devops_tools/dashboard/.gitea/workflows/ci.yml" />, then fix the two other references in <ref_file file="/opt/bytelyst/learning_ai_devops_tools/dashboard/DEPLOYMENT.md" /> and <ref_file file="/opt/bytelyst/learning_ai_devops_tools/dashboard/scripts/deploy-hotcopy.sh" />.
+- Verify: trigger a CI run on a throwaway branch and confirm green.
+
+### 2. "Lint" steps are no-ops
+Both `backend/package.json` and `web/package.json` define `lint` as `echo 'No linting configured...'`. The CI step "Lint" therefore always passes regardless of code quality. There is no ESLint, Biome, or equivalent configured anywhere in the workspace.
+
+- Action: pick one tool (recommend ESLint + `@typescript-eslint` for backend, Next.js's built-in ESLint config for web, since `next` already ships it). Wire `next lint` into `web/package.json` and add a minimal flat-config `eslint.config.js` to backend (ESLint 9 no longer reads `.eslintrc` by default).
+- Verify: `pnpm lint` returns a non-zero exit on a deliberately bad change.
+
+---
+
+## P1 — Important Gaps
+
+### 3. Test coverage is extremely thin
+Backend has 12 modules (`services`, `deployments`, `health`, `audit`, `backup`, `system`, `env`, `azure-config`, `code-quality`, `cosmos-config`, `hermes-ops`, `vm`) but only `services` has a test file. The deployment orchestrator (`backend/src/modules/deployments/orchestrator.ts`), CSRF (`backend/src/lib/csrf.ts`), and auth (`backend/src/lib/auth.ts`) — the highest-risk surfaces — have no tests at all.
+
+- Action: add `*.test.ts` for at least `auth`, `csrf`, `deployments/orchestrator`, and `health` repository before adding more features. Mirror the style of <ref_file file="/opt/bytelyst/learning_ai_devops_tools/dashboard/backend/src/modules/services/services.test.ts" />.
+- Add `pnpm test:coverage` to CI and fail under a threshold (start at 50 %, raise over time).
+
+### 4. SSE deployment-log streaming is disabled — RESOLVED (removed)
+The TODO has been resolved by **removing the SSE claim**, not by shipping it: the `fastify-sse-v2` dependency is gone from `backend/package.json`, the commented-out import + plugin registration are gone from `backend/src/server.ts`, and the deployment-log endpoint is now documented as JSON-polled. The web client never used `EventSource` (`web/src/lib/api.ts` already polls `/api/deployments/:id/logs` via the normal `apiRequest` helper), so no UI change was required. README/DEPLOYMENT.md updated to match. If a real-time stream is wanted later, ship it explicitly via `reply.raw` and update the docs in the same change.
+
+### 5. Documentation drift — RESOLVED
+- `DEPLOYMENT.md` is now the single canonical deployment guide; `DEPLOYMENT_GUIDE.md` is reduced to a one-line redirect for backwards compat with `deploy.sh` and external links. `deploy.sh` updated to reference `DEPLOYMENT.md`.
+- `DEPLOYMENT.md` carries an explicit **Ports — quick reference** table that distinguishes container port (`:3000`), Compose host port (`:3049`), and the Traefik production URL — so the 3000-vs-3049 question has one truthful answer per deployment mode rather than three contradictory prose claims.
+- `README.md` "Web port: 3000" rewritten to call out container vs Compose host vs dev-mode explicitly and link to the port table.
+- `ENDPOINTS.md` got a top-of-file note: every `localhost:3000` URL in the file refers to `pnpm dev`; substitute `:3049` for the Dockerized stack. The `https://api.bytelyst.com/api/devops` vs `/devops` ambiguity was already resolved by the existing "URL Note" section (kept).
+
+### 6. Docker socket + host log mounts are very privileged — RESOLVED (documented; allow-list wrapper queued)
+`DEPLOYMENT.md` now has a **Privilege Surface** section enumerating every host mount, every shell-outing route + the exact commands it runs, the auth gate on each, and an explicit blast-radius statement ("an admin token is equivalent to a host shell, modulo what the codebase wraps"). Concurrent fix: the `/code-quality/check` endpoint was missing `requireAdmin` and was therefore reachable unauthenticated even though it shells out to `npm run typecheck/lint/build/test:run` in a caller-supplied `projectPath` — that's been gated to admin in the same commit. Two follow-up P1s remain in the doc's mitigation roadmap (allow-list wrapper around shell-outs; validate `code-quality/check` `projectPath` against an allowed root set); P2/P3 cover audit-logging shell-outs, dropping root in the container, and moving off the raw docker socket.
+
+---
+
+## P2 — Hygiene
+
+### 7. Backend module structure isn't enforced
+Most modules follow the `routes.ts / repository.ts / types.ts` triple, but a few have extras (`deployments/orchestrator.ts`). There is no architectural test, README, or generator. New contributors will diverge.
+
+- Action: add a short `backend/src/modules/README.md` describing the convention, and (optionally) an architectural test using `dependency-cruiser` or a custom vitest.
+
+### 8. README is unfocused
+`README.md` mixes "Recent Improvements" (a changelog), feature list, setup, env vars, and full API docs into one 219-line file. Running `cat` on the file also shows two blank lines directly after the title, which makes the content that follows easy to miss.
+
+- Action: trim README to: what / quickstart / pointers. Move "Recent Improvements" into `CHANGELOG.md` and keep API docs only in `ENDPOINTS.md` / Swagger.
+
+### 9. `.pnpmfile.cjs` dual-mode install is undocumented in CI
+`pnpm install:common-plat` vs `pnpm install:gitea` is only mentioned in the README. The CI workflow uses `install:common-plat`, which only works if the runner has the sibling `learning_ai_common_plat` checkout available. That assumption isn't asserted anywhere.
+
+- Action: add a pre-install check that fails fast with a clear message if the expected workspace path is missing, and document the runner prerequisites in the CI file.
+
+### 10. No production logging / metrics story
+`backend/src/server.ts` uses Fastify's default logger only. There is a `web/src/lib/telemetry.ts` file but nothing wires it to a backend. The dashboard advertises "monitoring" but doesn't emit its own structured telemetry.
+
+- Action: decide on a target (pino transports → stdout for container logs is enough for now) and write down the choice. If Prometheus / OpenTelemetry is in scope, file a tracked issue rather than leaving it implied.
+
+### 11. E2E tests aren't wired into local workflow
+`web/e2e/dashboard.spec.ts` and `web/e2e/hermes.spec.ts` exist and `pnpm test:e2e` is defined, but nothing documents how to start the backend+web before running them. CI's E2E step (visible in `.gitea/workflows/ci.yml`) was truncated in the portion of the file read during this review, so it still needs to be confirmed that it actually launches the stack.
+
+- Action: read the bottom half of `ci.yml` and confirm the E2E job sets up backend+web; if not, fix it. Add a `pnpm test:e2e` recipe to README that explicitly says "run `pnpm dev` first" or use Playwright's `webServer` config.
+
+---
+
+## Suggested execution order
+
+1. Fix the CI path (#1) — unblocks everything else.
+2. Reconcile the SSE TODO (#4) — done: the claim was removed rather than shipping the feature.
+3. Add real linting (#2) and tighten test coverage on auth/csrf/orchestrator (#3).
+4. Documentation pass: README trim (#8); the ports and deployment-docs reconciliation (#5) is already done.
+5. Privilege/operational hardening (#6, #10).
+6. Convention + DX polish (#7, #9, #11).
+
+Each item above is small enough to land as a single PR.
--- a/dashboard/backend/.env.example
+++ b/dashboard/backend/.env.example
@ -10,3 +10,8 @@ AZURE_TENANT_ID=your-azure-tenant-id
 AZURE_CLIENT_ID=your-azure-client-id
 AZURE_CLIENT_SECRET=your-azure-client-secret
 AZURE_KEY_VAULT_URL=https://your-key-vault.vault.azure.net/
+
+# Structured logging (pino → stdout). Override per environment as needed.
+# Levels: fatal | error | warn | info | debug | trace | silent
+# Default: debug in non-prod, info in prod (when NODE_ENV=production).
+LOG_LEVEL=info
--- a/dashboard/backend/Dockerfile
+++ b/dashboard/backend/Dockerfile
@ -1,14 +1,34 @@
 # Build context: bytelyst-devops-tools/dashboard/ (monorepo root)
+#
+# Uses pnpm (matches `packageManager` field in package.json) and the
+# workspace `pnpm-lock.yaml` at the dashboard root. The previously-used
+# `npm ci` against `backend/package-lock.json` was broken because the
+# npm lockfile had been regenerated inside the pnpm workspace and
+# contained pnpm-store symlinks (e.g. node_modules/typescript pointing
+# at ../node_modules/.pnpm/typescript@5.9.3/...), which npm treated as
+# `link: true` and skipped installing — leaving `tsc` missing.
+#
+# BYTELYST_PACKAGE_SOURCE=gitea disables the `.pnpmfile.cjs` filesystem
+# lookup of `learning_ai_common_plat` (which isn't in the build context).
+# Backend has no `@bytelyst/*` deps so the pnpmfile is a no-op for it,
+# but we set the env explicitly for clarity.
+
 # --- Stage 1: Build ---
 FROM node:20-alpine AS builder

-WORKDIR /app/backend
+ENV BYTELYST_PACKAGE_SOURCE=gitea
+RUN corepack enable && corepack prepare pnpm@10.6.5 --activate

-COPY backend/package.json backend/package-lock.json ./
-RUN npm ci --ignore-scripts
+WORKDIR /app

-COPY backend/tsconfig.json ./
-COPY backend/src/ ./src/
+# Workspace metadata (pnpm needs the root files to resolve the workspace).
+COPY package.json pnpm-lock.yaml pnpm-workspace.yaml .pnpmfile.cjs ./
+COPY backend/package.json ./backend/
+
+RUN pnpm install --frozen-lockfile --filter "@bytelyst/devops-backend..." --ignore-scripts
+
+COPY backend/tsconfig.json ./backend/
+COPY backend/src/ ./backend/src/

 # Build-time env vars (baked into the bundle)
 ARG BYTELYST_COMMIT_SHA=unknown
@ -27,22 +47,66 @@ ENV BYTELYST_COMMIT_SHA=${BYTELYST_COMMIT_SHA} \
    BYTELYST_COMMIT_MESSAGE=${BYTELYST_COMMIT_MESSAGE} \
    BYTELYST_DOCKER_IMAGE=${BYTELYST_DOCKER_IMAGE}

-RUN npm run build
+WORKDIR /app/backend
+RUN pnpm run build
+
+# Carve out a production-only deploy bundle (node_modules without devDeps).
+RUN pnpm --filter "@bytelyst/devops-backend" deploy --prod --legacy /deploy

 # --- Stage 2: Run ---
-FROM node:20-alpine AS runner
+# Use Debian slim (not Alpine) because vm-health-check.sh uses GNU df flags
+# (--output=pcent, --output=avail) that BusyBox df does not support.
+FROM node:20-slim AS runner

 WORKDIR /app/backend

-COPY backend/package.json backend/package-lock.json ./
-RUN npm ci --omit=dev --ignore-scripts
-RUN apk add --no-cache curl
+# Install tools needed by the VM management module:
+#   bash       — vm-health-check.sh and vm-cleanup.sh require bash
+#   docker.io  — docker CLI to communicate with the host daemon via socket
+#   python3    — used in inline python3 -c snippets inside the scripts
+RUN apt-get update && apt-get install -y --no-install-recommends \
+      curl bash docker.io python3 \
+    && rm -rf /var/lib/apt/lists/*

-COPY --from=builder /app/backend/dist ./dist
+# Non-root user setup (Phase 5 P2 mitigation roadmap, item #4).
+# The backend doesn't strictly need root — its only privileged action is
+# talking to the docker daemon, which group membership covers. We create
+# the user + a docker group at a build-arg-configurable GID so the GID
+# can match the host's docker group (`getent group docker` on the host).
+#
+# Default `BACKEND_USER=root` keeps the current behaviour so existing
+# deployments don't break. Set `BACKEND_USER=app` to run non-root; this
+# requires the bind-mounted log files in `/var/log/vm-*.log` and
+# `/var/log/docker-watchdog.log` to be group-readable+writable by the
+# matching docker GID (or world-readable for read-only paths). See
+# `dashboard/DEPLOYMENT.md` Privilege Surface → "Running non-root".
+ARG BACKEND_USER=root
+ARG DOCKER_GID=999
+# `docker.io` in debian:bookworm-slim creates a `docker` group at a
+# distro-chosen GID (commonly 101). Reconcile it to ${DOCKER_GID} so the
+# in-container group matches the host's docker GID. If no `docker` group
+# exists yet, create one at ${DOCKER_GID}.
+RUN if getent group docker >/dev/null; then \
+        groupmod --gid "${DOCKER_GID}" docker; \
+    else \
+        groupadd --system --gid "${DOCKER_GID}" docker; \
+    fi \
+    && useradd --system --create-home --uid 1001 --gid "${DOCKER_GID}" --shell /sbin/nologin app \
+    && chown -R app:"${DOCKER_GID}" /app
+
+# Bring in the deploy bundle (package.json, prod node_modules) and compiled JS.
+COPY --from=builder --chown=app:${DOCKER_GID} /deploy/package.json ./package.json
+COPY --from=builder --chown=app:${DOCKER_GID} /deploy/node_modules ./node_modules
+COPY --from=builder --chown=app:${DOCKER_GID} /app/backend/dist ./dist

 ENV NODE_ENV=production
 ENV PORT=4004

 EXPOSE 4004

+# Switch to non-root only when explicitly opted in via build arg. If the
+# arg is `app`, the `USER` instruction below drops privileges for the
+# container process; if `root`, it is a no-op.
+USER ${BACKEND_USER}
+
 CMD ["node", "dist/server.js"]
--- a/dashboard/backend/eslint.config.js
+++ b/dashboard/backend/eslint.config.js
@ -0,0 +1,27 @@
+import js from '@eslint/js';
+import tseslint from 'typescript-eslint';
+import globals from 'globals';
+
+// Flat config (ESLint 9). Real linting — replaces the previous no-op `echo`.
+// Correctness rules from the recommended sets stay errors and fail CI;
+// stylistic/known-pattern rules are relaxed so the current tree is clean.
+export default tseslint.config(
+  // Global ignores: build output, coverage reports, and installed packages.
+  { ignores: ['dist/**', 'coverage/**', 'node_modules/**'] },
+  // Base rule sets. The override object below is listed last so that its
+  // rule settings take precedence for TypeScript files.
+  js.configs.recommended,
+  ...tseslint.configs.recommended,
+  {
+    files: ['**/*.{ts,mts,cts}'],
+    languageOptions: {
+      // Node.js globals (process, Buffer, ...) are defined for these files.
+      globals: { ...globals.node },
+    },
+    rules: {
+      // Fastify request/reply are cast to `any` at framework boundaries.
+      '@typescript-eslint/no-explicit-any': 'off',
+      // Surface dead code without failing the build on work-in-progress.
+      // Names starting with `_` are exempt; unused `catch` bindings are not reported.
+      '@typescript-eslint/no-unused-vars': [
+        'warn',
+        { argsIgnorePattern: '^_', varsIgnorePattern: '^_', caughtErrors: 'none' },
+      ],
+    },
+  },
+);
--- a/dashboard/backend/package-lock.json
+++ b/dashboard/backend/package-lock.json
--- a/dashboard/backend/package.json
+++ b/dashboard/backend/package.json
@ -13,7 +13,7 @@
    "test": "vitest",
    "test:run": "vitest run",
    "test:coverage": "vitest run --coverage",
-    "lint": "echo 'No linting configured for backend'",
+    "lint": "eslint src",
    "migrate": "tsx src/scripts/run-migrations.ts up",
    "migrate:rollback": "tsx src/scripts/run-migrations.ts down"
  },
@ -26,15 +26,20 @@
    "@fastify/swagger-ui": "^5.2.1",
    "dotenv": "^16.4.5",
    "fastify": "^5.2.1",
-    "fastify-sse-v2": "^4.2.2",
    "jose": "^6.1.2",
+    "pino": "^10.3.1",
+    "pino-pretty": "^13.1.3",
    "zod": "^3.24.1"
  },
  "devDependencies": {
+    "@eslint/js": "^9.18.0",
    "@types/node": "^25.0.3",
    "@vitest/coverage-v8": "3.2.4",
+    "eslint": "^9.18.0",
+    "globals": "^15.14.0",
    "tsx": "^4.21.0",
    "typescript": "^5.9.3",
+    "typescript-eslint": "^8.20.0",
    "vitest": "^3.1.2"
  }
 }
--- a/dashboard/backend/src/lib/auth.test.ts
+++ b/dashboard/backend/src/lib/auth.test.ts
@ -0,0 +1,135 @@
+import { describe, it, expect, beforeEach, vi } from 'vitest';
+import { SignJWT } from 'jose';
+import type { FastifyRequest } from 'fastify';
+
+// Mock config so the auth module sees a deterministic JWT secret + product id.
+// Mocks must be declared before importing the SUT. `vi` is imported explicitly
+// so this file does not rely on Vitest globals being enabled.
+vi.mock('./config.js', () => ({
+  config: { JWT_SECRET: 'test-jwt-secret-for-unit-tests' },
+  productId: 'devops-internal',
+}));
+
+const { extractAuth, requireAdmin, AuthError } = await import('./auth.js');
+
+const SECRET = new TextEncoder().encode('test-jwt-secret-for-unit-tests');
+
+// Signs a one-hour HS256 JWT with the test secret. The issuer defaults to
+// 'bytelyst-platform' and the subject to the payload's `sub` (or 'user-1').
+async function makeToken(payload: Record<string, unknown>, opts?: { issuer?: string }): Promise<string> {
+  const issuer = opts?.issuer ?? 'bytelyst-platform';
+  const subject = (payload.sub as string) ?? 'user-1';
+  const unsigned = new SignJWT(payload)
+    .setProtectedHeader({ alg: 'HS256' })
+    .setIssuer(issuer)
+    .setSubject(subject)
+    .setExpirationTime('1h');
+  return unsigned.sign(SECRET);
+}
+
+// Builds a stub request that carries only `headers`, cast to FastifyRequest.
+function reqWith(headers: Record<string, string>): FastifyRequest {
+  const stub: unknown = { headers };
+  return stub as FastifyRequest;
+}
+
+// extractAuth: header parsing, token verification against the mocked secret
+// and issuer, and role elevation via the global role or per-product membership.
+describe('extractAuth', () => {
+  // --- Rejection paths: every one of these must yield null. ---
+  it('returns null when Authorization header is missing', async () => {
+    expect(await extractAuth(reqWith({}))).toBeNull();
+  });
+
+  it('returns null when Authorization is not a Bearer token', async () => {
+    expect(await extractAuth(reqWith({ authorization: 'Basic abc' }))).toBeNull();
+  });
+
+  it('returns null when the token is malformed', async () => {
+    expect(await extractAuth(reqWith({ authorization: 'Bearer not-a-jwt' }))).toBeNull();
+  });
+
+  it('returns null when issuer does not match', async () => {
+    const token = await makeToken({ sub: 'u1', role: 'admin' }, { issuer: 'other-issuer' });
+    expect(await extractAuth(reqWith({ authorization: `Bearer ${token}` }))).toBeNull();
+  });
+
+  it('returns null when signature does not verify (wrong secret)', async () => {
+    // Same claims and issuer as a valid token, but signed with another key.
+    const wrong = await new SignJWT({ sub: 'u1', role: 'admin' })
+      .setProtectedHeader({ alg: 'HS256' })
+      .setIssuer('bytelyst-platform')
+      .setExpirationTime('1h')
+      .sign(new TextEncoder().encode('different-secret'));
+    expect(await extractAuth(reqWith({ authorization: `Bearer ${wrong}` }))).toBeNull();
+  });
+
+  // --- Role resolution for tokens that verify. ---
+  it('elevates to admin when global role is admin', async () => {
+    const token = await makeToken({ sub: 'u1', role: 'admin', email: 'a@b.com', productId: 'devops-internal' });
+    const result = await extractAuth(reqWith({ authorization: `Bearer ${token}` }));
+    expect(result).toEqual({
+      userId: 'u1',
+      role: 'admin',
+      email: 'a@b.com',
+      productId: 'devops-internal',
+    });
+  });
+
+  it('elevates to admin via per-product membership for the target productId', async () => {
+    // Global role is 'user'; the admin entry for 'devops-internal' (the mocked
+    // productId) is what is expected to grant admin.
+    const token = await makeToken({
+      sub: 'u2',
+      role: 'user',
+      products: [
+        { productId: 'other-product', role: 'admin', plan: 'pro' },
+        { productId: 'devops-internal', role: 'admin', plan: 'pro' },
+      ],
+    });
+    const result = await extractAuth(reqWith({ authorization: `Bearer ${token}` }));
+    expect(result?.role).toBe('admin');
+    expect(result?.userId).toBe('u2');
+  });
+
+  it('does not elevate when product membership is for a different product', async () => {
+    const token = await makeToken({
+      sub: 'u3',
+      role: 'user',
+      products: [{ productId: 'other-product', role: 'admin', plan: 'pro' }],
+    });
+    const result = await extractAuth(reqWith({ authorization: `Bearer ${token}` }));
+    expect(result?.role).toBe('user');
+  });
+
+  it('does not elevate when product membership role is not admin', async () => {
+    const token = await makeToken({
+      sub: 'u4',
+      role: 'user',
+      products: [{ productId: 'devops-internal', role: 'viewer', plan: 'free' }],
+    });
+    const result = await extractAuth(reqWith({ authorization: `Bearer ${token}` }));
+    expect(result?.role).toBe('user');
+  });
+
+  it('defaults role to "user" when payload has no role', async () => {
+    const token = await makeToken({ sub: 'u5' });
+    const result = await extractAuth(reqWith({ authorization: `Bearer ${token}` }));
+    expect(result?.role).toBe('user');
+  });
+});
+
+// requireAdmin: must throw AuthError unless the request carries an admin role.
+describe('requireAdmin', () => {
+  let req: { authUserId?: string; authRole?: string };
+
+  // Views the current stub as a FastifyRequest at the call site.
+  const asRequest = (): FastifyRequest => req as unknown as FastifyRequest;
+
+  beforeEach(() => {
+    req = {};
+  });
+
+  it('throws AuthError(403) when no auth context is present', () => {
+    expect(() => requireAdmin(asRequest())).toThrow(AuthError);
+    // Call again to inspect the status code carried by the thrown error.
+    try {
+      requireAdmin(asRequest());
+    } catch (thrown) {
+      const authError = thrown as InstanceType<typeof AuthError>;
+      expect(authError.statusCode).toBe(403);
+    }
+  });
+
+  it('throws AuthError(403) for non-admin role', () => {
+    req.authUserId = 'u1';
+    req.authRole = 'user';
+    const attempt = () => requireAdmin(asRequest());
+    expect(attempt).toThrow(AuthError);
+  });
+
+  it('returns userId when role is admin', () => {
+    req.authUserId = 'u1';
+    req.authRole = 'admin';
+    const granted = requireAdmin(asRequest());
+    expect(granted).toEqual({ userId: 'u1' });
+  });
+});
--- a/dashboard/backend/src/lib/config.ts
+++ b/dashboard/backend/src/lib/config.ts
@ -31,5 +31,13 @@ const envSchema = z.object({

 export const config = envSchema.parse(process.env);

+// Warn loudly when insecure default keys are in use
+if (config.CSRF_SECRET === 'default-csrf-secret-change-in-production') {
+  console.warn('[config] WARNING: CSRF_SECRET is using the insecure default — set CSRF_SECRET in .env before deploying to production');
+}
+if (config.ENCRYPTION_KEY === 'default-encryption-key-change-in-production') {
+  console.warn('[config] WARNING: ENCRYPTION_KEY is using the insecure default — set ENCRYPTION_KEY in .env before deploying to production');
+}
+
 export const productId = productIdentity.productId;
 export const productName = productIdentity.name;
--- a/dashboard/backend/src/lib/csrf.test.ts
+++ b/dashboard/backend/src/lib/csrf.test.ts
@ -0,0 +1,77 @@
+import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
+
+// Pin a deterministic CSRF secret. Mocks must be declared before importing the SUT.
+vi.mock('./config.js', () => ({
+  config: { CSRF_SECRET: 'csrf-test-secret' },
+  productId: 'devops-internal',
+}));
+
+const { generateCsrfToken, validateCsrfToken, getSessionId } = await import('./csrf.js');
+
+// Round-trip, tamper, and expiry checks for CSRF tokens. The clock is frozen
+// with fake timers so expiry can be tested by moving the system time.
+describe('generateCsrfToken / validateCsrfToken', () => {
+  beforeEach(() => {
+    vi.useFakeTimers();
+    vi.setSystemTime(new Date('2026-01-01T00:00:00Z'));
+  });
+
+  afterEach(() => {
+    vi.useRealTimers();
+  });
+
+  it('produces a base64-encoded token that round-trips through validate', () => {
+    const token = generateCsrfToken('session-1');
+    expect(token).toMatch(/^[A-Za-z0-9+/=]+$/);
+    expect(validateCsrfToken(token, 'session-1')).toBe(true);
+  });
+
+  it('rejects when the session id does not match', () => {
+    const token = generateCsrfToken('session-1');
+    expect(validateCsrfToken(token, 'session-2')).toBe(false);
+  });
+
+  it('rejects when the token has been tampered with (signature mismatch)', () => {
+    const token = generateCsrfToken('session-1');
+    // This test treats the decoded token as colon-separated fields, keeping
+    // the first two and replacing the rest.
+    const decoded = Buffer.from(token, 'base64').toString('utf-8');
+    const [sid, ts] = decoded.split(':');
+    // Replace the trailing hash with garbage of the same length.
+    const tampered = Buffer.from(`${sid}:${ts}:${'0'.repeat(64)}`).toString('base64');
+    expect(validateCsrfToken(tampered, 'session-1')).toBe(false);
+  });
+
+  it('rejects when the token is older than the 1h window', () => {
+    const token = generateCsrfToken('session-1');
+    // Advance just past the 3_600_000ms cutoff.
+    vi.setSystemTime(new Date(Date.now() + 3_600_001));
+    expect(validateCsrfToken(token, 'session-1')).toBe(false);
+  });
+
+  it('accepts when the token is just inside the 1h window', () => {
+    const token = generateCsrfToken('session-1');
+    // One second short of the one-hour cutoff.
+    vi.setSystemTime(new Date(Date.now() + 3_599_000));
+    expect(validateCsrfToken(token, 'session-1')).toBe(true);
+  });
+
+  it('rejects garbage input without throwing', () => {
+    expect(validateCsrfToken('not-base64!!!', 'session-1')).toBe(false);
+    expect(validateCsrfToken('', 'session-1')).toBe(false);
+  });
+
+  it('produces different tokens for different sessions at the same instant', () => {
+    // The clock is frozen, so only the session id differs between the two tokens.
+    const t1 = generateCsrfToken('session-a');
+    const t2 = generateCsrfToken('session-b');
+    expect(t1).not.toBe(t2);
+    expect(validateCsrfToken(t1, 'session-b')).toBe(false);
+    expect(validateCsrfToken(t2, 'session-a')).toBe(false);
+  });
+});
+
+// getSessionId: the session id is the request's authUserId, or null without one.
+describe('getSessionId', () => {
+  it('returns authUserId when present on the request', () => {
+    const sessionId = getSessionId({ authUserId: 'user-42' });
+    expect(sessionId).toBe('user-42');
+  });
+
+  it('returns null when authUserId is absent', () => {
+    const fromEmptyRequest = getSessionId({});
+    const fromHeadersOnly = getSessionId({ headers: {} });
+    expect(fromEmptyRequest).toBeNull();
+    expect(fromHeadersOnly).toBeNull();
+  });
+});
--- a/dashboard/backend/src/lib/dashboard-alerts.test.ts
+++ b/dashboard/backend/src/lib/dashboard-alerts.test.ts
@ -0,0 +1,44 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+const appendFileMock = vi.hoisted(() => vi.fn());
+vi.mock('fs/promises', () => ({ appendFile: appendFileMock }));
+
+const { appendDashboardWarning, clearDashboardWarningDedupe } = await import('./dashboard-alerts.js');
+
+// appendDashboardWarning with `fs/promises.appendFile` mocked: covers the
+// unconfigured no-op, the exact log-line format, and one-hour deduplication.
+describe('dashboard-alerts', () => {
+  // Each test starts with a clean mock, empty dedupe state, and no log path.
+  beforeEach(() => {
+    vi.clearAllMocks();
+    clearDashboardWarningDedupe();
+    delete process.env.HERMES_DASHBOARD_ALERT_LOG;
+  });
+
+  it('does nothing when the alert log is not configured', async () => {
+    const wrote = await appendDashboardWarning({ severity: 'warn', instance: 'vijay', message: 'gateway down' });
+    expect(wrote).toBe(false);
+    expect(appendFileMock).not.toHaveBeenCalled();
+  });
+
+  it('writes a routed warning line when configured', async () => {
+    process.env.HERMES_DASHBOARD_ALERT_LOG = '/tmp/hermes-dashboard-warnings.log';
+    // The second argument pins `now`, so the timestamp in the line is deterministic.
+    const wrote = await appendDashboardWarning(
+      { severity: 'critical', instance: 'bheem', message: 'backup missing' },
+      Date.parse('2026-05-31T07:00:00Z'),
+    );
+
+    expect(wrote).toBe(true);
+    expect(appendFileMock).toHaveBeenCalledWith(
+      '/tmp/hermes-dashboard-warnings.log',
+      '2026-05-31T07:00:00.000Z CRITICAL instance=bheem backup missing\n',
+      'utf8',
+    );
+  });
+
+  it('deduplicates for one hour and writes again after expiry', async () => {
+    process.env.HERMES_DASHBOARD_ALERT_LOG = '/tmp/hermes-dashboard-warnings.log';
+    const input = { severity: 'warn' as const, instance: 'all' as const, message: 'shared warning' };
+    // 1_000 → first write; 2_000 → inside the window; 3_602_000 → more than
+    // 3_600_000 ms after the first write, so it is written again.
+    expect(await appendDashboardWarning(input, 1_000)).toBe(true);
+    expect(await appendDashboardWarning(input, 2_000)).toBe(false);
+    expect(await appendDashboardWarning(input, 3_602_000)).toBe(true);
+    expect(appendFileMock).toHaveBeenCalledTimes(2);
+  });
+});
--- a/dashboard/backend/src/lib/dashboard-alerts.ts
+++ b/dashboard/backend/src/lib/dashboard-alerts.ts
@ -0,0 +1,48 @@
+import { appendFile } from 'fs/promises';
+
+// Severity levels accepted for a dashboard warning.
+type AlertSeverity = 'info' | 'warn' | 'critical';
+// Target an alert refers to. NOTE(review): 'vijay' / 'bheem' look like
+// instance names and 'all' a catch-all — confirm against callers.
+type AlertInstance = 'vijay' | 'bheem' | 'all';
+
+// One warning to append to the dashboard alert log.
+interface DashboardWarningInput {
+  severity: AlertSeverity;
+  instance: AlertInstance;
+  message: string;
+}
+
+// Identical alerts are suppressed for one hour (in milliseconds).
+const DEDUPE_WINDOW_MS = 60 * 60 * 1000;
+// Alert key → timestamp (ms) at which that alert was last recorded.
+// Module-level and in-memory only.
+const recent = new Map<string, number>();
+
+// Maps a severity level to the upper-case token written to the log line.
+function severityToken(severity: AlertSeverity): string {
+  switch (severity) {
+    case 'critical':
+      return 'CRITICAL';
+    case 'warn':
+      return 'WARNING';
+    default:
+      return 'INFO';
+  }
+}
+
+// Builds the dedupe key: severity, instance and message joined by NUL characters.
+function alertKey(input: DashboardWarningInput): string {
+  const { severity, instance, message } = input;
+  return [severity, instance, message].join('\0');
+}
+
+// Drops every dedupe entry recorded more than DEDUPE_WINDOW_MS before `now`.
+function purgeExpired(now: number): void {
+  recent.forEach((recordedAt, key) => {
+    const age = now - recordedAt;
+    if (age > DEDUPE_WINDOW_MS) {
+      recent.delete(key);
+    }
+  });
+}
+
+/**
+ * Appends one warning line to the file named by HERMES_DASHBOARD_ALERT_LOG.
+ *
+ * Line format: `<ISO timestamp> <SEVERITY> instance=<instance> <message>\n`.
+ * An alert with the same severity, instance and message is written at most
+ * once per DEDUPE_WINDOW_MS.
+ *
+ * @param input Alert to record.
+ * @param now   Current time in ms since the epoch; injectable for tests.
+ * @returns true when a line was written; false when the log path is not
+ *          configured or the alert was suppressed as a duplicate.
+ * @throws  Whatever `appendFile` rejects with. The dedupe entry is rolled
+ *          back first so a later retry is not suppressed.
+ */
+export async function appendDashboardWarning(input: DashboardWarningInput, now = Date.now()): Promise<boolean> {
+  const logPath = process.env.HERMES_DASHBOARD_ALERT_LOG;
+  if (!logPath) return false;
+
+  purgeExpired(now);
+  const key = alertKey(input);
+  const previous = recent.get(key);
+  // Compare against undefined, not truthiness: a stored timestamp of 0 is
+  // still a valid "already logged" marker.
+  if (previous !== undefined && now - previous <= DEDUPE_WINDOW_MS) return false;
+
+  // Reserve the key before awaiting so overlapping calls for the same alert
+  // do not both write.
+  recent.set(key, now);
+  const line = `${new Date(now).toISOString()} ${severityToken(input.severity)} instance=${input.instance} ${input.message}\n`;
+  try {
+    await appendFile(logPath, line, 'utf8');
+  } catch (err) {
+    // The line never reached the log; release the reservation (only if it is
+    // still ours) so the alert is not silently lost for the next hour.
+    if (recent.get(key) === now) recent.delete(key);
+    throw err;
+  }
+  return true;
+}
+
+// Empties the in-memory dedupe map so the next alert of any kind is written.
+export function clearDashboardWarningDedupe(): void {
+  recent.clear();
+}
--- a/dashboard/backend/src/lib/logger.ts
+++ b/dashboard/backend/src/lib/logger.ts
@ -0,0 +1,74 @@
+// Centralized pino logger.
+//
+// Fastify already uses pino under the hood, but we want one configured pino
+// instance shared between Fastify (via `logger: <instance>` in `Fastify({...})`)
+// and any non-request code path (background tasks, repositories called outside
+// a request, scripts). Importing the same instance everywhere means uniform
+// formatting, redaction, and log level — and gives us one place to change
+// transport later.
+//
+// Env knobs:
+//   LOG_LEVEL   — pino level (`fatal|error|warn|info|debug|trace|silent`).
+//                 Default: `debug` in non-production, `info` in production.
+//   NODE_ENV    — `production` flips the default level.
+//
+// Redaction:
+//   We strip Authorization headers and a small allow-list of secret-shaped
+//   field names (`password`, `token`, `secret`, common Azure/JWT keys) from
+//   any logged object so that an accidental `req.log.info(req.body)` or
+//   `logger.error({ err, config }, ...)` doesn't leak credentials.
+
+import pino from 'pino';
+
+const isProd = process.env.NODE_ENV === 'production';
+const level = process.env.LOG_LEVEL ?? (isProd ? 'info' : 'debug');
+
+// Field paths we never want in logs. Pino's redact option takes dot-paths
+// with `*` wildcards. Cover the common cases without trying to be
+// exhaustive — this is a backstop, not the primary defense.
+//
+// A `*.key` path only matches `key` one level below a top-level property
+// (e.g. `{ config: { JWT_SECRET } }`); it does NOT match `key` at the top
+// level of the logged object (e.g. `req.log.info(req.body)` with a
+// `password` field). Each secret-shaped key is therefore listed in both
+// forms.
+const secretKeys = [
+  'password',
+  'token',
+  'refreshToken',
+  'refresh_token',
+  'accessToken',
+  'access_token',
+  'csrfToken',
+  'csrf_token',
+  'JWT_SECRET',
+  'CSRF_SECRET',
+  'ENCRYPTION_KEY',
+  'COSMOS_KEY',
+  'AZURE_CLIENT_SECRET',
+];
+
+const redactPaths = [
+  // Headers (Fastify request log shape)
+  'req.headers.authorization',
+  'req.headers.cookie',
+  'request.headers.authorization',
+  'request.headers.cookie',
+  'headers.authorization',
+  'headers.cookie',
+  // Secret-shaped keys at the top level of a logged object
+  ...secretKeys,
+  // ...and the same keys one level down, under any top-level property
+  ...secretKeys.map((key) => `*.${key}`),
+];
+
+// Shared pino instance: level from LOG_LEVEL / NODE_ENV, secret redaction,
+// a fixed `service` field, and ISO-8601 timestamps.
+export const logger = pino({
+  level,
+  redact: {
+    paths: redactPaths,
+    censor: '[REDACTED]',
+  },
+  // Stable, JSON to stdout in every environment. If you want pretty output
+  // locally, pipe through `pino-pretty` from your shell — this module never
+  // loads it as a transport.
+  // NOTE(review): backend/package.json lists `pino-pretty` under
+  // `dependencies`; if it is only a local convenience it belongs in
+  // devDependencies — confirm.
+  base: { service: 'devops-backend' },
+  timestamp: pino.stdTimeFunctions.isoTime,
+});
+
+// Returns a child of the shared logger whose records all carry `module`.
+// Intended for repositories / background workers, so each log line names
+// its origin without repeating it at every call site.
+//
+//   const log = childLogger('deployments/orchestrator');
+//   log.error({ err, deploymentId }, 'background work failed');
+export function childLogger(module: string) {
+  const bindings = { module };
+  return logger.child(bindings);
+}
--- a/dashboard/backend/src/lib/shell.test.ts
+++ b/dashboard/backend/src/lib/shell.test.ts
@ -0,0 +1,165 @@
+import { describe, it, expect, beforeEach, vi } from 'vitest';
+
+const execFileMock = vi.hoisted(() => vi.fn());
+vi.mock('child_process', () => ({ execFile: execFileMock }));
+
+const {
+  assertPathInAllowedRoots,
+  dockerPrune,
+  dockerRestart,
+  execAllowed,
+  InvalidShellArgError,
+  runBashScript,
+  runNpmScript,
+} = await import('./shell.js');
+
+// Install a fake `execFile` implementation on the hoisted mock. `handler`
+// receives the command and argv of each call and decides the outcome: a
+// returned `error` is handed to the callback as the failure, otherwise the
+// callback receives `{ stdout, stderr }`, each defaulting to ''.
+function setExec(handler: (cmd: string, args: string[]) => { error?: Error; stdout?: string; stderr?: string }) {
+  execFileMock.mockImplementation(
+    (
+      command: string,
+      args: string[],
+      _opts: unknown,
+      cb: (err: unknown, result?: { stdout: string; stderr: string }) => void,
+    ) => {
+      const res = handler(command, args);
+      if (res.error) cb(res.error);
+      else cb(null, { stdout: res.stdout ?? '', stderr: res.stderr ?? '' });
+    },
+  );
+}
+
+describe('execAllowed', () => {
+  beforeEach(() => vi.clearAllMocks());
+
+  it('passes argv through to execFile without a shell', async () => {
+    setExec(() => ({ stdout: 'ok' }));
+    const result = await execAllowed('docker', ['ps', '-a']);
+    expect(result.stdout).toBe('ok');
+    expect(execFileMock).toHaveBeenCalledTimes(1);
+    const [cmd, args] = execFileMock.mock.calls[0];
+    expect(cmd).toBe('docker');
+    expect(args).toEqual(['ps', '-a']);
+  });
+});
+
+describe('dockerRestart', () => {
+  beforeEach(() => vi.clearAllMocks());
+
+  it('rejects names with shell metacharacters before reaching execFile', async () => {
+    await expect(dockerRestart('foo; rm -rf /')).rejects.toBeInstanceOf(InvalidShellArgError);
+    await expect(dockerRestart('foo bar')).rejects.toBeInstanceOf(InvalidShellArgError);
+    await expect(dockerRestart('$(whoami)')).rejects.toBeInstanceOf(InvalidShellArgError);
+    await expect(dockerRestart('')).rejects.toBeInstanceOf(InvalidShellArgError);
+    expect(execFileMock).not.toHaveBeenCalled();
+  });
+
+  it('accepts valid container names and forwards them as a single argv element', async () => {
+    setExec(() => ({ stdout: 'restarted' }));
+    await dockerRestart('hermes-gateway');
+    const [, args] = execFileMock.mock.calls[0];
+    // `restart` and the name are separate argv slots — never one
+    // concatenated string that could be re-parsed by a shell.
+    expect(args).toEqual(['restart', 'hermes-gateway']);
+  });
+
+  it('non-string input throws InvalidShellArgError', async () => {
+    // @ts-expect-error — testing runtime guard
+    await expect(dockerRestart(undefined)).rejects.toBeInstanceOf(InvalidShellArgError);
+    // @ts-expect-error — testing runtime guard
+    await expect(dockerRestart(123)).rejects.toBeInstanceOf(InvalidShellArgError);
+  });
+});
+
+describe('dockerPrune', () => {
+  beforeEach(() => vi.clearAllMocks());
+
+  it('rejects unknown prune kinds', async () => {
+    // @ts-expect-error — exercising the runtime check
+    await expect(dockerPrune('everything')).rejects.toBeInstanceOf(InvalidShellArgError);
+  });
+
+  it('emits the documented argv per kind', async () => {
+    setExec(() => ({ stdout: '' }));
+    await dockerPrune('container');
+    expect(execFileMock.mock.calls.at(-1)![1]).toEqual(['container', 'prune', '-f']);
+    await dockerPrune('image', { all: true });
+    // `docker image prune -a -f` — kind first, then the verb, then -a/-f flags.
+    expect(execFileMock.mock.calls.at(-1)![1]).toEqual(['image', 'prune', '-a', '-f']);
+    await dockerPrune('volume');
+    expect(execFileMock.mock.calls.at(-1)![1]).toEqual(['volume', 'prune', '-f']);
+    await dockerPrune('builder');
+    expect(execFileMock.mock.calls.at(-1)![1]).toEqual(['builder', 'prune', '-f']);
+  });
+
+  it('rejects --all on non-image kinds', async () => {
+    await expect(dockerPrune('container', { all: true })).rejects.toBeInstanceOf(InvalidShellArgError);
+  });
+});
+
+describe('assertPathInAllowedRoots', () => {
+  it('accepts paths inside an allowed root', () => {
+    expect(assertPathInAllowedRoots('/opt/projects/foo', ['/opt/projects'])).toBe('/opt/projects/foo');
+    expect(assertPathInAllowedRoots('/opt/projects/foo/bar/baz', ['/opt/projects'])).toBe('/opt/projects/foo/bar/baz');
+    expect(assertPathInAllowedRoots('/opt/projects', ['/opt/projects'])).toBe('/opt/projects');
+  });
+
+  it('rejects relative paths', () => {
+    expect(() => assertPathInAllowedRoots('relative/path', ['/opt/projects'])).toThrow(InvalidShellArgError);
+    expect(() => assertPathInAllowedRoots('./foo', ['/opt/projects'])).toThrow(InvalidShellArgError);
+  });
+
+  it('rejects ../ escape attempts even when prefix-matching the root', () => {
+    expect(() => assertPathInAllowedRoots('/opt/projects/../etc', ['/opt/projects'])).toThrow(InvalidShellArgError);
+    expect(() => assertPathInAllowedRoots('/opt/projects/../../etc', ['/opt/projects'])).toThrow(InvalidShellArgError);
+  });
+
+  it('rejects sibling directories that share a prefix string', () => {
+    // /opt/projects-evil should NOT be accepted just because it starts with /opt/projects
+    expect(() => assertPathInAllowedRoots('/opt/projects-evil/foo', ['/opt/projects'])).toThrow(InvalidShellArgError);
+  });
+
+  it('checks every allowed root', () => {
+    expect(assertPathInAllowedRoots('/srv/app', ['/opt/projects', '/srv/app'])).toBe('/srv/app');
+  });
+});
+
+describe('runBashScript', () => {
+  beforeEach(() => vi.clearAllMocks());
+
+  it('rejects scripts outside allowed roots', async () => {
+    await expect(runBashScript('/etc/init.d/anything', [], { allowedRoots: ['/opt/projects'] }))
+      .rejects.toBeInstanceOf(InvalidShellArgError);
+  });
+
+  it('runs a script that is inside an allowed root', async () => {
+    setExec(() => ({ stdout: 'ok' }));
+    const result = await runBashScript('/opt/projects/deploy.sh', ['--prod'], { allowedRoots: ['/opt/projects'] });
+    expect(result.stdout).toBe('ok');
+    const [cmd, args] = execFileMock.mock.calls[0];
+    expect(cmd).toBe('bash');
+    expect(args).toEqual(['/opt/projects/deploy.sh', '--prod']);
+  });
+});
+
+describe('runNpmScript', () => {
+  beforeEach(() => vi.clearAllMocks());
+
+  it('rejects npm scripts not in the lifecycle allow-list', async () => {
+    // @ts-expect-error — exercising the runtime guard
+    await expect(runNpmScript('publish', { allowedRoots: ['/opt/projects'], cwd: '/opt/projects/foo' }))
+      .rejects.toBeInstanceOf(InvalidShellArgError);
+  });
+
+  it('rejects cwd outside allowed roots', async () => {
+    await expect(runNpmScript('typecheck', { allowedRoots: ['/opt/projects'], cwd: '/etc' }))
+      .rejects.toBeInstanceOf(InvalidShellArgError);
+  });
+
+  it('runs a whitelisted lifecycle script in an allowed cwd', async () => {
+    setExec(() => ({ stdout: 'ok' }));
+    await runNpmScript('typecheck', { allowedRoots: ['/opt/projects'], cwd: '/opt/projects/foo' });
+    const [cmd, args] = execFileMock.mock.calls[0];
+    expect(cmd).toBe('npm');
+    expect(args).toEqual(['run', 'typecheck']);
+  });
+});
--- a/dashboard/backend/src/lib/shell.ts
+++ b/dashboard/backend/src/lib/shell.ts
@ -0,0 +1,170 @@
+// Allow-list wrapper around shell-outs.
+//
+// Every privileged route in this backend ultimately runs `docker`, `bash`,
+// `npm`, etc. on the host. Historically those were issued as template-literal
+// strings passed through `child_process.exec`, which means a misvalidated
+// path param can become a shell-injection. This module fixes that by:
+//
+//   1. Always passing argv as a real array to `execFile` (no shell expansion,
+//      no string templating). `execAllowed()` is the only escape hatch and it
+//      still uses `execFile`, never `exec`.
+//   2. Exposing per-command helpers (`dockerRestart`, `dockerPrune`,
+//      `runBashScript`, `runNpmScript`) that validate their inputs against
+//      a per-command allow-list (a name regex, a fixed list of kinds or
+//      scripts, or a set of allowed path roots). Repos call these instead of
+//      building `docker ...` strings directly.
+//
+// This is the "allow-list wrapper" item from the DEPLOYMENT.md privilege-
+// surface mitigation roadmap.
+
+import { execFile } from 'child_process';
+import { isAbsolute, normalize, relative, resolve } from 'path';
+import { promisify } from 'util';
+import { childLogger } from './logger.js';
+
+const execFileAsync = promisify(execFile);
+const log = childLogger('lib/shell');
+
+/** Options accepted by `execAllowed` and the per-command helpers. */
+export interface ShellExecOptions {
+  /** Working directory, forwarded to `execFile` as `cwd`. */
+  cwd?: string;
+  /** Forwarded to `execFile` as `timeout`; each helper supplies its own default. */
+  timeoutMs?: number;
+  /** Environment, forwarded to `execFile` as `env`. */
+  env?: NodeJS.ProcessEnv;
+}
+
+/** Captured output of a finished command, as strings. */
+export interface ShellResult {
+  stdout: string;
+  stderr: string;
+}
+
+/**
+ * Execute one command through `execFile` with an explicit `argv` array —
+ * no shell expansion and no string interpolation. The per-command helpers
+ * below are preferred; use this directly only for commands that have no
+ * allow-listed helper yet.
+ *
+ * The call is logged at debug level, the timeout defaults to 30 seconds,
+ * and captured output is capped by a 10 MiB `maxBuffer`.
+ */
+export async function execAllowed(
+  command: string,
+  args: string[],
+  options: ShellExecOptions = {},
+): Promise<ShellResult> {
+  // Coerce whatever `execFile` hands back into a plain string.
+  const asText = (chunk: string | Buffer): string => chunk?.toString?.() ?? String(chunk ?? '');
+
+  log.debug({ command, args, cwd: options.cwd }, 'shell exec');
+  const finished = await execFileAsync(command, args, {
+    maxBuffer: 10 * 1024 * 1024,
+    timeout: options.timeoutMs ?? 30_000,
+    cwd: options.cwd,
+    env: options.env,
+  });
+  return { stdout: asText(finished.stdout), stderr: asText(finished.stderr) };
+}
+
+// --- Docker allow-list ------------------------------------------------------
+
+// Container/volume/image names from the docker daemon. Docker's own rule is
+// `[a-zA-Z0-9][a-zA-Z0-9_.-]+`; we keep the same character classes but also
+// accept single-character names and cap the total length at 128.
+const CONTAINER_NAME_RE = /^[a-zA-Z0-9][a-zA-Z0-9._-]{0,127}$/;
+
+/**
+ * Thrown by the validators and helpers in this module when an argument fails
+ * its allow-list check. Callers can distinguish it with `instanceof`.
+ */
+export class InvalidShellArgError extends Error {
+  constructor(message: string) {
+    super(message);
+    // Override the default 'Error' name with this class's own name.
+    this.name = 'InvalidShellArgError';
+  }
+}
+
+// Guard: returns quietly when `name` is a string matching CONTAINER_NAME_RE,
+// otherwise throws InvalidShellArgError quoting the offending value.
+function assertContainerName(name: string): void {
+  const isValid = typeof name === 'string' && CONTAINER_NAME_RE.test(name);
+  if (isValid) return;
+  throw new InvalidShellArgError(`Invalid container name: ${JSON.stringify(name)}`);
+}
+
+/**
+ * Run `docker restart <name>` after validating the container name.
+ * Rejects with InvalidShellArgError for an invalid name; uses a 30 s timeout.
+ */
+export async function dockerRestart(name: string): Promise<ShellResult> {
+  assertContainerName(name);
+  const argv = ['restart', name];
+  return execAllowed('docker', argv, { timeoutMs: 30_000 });
+}
+
+const PRUNE_KINDS = ['container', 'image', 'volume', 'builder'] as const;
+export type PruneKind = typeof PRUNE_KINDS[number];
+
+/**
+ * Run `docker <kind> prune -f`, or `docker image prune -a -f` when
+ * `opts.all` is set. Rejects with InvalidShellArgError for an unknown kind
+ * or when `all` is requested for anything other than `image`.
+ */
+export async function dockerPrune(kind: PruneKind, opts: { all?: boolean } = {}): Promise<ShellResult> {
+  if (!PRUNE_KINDS.includes(kind)) {
+    throw new InvalidShellArgError(`Invalid prune kind: ${JSON.stringify(kind)}`);
+  }
+  const wantsAll = Boolean(opts.all);
+  if (wantsAll && kind !== 'image') {
+    throw new InvalidShellArgError('`all` is only valid for image prune');
+  }
+  // `-a` goes between the verb and `-f`.
+  const flags = wantsAll ? ['-a', '-f'] : ['-f'];
+  return execAllowed('docker', [kind, 'prune', ...flags], { timeoutMs: 60_000 });
+}
+
+// --- Filesystem-path allow-list --------------------------------------------
+
+/**
+ * Verify that `candidate` is an absolute path that resolves inside one of
+ * the allowed roots. Used to lock down request-supplied `cwd` values
+ * (e.g. `/code-quality/check`'s `projectPath`) so callers can't run
+ * lifecycle scripts in arbitrary directories.
+ *
+ * The check is lexical: both sides go through `normalize` + `resolve` and
+ * are compared with `relative`. Nothing here touches the filesystem.
+ *
+ * @param candidate    Path to validate; must be an absolute string.
+ * @param allowedRoots Roots the path may live in; the root itself is accepted.
+ * @returns The normalized absolute form of `candidate`.
+ * @throws InvalidShellArgError if `candidate` is not an absolute string or
+ *         is outside every allowed root.
+ */
+export function assertPathInAllowedRoots(candidate: string, allowedRoots: string[]): string {
+  if (typeof candidate !== 'string' || !isAbsolute(candidate)) {
+    throw new InvalidShellArgError(`Path must be absolute: ${JSON.stringify(candidate)}`);
+  }
+  const resolved = resolve(normalize(candidate));
+  for (const root of allowedRoots) {
+    const resolvedRoot = resolve(normalize(root));
+    const rel = relative(resolvedRoot, resolved);
+    // The path escapes upward only when its FIRST segment is exactly `..`.
+    // A plain `startsWith('..')` would also reject legitimate in-root
+    // entries whose name merely begins with two dots (e.g. `<root>/..cache`).
+    const escapesUpward = /^\.\.(?:[\\/]|$)/.test(rel);
+    // Inside the root iff the relative path doesn't escape upward and
+    // isn't an absolute path back out.
+    if (rel === '' || (!escapesUpward && !isAbsolute(rel))) {
+      return resolved;
+    }
+  }
+  throw new InvalidShellArgError(
+    `Path is not inside an allowed root: ${JSON.stringify(candidate)}`,
+  );
+}
+
+// --- bash / npm wrappers ----------------------------------------------------
+
+/**
+ * Run a `bash <script>` invocation with `cwd` constrained to allowed
+ * roots. The script path itself must also be inside an allowed root.
+ *
+ * @param scriptPath Absolute path of the script; must be inside an allowed root.
+ * @param args       Extra argv entries passed after the script path.
+ * @param options    `allowedRoots` plus optional `cwd`, `timeoutMs`
+ *                   (default 300 s) and `env`.
+ * @throws InvalidShellArgError if the script path or `cwd` is outside the
+ *         allowed roots.
+ */
+export async function runBashScript(
+  scriptPath: string,
+  args: string[] = [],
+  options: ShellExecOptions & { allowedRoots: string[] } = { allowedRoots: [] },
+): Promise<ShellResult> {
+  const safeScript = assertPathInAllowedRoots(scriptPath, options.allowedRoots);
+  // Hand the child process the normalized cwd that was actually validated,
+  // not the raw input, so the checked path and the used path cannot differ
+  // (same treatment as `scriptPath`).
+  const safeCwd = options.cwd
+    ? assertPathInAllowedRoots(options.cwd, options.allowedRoots)
+    : options.cwd;
+  return execAllowed('bash', [safeScript, ...args], {
+    cwd: safeCwd,
+    timeoutMs: options.timeoutMs ?? 300_000,
+    env: options.env,
+  });
+}
+
+const NPM_LIFECYCLE = ['typecheck', 'lint', 'build', 'test', 'test:run', 'start'] as const;
+export type NpmLifecycle = typeof NPM_LIFECYCLE[number];
+
+/**
+ * `npm run <script>` constrained to a known set of lifecycle scripts and
+ * run only inside an allowed project root. Used by `/code-quality/check`.
+ *
+ * @param script  One of the `NPM_LIFECYCLE` names.
+ * @param options `allowedRoots` and a required `cwd`, plus optional
+ *                `timeoutMs` (default 120 s) and `env`.
+ * @throws InvalidShellArgError if the script is not allow-listed, `cwd` is
+ *         missing, or `cwd` is outside the allowed roots.
+ */
+export async function runNpmScript(
+  script: NpmLifecycle,
+  options: ShellExecOptions & { allowedRoots: string[] } = { allowedRoots: [] },
+): Promise<ShellResult> {
+  if (!NPM_LIFECYCLE.includes(script)) {
+    throw new InvalidShellArgError(`npm script not in allow-list: ${JSON.stringify(script)}`);
+  }
+  if (!options.cwd) throw new InvalidShellArgError('npm run requires a cwd');
+  // Run in the normalized directory that was validated rather than the raw
+  // input, so the checked path and the used path cannot differ.
+  const safeCwd = assertPathInAllowedRoots(options.cwd, options.allowedRoots);
+  return execAllowed('npm', ['run', script], {
+    cwd: safeCwd,
+    timeoutMs: options.timeoutMs ?? 120_000,
+    env: options.env,
+  });
+}
--- a/dashboard/backend/src/modules/audit/types.ts
+++ b/dashboard/backend/src/modules/audit/types.ts
@ -2,8 +2,11 @@ import { z } from 'zod';

 export const AuditLogSchema = z.object({
  id: z.string(),
-  action: z.enum(['create', 'update', 'delete', 'deploy', 'trigger']),
-  entityType: z.enum(['service', 'deployment', 'user']),
+  // `shell-exec` covers privileged shell-outs (docker prune, container
+  // restart, code-quality npm runs) so a leaked admin token's actions are
+  // reconstructable from cosmos rather than only from container stdout.
+  action: z.enum(['create', 'update', 'delete', 'deploy', 'trigger', 'shell-exec']),
+  entityType: z.enum(['service', 'deployment', 'user', 'host']),
  entityId: z.string(),
  userId: z.string(),
  role: z.string(),
--- a/dashboard/backend/src/modules/backup/repository.ts
+++ b/dashboard/backend/src/modules/backup/repository.ts
@ -1,7 +1,10 @@
 import { getContainer } from '../../lib/cosmos-init.js';
 import { productId } from '../../lib/config.js';
+import { childLogger } from '../../lib/logger.js';
 import type { Backup, BackupParams } from './types.js';

+const log = childLogger('backup/repository');
+
 const BACKUPS_CONTAINER = 'backups';

 export async function createBackup(params: BackupParams = {}): Promise<Backup> {
@ -21,7 +24,7 @@ export async function createBackup(params: BackupParams = {}): Promise<Backup> {
      backupData[containerName] = resources;
      totalItems += resources.length;
    } catch (error) {
-      console.error(`Failed to backup container ${containerName}:`, error);
+      log.error({ err: error, containerName }, "failed to backup container");
      throw error;
    }
  }
@ -54,7 +57,7 @@ export async function getBackups(): Promise<Backup[]> {

    return resources as Backup[];
  } catch (error) {
-    console.error('Failed to get backups:', error);
+    log.error({ err: error }, "failed to get backups");
    return [];
  }
 }
@ -89,7 +92,7 @@ export async function restoreBackup(backupId: string): Promise<void> {
      try {
        await targetContainer.items.upsert(item);
      } catch (error) {
-        console.error(`Failed to restore item in ${containerName}:`, error);
+        log.error({ err: error, containerName }, "failed to restore backup item");
      }
    }
  }
@ -99,7 +102,7 @@ export async function deleteBackup(backupId: string): Promise<void> {
  try {
    await getContainer(BACKUPS_CONTAINER).item(backupId).delete();
  } catch (error) {
-    console.error('Failed to delete backup:', error);
+    log.error({ err: error }, "failed to delete backup");
    throw error;
  }
 }
--- a/dashboard/backend/src/modules/code-quality/repository.ts
+++ b/dashboard/backend/src/modules/code-quality/repository.ts
@ -2,12 +2,61 @@ import { exec } from 'child_process';
 import { promisify } from 'util';
 import { readFile } from 'fs/promises';
 import { join } from 'path';
+import { assertPathInAllowedRoots, InvalidShellArgError, runNpmScript, type NpmLifecycle } from '../../lib/shell.js';
 import type { CodeQualityReport, CodeQualityCheckParams, CodeQualityIssue } from './types.js';

 const execAsync = promisify(exec);

+// Roots inside which `/code-quality/check` is allowed to run
+// `npm run typecheck/lint/build/test:run`. Paths outside them are rejected
+// before any subprocess is spawned. Non-default deployments override the
+// list with `CODE_QUALITY_ALLOWED_ROOTS` (colon-separated); blank entries
+// are dropped, and an unset or blank variable falls back to the default.
+const DEFAULT_ALLOWED_ROOTS = ['/opt/bytelyst'];
+function getAllowedRoots(): string[] {
+  const configured = (process.env.CODE_QUALITY_ALLOWED_ROOTS ?? '').trim();
+  if (configured.length === 0) return DEFAULT_ALLOWED_ROOTS;
+  const roots: string[] = [];
+  for (const piece of configured.split(':')) {
+    const root = piece.trim();
+    if (root) roots.push(root);
+  }
+  return roots;
+}
+
+// Execute `npm run <script>` via the shell allow-list and report the
+// combined stdout+stderr together with an `ok` flag. A failed run still
+// resolves (the downstream parsers inspect the output either way); only an
+// allow-list violation (InvalidShellArgError) is re-thrown.
+async function runScriptCapturingOutput(
+  script: NpmLifecycle,
+  cwd: string,
+  timeoutMs: number,
+): Promise<{ output: string; ok: boolean }> {
+  try {
+    const allowedRoots = getAllowedRoots();
+    const result = await runNpmScript(script, { allowedRoots, cwd, timeoutMs });
+    return { output: `${result.stdout}${result.stderr}`, ok: true };
+  } catch (error) {
+    if (error instanceof InvalidShellArgError) throw error;
+    const failure = error as { stdout?: string; stderr?: string; message?: string };
+    const captured = `${failure.stdout ?? ''}${failure.stderr ?? ''}`;
+    // Fall back to the error message when the process produced no output.
+    return { output: captured || (failure.message ?? ''), ok: false };
+  }
+}
+
 export async function runCodeQualityCheck(params: CodeQualityCheckParams): Promise<CodeQualityReport> {
  const { projectId, projectPath, checks } = params;
+
+  // Reject paths outside the allow-list before spawning anything.
+  // `assertPathInAllowedRoots` returns the resolved absolute form so we
+  // pass that into the npm wrapper rather than the raw input.
+  let resolvedPath: string;
+  try {
+    resolvedPath = assertPathInAllowedRoots(projectPath, getAllowedRoots());
+  } catch (error) {
+    if (error instanceof InvalidShellArgError) {
+      throw new Error(
+        `projectPath is not inside an allowed root (${getAllowedRoots().join(', ')}); refusing to run lifecycle scripts there.`,
+      );
+    }
+    throw error;
+  }
  const issues: CodeQualityIssue[] = [];
  const summary = {
    totalIssues: 0,
@ -27,66 +76,35 @@ export async function runCodeQualityCheck(params: CodeQualityCheckParams): Promi
  // TypeScript check
  if (checks.includes('typescript')) {
    const tsStart = Date.now();
-    try {
-      const { stdout, stderr } = await execAsync('npm run typecheck', {
-        cwd: projectPath,
-        timeout: 60000,
-      });
-      const output = stdout + stderr;
-      const tsIssues = parseTypeScriptOutput(output, projectPath);
+    const { output } = await runScriptCapturingOutput('typecheck', resolvedPath, 60000);
+    const tsIssues = parseTypeScriptOutput(output, resolvedPath);
    issues.push(...tsIssues);
    categories.typescript.duration = Date.now() - tsStart;
    categories.typescript.errors = tsIssues.filter(i => i.type === 'error').length;
    categories.typescript.warnings = tsIssues.filter(i => i.type === 'warning').length;
-    } catch (error: any) {
-      categories.typescript.duration = Date.now() - tsStart;
-      const output = error.stdout + error.stderr || error.message;
-      const tsIssues = parseTypeScriptOutput(output, projectPath);
-      issues.push(...tsIssues);
-      categories.typescript.errors = tsIssues.filter(i => i.type === 'error').length;
-      categories.typescript.warnings = tsIssues.filter(i => i.type === 'warning').length;
-    }
  }

  // ESLint check
  if (checks.includes('eslint')) {
    const eslintStart = Date.now();
-    try {
-      const { stdout, stderr } = await execAsync('npm run lint', {
-        cwd: projectPath,
-        timeout: 60000,
-      });
-      const output = stdout + stderr;
-      const eslintIssues = parseEslintOutput(output, projectPath);
+    const { output } = await runScriptCapturingOutput('lint', resolvedPath, 60000);
+    const eslintIssues = parseEslintOutput(output, resolvedPath);
    issues.push(...eslintIssues);
    categories.eslint.duration = Date.now() - eslintStart;
    categories.eslint.errors = eslintIssues.filter(i => i.type === 'error').length;
    categories.eslint.warnings = eslintIssues.filter(i => i.type === 'warning').length;
-    } catch (error: any) {
-      categories.eslint.duration = Date.now() - eslintStart;
-      const output = error.stdout + error.stderr || error.message;
-      const eslintIssues = parseEslintOutput(output, projectPath);
-      issues.push(...eslintIssues);
-      categories.eslint.errors = eslintIssues.filter(i => i.type === 'error').length;
-      categories.eslint.warnings = eslintIssues.filter(i => i.type === 'warning').length;
-    }
  }

  // Build check
  if (checks.includes('build')) {
    const buildStart = Date.now();
-    try {
-      const { stdout, stderr } = await execAsync('npm run build', {
-        cwd: projectPath,
-        timeout: 120000,
-      });
+    const { output, ok } = await runScriptCapturingOutput('build', resolvedPath, 120000);
+    categories.build.duration = Date.now() - buildStart;
+    if (ok) {
      categories.build.success = true;
-      categories.build.duration = Date.now() - buildStart;
-    } catch (error: any) {
+    } else {
      categories.build.success = false;
-      categories.build.duration = Date.now() - buildStart;
-      const output = error.stdout + error.stderr || error.message;
-      const buildIssues = parseBuildOutput(output, projectPath);
+      const buildIssues = parseBuildOutput(output, resolvedPath);
      issues.push(...buildIssues);
      categories.build.errors = buildIssues.length;
    }
@ -95,25 +113,16 @@ export async function runCodeQualityCheck(params: CodeQualityCheckParams): Promi
  // Test check
  if (checks.includes('test')) {
    const testStart = Date.now();
-    try {
-      const { stdout, stderr } = await execAsync('npm run test:run', {
-        cwd: projectPath,
-        timeout: 120000,
-      });
-      const output = stdout + stderr;
+    const { output, ok } = await runScriptCapturingOutput('test:run', resolvedPath, 120000);
    const testResults = parseTestOutput(output);
+    categories.test.duration = Date.now() - testStart;
+    categories.test.passed = testResults.passed;
+    categories.test.failed = testResults.failed;
+    if (ok) {
      categories.test.success = testResults.failed === 0;
-      categories.test.passed = testResults.passed;
-      categories.test.failed = testResults.failed;
-      categories.test.duration = Date.now() - testStart;
-    } catch (error: any) {
+    } else {
      categories.test.success = false;
-      categories.test.duration = Date.now() - testStart;
-      const output = error.stdout + error.stderr || error.message;
-      const testResults = parseTestOutput(output);
-      categories.test.passed = testResults.passed;
-      categories.test.failed = testResults.failed;
-      const testIssues = parseTestOutputErrors(output, projectPath);
+      const testIssues = parseTestOutputErrors(output, resolvedPath);
      issues.push(...testIssues);
    }
  }
@ -124,13 +133,13 @@ export async function runCodeQualityCheck(params: CodeQualityCheckParams): Promi
  summary.warnings = issues.filter(i => i.type === 'warning').length;
  summary.infos = issues.filter(i => i.type === 'info').length;

-  const projectName = projectPath.split('/').pop() || projectPath;
+  const projectName = resolvedPath.split('/').pop() || resolvedPath;

  return {
    id: `cq-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`,
    projectId,
    projectName,
-    projectPath,
+    projectPath: resolvedPath,
    timestamp: new Date().toISOString(),
    summary,
    categories,
@ -148,7 +157,7 @@ function parseTypeScriptOutput(output: string, projectPath: string): CodeQuality
    if (tsErrorMatch) {
      issues.push({
        id: `ts-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`,
-        type: tsErrorMatch[3] as 'error' | 'warning',
+        type: tsErrorMatch[4] as 'error' | 'warning',  // group 4 = type; group 3 = column
        category: 'typescript',
        file: tsErrorMatch[1],
        line: parseInt(tsErrorMatch[2]),
@ -167,10 +176,12 @@ function parseEslintOutput(output: string, projectPath: string): CodeQualityIssu
  const lines = output.split('\n');

  for (const line of lines) {
-    // ESLint format: file:line:col message [rule]
-    const eslintMatch = line.match(/(.+\.tsx?):(\d+):(\d+)\s+(.+?)\s+\[(.+)\]/);
+    // ESLint unix format: file:line:col: message [rule]
+    // Rule part in brackets may or may not be present depending on formatter
+    const eslintMatch = line.match(/(.+\.tsx?):(\d+):(\d+)[:\s]+(.+?)(?:\s+\[([^\]]+)\])?$/);
    if (eslintMatch) {
-      const severity = eslintMatch[4].includes('error') ? 'error' : 'warning';
+      const msgAndLevel = eslintMatch[4];
+      const severity = /\berror\b/i.test(msgAndLevel) ? 'error' : 'warning';
      issues.push({
        id: `eslint-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`,
        type: severity,
@ -178,8 +189,8 @@ function parseEslintOutput(output: string, projectPath: string): CodeQualityIssu
        file: eslintMatch[1],
        line: parseInt(eslintMatch[2]),
        column: parseInt(eslintMatch[3]),
-        message: eslintMatch[4],
-        rule: eslintMatch[5],
+        message: msgAndLevel,
+        rule: eslintMatch[5] ?? 'unknown',
      });
    }
  }
@ -210,18 +221,24 @@ function parseTestOutput(output: string): { passed: number; failed: number } {
  let passed = 0;
  let failed = 0;

-  // Try to parse Vitest output
-  const vitestMatch = output.match(/Test Files\s+(\d+)\s+\((\d+)\s+failed/);
-  if (vitestMatch) {
-    failed = parseInt(vitestMatch[2]);
-    passed = parseInt(vitestMatch[1]) - failed;
+  // Try to parse Vitest output — use "Tests" line (individual tests), not "Test Files" line
+  // Format: " Tests  3 failed | 5 passed (8)"  or  " Tests  8 passed (8)"
+  const vitestFailMatch = output.match(/\bTests\b\s+(\d+)\s+failed[^|]*\|\s*(\d+)\s+passed/);
+  const vitestPassMatch = output.match(/\bTests\b\s+(\d+)\s+passed/);
+  if (vitestFailMatch) {
+    failed = parseInt(vitestFailMatch[1]);
+    passed = parseInt(vitestFailMatch[2]);
+  } else if (vitestPassMatch) {
+    passed = parseInt(vitestPassMatch[1]);
+    failed = 0;
  }

-  // Try to parse Jest output
-  const jestMatch = output.match(/Tests:\s+(\d+)\s+passed,?\s*(\d+)\s+failed/);
-  if (jestMatch) {
-    passed = parseInt(jestMatch[1]);
-    failed = parseInt(jestMatch[2]);
+  // Try to parse Jest output: "Tests: 5 passed, 2 failed" or "Tests: 2 failed, 5 passed"
+  const jestPassMatch = output.match(/Tests:.*?(\d+)\s+passed/);
+  const jestFailMatch = output.match(/Tests:.*?(\d+)\s+failed/);
+  if (jestPassMatch || jestFailMatch) {
+    passed = jestPassMatch ? parseInt(jestPassMatch[1]) : 0;
+    failed = jestFailMatch ? parseInt(jestFailMatch[1]) : 0;
  }

  return { passed, failed };
--- a/dashboard/backend/src/modules/code-quality/routes.ts
+++ b/dashboard/backend/src/modules/code-quality/routes.ts
@ -1,10 +1,16 @@
 import { FastifyInstance } from 'fastify';
 import { runCodeQualityCheck } from './repository.js';
 import { CodeQualityCheckParamsSchema } from './types.js';
+import { requireAdmin } from '../../lib/auth.js';

 export async function codeQualityRoutes(fastify: FastifyInstance) {
-  // Run code quality check
-  fastify.post('/code-quality/check', async (request, reply) => {
+  // Run code quality check.
+  // Admin-only: this route shells out (`npm run typecheck/lint/build/test:run`)
+  // in a caller-supplied `projectPath` and is therefore privileged. See the
+  // "Privilege Surface" section in `dashboard/DEPLOYMENT.md`.
+  fastify.post('/code-quality/check', {
+    preHandler: async (req) => requireAdmin(req),
+  }, async (request, reply) => {
    try {
      const params = CodeQualityCheckParamsSchema.parse(request.body);
      const report = await runCodeQualityCheck(params);
--- a/dashboard/backend/src/modules/deployments/orchestrator.test.ts
+++ b/dashboard/backend/src/modules/deployments/orchestrator.test.ts
@ -0,0 +1,143 @@
+import { describe, it, expect, beforeEach, vi } from 'vitest';
+import type { Service } from '../services/types.js';
+
+// --- I/O mocks. Hoisted so vi.mock factories below can see them. ---------------
+const execMock = vi.hoisted(() => vi.fn());
+vi.mock('child_process', () => ({ exec: execMock }));
+
+const createDeploymentMock = vi.hoisted(() => vi.fn());
+const updateDeploymentMock = vi.hoisted(() => vi.fn());
+vi.mock('./repository.js', () => ({
+  createDeployment: createDeploymentMock,
+  updateDeployment: updateDeploymentMock,
+}));
+
+const getServiceByIdMock = vi.hoisted(() => vi.fn());
+const updateServiceMock = vi.hoisted(() => vi.fn());
+vi.mock('../services/repository.js', () => ({
+  getServiceById: getServiceByIdMock,
+  updateService: updateServiceMock,
+}));
+
+vi.mock('../../lib/config.js', () => ({
+  config: {},
+  productId: 'devops-internal',
+}));
+
+const { triggerDeployment } = await import('./orchestrator.js');
+
+// Build a Service fixture with fixed defaults. Any field can be replaced
+// through `overrides`, which are spread last and therefore win.
+function makeService(overrides?: Partial<Service>): Service {
+  return {
+    id: 'svc-1',
+    name: 'Test Service',
+    scriptPath: 'deploy.sh',
+    healthUrl: 'https://example.com/health',
+    repoPath: '../repo',
+    status: 'up',
+    version: '1.0.0',
+    productId: 'devops-internal',
+    ...overrides,
+  };
+}
+
+// promisify(exec) calls exec(cmd, options, cb(err, { stdout, stderr })). Drive
+// the callback synchronously off the mock so the deferred script work resolves
+// before our awaited assertion.
+//
+// `handler` takes no arguments and decides the outcome of every call: a
+// returned `error` is passed to the callback as the failure, otherwise the
+// callback receives `{ stdout, stderr }`, each defaulting to ''.
+function setExec(handler: () => { error?: Error & { stdout?: string; stderr?: string }; stdout?: string; stderr?: string }) {
+  execMock.mockImplementation(
+    (
+      _cmd: string,
+      _opts: unknown,
+      cb: (err: (Error & { stdout?: string; stderr?: string }) | null, result?: { stdout: string; stderr: string }) => void,
+    ) => {
+      const res = handler();
+      if (res.error) cb(res.error);
+      else cb(null, { stdout: res.stdout ?? '', stderr: res.stderr ?? '' });
+    },
+  );
+}
+
+describe('triggerDeployment', () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+    createDeploymentMock.mockImplementation(async (data) => ({ id: 'dep-1', ...data }));
+    updateDeploymentMock.mockResolvedValue({});
+    getServiceByIdMock.mockImplementation(async (id) => makeService({ id, version: '0.9.0' }));
+    updateServiceMock.mockResolvedValue({});
+  });
+
+  it('creates a pending deployment record and returns its id immediately', async () => {
+    setExec(() => ({ stdout: 'deployed v1.2.3', stderr: '' }));
+    const id = await triggerDeployment(makeService(), 'tester@bytelyst');
+    expect(id).toBe('dep-1');
+    expect(createDeploymentMock).toHaveBeenCalledWith({
+      serviceId: 'svc-1',
+      version: 'pending',
+      triggeredBy: 'tester@bytelyst',
+      productId: 'devops-internal',
+    });
+  });
+
+  // Wait for the post-trigger async work to flush. We can't await the inner
+  // promise directly (orchestrator deliberately fire-and-forgets), so we yield
+  // ticks until updateDeployment is observed.
+  async function flushBackground(): Promise<void> {
+    for (let i = 0; i < 50; i++) {
+      if (updateDeploymentMock.mock.calls.length > 0) return;
+      await Promise.resolve();
+    }
+  }
+
+  it('marks the deployment success and updates the service version on a clean run', async () => {
+    setExec(() => ({ stdout: 'release version: 2.5.1\n', stderr: '' }));
+    await triggerDeployment(makeService(), 'tester');
+    await flushBackground();
+
+    const finalCall = updateDeploymentMock.mock.calls.at(-1)![1];
+    expect(finalCall.status).toBe('success');
+    expect(finalCall.version).toBe('2.5.1');
+    expect(typeof finalCall.completedAt).toBe('string');
+
+    // Service is moved to 'up' with the extracted version.
+    expect(updateServiceMock).toHaveBeenCalledWith(
+      'svc-1',
+      expect.objectContaining({ status: 'up', version: '2.5.1' }),
+    );
+  });
+
+  it('falls back to version "unknown" when the script logs no recognizable version', async () => {
+    setExec(() => ({ stdout: 'all good, no numbers here', stderr: '' }));
+    await triggerDeployment(makeService(), 'tester');
+    await flushBackground();
+
+    const finalCall = updateDeploymentMock.mock.calls.at(-1)![1];
+    expect(finalCall.status).toBe('success');
+    expect(finalCall.version).toBe('unknown');
+  });
+
+  it('marks the deployment failed and the service down when the script throws', async () => {
+    const err = Object.assign(new Error('exit 1'), { stdout: 'partial', stderr: 'boom' });
+    setExec(() => ({ error: err }));
+    await triggerDeployment(makeService(), 'tester');
+    await flushBackground();
+
+    const finalCall = updateDeploymentMock.mock.calls.at(-1)![1];
+    expect(finalCall.status).toBe('failed');
+    expect(finalCall.logs).toContain('ERROR: exit 1');
+    expect(finalCall.logs).toContain('STDERR:\nboom');
+    expect(finalCall).not.toHaveProperty('version');
+
+    expect(updateServiceMock).toHaveBeenCalledWith('svc-1', { status: 'down' });
+  });
+
+  it('does not crash when getServiceById returns null in the success path', async () => {
+    getServiceByIdMock.mockResolvedValue(null);
+    setExec(() => ({ stdout: 'version: 1.0.0', stderr: '' }));
+    await triggerDeployment(makeService(), 'tester');
+    await flushBackground();
+
+    expect(updateServiceMock).not.toHaveBeenCalled();
+    const finalCall = updateDeploymentMock.mock.calls.at(-1)![1];
+    expect(finalCall.status).toBe('success');
+  });
+});
--- a/dashboard/backend/src/modules/deployments/orchestrator.ts
+++ b/dashboard/backend/src/modules/deployments/orchestrator.ts
@ -4,8 +4,10 @@ import { join } from 'path';
 import type { Service } from '../services/types.js';
 import { createDeployment, updateDeployment } from './repository.js';
 import { productId } from '../../lib/config.js';
+import { childLogger } from '../../lib/logger.js';

 const execAsync = promisify(exec);
+const log = childLogger('deployments/orchestrator');

 export async function triggerDeployment(service: Service, triggeredBy: string): Promise<string> {
  // Create deployment record
@ -20,7 +22,7 @@ export async function triggerDeployment(service: Service, triggeredBy: string):

  // Trigger bash script asynchronously
  runDeploymentScript(service, deploymentId).catch(error => {
-    console.error(`Deployment ${deploymentId} failed:`, error);
+    log.error({ err: error, deploymentId, serviceId: service.id }, 'background deployment failed');
  });

  return deploymentId;
@ -30,6 +32,10 @@ async function runDeploymentScript(service: Service, deploymentId: string) {
  const scriptDir = join(process.cwd(), '../../'); // Go to bytelyst-devops-tools root
  const scriptPath = join(scriptDir, service.scriptPath);

+  let finalStatus: 'success' | 'failed' = 'failed';
+  let logs = '';
+  let version: string | undefined;
+
  try {
    const { stdout, stderr } = await execAsync(`bash ${scriptPath}`, {
      cwd: scriptDir,
@ -40,15 +46,9 @@ async function runDeploymentScript(service: Service, deploymentId: string) {
      },
    });

-    const logs = `STDOUT:\n${stdout}\n\nSTDERR:\n${stderr}`;
-
-    // Update deployment as success
-    await updateDeployment(deploymentId, {
-      status: 'success',
-      logs,
-      completedAt: new Date().toISOString(),
-      version: extractVersion(stdout + stderr) || 'unknown',
-    });
+    logs = `STDOUT:\n${stdout}\n\nSTDERR:\n${stderr}`;
+    finalStatus = 'success';
+    version = extractVersion(stdout + stderr) || 'unknown';

    // Update service status
    const { getServiceById, updateService } = await import('../services/repository.js');
@ -57,21 +57,14 @@ async function runDeploymentScript(service: Service, deploymentId: string) {
      await updateService(service.id, {
        status: 'up',
        lastDeployedAt: new Date().toISOString(),
-        version: extractVersion(stdout + stderr) || svc.version,
+        version: version || svc.version,
      });
    }
  } catch (error: any) {
+    // Anything thrown inside the try lands here — including a failure of the
+    // post-deploy service update, which runs AFTER finalStatus was already set
+    // to 'success'. Reset it so the finally block never persists a 'success'
+    // record whose logs contain only the error text.
+    finalStatus = 'failed';
-    const logs = error instanceof Error
+    logs = error instanceof Error
      ? `ERROR: ${error.message}\n\n${(error as any).stdout ? `STDOUT:\n${(error as any).stdout}\n\n` : ''}${(error as any).stderr ? `STDERR:\n${(error as any).stderr}` : ''}`
      : String(error);

-    // Update deployment as failed
-    await updateDeployment(deploymentId, {
-      status: 'failed',
-      logs,
-      completedAt: new Date().toISOString(),
-    });
-
    // Update service status to down
    const { getServiceById, updateService } = await import('../services/repository.js');
    const svc = await getServiceById(service.id);
@ -80,6 +73,21 @@ async function runDeploymentScript(service: Service, deploymentId: string) {
        status: 'down',
      });
    }
+  } finally {
+    // Always write final status — ensures the deployment never gets stuck in 'running'
+    try {
+      await updateDeployment(deploymentId, {
+        status: finalStatus,
+        logs,
+        completedAt: new Date().toISOString(),
+        ...(version ? { version } : {}),
+      });
+    } catch (updateError) {
+      log.error(
+        { err: updateError, deploymentId, finalStatus },
+        'failed to persist final deployment status',
+      );
+    }
  }
 }

--- a/dashboard/backend/src/modules/deployments/routes.ts
+++ b/dashboard/backend/src/modules/deployments/routes.ts
@ -13,23 +13,29 @@ import { createAuditLog } from '../audit/repository.js';
 import { productId } from '../../lib/config.js';

 export async function deploymentRoutes(fastify: FastifyInstance) {
-  // Get recent deployments across all services
-  fastify.get('/deployments', async (req, reply) => {
+  // Get recent deployments across all services (admin only)
+  fastify.get('/deployments', {
+    preHandler: async (req) => requireAdmin(req),
+  }, async (req, reply) => {
    const query = QueryParamsSchema.parse(req.query);
    const deployments = await getRecentDeployments(query.limit);
    return reply.send(deployments);
  });

-  // Get deployments for a specific service
-  fastify.get('/deployments/service/:serviceId', async (req, reply) => {
+  // Get deployments for a specific service (admin only)
+  fastify.get('/deployments/service/:serviceId', {
+    preHandler: async (req) => requireAdmin(req),
+  }, async (req, reply) => {
    const params = TriggerDeploymentParamsSchema.parse(req.params);
    const query = QueryParamsSchema.parse(req.query);
    const deployments = await getDeploymentsByService(params.serviceId, query.limit);
    return reply.send(deployments);
  });

-  // Get single deployment
-  fastify.get('/deployments/:id', async (req, reply) => {
+  // Get single deployment (admin only)
+  fastify.get('/deployments/:id', {
+    preHandler: async (req) => requireAdmin(req),
+  }, async (req, reply) => {
    const params = DeploymentParamsSchema.parse(req.params);
    const deployment = await getDeploymentById(params.id);
    if (!deployment) {
@ -38,9 +44,13 @@ export async function deploymentRoutes(fastify: FastifyInstance) {
    return reply.send(deployment);
  });

-  // Get deployment logs (SSE disabled due to Fastify 5 compatibility)
-  // TODO: Re-enable SSE when fastify-sse-v2 supports Fastify 5
-  fastify.get('/deployments/:id/logs', async (req, reply) => {
+  // Get deployment logs (admin only). Returns the captured stdout/stderr +
+  // current status as a single JSON payload. The web client polls this for
+  // running deployments — there is intentionally no SSE/streaming variant
+  // (see server.ts for the full rationale).
+  fastify.get('/deployments/:id/logs', {
+    preHandler: async (req) => requireAdmin(req),
+  }, async (req, reply) => {
    const params = DeploymentParamsSchema.parse(req.params);
    const deployment = await getDeploymentById(params.id);

--- a/dashboard/backend/src/modules/env/repository.ts
+++ b/dashboard/backend/src/modules/env/repository.ts
@ -0,0 +1,31 @@
+import type { EnvVar } from './types.js';
+
+const envVars = new Map<string, EnvVar>();
+
+/** Return every stored environment variable, ordered by name via `localeCompare`. */
+export async function getEnvVars(): Promise<EnvVar[]> {
+  const all = [...envVars.values()];
+  all.sort((left, right) => left.name.localeCompare(right.name));
+  return all;
+}
+
+/** Look up one environment variable by id; resolves to `null` when the id is not stored. */
+export async function getEnvVar(id: string): Promise<EnvVar | null> {
+  const found = envVars.get(id);
+  if (found == null) {
+    return null;
+  }
+  return found;
+}
+
+/**
+ * Create or replace an environment variable in the in-memory store.
+ *
+ * The id is `input.id` when provided; otherwise it is derived from the name by
+ * lower-casing it and replacing every run of characters outside `[a-z0-9_]`
+ * with a single underscore. An existing entry with the same id is overwritten.
+ *
+ * @param input - Partial variable definition; only `name` is required.
+ * @returns The stored record, with `updatedAt` set to the current time.
+ */
+export async function upsertEnvVar(input: Partial<EnvVar> & { name: string }): Promise<EnvVar> {
+  const id = input.id || input.name.toLowerCase().replace(/[^a-z0-9_]+/g, '_');
+  // Resolve the secret flag once so the redaction decision and the stored flag
+  // can never disagree: an omitted `isSecret` means "secret", so its value must
+  // be redacted exactly like an explicit `true` instead of being kept in clear.
+  const isSecret = input.isSecret ?? true;
+  const envVar: EnvVar = {
+    id,
+    name: input.name,
+    value: isSecret ? 'REDACTED' : input.value ?? '',
+    isSecret,
+    source: input.source ?? 'local',
+    azureKeyVaultName: input.azureKeyVaultName,
+    azureSecretName: input.azureSecretName,
+    updatedAt: new Date().toISOString(),
+  };
+  envVars.set(id, envVar);
+  return envVar;
+}
+
+/** Remove an environment variable by id; resolves to `true` only if an entry was actually removed. */
+export async function deleteEnvVar(id: string): Promise<boolean> {
+  const removed = envVars.delete(id);
+  return removed;
+}
--- a/dashboard/backend/src/modules/env/routes.ts
+++ b/dashboard/backend/src/modules/env/routes.ts
@ -0,0 +1,61 @@
+import type { FastifyInstance } from 'fastify';
+import { BadRequestError, requireAdmin } from '../../lib/auth.js';
+import { deleteEnvVar, getEnvVar, getEnvVars, upsertEnvVar } from './repository.js';
+import { EnvVarInputSchema, EnvVarParamsSchema } from './types.js';
+
+/**
+ * Registers the `/env` CRUD routes on the given Fastify instance.
+ *
+ * Every route runs `requireAdmin(req)` as a preHandler before its handler.
+ * Request params and bodies are validated with the Zod schemas from
+ * `./types.js`; persistence is delegated to `./repository.js`.
+ */
+export async function envRoutes(fastify: FastifyInstance) {
+  // List all environment variables.
+  fastify.get('/env', {
+    preHandler: async (req) => requireAdmin(req),
+  }, async (req, reply) => {
+    return reply.send(await getEnvVars());
+  });
+
+  // Fetch a single environment variable by id; 404 when it is not stored.
+  fastify.get('/env/:id', {
+    preHandler: async (req) => requireAdmin(req),
+  }, async (req, reply) => {
+    const params = EnvVarParamsSchema.parse(req.params);
+    const envVar = await getEnvVar(params.id);
+    if (!envVar) return reply.code(404).send({ error: 'Environment variable not found' });
+    return reply.send(envVar);
+  });
+
+  // Create (or overwrite) a variable from the request body; responds 201.
+  fastify.post('/env', {
+    preHandler: async (req) => requireAdmin(req),
+  }, async (req, reply) => {
+    try {
+      const input = EnvVarInputSchema.parse(req.body) as { name: string };
+      return reply.code(201).send(await upsertEnvVar(input));
+    } catch (error) {
+      // Any Error raised in the try (validation or upsert) is rethrown as a
+      // BadRequestError carrying the original message.
+      if (error instanceof Error) throw new BadRequestError(error.message);
+      throw error;
+    }
+  });
+
+  // Upsert under the id from the URL: the path id is merged over the body
+  // before validation, so it wins over any `id` sent in the body.
+  fastify.put('/env/:id', {
+    preHandler: async (req) => requireAdmin(req),
+  }, async (req, reply) => {
+    try {
+      const params = EnvVarParamsSchema.parse(req.params);
+      const input = EnvVarInputSchema.parse({ ...(req.body as object), id: params.id }) as { name: string; id: string };
+      return reply.send(await upsertEnvVar(input));
+    } catch (error) {
+      // Same error mapping as POST /env.
+      if (error instanceof Error) throw new BadRequestError(error.message);
+      throw error;
+    }
+  });
+
+  // Delete by id; 404 when nothing was removed, otherwise an empty 204.
+  fastify.delete('/env/:id', {
+    preHandler: async (req) => requireAdmin(req),
+  }, async (req, reply) => {
+    const params = EnvVarParamsSchema.parse(req.params);
+    const deleted = await deleteEnvVar(params.id);
+    if (!deleted) return reply.code(404).send({ error: 'Environment variable not found' });
+    return reply.code(204).send();
+  });
+
+  // Placeholder endpoint: performs no sync and always reports zero synced
+  // entries plus a fixed "not configured" message.
+  fastify.post('/env/sync-azure', {
+    preHandler: async (req) => requireAdmin(req),
+  }, async (req, reply) => {
+    return reply.send({ synced: 0, errors: ['Azure Key Vault sync is not configured in this local dashboard build.'] });
+  });
+}
--- a/dashboard/backend/src/modules/env/types.ts
+++ b/dashboard/backend/src/modules/env/types.ts
@ -0,0 +1,22 @@
+import { z } from 'zod';
+
+// Full shape of a stored environment variable record.
+export const EnvVarSchema = z.object({
+  id: z.string().min(1),
+  name: z.string().min(1),
+  value: z.string().default(''),
+  // Variables are declared secret unless stated otherwise.
+  isSecret: z.boolean().default(true),
+  source: z.enum(['local', 'azure-key-vault']).default('local'),
+  azureKeyVaultName: z.string().optional(),
+  azureSecretName: z.string().optional(),
+  // ISO-8601 datetime string; the default is evaluated at parse time.
+  updatedAt: z.string().datetime().default(() => new Date().toISOString()),
+});
+
+// Route params for the `/env/:id` endpoints.
+export const EnvVarParamsSchema = z.object({
+  id: z.string().min(1),
+});
+
+// Request-body shape for create/update: every EnvVarSchema field except
+// `name` is made optional via `.partial()`, and `name` is re-added as required.
+export const EnvVarInputSchema = EnvVarSchema.omit({ name: true }).partial().extend({
+  name: z.string().min(1),
+});
+
+// Static type of a validated EnvVarSchema value.
+export type EnvVar = z.infer<typeof EnvVarSchema>;
--- a/dashboard/backend/src/modules/health/health.test.ts
+++ b/dashboard/backend/src/modules/health/health.test.ts
@ -0,0 +1,127 @@
+import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
+import type { Service } from '../services/types.js';
+
+const { checkServiceHealth, checkAllServices, clearHealthCache } = await import('./repository.js');
+
+/** Build a Service fixture from fixed defaults; any field present in `overrides` takes precedence. */
+function makeService(overrides?: Partial<Service>): Service {
+  const defaults: Service = {
+    id: 'svc-1',
+    name: 'Test Service',
+    scriptPath: '../deploy.sh',
+    healthUrl: 'https://example.com/health',
+    repoPath: '../repo',
+    status: 'up',
+    version: '1.0.0',
+    productId: 'devops-internal',
+  };
+  return { ...defaults, ...overrides };
+}
+
+describe('checkServiceHealth', () => {
+  // The global fetch is swapped for a fresh vi.fn() before every test; this
+  // accessor returns it typed as a mock so each test can program it.
+  const stubbedFetch = () => globalThis.fetch as unknown as ReturnType<typeof vi.fn>;
+
+  // Push the faked system clock forward relative to its current reading.
+  const advanceClock = (ms: number): void => {
+    vi.setSystemTime(new Date(Date.now() + ms));
+  };
+
+  beforeEach(() => {
+    clearHealthCache();
+    vi.useFakeTimers();
+    vi.setSystemTime(new Date('2026-01-01T00:00:00Z'));
+    vi.stubGlobal('fetch', vi.fn());
+  });
+
+  afterEach(() => {
+    vi.useRealTimers();
+    vi.unstubAllGlobals();
+  });
+
+  it('reports "up" for a fast 2xx response', async () => {
+    stubbedFetch().mockResolvedValue({ ok: true });
+
+    const health = await checkServiceHealth(makeService());
+
+    expect(health.status).toBe('up');
+    expect(health.serviceId).toBe('svc-1');
+    // The clock is frozen, so the check timestamp is exactly the faked "now".
+    expect(health.lastCheck).toBe('2026-01-01T00:00:00.000Z');
+    expect(health.responseTime).toBeGreaterThanOrEqual(0);
+  });
+
+  it('reports "down" for a non-2xx response', async () => {
+    stubbedFetch().mockResolvedValue({ ok: false });
+
+    const health = await checkServiceHealth(makeService({ id: 'svc-down' }));
+
+    expect(health.status).toBe('down');
+  });
+
+  it('reports "down" when fetch throws (network/timeout)', async () => {
+    stubbedFetch().mockRejectedValue(new Error('boom'));
+
+    const health = await checkServiceHealth(makeService({ id: 'svc-net' }));
+
+    expect(health.status).toBe('down');
+    // No responseTime is recorded on the failure path.
+    expect(health.responseTime).toBeUndefined();
+  });
+
+  it('caches successful results within the 30s TTL window', async () => {
+    const fetchStub = stubbedFetch();
+    fetchStub.mockResolvedValue({ ok: true });
+
+    // Three back-to-back checks of the same service at the same instant.
+    for (let attempt = 0; attempt < 3; attempt += 1) {
+      await checkServiceHealth(makeService({ id: 'svc-cache' }));
+    }
+
+    expect(fetchStub).toHaveBeenCalledTimes(1);
+  });
+
+  it('refetches after the cache TTL expires', async () => {
+    const fetchStub = stubbedFetch();
+    fetchStub.mockResolvedValue({ ok: true });
+
+    await checkServiceHealth(makeService({ id: 'svc-ttl' }));
+    advanceClock(31_000);
+    await checkServiceHealth(makeService({ id: 'svc-ttl' }));
+
+    expect(fetchStub).toHaveBeenCalledTimes(2);
+  });
+
+  it('caches failures for ~5s, not the full 30s', async () => {
+    const fetchStub = stubbedFetch();
+    fetchStub.mockRejectedValue(new Error('boom'));
+
+    await checkServiceHealth(makeService({ id: 'svc-fail' }));
+    expect(fetchStub).toHaveBeenCalledTimes(1);
+
+    // 4s later: still inside the short failure window, so no new fetch.
+    advanceClock(4_000);
+    await checkServiceHealth(makeService({ id: 'svc-fail' }));
+    expect(fetchStub).toHaveBeenCalledTimes(1);
+
+    // 6s after the first failure: the window has lapsed, so it refetches.
+    advanceClock(2_000);
+    await checkServiceHealth(makeService({ id: 'svc-fail' }));
+    expect(fetchStub).toHaveBeenCalledTimes(2);
+  });
+
+  it('clearHealthCache forces a refetch', async () => {
+    const fetchStub = stubbedFetch();
+    fetchStub.mockResolvedValue({ ok: true });
+
+    await checkServiceHealth(makeService({ id: 'svc-clear' }));
+    clearHealthCache();
+    await checkServiceHealth(makeService({ id: 'svc-clear' }));
+
+    expect(fetchStub).toHaveBeenCalledTimes(2);
+  });
+});
+
+describe('checkAllServices', () => {
+  beforeEach(() => {
+    clearHealthCache();
+    vi.stubGlobal('fetch', vi.fn());
+  });
+
+  afterEach(() => {
+    vi.unstubAllGlobals();
+  });
+
+  it('returns a result per input service in input order', async () => {
+    // Any health URL containing "bad" answers with a non-ok response.
+    const fetchStub = globalThis.fetch as unknown as ReturnType<typeof vi.fn>;
+    fetchStub.mockImplementation(async (url: string) => {
+      const healthy = !url.includes('bad');
+      return { ok: healthy };
+    });
+
+    const first = makeService({ id: 'a', healthUrl: 'https://a.example.com/health' });
+    const second = makeService({ id: 'b', healthUrl: 'https://bad.example.com/health' });
+    const third = makeService({ id: 'c', healthUrl: 'https://c.example.com/health' });
+
+    const results = await checkAllServices([first, second, third]);
+
+    expect(results).toHaveLength(3);
+    expect(results.map(({ serviceId }) => serviceId)).toEqual(['a', 'b', 'c']);
+    expect(results.map(({ status }) => status)).toEqual(['up', 'down', 'up']);
+  });
+});
--- a/dashboard/backend/src/modules/health/routes.ts
+++ b/dashboard/backend/src/modules/health/routes.ts
@ -53,16 +53,8 @@ export async function healthRoutes(fastify: FastifyInstance) {
  // Clear health cache (admin only)
  fastify.delete('/health/cache', {
    preHandler: async (req) => requireAdmin(req),
-  }, async (req, reply) => {
-    try {
-      requireAdmin(req);
+  }, async (_req, reply) => {
    clearHealthCache();
    return reply.send({ message: 'Health cache cleared' });
-    } catch (error) {
-      if (error instanceof Error) {
-        throw new BadRequestError(error.message);
-      }
-      throw error;
-    }
  });
 }
--- a/Show More
+++ b/Show More
				`@ -0,0 +1 @@`
				`@bytelyst:registry=http://localhost:3300/api/packages/learning_ai_user/npm/`