Compare commits
No commits in common. "main" and "docs/hermes-setup-upgrade-roadmap" have entirely different histories.
main
...
docs/hermes-setup-upgrade-roadmap
6
.gitattributes
vendored
6
.gitattributes
vendored
@ -1,6 +0,0 @@
|
||||
* text=auto eol=lf
|
||||
|
||||
# Enforce LF for shell scripts and text files
|
||||
*.sh text eol=lf
|
||||
*.ps1 text eol=lf
|
||||
*.md text eol=lf
|
||||
@ -1,71 +0,0 @@
|
||||
name: Shell CI — agent-queue + CLI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- 'agent-queue/**'
|
||||
- 'bytelyst-cli.sh'
|
||||
- '.gitea/workflows/shell-ci.yml'
|
||||
pull_request:
|
||||
branches: [main]
|
||||
paths:
|
||||
- 'agent-queue/**'
|
||||
- 'bytelyst-cli.sh'
|
||||
- '.gitea/workflows/shell-ci.yml'
|
||||
|
||||
concurrency:
|
||||
group: shell-ci-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
lint:
|
||||
name: shellcheck + syntax
|
||||
runs-on: [ubuntu-latest, bytelyst, hostinger]
|
||||
container:
|
||||
image: node:20-bookworm
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
github-server-url: https://gitea.bytelyst.com
|
||||
|
||||
- name: Install shellcheck
|
||||
run: |
|
||||
apt-get update -qq
|
||||
apt-get install -y -qq shellcheck
|
||||
shellcheck --version
|
||||
|
||||
- name: shellcheck (errors fail the build)
|
||||
run: |
|
||||
shellcheck --severity=error --shell=bash \
|
||||
agent-queue/agent-queue.sh \
|
||||
agent-queue/selftest.sh \
|
||||
bytelyst-cli.sh
|
||||
|
||||
- name: bash syntax check (gating, all scripts)
|
||||
run: |
|
||||
bash -n agent-queue/agent-queue.sh
|
||||
bash -n agent-queue/selftest.sh
|
||||
bash -n bytelyst-cli.sh
|
||||
|
||||
- name: agent-queue self-test (no-op engine cycle)
|
||||
run: ./agent-queue/selftest.sh
|
||||
|
||||
- name: node syntax check (dashboard)
|
||||
run: node --check agent-queue/dashboard.mjs
|
||||
|
||||
- name: smoke test (init + add + drain, no real agent)
|
||||
run: |
|
||||
set -euo pipefail
|
||||
export AGENT_QUEUE_ROOT="$PWD/.ci-queue"
|
||||
./agent-queue/agent-queue.sh init
|
||||
# task with an invalid cwd lands in failed/ without launching any agent
|
||||
printf '%s\n' '---' 'engine: devin' 'cwd: /no/such/dir' 'yolo: true' '---' '# ci' \
|
||||
> /tmp/ci-task.md
|
||||
./agent-queue/agent-queue.sh add /tmp/ci-task.md
|
||||
./agent-queue/agent-queue.sh run --once
|
||||
test -f "$AGENT_QUEUE_ROOT"/failed/*.md
|
||||
echo "smoke OK: task routed to failed/ as expected"
|
||||
rm -rf "$AGENT_QUEUE_ROOT"
|
||||
63
.github/workflows/ci.yml
vendored
63
.github/workflows/ci.yml
vendored
@ -1,63 +0,0 @@
|
||||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
shellcheck:
|
||||
name: Shellcheck
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Install shellcheck
|
||||
run: sudo apt-get update && sudo apt-get install -y shellcheck
|
||||
- name: Run shellcheck on shell scripts
|
||||
run: |
|
||||
files=$(git ls-files '*.sh' || true)
|
||||
if [ -z "$files" ]; then
|
||||
echo "No shell scripts to check"
|
||||
exit 0
|
||||
fi
|
||||
echo "$files"
|
||||
shellcheck $files
|
||||
|
||||
syntax:
|
||||
name: Syntax & EOL checks
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Fail on CRLF in scripts
|
||||
run: |
|
||||
CRLF_FILES=$(git ls-files '*.sh' | xargs -r grep -Il $'\r' || true)
|
||||
if [ -n "$CRLF_FILES" ]; then
|
||||
echo "CRLF found in the following files:"; echo "$CRLF_FILES";
|
||||
exit 1
|
||||
fi
|
||||
echo "No CRLF in shell scripts"
|
||||
- name: Bash syntax-check
|
||||
run: |
|
||||
for f in $(git ls-files '*.sh'); do
|
||||
echo "Checking $f";
|
||||
bash -n "$f";
|
||||
done
|
||||
|
||||
preview-runner:
|
||||
name: Preview installer scripts
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Bash syntax-check run_installers
|
||||
run: bash -n run_installers.sh
|
||||
- name: Preview run_installers (safe)
|
||||
run: ./run_installers.sh --preview
|
||||
|
||||
windows-preview:
|
||||
name: PowerShell preview
|
||||
runs-on: windows-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Preview PowerShell wrapper
|
||||
shell: pwsh
|
||||
run: |
|
||||
./run_installers.ps1 -Preview
|
||||
19
.gitignore
vendored
19
.gitignore
vendored
@ -10,8 +10,6 @@ __pycache__/
|
||||
venv/
|
||||
env/
|
||||
ENV/
|
||||
!dashboard/backend/src/modules/env/
|
||||
!dashboard/backend/src/modules/env/**
|
||||
|
||||
# IDE files
|
||||
.vscode/
|
||||
@ -38,7 +36,6 @@ accounts.json
|
||||
.azure/
|
||||
|
||||
# Generated outputs and local data caches
|
||||
graphify-out/
|
||||
supabase monitor/output/
|
||||
youtube/captions/
|
||||
github_repo_scanners/contributor_repos/
|
||||
@ -49,19 +46,3 @@ bytelyst-ai.json
|
||||
saravanakumardb.json
|
||||
saravanakumardb1.json
|
||||
list_repos_contributors_by_user_saravanakumardb.json
|
||||
|
||||
# Agent-queue transient runtime state (jobs move through these dirs at runtime;
|
||||
# keep the dirs via .gitkeep but never track the per-job lifecycle files).
|
||||
agent-queue/queue/.state/*
|
||||
agent-queue/queue/inbox/*
|
||||
agent-queue/queue/building/*
|
||||
agent-queue/queue/testing/*
|
||||
agent-queue/queue/review/*
|
||||
agent-queue/queue/failed/*
|
||||
agent-queue/queue/shipped/*
|
||||
agent-queue/queue/logs/*
|
||||
!agent-queue/queue/*/.gitkeep
|
||||
|
||||
# gigafactory deploy script runtime pids
|
||||
scripts/.gigafactory-platform-service.pid
|
||||
scripts/.gigafactory-tracker-web.pid
|
||||
|
||||
32
AGENTS.md
32
AGENTS.md
@ -32,7 +32,6 @@ Read these first:
|
||||
- `remove_user_guided.sh`
|
||||
- `remove_user_from_repos.sh`
|
||||
- `scripts/`
|
||||
- `scripts/tracker-seed/` - file work items into the ByteLyst tracker (see "Cutting Tracker Items")
|
||||
- `git-work-safety-tools/`
|
||||
- `github_access_scripts/`
|
||||
|
||||
@ -76,37 +75,6 @@ These may contain secrets, usernames, or operational snapshots. Avoid printing c
|
||||
4. Make the smallest coherent change set.
|
||||
5. If docs or discoverability changed, update the canonical docs listed above.
|
||||
|
||||
## Cutting Tracker Items (work tracking)
|
||||
|
||||
When the user asks to **"cut items to track"** (file feature/bug/task tickets for
|
||||
some work — e.g. the findings in `ENGINEERING_REVIEW_SCORECARD.md`), use the
|
||||
seed tooling in `scripts/tracker-seed/`. Do **not** hand-roll API calls.
|
||||
|
||||
How the tracker works:
|
||||
- Items live in the ByteLyst tracker, served by **platform-service**
|
||||
(`POST /api/items`, in `learning_ai_common_plat/services/platform-service`),
|
||||
and viewed in **tracker-web** (`learning_ai_common_plat/dashboards/tracker-web`, `:3003`).
|
||||
- Item schema: `{ productId, type: bug|feature|task, priority: critical|high|medium|low,
|
||||
title, description, labels[], source, visibility, ... }`. Items are scoped per `productId`.
|
||||
- Auth is an HS256 JWT signed with the shared `JWT_SECRET` (verified offline by
|
||||
platform-service); the seed script mints one itself.
|
||||
|
||||
Standard procedure:
|
||||
1. **Add payloads** to `scripts/tracker-seed/engineering-review-items.json`
|
||||
(or a new payload file): one entry per item, scoped to the right `productId`.
|
||||
Use the `repoToProductId` map in that file for repo → product slugs
|
||||
(e.g. `learning_ai_notes` → `notelett`, common-plat/infra → `platform`).
|
||||
2. **Preview** with no side effects: `node scripts/tracker-seed/seed-tracker-items.mjs --dry-run`.
|
||||
3. **Create for real only when the platform stack is up** (it writes real records):
|
||||
`JWT_SECRET=<secret> PLATFORM_API_URL=http://localhost:4003 node scripts/tracker-seed/seed-tracker-items.mjs`.
|
||||
The script dedupes by title per product, so re-running is safe (`--force` to bypass).
|
||||
4. If the stack is **not running** (no Docker / nothing on `:4003`), do **not**
|
||||
stand up Cosmos just to seed — commit the payloads + run instructions and tell
|
||||
the user to run the script when the stack is next up. Confirm before any live
|
||||
write (creating items is a side-effecting datastore operation).
|
||||
|
||||
See `scripts/tracker-seed/README.md` for full details.
|
||||
|
||||
## Good First Checks
|
||||
|
||||
```bash
|
||||
|
||||
@ -34,12 +34,6 @@ Do not assume a single dependency graph or runtime model across the whole repo.
|
||||
- `remove_user_from_repos.sh`
|
||||
- `git-work-safety-tools/*.sh`
|
||||
|
||||
## Cutting Tracker Items
|
||||
|
||||
When asked to "cut items to track", use `scripts/tracker-seed/` (seeds the
|
||||
ByteLyst tracker via platform-service `POST /api/items`). Do not hand-roll API
|
||||
calls. Full procedure is in the "Cutting Tracker Items" section of `AGENTS.md`.
|
||||
|
||||
## Safety Notes
|
||||
|
||||
- Treat `accounts.json`, account snapshot JSON files, `.env` files, and generated contributor/output data as sensitive.
|
||||
|
||||
@ -1,335 +0,0 @@
|
||||
# Engineering Review & Scorecard
|
||||
|
||||
> Evidence-based, read-only review of the entire `~/code/mygh` workspace (~38 git
|
||||
> repos) per `docs/prompts/engineering-review-scorecard.md`. Generated 2026-05-30.
|
||||
>
|
||||
> **Method:** static inspection only — file reads, `grep`, and read-only `git`.
|
||||
> No builds, installs, or test runs were executed (that would mutate the trees),
|
||||
> so dynamic results (pass/fail, coverage %) are inferred from config + test
|
||||
> counts, not measured. See §9 for limits. Per-repo evidence was gathered by
|
||||
> parallel read-only agents and spot-verified.
|
||||
|
||||
---
|
||||
|
||||
## 1. Executive Summary
|
||||
|
||||
**What this is:** a single developer running a surprisingly coherent *product
|
||||
ecosystem* — about a dozen product apps (clock, notes, fastgap, peakpulse, flowmonk,
|
||||
efforise, jarvis_jr, trails, talk2obsidian, local-memory-gpt, voice-ai-agent,
|
||||
multimodal/mindlyst) sharing one platform monorepo (`learning_ai_common_plat`,
|
||||
36 `@bytelyst/*` packages, auth/Cosmos/design-tokens), orchestrated by a single
|
||||
`docker-compose.ecosystem.yml` (~20 services) and driven heavily by AI agents
|
||||
through a homegrown `agent-queue`. This is far more disciplined than a typical
|
||||
"learning" folder.
|
||||
|
||||
**Overall maturity:** **Beta-quality ecosystem.** A core of genuinely
|
||||
production-grade repos (`learning_ai_notes`, `learning_ai_trails`,
|
||||
`oss/claw-code`/`claw-cowork`, `learning_ai_clock`, `learning_ai_fastgap`)
|
||||
surrounded by a long tail of MVP/prototype repos with thin or zero tests and no
|
||||
CI.
|
||||
|
||||
**Biggest strengths (top 3)**
|
||||
1. **Strong platform discipline.** Shared `@bytelyst/*` packages, a repeated
|
||||
`types.ts → repository.ts → routes.ts` backend pattern, Cosmos partition-key
|
||||
conventions (`/userId`, `productId` on every doc), per-repo `AGENTS.md`,
|
||||
conventional commits, and field-level encryption (`field-encrypt.ts`) recur
|
||||
across the best repos.
|
||||
2. **Clean security posture for a personal workspace.** Secret scans across all
|
||||
repos surfaced **no real committed production secrets** — only `.env.example`
|
||||
placeholders, the public Azure Cosmos emulator key, dev `JWT_SECRET=dev-...`
|
||||
values, and Azure Key Vault *references*. `.gitignore` is present nearly
|
||||
everywhere.
|
||||
3. **Top repos are legitimately good.** `notes`, `trails`, and the two Rust
|
||||
`claw-*` repos show modular architecture, real test suites (28–80+ files),
|
||||
CI, multi-stage Docker, and strict typing (`0` `as any` in several backends).
|
||||
|
||||
**Biggest risks (top 3)**
|
||||
1. **CI is the weak link.** GitHub Actions is **disabled (billing)** on the
|
||||
platform monorepo `learning_ai_common_plat` and on `voice_ai_agent`
|
||||
(`*.disabled` workflows); ~15 repos have **no CI at all**. The shared
|
||||
platform that everything depends on has no enforced automated gate.
|
||||
2. **Process churn dirties the repos.** A live `agent-queue` daemon + `devin`
|
||||
agents in `--permission-mode dangerous` were actively writing to repos; ~14
|
||||
repos were found dirty with uncommitted work, several behind `origin`. Work
|
||||
is at risk of being lost or silently diverging.
|
||||
3. **Testing is bimodal.** Excellent in the flagship repos, **zero** in many
|
||||
others (`productivity_web`, `webui_copilot`, `pytorch_todo_predictor`,
|
||||
`server-survival`, `sidecar_setup`, `mac_tooling`). No portfolio-wide
|
||||
coverage signal.
|
||||
|
||||
**Is the dev style helping or hurting velocity?** **Net helping, but fraying at
|
||||
the edges.** The platform/agent approach clearly lets one person ship a dozen
|
||||
apps — that's the upside. The drag is operational: disabled CI, constantly-dirty
|
||||
working trees, abandoned worktrees, and "AI-generated scaffolding smell" in a
|
||||
few repos (e.g. `magic_clipboard_mgr`'s 50+ service files + phase-named test
|
||||
buckets). Tightening the commit/CI loop would convert a lot of that churn back
|
||||
into velocity.
|
||||
|
||||
---
|
||||
|
||||
## 2. Overall Score Sheet
|
||||
|
||||
Scores are 1–10 (1 = critical/broken, 10 = production-grade), aggregated across
|
||||
the ~30 code repos (pure docs/usage repos excluded from category math).
|
||||
|
||||
| Category | Score | Justification (evidence) |
|
||||
|---|---|---|
|
||||
| A. Repository organization | **8** | Consistent `@bytelyst/*` + `types/repository/routes` pattern, per-repo `AGENTS.md`, clear monorepos; minus for ~14 dirty trees, stray worktrees, a few unstructured repos. |
|
||||
| B. Code quality | **7** | Flagships: strict TS, `0` `as any`, no `console.log`, Zod validation. Tail: `print()`-heavy (`2nd_brain` 60+, `mac_tooling` 200+), `any` leaks, AI-scaffold smell (`magic_clipboard_mgr`). |
|
||||
| C. Architecture | **8** | Genuinely strong: shared platform, datastore abstraction, deterministic engines (`flowmonk` scheduler), risk-scoring (`trails`), MCP integrations, clean native/web boundaries. |
|
||||
| D. DevOps & deployment | **6** | Ecosystem compose orchestrates ~20 services, multi-stage Dockerfiles common — but **CI disabled on the platform repo**, ~15 repos with no CI, and **0 healthchecks** in `docker-compose.ecosystem.yml`. |
|
||||
| E. Testing | **6** | Bimodal: `notes`/`fastgap`/`clock`/`trails`/`claw-*` have substantial suites (28–80+ test files; 660–700+ tests in `clock`/`fastgap`); many repos have 0. E2E jobs frequently set `continue-on-error: true`. No measured coverage. |
|
||||
| F. Security | **8** | No real committed secrets anywhere; field encryption + Key Vault refs in the mature repos; `.gitignore`/`.env.example` discipline. Minus for `NODE_TLS_REJECT_UNAUTHORIZED=0` in some Docker, thin input-validation in prototypes. |
|
||||
| G. Product readiness | **7** | Several apps runnable end-to-end (web+backend); mobile/native surfaces often partial; CI-disabled + flaky E2E hold back true "launchable". |
|
||||
| H. AI-agent practices | **6** | Impressive tooling (`agent-queue`, profiles, job briefs, `AGENTS.md`), but guardrails are weak: `--permission-mode dangerous`, agents dirtying live repos, duplicate work landing upstream, no enforced test-before-commit. |
|
||||
| I. Personal workflow | **6** | Good: conventional commits, auto `backup-main-*` branches, `AGENTS.md`. Bad: ~14 dirty repos, branches behind `origin`, abandoned worktrees, no unified release/issue discipline. |
|
||||
| **Weighted overall** | **≈ 7.0** | Beta-quality. See weighting below. |
|
||||
|
||||
**Weighting & rationale:** Security (F) and Product readiness (G) weighted ~1.5×,
|
||||
Testing (E) and DevOps (D) ~1.25× (these gate real-world reliability);
|
||||
A/B/C/H/I at 1.0×. The strong architecture/security pull the number up; the
|
||||
weak CI/testing pull it back to a solid-but-not-shippable **~7.0**.
|
||||
|
||||
---
|
||||
|
||||
## 3. Per-Product / Per-Repo Breakdown
|
||||
|
||||
Maturity legend: **PROD** = production-grade, **BETA**, **MVP**, **PROTO** =
|
||||
prototype/learning, **REF** = docs/reference (not code).
|
||||
|
||||
### Flagship products (platform-integrated)
|
||||
| Repo | Stack | Tests | CI | Docker | Maturity |
|
||||
|---|---|---|---|---|---|
|
||||
| `learning_ai_notes` | Fastify5 + Next16 + Expo, Cosmos | 80+ files | ✓ gitea | ✓ | **BETA→PROD** |
|
||||
| `learning_ai_trails` | Fastify5 + Next16 + SDK, Cosmos | 28 files | ✓ gitea | ✓ | **PROD** |
|
||||
| `learning_ai_clock` | Next16 PWA + iOS/Android, Fastify | 662 total | ✓ gitea | ✓ | **BETA** |
|
||||
| `learning_ai_fastgap` | Expo + Next16 + Fastify | 700+ total | ✓ gitea (7 jobs) | ✓ | **BETA** |
|
||||
| `learning_ai_peakpulse` | SwiftUI + Fastify | 26 files | ✓ (backend) | ✓ | **BETA→PROD** |
|
||||
| `learning_ai_flowmonk` | Next16 + Fastify + Expo | 102 backend | ✓ gitea | ✓ | **BETA** |
|
||||
| `learning_ai_efforise` | React/Vite + Fastify + RN | ~9 backend | ✓ gitea | ✓ | **MVP** |
|
||||
| `learning_ai_dev_intelli` | Fastify + Next16, GitHub API | 52 backend | ✓ gitea | ✓ | **MVP** |
|
||||
| `learning_ai_local_memory_gpt` | Fastify + Next16, SQLite/Ollama | 122 | ✓ gitea | ✓ | **MVP** |
|
||||
| `learning_ai_talk2obsidian` | Fastify + Vite, SQLite/Ollama | 8 | ✗ | ✓ | **BETA** |
|
||||
| `learning_voice_ai_agent` | Python + Fastify + Next + KMP | 463+ | ⚠ disabled | ✓ | **BETA** |
|
||||
| `learning_multimodal_memory_agents` (MindLyst) | KMP + Next + Fastify | 33 | ⚠ disabled | ✓ | **MVP** |
|
||||
| `learning_ai_jarvis_jr` | SwiftUI + Next + Android | ~13 web | ✓ gitea | ✓ | **ALPHA/BETA** |
|
||||
| `learning_ai_auth_app` | iOS/watchOS/Android (spec+UI) | 0 (here) | ✗ | ✗ | **MVP (spec)** |
|
||||
|
||||
### Platform & infra
|
||||
| Repo | Stack | Notes | Maturity |
|
||||
|---|---|---|---|
|
||||
| `learning_ai_common_plat` | pnpm monorepo, 36 `@bytelyst/*`, Fastify, Cosmos | ~466k LOC; full auth (OAuth/MFA/passkeys/SAML); **GH Actions disabled (billing)**, gitea CI active | **PROD** |
|
||||
| `learning_ai_devops_tools` | Bash + Python + Node (this repo) | GitHub admin scripts, `agent-queue`, Hermes dashboard; thin tests | **PROD (scripts) / MVP (dash)** |
|
||||
| `learning_ai_k8s_streaming` | Python FastAPI + Helm | Use-case registry, HPA/probes, load tools | **BETA→PROD** |
|
||||
| `learning_ai_local_llms` | Next16 dashboard + Python TTS | Ollama mission-control; 57 tests | **BETA** |
|
||||
|
||||
### Tools / OSS / native
|
||||
| Repo | Stack | Notes | Maturity |
|
||||
|---|---|---|---|
|
||||
| `oss/learning_ai_claw-code-oss` | Rust workspace (10+ crates) | `unsafe forbid`, clippy pedantic, 40+ test files | **PROD** |
|
||||
| `oss/learning_ai_claw-cowork` | Rust + Tauri + Python | 65+ test files, E2E, Docker | **PROD** |
|
||||
| `learning_magic_terminal` | **Rust** | README+CI+many tests; command-blocks v2; dirty(5) | **BETA** |
|
||||
| `learning_notif_scanr` | **Swift** (Package.swift) | tests present, **no CI**, no Docker | **MVP** |
|
||||
| `ios/learning_swift_hourglass` | Swift/SwiftUI macOS | MVVM, 2 test files, no CI | **MVP** |
|
||||
| `learning_ai_magic_clipboard_mgr` | Swift/macOS, GRDB | 24 tests but 50+ services + phase-named tests (AI-scaffold smell) | **MVP** |
|
||||
| `learning_ai_mac_tooling` | Python FastAPI + React | forensics toolkit; **0 tests**, 200+ `print()`, 3k-line files | **PROTO** |
|
||||
| `copilot/learning_ai_uxui_web` | Next16 + MSW + Playwright | component showcase, Lighthouse CI | **MVP** |
|
||||
| `learning_ai_productivity_web` | Next15, client-only | clean registry pattern, **0 tests** | **MVP** |
|
||||
| `learning_ai_webui_copilot` | Python FastAPI + LangChain | rules/policy engines, **0 tests, no Docker/CI** | **MVP** |
|
||||
| `learning_agent_monitoring_fx` | npm monorepo + KMP | agent/ingest/web work, native WIP, 54 `console.log`, TODOs | **BETA** |
|
||||
| `learning_agentic_tools_portal` | Python Flask + uv | minimal (1 endpoint, 1 test), has CI | **PROTO** |
|
||||
| `learning_server-survival-devops-web` | Vanilla JS + Three.js | playable game, **0 tests** | **MVP** |
|
||||
| `learning_pytorch_todo_predictor` | Python + PyTorch | educational, **0 tests**, **no upstream** | **PROTO** |
|
||||
| `learning_sidecar_setup` | Next16 scaffold + py stub | scaffolding only, **no upstream**, dirty(8) | **PROTO** |
|
||||
| `learning_claude_code_setup` | Bash + markdown | setup notes/scripts; dirty(1) | **REF** |
|
||||
| `learning_github_copilot` | Markdown (CLI/SDK docs) | reference only | **REF** |
|
||||
| `learning_python_sandbox` | Python | LeetCode/learning; dirty(1) | **PROTO** |
|
||||
| `learning_ai_materials` | Docs | NBA handover package | **REF** |
|
||||
| `learning_windsurf_setup` | Usage logs | not a codebase | **N/A** |
|
||||
|
||||
---
|
||||
|
||||
## 4. Findings by Dimension
|
||||
|
||||
### A. Repository organization
|
||||
- **Fact:** Strong, repeated conventions — `AGENTS.md`/`CLAUDE.md` per repo, pnpm
|
||||
workspaces, `types→repository→routes` backend modules, `docs/` with PRD/ROADMAP.
|
||||
- **Fact:** ~14 repos dirty at audit time; abandoned `worktrees/` (now cleaned);
|
||||
some repos behind `origin`. Two repos (`pytorch_todo_predictor`,
|
||||
`sidecar_setup`) have **no git upstream**.
|
||||
- **Reco:** Adopt a "clean tree or it doesn't exist" rule (see §8). Add upstreams
|
||||
for the two orphan repos or mark them clearly local.
|
||||
|
||||
### B. Code quality
|
||||
- **Fact:** Best repos enforce strict TS (`0` `as any` in `notes`, `trails`,
|
||||
`local_memory_gpt` backends), no `console.log` (Fastify logger), Zod validation.
|
||||
- **Fact:** `learning_ai_2nd_brain` has 60+ `print()`; `mac_tooling` 200+ and
|
||||
3k+-line files (`network_transfer_audit.py` 3521 lines); `magic_clipboard_mgr`
|
||||
shows AI-scaffold smell (50+ service files, `Phase5–8`/`RemainingQATests`).
|
||||
- **Reco:** Lint-gate `print()`/`console.log` in the Python/TS repos; split the
|
||||
3k-line files; audit `magic_clipboard_mgr` for stubbed vs real services.
|
||||
|
||||
### C. Architecture
|
||||
- **Fact:** Clear separation and reuse: shared auth/datastore/design-tokens,
|
||||
deterministic scheduler (`flowmonk`), risk engine (`trails`), use-case registry
|
||||
(`k8s_streaming`), MCP tool servers, Rust crate boundaries (`claw-*`).
|
||||
- **Reco:** This is the strongest dimension — protect it by keeping product
|
||||
domains out of `common_plat` and platform concerns out of the product repos.
|
||||
|
||||
### D. DevOps & deployment
|
||||
- **Fact:** `docker-compose.ecosystem.yml` wires ~20 services (10 backends + 10
|
||||
webs) + infra (Cosmos emulator, Azurite, Traefik, Loki, Grafana, MCP); 30
|
||||
`restart:` policies, 24 `build:` contexts, but **0 `healthcheck:` blocks**.
|
||||
- **Fact:** GH Actions is disabled on `common_plat` and `voice_ai_agent`; ~15 repos have no CI.
|
||||
- **Reco (P1):** Add healthchecks + `depends_on: condition: service_healthy` to
|
||||
the ecosystem compose; re-enable or fully migrate CI to gitea self-hosted.
|
||||
|
||||
### E. Testing
|
||||
- **Fact:** `fastgap` (~700), `clock` (662), `notes` (80+ files), `voice_ai_agent`
|
||||
(463+), `claw-cowork` (65+ files) are excellent; ~8 repos have 0 tests.
|
||||
- **Fact:** E2E jobs often set `continue-on-error: true` (`fastgap`, `flowmonk`,
|
||||
`jarvis_jr`, `local_memory_gpt`) — i.e. not actually gating.
|
||||
- **Reco:** Set a per-repo minimum (smoke + happy-path) and stop masking E2E
|
||||
failures with `continue-on-error` once stabilized.
|
||||
|
||||
### F. Security
|
||||
- **Fact:** No real committed secrets across all repos. Matches were
|
||||
`.env.example` placeholders, the public Cosmos emulator key
|
||||
(`C2y6yDjf5/R...`), `dev-*` JWT secrets, and Azure Key Vault references.
|
||||
- **Fact:** Field encryption (AES-256-GCM) in `clock`/`notes`/`dev_intelli`;
|
||||
`unsafe_code = "forbid"` in the Rust repos.
|
||||
- **Watch:** `NODE_TLS_REJECT_UNAUTHORIZED=0` seen in some Docker setups; thin
|
||||
input validation / no rate-limiting in the prototype Python apps.
|
||||
|
||||
### G. Product readiness
|
||||
- **Fact:** Web+backend pairs generally run end-to-end; native/mobile surfaces
|
||||
(iOS/Android/KMP) are frequently partial or scaffolded.
|
||||
- **Reco:** Pick 2–3 flagships (`notes`, `trails`, `clock`) and drive them to a
|
||||
true launch checklist; treat the rest explicitly as experiments.
|
||||
|
||||
### H. AI-agent practices
|
||||
- **Fact:** Sophisticated `agent-queue` (profiles, job briefs, lifecycle dirs,
|
||||
Node dashboard) — genuinely advanced for a solo setup.
|
||||
- **Fact:** Guardrails weak: agents run `--permission-mode dangerous`, write to
|
||||
live working trees (caused the dirty-repo churn), and **landed duplicate work**
|
||||
(during this session a rebase auto-dropped 2 commits already pushed upstream).
|
||||
- **Reco:** Standardize the agent task contract (§8): one task = one branch =
|
||||
clean tree → tests → commit → push; ignore runtime/queue state in git (already
|
||||
fixed in this repo this session).
|
||||
|
||||
### I. Personal engineering workflow
|
||||
- **Fact:** Conventional commits, auto `backup-main-*` branches (nice safety net),
|
||||
`AGENTS.md` discipline.
|
||||
- **Fact:** Too many long-lived dirty trees and behind-`origin` branches; no
|
||||
visible issue tracker or release cadence.
|
||||
- **Reco:** A weekly "sync sweep" (rebase+push all clean repos, list dirty) — you
|
||||
effectively did this manually this session; automate it.
|
||||
|
||||
---
|
||||
|
||||
## 5. Prioritized Action Plan
|
||||
|
||||
**P0 — now (correctness / risk)**
|
||||
1. **Re-establish a working CI gate on `learning_ai_common_plat`** (everything
|
||||
depends on it). Either fix GH Actions billing or make gitea CI the enforced
|
||||
gate. *(M, common_plat)*
|
||||
2. **Resolve the ~14 dirty repos**: review + commit or discard intentionally;
|
||||
add upstreams for `pytorch_todo_predictor` & `sidecar_setup`. *(M, workspace)*
|
||||
3. **Decide the agent-queue daemon policy** so it doesn't write to live trees
|
||||
uncontrolled (it was running in `dangerous` mode). *(S, devops_tools)*
|
||||
|
||||
**P1 — this week**
|
||||
4. Add **healthchecks** to `docker-compose.ecosystem.yml` (0 today) + ordered
|
||||
`depends_on`. *(M, common_plat/ecosystem)*
|
||||
5. Stop masking E2E with `continue-on-error: true` once stabilized; make at least
|
||||
smoke E2E gating. *(M, fastgap/flowmonk/jarvis_jr)*
|
||||
6. Replace `print()` with logging in `2nd_brain` (60+) and `mac_tooling` (200+).
|
||||
*(S–M)*
|
||||
|
||||
**P2 — this month**
|
||||
7. Add minimum test suites to the 0-test repos that matter (`productivity_web`,
|
||||
`webui_copilot`, `agent_monitoring_fx`). *(M)*
|
||||
8. Audit `magic_clipboard_mgr` for dead/stubbed services (50+ files). *(M)*
|
||||
9. Split 3k-line files in `mac_tooling`. *(M)*
|
||||
10. Remove `NODE_TLS_REJECT_UNAUTHORIZED=0` from Docker; add rate-limiting to the
|
||||
Python prototypes. *(S–M)*
|
||||
|
||||
**P3 — nice to have**
|
||||
11. Portfolio-wide coverage reporting + dependency audit (`npm audit`/`pip-audit`)
|
||||
in CI. *(M)*
|
||||
12. A lightweight issue/release cadence for the 2–3 flagships. *(S)*
|
||||
|
||||
---
|
||||
|
||||
## 6. Safe Auto-Fix Candidates
|
||||
*(Low-risk; listed only — not applied. Each needs your approval.)*
|
||||
- **Ecosystem compose healthchecks** — add `healthcheck:` to each backend/web
|
||||
service in `docker-compose.ecosystem.yml`. Safe: additive.
|
||||
- **Add upstreams** for `learning_pytorch_todo_predictor` and
|
||||
`learning_sidecar_setup` (`git remote add origin … && git push -u`). Safe once
|
||||
remote exists.
|
||||
- **Lint rule to ban `print()`** in `learning_ai_2nd_brain` (ruff `T20`) — flags
|
||||
only; you fix incrementally.
|
||||
- **Drop `NODE_TLS_REJECT_UNAUTHORIZED=0`** from Docker envs where a real CA/host
|
||||
override is available. (Verify per service first.)
|
||||
- **`.gitignore` audit** for the few repos still tracking runtime artifacts
|
||||
(pattern already fixed in `devops_tools` this session).
|
||||
|
||||
## 7. Delegate-to-Agent Queue
|
||||
Ready-to-paste briefs (each self-contained, one branch, clean-tree rule):
|
||||
1. **"Add healthchecks to ecosystem compose"** — repo `common_plat`; read
|
||||
`docker-compose.ecosystem.yml`; add `healthcheck` + ordered `depends_on` to
|
||||
all `*-backend`/`*-web` services; `docker compose config` must pass; no app
|
||||
code changes.
|
||||
2. **"De-`print()` 2nd_brain"** — repo `learning_ai_2nd_brain`; replace `print()`
|
||||
with `typer.echo`/logging in `src/brain/**`; keep behavior identical; run
|
||||
`pytest`.
|
||||
3. **"Bootstrap tests for webui_copilot"** — repo `learning_ai_webui_copilot`;
|
||||
add `pytest` smoke tests for `site_backend` rules/policy engines + a copilot
|
||||
happy-path; wire a `.github`/gitea CI job.
|
||||
4. **"Service audit: magic_clipboard_mgr"** — repo `learning_ai_magic_clipboard_mgr`;
|
||||
produce a report of which of the 50+ services are wired vs stubbed; no code
|
||||
changes.
|
||||
5. **"Stabilize E2E"** — repos `fastgap`/`flowmonk`; make smoke E2E reliable, then
|
||||
remove `continue-on-error: true` for that job only.
|
||||
|
||||
## 8. Recommended Standard Operating Procedure (for every agent task)
|
||||
1. **One task = one branch** off latest `origin/main`; never work on a dirty tree.
|
||||
2. **Scope it** with a job brief (you already do this in `agent-queue/docs/jobs/`).
|
||||
3. **Test before commit**: typecheck + lint + unit must pass locally.
|
||||
4. **Commit small**, conventional messages; **push the branch**, open a PR — don't
|
||||
let agents push straight to `main` of the shared platform.
|
||||
5. **Never track runtime/queue state** (ignore `agent-queue/queue/*` lifecycle —
|
||||
fixed here this session).
|
||||
6. **Prefer least-privilege** over `--permission-mode dangerous`; reserve dangerous
|
||||
mode for sandboxed/disposable checkouts.
|
||||
7. **Weekly sync sweep**: rebase+push all clean repos, list dirty ones for review.
|
||||
|
||||
## 9. What I Could Not Inspect
|
||||
- **No dynamic results.** I did not run `npm/pnpm install`, builds, `pytest`,
|
||||
`vitest`, Playwright, `cargo test`, or `docker compose up` (those mutate trees /
|
||||
need services). Test counts and CI configs are evidence of *intended* coverage,
|
||||
not measured pass/coverage.
|
||||
- **No live `git` per-repo ahead/behind** inside the read-only agents (they lacked
|
||||
shell git); branch/dirty facts come from the orchestrator's own checks and may
|
||||
have shifted as the agent-queue daemon ran.
|
||||
- **One agent batch misfired**: it reported 5 repos as "missing"
|
||||
(`claude_code_setup`, `github_copilot`, `magic_terminal`, `notif_scanr`,
|
||||
`python_sandbox`) due to a read-access issue; I re-scanned them directly —
|
||||
they exist (notably `magic_terminal` = Rust, `notif_scanr` = Swift).
|
||||
- **Mobile/native depth** (iOS/Android/KMP/Tauri runtime behavior) and **secret
|
||||
*values*** were not executed/decrypted — only presence/format was checked.
|
||||
- **`.env.ecosystem`** holds dev-only values; production secret management
|
||||
(Key Vault wiring) was inferred from references, not verified live.
|
||||
|
||||
---
|
||||
|
||||
### TL;DR
|
||||
- Coherent **beta-grade product ecosystem** (~38 repos) — far beyond "learning".
|
||||
- **Architecture & security are strong; CI & testing are the weak links.**
|
||||
- **P0:** restore a CI gate on `common_plat`, clean the ~14 dirty repos, and rein
|
||||
in the `dangerous`-mode agent-queue.
|
||||
- A handful of flagships (`notes`, `trails`, `claw-*`, `clock`, `fastgap`) are
|
||||
genuinely production-grade; the long tail is MVP/prototype.
|
||||
- Tighten the agent commit/CI loop (§8) and most of the operational churn
|
||||
converts back into velocity.
|
||||
@ -42,14 +42,6 @@ If you are new to the repo, read these in order:
|
||||
|
||||
These are for scanning many repositories, checking dirty state, and performing safer batch git workflows.
|
||||
|
||||
### Work Tracking ("cut items to track")
|
||||
|
||||
- `scripts/tracker-seed/seed-tracker-items.mjs`
|
||||
- Files feature/bug/task items into the ByteLyst tracker (platform-service `POST /api/items`, viewed in tracker-web), scoped per `productId`.
|
||||
- Preview safely: `node scripts/tracker-seed/seed-tracker-items.mjs --dry-run`
|
||||
- Create (stack up): `JWT_SECRET=<secret> PLATFORM_API_URL=http://localhost:4003 node scripts/tracker-seed/seed-tracker-items.mjs`
|
||||
- See [scripts/tracker-seed/README.md](scripts/tracker-seed/README.md) and the "Cutting Tracker Items" section in [AGENTS.md](AGENTS.md).
|
||||
|
||||
### Deployment Operations
|
||||
|
||||
- `./deployment-status.sh`
|
||||
|
||||
@ -1,46 +0,0 @@
|
||||
Installation guide — learning_ai_devops_tools
|
||||
|
||||
Purpose
|
||||
|
||||
This repository contains interactive, safe installers and helpers to install CLI tools (Claude Code, OpenAI Codex, Antigravity agy, Devin, GitHub Copilot) on WSL/Ubuntu, macOS, and Windows.
|
||||
|
||||
Files of interest
|
||||
|
||||
- install_clis_wsl.sh — interactive WSL installer (WSL/Ubuntu). Preview and confirm before running remote installers.
|
||||
- make_symlinks_wsl.sh — creates /usr/local/bin symlinks (requires sudo)
|
||||
- run_installers.sh — cross-platform wrapper to run installers from WSL or show instructions
|
||||
- run_installers.ps1 — Windows PowerShell wrapper to run WSL or show Windows-native steps
|
||||
- cli-install-report.md — generated report of installs (example)
|
||||
|
||||
Quick start (WSL/Ubuntu)
|
||||
|
||||
1. Open WSL (Ubuntu) shell.
|
||||
2. cd /mnt/d/SANDBOX/mygh/learning_ai_devops_tools
|
||||
3. Ensure scripts use LF and are executable:
|
||||
sudo apt-get update && sudo apt-get install -y dos2unix
|
||||
dos2unix install_clis_wsl.sh run_installers.sh make_symlinks_wsl.sh || true
|
||||
chmod +x install_clis_wsl.sh run_installers.sh make_symlinks_wsl.sh
|
||||
4. Run the interactive installer (will preview each remote installer and ask confirmation):
|
||||
bash -i ./install_clis_wsl.sh
|
||||
|
||||
Quick start (Windows PowerShell with WSL)
|
||||
|
||||
- From PowerShell run (recommended):
|
||||
wsl bash -ic "cd /mnt/d/SANDBOX/mygh/learning_ai_devops_tools && dos2unix install_clis_wsl.sh || true && bash -i ./install_clis_wsl.sh"
|
||||
|
||||
Quick start (macOS)
|
||||
|
||||
- Inspect installers first. macOS support is similar to Linux; use the run_installers.sh wrapper to list commands. Do NOT pipe unknown scripts to a shell without review.
|
||||
|
||||
Security and safety
|
||||
|
||||
- All remote installers are previewed before execution.
|
||||
- No secrets or API keys are written to shell profiles.
|
||||
- Auth steps are left interactive (use the tool's login commands).
|
||||
|
||||
Developer notes
|
||||
|
||||
- Use .gitattributes to enforce LF endings on shell scripts across platforms.
|
||||
- To reproduce: run the scripts from a fresh WSL Ubuntu session and follow interactive prompts.
|
||||
|
||||
If you want, run './run_installers.sh' to get an interactive cross-platform flow.
|
||||
3
agent-queue/.gitignore
vendored
3
agent-queue/.gitignore
vendored
@ -1,3 +0,0 @@
|
||||
# Queue contents are tracked in-repo by request (prompts, logs, state) so no data is lost.
|
||||
# NOTE: daemon.pid + .state heartbeats are pure runtime and will churn/conflict in git —
|
||||
# remove them from tracking (re-add a narrow ignore) if the noise becomes a problem.
|
||||
@ -1,556 +0,0 @@
|
||||
# agent-queue
|
||||
|
||||
A zero-dependency **folder "kanban" runner** for headless coding-agent CLIs —
|
||||
**Devin**, **Claude Code**, and **OpenAI Codex**. Drop prompt `.md` files into a folder,
|
||||
and they get executed (in auto-approve mode) one slot at a time, moving through
|
||||
`inbox → building → review → testing → shipped` (plus `failed`) with live status.
|
||||
|
||||
> **Vision & roadmap:** where this is headed — a distributed multi-machine "gigafactory"
|
||||
> (fleet of factories × tools × profiles, scheduler-routed, built on platform-service +
|
||||
> tracker-web) — is specified as a checklist-driven implementation roadmap in
|
||||
> [`docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md`](docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md).
|
||||
> A full architecture overview, diagrams, code map and onboarding live alongside it in
|
||||
> [`docs/GIGAFACTORY/`](docs/GIGAFACTORY/).
|
||||
|
||||
> **Run safety:** how the daemon and its agents must operate (isolated worktrees,
|
||||
> branch-per-task, least-privilege instead of blanket `--yolo`/dangerous on live
|
||||
> checkouts) is defined in [`docs/RUN_POLICY.md`](docs/RUN_POLICY.md). Read it
|
||||
> before enabling `yolo: true`.
|
||||
|
||||
**Build/ship lifecycle — auto-QA, manual ship:**
|
||||
|
||||
```
|
||||
inbox ─▶ building ─▶ review ─▶ testing ─▶ shipped
|
||||
(queued) (agent (rc=0; (verify (you ran
|
||||
running) awaiting passed — `ship`)
|
||||
verify) QA gate)
|
||||
│
|
||||
agent rc≠0 / │ verify fails
|
||||
timeout ──────────┴──────────────▶ failed
|
||||
```
|
||||
|
||||
- **Auto:** agent exits 0 → `review/`. If a `verify:` command is configured it runs
|
||||
automatically: **pass → `testing/` (QA)**, **fail → `failed/`**. No `verify:` →
|
||||
the job parks in `review/` for a manual `promote`.
|
||||
- **Manual:** you `ship` a `testing/` job → `shipped/` (the human gate). Shipping is
|
||||
never automatic.
|
||||
|
||||
> **Why this exists:** the agent CLIs ship a minimal local interface (no built-in
|
||||
> batch/queue/dashboard — that lives in their *cloud* products). This is the
|
||||
> zero-dependency bash glue that turns "run one prompt interactively" into
|
||||
> "queue many and walk away."
|
||||
|
||||
---
|
||||
|
||||
## Quick start
|
||||
|
||||
```bash
|
||||
cd learning_ai_devops_tools/agent-queue
|
||||
chmod +x agent-queue.sh
|
||||
./agent-queue.sh init
|
||||
|
||||
# queue a roadmap for Devin, running in the tracker-web repo, auto-approving everything
|
||||
./agent-queue.sh add ~/roadmaps/UX-2.md \
|
||||
--engine devin \
|
||||
--cwd /Users/sd9235/code/mygh/learning_ai_common_plat/dashboards/tracker-web \
|
||||
--yolo
|
||||
|
||||
# start processing (foreground; Ctrl-C to stop). Run up to 3 agents at once (default).
|
||||
./agent-queue.sh run --max 3
|
||||
```
|
||||
|
||||
In a **second terminal**, watch progress:
|
||||
|
||||
```bash
|
||||
./agent-queue.sh watch
|
||||
```
|
||||
|
||||
```
|
||||
AGENT QUEUE /…/agent-queue/queue
|
||||
inbox 3 building 2 review 1 testing 2 shipped 5 failed 0 running 2/3
|
||||
|
||||
RUNNING
|
||||
20260528-2130__UX-2 devin 4m12s pid 51234 ⏺ Edited src/app/dashboard/items/page.tsx
|
||||
20260528-2131__UX-3 claude 1m02s pid 51290 Running: pnpm typecheck
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## How a task is configured
|
||||
|
||||
Each `.md` carries optional **frontmatter** telling the runner which engine to use,
|
||||
which directory to run in, and whether to auto-approve:
|
||||
|
||||
```md
|
||||
---
|
||||
engine: devin # devin | claude | codex | copilot (default: $AGENT_QUEUE_ENGINE)
|
||||
cwd: /abs/path/to/repo # where the agent executes (default: cwd when added)
|
||||
yolo: true # auto-approve ALL tools (default: true)
|
||||
lock: my-repo # optional mutex key (default: cwd). Jobs sharing a key run serially
|
||||
timeout: 45m # optional. 90s|45m|2h|1d. On expiry → failed (result=timeout)
|
||||
verify: pnpm -s test # optional auto-QA gate. Runs in cwd after rc=0:
|
||||
# pass → testing/ (QA), fail → failed/
|
||||
# (omit to park in review/ for manual promote)
|
||||
---
|
||||
|
||||
# Your task / roadmap goes here
|
||||
...
|
||||
```
|
||||
|
||||
`add --engine/--cwd/--yolo` will inject this frontmatter for you if the file doesn't
|
||||
already have a `---` block.
|
||||
|
||||
### Manifest fields (Gigafactory Phase 1)
|
||||
|
||||
The runner parses the richer [gigafactory manifest](docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md#5-the-evolved-job-manifest-feature)
|
||||
**backward-compatibly** — a legacy `engine`/`cwd`/`yolo`-only `.md` behaves exactly as before.
|
||||
Fields marked **RESERVED** are parsed, stored in `.state/<job>.meta`, and shown in `status`, but
|
||||
are otherwise **no-ops until a later phase** (they do not yet affect execution).
|
||||
|
||||
| Field | Status | Default | Meaning |
|
||||
| ----- | ------ | ------- | ------- |
|
||||
| `engine` | active | `$AGENT_QUEUE_ENGINE` | explicit engine (`devin\|claude\|codex\|copilot`) — always wins over `engine-class` |
|
||||
| `cwd` / `yolo` / `lock` / `timeout` / `verify` | active | see above | Phase-0 behavior, unchanged |
|
||||
| `priority` | **active** | `medium` | `critical\|high\|medium\|low`. Inbox is picked **highest-priority first, then oldest** (was pure FIFO) |
|
||||
| `engine-class` | **active** | _(none)_ | used only when `engine` is unset: `agentic-coder`→`devin,claude,codex`; `chat-coder`→`copilot`. Picks the first **available** engine. No engine available → job fails `result=no_engine` |
|
||||
| `prefers-engine` | **active** | _(none)_ | optional order hint for `engine-class` resolution, e.g. `[claude, devin]` |
|
||||
| `capabilities` | **active** | _(none)_ | hard host requirements, e.g. `[os:any, node>=20, has:git]`. If the host can't satisfy them the job is sent to `failed/` with `result=capability_mismatch` **and the agent is never launched** (grammar below) |
|
||||
| `idempotency-key` | **active** | _(none)_ | dedupe on `add` (semantics below) |
|
||||
| `profile` | **active** | _(none)_ | inherit persona + verify/caps/engine-class/prefers-engine/allowed-scope/review-policy from `profiles/<name>.md` (job fields override — see **Profiles**) |
|
||||
| `prefers` | RESERVED | _(none)_ | soft routing/affinity hints (e.g. `[factory:mac-2]`) |
|
||||
| `budget` | RESERVED | _(none)_ | `{ usd, tokens, wall }` ceilings (`wall` enforcement is a later slice) |
|
||||
| `deps` / `deps-mode` | **active** | _(none)_ | block until each referenced `idempotency-key` is in `shipped/` (or `testing/` when `deps-mode: soft`). Submit-time cycle detection (see **Profiles & deps**) |
|
||||
| `retry` | **active** | _(none)_ | `{ max: N, backoff: 5m, on: [timeout, verify_failed, crash] }` — requeue failures with backoff up to `max`, then `retries_exhausted` (see **Resilience**) |
|
||||
| `review-policy` | RESERVED | _(none)_ | `auto\|manual\|reviewers:[…]` |
|
||||
| `artifacts` | RESERVED | _(none)_ | extra outputs to capture (coverage, screenshots) |
|
||||
| `tracker-item` | RESERVED | _(none)_ | link back to the originating tracker task |
|
||||
|
||||
**Capability grammar** (a job matches a host iff **every** required token is satisfied):
|
||||
|
||||
| Token form | Example | Satisfied when |
|
||||
| ---------- | ------- | -------------- |
|
||||
| `key` (bare presence) | `gpu` | the host advertises `key` in any form |
|
||||
| `key:value` (exact) | `os:mac`, `engine:devin`, `has:git` | the host advertises that exact token |
|
||||
| `key:any` (wildcard) | `os:any` | the host advertises any `key:*` (so `os:any` matches every host) |
|
||||
| `key<op>version` (`>=` `>` `=` `<=` `<`) | `node>=20` | numeric/semver-major compare vs the host's `key:<n>` |
|
||||
|
||||
The host advertises (via `detect_capabilities`): `os:<mac\|linux>`, `engine:<each available engine>`,
|
||||
`node:<major>`, and `has:<git\|pnpm\|docker>` when present.
|
||||
|
||||
**`idempotency-key` semantics** (on `add`, hashing the frontmatter-stripped body):
|
||||
|
||||
- same key **+ same body** → **no-op** (logged `duplicate, skipped`).
|
||||
- same key **+ different body**, prior job still in `inbox/` → **supersedes** it (replaces the queued file).
|
||||
- same key **+ different body**, prior job already past `inbox/` (building/review/testing/shipped) →
|
||||
**rejected** with a clear error (use a new key, or requeue the existing job).
|
||||
|
||||
## Engine mapping
|
||||
|
||||
| `engine:` | Command run | Auto-approve flag (`yolo: true`) |
|
||||
| --------- | ----------- | -------------------------------- |
|
||||
| `devin` | `devin -p --prompt-file <body>` | `--permission-mode dangerous` |
|
||||
| `claude` | `claude -p` (body on **stdin**) | `--dangerously-skip-permissions` |
|
||||
| `codex` | `codex exec` (body on **stdin**) | `--dangerously-bypass-approvals-and-sandbox` |
|
||||
| `copilot` | `copilot -p` (body on **stdin**) | `--allow-all-tools` _(best-effort; chat-coder class target)_ |
|
||||
|
||||
The frontmatter is **stripped** before the body reaches the agent, and
|
||||
claude/codex receive it on **stdin** so a body starting with `--` is never
|
||||
misparsed as a flag.
|
||||
|
||||
> Flags drift between CLI versions — if one changes, edit `build_agent_cmd()` in
|
||||
> `agent-queue.sh` (it's the single place each engine is mapped).
|
||||
|
||||
## Commands
|
||||
|
||||
| Command | What it does |
|
||||
| ------- | ------------ |
|
||||
| `init` | create the `queue/` folders |
|
||||
| `add <file> [--engine E] [--cwd P] [--yolo\|--no-yolo]` | queue a prompt into `inbox/` |
|
||||
| `run [--max N] [--engine E] [--once]` | process the inbox (foreground loop) |
|
||||
| `status` | kanban counts + running-worker table (marks `⚠ stalled`; per-job insights sub-line) |
|
||||
| `watch [interval]` | live `status` (bash), redrawn every N seconds (default 2) |
|
||||
| `insights [job]` | per-job metrics, or a recent-jobs table + per-engine token/cost/success rollup (see **Insights**) |
|
||||
| `recover` | reclaim orphaned `building/` jobs (dead worker) back to `inbox/` (see **Resilience**) |
|
||||
| `dash [--interval N]` | **interactive Node dashboard** — navigable numbered job list with single-key actions (see below) |
|
||||
| `stop` | kill running workers + the run loop |
|
||||
| `logs <job> [-f]` | print / follow a job's log |
|
||||
| `promote <job>` | advance one stage forward: `review → testing → shipped` |
|
||||
| `ship <job>` | **manual gate:** move a `testing/` (QA) job → `shipped/` |
|
||||
| `reject <job>` | send a `review/` or `testing/` job → `failed/` |
|
||||
| `requeue <job>` | move a `failed`/`review`/`testing` job back to `inbox/` for a fresh run |
|
||||
| `clean [--keep N]` | archive finished logs+meta beyond the newest N (default 50) into `queue/.archive/` |
|
||||
|
||||
Only one `run` loop may be active per queue — a second `run` against the same
|
||||
queue is refused while the first is alive (a stale `daemon.pid` is cleared).
|
||||
|
||||
### Interactive dashboard (`dash`)
|
||||
|
||||
`dash` is a single-script, menu-driven control panel (think a tiny "glassbox").
|
||||
It shows the kanban counts, live RUNNING workers (engine, elapsed, last log
|
||||
line, stall), a **navigable numbered JOBS list**, and RECENT finished jobs — and
|
||||
lets you act on jobs without leaving the screen. Every action shells out to
|
||||
`agent-queue.sh`, so the script stays the single source of truth.
|
||||
|
||||
| Key | Action |
|
||||
| --- | ------ |
|
||||
| `↑`/`↓`, `j`/`k`, `1`–`9` | select a job in the JOBS list |
|
||||
| `enter` / `l` | view the selected job's log (live, auto-refreshing) |
|
||||
| `p` | promote (`review → testing → shipped`) |
|
||||
| `s` | ship (`testing`/QA → `shipped`, the manual gate) |
|
||||
| `x` | reject (`review`/`testing` → `failed`) — asks `y/n` |
|
||||
| `u` | requeue (`failed`/`review`/`testing` → `inbox`) — asks `y/n` |
|
||||
| `r` | start the `run` loop (detached → `logs/run-loop.log`) |
|
||||
| `S` | stop the run loop + running workers |
|
||||
| `g` | refresh now · `?`/`h` help · `q`/`Ctrl-C` quit |
|
||||
|
||||
The header shows a `● run loop pid N` / `○ run loop stopped` indicator. Run it in
|
||||
a TTY for the interactive mode; piped/non-TTY it falls back to a read-only live view.
|
||||
|
||||
## Via `bytelyst-cli.sh`
|
||||
|
||||
Wired into the repo's unified CLI (no GitHub token required for this subcommand):
|
||||
|
||||
```bash
|
||||
./bytelyst-cli.sh agent-queue run --max 3 # full passthrough
|
||||
./bytelyst-cli.sh aq status # short alias
|
||||
```
|
||||
|
||||
## Boot-persistence (auto-start on login)
|
||||
|
||||
To run the worker non-stop and survive **reboot / crash / logout** (not just a
|
||||
closed terminal), install the macOS LaunchAgent — it auto-starts `agent-queue run`
|
||||
on login under `caffeinate` and restarts it via `KeepAlive`:
|
||||
|
||||
```bash
|
||||
bash launchd/install.sh # install + start now
|
||||
bash launchd/install.sh --uninstall # stop + remove
|
||||
```
|
||||
|
||||
Override engine/concurrency/secrets in `~/.agent-queue.env` (e.g.
|
||||
`AGENT_QUEUE_ENGINE=codex`, `AGENT_QUEUE_MAX=1`). See [`launchd/README.md`](launchd/README.md)
|
||||
for the full layer comparison (tmux/caffeinate vs LaunchAgent) and gotchas.
|
||||
|
||||
## Folder layout
|
||||
|
||||
```
|
||||
queue/
|
||||
inbox/ # drop / queued .md files (highest priority first, then oldest eligible)
|
||||
building/ # currently executing (agent running)
|
||||
review/ # agent exited 0 — awaiting the auto-QA verify gate (or manual promote)
|
||||
testing/ # verify passed (QA) — awaiting manual `ship`
|
||||
shipped/ # manually shipped — the terminal success stage
|
||||
failed/ # non-zero exit, bad cwd, timeout, verify failure, capability mismatch, no available engine, exhausted retries, or manual reject
|
||||
logs/ # <job>.log — full agent + verify output
|
||||
locks/ # per-key flock files (Linux hardening; unused on macOS)
|
||||
.state/ # <job>.meta heartbeats + daemon.pid (runtime only)
|
||||
.archive/ # <ts>/ — logs+meta moved here by `clean`
|
||||
```
|
||||
|
||||
**`result=` values** written to `<job>.meta`: `review`, `testing`, `shipped`,
|
||||
`failed`, `timeout`, `verify_failed`, `rejected`, `requeued`, `capability_mismatch`
|
||||
(host missing a required capability — agent never launched), `no_engine`
|
||||
(an `engine-class` had no available engine), `retries_exhausted` (failed after
|
||||
`retry.max` attempts — single-host dead-letter stand-in), `retry_scheduled`
|
||||
(transient: requeued for another attempt), `recovered` (transient: an orphan was
|
||||
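reclaimed to `inbox/`), `fenced_quarantine` (fleet mode only: a stale lease epoch quarantined the local result in `failed/` for human triage).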
|
||||
|
||||
## Profiles & deps
|
||||
|
||||
### Profiles (roadmap §6)
|
||||
|
||||
A **profile** is a reusable role preset in `profiles/<name>.md`. A job opts in with
|
||||
`profile: <name>` and inherits any of these fields it does **not** set itself:
|
||||
`verify` (from the profile's `default-verify`), `capabilities`, `engine-class`,
|
||||
`prefers-engine`, `allowed-scope`, `review-policy`. The profile's `persona` block is
|
||||
**prepended** to the body sent to the engine (the job `.md` on disk is unchanged;
|
||||
secrets are never logged). Resolution runs **before** the capability gate and engine
|
||||
resolution, so inherited caps / engine-class take effect.
|
||||
|
||||
**Precedence:** `job field > profile field > built-in default`. Set `AGENT_QUEUE_PROFILES`
|
||||
to point at a different catalog directory (defaults to `./profiles`).
|
||||
|
||||
Starter catalog: `developer`, `backend-engineer`, `frontend-engineer`, `ux-designer`,
|
||||
`ui-designer`, `qa`, `reviewer`, `docs-writer`, and a reserved `planner`. Each presets
|
||||
`name`, `persona`, `capabilities`, `default-verify`, `engine-class`, `prefers-engine`,
|
||||
`allowed-scope`, and `review-policy`.
|
||||
|
||||
**allowed-scope (warn-only this phase).** After a run on a git `cwd`, changed paths
|
||||
outside the profile/job `allowed-scope` globs (`dir/**` matches the whole subtree) are
|
||||
logged as a `WARNING` and recorded as `scope_warning=` in the meta — **non-blocking**
|
||||
(the job is not failed). `path_in_scope` is exposed as a unit-testable function.
|
||||
|
||||
### deps / DAG, single host (roadmap §5)
|
||||
|
||||
`deps: [keyA, keyB]` references other jobs by their author-controlled
|
||||
`idempotency-key`. A dep is **satisfied** when a job with that key is in `shipped/`
|
||||
(default), or in `shipped/` **or** `testing/` when the dependent job sets
|
||||
`deps-mode: soft`. A job with unmet deps is **blocked**: it is skipped in inbox
|
||||
selection (never launched, never failed) and surfaced in `status` as
|
||||
`blocked (waiting on: <keys>)`, then re-evaluated every loop until its deps are met.
|
||||
`add` performs **submit-time cycle detection** over the inbox + active-stage dep graph
|
||||
and rejects (nonzero exit) a job that would create a cycle. Cross-machine deps are P2.
|
||||
|
||||
## Resilience (crash recovery & work preservation)
|
||||
|
||||
Single-host implementations of the durability model (roadmap §25):
|
||||
|
||||
- **Orphan recovery.** A job left in `building/` whose worker process is dead (no
|
||||
live `pid`, PID-reuse-guarded by `pidstart`) is an orphan from a previous
|
||||
crash/power-off. On `run` startup and on every loop iteration (or on demand via
|
||||
`agent-queue.sh recover`) it is moved back to `inbox/` with `attempts`
|
||||
incremented. Recovery is **idempotent** — once moved out of `building/` it is
|
||||
never recovered twice.
|
||||
- **WIP checkpointing.** When a job's `cwd` is a git repo, the worker creates/checks
|
||||
out a dedicated branch **`aq/wip/<job>`** at start and commits any changes to it
|
||||
on **every** exit path — success, failure, timeout, and SIGTERM/SIGINT (via a
|
||||
trap). It **never** commits to `main`/your current branch. Non-git `cwd` is
|
||||
skipped cleanly. `wip_branch` / `wip_base` / `wip_commit` are recorded in the meta.
|
||||
- **Resume.** When an orphan/retry of a job whose `aq/wip/<job>` branch already
|
||||
exists is relaunched, that branch is checked out first so the agent **continues
|
||||
from the checkpoint** instead of from zero.
|
||||
- **Retry policy** (`retry` frontmatter, now active). On a failure whose class is in
|
||||
`on` (`crash`/`agent_error` for a non-zero agent exit, `timeout`, `verify_failed`)
|
||||
the job is requeued to `inbox/` honoring `backoff` (selection skips it until
|
||||
`next_eligible`) up to `max` attempts; on exhaustion it lands in `failed/` with
|
||||
`result=retries_exhausted`, preserving the WIP branch + full log. No `retry` =
|
||||
no retry (Phase-0 behavior).
|
||||
|
||||
All bookkeeping (`attempts`, `next_eligible`, `wip_*`) is append-only in the meta
|
||||
and re-derivable from the meta + folder location, so recovery is crash-safe.
|
||||
|
||||
## Insights (metrics & token accounting)
|
||||
|
||||
Each finished run records into `<job>.meta`: `duration_s`, `exit`, `result`,
|
||||
`attempts`, and — for a git `cwd` — `files_changed` / `lines_added` /
|
||||
`lines_deleted` (diffed `wip_base..HEAD`). A single `parse_usage <engine> <log>`
|
||||
adapter extracts `model` / `tokens_in` / `tokens_out` / `tokens_cached` /
|
||||
`cost_usd` / `turns` / `tool_calls` when the engine exposes them.
|
||||
|
||||
```bash
|
||||
agent-queue.sh insights <job> # full metrics for one job
|
||||
agent-queue.sh insights # recent-jobs table + per-engine rollup
|
||||
```
|
||||
|
||||
> **Token caveat (honest):** real usage is captured only where the engine surfaces
|
||||
> it. A cooperating wrapper may emit a machine-readable `AQ_USAGE key=value …` line;
|
||||
> otherwise per-engine heuristics apply (Claude/Codex token fields parsed; Devin
|
||||
> session metrics + Copilot are API-only and currently TODO in `parse_usage`). When
|
||||
> a value is not provider-reported it is **omitted or flagged `usage_estimated`** —
|
||||
> numbers are never fabricated. The per-engine rollup marks totals that include any
|
||||
> estimated value with `*`.
|
||||
|
||||
## Tracker integration (§10)
|
||||
|
||||
Closes the task ↔ job round-trip against the platform-service **items API**: a
|
||||
tracker Item can become a job, and a job's outcome echoes back to the Item.
|
||||
|
||||
```bash
|
||||
agent-queue.sh from-tracker <ITEM_ID> # pull an Item -> materialize a job in inbox/
|
||||
agent-queue.sh to-tracker <job> # echo the job's current outcome to its Item
|
||||
```
|
||||
|
||||
All tracker HTTP goes through one curl wrapper (`tracker_api`); the tracker integration makes no other network
|
||||
calls. Real use needs **platform-service running and a bearer token**.
|
||||
|
||||
### Config (env)
|
||||
|
||||
| Var | Default | Meaning |
|
||||
| --- | ------- | ------- |
|
||||
| `AQ_TRACKER_API` | `http://localhost:4003` | base URL of the items API (routes live under `/api`) |
|
||||
| `AQ_TRACKER_TOKEN` | _(none)_ | bearer token — **required** for real calls; never hardcode |
|
||||
| `AQ_PRODUCT_ID` | _(none)_ | productId (sent as `X-Product-Id`; every Item has one) |
|
||||
| `AQ_TRACKER_CWD` | `$PWD` | cwd a tracker-derived job runs in (Items carry no cwd) |
|
||||
| `AQ_TRACKER_AUTO` | `0` | `1` = auto-echo on each transition (default OFF — echo is manual) |
|
||||
| `AQ_TRACKER_STATUS_INPROGRESS` / `_DONE` / `_FAILED` | `in_progress` / `done` / `wont_fix` | Item status per bucket (the API has no blocked/failed status) |
|
||||
| `AQ_TRACKER_API_CMD` | _(none)_ | test seam: a stub that replaces the curl HTTP entirely (selftest uses it) |
|
||||
|
||||
### `from-tracker` — Item → job
|
||||
|
||||
`GET /api/items/<id>`, then maps fields to job frontmatter:
|
||||
|
||||
| Item | Job |
|
||||
| ---- | --- |
|
||||
| `title` + `description` | job body (verbatim instruction markdown) |
|
||||
| `id` | `tracker-item: <id>` and `idempotency-key: tracker-<id>` (stable) |
|
||||
| `priority` | `priority:` (label overrides; else Item priority; else `medium`) |
|
||||
| label `engine-class:<x>` | `engine-class: <x>` |
|
||||
| label `profile:<x>` | `profile: <x>` |
|
||||
| label `priority:<x>` | `priority: <x>` |
|
||||
| label `cap:<token>` | a `capabilities: [...]` entry |
|
||||
|
||||
Idempotent on the derived `idempotency-key` (Slice 1 dedupe) — pulling the same
|
||||
Item twice never enqueues a duplicate.
|
||||
|
||||
### `to-tracker` — job → Item (one-way echo, §24.5)
|
||||
|
||||
Only if the job's meta has a `tracker-item`. Maps the job's stage/result to an Item
|
||||
status and `PATCH /api/items/<id>/status`, then `POST /api/items/<id>/comments`
|
||||
with a **metrics-only** summary (result, attempts, duration, tokens/cost, +/- lines —
|
||||
**never prompt content or secrets**):
|
||||
|
||||
| job result/stage | Item status |
|
||||
| ---------------- | ----------- |
|
||||
| building / review / testing / recovered | `in_progress` |
|
||||
| shipped | `done` |
|
||||
| failed / timeout / verify_failed / retries_exhausted / capability_mismatch / no_engine / rejected | `wont_fix` (override via `AQ_TRACKER_STATUS_FAILED`) |
|
||||
|
||||
Idempotent via `tracker_echoed` in the meta (re-echoing an unchanged outcome is a
|
||||
no-op). The echo is **one-way** (child → tracker) and **never authoritative for
|
||||
execution**: an echo failure is logged and the job continues unchanged. With
|
||||
`AQ_TRACKER_AUTO=1` the worker echoes automatically on each transition; otherwise
|
||||
echo is manual. `status` / `insights` surface the `tracker-item` and last echoed status.
|
||||
|
||||
## Fleet integration (Phase 2)
|
||||
|
||||
Behind the `AQ_FLEET` flag, the runner becomes a **factory** that registers,
|
||||
heartbeats, claims, and reports against the platform-service `fleet` coordinator —
|
||||
so coordinator jobs run alongside local `.md` files on the same host. All
|
||||
coordinator logic lives in [`lib/fleet-client.sh`](lib/fleet-client.sh) (curl-only +
|
||||
POSIX awk, sourced by `agent-queue.sh`); the few hook points in the runner are all
|
||||
gated on `fleet_enabled`.
|
||||
|
||||
> **Offline vs fleet mode.** With `AQ_FLEET` unset/`0` (the default) the runner is
|
||||
> the pure offline git-queue described above — **zero** coordinator calls, behavior
|
||||
> byte-for-byte unchanged. With `AQ_FLEET=1` the run loop also registers + claims
|
||||
> from the coordinator, reports fenced stage transitions, renews leases, and (in
|
||||
> fleet mode) routes the outcome echo through the coordinator's `fleet_events`
|
||||
> instead of the direct tracker echo. The tracker echo remains the offline path.
|
||||
|
||||
```bash
|
||||
AQ_FLEET=1 AQ_FLEET_TOKEN=… AQ_PRODUCT_ID=… agent-queue.sh fleet-status # register + show identity
|
||||
AQ_FLEET=1 AQ_FLEET_TOKEN=… AQ_PRODUCT_ID=… agent-queue.sh run # claim + execute coordinator jobs
|
||||
```
|
||||
|
||||
### Config (env)
|
||||
|
||||
| Var | Default | Meaning |
|
||||
| --- | ------- | ------- |
|
||||
| `AQ_FLEET` | `0` | master switch — `1` enables coordinator integration; `0`/unset = offline git-queue (zero coordinator calls) |
|
||||
| `AQ_FLEET_ROUTE` | `1` | `route_via_service`: `1` = coordinator is authoritative for claim (P2-S3 behavior); `0` = local inbox authoritative (coordinator not used to source work) |
|
||||
| `AQ_FLEET_AUTOSHIP` | `0` | `1` = when the local verify gate passes, advance the coordinator job `testing → shipped` (the factory's verify **is** the test phase); `0` = report `testing` and rest for the human review gate |
|
||||
| `AQ_FLEET_PR` | `0` | `1` = for a job carrying a `repo`, run the agent in an isolated checkout on branch `aq/job/<id>`, then commit/push and `gh pr create`; the PR URL is reported back and recorded on the run |
|
||||
| `AQ_FLEET_REPOS_DIR` | `.state/repos` | cache dir for PR-mode repo checkouts (one per repo) |
|
||||
| `AQ_FLEET_REPO_BASE` | _(none)_ | base dir of existing local repos; a job `repo` matching `<base>/<repo>` is cloned from there (fast, no network) and PRs are pushed to its GitHub origin (embedded creds stripped) |
|
||||
| `GH_BIN` | `gh` | GitHub CLI used to open PRs in PR mode |
|
||||
| `AQ_FLEET_SHADOW` | `0` | shadow/dual-run: `1` (requires `AQ_FLEET=1` + `AQ_FLEET_ROUTE=0`) queries the coordinator in parallel and records divergence, **never acting on it** |
|
||||
| `AQ_FLEET_SHADOW_FACTORY_ID` | `<factory>-shadow` | isolated id used for the read-only shadow claim (never the real factory id) |
|
||||
| `AQ_FLEET_SHADOW_LOG` | `.state/fleet-shadow.log` | structured shadow-divergence log (`ts⇥localJob⇥coordJob⇥verdict`) |
|
||||
| `AQ_FLEET_API` | `http://localhost:4003/api` | coordinator base URL (already includes `/api`) |
|
||||
| `AQ_FLEET_TOKEN` | _(none)_ | bearer token — never hardcode |
|
||||
| `AQ_PRODUCT_ID` | _(none)_ | productId (sent as `X-Product-Id`; shared with the tracker config) |
|
||||
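| `AQ_FACTORY_ID` | `<hostname>-<pid>` | factory identity for this process (the default embeds the PID and so changes on every restart; set it explicitly for an identity that stays stable across restarts) |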
|
||||
| `AQ_FLEET_LEASE_RENEW_SEC` | `300` | heartbeat / lease-renew cadence |
|
||||
| `AQ_FLEET_CAPS` | _(auto)_ | override the auto-detected capability tokens (comma/space list) |
|
||||
| `AQ_FLEET_CWD` | `$PWD` | cwd a claimed coordinator job runs in |
|
||||
| `AQ_FLEET_API_CMD` | _(none)_ | test seam: a stub that replaces the curl HTTP entirely (selftest uses it) |
|
||||
|
||||
### Protocol (claim / heartbeat / report / fence / renew)
|
||||
|
||||
- **register / heartbeat:** `POST /fleet/factories/heartbeat {factoryId, capabilities[], health, load}` — registration *is* the first heartbeat; re-sent on `AQ_FLEET_LEASE_RENEW_SEC` cadence.
|
||||
- **claim:** `POST /fleet/claim {factoryId, capabilities[], leaseSeconds}`. A returned job (`id`, `bodyMd`, `leaseEpoch`) is materialized as a transient local `.md` (frontmatter `fleet-job-id` + `fleet-lease-epoch`) so the existing runner executes it unchanged, interleaved with local files.
|
||||
- **report (fenced):** each stage transition (`building`/`review`/`testing`/`shipped`/`failed`) is `PATCH /fleet/jobs/:id {stage, leaseEpoch, checkpoint?}`. The coordinator writes `fleet_events` server-side. The payload carries only stage/epoch/checkpoint — **never** the prompt/`bodyMd` or token.
|
||||
- **fencing (§18):** if a report/renew returns **conflict/409** (stale `leaseEpoch` → the coordinator reclaimed us), the worker **self-aborts**: it stops, does **not** ship/merge, and **quarantines** the local result to `failed/` (`result=fenced_quarantine`) for human triage. A reclaimed zombie can never corrupt coordinator state.
|
||||
- **lease renew / release:** `POST /fleet/jobs/:id/lease/renew` while building (fenced); `…/lease/release` on terminal stages.
|
||||
- **checkpoint:** the WIP `{wipBranch, wipCommit}` is sent with the building report so a reclaim can resume (§25).
|
||||
|
||||
### Offline-degrade + quarantine (§9)
|
||||
|
||||
If the coordinator is **unreachable** mid-job (5xx / connection error), the report
|
||||
is treated as *degraded* (logged, `fleet_degraded=1`): the in-flight job **finishes
|
||||
locally** rather than being abandoned. On the next reachable call the worker
|
||||
presents its `leaseEpoch`; if the coordinator now reports it **stale** (it was
|
||||
reclaimed during the outage), the local result is **quarantined** (marked, not
|
||||
auto-shipped) and surfaced for human triage — split-brain is resolved in favor of
|
||||
the coordinator without losing the work. `status` shows the factory id + per-job
|
||||
`fleet=<id>@e<epoch>`; `insights` lists the `fleet_*` fields.
|
||||
|
||||
### Feature flags + shadow / dual-run (Slice 4, §16/§27)
|
||||
|
||||
Three explicit, independently-toggleable levels gate the coordinator — a safe,
|
||||
reversible path to validate the fleet coordinator against the proven single-host
|
||||
(P1) behavior **before** any real cutover:
|
||||
|
||||
| Flag | Effect |
|
||||
| ---- | ------ |
|
||||
| `AQ_FLEET=0` | **Pure offline.** Zero coordinator calls (including shadow). Offline git-queue path is byte-for-byte unchanged. |
|
||||
| `AQ_FLEET_ROUTE=1` (default) | **route_via_service** — the coordinator is *authoritative* for claim/assignment (today's P2-S3 behavior). |
|
||||
| `AQ_FLEET_ROUTE=0` | **Local inbox authoritative** — the coordinator is *not* used to source work (the pre-cutover state). |
|
||||
| `AQ_FLEET_AUTOSHIP=1` | **Autonomous ship.** On a passing local verify, automatically advance the coordinator job `testing → shipped`. With the default (`0`), the worker reports `testing` and leaves the job for the human review gate / `ship` operator action. |
|
||||
| `AQ_FLEET_SHADOW=1` | **Shadow / dual-run** (requires `AQ_FLEET=1` **and** `AQ_FLEET_ROUTE=0`): run the offline path as authoritative **and** query the coordinator in parallel, recording divergence **without acting on it**. |
|
||||
|
||||
**Precedence.** Shadow is only meaningful when `ROUTE=0`. If both `AQ_FLEET_ROUTE=1`
|
||||
and `AQ_FLEET_SHADOW=1` are set, **ROUTE wins** and shadow is disabled (a one-shot
|
||||
warning is logged) — you never route *and* shadow at the same time.
|
||||
|
||||
**Side-effect-free by construction.** Shadow **never** ships, quarantines, or
|
||||
mutates real job state. `fleet_shadow_claim` asks the coordinator what it *would*
|
||||
assign using an **isolated `-shadow` factoryId** + `"dryRun":true,"shadow":true`;
|
||||
if a coordinator without dry-run support actually assigned, the lease is **released
|
||||
immediately** so no real assignment persists. The would-be job is never
|
||||
materialized, run, or shipped locally. `fleet_shadow_report` mirrors the local
|
||||
stage as a shadow event (`"shadow":true`) purely to exercise reporting — the
|
||||
coordinator response is logged but **never acted on** (no fence/quarantine).
|
||||
|
||||
Each iteration `fleet_shadow_compare` classifies the local (authoritative) decision
|
||||
vs the coordinator's would-be decision as **AGREE / DIVERGE / COORD_EMPTY /
|
||||
LOCAL_EMPTY** and appends a line to the shadow log. Summarize it any time:
|
||||
|
||||
```bash
|
||||
agent-queue.sh fleet-shadow-report # per-verdict counts + agreement rate + recent divergences
|
||||
agent-queue.sh fleet-shadow-report 25 # last 25 divergence/error events
|
||||
agent-queue.sh status # surfaces the three flags' resolved state
|
||||
```
|
||||
|
||||
**Cutover ladder (rollback at any step):**
|
||||
|
||||
1. **Observe (zero risk):** `AQ_FLEET=1 AQ_FLEET_ROUTE=0 AQ_FLEET_SHADOW=1 agent-queue.sh run` —
|
||||
the local path stays authoritative; the coordinator is only shadowed.
|
||||
2. **Inspect agreement:** `fleet-shadow-report` — drive `AGREEMENT` toward 100%,
|
||||
investigating each `DIVERGE`.
|
||||
3. **Cut over:** once agreement is high, flip `AQ_FLEET_ROUTE=1` (coordinator
|
||||
becomes authoritative).
|
||||
4. **Rollback:** set `AQ_FLEET_ROUTE=0` (and/or `AQ_FLEET=0`) at any time — instant
|
||||
return to the local/offline path, no data migration.
|
||||
|
||||
## Config (env overrides)
|
||||
|
||||
| Var | Default | Meaning |
|
||||
| --- | ------- | ------- |
|
||||
| `AGENT_QUEUE_ROOT` | `./queue` | where the kanban folders live |
|
||||
| `AGENT_QUEUE_MAX` | `3` | max concurrent agents (override per-run with `run --max N`) |
|
||||
| `AGENT_QUEUE_ENGINE` | `devin` | default engine when none in frontmatter |
|
||||
| `AGENT_QUEUE_POLL` | `3` | inbox poll interval (seconds) |
|
||||
| `AGENT_QUEUE_VERIFY` | _(empty)_ | default auto-QA verify command; per-job `verify:` overrides it |
|
||||
| `AGENT_QUEUE_STALL_MIN` | `10` | minutes of unchanged log before a worker is `⚠ stalled` |
|
||||
| `DEVIN_BIN` / `CLAUDE_BIN` / `CODEX_BIN` / `COPILOT_BIN` | autodetected | override CLI binary paths |
|
||||
| `FLOCK_BIN` / `TIMEOUT_BIN` | autodetected | `flock` (lock hardening) and `timeout`/`gtimeout` (hard timeouts); absent on stock macOS — see notes |
|
||||
|
||||
## ⚠️ Safety
|
||||
|
||||
Running agents with `yolo: true` means **no approval prompts** — they will edit files,
|
||||
run shell commands, and commit unattended. Mitigate:
|
||||
|
||||
- Prefer **scope-locked** prompt files (e.g. "edit only under `dashboards/tracker-web/`").
|
||||
- Tell prompts **not to `git push`** — review commits before they leave your machine.
|
||||
- **Same-repo safety is automatic:** jobs sharing a `cwd` (or `lock:` key) are
|
||||
serialized, so two agents never run in one repo at once — even at `--max 2+`.
|
||||
- Set a `timeout:` on long jobs so a wedged agent can't run forever.
|
||||
- Watch cost: each job is a full agent session.
|
||||
|
||||
### Portability notes
|
||||
|
||||
- **macOS** has no `flock`/`timeout`; locking relies on the single run-loop
|
||||
(enforced by the second-run refusal) and timeouts use a pure-bash watchdog.
|
||||
Install coreutils (`gtimeout`) for hard process-tree kills.
|
||||
- **Linux** (incl. Gitea CI) uses `flock` + `timeout` for cross-process hardening.
|
||||
|
||||
## Roadmap / nice-to-haves
|
||||
|
||||
- [x] Per-repo lock to serialize same-repo jobs automatically (`lock:` / cwd).
|
||||
- [x] Per-job `timeout:` with hard kill (or bash watchdog fallback).
|
||||
- [x] Stall detection in `status`/`dash`.
|
||||
- [x] `requeue` failed jobs + `clean`/archive old runs.
|
||||
- [x] Build/ship lifecycle: `review → testing → shipped` with auto-QA `verify:` gate + manual `ship`.
|
||||
- [ ] `--push` opt-in policy + commit review gate.
|
||||
- [ ] Optional notifications (Slack/desktop) on done/failed/stall.
|
||||
- [ ] Persisted run-loop as a daemon/service with auto-restart.
|
||||
@ -1,48 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# agent-queue-boot.sh — boot/login entrypoint for the agent-queue run loop.
|
||||
#
|
||||
# Launched by the macOS LaunchAgent (see launchd/) so the folder-kanban worker
|
||||
# auto-starts on login AND survives reboot/crash (LaunchAgent KeepAlive). This is
|
||||
# the reboot-persistence layer that tmux + caffeinate alone cannot provide.
|
||||
#
|
||||
# It does three things launchd's minimal environment needs:
|
||||
# 1. Repairs PATH so the agent CLIs (codex/devin/claude) + caffeinate are found.
|
||||
# 2. Loads optional overrides from ~/.agent-queue.env.
|
||||
# 3. Wraps `agent-queue run` in caffeinate (macOS) so the Mac won't sleep while
|
||||
# a job is running. NOTE: because the run loop is long-lived, this keeps the
|
||||
# machine awake for as long as the LaunchAgent runs — intended for a dedicated
|
||||
# overnight runner. Set AGENT_QUEUE_NO_CAFFEINATE=1 to allow idle sleep.
|
||||
#
|
||||
set -uo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd -P)"
|
||||
|
||||
# launchd hands processes a bare PATH — prepend the usual CLI install locations
|
||||
# (Homebrew arm64/intel, ~/.local/bin for devin, system dirs) ahead of it.
|
||||
export PATH="$HOME/.local/bin:/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:${PATH:-}"
|
||||
|
||||
# Optional per-machine overrides (engine, concurrency, tokens, NETWORK, etc.).
# This file is NOT tracked — keep secrets/host-specific config here.
# Sourced under `set -a` (allexport) so plain `VAR=value` lines in the file are
# exported and therefore visible to the agent-queue.sh process exec'd below;
# lines that already say `export VAR=…` behave exactly as before.
if [ -f "$HOME/.agent-queue.env" ]; then
  set -a
  # shellcheck disable=SC1091
  . "$HOME/.agent-queue.env"
  set +a
fi
|
||||
|
||||
# Recommended default for a local monorepo overnight runner (see long-running-jobs
|
||||
# cheat sheet): codex runs in-repo so @bytelyst/* workspace links resolve locally.
|
||||
: "${AGENT_QUEUE_ENGINE:=codex}"
|
||||
export AGENT_QUEUE_ENGINE
|
||||
|
||||
echo "[agent-queue-boot] $(date '+%Y-%m-%d %H:%M:%S') starting run loop" \
|
||||
"(engine=$AGENT_QUEUE_ENGINE, max=${AGENT_QUEUE_MAX:-3})"
|
||||
|
||||
# Keep the Mac awake for the lifetime of the loop unless explicitly opted out.
|
||||
keep=""
|
||||
if [ "${AGENT_QUEUE_NO_CAFFEINATE:-0}" != "1" ] && command -v caffeinate >/dev/null 2>&1; then
|
||||
keep="caffeinate -dimsu"
|
||||
fi
|
||||
|
||||
# exec so the LaunchAgent tracks the real worker PID (clean KeepAlive restarts).
|
||||
# shellcheck disable=SC2086
|
||||
exec $keep "$SCRIPT_DIR/agent-queue.sh" run
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,769 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
// agent-queue dashboard — a zero-dependency, INTERACTIVE TUI for the folder queue.
|
||||
//
|
||||
// Reads the same queue/ state written by agent-queue.sh and re-renders a board
|
||||
// every interval: kanban counts, running workers (engine, elapsed, last log line),
|
||||
// and a navigable, numbered job list you can act on without leaving the screen.
|
||||
//
|
||||
// Lifecycle: inbox → building → review → testing → shipped (+ failed)
|
||||
//
|
||||
// Interactive keys (when run in a TTY):
|
||||
// ↑/↓ or j/k or 1-9 select a job enter / l view its log
|
||||
// p promote s ship (testing→shipped) x reject
|
||||
// u requeue r run loop S stop g refresh now
|
||||
// ? help q / Ctrl-C quit
|
||||
// All actions shell out to agent-queue.sh — it stays the single source of truth.
|
||||
//
|
||||
// Usage: node dashboard.mjs [--interval 2] [--root /path/to/queue]
|
||||
// AGENT_QUEUE_ROOT=/path node dashboard.mjs
|
||||
// AQ_TRACKER_WEB=https://tracker.example.com node dashboard.mjs
|
||||
// (makes job tracker-item tags clickable terminal hyperlinks)
|
||||
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import { execFileSync, spawn } from 'node:child_process';
|
||||
import { fleetConfig, fetchBoard, fetchEvents, jobAction } from './lib/fleet-dash.mjs';
|
||||
|
||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||
|
||||
// ── args / config ───────────────────────────────────────────────────
|
||||
const argv = process.argv.slice(2);
|
||||
// getArg(flag, def) — the argv entry that follows `flag`, or `def` when the
// flag is absent or nothing (or an empty string) follows it.
const getArg = (flag, def) => {
  const pos = argv.indexOf(flag);
  if (pos === -1) return def;
  const value = argv[pos + 1];
  return value ? value : def;
};
|
||||
const ROOT = path.resolve(getArg('--root', process.env.AGENT_QUEUE_ROOT || path.join(__dirname, 'queue')));
|
||||
// Refresh period in milliseconds, from `--interval <seconds>` (minimum 1s).
// A non-numeric value falls back to the 2s default: Math.max(1, NaN) is NaN,
// which would otherwise surface as "refresh NaNs" in the board header.
const INTERVAL = (() => {
  const secs = parseInt(getArg('--interval', '2'), 10);
  return (Number.isNaN(secs) ? 2 : Math.max(1, secs)) * 1000;
})();
|
||||
// A running worker is flagged stalled if its log has not changed in this many minutes.
// Read from AGENT_QUEUE_STALL_MIN (minimum 1); a non-numeric value falls back to
// the default of 10 instead of propagating NaN.
const STALL_MIN = (() => {
  const mins = parseInt(process.env.AGENT_QUEUE_STALL_MIN || '10', 10);
  return Number.isNaN(mins) ? 10 : Math.max(1, mins);
})();
|
||||
|
||||
const DIRS = {
|
||||
inbox: path.join(ROOT, 'inbox'),
|
||||
building: path.join(ROOT, 'building'),
|
||||
review: path.join(ROOT, 'review'),
|
||||
testing: path.join(ROOT, 'testing'),
|
||||
shipped: path.join(ROOT, 'shipped'),
|
||||
failed: path.join(ROOT, 'failed'),
|
||||
logs: path.join(ROOT, 'logs'),
|
||||
state: path.join(ROOT, '.state'),
|
||||
};
|
||||
|
||||
// ── ansi ────────────────────────────────────────────────────────────
|
||||
const C = {
|
||||
reset: '\x1b[0m', dim: '\x1b[2m', bold: '\x1b[1m',
|
||||
red: '\x1b[31m', green: '\x1b[32m', yellow: '\x1b[33m',
|
||||
blue: '\x1b[34m', cyan: '\x1b[36m', gray: '\x1b[90m',
|
||||
};
|
||||
// c(col, s) — wrap `s` in the ANSI sequence stored under key `col` of C, then reset.
// An unknown key contributes no prefix rather than the literal text "undefined".
const c = (col, s) => `${C[col] ?? ''}${s}${C.reset}`;
|
||||
|
||||
// ── helpers ─────────────────────────────────────────────────────────
|
||||
const listMd = (dir) => {
|
||||
try { return fs.readdirSync(dir).filter((f) => f.endsWith('.md')); }
|
||||
catch { return []; }
|
||||
};
|
||||
const count = (dir) => listMd(dir).length;
|
||||
|
||||
// parseMeta(file) — read a `key=value` per line file into a plain object.
// Only the first '=' splits a line, so values may themselves contain '='.
// Lines without a key before the '=' are skipped. An unreadable file yields {}.
const parseMeta = (file) => {
  let text;
  try {
    text = fs.readFileSync(file, 'utf8');
  } catch {
    return {};
  }
  const meta = {};
  text.split('\n').forEach((row) => {
    const eq = row.indexOf('=');
    if (eq <= 0) return;
    meta[row.slice(0, eq)] = row.slice(eq + 1);
  });
  return meta;
};
|
||||
|
||||
// insightsTag(m) — one-line summary built from a job's meta fields:
// retry count (hidden when it is '1'), cost (suffixed '~' when usage_estimated
// is set) or else token counts, line deltas, and duration. Absent fields are
// omitted; the surviving pieces are joined with single spaces.
const insightsTag = (m) => {
  const retries = m.attempts && m.attempts !== '1' ? `x${m.attempts}` : '';

  let spend = '';
  if (m.cost_usd) {
    const approx = m.usage_estimated ? '~' : '';
    spend = `$${m.cost_usd}${approx}`;
  } else if (m.tokens_in || m.tokens_out) {
    spend = `tok ${m.tokens_in || 0}/${m.tokens_out || 0}`;
  }

  const delta = m.lines_added || m.lines_deleted
    ? `+${m.lines_added || 0}/-${m.lines_deleted || 0}`
    : '';
  const took = m.duration_s ? `${m.duration_s}s` : '';

  return [retries, spend, delta, took].filter(Boolean).join(' ');
};
|
||||
|
||||
// Manifest tags (read-only): the routing inputs an operator cares about when
|
||||
// scanning the board — priority, profile, capabilities, and a tracker-item
|
||||
// reference. Rendered from a job's meta (launched jobs) or, for never-launched
|
||||
// inbox jobs, parsed from the .md frontmatter (see readManifest). The
|
||||
// tracker-item becomes a real terminal hyperlink when AQ_TRACKER_WEB is set.
|
||||
const TRACKER_WEB = (process.env.AQ_TRACKER_WEB || '').replace(/\/+$/, '');
|
||||
// osc8(url, label) — wrap `label` in an OSC 8 hyperlink escape pointing at `url`;
// both the opening and the closing sequence are BEL-terminated.
const osc8 = (url, label) => {
  const LINK = '\x1b]8;;';
  const BEL = '\x07';
  return `${LINK}${url}${BEL}${label}${LINK}${BEL}`;
};
|
||||
const trackerTag = (id) => {
|
||||
if (!id) return '';
|
||||
const label = `⎘ ${id}`;
|
||||
return TRACKER_WEB ? osc8(`${TRACKER_WEB}/${encodeURIComponent(id)}`, label) : label;
|
||||
};
|
||||
const PRIORITY_COLOR = { critical: 'red', high: 'yellow', medium: 'gray', low: 'gray' };
|
||||
const manifestTags = (m) => {
|
||||
if (!m) return '';
|
||||
const parts = [];
|
||||
if (m.priority && m.priority !== 'medium') {
|
||||
parts.push(c(PRIORITY_COLOR[m.priority] || 'gray', `⚑${m.priority}`));
|
||||
}
|
||||
if (m.profile) parts.push(c('blue', `◆${m.profile}`));
|
||||
if (m.capabilities) {
|
||||
const caps = String(m.capabilities).replace(/^\[|\]$/g, '').trim();
|
||||
if (caps) parts.push(c('gray', `caps ${trunc(caps, 36)}`));
|
||||
}
|
||||
if (m.tracker_item) parts.push(c('cyan', trackerTag(m.tracker_item)));
|
||||
return parts.join(' ');
|
||||
};
|
||||
|
||||
// pidAlive(pid) — true when a process with this pid currently exists.
// `pid` may be a number or a numeric string (meta files store strings).
// Returns false for empty, non-numeric, zero or negative values: 0 and negative
// pids address process groups, so probing them says nothing about one worker.
const pidAlive = (pid) => {
  const n = Number(pid);
  if (!Number.isInteger(n) || n <= 0) return false;
  try {
    process.kill(n, 0); // signal 0: existence/permission probe, nothing is delivered
    return true;
  } catch (e) {
    // EPERM means the process exists but is owned by someone else — still alive.
    return Boolean(e) && e.code === 'EPERM';
  }
};
|
||||
|
||||
const lastLogLine = (job) => {
|
||||
try {
|
||||
const txt = fs.readFileSync(path.join(DIRS.logs, `${job}.log`), 'utf8');
|
||||
const lines = txt.split('\n').map((l) => l.trim()).filter(Boolean);
|
||||
return lines.length ? lines[lines.length - 1] : '';
|
||||
} catch { return ''; }
|
||||
};
|
||||
|
||||
// seconds since a job's log was last modified (no new agent output); null if no log
|
||||
const logAgeSec = (job) => {
|
||||
try {
|
||||
const mt = fs.statSync(path.join(DIRS.logs, `${job}.log`)).mtimeMs;
|
||||
return Math.max(0, Math.floor((Date.now() - mt) / 1000));
|
||||
} catch { return null; }
|
||||
};
|
||||
|
||||
// fmtElapsed(startSec) — time elapsed since `startSec` (epoch seconds, number or
// numeric string) as `XhYYm` once an hour has passed, otherwise `XmYYs`.
// A missing or non-numeric start yields the ' -- ' placeholder (instead of
// "NaNmNaNs"); a start in the future clamps to zero.
const fmtElapsed = (startSec) => {
  const start = Number(startSec);
  if (!startSec || !Number.isFinite(start)) return ' -- ';
  const s = Math.max(0, Math.floor(Date.now() / 1000) - start);
  const m = Math.floor(s / 60);
  const h = Math.floor(m / 60);
  if (h > 0) return `${h}h${String(m % 60).padStart(2, '0')}m`;
  return `${m}m${String(s % 60).padStart(2, '0')}s`;
};
|
||||
|
||||
// trunc(s, n) — clip `s` to at most `n` characters, ending in '…' when cut.
// Input is coerced to a string first (null/undefined become ''), so a numeric
// id or a missing field cannot throw here or in a chained .padEnd().
const trunc = (s, n) => {
  const str = s == null ? '' : String(s);
  return str.length > n ? str.slice(0, n - 1) + '…' : str;
};
|
||||
// shortPath(p) — abbreviate a leading $HOME directory in `p` to '~' for display.
// Only a true prefix is rewritten ($HOME itself, or $HOME followed by '/'); a
// path that merely contains the home string elsewhere is returned unchanged.
// A falsy `p` yields ''; with HOME unset (or '/') the path is returned as-is.
const shortPath = (p) => {
  const full = p || '';
  const home = (process.env.HOME || '').replace(/\/+$/, '');
  if (!home) return full;
  if (full === home) return '~';
  return full.startsWith(`${home}/`) ? `~${full.slice(home.length)}` : full;
};
|
||||
|
||||
const readMetas = () => {
|
||||
let files = [];
|
||||
try { files = fs.readdirSync(DIRS.state).filter((f) => f.endsWith('.meta')); }
|
||||
catch { /* ignore */ }
|
||||
return files.map((f) => parseMeta(path.join(DIRS.state, f)));
|
||||
};
|
||||
|
||||
// readManifest(stage, job) — manifest tags for a job that has no launched meta
|
||||
// yet (e.g. queued in inbox/). Parses the leading --- frontmatter block of the
|
||||
// job's .md and maps the few fields manifestTags renders. Never throws.
|
||||
const FM_TAG_KEYS = {
|
||||
priority: 'priority', profile: 'profile',
|
||||
capabilities: 'capabilities', 'tracker-item': 'tracker_item',
|
||||
};
|
||||
const readManifest = (stage, job) => {
|
||||
const out = {};
|
||||
try {
|
||||
const lines = fs.readFileSync(path.join(DIRS[stage], `${job}.md`), 'utf8').split('\n');
|
||||
if ((lines[0] || '').trim() !== '---') return out;
|
||||
for (let i = 1; i < lines.length; i++) {
|
||||
if (lines[i].trim() === '---') break;
|
||||
const line = lines[i].replace(/^\s+/, '');
|
||||
const ci = line.indexOf(':');
|
||||
if (ci <= 0) continue;
|
||||
const key = line.slice(0, ci).trim();
|
||||
if (!FM_TAG_KEYS[key]) continue;
|
||||
out[FM_TAG_KEYS[key]] = line.slice(ci + 1).trim().replace(/^["']|["']$/g, '');
|
||||
}
|
||||
} catch { /* ignore */ }
|
||||
return out;
|
||||
};
|
||||
|
||||
// ── agent-queue.sh control (single source of truth) ─────────────────
|
||||
const AQ = path.join(__dirname, 'agent-queue.sh');
|
||||
const stripAnsi = (s) => (s || '').replace(/\x1b\[[0-9;]*m/g, '');
|
||||
const lastLine = (s) => {
|
||||
const lines = stripAnsi(s).split('\n').map((l) => l.trim()).filter(Boolean);
|
||||
return lines.length ? lines[lines.length - 1] : '';
|
||||
};
|
||||
|
||||
// aq(args) — run an agent-queue.sh subcommand, capturing output (never throws).
|
||||
const aq = (args) => {
|
||||
try {
|
||||
const out = execFileSync('bash', [AQ, ...args], {
|
||||
encoding: 'utf8', stdio: ['ignore', 'pipe', 'pipe'],
|
||||
env: { ...process.env, AGENT_QUEUE_ROOT: ROOT },
|
||||
});
|
||||
return { ok: true, out };
|
||||
} catch (e) {
|
||||
return { ok: false, out: ((e.stdout || '') + (e.stderr || '') || e.message || '').toString() };
|
||||
}
|
||||
};
|
||||
|
||||
// ── fleet mode (Phase 3: TUI re-pointed at /fleet) ──────────────────
|
||||
// Opt-in via AQ_FLEET_DASH=1. When ON, the board is sourced from the
|
||||
// platform-service /fleet REST API instead of the local queue; when OFF, every
|
||||
// fleet code path below is skipped and the dashboard is byte-for-byte the local
|
||||
// tool. All fleet I/O lives in lib/fleet-dash.mjs (injectable + unit-tested).
|
||||
const FLEET = fleetConfig();
|
||||
// Latest fleet board snapshot. On a refresh failure we KEEP the last good board
|
||||
// (no destructive flicker) and surface a staleness banner instead.
|
||||
let fleetState = { board: null, error: null, loading: FLEET.enabled, lastOk: 0 };
|
||||
let fleetRefreshing = false; // single-flight guard (no overlapping fetches)
|
||||
let fleetEvents = { jobId: null, lines: [], error: null, loading: false };
|
||||
|
||||
// refreshFleet() — single-flight board refresh. Applies a successful board;
|
||||
// on failure preserves the previous board and records the error.
|
||||
const refreshFleet = async () => {
|
||||
if (fleetRefreshing) return;
|
||||
fleetRefreshing = true;
|
||||
try {
|
||||
const r = await fetchBoard(FLEET);
|
||||
if (r.ok) fleetState = { board: r.board, error: null, loading: false, lastOk: Date.now() };
|
||||
else fleetState = { ...fleetState, error: r.error, loading: false };
|
||||
} catch (e) {
|
||||
fleetState = { ...fleetState, error: (e && e.message) || 'refresh failed', loading: false };
|
||||
} finally {
|
||||
fleetRefreshing = false;
|
||||
}
|
||||
};
|
||||
|
||||
// refreshFleetEvents(job) — load a job's event stream into the log view state.
// Marks the state as loading immediately, then applies the reply when it lands.
// A reply is dropped if a later call has since switched the state to another
// job, and a rejected fetch is recorded as an error instead of escaping as an
// unhandled rejection. Redraws only while that job's log view is open.
const refreshFleetEvents = (job) => {
  fleetEvents = { jobId: job, lines: [], error: null, loading: true };
  const settle = (lines, error) => {
    if (fleetEvents.jobId !== job) return; // superseded by a newer request
    fleetEvents = { jobId: job, lines, error, loading: false };
    if (mode === 'log' && logJob === job) draw();
  };
  fetchEvents(FLEET, job).then(
    (r) => settle(r.lines || [], r.ok ? null : r.error),
    (e) => settle([], (e && e.message) || 'events fetch failed'),
  );
};
|
||||
|
||||
|
||||
// daemonPid() — pid of a live `run` loop, or null.
|
||||
const daemonPid = () => {
|
||||
try {
|
||||
const pid = fs.readFileSync(path.join(DIRS.state, 'daemon.pid'), 'utf8').trim();
|
||||
return pid && pidAlive(pid) ? pid : null;
|
||||
} catch { return null; }
|
||||
};
|
||||
|
||||
// startRun() — spawn a detached `run` loop writing to logs/run-loop.log.
// No-op (with a flash) when daemonPid() reports a live loop. The log fd is
// handed to the child for stdout/stderr and then closed in this process so
// repeated starts do not leak descriptors. Asynchronous spawn failures (e.g.
// `bash` not found) arrive as an 'error' event and are flashed rather than
// thrown as an unhandled event.
const startRun = () => {
  if (daemonPid()) { setFlash(c('yellow', 'run loop already active')); return; }
  let fd;
  try {
    fd = fs.openSync(path.join(DIRS.logs, 'run-loop.log'), 'a');
    const child = spawn('bash', [AQ, 'run'], {
      detached: true, stdio: ['ignore', fd, fd],
      env: { ...process.env, AGENT_QUEUE_ROOT: ROOT },
    });
    child.on('error', (e) => setFlash(c('red', `run failed: ${e.message}`)));
    child.unref();
    setFlash(c('green', `▶ run loop started (max ${process.env.AGENT_QUEUE_MAX || 3})`));
  } catch (e) {
    setFlash(c('red', `run failed: ${e.message}`));
  } finally {
    // The child holds its own copy of the descriptor; ours is no longer needed.
    if (fd !== undefined) {
      try { fs.closeSync(fd); } catch { /* ignore */ }
    }
  }
};
|
||||
|
||||
// ── interactive state ───────────────────────────────────────────────
|
||||
const INTERACTIVE = !!process.stdin.isTTY;
|
||||
const ACTION_STAGES = ['review', 'testing', 'failed', 'inbox'];
|
||||
let mode = 'board'; // 'board' | 'log' | 'help' | 'confirm'
|
||||
let items = []; // actionable jobs, rebuilt each draw
|
||||
let selIdx = 0; // selected index into items
|
||||
let selJob = null; // selected job name (stable across refreshes)
|
||||
let flash = ''; // transient status message
|
||||
let flashUntil = 0;
|
||||
let logJob = null; // job whose log is being viewed
|
||||
let confirmAction = null; // { verb, job, run }
|
||||
|
||||
const setFlash = (msg, ms = 4000) => { flash = msg; flashUntil = Date.now() + ms; };
|
||||
const flashLine = () => (flash && Date.now() < flashUntil ? flash : '');
|
||||
|
||||
const buildItems = () => {
|
||||
if (FLEET.enabled) {
|
||||
const b = fleetState.board;
|
||||
if (!b) return [];
|
||||
return b.items.map((it) => ({ stage: it.stage, job: it.id, fleet: it }));
|
||||
}
|
||||
const list = [];
|
||||
for (const st of ACTION_STAGES) {
|
||||
for (const f of listMd(DIRS[st]).sort()) list.push({ stage: st, job: f.replace(/\.md$/, '') });
|
||||
}
|
||||
return list;
|
||||
};
|
||||
|
||||
const syncSelection = () => {
|
||||
if (selJob) {
|
||||
const i = items.findIndex((it) => it.job === selJob);
|
||||
if (i >= 0) { selIdx = i; return; }
|
||||
}
|
||||
selIdx = Math.max(0, Math.min(selIdx, items.length - 1));
|
||||
selJob = items[selIdx]?.job ?? null;
|
||||
};
|
||||
|
||||
const STAGE_TAG = {
|
||||
review: () => c('cyan', '[review ]'),
|
||||
testing: () => c('cyan', '[testing]'),
|
||||
failed: () => c('red', '[failed ]'),
|
||||
inbox: () => c('blue', '[inbox ]'),
|
||||
};
|
||||
|
||||
// gate(verb, stage) — whether `verb` is allowed for a job sitting in `stage`.
//   promote / reject : review or testing
//   ship             : testing only
//   requeue          : failed, review or testing
//   logs             : always
// The result is a plain property lookup, so an unlisted verb yields undefined.
const gate = (verb, stage) => {
  const inQa = stage === 'review' || stage === 'testing';
  const rules = {
    promote: inQa,
    ship: stage === 'testing',
    reject: inQa,
    requeue: stage === 'failed' || inQa,
    logs: true,
  };
  return rules[verb];
};
|
||||
|
||||
// doAction(verb) — run the gated action on the selected job. In fleet mode it
// calls the /fleet API (lib/fleet-dash.mjs); otherwise it shells out to
// agent-queue.sh. promote is unavailable in fleet mode (no safe server contract).
//
// reject/requeue are two-step: the first call arms a confirmation
// (mode = 'confirm'); the second call, made while confirming, performs it.
// The confirmed call targets the job named in the prompt, looked up by name,
// not whatever happens to be selected after an intervening redraw. If that job
// is no longer listed the action is cancelled.
const doAction = (verb) => {
  const confirming = mode === 'confirm' && !!confirmAction && confirmAction.verb === verb;
  const endConfirm = () => { mode = 'board'; confirmAction = null; };

  const it = confirming
    ? items.find((x) => x.job === confirmAction.job)
    : items[selIdx];
  if (!it) {
    if (confirming) {
      setFlash(c('gray', `${verb} cancelled: ${confirmAction.job} is no longer listed`));
      endConfirm();
    } else {
      setFlash(c('gray', 'no job selected'));
    }
    return;
  }
  if (FLEET.enabled && verb === 'promote') {
    setFlash(c('gray', 'promote n/a in fleet mode (use ship/requeue/reject)'));
    return;
  }
  if (!gate(verb, it.stage)) {
    setFlash(c('gray', `${verb} not valid for a ${it.stage} job`));
    // The job changed stage while the prompt was up — don't leave it armed.
    if (confirming) endConfirm();
    return;
  }
  if ((verb === 'reject' || verb === 'requeue') && mode !== 'confirm') {
    confirmAction = { verb, job: it.job, run: () => doAction(verb) };
    mode = 'confirm';
    return;
  }
  if (FLEET.enabled) {
    setFlash(c('gray', `${verb}…`));
    endConfirm();
    jobAction(FLEET, it.fleet, verb)
      .then(
        (r) => setFlash((r.ok ? c('green', '✓ ') : c('red', '✗ ')) + (r.message || `${verb} ${it.job}`)),
        // A rejected request is reported like a failed one instead of going unhandled.
        (e) => setFlash(c('red', '✗ ') + ((e && e.message) || `${verb} ${it.job}`)),
      )
      .then(() => refreshFleet())
      .then(draw);
    return;
  }
  const r = aq([verb, it.job]);
  setFlash((r.ok ? c('green', '✓ ') : c('red', '✗ ')) + (lastLine(r.out) || `${verb} ${it.job}`));
  endConfirm();
};
|
||||
|
||||
// ── render ──────────────────────────────────────────────────────────
|
||||
const ENGINE_COLOR = { devin: 'cyan', claude: 'yellow', codex: 'green' };
|
||||
|
||||
const FLEET_STAGE_COLOR = {
|
||||
queued: 'blue', assigned: 'yellow', building: 'yellow',
|
||||
review: 'cyan', testing: 'cyan', shipped: 'green',
|
||||
failed: 'red', dead_letter: 'red',
|
||||
};
|
||||
|
||||
// drawFleetBoard() — the board sourced from the /fleet API (AQ_FLEET_DASH=1).
|
||||
// Mirrors the local board layout; running rows reflect lease/factory status
|
||||
// (there is no local PID/liveness in fleet mode).
|
||||
function drawFleetBoard() {
|
||||
items = buildItems();
|
||||
syncSelection();
|
||||
const board = fleetState.board;
|
||||
const counts = board ? board.counts : { inbox: 0, building: 0, review: 0, testing: 0, shipped: 0, failed: 0 };
|
||||
const running = board ? board.running : [];
|
||||
const recent = board ? board.recent : [];
|
||||
|
||||
const out = [];
|
||||
out.push('');
|
||||
out.push(` ${C.bold}AGENT QUEUE${C.reset} ${c('cyan', 'fleet')} ${c('gray', FLEET.api)}`);
|
||||
const staleSec = fleetState.lastOk ? Math.floor((Date.now() - fleetState.lastOk) / 1000) : null;
|
||||
let statusBit;
|
||||
if (fleetState.loading && !board) statusBit = c('gray', '◌ loading…');
|
||||
else if (fleetState.error) statusBit = c('red', `⚠ ${trunc(fleetState.error, 40)}${board ? ` (stale ${staleSec}s)` : ''}`);
|
||||
else statusBit = c('green', `● live${staleSec !== null ? ` (${staleSec}s ago)` : ''}`);
|
||||
out.push(
|
||||
` ${c('gray', new Date().toLocaleTimeString())} refresh ${INTERVAL / 1000}s ${statusBit}` +
|
||||
` ${c('gray', `product ${FLEET.productId}`)} ${c('gray', INTERACTIVE ? 'press ? for help' : 'read-only')}`
|
||||
);
|
||||
out.push('');
|
||||
out.push(
|
||||
` ${c('blue', '▢ inbox')} ${String(counts.inbox).padEnd(3)}` +
|
||||
` ${c('yellow', '◧ building')} ${String(counts.building).padEnd(3)}` +
|
||||
` ${c('cyan', '◔ review')} ${String(counts.review).padEnd(3)}` +
|
||||
` ${c('cyan', '◕ testing')} ${String(counts.testing).padEnd(3)}` +
|
||||
` ${c('green', '▣ shipped')} ${String(counts.shipped).padEnd(3)}` +
|
||||
` ${c('red', '✕ failed')} ${String(counts.failed).padEnd(3)}` +
|
||||
` ${C.bold}running ${running.length}${C.reset}`
|
||||
);
|
||||
out.push('');
|
||||
|
||||
// factories (per-factory rows when /fleet/factories exists; else metrics aggregate)
|
||||
out.push(` ${C.bold}FACTORIES${C.reset}`);
|
||||
const factories = board ? board.factories : [];
|
||||
const metrics = board ? board.metrics : null;
|
||||
if (factories.length > 0) {
|
||||
for (const f of factories) {
|
||||
const health = String(f.health || 'ok');
|
||||
const hc = health === 'ok' ? 'green' : health === 'degraded' ? 'yellow' : 'red';
|
||||
out.push(
|
||||
` ${c('bold', trunc(f.factoryId || f.id || '?', 24).padEnd(24))} ` +
|
||||
`${c(hc, health.padEnd(9))} ` +
|
||||
`${c('gray', `load ${f.load ?? '?'}/${f.seatLimit ?? '?'}`)} ` +
|
||||
`${c('gray', trunc((Array.isArray(f.capabilities) ? f.capabilities.join(', ') : f.capabilities) || '', 36))}`
|
||||
);
|
||||
}
|
||||
} else if (metrics && metrics.factory) {
|
||||
const fm = metrics.factory;
|
||||
const bh = fm.byHealth || {};
|
||||
out.push(
|
||||
` ${c('green', `ok ${bh.ok ?? 0}`)} ${c('yellow', `degraded ${bh.degraded ?? 0}`)} ${c('red', `down ${bh.down ?? 0}`)}` +
|
||||
` ${c('gray', `live ${fm.live ?? '?'} · stale ${fm.stale ?? '?'}`)}` +
|
||||
` ${c('gray', `seats ${fm.seatsUsed ?? '?'}/${fm.seatsTotal ?? '?'}`)}` +
|
||||
` ${c('gray', `util ${metrics.utilizationPct ?? '?'}%`)}`
|
||||
);
|
||||
} else {
|
||||
out.push(` ${c('dim', 'no factory data')}`);
|
||||
}
|
||||
// alerts (from metrics)
|
||||
if (metrics && Array.isArray(metrics.alerts) && metrics.alerts.length > 0) {
|
||||
for (const a of metrics.alerts) {
|
||||
const sev = a.severity === 'critical' ? 'red' : 'yellow';
|
||||
out.push(` ${c(sev, '⚠ ')}${c(sev, a.kind || 'alert')}${a.message ? c('gray', ` — ${trunc(a.message, 50)}`) : ''}`);
|
||||
}
|
||||
}
|
||||
out.push('');
|
||||
|
||||
// running (lease/factory status — no local pid)
|
||||
out.push(` ${C.bold}RUNNING${C.reset}`);
|
||||
if (running.length === 0) {
|
||||
out.push(` ${c('dim', 'no jobs in flight')}`);
|
||||
} else {
|
||||
for (const r of running) {
|
||||
const sc = FLEET_STAGE_COLOR[r.fleetStage] || 'gray';
|
||||
out.push(
|
||||
` ${c('bold', trunc(r.id, 30).padEnd(30))} ` +
|
||||
`${c(sc, String(r.fleetStage).padEnd(9))} ` +
|
||||
`${c('gray', r.factoryId ? `@${trunc(r.factoryId, 18)}` : 'unassigned')}`
|
||||
);
|
||||
const mtags = manifestTags(r);
|
||||
if (mtags) out.push(` ${mtags}`);
|
||||
}
|
||||
}
|
||||
out.push('');
|
||||
|
||||
// actionable jobs (numbered + selectable) — reuses STAGE_TAG buckets
|
||||
out.push(` ${C.bold}JOBS${C.reset} ${c('gray', '(review · testing · failed · inbox)')}`);
|
||||
if (items.length === 0) {
|
||||
out.push(` ${c('dim', board ? 'no actionable jobs' : 'waiting for fleet…')}`);
|
||||
} else {
|
||||
items.forEach((it, i) => {
|
||||
const sel = i === selIdx;
|
||||
const ptr = sel ? c('cyan', '▶') : ' ';
|
||||
const num = c('gray', String(i + 1).padStart(2) + '.');
|
||||
const tag = (STAGE_TAG[it.stage] || (() => `[${it.stage}]`))();
|
||||
const name = sel ? `${C.bold}${trunc(it.job, 46)}${C.reset}` : trunc(it.job, 46);
|
||||
out.push(` ${ptr} ${num} ${tag} ${name}`);
|
||||
const jtags = manifestTags(it.fleet);
|
||||
if (jtags) out.push(` ${jtags}`);
|
||||
});
|
||||
}
|
||||
out.push('');
|
||||
|
||||
// recent (shipped + failed)
|
||||
out.push(` ${C.bold}RECENT${C.reset}`);
|
||||
if (recent.length === 0) {
|
||||
out.push(` ${c('dim', 'nothing finished yet')}`);
|
||||
} else {
|
||||
for (const r of recent) {
|
||||
const failedRes = r.stage === 'failed';
|
||||
const mark = failedRes ? c('red', '✕') : c('green', '▣');
|
||||
const label = failedRes ? c('red', r.fleetStage) : c('green', r.fleetStage);
|
||||
const when = r.updatedAt ? new Date(r.updatedAt).toLocaleTimeString() : '';
|
||||
out.push(` ${mark} ${trunc(r.id, 34).padEnd(34)} ${label} ${c('gray', when)}`);
|
||||
}
|
||||
}
|
||||
out.push('');
|
||||
|
||||
// flash + footer
|
||||
const fl = flashLine();
|
||||
if (fl) out.push(` ${fl}`);
|
||||
if (mode === 'confirm' && confirmAction) {
|
||||
out.push(` ${c('yellow', `${confirmAction.verb} "${confirmAction.job}" ? `)}${C.bold}y${C.reset}${c('gray', '/')}${C.bold}n${C.reset}`);
|
||||
} else if (INTERACTIVE) {
|
||||
out.push(c('gray', ' ↑/↓ select · enter events · s ship · x reject · u requeue'));
|
||||
out.push(c('gray', ' g refresh · ? help · q quit'));
|
||||
}
|
||||
process.stdout.write('\x1b[2J\x1b[H' + out.join('\n') + '\n');
|
||||
}
|
||||
|
||||
// Paint the local (non-fleet) board in a single terminal write: header, per-stage
// counts, RUNNING workers, the selectable JOBS list, RECENT results and the footer.
// Hands off to drawFleetBoard() when fleet mode is enabled.
// Side effect: rebuilds the module-level `items` list and re-syncs the selection.
function drawBoard() {
  if (FLEET.enabled) return drawFleetBoard();

  const metas = readMetas();
  const metaByJob = Object.fromEntries(
    metas.filter((meta) => meta.job).map((meta) => [meta.job, meta])
  );
  // Running = not marked ended and pidAlive() still reports the worker pid.
  const running = metas.filter((meta) => !meta.ended && pidAlive(meta.pid));
  // Finished metas, newest `ended` first.
  const finished = metas.filter((meta) => meta.ended);
  finished.sort((x, y) => Number(y.ended) - Number(x.ended));

  // One count per stage directory, gathered in lifecycle order.
  const counts = {};
  for (const stage of ['inbox', 'building', 'review', 'testing', 'shipped', 'failed']) {
    counts[stage] = count(DIRS[stage]);
  }

  // Refresh the actionable list first, then let syncSelection() settle the cursor.
  items = buildItems();
  syncSelection();

  const loop = daemonPid();
  const out = [''];

  // ── header ──
  out.push(` ${C.bold}AGENT QUEUE${C.reset} ${c('gray', ROOT)}`);
  const clock = c('gray', new Date().toLocaleTimeString());
  const loopState = loop
    ? c('green', `● run loop pid ${loop}`)
    : c('gray', '○ run loop stopped');
  const hint = c('gray', INTERACTIVE ? 'press ? for help' : 'read-only');
  out.push(` ${clock} refresh ${INTERVAL / 1000}s ${loopState} ${hint}`);
  out.push('');

  // ── stage counters ──
  const stageCells = [
    ['blue', '▢ inbox', counts.inbox],
    ['yellow', '◧ building', counts.building],
    ['cyan', '◔ review', counts.review],
    ['cyan', '◕ testing', counts.testing],
    ['green', '▣ shipped', counts.shipped],
    ['red', '✕ failed', counts.failed],
  ].map(([color, label, n]) => ` ${c(color, label)} ${String(n).padEnd(3)}`);
  out.push(stageCells.join('') + ` ${C.bold}running ${running.length}${C.reset}`);
  out.push('');

  // ── running table ──
  out.push(` ${C.bold}RUNNING${C.reset}`);
  if (running.length > 0) {
    running.forEach((meta) => {
      const engine = meta.engine || '?';
      const engineColor = ENGINE_COLOR[engine] || 'gray';
      const idleSec = logAgeSec(meta.job);
      // Flagged once the log age exceeds STALL_MIN minutes (null age = not flagged).
      const isStalled = idleSec !== null && idleSec > STALL_MIN * 60;
      const cells = [
        c('bold', trunc(meta.job || '?', 30).padEnd(30)),
        c(engineColor, engine.padEnd(7)),
        fmtElapsed(meta.started).padStart(7),
        c('gray', 'pid ' + (meta.pid || '?')),
      ];
      const stallMark = isStalled ? ' ' + c('red', '⚠ stalled') : '';
      out.push(` ${cells.join(' ')}${stallMark}`);
      out.push(` ${c('dim', trunc(shortPath(meta.cwd || ''), 70))}`);

      const tags = manifestTags(meta);
      if (tags) out.push(` ${tags}`);

      const tail = lastLogLine(meta.job);
      if (tail) out.push(` ${c('cyan', '› ')}${c('dim', trunc(tail, 70))}`);
    });
  } else {
    out.push(` ${c('dim', 'no workers running')}`);
  }
  out.push('');

  // ── actionable jobs (numbered + selectable) ──
  out.push(` ${C.bold}JOBS${C.reset} ${c('gray', '(review · testing · failed · inbox)')}`);
  if (items.length > 0) {
    for (const [idx, item] of items.entries()) {
      const selected = idx === selIdx;
      const pointer = selected ? c('cyan', '▶') : ' ';
      const ordinal = c('gray', String(idx + 1).padStart(2) + '.');
      const tagFn = STAGE_TAG[item.stage];
      const tag = tagFn ? tagFn() : `[${item.stage}]`;
      const shortName = trunc(item.job, 46);
      const name = selected ? `${C.bold}${shortName}${C.reset}` : shortName;
      out.push(` ${pointer} ${ordinal} ${tag} ${name}`);

      // Prefer the run meta for this job; otherwise read the manifest for its stage.
      const tags = manifestTags(metaByJob[item.job] || readManifest(item.stage, item.job));
      if (tags) out.push(` ${tags}`);
    }
  } else {
    out.push(` ${c('dim', 'no actionable jobs')}`);
  }
  out.push('');

  // ── recent finished (up to 5) ──
  out.push(` ${C.bold}RECENT${C.reset}`);
  const recent = finished.slice(0, 5);
  const failedResults = new Set([
    'failed', 'timeout', 'verify_failed', 'rejected',
    'retries_exhausted', 'capability_mismatch', 'budget_exceeded', 'no_engine',
  ]);
  if (recent.length > 0) {
    recent.forEach((meta) => {
      const res = meta.result || '';
      const mark = failedResults.has(res) ? c('red', '✕') : c('green', '▣');
      // `ended` is multiplied by 1000 before being handed to Date.
      const when = meta.ended ? new Date(Number(meta.ended) * 1000).toLocaleTimeString() : '';

      let label;
      switch (res) {
        case 'shipped': label = c('green', 'shipped'); break;
        case 'testing': label = c('cyan', 'testing (QA)'); break;
        case 'review': label = c('cyan', 'review'); break;
        case 'verify_failed': label = c('red', 'verify failed'); break;
        case 'timeout': label = c('red', 'timeout'); break;
        case 'budget_exceeded': label = c('red', 'budget exceeded'); break;
        case 'rejected': label = c('red', 'rejected'); break;
        case 'retries_exhausted': label = c('red', 'retries exhausted'); break;
        case 'failed': label = c('red', 'failed rc=' + (meta.exit || '?')); break;
        default: label = c('gray', res || '?');
      }

      const jobCell = trunc(meta.job || '?', 34).padEnd(34);
      const engineCell = c('gray', (meta.engine || '').padEnd(7));
      out.push(` ${mark} ${jobCell} ${engineCell} ${label} ${c('gray', when)} ${c('cyan', insightsTag(meta))}`);
    });
  } else {
    out.push(` ${c('dim', 'nothing finished yet')}`);
  }
  out.push('');

  // ── flash + footer ──
  const flash = flashLine();
  if (flash) out.push(` ${flash}`);
  if (mode === 'confirm' && confirmAction) {
    const question = c('yellow', `${confirmAction.verb} "${confirmAction.job}" ? `);
    out.push(` ${question}${C.bold}y${C.reset}${c('gray', '/')}${C.bold}n${C.reset}`);
  } else if (INTERACTIVE) {
    out.push(c('gray', ' ↑/↓ select · enter logs · p promote · s ship · x reject · u requeue'));
    out.push(c('gray', ' r run · S stop · g refresh · ? help · q quit'));
  }

  // Clear screen + home cursor, then emit the whole frame at once.
  process.stdout.write('\x1b[2J\x1b[H' + out.join('\n') + '\n');
}
|
||||
|
||||
// Render the detail view for the selected job (`logJob`): its fleet events when
// fleet mode is enabled, otherwise the tail of `<DIRS.logs>/<logJob>.log`.
// The view is clipped to the terminal height so the header stays visible.
function drawLog() {
  // Six rows are reserved for the header, rule and padding. Clamp to at least 1:
  // on a terminal of 6 rows or fewer the raw value is 0 or negative, and
  // `slice(-0)` would dump the entire log while a negative count would instead
  // drop lines from the start.
  const rows = Math.max(1, (process.stdout.rows || 30) - 6);

  // Clear the screen, home the cursor, then print header, rule and body.
  const render = (title, body) => {
    const head = ` ${C.bold}${title}${C.reset} ${c('cyan', logJob)} ${c('gray', 'q/esc back · g refresh')}`;
    process.stdout.write('\x1b[2J\x1b[H' + head + '\n' + c('gray', ' ' + '─'.repeat(60)) + '\n' + body + '\n');
  };

  if (FLEET.enabled) {
    let body;
    if (fleetEvents.loading) body = c('gray', ' loading events…');
    else if (fleetEvents.error) body = c('red', ` ${fleetEvents.error}`);
    else if (!fleetEvents.lines.length) body = c('gray', ' no events for this job');
    else body = fleetEvents.lines.slice(-rows).join('\n');
    render('EVENTS', body);
    return;
  }

  let body = `no log for ${logJob}`;
  try {
    const txt = fs.readFileSync(path.join(DIRS.logs, `${logJob}.log`), 'utf8');
    body = txt.split('\n').slice(-rows).join('\n');
  } catch { /* unreadable or missing log: keep the default message */ }
  render('LOG', body);
}
|
||||
|
||||
// Full-screen key reference. Fleet mode adds a banner line and changes the
// wording for enter/l; every other row is the same in both modes.
function drawHelp() {
  const keyRow = (keys, text) => ` ${c('cyan', keys)} ${text}`;
  const note = (text) => ` ${c('gray', text)}`;
  const confirmTag = () => c('gray', '[confirm]');

  const fleetBanner = FLEET.enabled
    ? ` ${c('cyan', 'fleet mode')} ${c('gray', '— board sourced from /fleet API; run/stop/promote disabled')}`
    : '';
  const enterText = FLEET.enabled
    ? "view the selected job's events"
    : "view the selected job's log (live)";

  const lines = [];
  lines.push('', ` ${C.bold}AGENT QUEUE — keys${C.reset}`);
  lines.push(fleetBanner, '');

  // job selection + lifecycle actions
  lines.push(keyRow('↑/↓, j/k, 1-9', 'select a job in the JOBS list'));
  lines.push(keyRow('enter / l', enterText));
  lines.push(keyRow('p', 'promote (review → testing → shipped)'));
  lines.push(keyRow('s', 'ship (testing/QA → shipped, the manual gate)'));
  lines.push(keyRow('x', `reject (review/testing → failed) ${confirmTag()}`));
  lines.push(keyRow('u', `requeue (failed/review/testing → inbox) ${confirmTag()}`));
  lines.push('');

  // run loop + general keys
  lines.push(keyRow('r', `start the run loop (detached, max ${process.env.AGENT_QUEUE_MAX || 3})`));
  lines.push(keyRow('S', 'stop the run loop + running workers'));
  lines.push(keyRow('g', 'refresh now'));
  lines.push(keyRow('? / h', 'toggle this help'));
  lines.push(keyRow('q / Ctrl-C', 'quit'));
  lines.push('');

  // lifecycle summary
  lines.push(note('Lifecycle: inbox → building → review → testing → shipped (+ failed)'));
  lines.push(note('auto: rc=0 → review; verify pass → testing; verify fail → failed'));
  lines.push(note('manual: ship (testing → shipped)'));
  lines.push('', note('press any key to return'), '');

  process.stdout.write('\x1b[2J\x1b[H' + lines.join('\n') + '\n');
}
|
||||
|
||||
// Repaint whichever screen the current `mode` selects; anything other than
// 'log' or 'help' (including 'confirm') renders the board.
const draw = () => {
  switch (mode) {
    case 'log':
      drawLog();
      break;
    case 'help':
      drawHelp();
      break;
    default:
      drawBoard();
  }
};
|
||||
|
||||
// ── main loop + key handling ────────────────────────────────────────
|
||||
// Fleet mode (AQ_FLEET_DASH=1) sources the board from the /fleet API on an async,
// single-flight tick loop. Local mode keeps the original synchronous setInterval
// path unchanged. `timer` holds whichever timer is live so quit() can clear it.
let timer = null;

if (FLEET.enabled) {
  if (!FLEET.ok) {
    process.stdout.write(
      `agent-queue: fleet dashboard enabled (AQ_FLEET_DASH=1) but missing config:\n` +
      ` ${FLEET.missing.join(', ')}\n` +
      `Set AQ_FLEET_API, AQ_FLEET_TOKEN and AQ_PRODUCT_ID, or unset AQ_FLEET_DASH.\n`
    );
    process.exit(1);
  }
  draw(); // initial frame (loading…)
  const tick = async () => {
    try {
      await refreshFleet();
    } catch (err) {
      // A rejected refresh must not end polling: without this guard the next
      // setTimeout is never scheduled and the board silently freezes (or the
      // process dies on the unhandled rejection). Show the failure and retry
      // on the next tick instead.
      const reason = err instanceof Error ? err.message : String(err);
      setFlash(c('red', `fleet refresh failed: ${reason}`));
    }
    // Do not repaint over the log/help screens; they redraw on key input.
    if (mode !== 'log' && mode !== 'help') draw();
    timer = setTimeout(tick, INTERVAL); // single-flight: schedule only after the await
  };
  tick();
} else {
  if (!fs.existsSync(ROOT)) {
    process.stdout.write(`agent-queue: queue root not found: ${ROOT}\nRun \`agent-queue.sh init\` first.\n`);
    process.exit(1);
  }
  draw();
  timer = setInterval(draw, INTERVAL);
}
|
||||
|
||||
// Shut the dashboard down: stop the refresh timer, put the terminal back into
// cooked mode, reset colours and exit with status 0.
const quit = () => {
  if (timer) {
    // `timer` may come from setTimeout (fleet) or setInterval (local); clear both ways.
    clearTimeout(timer);
    clearInterval(timer);
  }
  try {
    if (process.stdin.isTTY) {
      process.stdin.setRawMode(false);
    }
  } catch {
    /* noop */
  }
  process.stdout.write(`${C.reset}\n`);
  process.exit(0);
};
|
||||
|
||||
// Move the JOBS cursor by `delta`, wrapping at both ends, and record the job
// under the cursor in `selJob`. Does nothing while the list is empty.
const moveSel = (delta) => {
  const total = items.length;
  if (total === 0) return;
  selIdx = (selIdx + delta + total) % total;
  const picked = items[selIdx];
  selJob = picked?.job ?? null;
};
|
||||
|
||||
// Keyboard handler for stdin 'data' events. Ctrl-C quits from anywhere; the
// help, log and confirm screens consume the key themselves; everything else is
// treated as a board command. Unknown board keys return without a redraw.
function onKey(key) {
  if (key === '\u0003') return quit(); // Ctrl-C always quits

  switch (mode) {
    case 'help':
      // any key leaves the help screen
      mode = 'board';
      return draw();

    case 'log':
      if (['q', '\u001b', '\r', '\n'].includes(key)) {
        mode = 'board';
        logJob = null;
      } else if (key === 'g' && FLEET.enabled && logJob) {
        refreshFleetEvents(logJob);
      }
      return draw();

    case 'confirm':
      if (key === 'y' || key === 'Y') {
        confirmAction?.run();
      } else {
        mode = 'board';
        confirmAction = null;
        setFlash(c('gray', 'cancelled'));
      }
      return draw();

    default:
      break;
  }

  // ── board mode ──
  const jobActions = new Map([
    ['p', 'promote'],
    ['s', 'ship'],
    ['x', 'reject'],
    ['u', 'requeue'],
  ]);

  if (key === 'q') return quit();

  if (key === '?' || key === 'h') {
    mode = 'help';
  } else if (key === 'g') {
    // just redraw
  } else if (key === 'j' || key === '\u001b[B') {
    moveSel(1);
  } else if (key === 'k' || key === '\u001b[A') {
    moveSel(-1);
  } else if (key === '\r' || key === '\n' || key === 'l') {
    const picked = items[selIdx];
    if (picked) {
      logJob = picked.job;
      mode = 'log';
      if (FLEET.enabled) refreshFleetEvents(logJob);
    }
  } else if (jobActions.has(key)) {
    doAction(jobActions.get(key));
  } else if (key === 'r') {
    if (FLEET.enabled) {
      setFlash(c('gray', 'run loop n/a in fleet mode'));
    } else {
      startRun();
    }
  } else if (key === 'S') {
    if (FLEET.enabled) {
      setFlash(c('gray', 'stop n/a in fleet mode'));
    } else {
      const res = aq(['stop']);
      setFlash(c('red', '■ ') + (lastLine(res.out) || 'stopped'));
    }
  } else if (/^[1-9]$/.test(key)) {
    // digit keys jump straight to that row when it exists
    const target = parseInt(key, 10) - 1;
    if (target < items.length) {
      selIdx = target;
      selJob = items[target].job;
    }
  } else {
    return; // ignore unknown keys (no redraw)
  }

  draw();
}
|
||||
|
||||
// Interactive sessions read single keypresses: raw mode, utf8 strings, one
// onKey() call per 'data' chunk.
if (INTERACTIVE) {
  const { stdin } = process;
  stdin.setRawMode(true);
  stdin.resume();
  stdin.setEncoding('utf8');
  stdin.on('data', onKey);
}

// Both termination signals go through quit() so the terminal is restored.
for (const signal of ['SIGINT', 'SIGTERM']) {
  process.on(signal, quit);
}
|
||||
@ -1,83 +0,0 @@
|
||||
# Two-Factory Parallel Demo (Phase-2 Exit Criteria, §14)
|
||||
|
||||
This demo closes the final Phase-2 exit-criteria box: **≥2 factories executing jobs in
|
||||
parallel through one coordinator**, proving the concurrency guarantees end-to-end. It is a
|
||||
**harness over the existing runtime** — it does *not* change `agent-queue.sh` or
|
||||
`lib/fleet-client.sh`; it starts two real `agent-queue.sh run` daemons (distinct
|
||||
factoryIds, separate queues/cwds) that compete **only** through the coordinator, then
|
||||
observes and asserts.
|
||||
|
||||
## The three guarantees it proves
|
||||
|
||||
| # | Guarantee | How it's shown |
|
||||
|---|-----------|----------------|
|
||||
| **(a)** | **No double-assign** | Each of the 3 jobs is claimed/executed by exactly **one** factory. The coordinator's atomic claim (lock-guarded; only a `queued` job is claimable) means two concurrent claimers never get the same job version. |
|
||||
| **(b)** | **Fencing + reclaim** | One factory is **killed mid-job**. The reaper returns its in-flight job to `queued` with a **bumped lease epoch**; the surviving factory **reclaims and completes** it. The dead worker's late/zombie report (stale epoch) is **fenced (HTTP 409)** and never ships. |
|
||||
| **(c)** | **Parallelism** | Both factories hold an active job **simultaneously** (observed in coordinator state) — work is concurrent, not serialized. |
|
||||
|
||||
## Run it
|
||||
|
||||
### Stub mode (default, zero dependencies, CI-safe)
|
||||
|
||||
```bash
|
||||
bash demo/two-factory-demo.sh
|
||||
```
|
||||
|
||||
Drives [`coordinator-stub.sh`](coordinator-stub.sh) — a stateful, lock-guarded, file-backed
|
||||
coordinator that implements the same claim / lease / fence / reaper contract as
|
||||
platform-service, via the existing `AQ_FLEET_API_CMD` test seam. No platform-service, no
|
||||
Cosmos, no network. This is exactly what `selftest.sh` runs headlessly.
|
||||
|
||||
### Real-coordinator mode (against a live platform-service)
|
||||
|
||||
```bash
|
||||
DEMO_MODE=real \
|
||||
AQ_FLEET_API=http://localhost:4003/api \
|
||||
AQ_FLEET_TOKEN=<bearer> \
|
||||
AQ_PRODUCT_ID=<product> \
|
||||
bash demo/two-factory-demo.sh
|
||||
```
|
||||
|
||||
In real mode the demo submits via the platform-service fleet API and relies on the
|
||||
coordinator's **own lease reaper** to reclaim the killed factory's job (it waits
|
||||
`DEMO_REAP_WAIT` seconds; pair with a short `AQ_FLEET_LEASE_SECONDS` so the lease expires
|
||||
quickly). The submit endpoint is overridable via `DEMO_SUBMIT_PATH` (default `/fleet/jobs`).
|
||||
Real mode is observational/best-effort — the machine-checked assertions run in stub mode
|
||||
(and in `selftest.sh`).
|
||||
|
||||
## Env knobs
|
||||
|
||||
| Var | Default | Meaning |
|
||||
|-----|---------|---------|
|
||||
| `DEMO_MODE` | `stub` | `stub` or `real` (auto-set to `real` when `AQ_FLEET_API`+`AQ_FLEET_TOKEN` are set and `DEMO_MODE` ≠ `stub`) |
|
||||
| `DEMO_JOB_SLEEP` | `2` | per-job engine seconds — the window during which the victim is killed mid-job |
|
||||
| `DEMO_TIMEOUT` | `60` | max seconds to wait for the survivor to drain all 3 jobs |
|
||||
| `DEMO_POLL` | `0.2` | coordinator-state poll interval |
|
||||
| `DEMO_FACTORY_1` / `DEMO_FACTORY_2` | `mac-1` / `ubuntu-1` | factory ids (F1 is the victim) |
|
||||
| `DEMO_KEEP` | `0` | `1` keeps the temp dir (queues, logs, coordinator state) for inspection |
|
||||
| `DEMO_REAP_WAIT` / `DEMO_DRAIN_WAIT` | `20` / `30` | real-mode waits for the coordinator reaper / drain |
|
||||
|
||||
## What to watch
|
||||
|
||||
The demo prints a step-by-step trace and a final `RESULTS` block. The key lines:
|
||||
|
||||
- `PARALLELISM observed: mac-1 and ubuntu-1 both holding active jobs concurrently` — guarantee (c).
|
||||
- `killed factory mac-1 ... mid-job` then `reaper reclaimed mac-1's lease(s)` — the crash + reclaim.
|
||||
- `zombie report for <job> @epoch=N was FENCED (HTTP 409)` — guarantee (b) fencing.
|
||||
- `RESULTS` shows each job's winning factory; the reclaimed job's winner is the **survivor**.
|
||||
|
||||
With `DEMO_KEEP=1`, inspect under the printed temp dir:
|
||||
|
||||
- `coord/events.log` — the coordinator's audit trail: `CLAIM` / `PATCH:<stage>` / `RECLAIM` / `FENCE` events (factory + epoch on each).
|
||||
- `coord/jobs/<id>.job` — final per-job `stage` / `holder` / `epoch`.
|
||||
- `log-mac-1.txt`, `log-ubuntu-1.txt` — each factory's run-loop log (claims, the `▶ launching`, the fenced/quarantine path on the killed worker).
|
||||
|
||||
## Files
|
||||
|
||||
- `two-factory-demo.sh` — the orchestrator (start factories, kill/reclaim/fence, assert).
|
||||
- `coordinator-stub.sh` — the stateful coordinator stub (claim/patch/fence/renew/release/reap, mkdir-locked).
|
||||
- `start-fleet.example.sh` — reference launcher for a **real** multi-product local
|
||||
fleet against a live platform-service (one `agent-queue.sh run` daemon per
|
||||
product). Parameterized via env; ships the two settings you must get right —
|
||||
`AQ_FLEET_GATE=1` (M0 RU gate) and `AQ_FLEET_LEASE_RENEW_SEC=30` (heartbeat
|
||||
cadence < the 90s stale threshold). Copy + adjust for your sandbox.
|
||||
@ -1,151 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# coordinator-stub.sh — a STATEFUL, concurrency-safe fleet-coordinator stub for the
|
||||
# two-factory demo + its selftest. It is the same "AQ_FLEET_API_CMD responder" pattern
|
||||
# the existing fleet selftests use (invoked as `<METHOD> <PATH> <BODY>`, prints the
|
||||
# response body then a final HTTP-code line), EXTENDED with file-backed shared state +
|
||||
# an mkdir lock so >=2 competing factory processes coordinate through ONE coordinator —
|
||||
# exactly modeling platform-service's claim / lease / fence / reaper contract
|
||||
# (../../learning_ai_common_plat/services/platform-service/src/modules/fleet/coordinator.ts).
|
||||
#
|
||||
# It is curl-free + dependency-free (bash + POSIX awk/sed/grep) so the demo runs in CI
|
||||
# with zero external services. Real-coordinator mode bypasses this entirely (the demo
|
||||
# talks to platform-service over HTTP when AQ_FLEET_API/AQ_FLEET_TOKEN are set).
|
||||
#
|
||||
# Contract implemented (paths under the caller's AQ_FLEET_API base, which includes /api):
|
||||
# POST /fleet/factories/heartbeat -> {"ok":true} 200
|
||||
# POST /fleet/claim -> {"claimed":true,"job":{id,bodyMd,leaseEpoch},"lease":{leaseEpoch}} | {"claimed":false}
|
||||
# PATCH /fleet/jobs/:id -> 200 | 409 (stale leaseEpoch => FENCED)
|
||||
# POST /fleet/jobs/:id/lease/renew -> 200 | 409 (fenced)
|
||||
# POST /fleet/jobs/:id/lease/release -> 200
|
||||
# POST /fleet/_reap -> {"reaped":N} 200 (DEMO-only admin: models the
|
||||
# coordinator reaper reclaiming a dead factory's
|
||||
# leases — returns its in-flight jobs to `queued`
|
||||
# and BUMPS the epoch so the zombie is fenced)
|
||||
#
|
||||
# Atomicity: every state mutation runs inside an mkdir spin-lock, so under true
|
||||
# concurrency EXACTLY ONE claimer wins a given job version (no double-assign), and a
|
||||
# report carrying an epoch older than the stored epoch is rejected (409) — the same
|
||||
# guarantees the real rev/_etag compare-and-swap provides.
|
||||
#
|
||||
# State (under $COORD_STATE, set by the demo):
|
||||
# order submit-ordered job ids (one per line)
|
||||
# jobs/<id>.job key=val lines: stage, holder, epoch, body
|
||||
# events.log append-only audit: "<ts> <EVENT> job=<id> factory=<f> epoch=<n>"
|
||||
# lock/ the mkdir lock dir
|
||||
#
|
||||
# Stages: queued -> assigned -> building -> review|testing -> shipped (terminal);
|
||||
# failed/dead_letter terminal. Reclaimable (active) = assigned|building|review|testing.
|
||||
|
||||
set -uo pipefail

# Invocation contract: <METHOD> <PATH> <BODY>; each defaults to empty when omitted.
METHOD="${1:-}"
RPATH="${2:-}"
BODY="${3:-}"

# COORD_STATE is mandatory; the expansion below aborts with this message when it is unset/empty.
: "${COORD_STATE:?coordinator-stub.sh requires COORD_STATE}"

# Everything the stub persists lives under $COORD_STATE.
JOBS_DIR="${COORD_STATE}/jobs"
EVENTS="${COORD_STATE}/events.log"
LOCK="${COORD_STATE}/lock"
|
||||
|
||||
# ── JSON field extraction (no jq) ───────────────────────────────────────────
|
||||
# Print the string value of JSON key $1 found in $BODY (regex match, no jq);
# prints nothing when the key is absent.
_str_field() {
  local key=$1
  printf '%s' "$BODY" \
    | sed -n "s/.*\"${key}\"[[:space:]]*:[[:space:]]*\"\\([^\"]*\\)\".*/\\1/p" \
    | head -1
}
|
||||
# Print the first integer value (optionally negative) of JSON key $1 in $BODY;
# prints nothing when the key is absent or its value is not a bare integer.
_num_field() {
  local key=$1
  printf '%s' "$BODY" \
    | grep -oE "\"${key}\"[[:space:]]*:[[:space:]]*-?[0-9]+" \
    | grep -oE -- '-?[0-9]+$' \
    | head -1
}
|
||||
# Extract the job id from $RPATH, which is either /fleet/jobs/<id> or
# /fleet/jobs/<id>/lease/<op>: drop the leading prefix, then anything from the
# first "/lease/" onward.
_job_id_from_path() {
  local id=${RPATH#/fleet/jobs/}
  id=${id%%/lease/*}
  printf '%s' "$id"
}
|
||||
|
||||
# ── lock (mkdir is atomic on POSIX filesystems) ─────────────────────────────
|
||||
# Acquire the coordinator lock by creating $LOCK, retrying every 0.02s.
# After more than 5000 failed attempts the loop gives up and returns WITHOUT
# having created the lock dir.
# NOTE(review): callers do not check for that case — presumably a deliberate
# escape from a lock left behind by a killed process; confirm.
_lock() {
  local tries=0
  while ! mkdir "$LOCK" 2>/dev/null; do
    sleep 0.02
    tries=$((tries + 1))
    [ "$tries" -gt 5000 ] && break
  done
}
|
||||
# Release the coordinator lock. A missing lock dir is not an error: the
# function always returns 0.
_unlock() {
  rmdir "$LOCK" 2>/dev/null
  return 0
}
|
||||
|
||||
# Print the state-file path for job id $1: $JOBS_DIR/<id>.job
_jobfile() {
  local id=$1
  printf '%s\n' "${JOBS_DIR}/${id}.job"
}
|
||||
# _get <file> <key>: print the value of the first "key=value" line in <file>
# (everything after the first '='); prints nothing if the file or key is missing.
_get() {
  local file=$1 key=$2
  grep -E "^${key}=" "$file" 2>/dev/null \
    | head -1 \
    | cut -d= -f2-
}
|
||||
# _set <file> <key> <val>: replace every existing "key=..." line with
# "key=val", or append "key=val" when the key is not present yet.
#
# The value is written literally. The previous implementation spliced it into a
# sed replacement that used '#' as its delimiter, so a value containing '#', '&'
# or a backslash (e.g. an unusual factoryId) either broke the sed expression or
# was silently rewritten. awk reads key and value from its environment instead,
# where no escape or pattern processing applies.
_set() {
  local f=$1 k=$2 v=$3 tmp; tmp="$f.tmp.$$"
  if grep -qE "^$k=" "$f" 2>/dev/null; then
    # Rewrite through a temp file, then rename it over the original.
    AQ_SET_KEY="$k" AQ_SET_VAL="$v" awk '
      BEGIN { prefix = ENVIRON["AQ_SET_KEY"] "="; plen = length(prefix) }
      substr($0, 1, plen) == prefix { print prefix ENVIRON["AQ_SET_VAL"]; next }
      { print }
    ' "$f" > "$tmp" && mv "$tmp" "$f"
  else
    printf '%s=%s\n' "$k" "$v" >> "$f"
  fi
}
|
||||
# Append one audit line to $EVENTS: "<epoch-seconds> <all arguments joined by spaces>".
_event() {
  local ts
  ts=$(date +%s)
  printf '%s %s\n' "$ts" "$*" >> "$EVENTS"
}
|
||||
# Succeed (0) when stage $1 is one of the reclaimable in-flight stages
# (assigned, building, review, testing); fail (1) for anything else.
_is_active() {
  local stage=$1
  [ "$stage" = assigned ] || [ "$stage" = building ] \
    || [ "$stage" = review ] || [ "$stage" = testing ]
}
|
||||
|
||||
# _emit <json-body> <http-code>: write the response body, then the HTTP status
# code on a line of its own.
_emit() {
  printf '%s\n' "$1" "$2"
}
|
||||
|
||||
# Request dispatch: route "<METHOD> <PATH>" to its handler. Each handler prints
# the response body followed by the HTTP code via _emit. Handlers that touch job
# state do so between _lock and _unlock.
case "$METHOD $RPATH" in
  "POST /fleet/factories/heartbeat")
    _emit '{"ok":true}' 200 ;;

  "POST /fleet/claim")
    # Hand the first `queued` job (in submit order) to the calling factory and
    # bump its epoch; reply {"claimed":false} when nothing is queued.
    factory=$(_str_field factoryId)
    _lock
    claimed_id=""
    if [ -f "$COORD_STATE/order" ]; then
      while IFS= read -r jid; do
        [ -n "$jid" ] || continue
        jf=$(_jobfile "$jid")
        [ -f "$jf" ] || continue
        if [ "$(_get "$jf" stage)" = "queued" ]; then claimed_id="$jid"; break; fi
      done < "$COORD_STATE/order"
    fi
    if [ -n "$claimed_id" ]; then
      jf=$(_jobfile "$claimed_id")
      epoch=$(( $(_get "$jf" epoch) + 1 ))
      _set "$jf" stage assigned; _set "$jf" holder "$factory"; _set "$jf" epoch "$epoch"
      body=$(_get "$jf" body)
      _event "CLAIM job=$claimed_id factory=$factory epoch=$epoch"
      _unlock
      # NOTE(review): $body is embedded without JSON escaping — presumably job
      # bodies never contain '"' or '\'; confirm against the submitter.
      _emit "{\"claimed\":true,\"job\":{\"id\":\"$claimed_id\",\"bodyMd\":\"$body\",\"leaseEpoch\":$epoch},\"lease\":{\"leaseEpoch\":$epoch}}" 200
    else
      _unlock
      _emit '{"claimed":false}' 200
    fi ;;

  PATCH\ /fleet/jobs/*)
    # Stage report from a worker. A report carrying a leaseEpoch lower than the
    # stored one is rejected with 409; an unknown job id yields 404.
    jid=$(_job_id_from_path); stage=$(_str_field stage); rep_epoch=$(_num_field leaseEpoch)
    jf=$(_jobfile "$jid")
    _lock
    if [ ! -f "$jf" ]; then _unlock; _emit '{}' 404
    else
      cur_epoch=$(_get "$jf" epoch)
      # Compare only when both epochs are present (same guard as lease/renew):
      # `[ N -lt "" ]` is a test error, not a comparison.
      if [ -n "$rep_epoch" ] && [ -n "$cur_epoch" ] && [ "$rep_epoch" -lt "$cur_epoch" ]; then
        # The factory logged here is the job's CURRENT holder as stored in the job file.
        _event "FENCE job=$jid factory=$(_get "$jf" holder) epoch=$rep_epoch<stored=$cur_epoch"
        _unlock; _emit '{}' 409 # stale leaseEpoch -> fenced (zombie rejected)
      else
        [ -n "$stage" ] && _set "$jf" stage "$stage"
        _event "PATCH:$stage job=$jid factory=$(_get "$jf" holder) epoch=$rep_epoch"
        _unlock; _emit '{}' 200
      fi
    fi ;;

  POST\ /fleet/jobs/*/lease/renew)
    # Lease renewal: 409 when the caller's epoch is older than the stored one,
    # otherwise 200. No state is modified.
    jid=$(_job_id_from_path); rep_epoch=$(_num_field leaseEpoch); jf=$(_jobfile "$jid")
    _lock
    cur_epoch=$(_get "$jf" epoch 2>/dev/null)
    if [ -n "$rep_epoch" ] && [ -n "$cur_epoch" ] && [ "$rep_epoch" -lt "$cur_epoch" ]; then
      _event "RENEW_FENCE job=$jid epoch=$rep_epoch<stored=$cur_epoch"; _unlock; _emit '{}' 409
    else
      _unlock; _emit '{}' 200
    fi ;;

  POST\ /fleet/jobs/*/lease/release)
    # Release is only recorded in the audit log; job state is left as-is.
    jid=$(_job_id_from_path); _event "RELEASE job=$jid"; _emit '{}' 200 ;;

  "POST /fleet/_reap")
    # DEMO admin: model the coordinator reaper reclaiming a dead factory's leases.
    # Every active job held by the given factory goes back to `queued` with a
    # cleared holder and a bumped epoch.
    factory=$(_str_field factoryId)
    _lock
    n=0
    for jf in "$JOBS_DIR"/*.job; do
      [ -f "$jf" ] || continue
      if [ "$(_get "$jf" holder)" = "$factory" ] && _is_active "$(_get "$jf" stage)"; then
        jid=$(basename "$jf" .job)
        epoch=$(( $(_get "$jf" epoch) + 1 )) # bump => the dead worker's old epoch is now stale (fenced)
        _set "$jf" stage queued; _set "$jf" holder ""; _set "$jf" epoch "$epoch"
        _event "RECLAIM job=$jid factory=$factory epoch=$epoch"
        n=$((n+1))
      fi
    done
    _unlock
    _emit "{\"reaped\":$n}" 200 ;;

  # Any other route is acknowledged with an empty 200.
  *) _emit '{}' 200 ;;
esac
|
||||
@ -1,88 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# start-fleet.example.sh — reference launcher for a multi-product local fleet.
|
||||
#
|
||||
# Starts one detached `agent-queue.sh run` daemon (a "factory") per product, each
|
||||
# routing work through the platform-service fleet coordinator. This is the tracked,
|
||||
# parameterized version of the operational `_start_fleet.sh` people keep in their
|
||||
# local sandbox — copy it, adjust the env vars, and run.
|
||||
#
|
||||
# Prereqs:
|
||||
# - platform-service running on $AQ_FLEET_API (see scripts/deploy-gigafactory.sh)
|
||||
# - a factory token in $FLEET_TOKEN_FILE (an admin/factory JWT for the fleet API)
|
||||
# - tmux + the `longrun` helper (sourced below) for detached, logged daemons
|
||||
#
|
||||
# Env overrides (all optional):
|
||||
# SB sandbox/state root (per-product queues live in $SB/q_<product>)
|
||||
# AQ path to agent-queue.sh
|
||||
# AQ_FLEET_API coordinator base URL (default http://localhost:4003/api)
|
||||
# FLEET_TOKEN_FILE file holding the bearer token (default $SB/.token)
|
||||
# PRODUCTS space-separated product ids (default: the ecosystem set)
|
||||
# AGENT_QUEUE_MAX per-factory concurrency (default 3)
|
||||
#
|
||||
# Docs: ../docs/GIGAFACTORY/GIGAFACTORY_SYSTEM_OVERVIEW.md (§9 API, §14 gotchas) and
|
||||
# ../docs/GIGAFACTORY/FLEET_DISPATCH_REDESIGN.md (the M0 RU gate).
|
||||
set -uo pipefail

# ── configuration (every value can be overridden from the environment) ──────
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SB="${SB:-$PWD/.fleet-sandbox}"
AQ="${AQ:-$HERE/../agent-queue.sh}"
AQ_FLEET_API="${AQ_FLEET_API:-http://localhost:4003/api}"
FLEET_TOKEN_FILE="${FLEET_TOKEN_FILE:-$SB/.token}"
PRODUCTS="${PRODUCTS:-lysnrai chronomind mindlyst nomgap}"
AGENT_QUEUE_MAX="${AGENT_QUEUE_MAX:-3}"
# File that provides the `longrun` helper used to launch each daemon.
LONGRUN_ALIAS="${LONGRUN_ALIAS:-$HERE/../../aliases/_longrun.alias}"
# PR mode: where the product repos are checked out, so a job's `repo` resolves to
# a local checkout and the factory opens a PR off a git worktree. Set FLEET_PR=0
# to run plain (no PR) jobs in $SB instead.
FLEET_PR="${FLEET_PR:-1}"
REPO_BASE="${REPO_BASE:-$(cd "$HERE/../../.." && pwd)}"

# ── prerequisite checks ─────────────────────────────────────────────────────
# There is no `set -e`, so without these checks a missing prerequisite would only
# surface as one failure per product inside the loop below.
[ -f "$AQ" ] || { echo "agent-queue.sh not found at $AQ (set AQ=)"; exit 1; }
[ -s "$FLEET_TOKEN_FILE" ] || { echo "fleet token not found at $FLEET_TOKEN_FILE (set FLEET_TOKEN_FILE=)"; exit 1; }
[ -f "$LONGRUN_ALIAS" ] || { echo "longrun helper not found at $LONGRUN_ALIAS (set LONGRUN_ALIAS=)"; exit 1; }
TOK="$(cat "$FLEET_TOKEN_FILE")"

mkdir -p "$SB"
export LONGRUN_LOG_DIR="$SB"
# shellcheck disable=SC1090
source "$LONGRUN_ALIAS"

# ── one factory per product ─────────────────────────────────────────────────
# $PRODUCTS is intentionally unquoted: it is a space-separated list.
# NOTE(review): the bearer token is passed as an `env` argument, so it may be
# visible in the process list while the daemon starts — confirm this is acceptable.
for p in $PRODUCTS; do
  ROOT="$SB/q_$p"
  longrun "gigafactory-$p" env \
    AGENT_QUEUE_ROOT="$ROOT" \
    AGENT_QUEUE_ENGINE=devin \
    AGENT_QUEUE_MAX="$AGENT_QUEUE_MAX" \
    AQ_FLEET=1 AQ_FLEET_ROUTE=1 \
    AQ_FLEET_API="$AQ_FLEET_API" \
    AQ_FLEET_TOKEN="$TOK" \
    AQ_PRODUCT_ID="$p" \
    AQ_FACTORY_ID="mac-$p" \
    AQ_FLEET_GATE=1 \
    AQ_FLEET_LEASE_RENEW_SEC=30 \
    AQ_FLEET_PR="$FLEET_PR" \
    AQ_FLEET_REPO_BASE="$REPO_BASE" \
    "$AQ" run
  echo "----"
done
|
||||
|
||||
# Why these settings matter (the first two are the must-get-right pair, both verified on a live fleet):
|
||||
# AQ_FLEET_GATE=1 §M0 RU gate — the run loop point-reads the cheap
|
||||
# per-product queue version (GET /fleet/queue-state) and
|
||||
# SKIPS the claim while nothing changed, slashing idle
|
||||
# Cosmos RU. Default OFF; safe (fails open). See
|
||||
# FLEET_DISPATCH_REDESIGN.md §8/§12.
|
||||
# AQ_FLEET_LEASE_RENEW_SEC=30 heartbeat/renew cadence. MUST stay well under the
|
||||
# coordinator's 90s stale threshold, or a healthy
|
||||
# factory flaps to "stale"/"no live factory" between
|
||||
# beats (the 300s default caused exactly that).
|
||||
# AQ_FLEET_PR=1 + AQ_FLEET_REPO_BASE WITHOUT these a job's `repo` is ignored and
|
||||
# Devin just runs the prompt in the sandbox cwd (no PR).
|
||||
# With them, the factory checks out a worktree of
|
||||
# $REPO_BASE/<repo>, commits, pushes, and opens a PR.
|
||||
#
|
||||
# Subset restart (leave a busy factory running):
|
||||
# PRODUCTS="lysnrai mindlyst" bash start-fleet.example.sh
|
||||
#
|
||||
# Stop a factory: tmux kill-session -t gigafactory-<product>
|
||||
# Tail a factory: tail -f "$SB"/longrun-gigafactory-<product>-*.log
|
||||
@ -1,248 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# two-factory-demo.sh — Phase-2 EXIT-CRITERIA demo (§14): >=2 factories executing jobs
|
||||
# in PARALLEL through ONE coordinator, proving the Phase-2 guarantees end-to-end:
|
||||
#
|
||||
# (a) NO DOUBLE-ASSIGN — each job is claimed/executed by exactly ONE factory.
|
||||
# (b) FENCING + RECLAIM — kill a factory MID-JOB; the reaper returns its job; the OTHER
|
||||
# factory reclaims + completes it; the dead worker's late/zombie
|
||||
# report is FENCED (409, never shipped).
|
||||
# (c) PARALLELISM — both factories make progress concurrently (not serialized).
|
||||
#
|
||||
# This is a DEMO HARNESS over the EXISTING runtime — it does NOT change agent-queue.sh or
|
||||
# lib/fleet-client.sh; it starts two real `agent-queue.sh run` daemons (distinct factoryIds,
|
||||
# separate queues/cwds) that compete ONLY through the coordinator, then observes + asserts.
|
||||
#
|
||||
# DUAL MODE:
|
||||
# STUB (default / CI-safe): drives demo/coordinator-stub.sh — a stateful, lock-guarded
|
||||
# file-backed coordinator. Zero external services. Used by selftest.sh.
|
||||
# REAL : set AQ_FLEET_API + AQ_FLEET_TOKEN (and DEMO_MODE=real) to run against a live
|
||||
# platform-service fleet coordinator. Submit + reaper-reclaim use its HTTP API.
|
||||
#
|
||||
# Usage:
|
||||
# bash demo/two-factory-demo.sh # stub mode (default)
|
||||
# DEMO_MODE=real AQ_FLEET_API=http://host:4003/api AQ_FLEET_TOKEN=... \
|
||||
# AQ_PRODUCT_ID=notelett bash demo/two-factory-demo.sh
|
||||
#
|
||||
# Env knobs: DEMO_JOB_SLEEP (per-job engine seconds, default 2), DEMO_TIMEOUT (drain
|
||||
# seconds, default 60), DEMO_POLL (poll seconds, default 0.2), DEMO_KEEP=1 (keep temp).
|
||||
#
|
||||
# Exit 0 = all three guarantees PASS; non-zero = FAIL. bash 3.2+ (no assoc arrays);
|
||||
# awk/sed/grep/pgrep only; mac+linux safe.
|
||||
|
||||
set -uo pipefail
|
||||
|
||||
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
AQ="$HERE/../agent-queue.sh"
|
||||
STUB="$HERE/coordinator-stub.sh"
|
||||
|
||||
DEMO_MODE="${DEMO_MODE:-stub}"
|
||||
# Any non-stub mode talks to a live coordinator, so normalise it to "real" and fail fast with a
# clear message when the credentials are missing — otherwise the script dies much later on an
# unbound ${AQ_FLEET_TOKEN} (set -u) or silently posts to an empty URL.
if [ "$DEMO_MODE" != stub ]; then
if [ -n "${AQ_FLEET_API:-}" ] && [ -n "${AQ_FLEET_TOKEN:-}" ]; then DEMO_MODE=real
else printf 'two-factory-demo: DEMO_MODE=%s requires AQ_FLEET_API and AQ_FLEET_TOKEN\n' "$DEMO_MODE" >&2; exit 2; fi
fi
|
||||
DEMO_JOB_SLEEP="${DEMO_JOB_SLEEP:-2}"
|
||||
DEMO_TIMEOUT="${DEMO_TIMEOUT:-60}"
|
||||
DEMO_POLL="${DEMO_POLL:-0.2}"
|
||||
F1="${DEMO_FACTORY_1:-mac-1}" # victim (killed mid-job)
|
||||
F2="${DEMO_FACTORY_2:-ubuntu-1}" # survivor (reclaims)
|
||||
|
||||
c_b=$'\033[1m'; c_g=$'\033[32m'; c_r=$'\033[31m'; c_c=$'\033[36m'; c_0=$'\033[0m'
|
||||
log() { printf '%s[demo]%s %s\n' "$c_c" "$c_0" "$*"; }
|
||||
ok() { printf ' %s+%s %s\n' "$c_g" "$c_0" "$*"; }
|
||||
bad() { printf ' %s- %s%s\n' "$c_r" "$*" "$c_0" >&2; }
|
||||
|
||||
TMP="$(mktemp -d "${TMPDIR:-/tmp}/aq-2factory.XXXXXX")"
|
||||
COORD_STATE="$TMP/coord"; export COORD_STATE
|
||||
DAEMON_PIDS=()
|
||||
|
||||
# kill a process AND its descendants (mac+linux; pgrep -P is portable)
|
||||
kill_tree() {
|
||||
local p=$1 c
|
||||
for c in $(pgrep -P "$p" 2>/dev/null); do kill_tree "$c"; done
|
||||
kill -9 "$p" 2>/dev/null || true
|
||||
}
|
||||
cleanup() {
|
||||
local p
|
||||
if [ "${#DAEMON_PIDS[@]}" -gt 0 ]; then
|
||||
for p in "${DAEMON_PIDS[@]}"; do [ -n "$p" ] && kill_tree "$p"; done
|
||||
fi
|
||||
[ "${DEMO_KEEP:-0}" = "1" ] || rm -rf "$TMP"
|
||||
}
|
||||
# Run cleanup exactly once, from the EXIT trap. INT/TERM must *exit* (which then fires EXIT):
# a handler that merely returns lets bash resume the script after the signal, i.e. keep
# polling/asserting against a $TMP that cleanup has already deleted.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
|
||||
|
||||
# In stub mode every coordinator HTTP call is routed to the stateful stub via the
|
||||
# existing AQ_FLEET_API_CMD seam; in real mode it is unset so curl talks to the service.
|
||||
if [ "$DEMO_MODE" = stub ]; then export AQ_FLEET_API_CMD="$STUB"; else unset AQ_FLEET_API_CMD 2>/dev/null || true; fi
|
||||
|
||||
# ── coordinator primitives (mode-branched) ─────────────────────────────────
|
||||
coord_init() {
|
||||
if [ "$DEMO_MODE" = stub ]; then mkdir -p "$COORD_STATE/jobs"; : > "$COORD_STATE/order"; : > "$COORD_STATE/events.log"; fi
|
||||
}
|
||||
coord_submit() { # <jobid> <bodyMd>
|
||||
if [ "$DEMO_MODE" = stub ]; then
|
||||
printf '%s\n' "stage=queued" "holder=" "epoch=0" "body=$2" > "$COORD_STATE/jobs/$1.job"
|
||||
printf '%s\n' "$1" >> "$COORD_STATE/order"
|
||||
else
|
||||
curl -sS -m 30 -X POST -H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer ${AQ_FLEET_TOKEN}" ${AQ_PRODUCT_ID:+-H "X-Product-Id: $AQ_PRODUCT_ID"} \
|
||||
--data "{\"idempotencyKey\":\"$1\",\"bodyMd\":\"$2\",\"priority\":\"medium\"}" \
|
||||
"${AQ_FLEET_API}${DEMO_SUBMIT_PATH:-/fleet/jobs}" >/dev/null 2>&1 || true
|
||||
fi
|
||||
}
|
||||
coord_reap() { # <factoryId> : model the reaper reclaiming a dead factory's leases
|
||||
if [ "$DEMO_MODE" = stub ]; then
|
||||
"$STUB" POST /fleet/_reap "{\"factoryId\":\"$1\"}" >/dev/null 2>&1 || true
|
||||
else
|
||||
log "real mode: waiting ${DEMO_REAP_WAIT:-20}s for the coordinator reaper to reclaim $1's lease"
|
||||
sleep "${DEMO_REAP_WAIT:-20}"
|
||||
fi
|
||||
}
|
||||
coord_zombie_report() { # <jobid> <staleEpoch> -> echoes the HTTP code (expect 409)
|
||||
if [ "$DEMO_MODE" = stub ]; then
|
||||
"$STUB" PATCH "/fleet/jobs/$1" "{\"stage\":\"building\",\"leaseEpoch\":$2}" | tail -n1
|
||||
else
|
||||
curl -sS -m 30 -o /dev/null -w '%{http_code}' -X PATCH -H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer ${AQ_FLEET_TOKEN}" ${AQ_PRODUCT_ID:+-H "X-Product-Id: $AQ_PRODUCT_ID"} \
|
||||
--data "{\"stage\":\"building\",\"leaseEpoch\":$2}" "${AQ_FLEET_API}/fleet/jobs/$1"
|
||||
fi
|
||||
}
|
||||
# stub-only state readers (assertions in stub mode read authoritative coordinator state)
|
||||
jget() { grep -E "^$2=" "$COORD_STATE/jobs/$1.job" 2>/dev/null | head -1 | cut -d= -f2-; }
|
||||
# emit (one per line) the factoryId of every factory currently holding an ACTIVE job
|
||||
active_holders() {
|
||||
local jf st ho
|
||||
for jf in "$COORD_STATE"/jobs/*.job; do
|
||||
[ -f "$jf" ] || continue
|
||||
st=$(grep -E '^stage=' "$jf" | cut -d= -f2-); ho=$(grep -E '^holder=' "$jf" | cut -d= -f2-)
|
||||
case "$st" in assigned|building|review|testing) [ -n "$ho" ] && printf '%s\n' "$ho";; esac
|
||||
done
|
||||
}
|
||||
|
||||
# ── engine + factory launch ─────────────────────────────────────────────────
|
||||
engine="$TMP/engine.sh"
|
||||
printf '#!/usr/bin/env bash\n# demo engine: sleep then succeed (gives a window to kill mid-job)\nsleep %s\nexit 0\n' "$DEMO_JOB_SLEEP" > "$engine"
|
||||
chmod +x "$engine"
|
||||
|
||||
start_factory() { # <factoryId>
|
||||
local fid=$1 root="$TMP/q-$1" work="$TMP/w-$1"
|
||||
mkdir -p "$work"
|
||||
AGENT_QUEUE_ROOT="$root" "$AQ" init >/dev/null 2>&1
|
||||
# Each factory: own queue + cwd, AQ_FLEET=1 ROUTE=1 (coordinator authoritative),
|
||||
# MAX=1 so it holds one job at a time, fast poll. Competes ONLY via the coordinator
|
||||
# (AQ_FLEET_API_CMD / AQ_FLEET_API inherited from the environment above).
|
||||
AGENT_QUEUE_ROOT="$root" AGENT_QUEUE_MAX=1 AGENT_QUEUE_POLL=1 \
|
||||
AQ_FLEET=1 AQ_FLEET_ROUTE=1 AQ_FACTORY_ID="$fid" AQ_FLEET_CWD="$work" \
|
||||
AQ_FLEET_API="${AQ_FLEET_API:-http://stub.local/api}" \
|
||||
DEVIN_BIN="$engine" "$AQ" run >"$TMP/log-$1.txt" 2>&1 &
|
||||
DAEMON_PIDS+=("$!")
|
||||
disown 2>/dev/null || true # detach from job control so SIGKILL later prints no "Killed" notice
|
||||
log "started factory $c_b$fid$c_0 (pid $!, queue q-$1)"
|
||||
}
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
log "Phase-2 two-factory parallel demo — mode=$c_b$DEMO_MODE$c_0 (job-sleep=${DEMO_JOB_SLEEP}s)"
|
||||
coord_init
|
||||
|
||||
# 1) submit 3 jobs
|
||||
for n in 1 2 3; do coord_submit "demo-job-$n" "two-factory demo job $n"; done
|
||||
log "submitted 3 jobs to the coordinator"
|
||||
|
||||
# 2) start two factories competing through the coordinator
|
||||
start_factory "$F1"
|
||||
start_factory "$F2"
|
||||
|
||||
# 3) PARALLELISM: wait until BOTH factories simultaneously hold an active job, and the
|
||||
# victim (F1) holds one we can kill mid-job.
|
||||
PARALLELISM_OK=0; VICTIM_JOB=""; VICTIM_EPOCH=""
|
||||
if [ "$DEMO_MODE" = stub ]; then
|
||||
deadline=$(( $(date +%s) + 30 ))
|
||||
while [ "$(date +%s)" -lt "$deadline" ]; do
|
||||
holders=$(active_holders | sort -u | tr '\n' ' ')
|
||||
if printf '%s' "$holders" | grep -qw "$F1" && printf '%s' "$holders" | grep -qw "$F2"; then
|
||||
PARALLELISM_OK=1
|
||||
for jf in "$COORD_STATE"/jobs/*.job; do
|
||||
[ -f "$jf" ] || continue
|
||||
if [ "$(grep -E '^holder=' "$jf" | cut -d= -f2-)" = "$F1" ]; then
|
||||
case "$(grep -E '^stage=' "$jf" | cut -d= -f2-)" in
|
||||
assigned|building|review|testing)
|
||||
VICTIM_JOB=$(basename "$jf" .job); VICTIM_EPOCH=$(jget "$VICTIM_JOB" epoch); break;;
|
||||
esac
|
||||
fi
|
||||
done
|
||||
[ -n "$VICTIM_JOB" ] && break
|
||||
fi
|
||||
sleep "$DEMO_POLL"
|
||||
done
|
||||
else
|
||||
sleep "${DEMO_SETTLE:-5}"; PARALLELISM_OK=1; VICTIM_JOB="${DEMO_VICTIM_JOB:-demo-job-1}"; VICTIM_EPOCH="${DEMO_VICTIM_EPOCH:-1}"
|
||||
fi
|
||||
if [ "$PARALLELISM_OK" = 1 ]; then log "PARALLELISM observed: $F1 and $F2 both holding active jobs concurrently"; else log "WARN: did not observe simultaneous holders"; fi
|
||||
log "victim=$c_b$F1$c_0 holds job $c_b${VICTIM_JOB:-?}$c_0 (epoch ${VICTIM_EPOCH:-?}) — killing it mid-job"
|
||||
|
||||
# 4) KILL the victim factory mid-job (hard crash, no graceful drain)
|
||||
victim_pid="${DAEMON_PIDS[0]}"
|
||||
kill_tree "$victim_pid"
|
||||
DAEMON_PIDS[0]=""
|
||||
log "killed factory $F1 (pid $victim_pid)"
|
||||
|
||||
# 5) RECLAIM: the reaper returns the victim's in-flight job to the queue (epoch bumped)
|
||||
coord_reap "$F1"
|
||||
log "reaper reclaimed $F1's lease(s)"
|
||||
|
||||
# 6) FENCE the zombie: the dead worker's LATE report (stale epoch) must be rejected (409)
|
||||
FENCE_OK=0
|
||||
if [ -n "$VICTIM_JOB" ] && [ -n "$VICTIM_EPOCH" ]; then
|
||||
zcode=$(coord_zombie_report "$VICTIM_JOB" "$VICTIM_EPOCH")
|
||||
if [ "$zcode" = 409 ]; then FENCE_OK=1; ok "zombie report for $VICTIM_JOB @epoch=$VICTIM_EPOCH was FENCED (HTTP 409)"; else bad "zombie report not fenced (HTTP $zcode)"; fi
|
||||
fi
|
||||
|
||||
# 7) DRAIN: the survivor (F2) finishes everything, including the reclaimed job
|
||||
log "draining remaining work on the survivor ($F2)..."
|
||||
DONE=0
|
||||
if [ "$DEMO_MODE" = stub ]; then
|
||||
deadline=$(( $(date +%s) + DEMO_TIMEOUT ))
|
||||
while [ "$(date +%s)" -lt "$deadline" ]; do
|
||||
d=0
|
||||
for jf in "$COORD_STATE"/jobs/*.job; do
|
||||
case "$(grep -E '^stage=' "$jf" | cut -d= -f2-)" in review|testing|shipped) d=$((d+1));; esac
|
||||
done
|
||||
[ "$d" -ge 3 ] && { DONE=1; break; }
|
||||
sleep "$DEMO_POLL"
|
||||
done
|
||||
else
|
||||
sleep "${DEMO_DRAIN_WAIT:-30}"; DONE=1
|
||||
fi
|
||||
|
||||
# ── ASSERT the three guarantees (stub mode reads authoritative coordinator state) ──
|
||||
echo
|
||||
log "${c_b}RESULTS${c_0}"
|
||||
PASS=1
|
||||
if [ "$DEMO_MODE" = stub ]; then
|
||||
reviewed=0
|
||||
for jf in "$COORD_STATE"/jobs/*.job; do
|
||||
jid=$(basename "$jf" .job); st=$(jget "$jid" stage); ho=$(jget "$jid" holder)
|
||||
case "$st" in
|
||||
review|testing|shipped) reviewed=$((reviewed+1)); printf ' job %-12s -> %s (stage=%s)\n' "$jid" "$ho" "$st";;
|
||||
*) printf ' job %-12s -> INCOMPLETE (stage=%s)\n' "$jid" "$st";;
|
||||
esac
|
||||
done
|
||||
# grep -c prints "0" AND exits 1 when nothing matches, so `$(grep -c … || echo 0)` would
# capture "0<newline>0". Apply the fallback outside the substitution instead (also covers a
# missing log, where grep prints nothing and exits 2).
claims=$(grep -c ' CLAIM ' "$COORD_STATE/events.log" 2>/dev/null) || claims=0
|
||||
distinct_claimers=$(grep ' CLAIM ' "$COORD_STATE/events.log" 2>/dev/null | sed -n 's/.*factory=\([^ ]*\).*/\1/p' | sort -u | tr '\n' ' ')
|
||||
# grep -c prints "0" AND exits 1 when nothing matches, so `$(grep -c … || echo 0)` would
# capture "0<newline>0" and make the later `[ "$reclaims" -ge 1 ]` fail with "integer
# expression expected". Apply the fallback outside the substitution instead.
reclaims=$(grep -c ' RECLAIM ' "$COORD_STATE/events.log" 2>/dev/null) || reclaims=0
|
||||
# grep -c prints "0" AND exits 1 when nothing matches, so `$(grep -c … || echo 0)` would
# capture "0<newline>0" and make the later `[ "$fences" -ge 1 ]` fail with "integer
# expression expected". Apply the fallback outside the substitution instead.
fences=$(grep -c ' FENCE ' "$COORD_STATE/events.log" 2>/dev/null) || fences=0
|
||||
victim_winner=$(jget "${VICTIM_JOB:-_none_}" holder)
|
||||
|
||||
if [ "$reviewed" -eq 3 ]; then ok "(a) no double-assign: all 3 jobs executed to terminal, one winner each"; else bad "(a) only $reviewed/3 jobs reached terminal"; PASS=0; fi
|
||||
if [ -n "$VICTIM_JOB" ] && [ "$victim_winner" = "$F2" ]; then ok " reclaimed job $VICTIM_JOB completed by survivor $F2 (not the killed $F1)"; elif [ -n "$VICTIM_JOB" ]; then bad " reclaimed job $VICTIM_JOB winner='$victim_winner' (expected $F2)"; PASS=0; fi
|
||||
if [ "$reclaims" -ge 1 ]; then ok "(b) reclaim: $reclaims RECLAIM event(s) (reaper returned the dead factory's job)"; else bad "(b) no RECLAIM event"; PASS=0; fi
|
||||
if [ "$FENCE_OK" = 1 ] && [ "$fences" -ge 1 ]; then ok "(b) fencing: zombie report rejected (409); $fences FENCE event(s)"; else bad "(b) zombie was not fenced (fence_ok=$FENCE_OK events=$fences)"; PASS=0; fi
|
||||
if [ "$PARALLELISM_OK" = 1 ] && printf '%s' "$distinct_claimers" | grep -qw "$F1" && printf '%s' "$distinct_claimers" | grep -qw "$F2"; then ok "(c) parallelism: both factories claimed concurrently (claimers: ${distinct_claimers}; $claims claims)"; else bad "(c) parallelism not observed (claimers: ${distinct_claimers})"; PASS=0; fi
|
||||
else
|
||||
if [ "$DONE" = 1 ]; then ok "real mode: drain window elapsed — inspect the coordinator + factory logs in $TMP"; fi
|
||||
ok "real mode is best-effort/observational; the asserted guarantees are validated in stub mode (and selftest)."
|
||||
fi
|
||||
|
||||
echo
|
||||
if [ "$PASS" = 1 ]; then
|
||||
printf '%s[demo] PASS%s — Phase-2 exit guarantees demonstrated (no double-assign + reclaim/fence + parallelism)\n' "$c_g" "$c_0"; exit 0
|
||||
else
|
||||
printf '%s[demo] FAIL%s\n' "$c_r" "$c_0"; exit 1
|
||||
fi
|
||||
@ -1,515 +0,0 @@
|
||||
# Fleet Dispatch Redesign — Broker-Backed, On-Demand Factories
|
||||
|
||||
> Design proposal (written before any code; **M0 has since shipped** — see the v5 review-log entry). Companion to `GIGAFACTORY_SYSTEM_OVERVIEW.md`
|
||||
> (what exists today) and `GIGAFACTORY_ROADMAP.md` (source-of-truth spec). This
|
||||
> doc realizes roadmap **Phase 4** ("Message bus + autoscaling") and the
|
||||
> routing-model cleanup that comes with it. Last reviewed: **2026-05-31**.
|
||||
>
|
||||
> **Review log**
|
||||
> - v1 (2026-05-31): initial proposal.
|
||||
> - v2 (2026-05-31): self-review pass — reconciled the routing model
|
||||
> (coordinator-targeted as primary), fixed the Cosmos outbox transactionality
|
||||
> claim (change feed *is* the log), constrained message size (jobId + routing
|
||||
> props only), addressed long-job vs Service Bus 5-min lock, corrected the
|
||||
> idempotency key (`MessageId = jobId`), renamed migration steps `M0–M3` to
|
||||
> avoid collision with roadmap phases, fixed the Phase-0 RU figure, and added a
|
||||
> ticked roadmap checklist + auth/observability notes.
|
||||
> - v3 (2026-05-31): added **§5.5 Error handling & cleanup** (current behavior +
|
||||
> lease-release-on-failure, branch/worktree GC, same-repo worktree clobber).
|
||||
> Review fixes: unified the field name to `targetFactoryId` (§5.1), reconciled
|
||||
> §5.3 with the complete-on-claim model (broker is not the redelivery path),
|
||||
> aligned §6 token scoping with per-factory subscriptions, and added the GC /
|
||||
> `POST /fleet/fail` checklist block to §12.
|
||||
> - v4 (2026-05-31): **coverage audit** — roadmap now maps 1:1 to the design via a
|
||||
> coverage matrix. Closed plan gaps: **M-prep** (decisions/§10 + schema +
|
||||
> containers + RBAC), correlation filter + dispatcher budget enforcement (M1),
|
||||
> small-messages/body-from-Cosmos + token re-check + alerting (M2), and new
|
||||
> **Testing** and **Rollback & flags** blocks. No design element is now without
|
||||
> an implementation step.
|
||||
> - v5 (2026-05-31): **M0 implemented + shipped** (`fleet_queue_state` + bump
|
||||
> hooks + `GET /fleet/queue-state` in common_plat; `AQ_FLEET_GATE` gate-skip in
|
||||
> agent-queue). Reconciled M0 to the as-built approach (gate the *claim*; keep
|
||||
> `POLL_SECONDS` for local responsiveness rather than raising it globally) and
|
||||
> ticked the M0 checklist. Backend vitest + gate logic verified.
|
||||
|
||||
---
|
||||
|
||||
## 1. Why this doc exists (the two smells)
|
||||
|
||||
Two structural problems surfaced while running the local fleet against
|
||||
`tracker-web` + `platform-service`:
|
||||
|
||||
### 1.1 Product-as-queue is conflated with repo-as-work-target
|
||||
|
||||
- `fleet_jobs` is partitioned by **`/productId`**, and a factory is bound to a
|
||||
single product via `AQ_PRODUCT_ID`. The job's **`repo`** is just a payload
|
||||
field (the PR target). Routing uses `productId`; the repo is orthogonal.
|
||||
- Consequence observed: a `learning_ai_notes` job submitted via the form was
|
||||
filed under **`chronomind`** (because the form's Factory dropdown maps
|
||||
`mac-2 → chronomind`), and would have opened a PR to the notes repo from a
|
||||
"chronomind" factory. Nothing ties the product to the repo, and nothing
|
||||
guarantees the chosen factory even has that repo checked out.
|
||||
- The form (`dashboards/tracker-web/.../fleet/jobs/page.tsx`) hardcodes
|
||||
`FLEET_FACTORIES = [mac-1→lysnrai, mac-2→chronomind]` and defaults
|
||||
`capabilities = "build"` — a capability **no agent-queue factory ever
|
||||
advertises** (`detect_capabilities` only emits `os:*`, `engine:*`, `node:*`,
|
||||
`has:*`). So default UI submissions are unroutable to live factories.
|
||||
|
||||
### 1.2 Pull-poll daemons burn Cosmos RU to stay "ready"
|
||||
|
||||
- The run loop iterates every **`POLL_SECONDS=3`**; with `AQ_FLEET_ROUTE=1`
|
||||
(default) each iteration calls `POST /fleet/claim`.
|
||||
- `claimNextJob` runs `repo.listJobs({ productId })` — **reads every job doc in
|
||||
the product partition, no stage filter, no limit** — on every claim, plus a
|
||||
`getLease` point-read per active job when preemption is on.
|
||||
- One process **per product** (`_start_fleet.sh` spawns 4) ⇒ `4 × (86,400 s ÷ 3 s)` ≈
|
||||
**115k claim queries/day at idle**, each scaling with partition size, billed
|
||||
continuously whether or not work exists. The machine must also stay up running
|
||||
the loop.
|
||||
|
||||
> **Root cause:** `productId` is doing double duty as *tenant/billing scope* and
|
||||
> *work-routing queue*, and work discovery is a busy-poll against the state store.
|
||||
|
||||
---
|
||||
|
||||
## 2. Goals, non-goals, constraints
|
||||
|
||||
**Goals**
|
||||
- Eliminate idle-poll RU cost; pay (near) zero when there is no work.
|
||||
- Make a factory a **generic build worker** (host + capabilities + engines +
|
||||
checked-out repos), not a product-bound process.
|
||||
- Route work by what actually matters (**capabilities + repo**), while keeping
|
||||
per-product **billing, budgets, visibility, and token scoping**.
|
||||
- Preserve the existing **weighted scheduler** and **leaseEpoch fencing**
|
||||
(exactly-once assignment, zombie-writer protection).
|
||||
- Enable later **on-demand spawn** (scale-to-zero) without re-architecting.
|
||||
|
||||
**Non-goals (this phase)**
|
||||
- Replacing Cosmos as the system of record for job/lease/event/budget **state**.
|
||||
- Rewriting the scheduler's scoring math.
|
||||
- Multi-region / cross-cloud dispatch.
|
||||
|
||||
**Hard constraints (ecosystem rules)**
|
||||
- Every Cosmos doc keeps a `productId` (platform rule) — product stays a
|
||||
first-class **tag**, even when it is no longer the routing key.
|
||||
- Per-product budgets (`fleet_budgets /productId`), enrollment tokens (§12), and
|
||||
the `tracker-web` per-product views must keep working.
|
||||
- Changes must be flag-gated and reversible (match the existing
|
||||
`AQ_FLEET` / `AQ_FLEET_ROUTE` / `AQ_FLEET_SHADOW` cutover discipline).
|
||||
|
||||
---
|
||||
|
||||
## 3. Decision summary
|
||||
|
||||
1. **Do NOT build A3 ("single shared queue") inside Cosmos.** A single logical
|
||||
queue tempts a hot partition; scaling it forces a synthetic partition key and
|
||||
a **cross-partition "find next job" query**, which *increases* RU — the
|
||||
opposite of the goal. It also dissolves the per-product isolation the
|
||||
platform's tenancy/budget/token model depends on.
|
||||
2. **Get the shared-queue behavior from a real broker (B3), not from Cosmos.**
|
||||
Adopt **Azure Service Bus** as the dispatch substrate. Cosmos remains
|
||||
product-partitioned for **state**; the broker owns **delivery**.
|
||||
3. **Keep the scheduler.** Use a **coordinator-owns-scheduling /
|
||||
broker-owns-delivery hybrid** (B2 ⊕ B3): the coordinator decides *which
|
||||
factory* should run a job and pushes a **targeted** message; the broker
|
||||
handles transport, visibility timeout, retries, and dead-lettering.
|
||||
4. **Ship the cheap RU win first (B1) as step M0** — it is reversible, needs no
|
||||
new infra, and de-risks the broker migration by removing the bleed while the
|
||||
bigger change is built and shadowed.
|
||||
|
||||
> Net: the shared-queue *experience* (generic workers, one work stream) comes
|
||||
> from Service Bus topics/subscriptions; Cosmos stays `/productId`-partitioned
|
||||
> for state, budgets, and visibility.
|
||||
|
||||
---
|
||||
|
||||
## 4. Target architecture
|
||||
|
||||
### 4.1 Components & ownership
|
||||
|
||||
| Concern | Owner (target) | Notes |
|
||||
| --- | --- | --- |
|
||||
| Job/lease/event/budget **state** | **Cosmos** (`/productId`, `/jobId` as today) | unchanged system of record |
|
||||
| **Scheduling** (which factory) | **Coordinator** (platform-service) | existing weighted scorer + preemption |
|
||||
| **Dispatch / delivery** | **Service Bus** | competing consumers, visibility timeout, DLQ |
|
||||
| **Fencing** (zombie writers) | **Cosmos `leaseEpoch`** | broker visibility ≠ correctness boundary |
|
||||
| Per-product billing/budgets/tokens | **Cosmos + coordinator** | enforced at submit + assign, not by partition |
|
||||
| Control planes | `tracker-web`, `agent-queue` dashboard | unchanged REST surface |
|
||||
|
||||
### 4.2 Service Bus topology
|
||||
|
||||
- One **topic** `fleet-dispatch`.
|
||||
- **Primary model — coordinator-targeted (preserves the scheduler):** the
|
||||
coordinator picks the factory, then publishes a message stamped with
|
||||
`targetFactoryId`. Each factory has its **own subscription** with a
|
||||
**correlation filter** `targetFactoryId = '<me>'`. The broker does no policy —
|
||||
it just delivers the scorer's decision. **This is the model the rest of this
|
||||
doc assumes.**
|
||||
- **Fallback model — self-select (only if the scheduler is disabled):**
|
||||
capability/repo **SQL filters** on message application properties let consumers
|
||||
self-match. Multi-valued `capabilities` do **not** filter cleanly as one
|
||||
string, so encode each as a boolean property (`cap_os_mac=true`,
|
||||
`repo_learning_ai_notes=true`) rather than `LIKE '%…%'`. Subscription filters
|
||||
are why Service Bus beats Storage Queue / SQS (which can't filter → a
|
||||
queue-per-class sprawl).
|
||||
- **Messages stay small.** A message carries only
|
||||
`{ jobId, productId, repo, caps, priority, targetFactoryId }` — **not**
|
||||
`bodyMd`/manifest. The consumer reads the full job from Cosmos by `jobId`.
|
||||
(Service Bus max message is **256 KB** Standard / 1 MB by default on Premium, configurable up to 100 MB; job bodies can
|
||||
approach that — reinforcing "broker = transport, Cosmos = state".)
|
||||
- **DLQ** per subscription ⇒ maps onto `failed` / `retries_exhausted`.
|
||||
- **Sessions** (optional) keyed by `repo` to serialize same-repo work and avoid
|
||||
worktree/branch contention on one host.
|
||||
|
||||
### 4.3 Why this keeps the scheduler
|
||||
|
||||
A vanilla broker is FIFO competing-consumers and does **no** weighted scoring.
|
||||
To preserve the existing scorer (`capabilityFit / affinity / load / costFit /
|
||||
health / starvation`) + preemption + seat limits, the coordinator stays in the
|
||||
decision path: it **selects the target factory** and publishes a message whose
|
||||
filter routes it to *that* factory's subscription (or a per-factory
|
||||
subscription). The broker is transport, not policy.
|
||||
|
||||
---
|
||||
|
||||
## 5. Key flows
|
||||
|
||||
### 5.1 Submit → dispatch (consistency)
|
||||
|
||||
The **Cosmos change feed on `fleet_jobs` is the durable, ordered event log**, so
|
||||
no separate outbox container is needed for the primary design:
|
||||
|
||||
1. `submitJob` writes the `fleet_jobs` doc (`stage: queued`). That write *is* the
|
||||
event.
|
||||
2. A single **dispatcher** (coordinator process) tails the `fleet_jobs` change
|
||||
feed (via a lease container), runs the scheduler for each new/`queued` job,
|
||||
stamps `targetFactoryId` on the job (CAS), and **publishes** the targeted
|
||||
Service Bus message.
|
||||
3. **Crash-safe & idempotent:** the change feed redelivers from the last
|
||||
checkpoint on dispatcher restart; Service Bus **duplicate detection** keyed on
|
||||
**`MessageId = jobId`** collapses re-publishes. The consumer is idempotent
|
||||
because the authoritative claim is a Cosmos CAS on `leaseEpoch` — a second
|
||||
delivery is simply fenced (`leaseEpoch` is assigned *at claim*, so it is **not**
|
||||
a valid dedup key for the message itself).
|
||||
|
||||
> A separate **transactional outbox** is only needed if you ever publish *inline*
|
||||
> at submit instead of via the change feed. Cross-container writes are **not
|
||||
> atomic** in Cosmos, so an outbox row would have to live in the **same container
|
||||
> + same partition** as the job and be written with a **Cosmos transactional
|
||||
> batch** — or, simpler, carried as an `outboxState` field on the job doc itself.
|
||||
> The change-feed design avoids this entirely.
|
||||
|
||||
> Net effect: the per-factory busy-poll is replaced by one change-feed-driven
|
||||
> dispatcher. Idle cost is event-driven, not a per-3s full-partition scan.
|
||||
|
||||
### 5.2 Deliver → claim → fence
|
||||
|
||||
1. Factory **receives** a message (long-poll/`receiveMessages`, no RU).
|
||||
2. Factory calls `POST /fleet/claim` (or a lighter `/fleet/accept`) with
|
||||
`{ jobId, factoryId }`. Coordinator does the **CAS lease** in Cosmos exactly
|
||||
as today (`revUpdateJob` + `leaseEpoch` bump) and returns the new epoch.
|
||||
409 ⇒ fenced ⇒ factory abandons the message (it is redelivered, then dead-lettered once the subscription's max delivery count is exceeded).
|
||||
3. The **broker lock** governs redelivery (a dead consumer's message reappears);
|
||||
the **Cosmos `leaseEpoch`** governs *correctness* (a zombie writer is rejected
|
||||
on PATCH). Two distinct mechanisms — do not collapse them.
|
||||
4. **Long-running jobs vs the broker lock.** Service Bus message lock max is
|
||||
**5 minutes**; a coding job runs far longer. Two viable patterns:
|
||||
- **(recommended) complete-on-claim:** complete the message immediately after
|
||||
a successful Cosmos claim. The **Cosmos lease + reaper** then own liveness —
|
||||
on crash the reaper sets the job back to `queued`, which is a change-feed
|
||||
event that **re-dispatches** (§5.1). This decouples job runtime from the
|
||||
5-min lock entirely.
|
||||
- **renew-lock:** keep the message locked and call `renewMessageLock` on a
|
||||
timer, reusing the existing `AQ_FLEET_LEASE_RENEW_SEC` cadence to renew
|
||||
*both* the Cosmos lease and the broker lock. Simpler delivery semantics, but
|
||||
couples runtime to the broker and risks redelivery storms on long jobs.
|
||||
|
||||
### 5.3 Failure / retry / DLQ
|
||||
|
||||
Assumes the recommended **complete-on-claim** model (§5.2): the broker message is
|
||||
completed at claim, so the broker is **not** the redelivery path — re-dispatch is
|
||||
driven by Cosmos stage changes through the change feed (§5.1).
|
||||
|
||||
- **Logical failure** (engine error / verify-fail) ⇒ coordinator transitions
|
||||
`failed` and **releases the lease immediately** (new `/fleet/fail`, see §5.5);
|
||||
no redelivery (a logical failure is terminal unless a retry policy applies).
|
||||
- **Retryable failure** ⇒ coordinator sets the job back to `queued` (attempts++,
|
||||
backoff) ⇒ change-feed re-dispatch to the next best factory.
|
||||
- **Crash / lease-expiry** ⇒ the **reaper** reclaims the Cosmos lease (bumps
|
||||
`leaseEpoch`, fencing the dead holder) and returns the job to `queued` ⇒
|
||||
change-feed re-dispatch. (With the alternative *renew-lock* model, broker
|
||||
redelivery is the trigger instead — pick one, not both.)
|
||||
- **Exhausted retries** ⇒ Cosmos `retries_exhausted`; mirror to the broker DLQ
|
||||
for visibility.
|
||||
|
||||
### 5.4 Routing model (the §1.1 fix)
|
||||
|
||||
- Job carries `repo` + required `capabilities` (real tokens: `os:*`, `engine:*`,
|
||||
`has:git`, plus a new `repo:<name>` token).
|
||||
- The **scheduler** does the matching: it picks among factories that advertise
|
||||
those caps **and** have the repo locally (or can clone it), then targets the
|
||||
winner (§4.2 primary model: message stamped `targetFactoryId`, delivered via
|
||||
that factory's correlation-filtered subscription).
|
||||
- **Product is a property/tag** used for billing/visibility and budget checks —
|
||||
**not** the routing key. (In the self-select fallback, product/caps/repo become
|
||||
subscription SQL filters instead.)
|
||||
- Fix the `tracker-web` form in lockstep: derive factories/repos from live data,
|
||||
drop the bogus default `capabilities = "build"`, and stop hardcoding
|
||||
`mac-1/mac-2`.
|
||||
|
||||
### 5.5 Error handling & cleanup (worktrees, branches, leases)
|
||||
|
||||
**Today (single-host, agent-queue.sh).** The worker already handles errors well:
|
||||
the stage machine routes `timeout`/`budget_exceeded`/`crash`/`verify_failed`/
|
||||
`capability_mismatch`/`no_engine` through `_finish_failure` (→ `failed/`, with a
|
||||
retry policy that requeues to `inbox/` with backoff); a `trap` writes a WIP
|
||||
checkpoint to `aq/wip/<job>` on **every** exit path; `recover_orphans` requeues
|
||||
dead-worker `building/` jobs; and a **FENCED** report (stale `leaseEpoch`)
|
||||
triggers `fleet_quarantine` → `failed/` that **never ships or merges**
|
||||
(split-brain guard). PR/merge cleanup: `.aq_pr.md` is removed before commit; the
|
||||
PR branch `aq/job/<jid>` is deleted on auto-merge (`--delete-branch`); the repo
|
||||
worktree is force-recreated at the next job for that repo.
|
||||
|
||||
**Gaps this redesign must close.** These are real loose ends in the current code:
|
||||
|
||||
1. **No client-side lease release on failure.** `_finish_failure` is
|
||||
fleet-agnostic, so a failed fleet job's lease only frees on **expiry** via the
|
||||
reaper — slow recovery. Target: a `POST /fleet/fail` (stage=`failed`/`queued`
|
||||
+ release lease) so failure is reflected and the lease freed **immediately**.
|
||||
2. **Unbounded git artifacts.** `aq/wip/<job>` branches are never GC'd; worktrees
|
||||
are cleaned only on reuse; unmerged `aq/job/<jid>` branches accumulate on
|
||||
origin when auto-merge is off or blocked by branch protection. Target: a
|
||||
periodic **GC** sweep — delete merged `aq/job/*`, prune stale worktrees, and
|
||||
sweep `aq/wip/*` after a job reaches a terminal/shipped stage.
|
||||
3. **Same-repo concurrency can clobber a worktree.** The per-repo worktree is
|
||||
force-recreated, so two same-repo jobs on one host collide. Target: **Service
|
||||
Bus sessions keyed by `repo`** (serialize same-repo work) plus a per-`(host,
|
||||
repo)` lock as a local backstop.
|
||||
|
||||
**Target invariants.**
|
||||
- Terminal failure ⇒ Cosmos `failed` + lease released now (no expiry wait); DLQ
|
||||
mirrors `retries_exhausted` for visibility.
|
||||
- Crash / fence ⇒ reaper bumps `leaseEpoch` (fences zombie) ⇒ `queued` ⇒
|
||||
change-feed re-dispatch (§5.3).
|
||||
- Cleanup is **explicit and idempotent** — safe to re-run, never deletes a branch
|
||||
with unmerged work or a worktree with an in-flight job. (Checklist in §12.)
|
||||
|
||||
---
|
||||
|
||||
## 6. Per-product tenancy without product-partitioned queues
|
||||
|
||||
- **Budgets:** checked by the coordinator at **assign time** (it already reads
|
||||
`fleet_budgets` (partition key `/productId`) in `claimNextJob`); unchanged, just moved to the
|
||||
dispatcher.
|
||||
- **Tokens (§12):** the factory token still scopes `productId + capabilities +
|
||||
factoryId`. In the primary (coordinator-targeted) model the dispatcher only
|
||||
ever targets a factory the scheduler deemed eligible, and the coordinator
|
||||
**re-checks the token on `/fleet/claim`** — so least-privilege holds without
|
||||
relying on the subscription topology. (In the self-select fallback, scope it
|
||||
with per-product/per-token subscription filters instead.)
|
||||
- **Visibility:** `tracker-web` keeps querying per product (state is still
|
||||
product-partitioned), so the UX is unchanged.
|
||||
|
||||
---
|
||||
|
||||
## 7. Alternatives considered
|
||||
|
||||
| Option | Verdict | Reason |
|
||||
| --- | --- | --- |
|
||||
| **A3 shared queue in Cosmos** | ✗ | hot partition; cross-partition claim = more RU; loses tenancy isolation |
|
||||
| **A1 validate ownership only** | partial | fixes "wrong factory" but not the RU/poll model or process-per-product |
|
||||
| **Storage Queue / SQS broker** | ✗ (for now) | no subscription filters ⇒ queue-per-capability sprawl; weaker DLQ/visibility ergonomics |
|
||||
| **B2 change feed, no broker** | viable | good for dispatch signal, but still needs a transport to *reach* factories; pairs naturally with B3 |
|
||||
| **Plain competing-consumers (drop scheduler)** | ✗ | throws away weighted scoring + preemption + cost/affinity routing |
|
||||
| **B3 Service Bus + coordinator hybrid** | ✓ chosen | zero idle RU, keeps scheduler + fencing, filters give capability/repo routing, paves path to B4 |
|
||||
|
||||
---
|
||||
|
||||
## 8. Phased migration
|
||||
|
||||
> Steps are labelled **M0–M3** to avoid collision with the roadmap's Phase 0–5
|
||||
> numbering; all of M0–M3 sit *inside* roadmap **Phase 4**. The ticked checklist
|
||||
> is in §12.
|
||||
|
||||
### M0 — RU quick win (no new infra, fully reversible) — *IMPLEMENTED*
|
||||
- Per-product `fleet_queue_state` doc holds a monotonic `version`, bumped on job
|
||||
create + every stage change (centralized in the repo layer, best-effort).
|
||||
- The factory run loop does a **~1-RU point-read** (`GET /fleet/queue-state`) and
|
||||
**skips the expensive claim** while the version is unchanged and it is not
|
||||
mid-drain — rather than raising `POLL_SECONDS` globally (which would slow local,
|
||||
non-fleet job pickup). A periodic safety backstop + fail-open-on-read-error
|
||||
guarantee work is never stranded.
|
||||
- Gated behind **`AQ_FLEET_GATE=1`** (default OFF ⇒ byte-for-byte prior behavior).
|
||||
- Expected: **~10–50× fewer claim queries at idle**, local responsiveness
|
||||
unchanged.
|
||||
- Code: common_plat `services/platform-service/src/modules/fleet/{types,repository,routes}.ts`
|
||||
+ `lib/cosmos-init.ts`; `agent-queue/lib/fleet-client.sh` (`fleet_gate_*`) + the
|
||||
run-loop hook in `agent-queue.sh`. Tests: fleet vitest (repo bump + endpoint) +
|
||||
selftest `39b` (gate decisions).
|
||||
|
||||
### M1 — Stand up the broker in **shadow**
|
||||
- Provision Service Bus (`fleet-dispatch` topic + subscriptions) with
|
||||
**managed-identity** auth (no connection-string keys in env/`.env`). Coordinator
|
||||
publishes messages **in parallel** with the existing claim path but factories
|
||||
still source work from Cosmos. Use the existing `AQ_FLEET_SHADOW` discipline:
|
||||
record divergence (did the broker route match the scorer's pick?) without
|
||||
acting on it.
|
||||
|
||||
### M2 — Cutover delivery to the broker
|
||||
- Flip a flag so factories source work from Service Bus + `/fleet/claim` for
|
||||
fencing; Cosmos poll path becomes the fallback only. Keep the reaper + lease
|
||||
fencing untouched. Validate exactly-once + crash recovery on multi-host.
|
||||
|
||||
### M3 — On-demand factories (B4)
|
||||
- KEDA / Container Apps scale-to-zero on subscription depth: spin a factory only
|
||||
when depth > 0; idle ⇒ **zero** running workers and zero RU. Warm-pool a single
|
||||
small instance if cold-start latency matters.
|
||||
|
||||
---
|
||||
|
||||
## 9. Risks & mitigations
|
||||
|
||||
| Risk | Mitigation |
|
||||
| --- | --- |
|
||||
| Dual source-of-truth (broker + Cosmos) drift | change-feed *is* the log (no separate outbox); SB duplicate-detection on `MessageId=jobId`; claim is a Cosmos CAS on `leaseEpoch` |
|
||||
| Broker lock vs `leaseEpoch` confusion | explicit rule: broker lock = *delivery*, `leaseEpoch` = *correctness*; never merge (§5.2) |
|
||||
| Long job > 5-min broker lock | **complete-on-claim** (reaper + change feed re-dispatch) or `renewMessageLock` on the lease cadence (§5.2) |
|
||||
| Message > 256 KB | message carries `jobId` + routing props only; consumer reads body from Cosmos (§4.2) |
|
||||
| Same-repo worktree contention across hosts | Service Bus **sessions** keyed by `repo` to serialize same-repo jobs |
|
||||
| Lost scheduler features under FIFO | coordinator keeps assignment; broker only transports targeted messages |
|
||||
| Token scope leak in shared subscriptions | per-factory subscription + correlation filter; coordinator re-checks the §12 token on claim |
|
||||
| Secrets in env (`.env` keys) | **managed identity** for Service Bus + Cosmos; no connection-string keys committed |
|
||||
| Blind operation | emit metrics: subscription depth, dispatch lag, claim-conflict (409) rate, DLQ count, change-feed lag — wire to existing monitoring |
|
||||
| Migration regressions | M1 shadow measures divergence before any cutover; all flag-gated |
|
||||
|
||||
---
|
||||
|
||||
## 10. Open questions
|
||||
|
||||
1. **Per-factory subscription scale.** The chosen coordinator-targeted model uses
|
||||
one subscription per factory (correlation filter on `targetFactoryId`). Service
|
||||
Bus allows up to **2,000 subscriptions/topic**, so this scales for realistic
|
||||
fleets. If factory churn is high, fall back to a single subscription with a
|
||||
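per-consumer `targetFactoryId` SQL filter (to be validated: Service Bus evaluates filters as rules on a subscription, not per receiver, so consumers sharing one subscription compete for every message).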
|
||||
2. **Where does the dispatcher run?** A new lightweight loop in platform-service
|
||||
vs a separate worker. A change-feed lease container is required either way; a
|
||||
single active dispatcher (leader-elected) avoids double-publish.
|
||||
3. **Cost envelope:** Service Bus tier (Standard vs Premium). Standard likely
|
||||
sufficient (sessions are supported on Standard; only Basic lacks them); Premium only if large messages/VNet are needed. Confirm
|
||||
against expected message volume.
|
||||
4. **Do we keep the Cosmos poll path permanently** as an offline/degraded
|
||||
fallback (like today's `AQ_FLEET_ROUTE=0`)? Recommend yes.
|
||||
5. **Repo advertisement.** How does a factory tell the coordinator which repos it
|
||||
has locally (for the `repo:<name>` capability)? Extend the heartbeat payload
|
||||
with a `repos[]` list, or derive from `AQ_FLEET_REPO_BASE`.
|
||||
|
||||
---
|
||||
|
||||
## 11. Appendix — idle RU cost sketch (today vs M0 vs target)
|
||||
|
||||
| Model | Claim/work-find ops at idle (4 factories) | Notes |
|
||||
| --- | --- | --- |
|
||||
| **Today** (poll 3s) | ~115k/day full-partition `listJobs` | scales with partition size; ~`4 × 28.8k` |
|
||||
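| **M0** (poll 15–30s + gate) | ~12–23k/day **1-RU point-reads** + ~0 full scans | full scan only when the gate doc changes (or the safety backstop fires); the point-read count assumes a 15–30s tick — at an unchanged 3s `POLL_SECONDS` it is ~115k/day, still ~1 RU each |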
|
||||
| **Target (B3)** | ~0 | long-poll receive, no RU; full scan never on the hot path |
|
||||
|
||||
> Figures are order-of-magnitude to frame the decision, not a billing estimate.
|
||||
> A full-partition `listJobs` costs many RU and grows with partition size; a
|
||||
> point-read is ~1 RU and flat. The point: idle cost goes from "linear in
|
||||
> partition size, forever" to "≈ zero".
|
||||
|
||||
---
|
||||
|
||||
## 12. Roadmap & checklist (roadmap Phase 4)
|
||||
|
||||
Acceptance gate for the whole effort: **idle work-find RU ≈ 0**, the
|
||||
"wrong-factory / ineligible-capability" stranding is gone, exactly-once
|
||||
assignment + crash recovery still hold on multi-host, and every step is
|
||||
flag-gated + reversible.
|
||||
|
||||
### Coverage matrix (design → plan)
|
||||
|
||||
Every design element maps to a checklist block below — no design decision is left
|
||||
without an implementation step.
|
||||
|
||||
| Design element | §ref | Plan block |
|
||||
| --- | --- | --- |
|
||||
| Idle-poll RU bleed | §1.2 | M0 |
|
||||
| Product-as-queue / wrong-factory | §1.1, §5.4 | Routing-model fix |
|
||||
| Open questions / decisions | §10 | M-prep |
|
||||
| Schema, containers, RBAC | §4, §5, §6 | M-prep |
|
||||
| Service Bus topic + subscriptions + filters | §4.2 | M-prep, M1 |
|
||||
| Change-feed dispatcher + scheduler + targeting | §4.3, §5.1 | M1 |
|
||||
| Budget enforcement at assign | §6 | M1 |
|
||||
| Claim/fence + complete-on-claim | §5.2 | M2 |
|
||||
| Small messages (body from Cosmos) | §4.2 | M2 |
|
||||
| Token re-check at claim | §6 | M2 |
|
||||
| Metrics + alerting | §9 | M2 |
|
||||
| Failure→lease release, GC, same-repo clobber | §5.5 | Error handling & cleanup |
|
||||
| Scale-to-zero on-demand | §3, §5.1 | M3 |
|
||||
| Tests (dispatcher, CAS/fencing, GC, shadow) | §9 | Testing |
|
||||
| Rollback / flags per phase | §3 | Rollback & flags |
|
||||
| Doc updates | — | Docs to update |
|
||||
|
||||
### M-prep — Decisions & schema (closes §10; before M1)
|
||||
- [ ] Lock dispatcher placement (platform-service loop vs separate worker) + **leader election** so a single active dispatcher avoids double-publish (§10 Q2).
|
||||
- [ ] Lock Service Bus tier (Standard default — it already supports sessions; Premium only for large messages / VNet) (§10 Q3).
|
||||
- [ ] Lock subscription model (per-factory correlation filter default; single-subscription SQL filter if factory churn is high) (§10 Q1).
|
||||
- [ ] Confirm the Cosmos poll path stays as a **permanent** flag-gated fallback (`AQ_FLEET_ROUTE=0`) (§10 Q4).
|
||||
- [ ] Confirm repo-advertisement source: `repos[]` in the heartbeat, derived from `AQ_FLEET_REPO_BASE` (§10 Q5).
|
||||
- [ ] Schema: add `targetFactoryId` to `FleetJobDoc`, `repos[]` to `FleetFactoryDoc`; confirm the `fleet_queue_state` doc (`/productId`) for the M0 gate is registered (it shipped with M0); provision the change-feed **lease container**; update the container registry / `COSMOS_AUTO_INIT`.
|
||||
- [ ] RBAC via managed identity: dispatcher = Service Bus **Sender**, factories = **Listener** on their own subscription; no shared keys committed.
|
||||
|
||||
### M0 — RU quick win (no new infra) — ✅ DONE
|
||||
- [x] Add per-product `fleet_queue_state` doc; bump on create + every stage change (repo layer).
|
||||
- [x] Factory loop point-reads the gate each tick; run the claim only when it changed / mid-drain / safety interval.
|
||||
- [x] Keep `POLL_SECONDS` for local responsiveness; gate the *claim*, with a periodic safety backstop + fail-open (instead of raising the global poll interval).
|
||||
- [x] Flag-gate `AQ_FLEET_GATE=1` (default OFF) with a clean off-switch.
|
||||
- [x] Tests: fleet vitest (repo bump + `GET /fleet/queue-state`) + selftest `39b` (gate decisions) green; gate logic verified standalone.
|
||||
|
||||
### Routing-model fix (lands with M0/M1)
|
||||
- [ ] Add `repo:<name>` capability token; factories advertise local repos via heartbeat (`repos[]`).
|
||||
- [ ] Scheduler matches on caps **+ repo**; product becomes a tag, not the routing key.
|
||||
- [ ] Fix `tracker-web` New-Job form: drop default `capabilities="build"`, stop hardcoding `mac-1/mac-2`, derive factories/repos from live data.
|
||||
- [ ] Add product→repo ownership validation (reject/route mismatches) — the A1 safety net.
|
||||
|
||||
### M1 — Broker in shadow
|
||||
- [ ] Provision Service Bus `fleet-dispatch` topic + per-factory subscriptions, each with a **correlation filter** `targetFactoryId='<id>'` (managed identity, no keys).
|
||||
- [ ] Change-feed dispatcher (leader-elected) tails `fleet_jobs`, runs scheduler, stamps `targetFactoryId` (CAS), publishes targeted messages (`MessageId=jobId`, dup-detection on).
|
||||
- [ ] Dispatcher enforces per-product **budget** (paused / ceiling) before publishing (relocates the `claimNextJob` budget check, §6).
|
||||
- [ ] Publish in **shadow** alongside the Cosmos claim path; record route divergence (no action taken).
|
||||
- [ ] Verify: ≥ N hours shadow with broker-route == scorer-pick within tolerance.
|
||||
|
||||
### M2 — Cutover delivery
|
||||
- [ ] Factories consume from Service Bus; `/fleet/accept` does the Cosmos CAS claim + returns `leaseEpoch`.
|
||||
- [ ] Messages carry `{jobId, productId, repo, caps, priority, targetFactoryId}` only; the consumer **reads the full job body from Cosmos** by `jobId` (256 KB limit, §4.2).
|
||||
- [ ] `/fleet/accept` (and `/fleet/claim`) **re-checks the §12 factory token** (productId + caps + factoryId) before granting the lease.
|
||||
- [ ] Implement **complete-on-claim** (reaper + change-feed re-dispatch owns liveness).
|
||||
- [ ] Cosmos poll path retained as flag-gated fallback (`AQ_FLEET_ROUTE=0`).
|
||||
- [ ] Emit metrics: subscription depth, dispatch lag, 409 claim-conflict rate, DLQ count, change-feed lag — **and wire alerts** (DLQ depth > 0, dispatch lag > threshold) into existing monitoring.
|
||||
- [ ] Verify exactly-once + crash recovery on a real multi-host run; DLQ ↔ `failed`/`retries_exhausted` mapping correct.
|
||||
|
||||
### Error handling & cleanup (lands with M2) — see §5.5
|
||||
- [ ] Add `POST /fleet/fail` so a failed job sets the coordinator stage + **releases the lease immediately** (no expiry wait); wire it into `_finish_failure` / `fleet_quarantine`.
|
||||
- [ ] GC sweep (idempotent): delete merged `aq/job/*` branches, prune stale worktrees, sweep `aq/wip/*` after a job reaches a terminal/shipped stage.
|
||||
- [ ] Prevent same-repo worktree clobber: Service Bus **sessions keyed by `repo`** + a per-`(host, repo)` local lock.
|
||||
- [ ] Verify: failed jobs free their lease promptly; no orphaned worktrees/branches after N jobs; GC never deletes unmerged work or an in-flight worktree.
|
||||
|
||||
### M3 — On-demand factories (scale-to-zero)
|
||||
- [ ] KEDA / Container Apps scaler on subscription depth; idle ⇒ zero running workers.
|
||||
- [ ] Optional warm-pool (1 small instance) if cold-start latency matters.
|
||||
- [ ] Verify: zero idle workers + zero idle RU; cold-start latency within target.
|
||||
|
||||
### Testing (every phase — tests are sacred)
|
||||
- [ ] Unit: dispatcher scheduling + publish, claim CAS + `leaseEpoch` fencing, `/fleet/fail`, GC idempotency, the M0 gate read/skip logic.
|
||||
- [ ] Integration: shadow-divergence harness (M1), exactly-once + crash recovery (M2), scale-to-zero behavior (M3).
|
||||
- [ ] Extend `agent-queue/selftest.sh` + platform-service `vitest`; **CI green is the gate** to advance each phase.
|
||||
|
||||
### Rollback & flags (per phase)
|
||||
- [ ] Each phase ships behind a flag with a documented one-line rollback: M0 `AQ_FLEET_GATE`, M1 shadow (publishes but never acts), M2 `AQ_FLEET_ROUTE` / broker-source toggle, M3 scaler disable.
|
||||
- [ ] Verify each rollback returns to the prior working path with **no data loss** and no stranded leases/messages.
|
||||
|
||||
### Docs to update on completion
|
||||
- [ ] `GIGAFACTORY_ROADMAP.md` — tick Phase 4; correct the stale §0 progress table.
|
||||
- [ ] `GIGAFACTORY_SYSTEM_OVERVIEW.md` — add the broker/dispatcher to the architecture + code map.
|
||||
- [ ] common_plat `docs/GIGAFACTORY/` — mirror the backend/dispatcher changes.
|
||||
@ -1,684 +0,0 @@
|
||||
# Agent Gigafactory — Vision & Implementation Roadmap
|
||||
|
||||
> **One-liner:** Evolve today's single-host `agent-queue` bash runner into a distributed **gigafactory** — a fleet of heterogeneous machines (Mac/Ubuntu/Windows), each running different coding-agent CLIs (Devin/Codex/Claude/Copilot/…), where a scheduler **auto-picks jobs from a shared inbox and routes each `.md` to the best factory × tool × profile** — built service-side on `platform-service` + `tracker-web`, with the bash runner surviving as the offline edge agent.
|
||||
|
||||
> **How to use this doc:** It is both a PRD and an execution checklist. Every feature is a `- [ ]` checkbox with **acceptance criteria** and a **verify gate**. A phase is "100% done" only when every box is checked, its gate passes, and the phase **Definition of Done** rubric (§16) is green. Update the progress table (§0) as you go.
|
||||
|
||||
---
|
||||
|
||||
## 0. Progress tracker
|
||||
|
||||
| Phase | Theme | Status | % | Gate |
|
||||
| ----- | ----- | ------ | - | ---- |
|
||||
| **0** | Baseline (today) | ✅ shipped | 100% | `selftest.sh` green |
|
||||
| **1** | Manifest + profiles + capabilities + tracker adapter (single host) | ✅ done | ~98% | adapter e2e + selftest |
|
||||
| **2** | Coordinator as platform-service module + Cosmos + multi-factory leasing | ✅ done | ~98% | fleet e2e + module tests |
|
||||
| **3** | Fleet control plane in tracker-web + DAG deps + budgets + scoring router | ✅ done | 100% | web e2e + router tests |
|
||||
| **4** | Message bus + autoscaling + cross-OS capability marketplace | ◐ in progress | ~10% | load/chaos suite — **M0 RU gate shipped** (`fleet_queue_state` + `GET /fleet/queue-state` + `AQ_FLEET_GATE`); broker/M1+ per `FLEET_DISPATCH_REDESIGN.md` |
|
||||
| **5** | Self-optimizing / learned routing | ☐ not started | 0% | offline eval + A/B |
|
||||
|
||||
Legend: ☐ not started · ◐ in progress · ✅ done. Keep per-phase checklists below as the source of truth; this table is the summary. **Owners per phase: §23 · rollout/rollback: §21 · capacity & SLOs: §22/§19.** For the full current-state architecture, diagrams, code map, next steps and known gaps see **`GIGAFACTORY_SYSTEM_OVERVIEW.md`** (companion doc).
|
||||
|
||||
---
|
||||
|
||||
## 1. Vision & metaphor
|
||||
|
||||
A **gigafactory** turns raw intent (`.md` task files / tracker items) into shipped software with minimal human touch. The mental model is a physical factory network:
|
||||
|
||||
| Term | Meaning |
|
||||
| ---- | ------- |
|
||||
| **Fleet** | The whole network of machines under one control plane. |
|
||||
| **Factory** | One physical/virtual machine (a Mac, an Ubuntu box, a Windows host). Has an OS, installed tools, creds, capacity. |
|
||||
| **Station** | A tool/engine slot inside a factory (a Devin seat, a Codex CLI, a Claude Code session, a Copilot agent). |
|
||||
| **Worker** | A single running agent process executing one job at a station. |
|
||||
| **Job** | A unit of work: a prompt/`.md` + manifest (profile, scope, gates, budget). |
|
||||
| **Profile** | The *role* doing the work (developer, backend engineer, UX/UI designer, QA, reviewer) = persona prompt **+** capability requirements. |
|
||||
| **Capability** | A tag a factory advertises and a job requires (`os:mac`, `has:xcode`, `has:figma`, `gpu`, `engine:devin`). |
|
||||
| **Lease** | A time-boxed claim of a job by a worker; expires → job is reclaimable (crash recovery). |
|
||||
| **Gate** | A checkpoint a job must pass: auto-QA `verify`, human review, ship approval. |
|
||||
| **Artifact** | Any captured output: commits/PRs, logs, screenshots, reports, build outputs. |
|
||||
|
||||
**North star:** drop work into one inbox (or file a tracker task), and the fleet figures out *where* (factory), *with what* (tool/engine), *as whom* (profile), runs it in parallel, self-heals on crash, gates quality automatically, and surfaces everything in one live control plane — while a human only approves the final ship.
|
||||
|
||||
```
|
||||
┌──────────────────────── CONTROL PLANE (tracker-web) ────────────────────────┐
|
||||
│ plan/intake · roadmap · Fleet map · live logs · cost · approvals │
|
||||
└───────────────▲───────────────────────────────────┬─────────────────────────┘
|
||||
│ REST/SSE │
|
||||
┌────────────────────────────┴─────── COORDINATOR (platform-service module) ───────────────┐
|
||||
│ queue · scheduler/router · leases · profiles · capabilities · events · budgets (Cosmos) │
|
||||
└───▲───────────────────────▲───────────────────────▲───────────────────────▲───────────────┘
|
||||
│ claim/lease/report │ │ │
|
||||
┌───────┴───────┐ ┌────────┴───────┐ ┌────────┴───────┐ ┌───────┴────────┐
|
||||
│ FACTORY: mac │ │ FACTORY: ubuntu│ │FACTORY: windows│ │ FACTORY: mac-2 │
|
||||
│ devin, claude │ │ codex, claude │ │ copilot, codex │ │ devin (xcode) │
|
||||
│ [agent-queue] │ │ [agent-queue] │ │ [agent-queue] │ │ [agent-queue] │
|
||||
└───────────────┘ └────────────────┘ └────────────────┘ └────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2. Current state (Phase 0 baseline — already shipped)
|
||||
|
||||
Today's `agent-queue.sh` + `dashboard.mjs` (single host, zero-dep bash + Node):
|
||||
|
||||
- **Folder kanban lifecycle:** `inbox → building → review → testing → shipped` (+ `failed`).
|
||||
- **Auto-QA gate:** agent rc=0 → `review/`; optional `verify:` runs in `cwd` → pass `testing/`, fail `failed/`; no verify → parks in `review/`. Manual `ship` = the human gate.
|
||||
- **Per-job frontmatter:** `engine` (devin/claude/codex), `cwd`, `yolo` (→ dangerous/auto-approve), `lock` (per-repo serialization), `timeout`, `verify`.
|
||||
- **Concurrency:** `AGENT_QUEUE_MAX` (default 3), per-`lock` serialization so same-repo jobs never collide.
|
||||
- **State & logs:** `.state/<job>.meta` heartbeats + `logs/<job>.log`; git-tracked queue (audit-by-commit).
|
||||
- **Interactive dashboard:** numbered selectable job list, single-key actions (promote/ship/reject/requeue), live log viewer, run/stop, all shelling out to `agent-queue.sh`.
|
||||
|
||||
**Carries forward:** the `.md`-in-`inbox` UX, frontmatter contract, lifecycle stage names, `verify` gate, lock/affinity concept, the bash runner itself (becomes the factory agent).
|
||||
**Must change for the fleet:** single-host run loop → distributed leasing; file-only state → service + Cosmos; one engine choice → capability/profile routing; local dashboard → shared control plane.
|
||||
|
||||
- [x] Phase 0 complete — baseline shipped and self-tested. *(reference, not a work item)*
|
||||
|
||||
---
|
||||
|
||||
## 3. Goals & non-goals
|
||||
|
||||
**Goals**
|
||||
- One intake, many machines: parallel execution across heterogeneous OS/tools.
|
||||
- Automatic routing to the best `factory × tool × profile` with affinity, fairness, budget, and health awareness.
|
||||
- Self-healing (lease expiry/requeue), quality gates, and full observability.
|
||||
- Reuse the ByteLyst stack (`platform-service`, Cosmos, `@bytelyst/*`, tracker-web) — no parallel infra.
|
||||
- Preserve offline/zero-dep edge operation via the bash runner.
|
||||
|
||||
**Non-goals**
|
||||
- Not a CI/CD replacement (it *triggers* CI; CI still gates merges).
|
||||
- Not a general-purpose workflow engine (scoped to coding-agent execution).
|
||||
- Not a model/inference host (it orchestrates agent CLIs, doesn't serve models).
|
||||
- Not abandoning the simple `.md` mental model — humans still drop files / file tasks.
|
||||
|
||||
---
|
||||
|
||||
## 4. Core concepts contract (must hold across all phases)
|
||||
|
||||
- [ ] Every job has a stable **id**, an immutable **manifest**, and an append-only **event log**.
|
||||
- [ ] Every Cosmos document carries `productId` (ByteLyst rule).
|
||||
- [x] A job in flight is always covered by exactly one **lease**; no live lease → reclaimable.
|
||||
- [x] **Atomic claim:** a job is assigned to exactly one worker via optimistic concurrency (Cosmos `_etag`/`If-Match` or a conditional `fleet_leases` insert keyed by `jobId`). Concurrent claimers — exactly one wins; losers retry the next candidate.
|
||||
- [x] **Fencing token:** every lease carries a monotonic `leaseEpoch`. Every report/commit/ship carries its epoch; the coordinator **rejects writes from a stale epoch**, so a partitioned or zombie worker cannot corrupt state after its lease was reclaimed.
|
||||
- [ ] **Coordinator-authoritative time:** all lease/TTL/SLA math uses server timestamps, never factory clocks (clock-skew safety).
|
||||
- [ ] Lifecycle stages are canonical and shared: `queued → assigned → building → review → testing → shipped` (+ `blocked`, `failed`, `dead_letter`).
|
||||
- [ ] The bash runner and the service speak the **same manifest + event vocabulary** (one schema, two transports).
|
||||
|
||||
> **Implementation status (2026-05-29) — Phase 2 Foundation merged** (common-plat PR #28, `platform-service/src/modules/fleet/`): all 7 `fleet_*` containers (§13) ✓; repositories + coordinator (claim/lease/fence/heartbeat/reaper) ✓; idempotency + deps + submit-time cycle detection ✓; 50 module tests green.
|
||||
> **✓ P0 hardening landed (2026-05-29, common-plat PR #29) — atomic claim is now truly concurrency-safe.** Added `updateIfMatch` to `@bytelyst/datastore`: Cosmos conditions the replace on `_etag` via `accessCondition {type:'IfMatch'}` (412 → conflict) plus a rev compare for the pre-read window; the Memory provider does `get→compare→set` in one synchronous block (no `await` between), so concurrent callers cannot interleave. `fleet` `revUpdate*` now write conditionally. Proven by `Promise.all` 2-contender + N-claimer stress + concurrent `claimNextJob`/lease-renew tests (these **fail** on the old read-check-write, pass now). datastore 48 + fleet 53 green; full workspace build/test clean; no consumer regressed. **P2-S3 (factory integration) is now unblocked.**
|
||||
|
||||
---
|
||||
|
||||
## 5. The evolved Job manifest (feature)
|
||||
|
||||
Extend today's frontmatter into a richer, **backward-compatible** manifest. Old `.md` files keep working (new fields optional with sane defaults).
|
||||
|
||||
```yaml
|
||||
---
|
||||
# --- existing (unchanged) ---
|
||||
engine: devin # explicit engine; overrides profile/engine-class
|
||||
cwd: /abs/path/repo
|
||||
yolo: true
|
||||
lock: my-repo
|
||||
timeout: 45m
|
||||
verify: pnpm -s test
|
||||
# --- new ---
|
||||
profile: backend-engineer # role: persona + capability requirements
|
||||
engine-class: agentic-coder # abstract; scheduler picks a concrete engine if `engine` unset
|
||||
capabilities: [os:any, node>=20] # hard requirements a factory MUST satisfy
|
||||
prefers: [factory:mac-2] # soft routing hints (affinity)
|
||||
priority: high # critical|high|medium|low → SLA + preemption
|
||||
budget: { usd: 5, tokens: 2M, wall: 4h } # wall = HARD ceiling (always enforceable). usd/tokens = best-effort
|
||||
# caps: enforced only where the engine/provider exposes live metering;
|
||||
# otherwise estimated from provider usage APIs post-hoc + alerted.
|
||||
deps: [job-123, job-456] # DAG: don't start until these reach `shipped`/`testing`
|
||||
idempotency-key: nomgap-ux-2 # dedupe: a second identical submit is a no-op
|
||||
retry: { max: 2, backoff: 5m, on: [timeout, verify_failed] }
|
||||
review-policy: manual # auto|manual|reviewers:[@alice]
|
||||
artifacts: [coverage, screenshots] # what to capture beyond commits
|
||||
tracker-item: ITEM-789 # link back to the originating tracker task
|
||||
---
|
||||
```
|
||||
|
||||
- [ ] Define the manifest schema (Zod in the service; documented YAML for `.md`).
|
||||
- [x] Backward-compat: a Phase-0 `.md` (only `engine/cwd/yolo`) parses with all new fields defaulted. *(P1-S1: bash runner; Zod schema still P2. selftest backward-compat case green.)*
|
||||
- [x] **Capability grammar** defined: tokens are `key` (presence, e.g. `has:xcode`), `key:value` (e.g. `os:mac`, `engine:devin`), or `key<op>version` with `op ∈ {>=,>,=,<=,<}` (e.g. `node>=20`). `os:any` is a wildcard that matches every factory. A job matches a factory iff every required token is satisfied by the factory descriptor. *(P1-S1: `caps_match`/`detect_capabilities` in `agent-queue.sh`.)*
|
||||
- [x] **`engine-class` taxonomy** defined as an enum (`agentic-coder`, `chat-coder`, `review-only`) with a documented engine→class map (`devin,claude,codex → agentic-coder`; `copilot → chat-coder`). If `engine` is set it wins; else the scheduler picks any free engine in the class honoring `prefers-engine`. *(P1-S1: `resolve_engine`; `review-only` mapping reserved.)*
|
||||
- [x] **`idempotency-key` semantics:** `key + content-hash` identical ⇒ no-op (returns existing job). Same `key`, **different** content ⇒ **rejected with 409** unless the prior job is still `queued`/`blocked` (then it is superseded). A re-`run`/`retry` of an existing job is **not** a new submit and never trips dedupe. *(P1-S1: add-time dedupe; bash maps "409" → clear error, `queued` → still in `inbox/` ⇒ superseded.)*
|
||||
- [x] **`deps` semantics:** a dep is satisfied when it reaches `shipped` (default) or `testing` if `deps-mode: soft`. Submit-time **cycle detection** rejects cyclic graphs; unmet deps put the job in `blocked` (not `queued`). Cross-factory deps require the coordinator (P2); single-host deps work in P1. *(P1-S2: `deps_unmet` skip-with-reason in selection + `status` surfacing; `deps_would_cycle` on `add`. Cross-machine deps remain P2.)*
|
||||
- **Acceptance:** a manifest fixture suite parses/validates; invalid manifests fail with precise errors; capability-grammar + dep-cycle + idempotency-conflict cases covered.
|
||||
- **Verify gate:** schema unit tests (≥ 1 per field incl. defaults + 5 invalid cases + grammar/cycle/409 cases).
|
||||
|
||||
---
|
||||
|
||||
## 6. Profiles — persona + capability (feature)
|
||||
|
||||
A **profile** = a versioned file combining a persona (system-prompt overlay), required capabilities, default gates, preferred engine/model, and allowed repo scopes. Stored as `profiles/<name>.md` (Phase 1) → Cosmos `profiles` container (Phase 2).
|
||||
|
||||
```yaml
|
||||
# profiles/backend-engineer.md
|
||||
---
|
||||
name: backend-engineer
|
||||
persona: |
|
||||
You are a senior backend engineer. Favor minimal, well-tested changes...
|
||||
capabilities: [node>=20, has:pnpm]
|
||||
default-verify: pnpm -s typecheck && pnpm -s test
|
||||
engine-class: agentic-coder
|
||||
prefers-engine: [devin, claude]
|
||||
allowed-scope: ["backend/**", "packages/**"] # blast-radius guardrail
|
||||
review-policy: manual
|
||||
---
|
||||
```
|
||||
|
||||
- [x] Author starter catalog: `developer`, `backend-engineer`, `frontend-engineer`, `ux-designer`, `ui-designer`, `qa`, `reviewer`, `docs-writer`. *(P1-S2: `profiles/*.md` + a reserved `planner`.)*
|
||||
- [x] Persona overlay is **prepended** to the job body before the agent runs; secrets are never written to logs or the event stream (redaction at the source). *(P1-S2: `profile_persona` prepended to the stripped body file.)*
|
||||
- [x] Profile supplies default `verify`, `capabilities`, `engine-class`, `allowed-scope` when the job omits them. *(P1-S2: `fm_eff` — also `prefers-engine` + `review-policy`; job fields always override.)*
|
||||
- [ ] Profile versioning: changing a profile doesn't mutate in-flight jobs (snapshot at assign time). *(P2 — needs Cosmos snapshot at assign time.)*
|
||||
- [x] `allowed-scope` enforced as a guardrail (warn in P1, enforce/deny in P2 via pre-flight diff check). *(P1-S2: `scope_check` post-run WARN-only + `scope_warning=` in meta; `path_in_scope` unit-testable.)*
|
||||
- **Acceptance:** a job with `profile: backend-engineer` and no `verify` inherits the profile's verify + persona.
|
||||
- **Verify gate:** profile-resolution unit tests; persona-injection golden test.
|
||||
|
||||
---
|
||||
|
||||
## 7. The scheduler / router (the heart) (feature)
|
||||
|
||||
Given a `queued` job and the current fleet, choose `(factory, station/engine, profile)` and issue a lease.
|
||||
|
||||
**Inputs:** job manifest (capabilities, priority, budget, deps, prefers, lock), profile requirements, live factory descriptors (capabilities, load, health, cost class), lock/affinity table, fairness counters.
|
||||
|
||||
**Algorithm (deterministic, explainable):**
|
||||
1. **Filter** factories by **hard capability match** (job ∪ profile capabilities ⊆ factory capabilities) and a free station for a compatible engine.
|
||||
2. **Block** if `deps` unmet or `lock` already held → leave `queued`/`blocked`.
|
||||
3. **Score** each candidate factory:
|
||||
`score = w1·capabilityFit + w2·affinity(prefers, repo-stickiness) + w3·(1/load) + w4·costFit(budget) + w5·health − w6·starvationPenalty`
|
||||
4. **Tie-break:** highest priority job first; then oldest; then lowest cost class.
|
||||
5. **Assign atomically** → create the lease under an optimistic-concurrency guard (`_etag`/`If-Match` or conditional insert keyed by `jobId`) **with a fresh `leaseEpoch`**; on conflict another factory won → retry the next candidate. Set job `assigned`, decrement station/seat capacity, bump fairness counter. Use **coordinator-authoritative timestamps** only.
|
||||
6. **Preemption (P3+):** a `critical` job may pause a `low` job at a needed station (checkpoint + requeue, bumping the preempted job's `leaseEpoch`).
|
||||
|
||||
> **Phasing:** Phase 2 ships the deterministic **filter + atomic-assign core** (fixed weights). Phase 3 adds **tunable weights, preemption, and the explainability UI**. Phase 5 learns the weights (§14).
|
||||
|
||||
- [ ] Implement pure, unit-testable scoring function (no I/O) with configurable weights.
|
||||
- [ ] Hard-filter correctness: never assign a job to a factory missing a required capability.
|
||||
- [ ] Affinity/stickiness: same-repo jobs prefer the factory that has the warm checkout (lock-aware).
|
||||
- [ ] Fairness: no factory or product starves under sustained load (counter + penalty).
|
||||
- [ ] Explainability: every assignment records *why* (matched caps, score breakdown) in the event log.
|
||||
- [ ] Determinism: same inputs → same decision (seeded tie-breaks) for testability.
|
||||
- [ ] Define **factory health** ∈ [0,1] = f(heartbeat freshness, recent run failure-rate, resource pressure); factories below a health floor are **filtered out**, not merely down-weighted.
|
||||
- [ ] **Station/seat capacity:** a factory's free stations = `min(host slots, per-engine seat limits)` (e.g. licensed Devin/Claude seats); the scheduler never over-subscribes a seat-limited engine.
|
||||
- [ ] **Distributed lock:** the Phase-0 local `lock` becomes a **coordinator-held lock** so same-`lock` jobs serialize across the whole fleet (prevents two factories pushing the same repo concurrently).
|
||||
- **Acceptance:** scenario fixtures (10+) produce expected assignments incl. starvation, capability-miss, seat-exhaustion, unhealthy-factory exclusion, and budget-exceed; a concurrent-claim race test proves exactly one winner.
|
||||
- **Verify gate:** router unit suite ≥ 95% branch coverage on the scoring/filter core; atomic-claim race test.
|
||||
|
||||
---
|
||||
|
||||
## 8. Factory model & registration (feature)
|
||||
|
||||
Each machine runs a **factory agent** (the evolved `agent-queue` runner) that registers, heartbeats, claims jobs, and reports events.
|
||||
|
||||
- [ ] **Capability auto-detection** at boot: OS, installed engines (devin/claude/codex/copilot), tool probes (xcode, figma-cli, docker, gpu), node/pnpm versions, available creds (presence only, never values).
|
||||
- [ ] **Enrollment / bootstrap trust**: first registration authenticates with a one-time enrollment secret (or an operator-issued platform JWT). The factory then receives a **scoped, rotatable factory token** (`jose` JWT); decommission = revoke. No standing shared secret in the queue.
|
||||
- [ ] **Registration**: `POST /fleet/factories/enroll` with descriptor → receives a factory id + a scoped factory token whose plaintext is returned only once (as built: registration == first heartbeat; enroll mints the scoped token).
|
||||
- [ ] **Heartbeat**: periodic `POST /fleet/factories/heartbeat` (load, free stations, health). A **coordinator lease reaper** (not Cosmos TTL) sweeps `expiresAt < now` and reclaims, **bumping `leaseEpoch`** so the dead/zombie worker is fenced; a factory missing N heartbeats is marked `offline` and all its leases reclaimed. **Cadence must be < the 90s stale threshold** (`AQ_FLEET_LEASE_RENEW_SEC`; fleet launcher uses 30s).
|
||||
- [ ] **Claim loop**: `POST /fleet/claim` advertising capabilities/free stations; atomic (exactly one winner, §4); receives a job + lease TTL + `leaseEpoch`. Use **claim backoff / long-poll** to bound Cosmos RU under many idle factories (see §22); **Phase-4 M0 adds the `AQ_FLEET_GATE` skip** (`GET /fleet/queue-state`), and broker push replaces polling in M1+.
|
||||
|
||||
> The endpoint paths above are the **as-built** API (`/fleet/factories/enroll`,
|
||||
> `/fleet/factories/heartbeat`, `/fleet/claim`) — see `GIGAFACTORY_SYSTEM_OVERVIEW.md`
|
||||
> §9 and the fleet module README for the authoritative list.
|
||||
- [ ] **Report**: stream stage/log/event back (`POST /fleet/runs/:id/events`), **echoing `leaseEpoch`** (stale epoch → 409, worker self-aborts); renew lease while alive.
|
||||
- [ ] **Environment prep**: before `verify`, the factory ensures deps are installed (cold checkout → `pnpm install`); prep time counts against `budget.wall`.
|
||||
- [ ] **Graceful drain**: factory can stop claiming, finish in-flight, deregister.
|
||||
- **Acceptance:** a factory enrolls, claims a matching job, heartbeats, completes; a killed factory's job is reclaimed by another within the lease TTL and the killed worker's late report is **rejected by fencing**.
|
||||
- **Verify gate:** factory-agent integration test against a mock coordinator; crash-recovery + fencing-rejection test.
|
||||
|
||||
---
|
||||
|
||||
## 9. Coordination architecture (decision + path)
|
||||
|
||||
Three transports were evaluated. **Decision: platform-service-native coordinator is the spine; git-queue stays for the offline edge; broker added only at scale.**
|
||||
|
||||
| Option | Pros | Cons | Verdict |
|
||||
| ------ | ---- | ---- | ------- |
|
||||
| (a) **Git-synced queue** (evolve folders) | zero infra, audit-by-commit, offline | weak/racy leasing, latency, merge churn | **Edge/offline only** |
|
||||
| (b) **Coordinator service** (platform-service module) | real leases, fairness, observability, reuses auth/Cosmos/productId | a service to run | **Chosen spine (P2)** |
|
||||
| (c) **Message broker** (NATS/Redis/SQS) | scale, backpressure, push dispatch | most moving parts/ops | **P4 when throughput demands** |
|
||||
|
||||
- [ ] Document the decision + rationale in-repo (this section is the canonical record).
|
||||
- [ ] Define the **claim/lease protocol** once; both git-queue (poll) and service (API) implement it.
|
||||
- [ ] **Split-brain / network-partition safety:** a partitioned factory may keep running and even `git push`. `idempotency-key` dedupes *submits* but cannot undo *side-effects*. Mitigation: **fencing** — the coordinator rejects `ship`/merge reports from a stale `leaseEpoch`, and the distributed `lock` (§7) prevents a reclaimed-job's twin from pushing the same repo. Residual risk (a stale push to a feature branch) is contained by the PR-merge ship gate (§10) and surfaced for human triage.
|
||||
- [ ] **Offline-degrade**: a factory cut off from the coordinator falls back to its local git-queue and reconciles on reconnect; on reconnect it presents its `leaseEpoch` — if reclaimed, its results are quarantined, not auto-merged.
|
||||
- [ ] **Poll cost**: bound claim-loop RU via long-poll/backoff (§22); migrate to broker push at P4.
|
||||
- **Acceptance:** the same job manifest runs identically through the bash/git path and the service path; a simulated partition does not double-merge (fencing test).
|
||||
- **Verify gate:** contract test asserting protocol parity (git vs service) + partition/fencing test.
|
||||
|
||||
---
|
||||
|
||||
## 10. tracker-web / platform-service integration (committed path)
|
||||
|
||||
**Layering:** tracker = *WHAT/WHY* (plan, intake, prioritize, roadmap, votes) · gigafactory = *HOW* (execute) · platform-service = shared brain · agent-queue runner = offline edge. Grounded in the real `tracker-service` model (`Item`: `type` bug/feature/**task**, `status` open/in_progress/done/closed/wont_fix, priority, labels, assignee, `source` incl. **auto_detected**, votes, comments, public roadmap) and the `tracker-web` `/api/tracker/[...path]` proxy pattern.
|
||||
|
||||
### Phase 1 — Adapter (no new infra)
|
||||
- [x] **task → job**: a tracker `Item` of `type: task` (e.g. `assignee: @agent` or label `agent:run`) is exported to a job `.md` (manifest mapped: title/description → body, priority → priority, labels → capabilities/profile hints). *(P1-S4: `aq from-tracker`; labels `engine-class:`/`profile:`/`priority:`/`cap:` → frontmatter.)*
|
||||
- [x] **job → tracker**: lifecycle events post back as **status updates + comments** — `building` → status `in_progress` + comment "started on factory X"; `shipped` → `done` + comment with commit SHAs / PR link / verify results; `failed` → comment with reason (status stays `in_progress` for human triage). *(P1-S4: `aq to-tracker` PATCHes status + posts a metrics-only comment; one-way echo §24.5; never fatal. The items API has no blocked/failed status, so failures **currently** map to `wont_fix` by default — this as-built default diverges from the design intent stated here (status stays `in_progress`; never auto-`wont_fix`), so override via `AQ_TRACKER_STATUS_FAILED` where human triage is required.)*
|
||||
- [x] Idempotency: re-running the adapter for the same item doesn't create duplicate jobs (idempotency-key = item id + content hash). *(P1-S4: derived `idempotency-key: tracker-<id>` reuses Slice 1 dedupe; `to-tracker` is idempotent via `tracker_echoed`.)*
|
||||
- [x] Adapter is a thin script/CLI (`aq from-tracker ITEM-789`) + optional poller. *(P1-S4: `from-tracker`/`to-tracker` + opt-in `AQ_TRACKER_AUTO` auto-echo; a standalone poller is deferred.)*
|
||||
- **Acceptance:** filing a tracker task, marking it `agent:run`, results in a queued job; on ship, the item flips to `done` with a SHA comment.
|
||||
- **Verify gate:** adapter e2e against a tracker-service test instance (or mock); round-trip assertion.
|
||||
|
||||
**Stage → tracker status mapping** (tracker's enum is coarser than the fleet's; keep fine-grained stage in a label + comment so no detail is lost):
|
||||
|
||||
| Fleet stage | Tracker `status` | Extra |
|
||||
| ----------- | ---------------- | ----- |
|
||||
| `queued` / `assigned` / `blocked` | `in_progress` | label `fleet:<stage>` |
|
||||
| `building` / `review` / `testing` | `in_progress` | label `fleet:<stage>` + progress comment |
|
||||
| `shipped` | `done` | comment with SHA(s)/PR link/verify result |
|
||||
| `failed` / `dead_letter` | `in_progress` + label `needs-triage` | never auto-`closed`/`wont_fix` (humans decide) |
|
||||
|
||||
**Ship semantics (PR flow):** `shipped` = change **merged to target branch with CI green** (default), OR `pr-opened` when `review-policy` defers merge to humans/CI — configurable per profile. This honors the non-goal that CI still gates merges (§3); the agent never bypasses branch protection.
|
||||
|
||||
### Phase 2 — Native spine
|
||||
- [ ] Stand up a `fleet` (a.k.a. `orchestrator`) module **inside platform-service**, sibling to `tracker-service`: pattern `types.ts → repository.ts → routes.ts`, ESM, Cosmos, `productId`, `req.log`.
|
||||
- [ ] Endpoints: jobs CRUD, claim/lease, events/report, factories register/heartbeat, profiles, stats.
|
||||
- [ ] Runners (bash + any) become API clients of this module; tracker adapter calls it directly.
|
||||
- **Acceptance:** a job submitted via the module is claimed by a real factory and shipped, with all state in Cosmos.
|
||||
- **Verify gate:** module test suite (repository + routes) using the shared `@bytelyst/testing` inject helpers.
|
||||
|
||||
### Phase 3 — Unified control plane
|
||||
- [ ] Add a **Fleet** surface to `tracker-web` reusing auth/Primitives/DataTable/product switcher: fleet map (factories + load/health), job table, job DAG, **live log streaming**, lease/heartbeat status, cost burndown, approve/ship buttons.
|
||||
- [ ] **Streaming caveat (correctness):** live logs **must not** use the existing buffering catch-all proxy `/api/tracker/[...path]` — it does `res.text()` and would never stream. Use a **dedicated Next.js Route Handler returning a `ReadableStream` (SSE)** or a direct SSE/WebSocket to platform-service. Full logs are shipped to blob storage (§17); the endpoint serves stored tail + live append.
|
||||
- [x] The Node TUI dashboard becomes a thin client of the same `/fleet` API (parity with web). *(devops-tools `agent-queue/dashboard.mjs` + `lib/fleet-dash.mjs`, `AQ_FLEET_DASH=1`.)*
|
||||
- **Acceptance:** an operator can watch all factories + tail any job log + ship from the browser.
|
||||
- **Verify gate:** web e2e (Playwright) covering fleet map render, live log, and a ship action.
|
||||
|
||||
---
|
||||
|
||||
## 11. Lifecycle & gates at scale (feature)
|
||||
|
||||
- [ ] Canonical stages enforced server-side: `queued → assigned → building → review → testing → shipped` (+ `blocked`, `failed`, `dead_letter`); transitions validated (illegal transition → 409).
|
||||
- [ ] Per-profile default `verify`; per-job override; verify runs at the factory, result reported as an event.
|
||||
- [ ] Human gates: `review-policy` routes to reviewers; multi-reviewer support (P3).
|
||||
- [x] **Dead-letter**: after `retry.max` exhausted, job → `dead_letter` with full diagnostics; never silently dropped. *(P1-S3 single-host stand-in: `failed/` `result=retries_exhausted`, WIP branch + full log preserved.)*
|
||||
- [ ] **Backpressure**: when no factory can take more, jobs stay `queued` (no thrash); SLA timers visible.
|
||||
- [ ] **Ship semantics** are profile-configurable (merged+green vs `pr-opened`, §10); `shipped` is terminal-success, `dead_letter` terminal-failure; `blocked` (unmet deps) is distinct from `queued`.
|
||||
- [x] **Retry vs idempotency**: a retry creates a new `fleet_runs` attempt under the same job/`idempotency-key` (never a duplicate job); backoff honored; `retry.on` filters which failure classes retry. *(P1-S3 single-host: `attempts` counter survives requeue; `backoff`→`next_eligible` gates selection; `on` filters timeout/verify_failed/crash.)*
|
||||
- **Acceptance:** a perpetually-failing job lands in `dead_letter` after configured retries; a passing one auto-advances to `testing` then waits for human `ship`; an illegal transition is rejected.
|
||||
- **Verify gate:** lifecycle state-machine unit tests (all transitions + illegal-transition rejection + retry/dead-letter path).
|
||||
|
||||
---
|
||||
|
||||
## 12. Security, safety & governance (feature — critical with `yolo`/dangerous)
|
||||
|
||||
- [ ] **Secret isolation**: creds live on each factory (env/keychain), **never** in the queue, manifest, logs, or Cosmos. Factory advertises *presence* of a cred capability, not the value.
|
||||
- [ ] **Scoped git tokens** per factory/repo; least-privilege; rotation documented.
|
||||
- [ ] **Push policy**: protected branches; agents push to feature branches + open PRs by default; direct-to-main gated by profile/flag.
|
||||
- [ ] **Blast-radius guardrail**: enforce `allowed-scope` — pre-flight + post-run diff check; out-of-scope changes block the ship gate.
|
||||
- [ ] **Budget kill-switch**: exceed `budget` (usd/tokens/wall) → pause worker, alert, require human resume.
|
||||
- [ ] **Supply-chain safety**: edits to shared `@bytelyst/*` packages require `reviewer` profile + human gate (never auto-ship).
|
||||
- [ ] **Audit trail**: append-only event log per job (who/what/when/where/cost); immutable.
|
||||
- [ ] **Corp network/proxy**: honor `NETWORK`/proxy + truststore conventions on factories that need them.
|
||||
- [ ] **Kill switch (global)**: one command/flag halts all claiming fleet-wide (incident response).
|
||||
- **Acceptance:** a job attempting an out-of-scope edit is blocked at the gate; a budget overrun pauses and alerts; no secret ever appears in any persisted artifact (scanner test).
|
||||
- **Verify gate:** security test suite incl. a secret-leak scanner over logs/meta + scope-enforcement test.
|
||||
|
||||
---
|
||||
|
||||
## 13. Data model (Cosmos containers, P2+)
|
||||
|
||||
Each container is partitioned on the key listed for it below (`pk`); every document carries `productId`.
|
||||
|
||||
- [x] `fleet_jobs` (pk `/productId`) — manifest snapshot **+ the full instruction body verbatim as markdown (`bodyMd`)**, current stage, idempotency-key, tracker-item link, `checkpoint` pointer (WIP branch/commit). This is the **durable source of truth for instructions** — a factory holds only a transient materialized copy, so a machine going down loses nothing (§25).
|
||||
- [x] `fleet_runs` (pk `/jobId`) — one per execution attempt: factory, engine, profile snapshot, timings, exit, verify result, **and execution insights: model, tokensIn/Out (+cached), cost (`estimated` flag), turns, tool-call counts, filesChanged, linesAdded/Deleted, attempt number** (§26).
|
||||
- [x] `fleet_leases` (pk `/jobId`) — holder factory, `expiresAt`, **`leaseEpoch` (fencing)**, renewals. **Reclaim via a coordinator reaper** that scans `expiresAt < now` — Cosmos TTL only garbage-collects stale rows, it **cannot trigger reclaim logic**. Claim guarded by `_etag`/`If-Match`.
|
||||
- [x] `fleet_factories` (pk `/productId`) — descriptor, capabilities, health, load, last heartbeat, seat limits.
|
||||
- [x] `fleet_profiles` (pk `/productId`) — versioned profile snapshots (immutable per version).
|
||||
- [x] `fleet_events` (pk `/jobId`) — append-only audit/event stream (stage changes, log pointers, cost ticks, scheduler decisions).
|
||||
- [ ] `fleet_artifacts` (pk `/jobId`) — pointers to **blob-stored** logs + artifacts (coverage, screenshots, build output). Large logs live in `@bytelyst/blob`, **never** inline in Cosmos (doc-size + RU limits).
|
||||
- [ ] Relate to existing tracker `Item` via `tracker-item` (no duplication of planning data).
|
||||
- [x] **Optimistic concurrency** (`_etag`) on every job stage transition + lease claim to prevent lost updates / double-assignment. *(PR #29: `updateIfMatch`.)*
|
||||
- [ ] **Indexing/RU**: the claim query is hot — index `stage`, `priority`, `capabilities`; avoid cross-partition fan-out; provision RU/s per §22.
|
||||
- **Acceptance:** repository CRUD + query tests per container; **atomic-claim race test (N concurrent claimers → exactly one wins)**; reaper-reclaim + fencing-rejection test; lease-expiry verified via reaper (not TTL).
|
||||
- **Verify gate:** repository unit/integration tests (memory + Cosmos provider via `DB_PROVIDER`).
|
||||
|
||||
---
|
||||
|
||||
## 14. Phased build roadmap (checklists)
|
||||
|
||||
Each phase: **Goal → checklist → Exit criteria**. Don't start a phase until the prior phase's Exit criteria are green. Tick boxes here as the canonical progress.
|
||||
|
||||
### Phase 1 — Manifest + profiles + capabilities + tracker adapter (single host)
|
||||
**Goal:** richer single-host runner that understands profiles/capabilities and bridges to tracker — no distributed infra yet.
|
||||
|
||||
> **Slice progress — P1-S1:** manifest parsing (all §5 fields, defaulted + backward-compatible), `priority` ordering, capability detection+match gate, `engine-class` resolution, and `idempotency-key` dedupe are **done** on the bash runner.
|
||||
>
|
||||
> **Slice progress — P1-S3 (resilience & insights, single host):** crash recovery (`recover_orphans` + `aq recover`), git WIP checkpoint/resume (`aq/wip/<job>`), functional `retry` policy (backoff + `retries_exhausted`), and execution insights (`parse_usage`, per-run metrics in meta, `aq insights`, `status`/`dash` insights) are **done** — see §11/§25/§26.
|
||||
>
|
||||
> **Slice progress — P1-S2 (profiles + deps/DAG, single host):** the `profiles/` catalog + resolution (`fm_eff` inheritance with job>profile>default precedence, persona injection), the warn-only `allowed-scope` guardrail (`scope_check`/`path_in_scope`), and single-host `deps` (block-with-reason in selection, `status` surfacing, submit-time cycle detection) are **done** — see §5/§6.
|
||||
>
|
||||
> **Slice progress — P1-S4 (tracker adapter, single host):** the task ↔ job round-trip is **done** (§10) — `aq from-tracker` materializes a job from a tracker Item (idempotent on `tracker-<id>`, label→manifest mapping), `aq to-tracker` echoes status + a metrics-only comment one-way (idempotent via `tracker_echoed`, never fatal), and opt-in `AQ_TRACKER_AUTO` auto-echoes on transitions. All HTTP is curl-only through one wrapper (test seam `AQ_TRACKER_API_CMD`). **This closes the Phase-1 §14 tracker-adapter item.** Remaining P1 extras: Node-`dash` surfacing of the new fields. *(`budget.wall` is now enforced — see the `retry`/`budget.wall` checklist item below.)*
|
||||
|
||||
- [x] Extend `agent-queue.sh` frontmatter parsing for all new manifest fields (§5), defaulted + backward-compatible. *(P1-S1)*
|
||||
- [x] Add `profiles/` directory + profile resolution (persona injection, default verify/caps/scope) (§6). *(P1-S2)*
|
||||
- [x] Local capability detection + a job/factory capability match check before launch (§8 subset). *(P1-S1: `detect_capabilities` + `caps_match`; mismatch ⇒ `failed/` `result=capability_mismatch`, agent never launched.)*
|
||||
- [x] `priority` ordering in the inbox pick (replace pure FIFO with priority-then-age). *(P1-S1: `inbox_sorted`; per-lock serialization preserved.)*
|
||||
- [x] `deps` (DAG) blocking on a single host; `idempotency-key` dedupe on `add`. *(P1-S1 idempotency dedupe + P1-S2 `deps` blocking/cycle detection.)*
|
||||
- [x] `retry` with backoff into `failed`/requeue; `budget.wall` enforced (extends `timeout`). *(P1-S3: `retry` with backoff + `retries_exhausted` DONE. `budget.wall` DONE: parsed from `budget: { wall: <dur> }`, armed as a HARD wall-clock ceiling alongside `timeout` (whichever fires first binds), expiry → `failed` result=`budget_exceeded`, non-retryable by default.)*
|
||||
- [x] `allowed-scope` guardrail (warn-only this phase) + post-run diff report. *(P1-S2: `scope_check` WARN-only + `scope_warning=`.)*
|
||||
- [x] **Tracker adapter** `aq from-tracker <ITEM>` + `aq to-tracker` event poster (§10 P1). *(P1-S4: curl-only `tracker_api`; from-tracker materializes a job (idempotent), to-tracker echoes status+metrics one-way; opt-in `AQ_TRACKER_AUTO`. A standalone background poller is deferred to P2.)*
|
||||
- [ ] Dashboard shows profile + priority + capability tags + tracker-item link. *(P1-S1: `status` shows priority/profile/caps/tracker-item; P1-S4: status/insights also show last echoed tracker status; Node `dash` surfacing pending.)*
|
||||
- [x] Update `selftest.sh` with: manifest parse fixtures, profile resolution, priority order, dep-block, idempotency, adapter round-trip (mock). *(P1-S1 manifest/priority/idempotency + P1-S2 profile/persona/scope/dep-block/cycle + P1-S3 resilience/insights + P1-S4 tracker from/to round-trip via stub.)*
|
||||
- [x] Update README + this doc's progress table. *(P1-S1)*
|
||||
- **Exit criteria:** all boxes ✅; `selftest.sh` green; a tracker task → executed → tracker `done` with SHA comment, fully on one host; no regression to Phase-0 `.md` files.
|
||||
|
||||
### Phase 2 — Coordinator as platform-service module + Cosmos + multi-factory leasing
|
||||
**Goal:** the service spine; ≥2 real factories executing in parallel via leases.
|
||||
|
||||
> **Slice progress — P2-S3 (factory-agent integration, single host):** the bash runner
|
||||
> is now a coordinator **factory** behind `AQ_FLEET` — `lib/fleet-client.sh` (curl-only,
|
||||
> sourced) registers via heartbeat, claims jobs into inbox (interleaved with local `.md`),
|
||||
> reports **fenced** stage transitions with WIP checkpoints, renews/releases leases, and on
|
||||
> a stale `leaseEpoch` (reclaimed) **self-aborts + quarantines** the local result. Coordinator
|
||||
> 5xx/connection errors **degrade** (finish locally) rather than abandon work. When `AQ_FLEET`
|
||||
> is off, the offline git-queue path is byte-for-byte unchanged. The previously remaining P2 items —
|
||||
> scheduler/router core, direct tracker→module calls, factory enrollment + scoped tokens,
|
||||
> `fleet.*` feature flags + shadow/dual-run, and the two-factory parallel demo — are now all
|
||||
> landed in common-plat (`scheduler.ts`, `tracker-bridge.ts`, `enrollment.ts`).
|
||||
|
||||
- [x] Scaffold `fleet`/`orchestrator` module in `platform-service` (`types/repository/routes`, Zod, ESM, `productId`). *(PR #28)*
|
||||
- [x] Cosmos containers (§13) + repository layer (memory + Cosmos providers). *(PR #28; `fleet_artifacts` blob wiring still pending.)*
|
||||
- [x] **Atomic claim** (optimistic concurrency / `_etag`) + **lease reaper** + **fencing (`leaseEpoch`)** endpoints (§4/§8/§9) — *not* Cosmos-TTL-driven reclaim. *(common-plat PR #28 + #29; truly atomic via `updateIfMatch`.)*
|
||||
- [x] Port `agent-queue` runner to a **factory agent** API client (enroll/register/heartbeat/claim/report, fencing-aware) while keeping git-queue fallback. *(P2-S3: `lib/fleet-client.sh` behind `AQ_FLEET`; registers via heartbeat, claims into inbox, reports fenced stage transitions, renews leases, quarantines on stale-epoch; offline git-queue unchanged when the flag is off.)*
|
||||
- [x] Scheduler/router core (§7) as a pure module (fixed weights) + wired into atomic assignment. *(common-plat `fleet/scheduler.ts` pure `selectJob`/`scoreCandidate`/`selectPreemptionVictim`; `coordinator.ts` `claimNextJob` ranks candidates via `selectJob` after the capability hard-filter.)*
|
||||
- [x] Tracker adapter calls the module directly (not just file export). *(common-plat `fleet/tracker-bridge.ts` + `POST /fleet/tracker/ingest` / `/fleet/tracker/echo`: idempotent ingest of a tracker item → job and one-way status echo, in-module.)*
|
||||
- [x] Auth: factory enrollment + scoped rotatable tokens; secret isolation enforced (§12 subset). *(common-plat `fleet/enrollment.ts`: `enrollFactory`/`rotateToken`/`revokeToken` issue a plaintext token once, store it hashed, scope it to `{productId, factoryId, capabilities}`; `enforceFactoryToken` gates `claim`/`heartbeat` in `routes.ts`.)*
|
||||
- [x] **Feature flags** (`fleet.enabled`, `fleet.route_via_service`) + **shadow/dual-run** vs P1 before cutover (§21). *(agent-queue runner: `AQ_FLEET` / `AQ_FLEET_ROUTE` / `AQ_FLEET_SHADOW` with documented precedence; shadow claim/compare/report is side-effect-free (isolated `-shadow` factoryId + dryRun, never materializes/ships); `fleet-shadow-report` summarizes AGREE/DIVERGE/COORD_EMPTY/LOCAL_EMPTY + agreement; 60→68 selftest checks.)*
|
||||
- [x] Module test suite (repository + routes via `@bytelyst/testing`); **atomic-claim race**, crash-recovery, fencing-rejection, reaper-reclaim tests. *(PR #28 + #29: 53 fleet + 48 datastore tests, incl. true-concurrency claim.)*
|
||||
- [x] Two-factory demo (e.g. mac + ubuntu) running 3 parallel jobs end-to-end. *(`agent-queue/demo/two-factory-demo.sh` + `coordinator-stub.sh`: two real `run` daemons (mac-1 + ubuntu-1, separate queues/cwds) compete through one coordinator; asserts (a) no double-assign, (b) kill-mid-job → reaper reclaim → survivor completes → zombie report fenced (409), (c) concurrent parallelism. Dual-mode: CI-safe stateful stub by default, live platform-service when `AQ_FLEET_API`/`AQ_FLEET_TOKEN` set. Headless checks in `selftest.sh` → 68→71 green.)*
|
||||
- **Exit criteria:** all boxes ✅; `pnpm --filter @lysnrai/platform-service test` green; killing a factory mid-job → another reclaims and completes **and the dead worker's late report is fenced**; concurrent claimers never double-assign; all state in Cosmos with `productId`; **flag-off rollback verified** (§21). — _Runtime exit guarantees **demonstrated** by the two-factory demo (no double-assign + reclaim/fence + parallelism) and flag-off rollback verified (§21). Scheduler/router core, tracker-module direct calls, and factory enrollment + scoped tokens are now all wired in (see boxes above) — Phase 2 is effectively complete. **Remaining for a hard 100%:** validate the Cosmos `_etag` CAS path under true production contention + live blob-backed `fleet_artifacts`._
|
||||
|
||||
### Phase 3 — Fleet control plane in tracker-web + DAG + budgets + scoring router
|
||||
**Goal:** one browser control plane; smart routing + budgets live.
|
||||
|
||||
- [x] `fleet` API client in `tracker-web` (reuse `/api/tracker`-style proxy → `/fleet`). *(common-plat `dashboards/tracker-web/src/lib/fleet-client.ts`: typed client over `/api/fleet`.)*
|
||||
- [x] Fleet map page (factories, load, health, capabilities) on `@bytelyst/*` components. *(common-plat `app/dashboard/fleet/page.tsx`: health badges, load, capabilities, fleet metrics + alerts.)*
|
||||
- [x] Job table + job detail + **DAG view**; live log via **SSE**; approve/ship/reject/requeue actions. *(common-plat `app/dashboard/fleet/jobs/page.tsx` + `jobs/[id]/page.tsx`: stage-filtered table, DAG via `getJobDag`, SSE event stream, ship/requeue/reject/requestReview.)*
|
||||
- [x] Cost burndown + budget kill-switch UI; multi-reviewer routing. *(common-plat `app/dashboard/fleet/budget/page.tsx` burndown + pause/resume; `ReviewGateCard` multi-reviewer quorum gate via `requestReview`/`submitReview`.)*
|
||||
- [x] Scoring router with configurable weights + explainability surfaced in UI. *(common-plat `fleet/scheduler.ts` tunable weights + `GET /fleet/jobs/:id/explain`; `ExplainPanel` breakdown in job detail.)*
|
||||
- [x] Preemption of low-priority by critical jobs (checkpoint + requeue). *(common-plat `fleet/scheduler.ts` `selectPreemptionVictim` + coordinator eviction under `FLEET_PREEMPTION`; victim requeued with checkpoint + bumped epoch, `preempted` event.)*
|
||||
- [x] TUI dashboard re-pointed at `/fleet` API (parity). *(devops-tools `agent-queue/lib/fleet-dash.mjs` adapter + `dashboard.mjs` fleet mode under `AQ_FLEET_DASH=1`: board/factories/metrics/alerts, job actions ship/requeue/reject via `/fleet`, per-job events log; opt-in so local mode is byte-for-byte unchanged. Verified by `lib/fleet-dash.test.mjs` (22 assertions) wired into `selftest.sh` + live non-TTY render smoke.)*
|
||||
- [x] Web e2e (Playwright): fleet map, live log, ship, budget-pause. *(common-plat `dashboards/tracker-web/e2e/fleet.spec.ts`: fleet overview, metrics, job detail, ship, budget-pause, review-gate specs green.)*
|
||||
- **Exit criteria:** all boxes ✅; web `verify` (typecheck+lint+test+e2e) green; an operator runs the whole 3-repo parallel workload from the browser, including a budget pause + resume.
|
||||
|
||||
### Phase 4 — Message bus + autoscaling + cross-OS capability marketplace
|
||||
**Goal:** scale-out and elasticity.
|
||||
|
||||
- [ ] Introduce broker (NATS/Redis) for push dispatch + backpressure; coordinator publishes, factories subscribe by capability.
|
||||
- [ ] Autoscaling hooks (spin ephemeral factories: cloud VM / container) keyed to queue depth + SLA.
|
||||
- [ ] Capability "marketplace": jobs requiring rare caps (xcode/figma/gpu) routed to the few factories that have them; queueing fairness across products.
|
||||
- [ ] Load + chaos test suite (factory churn, broker outage, thundering herd).
|
||||
- **Exit criteria:** all boxes ✅; sustained N×throughput vs Phase 3 under load test; graceful degradation on broker outage (fallback to poll).
|
||||
|
||||
### Phase 5 — Self-optimizing / learned routing
|
||||
**Goal:** the scheduler learns from history to cut time/cost and raise first-pass success.
|
||||
|
||||
- [ ] Capture outcome features per run (engine, profile, repo, duration, cost, verify pass, human-edit rate).
|
||||
- [ ] Offline eval harness comparing learned vs heuristic routing on historical data.
|
||||
- [ ] Shadow/A-B rollout with guardrails; auto-tune scoring weights.
|
||||
- [ ] Recommendations surfaced ("route NomGap UX jobs to claude on mac-2: 23% faster, 11% cheaper").
|
||||
- **Exit criteria:** all boxes ✅; learned router beats heuristic on the eval set without regressing safety gates; A/B shows measurable improvement on a target metric.
|
||||
|
||||
---
|
||||
|
||||
## 15. Cross-cutting feature catalog (quick index)
|
||||
|
||||
| Feature | First phase | Section |
|
||||
| ------- | ----------- | ------- |
|
||||
| Evolved job manifest | P1 | §5 |
|
||||
| Profiles (persona + capability) | P1 | §6 |
|
||||
| Capability matching | P1→P2 | §6/§8 |
|
||||
| Priority + SLA | P1 | §5/§7 |
|
||||
| DAG dependencies | P1→P3 | §5/§11 |
|
||||
| Idempotency / dedupe | P1 | §5 |
|
||||
| Retry + dead-letter | P1→P2 | §11 |
|
||||
| Budgets + kill-switch | P1(wall)→P3 | §5/§12 |
|
||||
| Scheduler/router scoring | P2→P3 | §7 |
|
||||
| Factory registration/heartbeat/lease | P2 | §8 |
|
||||
| Coordinator (platform-service module) | P2 | §9/§10 |
|
||||
| Cosmos data model | P2 | §13 |
|
||||
| Tracker bi-directional sync | P1→P2 | §10 |
|
||||
| Web control plane + SSE logs | P3 | §10/§17 |
|
||||
| Security/scope/secret isolation | P1→P2 | §12 |
|
||||
| Broker + autoscaling | P4 | §14 |
|
||||
| Learned routing | P5 | §14 |
|
||||
| Atomic claim + fencing + distributed lock | P2 | §4/§7/§9 |
|
||||
| Rollout / rollback / feature flags | P2→ | §21 |
|
||||
| Capacity planning & RU/cost | P2→ | §22 |
|
||||
| Ownership & RACI / on-call | all | §23 |
|
||||
| Work hierarchy & composite delegation (roadmap/epic) | P3 (manual) → P5 (planner) | §24 |
|
||||
| Durability, crash recovery & work preservation | P1 (orphan/retry/WIP) → P2 (lease/resume) | §25 |
|
||||
| Execution insights & token accounting | P1 (capture) → P3 (rollup UI) | §26 |
|
||||
|
||||
---
|
||||
|
||||
## 16. Definition of Done — the "100% accuracy" rubric
|
||||
|
||||
A feature/phase is **not done** until **every** item below is true (this is the bar for "100% end-to-end"):
|
||||
|
||||
- [ ] **Functionality**: acceptance criteria met; happy path + documented edge cases handled.
|
||||
- [ ] **Tests**: unit + integration written *first or alongside*, all green; no weakened/deleted tests; coverage targets met (router ≥95% core).
|
||||
- [ ] **Verify gate**: the phase's named gate command passes locally (and in CI where applicable).
|
||||
- [ ] **Idempotency & recovery**: re-runs are safe; crash mid-step recovers (lease/idempotency).
|
||||
- [ ] **Security review**: secret-leak scan clean; scope guardrail honored; least-privilege tokens.
|
||||
- [ ] **Observability**: events/logs/metrics emitted; failures are diagnosable from the control plane.
|
||||
- [ ] **Docs**: this roadmap's checkboxes ticked; README/AGENTS updated; manifest/profile docs current.
|
||||
- [ ] **Backward-compat**: existing `.md`/Phase-0 behavior unbroken (regression check).
|
||||
- [ ] **Drift checks**: shared-infra templates (`.npmrc`, `docker-prep`) untouched/synced; conventional commits.
|
||||
- [ ] **No `console.log`/`print`** in service code; `req.log`/`os.Logger` used; ESM `.js` imports.
|
||||
|
||||
---
|
||||
|
||||
## 17. Observability & control plane details
|
||||
|
||||
- [ ] **Log transport/storage**: factory ships logs to blob (`@bytelyst/blob`); `fleet_events` carries pointers + a recent-tail buffer. The control plane serves stored tail + live append (via the streaming route, **not** the buffering proxy — §10).
|
||||
- [ ] **Live logs** via SSE (single stream contract) from the streaming endpoint to web/TUI.
|
||||
- [ ] **Metrics**: queue depth, `blocked` count, assign latency, claim-loop RU/s, run duration, verify pass-rate, cost, factory utilization, fairness, reclaim/fencing-rejection counts.
|
||||
- [ ] **Alerting**: stall (no log output for N min), failure spikes, budget breach, factory offline, dead-letter, **claim-race anomalies**, RU throttling (Cosmos 429s).
|
||||
- [ ] **Tracing**: a job's full timeline (queued→…→shipped) reconstructable from `fleet_events` (immutable, ordered).
|
||||
- [ ] **Cost burndown** per job/product/day with budget overlays.
|
||||
- [ ] **SLOs defined + dashboarded** (see §19 targets); error budget tracked per SLO.
|
||||
|
||||
---
|
||||
|
||||
## 18. Risks & gaps explicitly tracked (expert call-outs)
|
||||
|
||||
- [ ] **Duplicate execution** across transports (git fallback + service) — `idempotency-key` (submit) + atomic lease (assign) + **fencing token** (side-effect) + distributed `lock` (push).
|
||||
- [ ] **Crash recovery** — coordinator **lease reaper + fencing** (not Cosmos TTL); checkpoint long jobs where engines allow.
|
||||
- [ ] **Split-brain / partition** — fencing rejects stale `leaseEpoch` writes; reclaimed-job results quarantined, not auto-merged (§9).
|
||||
- [ ] **Shared-package conflicts** — two jobs editing `@bytelyst/*` simultaneously → fleet-wide `lock` + reviewer gate.
|
||||
- [ ] **Starvation/fairness** — per-product + per-factory counters with penalty.
|
||||
- [ ] **Cost runaway** — `budget.wall` hard ceiling everywhere; `usd`/`tokens` best-effort (provider metering) + global kill switch.
|
||||
- [ ] **Cosmos RU throttling (429)** — hot claim path; bound via long-poll/backoff + indexing (§13/§22); broker offload at P4.
|
||||
- [ ] **Clock skew** — coordinator-authoritative timestamps for all lease/SLA math (§4).
|
||||
- [ ] **Tool-version drift / reproducibility** — record engine + tool versions per run; pin where possible.
|
||||
- [ ] **Windows quirks** — path/shell differences in the factory agent; capability-gate Windows-only work.
|
||||
- [ ] **Human-review bottleneck** — auto-verify as much as possible; batch review UI; reviewer routing.
|
||||
- [ ] **Result capture beyond commits** — artifacts (coverage, screenshots, build logs) attached to runs.
|
||||
- [ ] **Secret sprawl** — never in queue/manifest/logs/Cosmos; presence-only capabilities.
|
||||
- [ ] **Data retention** — event/log retention + archival policy (extend today's `clean`).
|
||||
- [ ] **Engine API churn** — engines mapped in one place (`build_agent_cmd`); capability matrix versioned.
|
||||
|
||||
---
|
||||
|
||||
## 19. Success metrics
|
||||
|
||||
Each metric has a **provisional SLO target** (tune with real data; tracked with an error budget):
|
||||
|
||||
| Dimension | Metric | Provisional SLO target |
|
||||
| --------- | ------ | ---------------------- |
|
||||
| Throughput | jobs shipped/day; parallel utilization | utilization ≥ 60% under backlog |
|
||||
| Quality | % auto-verified; first-pass success; escaped-defect; post-agent human-edit rate | first-pass ≥ 70%; escaped-defect < 2% |
|
||||
| Speed | assign latency; time queued→shipped (excl. human gate) | assign p95 < 5s; queue-wait p95 < 2m at target load |
|
||||
| Cost | $/shipped job; budget-breach rate | budget-breach < 1% of jobs |
|
||||
| Reliability | lease-reclaim success; dead-letter rate; factory uptime; double-execution incidents | reclaim success ≥ 99.9%; **double-merge = 0**; dead-letter < 5% |
|
||||
| Fairness | max/min product wait-time ratio | ratio < 3× |
|
||||
| Correctness | atomic-claim violations; fencing rejections functioning | claim violations = 0 |
|
||||
|
||||
> Targets are starting points; the §0 owners ratify per-phase SLOs before that phase's exit.
|
||||
|
||||
---
|
||||
|
||||
## 20. Open questions
|
||||
|
||||
- [ ] Copilot headless feasibility as an engine/station (CLI/automation surface?).
|
||||
- [ ] Who owns merge/push authority — agents open PRs only, or auto-merge on green for low-risk profiles?
|
||||
- [ ] Multi-user/tenant: per-user queues + RBAC in the control plane?
|
||||
- [ ] On-call/ownership for the fleet (alerts routing, runbooks)?
|
||||
- [ ] Cloud factory provisioning (Phase 4) — which provider/runtime, cost guardrails?
|
||||
- [ ] Profile authorship/governance — who can create/edit profiles, and review of persona prompts?
|
||||
|
||||
---
|
||||
|
||||
## 21. Rollout, rollback & data migration
|
||||
|
||||
Each phase ships behind controls so it can be turned off without losing work.
|
||||
|
||||
- [ ] **Feature-flagged rollout**: gate each phase's new path behind a platform feature flag (`fleet.enabled`, `fleet.route_via_service`, `fleet.tracker_sync`); default off; enable per-product first.
|
||||
- [x] **Dual-run / shadow**: P2 coordinator runs in shadow (assign decisions logged, not executed) alongside the P0/P1 path before cutover; compare decisions. *(agent-queue `AQ_FLEET_SHADOW=1`: offline path stays authoritative, coordinator queried in parallel, decisions classified AGREE/DIVERGE/COORD_EMPTY/LOCAL_EMPTY into `.state/fleet-shadow.log`; strictly side-effect-free — never ships/quarantines/mutates real job state.)*
|
||||
- [x] **Cutover is reversible**: a factory can fall back from service-claim to git-queue via flag; no schema-destructive step on the rollback path. *(rollback = `AQ_FLEET_ROUTE=0` and/or `AQ_FLEET=0` at any time → instant return to the local/offline path; no data migration.)*
|
||||
- [ ] **Data migration**: introducing Cosmos containers (P2) is **additive** — no migration of existing tracker data; backfill is read-only (link `tracker-item`, don't mutate). Container creation is idempotent (registered in `cosmos-init`).
|
||||
- [ ] **Backward-compat gate**: every phase re-runs Phase-0 `selftest.sh` + a corpus of legacy `.md` files (regression).
|
||||
- [ ] **Rollback drill**: each phase's exit includes a tested rollback (flag off → prior behavior, in-flight jobs drain or requeue cleanly).
|
||||
- **Acceptance:** flipping `fleet.*` flags off returns the system to the prior phase's behavior with zero data loss; in-flight jobs either complete or requeue.
|
||||
- **Verify gate:** rollout/rollback drill documented + a flag-off regression run is green.
|
||||
|
||||
---
|
||||
|
||||
## 22. Capacity planning & cost
|
||||
|
||||
- [ ] **Concurrency model**: fleet throughput = Σ factory free-stations, bounded by per-engine **seat limits** (e.g. N Devin seats) — document seat inventory per engine before P2.
|
||||
- [ ] **Cosmos RU budgeting**: the claim/heartbeat paths are the hot loops. Estimate RU/s = (factories × claim-poll rate × query RU) + (factories × heartbeat rate × upsert RU); pick **long-poll interval** to keep steady-state RU within a provisioned budget; enable autoscale RU with a ceiling + 429 alerting.
|
||||
- [ ] **Polling vs push**: poll RU grows linearly with the number of factories (F) — define the F threshold that triggers the P4 broker migration.
|
||||
- [ ] **Blob storage**: logs/artifacts sizing + lifecycle (hot → cool → delete) per retention policy (§18).
|
||||
- [ ] **Factory sizing**: per-OS resource baseline (CPU/RAM/disk for N concurrent agent sessions + warm checkouts); disk pressure as a health input.
|
||||
- [ ] **Cost guardrails**: per-product spend caps + alerts; ties to `budget` and the global kill-switch.
|
||||
- **Acceptance:** a documented capacity sheet (seats, RU/s, blob GB, factory specs) sized for the target steady-state + 2× burst.
|
||||
- **Verify gate:** load test sustains target throughput within the RU/cost budget (no 429 storms).
|
||||
|
||||
---
|
||||
|
||||
## 23. Ownership & RACI
|
||||
|
||||
Owners are roles, not names — assign before each phase starts (this removes the "undefined owner" gap).
|
||||
|
||||
| Area | Responsible (R) | Accountable (A) | Consulted (C) | Informed (I) |
|
||||
| ---- | --------------- | --------------- | ------------- | ------------ |
|
||||
| Runner / factory agent (bash) | DevOps eng | Platform lead | — | All |
|
||||
| Coordinator module (platform-service) | Backend eng | Platform lead | Security | All |
|
||||
| Scheduler/router | Distributed-systems eng | Platform lead | Backend | All |
|
||||
| Control plane (tracker-web Fleet) | Frontend eng | Platform lead | UX | All |
|
||||
| Security/governance | Security eng | Security lead | Platform | All |
|
||||
| Capacity/cost & SLOs | SRE | Platform lead | Finance | All |
|
||||
| Profiles & persona governance | Eng leads | Platform lead | — | All |
|
||||
|
||||
- [ ] Each phase names its R/A before kickoff; SLOs (§19) ratified by A.
|
||||
- [ ] On-call + runbooks established before the fleet runs unattended `yolo` workloads (Phase 2+).
|
||||
|
||||
---
|
||||
|
||||
## 24. Work hierarchy & composite delegation (roadmap / epic)
|
||||
|
||||
**Goal:** delegate work at *any* granularity — a single bug/feature/task, **or an entire roadmap** — and let the fleet decompose + orchestrate rather than hand a multi-day roadmap to one agent session (which is long-horizon, low first-pass-success, and high blast-radius under `yolo`).
|
||||
|
||||
### 24.1 Two delegation modes
|
||||
- **Atomic** (today's model): one leaf item (`bug`/`feature`/`task`) → one job → one agent at one station.
|
||||
- **Composite** (new): a `roadmap`/`epic` → a **planner** profile expands it into child jobs → the scheduler runs them as a **DAG across factories/agents/profiles**, honoring `deps` + phase gates. "Delegate the whole roadmap" = hand it to the **orchestrator**, which fans out — never one agent grinding for hours.
|
||||
|
||||
### 24.2 Job `kind` — the one genuinely new concept
|
||||
A new axis, **orthogonal to tracker `type`**:
|
||||
- **`kind: leaf`** — runs an engine at a station (everything Phase 1–2 already does).
|
||||
- **`kind: composite`** — runs the **planner/orchestrator** that emits child `leaf` jobs and a dependency graph; it never itself edits a repo.
|
||||
|
||||
The scheduler (§7) routes by `kind`: `leaf` → station/engine; `composite` → planner. This keeps execution and planning cleanly separated.
|
||||
|
||||
### 24.3 Hierarchy & relationships
|
||||
- [ ] `parentId` links a child job/item to its roadmap/epic; `deps` (§5) expresses ordering within it (DAG, submit-time cycle detection).
|
||||
- [ ] A roadmap is, mechanically, a **named DAG of jobs + a rollup** — it reuses `deps`, profiles (§6), the scheduler (§7), and the lifecycle (§11); the only additions are `kind`, `parentId`, and rollup logic.
|
||||
- [ ] Add a **`planner`/`architect`/`tech-lead` profile** (§6 catalog) for decomposition + orchestration; leaf work still uses `backend-engineer`, `ux-designer`, etc.
|
||||
|
||||
### 24.4 Rollup semantics (composite-level)
|
||||
- [ ] **Status rollup:** roadmap `status` is derived from children — `in_progress` once any child starts; `shipped`/`done` only when **all** children reach `shipped`; surfaces `blocked`/`failed` children for triage.
|
||||
- [ ] **Budget rollup:** roadmap `budget` = Σ child budgets with an explicit **ceiling**; breaching the ceiling pauses fan-out (ties to §12 kill-switch).
|
||||
- [ ] **Verify rollup:** each leaf runs its own `verify`; the roadmap's acceptance gate runs **after** all leaves pass (e.g. an integration/e2e gate).
|
||||
- [ ] **Phase gates:** the roadmap's own phase Exit-criteria become **runtime gates** — fan-out of phase N+1 is blocked until phase N's children ship; human approval between phases is the default for `yolo` safety.
|
||||
- [ ] **Idempotent re-run:** re-running a roadmap **skips already-`shipped` children** (content-hash dedupe, §5); only unfinished/changed children re-queue.
|
||||
|
||||
### 24.5 Source-of-truth & sync (no drift)
|
||||
Composite work obeys the same SoT discipline as the core contract (§4 immutable manifest) and the tracker echo (§10): a roadmap/epic is **one record referenced by many**, never duplicated.
|
||||
- [ ] The **roadmap/epic** is the SoT for *what/why + rollup status*; each **leaf job/run** is the SoT for *its* execution.
|
||||
- [ ] Children reference the parent by `parentId`; the planner writes the child set **once** at decomposition (immutable manifest snapshot). Re-planning creates a new revision; it does not mutate in-flight children.
|
||||
- [ ] Status flows **one way, child → parent → tracker** (the §10 echo); humans never hand-edit rollup state.
|
||||
|
||||
### 24.6 Decision — **Hybrid** (recorded)
|
||||
> Model composite delegation in the **fleet layer now**; defer the shared-platform enum change until proven.
|
||||
|
||||
- **Now (fleet-owned):** add `kind` (`leaf`/`composite`), `parentId`, and rollup to the `fleet_jobs` schema (§13). The fleet owns this schema outright — no cross-product risk.
|
||||
- **Tracker stays `bug`/`feature`/`task`** (the shared `ITEM_TYPES` used by all 9 products is unchanged). A roadmap is represented by a **parent item + label `kind:roadmap`** + `parentId` on children — zero platform migration, no sign-off needed.
|
||||
- **Later (optional, gated on proven value):** promote `kind:roadmap` → a first-class `epic` tracker `type` via an **additive migration** (backfill items where `labels` contains `kind:roadmap` into `type: epic`, keep the label as an alias during transition). Low-risk because the behavior already works fleet-side.
|
||||
- **Rationale:** avoids a speculative 9-product platform change (UI/filters/stats/tests) before the orchestration model is validated; if the model is wrong, only fleet code is refactored, not a platform enum every product depends on.
|
||||
|
||||
### 24.7 Phasing & gates
|
||||
- **P1–P2:** leaf-only (no composite); `kind` defaults to `leaf`.
|
||||
- **P3:** composite scheduling + rollup + DAG view in the control plane, with **manual decomposition** (a human/author defines the child set).
|
||||
- **P3→P5:** the **auto-decomposition planner agent** (itself a `composite` job run by the `planner` profile) — start manual, automate once trustworthy.
|
||||
- **Acceptance:** a roadmap with N child jobs fans out across ≥2 factories, respects `deps` + phase gates, rolls up status/budget correctly, and a re-run skips shipped children; tracker shows the parent moving `in_progress → done` via the one-way echo.
|
||||
- **Verify gate:** composite-orchestration tests — DAG expansion, rollup status/budget, phase-gate blocking, idempotent re-run; control-plane e2e for the roadmap DAG view.
|
||||
|
||||
---
|
||||
|
||||
## 25. Durability, crash recovery & work preservation
|
||||
|
||||
**Goal:** a machine power-off, daemon/agent crash, or network partition **never loses the job, its instructions, or in-progress work**, and never corrupts state. Recovery is automatic and idempotent.
|
||||
|
||||
### 25.1 Instructions are durable (markdown in Cosmos)
|
||||
- [ ] The **full job instruction body is persisted verbatim as markdown** in `fleet_jobs.bodyMd` (§13), alongside the structured manifest. The originating tracker `Item.description` also retains the human instruction text; the two are linked by `tracker-item`, never duplicated as competing truth (§24.5).
|
||||
- [ ] A factory only ever holds a **transient materialized copy** (temp prompt file) fetched from the API — losing the factory loses nothing. On the offline edge, the `.md` file on disk is the durable copy and reconciles on reconnect (§9).
|
||||
|
||||
### 25.2 Work-in-progress is preserved (checkpointing)
|
||||
- [x] For a git-repo `cwd`, the worker commits **WIP to a dedicated branch `aq/wip/<jobId>`** at start and on every exit path (success, failure, timeout, signal) — partial work is never lost to a crash. Never commits to `main`/protected branches (§12 push policy). *(P1-S3: `_wip_start`/`_wip_checkpoint` + EXIT/INT/TERM trap; non-git cwd skipped.)*
|
||||
- [ ] `fleet_jobs.checkpoint` records the WIP branch + last commit so any worker can find it. *(P2 Cosmos; single-host records `wip_branch`/`wip_base`/`wip_commit` in `<job>.meta`.)*
|
||||
- [x] Long agents checkpoint periodically where the engine supports it; otherwise the start/exit commits bound the loss window. *(P1-S3: start + every-exit-path commits bound the loss window.)*
|
||||
|
||||
### 25.3 Recovery is automatic, resumable, and fenced
|
||||
- [x] **Orphan detection:** on coordinator/runner startup (and continuously), a job in `building/assigned` whose worker is dead (no live lease / dead pid) is an **orphan**; it is recovered, not stranded. *(P1-S3: `recover_orphans` on `run` startup + each loop, and `agent-queue.sh recover`; dead-pid + `pidstart` reuse guard.)*
|
||||
- [x] **Resume vs restart:** recovery starts a **new `fleet_runs` attempt**; if `aq/wip/<jobId>` exists, the new worker **resumes from the checkpoint** instead of restarting from zero. *(P1-S3: relaunch checks out `aq/wip/<job>`; `attempts` incremented.)*
|
||||
- [ ] **Fencing (§4):** the reclaimed run gets a higher `leaseEpoch`; the dead/zombie worker's late commits/ship reports are rejected — no double-execution of *visible* outcomes. *(P2 — distributed leasing; out of single-host scope.)*
|
||||
- [x] **Retry policy** (`retry.max/backoff/on`): agent `rc≠0` / `timeout` / `verify_failed` requeue with backoff up to `max`; on exhaustion → `dead_letter` (P2) / `failed` (P1 stand-in) with full diagnostics — never silently dropped. *(P1-S3 single-host.)*
|
||||
- [x] **State integrity:** all run state is **append-only / optimistic-concurrency guarded** (§13); recovery is idempotent (running it twice yields one recovery). *(P1-S3 single-host: meta is append-only + re-derivable from folder location; `_etag` guard is P2.)*
|
||||
|
||||
### 25.4 Crash taxonomy (all handled)
|
||||
| Failure | Detection | Recovery |
|
||||
| ------- | --------- | -------- |
|
||||
| Agent process crash (`rc≠0`) | exit code | retry policy → requeue or `failed`/`dead_letter` |
|
||||
| Daemon/runner crash | lease not renewed | reaper reclaims → resume from checkpoint |
|
||||
| Machine power-off / partition | missed heartbeats + lease expiry | reaper + fencing + WIP resume elsewhere |
|
||||
| Coordinator restart | state in Cosmos | leases survive; in-flight reconciled on boot |
|
||||
|
||||
- **Acceptance:** SIGKILL an agent and power off a factory mid-run → another worker **resumes from the last checkpoint (not from zero)** and ships; instructions intact (read back from Cosmos `bodyMd`); **zero duplicate commits/merges**; a retry-exhausted job lands in `dead_letter`/`failed` with diagnostics.
|
||||
- **Verify gate:** chaos tests — kill agent, kill runner, simulate partition; assert resume-from-checkpoint, fencing rejection of the stale worker, instruction integrity, and no double-merge.
|
||||
|
||||
---
|
||||
|
||||
## 26. Execution insights & token accounting
|
||||
|
||||
**Goal:** per-job/run visibility into **token usage, cost, model, latency, and tool activity** — to drive budgets (§5/§12), cost burndown (§17), and learned routing (§14 P5).
|
||||
|
||||
- [x] **Per-run telemetry record** (in `fleet_runs`, streamed as `fleet_events`): engine, model, **tokensIn/Out (+cached)**, **cost USD** (`estimated:true` when not provider-reported), wall + CPU time, **turn count, tool-call counts**, verify pass/fail, **filesChanged, linesAdded/Deleted**, attempt number, retries. *(P1-S3 single-host: recorded in `<job>.meta` — `duration_s`, `files_changed`/`lines_added`/`lines_deleted`, tokens/cost/turns/tool_calls, `attempts`; CPU time not captured.)*
|
||||
- [x] **Token source (honest feasibility):** capture real usage where the engine/provider exposes it (Claude/Codex/OpenAI usage in responses; Devin session metrics); otherwise **estimate** from log heuristics and mark `estimated` — same caveat as `budget.usd/tokens` (§5). A single `parse_usage(engine, log)` adapter centralizes per-engine extraction. *(P1-S3: `parse_usage` adapter; generic `AQ_USAGE` line + Claude/Codex heuristics; Devin/Copilot TODO; `usage_estimated` flag, never fabricated.)*
|
||||
- [ ] **Aggregation/rollups:** per job, roadmap (§24), product, factory, engine, profile, and day. Powers cost burndown (§17) and the learned-routing eval (§14). *(P1-S3 partial: `aq insights` does per-job + per-engine rollup; product/factory/profile/day are P2/P3.)*
|
||||
- [ ] **Surfacing:** control-plane panels (tokens, cost, success/first-pass/human-edit rates) + a CLI insights summary at the edge; reuse the platform-service telemetry module where present. *(P1-S3 partial: edge CLI `aq insights` + `status`/`dash` insights line done; web control-plane panels are P3.)*
|
||||
- [x] **Privacy:** telemetry carries metrics + pointers only — **never prompt content or secrets** (redaction §12). *(P1-S3: insights/meta record only metrics; no prompt body or secrets added.)*
|
||||
- **Acceptance:** after a run, its `fleet_runs` carries token/cost/duration/tool/diff metrics (real where metered, flagged `estimated` otherwise); dashboards show per-engine and per-profile cost + token totals; a budget breach is detectable from telemetry alone.
|
||||
- **Verify gate:** telemetry unit tests (capture + rollup); a metered-engine run records real tokens; an unmetered run records estimated + flagged; aggregation totals verified.
|
||||
|
||||
---
|
||||
|
||||
*This document is the single source of truth for the gigafactory build. Keep the §0 table and per-phase checkboxes updated; a phase ships only when its Exit criteria and the §16 Definition-of-Done rubric are fully green.*
|
||||
|
||||
@ -1,451 +0,0 @@
|
||||
# Agent Gigafactory — System Overview (current picture)
|
||||
|
||||
> Companion to `GIGAFACTORY_ROADMAP.md` (the source-of-truth spec & checklists).
|
||||
> This document describes **what is actually built today**, how the pieces fit
|
||||
> together, the architecture diagrams, the code map across both repos, the next
|
||||
> steps, and the known bugs/gaps. Last reviewed: **2026-05-31**.
|
||||
>
|
||||
> The **Phase-4 plan + the as-built M0 RU gate** live in
|
||||
> [`FLEET_DISPATCH_REDESIGN.md`](FLEET_DISPATCH_REDESIGN.md) — read it for the
|
||||
> broker-backed dispatch design and the migration checklist.
|
||||
|
||||
---
|
||||
|
||||
## 1. What it is (in one paragraph)
|
||||
|
||||
The **Agent Gigafactory** turns a single-host "folder queue" agent runner into a
|
||||
**distributed fleet** of agent "factories" (machines: mac/ubuntu/windows) that
|
||||
claim and execute coding jobs in parallel, coordinated by a durable,
|
||||
product-agnostic service. A job is a markdown manifest (persona + capabilities +
|
||||
budget + deps); the **coordinator** assigns each job to the best-fit factory via a
|
||||
deterministic scoring router, guarantees **exactly-once assignment** through
|
||||
optimistic-concurrency claims + **leases with epoch fencing**, recovers crashed
|
||||
work automatically (reaper + WIP checkpoints), enforces **per-product budgets**,
|
||||
supports **DAG decomposition** (composite → child jobs), and exposes the whole
|
||||
fleet through **two control planes**: a browser UI (`tracker-web`) and a terminal
|
||||
TUI (`agent-queue` dashboard). Both control planes talk to the same `/fleet` REST
|
||||
API.
|
||||
|
||||
---
|
||||
|
||||
## 2. Completion snapshot (reality, not the stale table)
|
||||
|
||||
| Phase | Theme | Real status | Notes |
|
||||
| ----- | ----- | ----------- | ----- |
|
||||
| **0** | Single-host baseline | ✅ 100% | `agent-queue.sh` folder queue, selftest green |
|
||||
| **1** | Manifest + profiles + capabilities + tracker adapter | ✅ ~98% | Only leftover: Node `dash` field surfacing — **now also done** via fleet-dash tags. Effectively complete |
|
||||
| **2** | Coordinator module + Cosmos + multi-factory leasing | ✅ ~98% | Scheduler wiring, enrollment+tokens, tracker-bridge are **done in code** but boxes 384/386 unticked in roadmap (see §11 Gaps) |
|
||||
| **3** | Fleet control plane (web + TUI) + DAG + budgets + scoring | ✅ 100% (all boxes ticked) | Still pending: wiring the Playwright e2e suite into CI; a live multi-host operator run |
|
||||
| **4** | Message bus + autoscaling + capability marketplace | 🟡 in progress | **M0 (RU gate) shipped** — see below. Broker (M1+) not started. Plan: [`FLEET_DISPATCH_REDESIGN.md`](FLEET_DISPATCH_REDESIGN.md) |
|
||||
| **5** | Self-optimizing / learned routing | ☐ 0% | Not started |
|
||||
|
||||
> **Phase-4 M0 (RU gate) is live (2026-05-31):** a per-product `fleet_queue_state`
|
||||
> doc holds a monotonic `version` (bumped on job create + every stage change);
|
||||
> factories with `AQ_FLEET_GATE=1` point-read `GET /fleet/queue-state` (~1 RU) and
|
||||
> skip the expensive claim while nothing changed — cutting idle Cosmos RU without
|
||||
> raising the local poll interval. Default OFF; the live fleet runs with it enabled.
|
||||
|
||||
---
|
||||
|
||||
## 3. System architecture
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph CP["Control planes (operators)"]
|
||||
WEB["tracker-web Fleet UI<br/>(Next.js, /dashboard/fleet/*)"]
|
||||
TUI["agent-queue TUI<br/>(dashboard.mjs, AQ_FLEET_DASH=1)"]
|
||||
end
|
||||
|
||||
subgraph SVC["platform-service — fleet module (the spine)"]
|
||||
ROUTES["routes.ts<br/>/fleet REST + SSE"]
|
||||
COORD["coordinator.ts<br/>claim · lease · fence · reaper<br/>preemption · budgets · DAG · review"]
|
||||
SCHED["scheduler.ts<br/>pure scoring router (§7)"]
|
||||
ENROLL["enrollment.ts<br/>factory tokens (scoped, rotatable)"]
|
||||
BRIDGE["tracker-bridge.ts<br/>job ↔ tracker item"]
|
||||
ARTIF["artifacts.ts / artifacts-blob.ts<br/>pointer + blob bytes"]
|
||||
REPO["repository.ts<br/>CAS (rev/_etag) CRUD"]
|
||||
end
|
||||
|
||||
subgraph DATA["@bytelyst/datastore (Cosmos / memory)"]
|
||||
JOBS[("fleet_jobs")]
|
||||
RUNS[("fleet_runs")]
|
||||
LEASES[("fleet_leases")]
|
||||
FAC[("fleet_factories")]
|
||||
PROF[("fleet_profiles")]
|
||||
EVENTS[("fleet_events")]
|
||||
ARTDOCS[("fleet_artifacts")]
|
||||
end
|
||||
|
||||
subgraph FLEET["Factory agents (workers, N hosts)"]
|
||||
F1["agent-queue.sh + lib/fleet-client.sh<br/>(AQ_FLEET=1) — mac-1"]
|
||||
F2["agent-queue.sh + lib/fleet-client.sh<br/>ubuntu-1"]
|
||||
ENGINES["engines: claude · codex · devin"]
|
||||
end
|
||||
|
||||
WEB -->|/api/fleet proxy| ROUTES
|
||||
TUI -->|lib/fleet-dash.mjs| ROUTES
|
||||
ROUTES --> COORD
|
||||
COORD --> SCHED
|
||||
ROUTES --> ENROLL
|
||||
ROUTES --> BRIDGE
|
||||
ROUTES --> ARTIF
|
||||
COORD --> REPO
|
||||
ENROLL --> REPO
|
||||
BRIDGE --> REPO
|
||||
ARTIF --> ARTDOCS
|
||||
REPO --> JOBS & RUNS & LEASES & FAC & PROF & EVENTS
|
||||
|
||||
F1 -->|heartbeat · claim · patch fenced · renew| ROUTES
|
||||
F2 -->|heartbeat · claim · patch fenced · renew| ROUTES
|
||||
F1 --> ENGINES
|
||||
F2 --> ENGINES
|
||||
```
|
||||
|
||||
**Layering principle:** `scheduler.ts` is **pure** (no I/O — all inputs passed
|
||||
in), `coordinator.ts` is the orchestration core, `repository.ts` is the only thing
|
||||
that touches the datastore, and `routes.ts` is the only thing that touches HTTP.
|
||||
Factories never touch the DB directly — they only call REST.
|
||||
|
||||
---
|
||||
|
||||
## 4. Job lifecycle (stages)
|
||||
|
||||
```mermaid
|
||||
stateDiagram-v2
|
||||
[*] --> queued: submitJob
|
||||
queued --> blocked: unmet deps
|
||||
blocked --> queued: deps satisfied (reaper/unblock)
|
||||
queued --> assigned: claimNextJob (CAS win + lease)
|
||||
assigned --> building: factory starts (patch fenced)
|
||||
building --> review: rc=0 → review gate
|
||||
building --> testing: verify-pass (auto)
|
||||
review --> testing: approve / requestReview quorum
|
||||
testing --> shipped: ship (manual gate)
|
||||
building --> failed: verify-fail / budget_exceeded / timeout
|
||||
review --> failed: reject
|
||||
assigned --> queued: lease expired (reaper, +epoch, keep checkpoint)
|
||||
building --> queued: preempted (critical job, checkpoint + epoch bump)
|
||||
failed --> queued: requeue (operator)
|
||||
failed --> dead_letter: retries exhausted
|
||||
shipped --> [*]
|
||||
dead_letter --> [*]
|
||||
```
|
||||
|
||||
Stages (`types.ts`): `queued · blocked · assigned · building · review · testing ·
|
||||
shipped · failed · dead_letter`. The TUI/local board collapse these onto kanban
|
||||
buckets (`inbox/building/review/testing/shipped/failed`) for parity.
|
||||
|
||||
---
|
||||
|
||||
## 5. The core guarantee — atomic claim + lease fencing
|
||||
|
||||
This is the heart of "no double-assignment, ever" and "a dead worker can never
|
||||
corrupt a reassigned job."
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant FA as Factory A
|
||||
participant FB as Factory B
|
||||
participant CO as coordinator
|
||||
participant DB as fleet_jobs / fleet_leases
|
||||
|
||||
FA->>CO: POST /fleet/claim (caps)
|
||||
FB->>CO: POST /fleet/claim (caps)
|
||||
CO->>DB: selectJob() → job J (rev=5)
|
||||
CO->>DB: revUpdate J: queued→assigned IF rev==5 (CAS)
|
||||
DB-->>CO: A wins (rev→6, leaseEpoch=1)
|
||||
CO->>DB: revUpdate J IF rev==5 (B's CAS)
|
||||
DB-->>CO: conflict (B re-selects)
|
||||
CO-->>FA: assigned J (leaseEpoch=1)
|
||||
CO-->>FB: conflict → next job
|
||||
|
||||
Note over FA: A crashes mid-build
|
||||
CO->>DB: reapExpiredLeases(): lease expired → J back to queued,<br/>leaseEpoch=2, checkpoint preserved
|
||||
FB->>CO: claim → J (leaseEpoch=2)
|
||||
FA-->>CO: (zombie) PATCH J stage=shipped leaseEpoch=1
|
||||
CO-->>FA: 409 fenced (1 < 2) — rejected
|
||||
```
|
||||
|
||||
- **CAS:** `repository.revUpdateJob/revUpdateLease` write only if stored `rev`
|
||||
matches (Cosmos `_etag`/`If-Match`; memory provider re-reads `rev`).
|
||||
- **Fencing:** every worker mutation carries `leaseEpoch`; epoch `< job.leaseEpoch`
|
||||
⇒ `fenced` (409).
|
||||
- **Reaper:** `reapExpiredLeases(now)` requeues expired-lease jobs, **bumps the
|
||||
epoch**, and **keeps the `checkpoint`** (WIP git branch pointer) so work resumes
|
||||
rather than restarts. Cosmos TTL cannot do this — the reaper owns recovery.
|
||||
|
||||
---
|
||||
|
||||
## 6. Data model (Cosmos containers)
|
||||
|
||||
| Container | PK | Purpose |
|
||||
| --------- | -- | ------- |
|
||||
| `fleet_jobs` | `/productId` | durable job: `manifestSnapshot`, verbatim `bodyMd`, `stage`, `idempotencyKey`, `deps`, `depsMode`, `checkpoint`, `priority`, `rev`, `leaseEpoch`, `kind`, `parentId` |
|
||||
| `fleet_runs` | `/jobId` | one execution attempt: engine, timings, `result`, `insights` (tokens/cost/diff) |
|
||||
| `fleet_leases` | `/jobId` | single-holder lease: `holderFactoryId`, `expiresAt`, `leaseEpoch`, `status` |
|
||||
| `fleet_factories` | `/productId` | worker host: `capabilities[]`, `health`, `load`, `seatLimit`, `lastHeartbeatAt` |
|
||||
| `fleet_profiles` | `/productId` | immutable, versioned persona/capability profile snapshot |
|
||||
| `fleet_events` | `/jobId` | append-only audit stream (monotonic `seq`) — powers SSE |
|
||||
| `fleet_artifacts` | `/jobId` | **pointers** to blob-stored artifacts (no inline logs) |
|
||||
| `fleet_queue_state` | `/productId` | **Phase-4 M0 RU gate**: monotonic `version` bumped on job create + every stage change; read via `GET /fleet/queue-state` so a factory can cheaply detect "work changed" |
|
||||
|
||||
Every document carries `productId`. Containers registered in `lib/cosmos-init.ts`.
|
||||
|
||||
---
|
||||
|
||||
## 7. The scheduler / scoring router (`scheduler.ts`)
|
||||
|
||||
Pure, deterministic, fixed-weight (tunable per-product in Phase 3, learned in
|
||||
Phase 5). Filter → score → rank:
|
||||
|
||||
```
|
||||
score = w1·capabilityFit + w2·affinity + w3·(1/(1+load))
|
||||
+ w4·costFit(budget) + w5·health − w6·starvationPenalty(age)
|
||||
```
|
||||
|
||||
Default weights (`DEFAULT_WEIGHTS`): `capabilityFit 1.0 · affinity 0.5 · load 1.0
|
||||
· costFit 0.75 · health 1.0 · starvation 1.5`. Capability is a **hard filter**
|
||||
(subset check); `down` factories are filtered out, not scored; aging fully
|
||||
de-penalises after ~30 min (anti-starvation). `scoreCandidate` returns a per-term
|
||||
breakdown that powers the **explainability** panel (`GET /fleet/jobs/:id/explain`
|
||||
→ `ExplainPanel`). `selectPreemptionVictim` picks the lowest-priority running job a
|
||||
critical job may evict (under `FLEET_PREEMPTION`).
|
||||
|
||||
---
|
||||
|
||||
## 8. Subsystems at a glance
|
||||
|
||||
| Subsystem | File(s) | What it does | Flag |
|
||||
| --------- | ------- | ------------ | ---- |
|
||||
| Claim / lease / fence / reaper | `coordinator.ts` | exactly-once assignment, recovery | — |
|
||||
| Scoring router + preemption | `scheduler.ts`, `coordinator.ts` | best-fit assignment, evict low-pri for critical | `FLEET_PREEMPTION` |
|
||||
| Per-product budgets | `coordinator.ts` (`accrueSpend`, `pause/resume`) | ceiling + auto-pause kill-switch; burndown | `FLEET_BUDGETS` |
|
||||
| DAG decomposition | `coordinator.ts` (`submitChildren`, `getDagSubtree`, `maybeUnblockParent`) | composite job fans out to children; deps gate parent | — |
|
||||
| Review gate | `coordinator.ts` (`requestReview`, `submitReview`) | multi-reviewer quorum before ship | — |
|
||||
| Factory enrollment | `enrollment.ts` | scoped, rotatable, hashed tokens; auth on claim/heartbeat | — |
|
||||
| Tracker bridge | `tracker-bridge.ts` | idempotent ingest of tracker item → job; one-way status echo | — |
|
||||
| Artifacts | `artifacts.ts`, `artifacts-blob.ts` | pointer docs in Cosmos, bytes in blob (SAS) | — |
|
||||
| Live events | `routes.ts` SSE + `fleet_events` | `GET /fleet/jobs/:id/events/stream` | — |
|
||||
| Metrics / alerts | `coordinator.ts` (`fleetMetrics`) | utilization, health rollup, starvation alerts | — |
|
||||
|
||||
---
|
||||
|
||||
## 9. REST API surface (`/fleet`, under `/api`, auth + `x-product-id`)
|
||||
|
||||
```
|
||||
Jobs POST /fleet/jobs · GET /fleet/jobs · GET /fleet/jobs/:id
|
||||
PATCH /fleet/jobs/:id (fenced) · POST /fleet/jobs/:id/actions/:action
|
||||
Claim POST /fleet/claim
|
||||
Lease POST /fleet/jobs/:id/lease/renew · /lease/release
|
||||
Factories POST /fleet/factories/heartbeat · /enroll
|
||||
POST /fleet/factories/:id/token/rotate · /token/revoke
|
||||
Runs/Events GET /fleet/jobs/:id/runs · /events · /events/stream (SSE) · /explain
|
||||
Review POST /fleet/jobs/:id/review/request · /review
|
||||
Budgets GET /fleet/budgets/:productId · /burndown
|
||||
PUT /fleet/budgets/:productId · POST /pause · /resume
|
||||
DAG POST /fleet/jobs/:id/children · GET /fleet/jobs/:id/dag
|
||||
Artifacts POST /fleet/jobs/:id/artifacts · GET (list) · GET/DELETE /fleet/artifacts/:id
|
||||
Tracker POST /fleet/tracker/ingest · /fleet/tracker/echo
|
||||
Metrics GET /fleet/metrics · GET /fleet/queue-state (Phase-4 M0 RU gate)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 10. The two control planes & feature flags
|
||||
|
||||
**Browser (`tracker-web`)** — `dashboards/tracker-web/src/`:
|
||||
- `app/dashboard/fleet/page.tsx` — fleet map (factory cards, health/load/caps, metrics + alerts)
|
||||
- `app/dashboard/fleet/jobs/page.tsx` — stage-filtered job table
|
||||
- `app/dashboard/fleet/jobs/[id]/page.tsx` — job detail: SSE event timeline, runs, artifacts, **DAG view**, **ExplainPanel**, **ReviewGateCard**, ship/requeue/reject
|
||||
- `app/dashboard/fleet/budget/page.tsx` — burndown chart + pause/resume kill-switch
|
||||
- `lib/fleet-client.ts` — typed client; `subscribeJobEvents` (fetch-based SSE w/ auth + `Last-Event-ID` resume + poll fallback); graceful 404 → null
|
||||
- `app/api/fleet/[...path]/route.ts` — proxy to platform-service
|
||||
|
||||
**Terminal (`agent-queue`)** — `learning_ai_devops_tools/agent-queue/`:
|
||||
- `dashboard.mjs` (`AQ_FLEET_DASH=1`) → `lib/fleet-dash.mjs` adapter: board counts, factories (per-factory rows or metrics aggregate), alerts, running, actionable JOBS w/ tags, recent, per-job events log; ship/requeue/reject via `/fleet`. Local folder-queue mode byte-for-byte unchanged when the flag is off.
|
||||
|
||||
**Feature flags**
|
||||
|
||||
| Flag | Where | Effect |
|
||||
| ---- | ----- | ------ |
|
||||
| `FLEET_PREEMPTION` | platform-service | enable critical-job preemption + seat limits |
|
||||
| `FLEET_BUDGETS` | platform-service | enable budget enforcement + auto-pause |
|
||||
| `AQ_FLEET` | factory runner | runner becomes a coordinator factory (claim/report) |
|
||||
| `AQ_FLEET_ROUTE` / `AQ_FLEET_SHADOW` | factory runner | route via service / side-effect-free shadow compare |
|
||||
| `AQ_FLEET_DASH` | TUI | dashboard sources board from `/fleet` API |
|
||||
| `AQ_FLEET_API` / `AQ_FLEET_TOKEN` / `AQ_PRODUCT_ID` | both | base URL / bearer / `x-product-id` |
|
||||
|
||||
All flags default **off** → the system then behaves byte-for-byte like the prior single-host tool.
|
||||
|
||||
---
|
||||
|
||||
## 11. Code map (where everything lives)
|
||||
|
||||
**`learning_ai_common_plat` (the durable spine):**
|
||||
```
|
||||
services/platform-service/src/modules/fleet/
|
||||
types.ts Zod schemas + canonical model (stages, lease, budget, DAG, events)
|
||||
repository.ts per-container CRUD + revUpdate CAS, appendEvent, listChildrenByParent
|
||||
coordinator.ts submit/claim/lease/fence/reaper, preemption, budgets, DAG, review, metrics
|
||||
scheduler.ts pure scoring router + selectPreemptionVictim + scoreCandidate (explain)
|
||||
enrollment.ts factory enroll / rotate / revoke / enforceFactoryToken
|
||||
tracker-bridge.ts ingest tracker item → job; one-way status echo
|
||||
artifacts.ts artifact pointer mgmt
|
||||
artifacts-blob.ts blob upload/download/delete (SAS)
|
||||
routes.ts all /fleet REST + SSE
|
||||
*.test.ts coordinator/scheduler/repository/routes/enrollment/tracker/artifacts/types
|
||||
dashboards/tracker-web/src/
|
||||
app/dashboard/fleet/** the browser control plane (pages above)
|
||||
lib/fleet-client.ts typed client + SSE
|
||||
app/api/fleet/[...path]/route.ts proxy
|
||||
e2e/fleet.spec.ts Playwright specs
|
||||
lib/cosmos-init.ts container registration
|
||||
docs/GIGAFACTORY/gigafactory-phase3-progress.md / docs/GIGAFACTORY/FLEET_CONTROL_PLANE.md
|
||||
```
|
||||
|
||||
**`learning_ai_devops_tools` (the factory agent + TUI + spec):**
|
||||
```
|
||||
agent-queue/
|
||||
agent-queue.sh single-host runner + factory agent (AQ_FLEET); budget.wall, retry, recover
|
||||
lib/fleet-client.sh curl-only coordinator client (register/claim/report/renew, fencing-aware)
|
||||
lib/fleet-dash.mjs TUI fleet-mode adapter over /fleet (+ fleet-dash.test.mjs, 22 assertions)
|
||||
dashboard.mjs the TUI (local + fleet modes)
|
||||
profiles/*.md persona+capability catalog
|
||||
demo/two-factory-demo.sh + coordinator-stub.sh parallel-fleet demo
|
||||
selftest.sh ~75 dependency-light checks
|
||||
docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md source-of-truth spec & checklists
|
||||
docs/GIGAFACTORY/GIGAFACTORY_SYSTEM_OVERVIEW.md (this file)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 12. Test coverage (what's verified)
|
||||
|
||||
- **platform-service fleet** (~134+ tests): atomic-claim race (true concurrency, no
|
||||
double-assign), fencing rejection, reaper reclaim + checkpoint, scheduler scoring
|
||||
/ tie-breaks / starvation / preemption-victim, DAG fan-out/unblock/subtree,
|
||||
budgets + burndown + auto-pause, review-gate quorum, enrollment/token lifecycle +
|
||||
auth enforcement, tracker ingest/echo idempotency, routes (incl. SSE + explain),
|
||||
schema validation.
|
||||
- **tracker-web** (~198 tests): fleet-client unit tests + page render; SSE
|
||||
parse/resume/fallback; graceful 404 degradation.
|
||||
- **tracker-web e2e** (`e2e/fleet.spec.ts`): fleet map, live log, ship, budget-pause,
|
||||
review-gate (Playwright — needs CI wiring).
|
||||
- **agent-queue** (`selftest.sh`, ~75 checks): manifest/profiles/caps/priority/deps/
|
||||
idempotency, retry/recover/insights, tracker round-trip, `AQ_FLEET` register/claim/
|
||||
fenced-patch/reaper-reclaim/quarantine, shadow AGREE/DIVERGE, two-factory demo,
|
||||
**budget.wall enforcement**, **fleet-dash adapter (22 assertions)**.
|
||||
|
||||
---
|
||||
|
||||
## 13. Next steps
|
||||
|
||||
**Immediate (close Phase 1–3 to a clean 100%):**
|
||||
1. **Validate the Cosmos `_etag`/`If-Match` CAS path under true contention** and
|
||||
**live blob-backed `fleet_artifacts`** — the two items the roadmap marks as
|
||||
"remaining for a hard 100%" on Phase 2/3 (tests today use the memory provider +
|
||||
pointer-only artifacts).
|
||||
2. **Wire `e2e/fleet.spec.ts` into CI** (Playwright install + a `verify` job) so the
|
||||
Phase-3 exit criterion ("web verify incl. e2e green") is enforced, not just
|
||||
present.
|
||||
3. **Live multi-host operator run** end-to-end (the Phase-3 acceptance: drive the
|
||||
3-repo parallel workload from the browser, including a budget pause + resume
|
||||
against a real platform-service, not the stub).
|
||||
|
||||
**Phase 4 (scale-out) — in progress; see [`FLEET_DISPATCH_REDESIGN.md`](FLEET_DISPATCH_REDESIGN.md):**
|
||||
- ✅ **M0 (done)** — RU gate: `fleet_queue_state` + `GET /fleet/queue-state` +
|
||||
`AQ_FLEET_GATE`; factories skip the claim while the queue version is unchanged.
|
||||
4. **M1+: broker** (the redesign picks **Azure Service Bus**, not NATS/Redis, for
|
||||
subscription filters + DLQ) for push dispatch + backpressure in a
|
||||
coordinator-owns-scheduling / broker-owns-delivery hybrid (keeps the scorer).
|
||||
5. **M3: autoscaling** — scale-to-zero ephemeral factories (KEDA/Container Apps)
|
||||
keyed to subscription depth.
|
||||
6. **Capability marketplace** — route rare-capability jobs (xcode/figma/gpu) to the
|
||||
few factories that have them; cross-product queueing fairness.
|
||||
7. **Load + chaos suite** — factory churn, broker outage, thundering herd.
|
||||
|
||||
**Phase 5 (learned routing):**
|
||||
8. Capture per-run outcome features → offline eval harness (learned vs heuristic) →
|
||||
shadow/A-B with guardrails → surface recommendations ("route NomGap UX jobs to
|
||||
claude on mac-2: 23% faster").
|
||||
|
||||
---
|
||||
|
||||
## 14. Bugs, gaps & risks (be honest)
|
||||
|
||||
**Documentation status (reconciled 2026-05-31):**
|
||||
- `GIGAFACTORY_ROADMAP.md` §0 now reads Phase 0 ✅100% · 1 ✅~98% · 2 ✅~98% ·
|
||||
3 ✅100% · **4 ◐ in progress (~10%, M0 shipped)** · 5 ☐. Phase-2 boxes for the
|
||||
scheduler core and factory enrollment/scoped tokens are ticked (`scheduler.ts`
|
||||
`selectJob`/`selectPreemptionVictim` wired into `claimNextJob`; `enrollment.ts`
|
||||
`enforceFactoryToken` gating claim/heartbeat). The earlier "stale §0 table"
|
||||
warning no longer applies.
|
||||
|
||||
**Runtime / correctness gaps:**
|
||||
- **SSE is poll-fallback based, not a push-only contract.** `subscribeJobEvents`
|
||||
falls back to `getJobEvents()` polling on stream error — fine for resilience, but
|
||||
"live" can silently degrade to polling without a visible operator signal.
|
||||
- **UI pages degrade silently on some errors** (empty states / `null`), which can
|
||||
mask a real backend outage as "nothing happening."
|
||||
- **Budget page assumes `ceilingUsd` exists** when rendering the spend bar — a
|
||||
budget doc without a ceiling could render a broken/NaN bar. Guard it.
|
||||
- **Dashboard `patchJob` only sends `{stage, leaseEpoch}`** — other fenced-transition
|
||||
fields (e.g. `checkpoint`) aren't exposed in the web UI, so operator-driven
|
||||
transitions can't carry a checkpoint.
|
||||
- **`rev` CAS on the memory provider** is exact only for the sequential calls the
|
||||
coordinator/tests make (re-read `rev` before write). Real concurrency safety
|
||||
depends on Cosmos `_etag`/`If-Match` in production — verify the Cosmos path under
|
||||
true contention before relying on it at scale.
|
||||
|
||||
**TUI-specific (this repo):**
|
||||
- Fleet **utilization %** only renders in the metrics-aggregate fallback branch, not
|
||||
when per-factory rows are present — a minor inconsistency in the TUI board.
|
||||
- The **budget.wall live selftest is timing-sensitive** (races a 2s wall ceiling) and
|
||||
can flake under heavy disk/CPU load; the code is correct but the test could be made
|
||||
more robust (e.g. inject the clock).
|
||||
- TUI fleet mode has **no write path for budgets/preemption** — it's read + job
|
||||
actions only; budget pause/resume is web-only.
|
||||
|
||||
**Operational gotchas (verified on the live fleet — get these right):**
|
||||
- **Heartbeat cadence MUST be < the 90s stale threshold.** `fleetMetrics` marks a
|
||||
factory stale after `DEFAULT_STALE_FACTORY_MS = 90_000`, but the factory only
|
||||
heartbeats every `AQ_FLEET_LEASE_RENEW_SEC` (**default 300s**). Left at the
|
||||
default, a healthy factory flaps to "stale"/"no live factory" between beats. The
|
||||
fleet launcher sets `AQ_FLEET_LEASE_RENEW_SEC=30` to stay well inside the window.
|
||||
- **The tracker-web New-Job form is misconfigured:** it hardcodes factories
|
||||
`mac-1`/`mac-2` and defaults `capabilities=["build"]` — a token **no agent-queue
|
||||
factory advertises** (`detect_capabilities` emits `os:*`/`engine:*`/`node:*`/`has:*`).
|
||||
So a default UI submission is unroutable (queues forever → `queue_starvation`).
|
||||
Fix tracked in the redesign doc's routing-model section.
|
||||
- **No factory deregister API.** Only heartbeat/enroll/rotate/revoke exist, so a
|
||||
dead factory's doc lingers and shows as `stale` until pruned out-of-band
|
||||
(currently a manual Cosmos delete). A prune/deregister path is a Phase-4 item.
|
||||
|
||||
**Not-yet-built (expected, Phase 4+):**
|
||||
- **No message bus yet** — dispatch is still poll-based, but the **M0 RU gate now
|
||||
skips the claim while idle** (so idle Cosmos RU is near-flat). Broker push/
|
||||
backpressure is M1+.
|
||||
- **No autoscaling** — factory fleet is static/manually run (M3 target).
|
||||
- **No capability marketplace / cross-product fairness** under contention.
|
||||
- **No load/chaos test suite** — resilience is unit-proven, not load-proven.
|
||||
- **Artifacts blob wiring** (`fleet_artifacts` → real blob storage) should be
|
||||
validated against a live storage account (tests use memory/pointer only).
|
||||
|
||||
**Recently fixed (2026-05-31):**
|
||||
- **`run --once` could return before a backgrounded worker finished the PR/report.**
|
||||
`_meta_end` (which writes `ended=`) was called right after the `testing/` move,
|
||||
*before* PR open/merge + coordinator reports, so the slot freed early and `--once`
|
||||
could exit (and a caller could observe completion) mid-PR. Now `ended=` is written
|
||||
last; the selftest PR-mode case is deterministic again.
|
||||
|
||||
---
|
||||
|
||||
## 15. TL;DR
|
||||
|
||||
Phases 0–3 are functionally **complete and well-tested**: a durable coordinator with
|
||||
exactly-once leasing + fencing + crash recovery, a deterministic scoring router with
|
||||
preemption + explainability, per-product budgets, DAG decomposition, a multi-reviewer
|
||||
gate, factory enrollment with scoped tokens, and **two** control planes (browser +
|
||||
TUI) over one `/fleet` API. The remaining work is (a) trivial doc corrections, (b)
|
||||
CI-enforcing the existing e2e, and (c) the genuinely new Phase-4 scale-out frontier
|
||||
(broker, autoscaling, marketplace, chaos) and Phase-5 learned routing.
|
||||
@ -1,20 +0,0 @@
|
||||
# Gigafactory — Agent-Queue Docs
|
||||
|
||||
Source-of-truth specs and the system overview for **Agent Gigafactory**, the
|
||||
fleet-coordination layer that turns the single-host `agent-queue` runner into a
|
||||
multi-host factory of autonomous coding agents.
|
||||
|
||||
## Contents
|
||||
|
||||
| Doc | What it is |
|
||||
| --- | --- |
|
||||
| [`GIGAFACTORY_ROADMAP.md`](GIGAFACTORY_ROADMAP.md) | The canonical source-of-truth spec: architecture, the evolved job manifest, scoring formula, lifecycle/retry, enrollment, and the phased checklists (§1–§17). Job specs in `../jobs/` point here. |
|
||||
| [`GIGAFACTORY_SYSTEM_OVERVIEW.md`](GIGAFACTORY_SYSTEM_OVERVIEW.md) | A narrative overview of how the pieces fit together end-to-end, with a code-map of the relevant files across both repos. |
|
||||
| [`FLEET_DISPATCH_REDESIGN.md`](FLEET_DISPATCH_REDESIGN.md) | Phase-4 design proposal (no code): broker-backed (Azure Service Bus) dispatch + on-demand factories that fixes the product-as-queue routing smell and the idle-poll Cosmos RU cost. Phased migration starting with a zero-infra RU quick win. |
|
||||
|
||||
## Related docs in the other repo
|
||||
|
||||
The platform-service backend and the tracker-web UI live in
|
||||
`learning_ai_common_plat`. Its Gigafactory docs (roadmap-completion audit,
|
||||
remaining-task checklist, Phase-3 progress, and the fleet control-plane guide)
|
||||
are under `docs/GIGAFACTORY/` there.
|
||||
@ -1,79 +0,0 @@
|
||||
# Agent-Queue Run Policy
|
||||
|
||||
How the agent-queue daemon and the agents it launches must operate. Written
|
||||
after a live review found jobs running in `--yolo` (dangerous) mode directly
|
||||
against **live working trees**, which dirtied repos, produced duplicate/competing
|
||||
commits, and risked leaking secrets.
|
||||
|
||||
## Observed behavior (the problem)
|
||||
|
||||
`agent-queue.sh` launches the chosen CLI with `cwd` taken from the job
|
||||
front-matter (default `$PWD`) and, when `yolo: true` (the default), with
|
||||
full-autonomy flags:
|
||||
|
||||
| Engine | yolo flag |
|
||||
| --- | --- |
|
||||
| devin | `--permission-mode dangerous` |
|
||||
| claude | `--dangerously-skip-permissions` |
|
||||
| codex | `--dangerously-bypass-approvals-and-sandbox` |
|
||||
| (other) | `--allow-all-tools` |
|
||||
|
||||
With `cwd` pointing at a canonical checkout (e.g. `…/learning_ai_fastgap`), a
|
||||
dangerous-mode agent edits, commits, and pushes in the repo you also work in.
|
||||
|
||||
## Policy
|
||||
|
||||
1. **Isolation — never run in the canonical checkout.**
|
||||
Each job MUST run in a dedicated **git worktree** (or fresh clone) created off
|
||||
`origin/main`, not the live working directory. Set the job's `cwd` to that
|
||||
worktree. The canonical checkout must be left untouched.
|
||||
|
||||
2. **One job = one branch.**
|
||||
Create/checkout a dedicated branch (e.g. `aq/<job-id>`) off the latest
|
||||
`origin/main`. Agents push that branch and open a PR. **Never push straight to
|
||||
the shared `main`** of platform/shared repos.
|
||||
|
||||
3. **Least privilege by default.**
|
||||
Default `yolo: false`. Reserve the dangerous/`--allow-all-tools` flags for
|
||||
**disposable sandboxes only** (throwaway worktree/clone or container). Never
|
||||
run dangerous mode against a directory whose changes you care about.
|
||||
|
||||
4. **Clean-tree contract.**
|
||||
A job starts only from a clean tree and verifies the canonical checkout is
|
||||
unchanged when it finishes. If a worktree is dirty at pickup, fail fast.
|
||||
|
||||
5. **Test before ship.**
|
||||
Run typecheck + lint + the repo's test suite before committing. Make small
|
||||
commits with conventional messages. Open a PR for review instead of force-merging.
|
||||
|
||||
6. **Never track runtime/queue state.**
|
||||
The `queue/{.state,inbox,building,testing,review,failed,shipped,logs}` lifecycle
|
||||
dirs are runtime state and are git-ignored (see repo `.gitignore`). Jobs must
|
||||
not commit them.
|
||||
|
||||
7. **One writer per repo.**
|
||||
At most one job per target repo at a time (use the existing per-repo lock) so
|
||||
two agents never compete on the same working tree.
|
||||
|
||||
8. **Secrets stay out of git.**
|
||||
Jobs must not write real secrets into tracked files. Use `.env` (gitignored);
|
||||
the pre-push secret scan is a backstop, not a license.
|
||||
|
||||
## Applying this with the current runner
|
||||
|
||||
- Add a **worktree-prep step** before launch: `git -C <repo> worktree add
|
||||
<tmp>/<job-id> -b aq/<job-id> origin/main`, then set the job `cwd: <tmp>/<job-id>`.
|
||||
- Set `yolo: false` in job front-matter unless the `cwd` is a disposable
|
||||
sandbox.
|
||||
- On completion, push `aq/<job-id>` and open a PR; remove the worktree
|
||||
(`git worktree remove`) once merged.
|
||||
|
||||
## Pre-flight checklist (per job)
|
||||
|
||||
- [ ] `cwd` is a dedicated worktree/clone, not a canonical checkout
|
||||
- [ ] dedicated branch off latest `origin/main`
|
||||
- [ ] `yolo: false` unless sandboxed/disposable
|
||||
- [ ] starts from a clean tree
|
||||
- [ ] tests/lint/typecheck run before commit
|
||||
- [ ] pushes a branch + PR (no direct shared-`main` pushes)
|
||||
- [ ] no runtime/queue state or secrets committed
|
||||
@ -1,86 +0,0 @@
|
||||
---
|
||||
engine: devin
|
||||
cwd: /Users/sd9235/code/mygh/learning_ai_common_plat
|
||||
yolo: true
|
||||
lock: common-plat-dependabot
|
||||
timeout: 4h
|
||||
---
|
||||
|
||||
ROLE: Senior platform engineer. TRIAGE the open Dependabot dependency-update PRs in
|
||||
`learning_ai_common_plat`, verify each one builds + tests green against CURRENT main,
|
||||
and MERGE only the safe ones. This is a maintenance sweep — be conservative: a green
|
||||
verify gate is the bar for merging; anything that fails, conflicts, or is a risky major
|
||||
bump gets left open with a clear note. NEVER weaken or skip a test to make a PR pass.
|
||||
|
||||
PARALLEL-SAFETY: Other Devins may be running in this repo and in learning_ai_devops_tools
|
||||
on gigafactory `fleet` work. You touch ONLY dependency manifests, the lockfile, and workflow
|
||||
files, exactly as Dependabot already changed them — do NOT edit application source. If a Dependabot branch conflicts
|
||||
with main on anything other than package.json / pnpm-lock.yaml, SKIP it (leave open, note
|
||||
why) rather than hand-resolving source conflicts.
|
||||
|
||||
THE BRANCHES (each is one open PR, ahead of main by ~1 commit):
|
||||
- dependabot/npm_and_yarn/azure/cosmos-4.9.2
|
||||
- dependabot/npm_and_yarn/fastify/cors-11.2.0
|
||||
- dependabot/npm_and_yarn/happy-dom-20.8.4
|
||||
- dependabot/npm_and_yarn/jose-6.2.2
|
||||
- dependabot/npm_and_yarn/lint-staged-16.4.0
|
||||
- dependabot/npm_and_yarn/multi-6d7db9f379 (a grouped multi-package bump)
|
||||
- dependabot/npm_and_yarn/react-dom-19.2.4
|
||||
- dependabot/npm_and_yarn/stripe-20.4.1
|
||||
- dependabot/npm_and_yarn/types/node-25.5.0
|
||||
- dependabot/npm_and_yarn/typescript-eslint/parser-8.57.1
|
||||
- dependabot/github_actions/actions/checkout-6
|
||||
- dependabot/github_actions/actions/setup-node-6
|
||||
- dependabot/github_actions/actions/setup-python-6
|
||||
(Re-list with `git branch -r | grep dependabot` in case the set changed.)
|
||||
|
||||
PER-PR PROCEDURE (do each in an ISOLATED worktree off CURRENT origin/main so the main
|
||||
checkout + other Devins are never disturbed):
|
||||
1. `git fetch origin --prune`; create a temp worktree at origin/main; merge the dependabot
|
||||
branch into it (`--no-commit --no-ff`).
|
||||
- If the merge touches ANY file other than package.json / pnpm-lock.yaml /
|
||||
.github/workflows/* -> ABORT, classify SKIP (unexpected scope), note it.
|
||||
- If it conflicts -> ABORT, classify SKIP (conflicts main), note it.
|
||||
2. Identify the bump TYPE from the version delta (semver): patch / minor / major.
|
||||
3. Run the VERIFY GATE in the merged worktree:
|
||||
- `pnpm install --frozen-lockfile` (must succeed with the bumped lockfile)
|
||||
- `pnpm build`
|
||||
- `pnpm test`
|
||||
- For react-dom: also run the dashboards' web tests if they have their own suite.
|
||||
- GitHub-actions bumps (checkout/setup-node/setup-python): no pnpm gate; just confirm
|
||||
the workflow YAML still parses and the action major is supported by our runners.
|
||||
4. CLASSIFY:
|
||||
- MERGE if: scope is only manifests/lockfile/workflow, no conflicts, verify gate fully
|
||||
green. (Patch/minor with green gate = merge. A MAJOR bump may merge ONLY if the gate
|
||||
is green AND nothing in our code uses a removed/changed API — if unsure, HOLD.)
|
||||
- HOLD (leave open) if: gate fails, major bump with any ambiguity, or behavioral risk
|
||||
(e.g. stripe / jose / react-dom majors that need a human eye).
|
||||
- SKIP if: conflicts main or touches unexpected files.
|
||||
5. To MERGE: merge the branch into main with `--no-ff` (first parent = main), message
|
||||
`chore(deps): <package> <old> -> <new> (#<pr>)`, push origin HEAD:main, then delete the
|
||||
dependabot branch. Re-fetch main before the NEXT PR so each builds on the latest (avoids
|
||||
lockfile churn between merges). Do the LOW-RISK ones first (types/node, lint-staged,
|
||||
happy-dom, the actions bumps), majors last.
|
||||
|
||||
CONSTRAINTS: no app-source edits; never modify/skip tests; ESM repo conventions; conventional
|
||||
commits (chore(deps): ...); do not touch the gigafactory `fleet` modules; do not delete
|
||||
backup/* branches; leave the gigafactory + hermes branches alone. Stay entirely in isolated
|
||||
worktrees; clean every worktree up afterward (`git worktree remove --force` + `prune`).
|
||||
|
||||
VERIFY GATE (per merged PR, must be green to merge):
|
||||
- pnpm install --frozen-lockfile && pnpm build && pnpm test (no regression)
|
||||
|
||||
FINAL OUTPUT — report in EXACTLY this format:
|
||||
## Dependency Triage Report — common-plat Dependabot
|
||||
### Summary table
|
||||
| PR / package | old -> new | bump | verify gate | decision |
|
||||
(one row per branch: MERGE / HOLD / SKIP)
|
||||
### Merged (pushed to main)
|
||||
- <package> <old->new> (#pr) — commit <sha>
|
||||
### Held open (with reason)
|
||||
- <package> — <why: failing gate / major risk / needs human>
|
||||
### Skipped (with reason)
|
||||
- <package> — <conflicts main / unexpected scope>
|
||||
### Verify gate results (build/test summary per merged PR)
|
||||
### Branches deleted
|
||||
### Anything that needs a human decision
|
||||
@ -1,101 +0,0 @@
|
||||
---
|
||||
engine: devin
|
||||
cwd: /Users/sd9235/code/mygh/learning_ai_devops_tools
|
||||
yolo: true
|
||||
lock: devops-tools
|
||||
timeout: 3h
|
||||
---
|
||||
|
||||
ROLE: Senior engineer. Implement Phase 1 — Slice 1 of the Agent Gigafactory roadmap.
|
||||
|
||||
SOURCE OF TRUTH: agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md (read §4, §5, §6, §7, §14 Phase 1
|
||||
first). This slice implements ONLY the items listed below.
|
||||
|
||||
STRICT SCOPE:
|
||||
- Edit ONLY files under agent-queue/ (primarily agent-queue.sh, selftest.sh, README.md,
|
||||
docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md). DO NOT touch any other repo.
|
||||
- DO NOT modify, move, or delete anything under agent-queue/queue/ — there are LIVE jobs
|
||||
running there. DO NOT run `agent-queue.sh run`. selftest.sh uses its own temp queue
|
||||
(AGENT_QUEUE_ROOT) — that is the only execution allowed.
|
||||
- This is bash (single host). No service/Cosmos/Zod work in this slice (that is Phase 2).
|
||||
|
||||
DELIVERABLES (in agent-queue.sh, backward-compatible — legacy .md files with only
|
||||
engine/cwd/yolo MUST behave exactly as today):
|
||||
|
||||
1. MANIFEST PARSING: recognize these new frontmatter keys with safe defaults via the existing
|
||||
fm_get pattern: profile, engine-class, capabilities, prefers-engine, priority, budget, deps,
|
||||
deps-mode, idempotency-key, retry, review-policy, artifacts, tracker-item. In THIS slice
|
||||
only items 2–5 are functional; the rest must be parsed + stored in the job .meta and shown
|
||||
in `status`, but otherwise inert (document as "reserved, no-op until later phase").
|
||||
|
||||
2. PRIORITY ORDERING: replace pure-FIFO inbox selection with priority-then-age.
|
||||
priority in {critical,high,medium,low} (default medium). Higher priority picked first; ties
|
||||
broken by oldest timestamp. Must not break per-lock serialization.
|
||||
|
||||
3. CAPABILITY GRAMMAR + MATCH (single-host):
|
||||
- detect_capabilities(): emit tokens for this host — os:<mac|linux>,
|
||||
engine:<devin|claude|codex|copilot present>, node>=<major>, has:<tool> for a small probe
|
||||
set (git, pnpm, docker if present).
|
||||
- caps_match(required[], available[]) honoring §5 grammar: bare `key` = presence;
|
||||
`key:value` exact; `key<op>version` with op in {>=,>,=,<=,<} (numeric/semver-major
|
||||
compare); `os:any` = wildcard match-all. A job matches iff EVERY required token is satisfied.
|
||||
- At run time, if a job declares `capabilities` the host does not satisfy, move the job to
|
||||
failed/ with result=capability_mismatch and a clear log line (do NOT launch the agent).
|
||||
|
||||
4. ENGINE-CLASS RESOLUTION: if `engine` is unset but `engine-class` is set, pick a concrete
|
||||
engine from a documented class map honoring `prefers-engine` then availability:
|
||||
agentic-coder -> [devin, claude, codex]; chat-coder -> [copilot]. Explicit `engine` always
|
||||
wins. If neither yields an available binary, fail the job with result=no_engine.
|
||||
|
||||
5. IDEMPOTENCY-KEY DEDUPE (on `add`): compute a content hash of the stripped body. If an
|
||||
existing job in ANY stage (inbox/building/review/testing/shipped) has the same
|
||||
idempotency-key AND same hash -> no-op (log "duplicate, skipped"). Same key + DIFFERENT hash
|
||||
-> reject with a clear error UNLESS the prior job is still in inbox/ (then replace it).
|
||||
|
||||
TESTS (selftest.sh — tests are sacred; only ADD, never weaken existing ones). Add cases:
|
||||
- backward-compat: a legacy engine/cwd/yolo-only .md still completes and lands in review/.
|
||||
- priority: with max=1, a `critical` job queued after a `low` job runs first.
|
||||
- capability mismatch: a job requiring `has:definitely-not-installed` -> failed/
|
||||
result=capability_mismatch (agent never launched; use the existing no-op engine stub).
|
||||
- engine-class: a job with `engine-class: agentic-coder` and no `engine`, DEVIN_BIN stubbed,
|
||||
runs and lands in review/.
|
||||
- idempotency: adding the same key+body twice yields exactly one inbox file; same key +
|
||||
different body is rejected.
|
||||
|
||||
DOCS:
|
||||
- README.md frontmatter table: add the new fields, clearly marking ACTIVE (Phase 1) vs RESERVED.
|
||||
- docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md: tick ONLY the Phase 1 checklist boxes you fully completed and
|
||||
update the §0 progress % for Phase 1 (do not tick incomplete items).
|
||||
|
||||
CONSTRAINTS:
|
||||
- bash style consistent with the existing script; no new runtime dependencies; macOS + Linux
|
||||
safe (no GNU-only flags without fallback). No emojis in code. No leftover debug echo noise.
|
||||
- Work on a NEW branch: feat/gigafactory-p1-slice1. Commit in logical steps with conventional
|
||||
commit messages. Push the branch and open a PR. DO NOT merge to main (human gate).
|
||||
|
||||
VERIFY GATE (must pass before finishing):
|
||||
- bash agent-queue/selftest.sh -> MUST be fully green (existing + new cases).
|
||||
- bash -n agent-queue/agent-queue.sh and node --check agent-queue/dashboard.mjs.
|
||||
|
||||
FINAL OUTPUT — print an implementation report in EXACTLY this format:
|
||||
|
||||
## Implementation Report — Phase 1 Slice 1
|
||||
### Branch & commits
|
||||
- branch: <name>
|
||||
- commits: <sha> <message> (one per line)
|
||||
- PR: <url or "opened, not merged">
|
||||
### Files changed
|
||||
- <path>: <one-line summary>
|
||||
### What was implemented (per deliverable 1-5)
|
||||
- <deliverable>: <how, key functions added/changed>
|
||||
### Tests added
|
||||
- <test name>: <what it asserts> (plus selftest.sh PASS/FAIL summary)
|
||||
### Verify gate results
|
||||
- selftest.sh: <PASS/FAIL + counts>
|
||||
- bash -n / node --check: <result>
|
||||
### Deviations / assumptions
|
||||
- <anything changed from spec and why>
|
||||
### Reserved (parsed-but-inert) fields
|
||||
- <list fields parsed but no-op this slice>
|
||||
### Suggested next slice
|
||||
- <what should come next>
|
||||
@ -1,109 +0,0 @@
|
||||
---
|
||||
engine: devin
|
||||
cwd: /Users/sd9235/code/mygh/learning_ai_devops_tools
|
||||
yolo: true
|
||||
lock: devops-tools
|
||||
timeout: 3h
|
||||
---
|
||||
|
||||
ROLE: Senior engineer. Implement Phase 1 — Slice 2 (Profiles + deps/DAG, single host).
|
||||
|
||||
SOURCE OF TRUTH: agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md (read §5 deps, §6 profiles,
|
||||
§14 Phase 1). This slice implements ONLY the items below.
|
||||
|
||||
PREREQUISITE / BRANCHING:
|
||||
- Slice 1 (manifest/priority/capabilities/engine-class/idempotency) AND Slice 3
|
||||
(resilience: orphan recovery, WIP checkpoint/resume, retry, insights) are BOTH
|
||||
already merged into `main`. Branch off the CURRENT `main`.
|
||||
- Do NOT duplicate, revert, or break any Slice 1 or Slice 3 code or tests — the
|
||||
existing selftest cases (34 checks) MUST stay green (regression).
|
||||
- New branch: feat/gigafactory-p1-slice2. Push + open a PR. DO NOT merge.
|
||||
|
||||
STRICT SCOPE:
|
||||
- Edit ONLY under agent-queue/ (agent-queue.sh, selftest.sh, README.md, new
|
||||
profiles/ dir, docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md). No other repo.
|
||||
- DO NOT modify/delete anything under agent-queue/queue/ (live jobs). DO NOT run
|
||||
`agent-queue.sh run`. selftest.sh uses its own temp AGENT_QUEUE_ROOT only.
|
||||
- bash, single host. No service/Cosmos work (that is Phase 2).
|
||||
|
||||
A. PROFILES (§6)
|
||||
1. Create agent-queue/profiles/ with a starter catalog as profiles/<name>.md:
|
||||
developer, backend-engineer, frontend-engineer, ux-designer, ui-designer, qa,
|
||||
reviewer, docs-writer (and a reserved `planner`). Each has frontmatter:
|
||||
name, persona (multi-line block), capabilities, default-verify, engine-class,
|
||||
prefers-engine, allowed-scope, review-policy.
|
||||
2. Profile resolution: when a job sets `profile: X`, inherit any of
|
||||
{verify, capabilities, engine-class, prefers-engine, allowed-scope,
|
||||
review-policy} that the job OMITS. Job-level fields ALWAYS override the profile.
|
||||
Resolution runs BEFORE the capability gate + engine resolution so inherited
|
||||
caps/engine-class take effect.
|
||||
3. Persona injection: prepend the profile's persona to the job body in the
|
||||
stripped body file fed to the engine. Never write secrets to logs.
|
||||
4. allowed-scope guardrail — WARN-ONLY this phase: after the run, if cwd is a git
|
||||
repo, compute changed paths and log a WARNING for any path outside the
|
||||
allowed-scope globs. Non-blocking (do NOT fail the job). Expose the scope-check
|
||||
as a unit-testable function.
|
||||
5. Document the resolution precedence (job > profile > built-in default).
|
||||
|
||||
B. DEPS / DAG — single host (§5)
|
||||
6. deps reference other jobs by `idempotency-key` (stable, author-controlled). A
|
||||
dep is satisfied when a job with that key is in shipped/ (default), or in
|
||||
shipped/ OR testing/ when the dependent job sets `deps-mode: soft`.
|
||||
7. A job with unmet deps is BLOCKED: not selected to run, surfaced in `status` as
|
||||
"blocked (waiting on <keys>)". Implement as a skip-with-reason in inbox
|
||||
selection (like the busy-lock skip) — do NOT launch, do NOT move to failed.
|
||||
Re-evaluated every run loop; becomes runnable once deps are satisfied.
|
||||
8. Submit-time cycle detection on `add`: build the dep graph from idempotency-keys
|
||||
across inbox + active stages; reject (die, nonzero) if the new job would create
|
||||
a cycle.
|
||||
9. No cross-machine deps (that is P2).
|
||||
|
||||
TESTS (selftest.sh — tests are sacred; only ADD):
|
||||
- profile inherit verify: a profile whose default-verify is `false` → a job using
|
||||
it (no own verify) routes to failed/; a profile with default-verify `true` →
|
||||
testing/.
|
||||
- persona injection (golden): the body fed to the engine begins with the profile
|
||||
persona (capture via a stub that copies its --prompt-file to a sentinel).
|
||||
- profile caps inheritance: job omitting capabilities inherits the profile's →
|
||||
unmet → failed/ result=capability_mismatch.
|
||||
- allowed-scope warn: an out-of-scope change logs a WARNING and the job still
|
||||
succeeds (or assert the scope-check function directly).
|
||||
- deps block→run: job B deps:[keyA] stays blocked while A is unshipped; once A is
|
||||
in shipped/, B becomes runnable and completes.
|
||||
- deps-mode soft: dep satisfied when the dependency is in testing/.
|
||||
- cycle detection: adding A deps:[keyB] while B deps:[keyA] is rejected.
|
||||
|
||||
DOCS:
|
||||
- README: profiles section (catalog + resolution precedence) + deps/blocked
|
||||
semantics.
|
||||
- docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md: tick the §6 boxes you fully completed and the §5
|
||||
`deps` box; bump §0 Phase 1 %.
|
||||
|
||||
CONSTRAINTS: bash style consistent with the existing script; no new runtime deps;
|
||||
macOS + Linux safe; no emojis in code; no leftover debug noise; conventional
|
||||
commits.
|
||||
|
||||
VERIFY GATE (must pass):
|
||||
- bash agent-queue/selftest.sh → fully green (existing + new).
|
||||
- bash -n agent-queue/agent-queue.sh ; node --check agent-queue/dashboard.mjs.
|
||||
|
||||
FINAL OUTPUT — print the implementation report in EXACTLY this format:
|
||||
|
||||
## Implementation Report — Phase 1 Slice 2
|
||||
### Branch & commits
|
||||
- branch / based-on: <name> (based on main | feat/gigafactory-p1-slice1)
|
||||
- commits: <sha> <message> (one per line)
|
||||
- PR: <url or "opened, not merged">
|
||||
### Files changed
|
||||
- <path>: <one-line summary>
|
||||
### What was implemented (A1-5, B6-9)
|
||||
- <item>: <how, key functions added/changed>
|
||||
### Tests added
|
||||
- <test name>: <what it asserts> (plus selftest.sh PASS/FAIL summary)
|
||||
### Verify gate results
|
||||
- selftest.sh: <PASS/FAIL + counts>
|
||||
- bash -n / node --check: <result>
|
||||
### Deviations / assumptions
|
||||
- <anything changed from spec and why>
|
||||
### Suggested next slice
|
||||
- <what should come next (likely: tracker adapter aq from-tracker/to-tracker)>
|
||||
@ -1,168 +0,0 @@
|
||||
---
|
||||
engine: devin
|
||||
cwd: /Users/sd9235/code/mygh/learning_ai_devops_tools
|
||||
yolo: true
|
||||
lock: devops-tools
|
||||
timeout: 4h
|
||||
---
|
||||
|
||||
ROLE: Senior engineer. Implement Phase 1 — Slice 3: RESILIENCE & INSIGHTS (single host).
|
||||
This is a LARGE, fully self-contained slice (git + log parsing only — NO network,
|
||||
NO external service, NO credentials) so it runs end-to-end without blockers.
|
||||
|
||||
SOURCE OF TRUTH: agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md (read §11 lifecycle/retry,
|
||||
§25 durability/crash-recovery, §26 execution insights, §17 observability, §14 Phase 1).
|
||||
Implement the SINGLE-HOST bash equivalents of §25 and §26.
|
||||
|
||||
PREREQUISITE / BRANCHING:
|
||||
- Builds on Slice 1 (PR #1, branch feat/gigafactory-p1-slice1).
|
||||
- Base on `main` IF PR #1 (and PR #2 if present) are merged; otherwise branch off
|
||||
feat/gigafactory-p1-slice1. Do NOT revert or duplicate earlier slice code.
|
||||
- This slice is INDEPENDENT of Slice 2 (profiles/deps) — do not depend on it.
|
||||
- New branch: feat/gigafactory-p1-slice3. Commit in logical steps, push, open a PR.
|
||||
DO NOT merge (human gate).
|
||||
|
||||
STRICT SCOPE:
|
||||
- Edit ONLY under agent-queue/ (agent-queue.sh, selftest.sh, README.md,
|
||||
docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md). No other repo.
|
||||
- DO NOT modify/delete anything under agent-queue/queue/ (live jobs). DO NOT run
|
||||
`agent-queue.sh run` against the real queue. selftest.sh uses its own temp
|
||||
AGENT_QUEUE_ROOT and temp git repos only.
|
||||
- bash, single host, macOS + Linux safe, zero new runtime deps.
|
||||
|
||||
==================================================================
|
||||
A. CRASH RECOVERY & WORK PRESERVATION (single-host §25)
|
||||
==================================================================
|
||||
A1. ORPHAN RECOVERY: On `run` startup (and at the top of each run loop), detect
|
||||
jobs stuck in building/ whose worker is no longer alive — i.e. the meta has a
|
||||
`pid=` whose process is dead (a `pidstart` mismatch guards against PID reuse), or no
|
||||
live pid at all. Such a job is an ORPHAN from a previous crash/power-off.
|
||||
Recover it deterministically (never lose or strand it):
|
||||
- increment an `attempts=` counter in the meta,
|
||||
- log a clear recovery line,
|
||||
- move it back to inbox/ for re-selection (subject to retry policy A3),
|
||||
- recovery MUST be idempotent (running it twice recovers once).
|
||||
|
||||
A2. WIP CHECKPOINTING (work preservation): when a job's `cwd` is inside a git repo,
|
||||
the worker preserves partial work on a dedicated branch so a crash never loses it:
|
||||
- at START: ensure/create branch `aq/wip/<job>` (from current HEAD), record
|
||||
`wip_branch=` + `wip_base=` in meta. NEVER touch main/protected branches.
|
||||
- on EVERY exit path (success, failure, timeout, signal/trap): commit any
|
||||
changes in cwd to `aq/wip/<job>` with a message like
|
||||
"aq wip: <job> (<stage/exit>)" and record `wip_commit=` in meta.
|
||||
- use a trap so even SIGTERM/SIGINT/timeout still checkpoints.
|
||||
- if cwd is NOT a git repo: skip cleanly (log "wip: cwd not a git repo").
|
||||
RESUME: when an orphan/retry of a job whose `aq/wip/<job>` branch exists is
|
||||
relaunched, check out / fast-forward that branch first so the agent continues
|
||||
from the checkpoint instead of from zero. Document the resume behavior.
|
||||
|
||||
A3. RETRY POLICY (make the reserved `retry` field FUNCTIONAL):
|
||||
parse `retry: { max: N, backoff: 5m, on: [timeout, verify_failed, crash] }`.
|
||||
On a failure whose class is in `on` (agent rc!=0 => crash/agent_error,
|
||||
timeout => timeout, verify fail => verify_failed), requeue to inbox/ with the
|
||||
backoff delay honored (record `next_eligible=` epoch; selection skips until
|
||||
then) up to `max` attempts. On exhaustion → failed/ with
|
||||
result=retries_exhausted (single-host stand-in for dead_letter), preserving the
|
||||
wip branch + full diagnostics in the log. Default when `retry` absent = no
|
||||
retry (current behavior).
|
||||
|
||||
A4. STATE INTEGRITY: keep all meta writes append-only (as today); never truncate a
|
||||
live meta. Recovery/retry/backoff bookkeeping must be crash-safe (re-derivable
|
||||
from meta + folder location).
|
||||
|
||||
==================================================================
|
||||
B. EXECUTION INSIGHTS & TOKEN ACCOUNTING (single-host §26)
|
||||
==================================================================
|
||||
B1. PER-RUN METRICS: on completion, record into the job meta:
|
||||
duration_s, exit, result, attempts, and repo deltas for the run —
|
||||
files_changed, lines_added, lines_deleted (from `git -C <cwd> diff --numstat`
|
||||
against wip_base, or against HEAD~ if applicable).
|
||||
B2. TOKEN/COST CAPTURE (best-effort, honest): add a single extensible adapter
|
||||
`parse_usage <engine> <logfile>` that extracts, when present in the engine's
|
||||
output: model, tokens_in, tokens_out, tokens_cached, cost_usd, turns,
|
||||
tool_calls. Where the engine does not expose usage, omit the field or set an
|
||||
`estimated=true` marker — DO NOT fabricate precise numbers. Centralize all
|
||||
per-engine patterns in this one function (devin/claude/codex/copilot stubs;
|
||||
real patterns where known, TODO-commented otherwise).
|
||||
B3. SURFACE in `status`: add an insights sub-line per finished/running job
|
||||
(duration, attempts, tokens/cost if known, +/- lines).
|
||||
B4. NEW COMMAND `aq insights [job]`:
|
||||
- with a job id: print that job's full metrics.
|
||||
- without: print a table of recent finished jobs + an AGGREGATE rollup by
|
||||
engine (total tokens, total cost (mark if any estimated), job count,
|
||||
success rate, avg duration).
|
||||
B5. dashboard.mjs: surface a compact insights column/panel (tokens or cost +
|
||||
attempts) for finished jobs. Keep it read-only from meta (agent-queue.sh
|
||||
stays the single source of truth).
|
||||
B6. PRIVACY: never write prompt content or secrets into meta/insights/logs beyond
|
||||
what already exists.
|
||||
|
||||
==================================================================
|
||||
TESTS (selftest.sh — tests are sacred; only ADD; use temp git repos + stubs)
|
||||
==================================================================
|
||||
- orphan recovery: craft a building/ job whose meta pid is a dead PID → a `run`
|
||||
startup recovers it to inbox/ with attempts incremented; running recovery twice
|
||||
recovers exactly once.
|
||||
- wip checkpoint (git): job with a git-repo cwd that creates a file → after the
|
||||
run, branch aq/wip/<job> exists and contains a commit with the change; main
|
||||
branch untouched. Non-git cwd → skipped cleanly (no error).
|
||||
- wip resume: a recovered job whose aq/wip/<job> has a prior commit → the relaunch
|
||||
checks out that branch (assert HEAD is on aq/wip/<job> when the agent runs).
|
||||
- retry policy: verify-fail job with retry.max=1 on=[verify_failed] → requeued once
|
||||
(attempts=2) then → failed/ result=retries_exhausted; backoff next_eligible
|
||||
respected (job not picked before its delay — use a tiny backoff like 1s).
|
||||
- retry on crash: agent rc!=0 with on=[crash] retries; without `crash` in `on`,
|
||||
it goes straight to failed/ (no retry).
|
||||
- insights parse: feed a stub engine log containing a known usage line →
|
||||
parse_usage extracts tokens/cost into meta; `aq insights <job>` prints them;
|
||||
a no-usage log → fields omitted/estimated, no crash.
|
||||
- insights aggregate: two finished jobs → `aq insights` prints a per-engine rollup
|
||||
with correct totals + success rate.
|
||||
- numstat deltas: a run that adds N lines → lines_added recorded.
|
||||
- REGRESSION: all existing selftest cases (Slice 0 + Slice 1) still green.
|
||||
|
||||
==================================================================
|
||||
DOCS
|
||||
==================================================================
|
||||
- README: new "Resilience" section (orphan recovery, WIP checkpoint/resume, retry)
|
||||
and "Insights" section (metrics, `aq insights`, token caveat) + document the
|
||||
`retry` frontmatter (now active) and the new result= values
|
||||
(retries_exhausted). Update the manifest table: move `retry` from RESERVED to ACTIVE.
|
||||
- docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md: tick the single-host items you fully completed in
|
||||
§11 (retry/dead-letter stand-in), §25 (orphan/WIP/retry — note "single-host
|
||||
subset"), §26 (capture/insights — single-host subset); bump §0 Phase 1 %.
|
||||
|
||||
==================================================================
|
||||
CONSTRAINTS
|
||||
==================================================================
|
||||
- bash style consistent with the existing script; no new runtime deps; mac+linux
|
||||
safe (no GNU-only flags without a fallback — note macOS has BSD date/stat);
|
||||
no emojis in code; no leftover debug noise; conventional commits.
|
||||
- Be careful with `set -euo pipefail` + traps so the WIP-on-exit checkpoint always
|
||||
runs even on failure/timeout.
|
||||
|
||||
VERIFY GATE (must pass before finishing):
|
||||
- bash agent-queue/selftest.sh → fully green (existing + all new cases).
|
||||
- bash -n agent-queue/agent-queue.sh ; node --check agent-queue/dashboard.mjs.
|
||||
- shellcheck --severity=error agent-queue/agent-queue.sh (if available) → clean.
|
||||
|
||||
FINAL OUTPUT — print the implementation report in EXACTLY this format:
|
||||
|
||||
## Implementation Report — Phase 1 Slice 3
|
||||
### Branch & commits
|
||||
- branch / based-on: <name> (based on main | feat/gigafactory-p1-slice1)
|
||||
- commits: <sha> <message> (one per line)
|
||||
- PR: <url or "opened, not merged">
|
||||
### Files changed
|
||||
- <path>: <one-line summary>
|
||||
### What was implemented (A1-A4, B1-B6)
|
||||
- <item>: <how, key functions added/changed>
|
||||
### Tests added
|
||||
- <test name>: <what it asserts> (plus selftest.sh PASS/FAIL summary)
|
||||
### Verify gate results
|
||||
- selftest.sh: <PASS/FAIL + counts>
|
||||
- bash -n / node --check / shellcheck: <result>
|
||||
### Deviations / assumptions
|
||||
- <anything changed from spec and why; which engines have real token parsing vs TODO>
|
||||
### Suggested next slice
|
||||
- <what should come next (likely: tracker adapter aq from-tracker/to-tracker, P2)>
|
||||
@ -1,125 +0,0 @@
|
||||
---
|
||||
engine: devin
|
||||
cwd: /Users/sd9235/code/mygh/learning_ai_devops_tools
|
||||
yolo: true
|
||||
lock: devops-tools
|
||||
timeout: 3h
|
||||
---
|
||||
|
||||
ROLE: Senior engineer. Implement Phase 1 — Slice 4: TRACKER ADAPTER (single host).
|
||||
This CLOSES Phase 1: a task in the tracker can become a job, and job outcomes echo
|
||||
back to the tracker — the task<->job round-trip (§10, the last Phase-1 §14 item).
|
||||
|
||||
SOURCE OF TRUTH: agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md (read §10 tracker
|
||||
integration, §5 manifest incl. tracker-item + idempotency-key, §24.5 one-way echo).
|
||||
|
||||
PREREQUISITE / BRANCHING:
|
||||
- Slice 1, Slice 3, AND Slice 2 (profiles/deps) are merged into `main`. Branch off
|
||||
the CURRENT `main`. This slice MUST run AFTER Slice 2 is merged (it shares
|
||||
agent-queue.sh) — do not start it until then.
|
||||
- New branch: feat/gigafactory-p1-slice4. Push + open a PR. DO NOT merge.
|
||||
- Keep ALL existing selftest checks green (regression).
|
||||
|
||||
STRICT SCOPE:
|
||||
- Edit ONLY under agent-queue/ (agent-queue.sh, selftest.sh, README.md,
|
||||
docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md). No other repo is modified.
|
||||
- You MAY READ (not edit) ../learning_ai_common_plat/services/platform-service/
|
||||
src/modules/items/{types,routes}.ts to match the real Item API contract
|
||||
(paths, fields, auth header). Do not change that repo.
|
||||
- bash, single host, mac+linux safe, zero new runtime deps (curl only).
|
||||
|
||||
CONFIG (all via env; document in README; never hardcode URLs/tokens/secrets):
|
||||
- AQ_TRACKER_API : base URL of the items API (default http://localhost:4003).
|
||||
- AQ_TRACKER_TOKEN : bearer token for auth (required for real calls).
|
||||
- AQ_PRODUCT_ID : productId to stamp/filter (every tracker Item has productId).
|
||||
- A single `tracker_api <method> <path> [json]` wrapper does ALL HTTP via curl
|
||||
(bearer header, content-type, base URL). It MUST be overridable for tests via
|
||||
AQ_TRACKER_API_CMD (a stub script path) so selftest needs NO live service.
|
||||
|
||||
DELIVERABLES
|
||||
|
||||
1. `aq from-tracker <ITEM_ID>` — pull a tracker Item and materialize a job in inbox/:
|
||||
- GET the item via tracker_api; map fields → job frontmatter:
|
||||
title/description -> job body (the instruction markdown, verbatim)
|
||||
item type/labels -> engine-class/profile/capabilities/priority where
|
||||
labels carry them (e.g. label `engine-class:agentic-coder`,
|
||||
`profile:backend-engineer`, `priority:high`,
|
||||
`cap:os:mac`); otherwise sane defaults.
|
||||
item id -> `tracker-item: <ITEM_ID>` and
|
||||
`idempotency-key: tracker-<ITEM_ID>` (stable).
|
||||
- IDEMPOTENT: if a job for this tracker-item already exists in any stage
|
||||
(reuse Slice 1 idempotency on the derived key) → no duplicate enqueue.
|
||||
- On success print the created inbox filename; on missing item → clear error, nonzero.
|
||||
|
||||
2. `aq to-tracker <job>` — push a job's CURRENT outcome to its tracker Item
|
||||
(one-way echo, child -> tracker; §24.5). Only if the job meta has tracker-item.
|
||||
- Map stage/result -> item status PATCH:
|
||||
building/review/testing -> in_progress
|
||||
shipped -> done
|
||||
failed -> blocked (or the API's failure status) + note
|
||||
- Post a comment/note with result, attempts, and insights summary
|
||||
(duration, tokens/cost if present) — reuse Slice 3 metrics. Metrics only,
|
||||
NEVER prompt content or secrets.
|
||||
- IDEMPOTENT: re-running to-tracker for an unchanged outcome is a no-op
|
||||
(track last-echoed state in meta, e.g. `tracker_echoed=<status>`).
|
||||
|
||||
3. Auto-echo hook (opt-in, default OFF): an env flag (e.g. AQ_TRACKER_AUTO=1)
|
||||
makes the worker call `to-tracker` automatically on each stage transition it
|
||||
already performs (enqueue→building→review/testing/failed/shipped). When OFF,
|
||||
echo is manual via the command. Never block/fail a job because an echo failed —
|
||||
log the echo error and continue (the tracker is downstream, not authoritative
|
||||
for execution).
|
||||
|
||||
4. `status` / `aq insights`: show the tracker-item id and last echoed status where
|
||||
present (you already surface tracker-item in status from Slice 1 — extend it).
|
||||
|
||||
TESTS (selftest.sh — only ADD; NO live service — use AQ_TRACKER_API_CMD stub that
|
||||
returns canned JSON and records the calls it received):
|
||||
- from-tracker creates an inbox job: stub returns an item JSON →
|
||||
`aq from-tracker T-1` creates one inbox/*.md whose frontmatter has
|
||||
tracker-item: T-1 and idempotency-key: tracker-T-1, body = item description.
|
||||
- from-tracker label mapping: item with labels [engine-class:agentic-coder,
|
||||
priority:high] → frontmatter reflects them.
|
||||
- from-tracker idempotent: calling it twice for T-1 → exactly one job (dedupe).
|
||||
- to-tracker status echo: a shipped job → stub receives a PATCH to status=done and
|
||||
a comment with the insights summary; assert no prompt body is sent.
|
||||
- to-tracker idempotent: second call with unchanged outcome → no duplicate
|
||||
PATCH/comment (tracker_echoed honored).
|
||||
- echo failure is non-fatal: stub returns HTTP 500 → `to-tracker` logs the error,
|
||||
exits without corrupting job state; the job's stage is unchanged.
|
||||
- REGRESSION: all existing checks (Slice 0/1/2/3) still green.
|
||||
|
||||
DOCS:
|
||||
- README: "Tracker integration" section — from-tracker/to-tracker, the env config,
|
||||
label→manifest mapping table, the one-way-echo rule, AQ_TRACKER_AUTO, and a note
|
||||
that real use needs platform-service running + a token.
|
||||
- docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md: tick the §10 single-host items + the §14 Phase-1
|
||||
"tracker adapter" item; set §0 Phase 1 → complete (or note the exact remaining %).
|
||||
|
||||
CONSTRAINTS: bash style consistent with the script; curl-only HTTP through the one
|
||||
wrapper; mac+linux safe; no emojis; conventional commits; tests sacred.
|
||||
|
||||
VERIFY GATE: bash agent-queue/selftest.sh fully green; bash -n agent-queue/agent-queue.sh;
|
||||
node --check agent-queue/dashboard.mjs; shellcheck --severity=error clean.
|
||||
|
||||
FINAL OUTPUT — print the report in EXACTLY this format:
|
||||
|
||||
## Implementation Report — Phase 1 Slice 4
|
||||
### Branch & commits
|
||||
- branch / based-on: <name>
|
||||
- commits: <sha> <message>
|
||||
- PR: <url or "opened, not merged">
|
||||
### Files changed
|
||||
- <path>: <one-line summary>
|
||||
### What was implemented (1-4)
|
||||
- <item>: <how, key functions; the Item API contract you matched>
|
||||
### Tests added
|
||||
- <test name>: <what it asserts> (+ selftest PASS/FAIL summary)
|
||||
### Verify gate results
|
||||
- selftest / bash -n / node --check / shellcheck: <results>
|
||||
### Deviations / assumptions
|
||||
- <API path/field/status mapping choices; anything stubbed>
|
||||
### Phase 1 status
|
||||
- <which §14 items now complete; what (if anything) remains>
|
||||
### Suggested next slice
|
||||
- Phase 2 Slice 1 (fleet data model + repositories in platform-service)
|
||||
@ -1,86 +0,0 @@
|
||||
---
|
||||
engine: devin
|
||||
cwd: /Users/sd9235/code/mygh/learning_ai_common_plat
|
||||
yolo: true
|
||||
lock: common-plat-artifacts
|
||||
timeout: 4h
|
||||
---
|
||||
|
||||
ROLE: Senior backend engineer. Implement FLEET ARTIFACTS + BLOB WIRING (§13 leftover):
|
||||
large run outputs (logs, coverage, screenshots, build output) are stored in blob
|
||||
storage and only POINTERS (with size/content-type/SAS) live in the `fleet_artifacts`
|
||||
Cosmos container — NEVER inline in Cosmos (doc-size + RU limits).
|
||||
|
||||
PARALLEL-SAFETY (two other Devins are running — DO NOT collide):
|
||||
- You OWN the fleet_artifacts surface: types.ts (artifact schema only), repository.ts
|
||||
(artifact repo only), routes.ts (artifact endpoints only), cosmos-init.ts (only if the
|
||||
fleet_artifacts container needs registration), and a NEW artifacts.test.ts.
|
||||
- You MUST NOT touch: coordinator.ts, coordinator.test.ts, scheduler.ts (another Devin owns
|
||||
the scheduler + claim ranking). Keep your edits to types/repository/routes additive and
|
||||
localized to the artifact pieces — do not refactor the job/lease/claim code.
|
||||
- A third Devin is in a different repo (agent-queue) — no overlap.
|
||||
|
||||
READ FIRST:
|
||||
- services/platform-service/src/modules/fleet/types.ts — find FleetArtifactDoc (the
|
||||
foundation may already declare it, pk /jobId). repository.ts — see if an artifacts repo
|
||||
already exists; extend, don't duplicate. cosmos-init.ts — see if fleet_artifacts is
|
||||
already registered.
|
||||
- packages/blob (@bytelyst/blob) — the Azure Blob client + SAS token helpers. Learn the
|
||||
exact API (upload, container/key conventions, SAS generation, the memory/dev fallback).
|
||||
Use it the same way other consumers do (grep for existing @bytelyst/blob usage).
|
||||
- ../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §13 (fleet_artifacts
|
||||
bullet) + §26 (insights/artifacts).
|
||||
|
||||
PREREQUISITE / BRANCHING: branch off CURRENT main → feat/gigafactory-p2-artifacts.
|
||||
Push + open PR. DO NOT merge.
|
||||
|
||||
DELIVERABLES
|
||||
1. FleetArtifactDoc (in types.ts — confirm/extend): { id, productId, jobId, runId?, kind
|
||||
('log'|'coverage'|'screenshot'|'build'|'other'), blobKey, contentType, sizeBytes,
|
||||
sha256?, createdAt }. Zod schema → inferred type. productId on the doc.
|
||||
2. repository.ts — artifacts repo: createArtifact, listArtifactsByJob(jobId),
|
||||
getArtifact(id, productId), deleteArtifact. Single-partition (pk /jobId). Do not touch
|
||||
the job/lease/run repos beyond importing shared helpers.
|
||||
3. Blob integration (a small artifacts service fn, e.g. in a NEW
|
||||
modules/fleet/artifacts-blob.ts): uploadArtifact(productId, jobId, kind, bytes/stream, contentType)
|
||||
→ stores in @bytelyst/blob under a deterministic key
|
||||
(`fleet/<productId>/<jobId>/<id>-<kind>`), returns the persisted FleetArtifactDoc with a
|
||||
short-lived SAS read URL. getArtifactDownload(id) → re-issues a SAS URL. Large content
|
||||
NEVER goes into Cosmos.
|
||||
4. routes.ts — guarded endpoints (auth + productId, Zod-validated), additive only:
|
||||
POST /fleet/jobs/:id/artifacts (multipart or base64 body → upload + pointer)
|
||||
GET /fleet/jobs/:id/artifacts (list pointers)
|
||||
GET /fleet/artifacts/:artifactId (pointer + fresh SAS download URL)
|
||||
DELETE /fleet/artifacts/:artifactId
|
||||
Register exactly like the existing fleet routes (do not reorder/rewrite the others).
|
||||
|
||||
TESTS (artifacts.test.ts — memory blob + memory datastore; tests are sacred):
|
||||
- upload → a fleet_artifacts pointer doc is created with productId, blobKey, sizeBytes,
|
||||
contentType; the bytes live in blob, NOT in the Cosmos doc (assert the doc has no inline
|
||||
payload field).
|
||||
- list by job returns only that job's artifacts (partition isolation).
|
||||
- get returns a (fresh) SAS download URL; a large payload (> a Cosmos-safe threshold) still
|
||||
succeeds (proves blob offload).
|
||||
- delete removes the pointer (and blob if your helper does so).
|
||||
- routes via fastify inject: upload/list/get/delete; auth + productId enforced; invalid body
|
||||
→ 400; unknown id → 404.
|
||||
- existing fleet tests (jobs/leases/claim/events) remain green and untouched.
|
||||
|
||||
VERIFY GATE:
|
||||
- pnpm --filter @lysnrai/platform-service exec vitest run src/modules/fleet (all green)
|
||||
- pnpm --filter @lysnrai/platform-service build
|
||||
- pnpm build && pnpm test (no consumer regressed)
|
||||
|
||||
CONSTRAINTS: ESM .js imports; no any; no console.log; productId on every doc; large logs in
|
||||
blob never Cosmos; conventional commits (feat(platform-service): ...); do not touch the files
|
||||
reserved for the other Devins; do not edit the agent-queue repo.
|
||||
|
||||
FINAL OUTPUT — report in EXACTLY this format:
|
||||
## Implementation Report — Fleet Artifacts + Blob Wiring (§13)
|
||||
### Branch & commits / PR
|
||||
### Files changed
|
||||
### What was implemented (artifact schema, blob key scheme, SAS, routes)
|
||||
### Tests added (+ pnpm test summary; esp. the "bytes in blob not Cosmos" assertion)
|
||||
### Verify gate results
|
||||
### Deviations / assumptions (blob API used, dev/memory fallback, SAS TTL)
|
||||
### Suggested next slice
|
||||
@ -1,108 +0,0 @@
|
||||
---
|
||||
engine: devin
|
||||
cwd: /Users/sd9235/code/mygh/learning_ai_common_plat
|
||||
yolo: true
|
||||
lock: common-plat
|
||||
timeout: 3h
|
||||
---
|
||||
|
||||
ROLE: Senior distributed-systems engineer. P0 HARDENING: make the fleet
|
||||
coordinator's job claim TRULY atomic. The Phase 2 Foundation (merged) implements
|
||||
the claim as an in-module "rev compare-and-swap" layered over an UNCONDITIONAL
|
||||
datastore read-then-replace. Because there are `await` points between the read,
|
||||
the rev check, and the write, two CONCURRENT claims can both read the same rev,
|
||||
both pass the check, and both write — a DOUBLE-ASSIGNMENT. The existing race test
|
||||
only drives the claims SEQUENTIALLY, so it does not catch this. Fix the root cause.
|
||||
|
||||
CONTEXT TO READ FIRST:
|
||||
- services/platform-service/src/modules/fleet/repository.ts — revUpdateJob /
|
||||
revUpdateLease (the non-atomic read-check-write).
|
||||
- services/platform-service/src/modules/fleet/coordinator.ts — tryClaimJob.
|
||||
- services/platform-service/src/modules/fleet/coordinator.test.ts — the current
|
||||
(sequential) "atomic claim race" test.
|
||||
- packages/datastore — the shared datastore abstraction + its Memory and Cosmos
|
||||
providers. Find the update/replace method and how (if at all) it exposes
|
||||
optimistic concurrency.
|
||||
- ../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §4 (atomic
|
||||
claim is THE core contract) + §13 (maps to Cosmos _etag / If-Match).
|
||||
|
||||
PREREQUISITE / BRANCHING:
|
||||
- Branch off CURRENT `main` (the foundation is already merged). New branch:
|
||||
feat/gigafactory-p2-atomic-claim. Push + open a PR. DO NOT merge.
|
||||
|
||||
GOAL: a single-winner claim that holds under TRUE concurrency, backed by a
|
||||
server-side conditional write (Cosmos If-Match/_etag) and a process-atomic memory
|
||||
implementation — not a best-effort in-module check.
|
||||
|
||||
DELIVERABLES
|
||||
|
||||
1. ADD an optimistic-concurrency update to the shared datastore (@bytelyst/datastore).
|
||||
This is a legitimate ADDITIVE shared-package feature (NOT a template-managed infra
|
||||
file) — it MUST be backward-compatible and fully tested so existing consumers are
|
||||
unaffected. Suggested API (match the package's existing style/naming):
|
||||
updateIfMatch(id, partitionKey, expected: { etag?: string; rev?: number }, patch)
|
||||
-> { ok: true, doc } | { ok: false, reason: 'conflict' | 'not_found' }
|
||||
- COSMOS provider: perform a conditional replace using the document `_etag` with
|
||||
`accessCondition { type: 'IfMatch', condition: etag }`; translate Cosmos 412
|
||||
(precondition failed) → { ok:false, reason:'conflict' }. Surface `_etag` on reads
|
||||
so callers can pass it back.
|
||||
- MEMORY provider: implement the get → compare → set with NO `await`/yield anywhere between
|
||||
the get and the set (do it in one synchronous block inside the method) so two
|
||||
concurrent callers CANNOT interleave within the single-threaded event loop. This
|
||||
gives true in-process atomicity. Keep a monotonic rev (or reuse the existing one)
|
||||
as the compare token for parity with Cosmos `_etag`.
|
||||
- Do NOT change existing method signatures; only ADD. Update the provider interface
|
||||
+ both providers + the package's index exports.
|
||||
|
||||
2. REWIRE the fleet repository to use it: revUpdateJob / revUpdateLease must perform
|
||||
the compare-and-write through the new conditional update (no read-check-write with
|
||||
an intervening await). The coordinator's tryClaimJob keeps the same external
|
||||
behavior (returns ok / conflict) but is now genuinely atomic.
|
||||
|
||||
3. UPGRADE the tests to actually prove atomicity (these are the point of the slice):
|
||||
- In datastore: unit tests for updateIfMatch on BOTH providers — match → writes +
|
||||
bumps token; stale token → conflict, NO write; missing → not_found.
|
||||
- In fleet coordinator: replace/extend the race test to drive TRUE concurrency:
|
||||
(a) `await Promise.all([tryClaimJob(jobA), tryClaimJob(jobB)])`, where jobA and jobB both refer to the same
|
||||
freshly-read job → exactly one ok, one conflict; job assigned once; exactly
|
||||
one run; one lease; leaseEpoch == 1.
|
||||
(b) an N-claimer stress test: fire N (>=10) concurrent claims for one job via
|
||||
Promise.all → exactly one ok, N-1 conflicts; no double-assignment.
|
||||
(c) the same for lease renew under contention (optional but preferred).
|
||||
- These concurrent tests MUST fail against the OLD read-check-write and pass after
|
||||
the fix (sanity-check that you are testing the right thing; mention it in the report).
|
||||
|
||||
4. Keep ALL existing platform-service tests green (the 50 fleet + the rest). Do not
|
||||
weaken any test.
|
||||
|
||||
VERIFY GATE (must pass):
|
||||
- pnpm --filter @bytelyst/datastore test (new conditional-update tests)
|
||||
- pnpm --filter @bytelyst/datastore build
|
||||
- pnpm --filter @lysnrai/platform-service exec vitest run src/modules/fleet
|
||||
- pnpm --filter @lysnrai/platform-service build
|
||||
- pnpm build && pnpm test (full workspace — confirm no consumer of @bytelyst/datastore regressed)
|
||||
|
||||
CONSTRAINTS: ESM .js imports; no any; no console.log; additive + backward-compatible
|
||||
datastore change with tests; conventional commits (feat(datastore): ... /
|
||||
fix(platform-service): ...); never edit template-managed infra (.npmrc, docker-prep,
|
||||
tsconfig.base, pnpm-workspace). Tests are sacred.
|
||||
|
||||
FINAL OUTPUT — print the report in EXACTLY this format:
|
||||
|
||||
## Implementation Report — Phase 2 Atomic-Claim Hardening
|
||||
### Branch & commits / PR
|
||||
### Files changed
|
||||
- <path>: <summary>
|
||||
### The fix
|
||||
- datastore conditional update: <API, Cosmos If-Match mapping, memory atomicity approach>
|
||||
- fleet rewire: <how revUpdate* now writes conditionally>
|
||||
### Tests added (the proof)
|
||||
- concurrent claim (Promise.all) + N-claimer stress: <results>
|
||||
- did the new concurrent test FAIL on the old code? <yes/no + brief note>
|
||||
- datastore conditional-update unit tests: <results>
|
||||
### Verify gate results
|
||||
- datastore test/build · fleet test · platform build · full pnpm build && test: <results>
|
||||
### Deviations / assumptions
|
||||
### Suggested next slice
|
||||
- Phase 2 Slice 3: factory-agent integration (agent-queue.sh ↔ coordinator) now that
|
||||
the claim is genuinely atomic.
|
||||
@ -1,97 +0,0 @@
|
||||
---
|
||||
engine: devin
|
||||
cwd: /Users/sd9235/code/mygh/learning_ai_common_plat
|
||||
yolo: true
|
||||
lock: common-plat-enrollment
|
||||
timeout: 4h
|
||||
---
|
||||
|
||||
ROLE: Senior backend + security engineer. Implement PHASE 2 — FACTORY ENROLLMENT +
|
||||
SCOPED ROTATABLE TOKENS (§12) for the fleet coordinator in platform-service, plus two
|
||||
small artifact-route hardening fixes found in review.
|
||||
|
||||
PARALLEL-SAFETY (another Devin is running in a DIFFERENT repo — agent-queue/devops-tools —
|
||||
on feature flags; no file overlap with you. Stay within platform-service):
|
||||
- You OWN: a NEW modules/fleet/enrollment.ts, modules/fleet/tokens.ts (or a single combined
|
||||
enrollment.ts), enrollment.test.ts, and ADDITIVE edits to types.ts, repository.ts,
|
||||
routes.ts, cosmos-init.ts (factory token fields + enrollment endpoints + token-auth
|
||||
middleware). You MAY edit artifacts-blob.ts/routes.ts ONLY for the two review fixes below.
|
||||
- You MUST NOT change the scheduler.ts scoring, coordinator.ts claim/lease/fence CAS, or
|
||||
the heartbeat/claim PAYLOAD shape (only ADD an optional auth check around them, behind a
|
||||
flag — see below). Do not break any of the existing 79 fleet tests / 1591 platform tests.
|
||||
|
||||
READ FIRST:
|
||||
- modules/fleet/types.ts — FleetFactoryDoc (id, productId, capabilities, health, load,
|
||||
lastHeartbeatAt...). repository.ts — factory upsert (heartbeat). routes.ts — POST
|
||||
/fleet/factories/heartbeat, POST /fleet/claim (these will optionally require a token).
|
||||
- modules/auth/** in platform-service AND ../../packages/auth — reuse the EXISTING token/
|
||||
hashing primitives (bcrypt/sha-256 recovery-code pattern). Do NOT invent new crypto.
|
||||
Tokens are stored HASHED at rest; the plaintext is returned exactly once at enroll/rotate.
|
||||
- ../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §12 (enrollment,
|
||||
scoped tokens, rotation, revocation) + §18 (trust boundary).
|
||||
|
||||
PREREQUISITE / BRANCHING: branch off CURRENT main → feat/gigafactory-p2-enrollment.
|
||||
Push + open PR. DO NOT merge.
|
||||
|
||||
DELIVERABLES
|
||||
1. Factory enrollment + token lifecycle (enrollment.ts):
|
||||
- enrollFactory({productId, capabilities, label?}) → creates/links a FleetFactoryDoc and
|
||||
issues a SCOPED token: scope = {productId, factoryId, capabilities[]}. Persist only the
|
||||
HASH (+ tokenId, createdAt, lastUsedAt, status). Return plaintext token ONCE.
|
||||
- rotateToken(factoryId, productId) → issue a new token, invalidate the previous (grace:
|
||||
mark old `rotating` with a short overlap TTL so an in-flight worker isn't cut off).
|
||||
- revokeToken(tokenId|factoryId, productId) → status=revoked; immediately rejected.
|
||||
- verifyToken(plaintext) → resolves {factoryId, productId, capabilities, status} or null;
|
||||
constant-time hash compare; updates lastUsedAt. Revoked/expired ⇒ null.
|
||||
2. Token-auth on the fleet endpoints — GATED so existing tests keep passing:
|
||||
- Add a `requireFactoryToken` check to POST /fleet/factories/heartbeat and POST
|
||||
/fleet/claim that is ENFORCED only when enforcement is on (env/flag
|
||||
FLEET_REQUIRE_FACTORY_TOKEN, default OFF so the 79 existing tests are unaffected). When
|
||||
on: missing/invalid/revoked token ⇒ 401; token scope must cover the requested productId
|
||||
+ the claim's capabilities, otherwise ⇒ 403. When off: behaves exactly as today.
|
||||
- The claim's effective capabilities/productId must be taken from the VERIFIED token scope
|
||||
when enforcement is on (a factory cannot claim outside its scope).
|
||||
3. Routes (additive): POST /fleet/factories/enroll, POST /fleet/factories/:id/token/rotate,
|
||||
POST /fleet/factories/:id/token/revoke — all auth + productId + Zod validated, registered
|
||||
like the existing fleet routes (do not reorder others).
|
||||
4. REVIEW FIXES (small, same module):
|
||||
- listArtifactsByJob must be productId-scoped: thread `productId` through
|
||||
repo.listArtifactsByJob + the GET /fleet/jobs/:id/artifacts handler (use the request
|
||||
productId), so a caller can only list artifacts for their own product.
|
||||
- Upload must prefer the request/auth productId over body.productId (drop the
|
||||
`body.productId ||` precedence; use getRequestProductId(req), body value only as a
|
||||
non-overriding hint or removed).
|
||||
|
||||
TESTS (enrollment.test.ts + targeted additions; tests are sacred, all prior green):
|
||||
- enroll returns a plaintext token once; the stored doc holds only a hash (assert no
|
||||
plaintext persisted) + scope (productId, capabilities).
|
||||
- verifyToken: valid → scope; tampered/unknown → null; revoked → null.
|
||||
- rotate: old token still works during the overlap TTL, then is rejected; new token works.
|
||||
- revoke: immediate rejection.
|
||||
- enforcement OFF (default): heartbeat/claim behave exactly as the existing tests expect
|
||||
(re-assert claim works with NO token).
|
||||
- enforcement ON: no token → 401; out-of-scope productId or capability → 403; in-scope → ok,
|
||||
and claim is constrained to the token's scope.
|
||||
- artifact fixes: list is productId-scoped (a different product cannot see the pointers);
|
||||
upload ignores a spoofed body.productId.
|
||||
|
||||
VERIFY GATE:
|
||||
- pnpm --filter @lysnrai/platform-service exec vitest run src/modules/fleet (all green;
|
||||
count grows from 79)
|
||||
- pnpm --filter @lysnrai/platform-service build
|
||||
- pnpm build && pnpm test (no regression across consumers)
|
||||
|
||||
CONSTRAINTS: ESM .js imports; no any; no console.log; productId on every doc; tokens HASHED
|
||||
at rest, plaintext shown once; reuse existing auth/crypto primitives (no new schemes);
|
||||
enforcement default OFF; conventional commits (feat(platform-service): ...); do not touch
|
||||
scheduler scoring or the claim CAS; do not edit the agent-queue repo.
|
||||
|
||||
FINAL OUTPUT — report in EXACTLY this format:
|
||||
## Implementation Report — Phase 2 Factory Enrollment + Scoped Tokens (§12)
|
||||
### Branch & commits / PR
|
||||
### Files changed
|
||||
### What was implemented (enroll/rotate/revoke/verify, scope model, gated auth, artifact fixes)
|
||||
### Tests added (+ pnpm test summary; esp. hashed-at-rest, scope 401/403, enforcement-off no-op)
|
||||
### Verify gate results
|
||||
### Deviations / assumptions (which crypto primitive, rotation overlap TTL, flag name)
|
||||
### Suggested next slice
|
||||
@ -1,105 +0,0 @@
|
||||
---
|
||||
engine: devin
|
||||
cwd: /Users/sd9235/code/mygh/learning_ai_devops_tools
|
||||
yolo: true
|
||||
lock: agent-queue
|
||||
timeout: 4h
|
||||
---
|
||||
|
||||
ROLE: Senior bash + distributed-systems engineer. Implement PHASE 2 — FLEET FEATURE FLAGS
|
||||
+ SHADOW / DUAL-RUN for the agent-queue runner: a safe, reversible path to validate the
|
||||
fleet coordinator against the proven single-host (P1) behavior BEFORE any real cutover.
|
||||
|
||||
PARALLEL-SAFETY (another Devin is running in a DIFFERENT repo — learning_ai_common_plat —
|
||||
on enrollment/tokens; no file overlap with you. Stay within the agent-queue repo):
|
||||
- You OWN: agent-queue/lib/fleet-client.sh, agent-queue/agent-queue.sh (the fleet hook
|
||||
points only), agent-queue/selftest.sh, agent-queue/README.md,
|
||||
agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md.
|
||||
- Keep the offline git-queue path unchanged when fleet is off. All 60 existing selftest
|
||||
checks MUST stay green.
|
||||
|
||||
READ FIRST:
|
||||
- agent-queue/lib/fleet-client.sh — the P2-S3 client: fleet_enabled, fleet_api,
|
||||
fleet_claim, fleet_report, lease renew/release, fleet_quarantine. You EXTEND this.
|
||||
- agent-queue/agent-queue.sh — the run loop + the existing fleet hook points + the offline
|
||||
path (cmd_add/run_worker/ship). Study how AQ_FLEET gates everything today.
|
||||
- agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §9 (split-brain / offline degrade), §16/§17
|
||||
(feature flags fleet.enabled / fleet.route_via_service), §27 (cutover & rollback).
|
||||
|
||||
PREREQUISITE / BRANCHING: branch off CURRENT main → feat/gigafactory-p2-flags-shadow.
|
||||
Push + open PR. DO NOT merge.
|
||||
|
||||
FLAG MODEL (three explicit, independently-toggleable levels; document precedence):
|
||||
- AQ_FLEET=0|1 master switch (exists). 0 ⇒ pure offline, zero coordinator calls.
|
||||
- AQ_FLEET_ROUTE=0|1 route_via_service: when 1 (and AQ_FLEET=1) the coordinator is
|
||||
AUTHORITATIVE for claim/assignment (today's P2-S3 behavior).
|
||||
When 0, the LOCAL inbox is authoritative (coordinator not used to
|
||||
source work) — this is the pre-cutover state.
|
||||
- AQ_FLEET_SHADOW=0|1 shadow/dual-run: when 1 (requires AQ_FLEET=1, AQ_FLEET_ROUTE=0)
|
||||
the runner does its normal OFFLINE/local processing as the
|
||||
authoritative path, and IN PARALLEL queries the coordinator
|
||||
(shadow claim + shadow report) WITHOUT acting on its responses —
|
||||
purely to compare decisions and record divergence. Shadow NEVER
|
||||
ships, quarantines, or mutates real job state.
|
||||
|
||||
DELIVERABLES
|
||||
1. fleet-client.sh additions (all guarded; no-ops unless their flag is on):
|
||||
- fleet_route_enabled / fleet_shadow_enabled helpers (precedence: SHADOW only meaningful
|
||||
when ROUTE=0; if both ROUTE=1 and SHADOW=1, ROUTE wins and a warning is logged).
|
||||
- fleet_shadow_claim — asks the coordinator what it WOULD assign for this factory's caps,
|
||||
without claiming a lease for real (read-only / dry-run; if the API has no dry-run, claim
|
||||
then immediately release the lease, or use a shadow factoryId — pick the least-invasive option and
|
||||
document it). Returns the would-be job id (or none).
|
||||
- fleet_shadow_compare — given the LOCAL decision (the job the offline path actually ran)
|
||||
and the coordinator's would-be decision, classify AGREE / DIVERGE / COORD_EMPTY /
|
||||
LOCAL_EMPTY and append a structured line to a shadow log
|
||||
(agent-queue/queue/.state/fleet-shadow.log: ts, localJob, coordJob, verdict).
|
||||
- fleet_shadow_report — mirrors stage transitions to the coordinator as shadow events
|
||||
(clearly flagged shadow=1) so reporting is exercised, but divergence in the coordinator
|
||||
response is logged, never acted on.
|
||||
2. agent-queue.sh wiring (minimal, flag-gated):
|
||||
- run loop: if SHADOW on, after the local authoritative decision each iteration, call
|
||||
fleet_shadow_claim + fleet_shadow_compare (best-effort, error-swallowed — shadow must
|
||||
NEVER fail a real job).
|
||||
- ROUTE flag: thread it so claim sourcing honors it (ROUTE=1 ⇒ coordinator-sourced as
|
||||
today; ROUTE=0 ⇒ local inbox authoritative even when AQ_FLEET=1).
|
||||
- new subcommand `aq fleet-shadow-report` — summarize the shadow log (counts of
|
||||
AGREE/DIVERGE/…, last N divergences). Add to dispatch + help.
|
||||
- surface the three flags' resolved state in `aq status` / `aq fleet-status`.
|
||||
3. Cutover safety: document the recommended rollout ladder in README — (1) AQ_FLEET=1,
|
||||
ROUTE=0, SHADOW=1 (observe, zero risk) → (2) inspect agreement rate → (3) flip ROUTE=1
|
||||
once agreement is high → rollback = set ROUTE=0 (and/or AQ_FLEET=0) at any time.
|
||||
|
||||
TESTS — extend selftest.sh (stub the coordinator like the P2-S3 fleet stub; all 60 prior
|
||||
checks stay green):
|
||||
- flags off: AQ_FLEET=0 ⇒ zero coordinator calls (incl. shadow); offline flow identical.
|
||||
- shadow agree: stub returns the same job the local path runs ⇒ shadow log records AGREE;
|
||||
the real job still ships via the offline/local path; coordinator state NOT mutated for real.
|
||||
- shadow diverge: stub returns a different/empty job ⇒ DIVERGE/COORD_EMPTY logged; real job
|
||||
still completes; nothing quarantined.
|
||||
- shadow is non-fatal: coordinator 5xx/timeout during shadow ⇒ real job still completes,
|
||||
exit 0, a shadow-error noted.
|
||||
- ROUTE precedence: ROUTE=1 + SHADOW=1 ⇒ ROUTE path taken, warning logged, no shadow compare.
|
||||
- ROUTE=0 + AQ_FLEET=1 ⇒ local inbox is authoritative (coordinator not used to source work).
|
||||
- fleet-shadow-report summarizes the log counts correctly.
|
||||
|
||||
VERIFY GATE:
|
||||
- bash agent-queue/selftest.sh (60 prior + new shadow/flag cases; none weakened)
|
||||
- bash -n agent-queue/agent-queue.sh && bash -n agent-queue/lib/fleet-client.sh
|
||||
- shellcheck --severity=error agent-queue/agent-queue.sh agent-queue/lib/fleet-client.sh
|
||||
- node --check agent-queue/dashboard.mjs (must still pass, even if unchanged)
|
||||
|
||||
CONSTRAINTS: bash + curl + POSIX awk only (no jq/new deps); reuse P2-S3 helpers; shadow must
|
||||
be strictly side-effect-free on real job state; offline path unchanged when AQ_FLEET=0;
|
||||
never hardcode tokens; conventional commits (feat(agent-queue): ...); never weaken a test;
|
||||
do not edit the common-plat repo.
|
||||
|
||||
FINAL OUTPUT — report in EXACTLY this format:
|
||||
## Implementation Report — Phase 2 Feature Flags + Shadow/Dual-run
|
||||
### Branch & commits / PR
|
||||
### Files changed
|
||||
### What was implemented (flag model + precedence, shadow claim/compare/report, cutover ladder)
|
||||
### Tests added (+ selftest summary = 60 prior + N new; esp. flags-off no-op, shadow non-fatal, ROUTE precedence)
|
||||
### Verify gate results
|
||||
### Deviations / assumptions (how shadow claim avoids real lease mutation)
|
||||
### Suggested next slice
|
||||
@ -1,179 +0,0 @@
|
||||
---
|
||||
engine: devin
|
||||
cwd: /Users/sd9235/code/mygh/learning_ai_common_plat
|
||||
yolo: true
|
||||
lock: common-plat
|
||||
timeout: 5h
|
||||
---
|
||||
|
||||
ROLE: Senior backend / distributed-systems engineer. Implement the PHASE 2
|
||||
FOUNDATION of the agent gigafactory: a new `fleet` module in platform-service
|
||||
covering (S1) the durable data model + repositories AND (S2) the CONCURRENCY CORE
|
||||
— atomic claim, leases, fencing, heartbeat, and a reaper. This is one long,
|
||||
self-contained backend slice. It supersedes the single-host stand-ins built in the
|
||||
agent-queue (devops-tools) repo.
|
||||
|
||||
WHY THIS IS A SAFE LONG (UNATTENDED) RUN: everything is in ONE repo
|
||||
(learning_ai_common_plat), all logic is TypeScript, and ALL tests run on the
|
||||
in-memory datastore provider (DB_PROVIDER=memory) — NO live platform-service, NO
|
||||
Cosmos, NO network calls, NO tokens required. There are no external blockers.
|
||||
|
||||
READ FIRST (do NOT assume this platform-service follows conventions you already know — verify them):
|
||||
- services/platform-service/src/modules/items/{types,repository,routes}.ts — copy
|
||||
this module pattern EXACTLY: types.ts -> repository.ts -> routes.ts, Zod schemas,
|
||||
the cloud-agnostic datastore, productId on every doc, req.log/app.log, ESM with
|
||||
.js import suffixes, no `any`, no console.log.
|
||||
- packages/datastore (or the existing datastore abstraction) — how repositories are
|
||||
built, how optimistic concurrency (_etag / If-Match) is exposed, and how the
|
||||
memory vs cosmos provider is selected (DB_PROVIDER).
|
||||
- packages/cosmos container registry — how containers are registered.
|
||||
- The fleet spec lives in the sibling devops-tools repo (read-only):
|
||||
../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md
|
||||
§4 (core contract: idempotency/atomic-claim/fencing/lease), §7 (scheduler/claim),
|
||||
§8 (factory/lease/heartbeat), §13 (containers + fields), §18 (failure model),
|
||||
§25 (durability/recovery), §26 (insights). Match these field names + semantics.
|
||||
|
||||
PREREQUISITE / SETUP / BRANCHING:
|
||||
- Branch off CURRENT `main` of learning_ai_common_plat.
|
||||
- New branch: feat/gigafactory-p2-foundation. Commit in logical steps (data model,
|
||||
repos, coordinator, routes, docs). Push + open a PR. DO NOT merge (human gate).
|
||||
- If node_modules is missing, run `pnpm install` once at the repo root. All tests
|
||||
must pass with DB_PROVIDER=memory (set it in the test setup if not already).
|
||||
|
||||
STRICT SCOPE:
|
||||
- Add ONE new module: services/platform-service/src/modules/fleet/ (+ tests there).
|
||||
- Register the new fleet_* Cosmos containers via the existing registration path.
|
||||
- Do NOT modify unrelated modules. Do NOT hand-edit template-managed infra
|
||||
(.npmrc, docker-prep.sh*, tsconfig.base.json, pnpm-workspace.yaml) — they drift.
|
||||
- Every Cosmos doc MUST include productId. ESM everywhere. No `any` (Zod inference
|
||||
or explicit types). No console.log (use req.log / app.log). Tests are sacred —
|
||||
never weaken or delete a test to go green; fix the code.
|
||||
|
||||
=================================================================
|
||||
PART S1 — DATA MODEL + REPOSITORIES
|
||||
=================================================================
|
||||
1. types.ts — Zod schemas + inferred types, each doc carrying productId:
|
||||
- FleetJobDoc (pk /productId): manifestSnapshot, bodyMd (verbatim instructions),
|
||||
stage (enum matching the agent-queue lifecycle:
|
||||
queued|blocked|assigned|building|review|testing|shipped|failed|dead_letter),
|
||||
idempotencyKey, trackerItemId?, parentId?, kind ('leaf'|'composite' default
|
||||
'leaf'), checkpoint? {wipBranch,wipBase,wipCommit}, priority
|
||||
(critical|high|medium|low), capabilities[], engineClass?, profile?, deps[],
|
||||
depsMode?, budget? {usd?,tokens?,wall?}, retry? {max,backoff,on[]}, timestamps.
|
||||
- FleetRunDoc (pk /jobId): jobId, attempt, factoryId?, engine, profileSnapshot?,
|
||||
startedAt, endedAt?, exit?, verifyResult?, result?, insights {model?,tokensIn?,
|
||||
tokensOut?,tokensCached?,costUsd?,estimated?,turns?,toolCalls?,filesChanged?,
|
||||
linesAdded?,linesDeleted?}.
|
||||
- FleetLeaseDoc (pk /jobId): jobId, holderFactoryId?, expiresAt?, leaseEpoch
|
||||
(number, default 0), renewals (number), status (held|expired|released).
|
||||
- FleetFactoryDoc (pk /productId): factoryId, descriptor, capabilities[],
|
||||
health (ok|degraded|down), load, seatLimit, lastHeartbeatAt.
|
||||
- FleetProfileDoc (pk /productId): name, version, immutable snapshot.
|
||||
- FleetEventDoc (pk /jobId): append-only {type, at, actor?, data}.
|
||||
- FleetArtifactDoc (pk /jobId): pointers to blob-stored artifacts (no inline logs).
|
||||
2. repository.ts — one repo per container on the datastore abstraction (memory +
|
||||
cosmos): create, getById, list (by productId; jobs also by stage + by
|
||||
idempotencyKey), update (returning/honoring _etag), delete where sensible,
|
||||
appendEvent(jobId,event). Partition-aware; no cross-partition fan-out in hot paths.
|
||||
3. Register all fleet_* containers with correct partition keys.
|
||||
|
||||
=================================================================
|
||||
PART S2 — CONCURRENCY CORE (claim / lease / fencing / heartbeat / reaper)
|
||||
=================================================================
|
||||
4. ATOMIC CLAIM (the heart): `claimNextJob(factory)` selects the highest-priority,
|
||||
oldest eligible job whose stage is `queued` AND whose deps are satisfied AND
|
||||
whose capabilities are a subset of the factory's, then atomically transitions it
|
||||
to `assigned` and creates/acquires its lease — guarded by _etag / If-Match so
|
||||
that under contention EXACTLY ONE factory wins; losers get a conflict and retry
|
||||
the selection. No double-assignment, ever.
|
||||
5. LEASES + FENCING: acquiring a lease increments `leaseEpoch`. `renewLease`,
|
||||
`releaseLease`. Every state-mutating call from a worker carries its leaseEpoch;
|
||||
a call whose epoch is < the current epoch is REJECTED (fencing) — a stale/zombie
|
||||
worker can never overwrite a reassigned job's state.
|
||||
6. HEARTBEAT: `heartbeat(factoryId)` updates lastHeartbeatAt + load/health.
|
||||
7. REAPER: `reapExpiredLeases(now)` scans leases with expiresAt < now, marks them
|
||||
expired, bumps leaseEpoch, and returns the job to `queued` (or `blocked` if deps
|
||||
now unmet) for re-claim — resume-from-checkpoint friendly (checkpoint pointer
|
||||
preserved on the job). Reaper is idempotent. (Cosmos TTL does NOT do this — the
|
||||
reaper must; document why.)
|
||||
8. IDEMPOTENCY: submit with an existing idempotencyKey + identical content => returns
|
||||
the existing job (no dup); same key + different content while still queued =>
|
||||
supersede; otherwise 409. (Mirror the agent-queue Slice 1 semantics.)
|
||||
9. DEPS: a job is `blocked` until each dep reaches shipped (or testing when
|
||||
depsMode:soft); submit-time cycle detection rejects cyclic graphs.
|
||||
|
||||
10. routes.ts — guarded REST under the existing auth + productId middleware:
|
||||
POST /fleet/jobs (submit, idempotent), GET /fleet/jobs (list by stage),
|
||||
GET /fleet/jobs/:id, PATCH /fleet/jobs/:id (fenced state transition),
|
||||
POST /fleet/claim (atomic claim for a factory),
|
||||
POST /fleet/jobs/:id/lease/renew, POST /fleet/jobs/:id/lease/release,
|
||||
POST /fleet/factories/heartbeat, GET /fleet/jobs/:id/runs,
|
||||
GET /fleet/jobs/:id/events. Validate every body with the Zod schemas. Register
|
||||
the module in the app exactly as items is registered.
|
||||
|
||||
=================================================================
|
||||
TESTS (Vitest — write alongside; memory provider; tests are sacred)
|
||||
=================================================================
|
||||
- schema validation: valid docs pass; missing productId / bad enum fail precisely
|
||||
(>=1 invalid case per container).
|
||||
- repo CRUD round-trip per container; list filters by productId, by stage, by
|
||||
idempotencyKey; appendEvent yields an ordered append-only stream.
|
||||
- ATOMIC CLAIM RACE: two claims contending for the SAME job version (same _etag) =>
|
||||
exactly one succeeds, the other gets a conflict; assert no double-assignment.
|
||||
(Deterministic: drive via the conditional/If-Match update, not real threads.)
|
||||
- priority+age selection: among eligible queued jobs, claim returns the
|
||||
highest-priority then oldest.
|
||||
- deps gating: a job with unmet deps is `blocked` and NOT claimable; becomes
|
||||
claimable once deps reach shipped; depsMode:soft satisfied at testing; cycle
|
||||
rejected at submit.
|
||||
- FENCING: a state-mutating call with a stale leaseEpoch is rejected; the current
|
||||
epoch succeeds.
|
||||
- REAPER: an expired lease => job back to queued, leaseEpoch bumped, checkpoint
|
||||
preserved; running the reaper twice is idempotent.
|
||||
- HEARTBEAT updates lastHeartbeatAt/health; a stale factory is detectable.
|
||||
- IDEMPOTENT submit: same key+content => 1 job; key+changed content while queued =>
|
||||
superseded; otherwise 409.
|
||||
- routes: submit+claim+renew+release+heartbeat+patch via fastify inject (shared
|
||||
testing helpers); auth + productId enforced; invalid body rejected.
|
||||
|
||||
VERIFY GATE (must all pass before finishing):
|
||||
- pnpm --filter @lysnrai/platform-service typecheck
|
||||
- pnpm --filter @lysnrai/platform-service test (all new tests green; none weakened)
|
||||
- pnpm --filter @lysnrai/platform-service build
|
||||
Run the full repo gate too if quick: `pnpm build && pnpm test && pnpm typecheck`.
|
||||
|
||||
DOCS:
|
||||
- A module README (or docblock) describing each container, the claim/lease/fence
|
||||
protocol, and the reaper. In your REPORT, list which roadmap §4/§7/§8/§13/§18
|
||||
items are now satisfied (I will tick them in the devops-tools repo — you must NOT
|
||||
edit that repo).
|
||||
|
||||
CONSTRAINTS: follow items-module conventions precisely; ESM .js imports; no any; no
|
||||
console.log; productId on every doc; conventional commits
|
||||
(feat(platform-service): ...); do not touch template-managed infra.
|
||||
|
||||
FINAL OUTPUT — print the report in EXACTLY this format:
|
||||
|
||||
## Implementation Report — Phase 2 Foundation (fleet module + coordinator)
|
||||
### Branch & commits
|
||||
- branch / based-on / PR
|
||||
- commits: <sha> <message> (one per line)
|
||||
### Files changed
|
||||
- <path>: <one-line summary>
|
||||
### What was implemented
|
||||
- S1 data model: <containers, partition keys, etag handling>
|
||||
- S2 concurrency: <claim algorithm, lease/fencing via leaseEpoch, reaper, heartbeat>
|
||||
- idempotency + deps + cycle detection: <how>
|
||||
### Tests added
|
||||
- <test name>: <assertion> (esp. the atomic-claim race, fencing, reaper tests)
|
||||
- pnpm test summary: <N passed>
|
||||
### Verify gate results
|
||||
- typecheck / test / build (+ full-repo gate if run): <results>
|
||||
### Roadmap items now satisfied
|
||||
- §4: <...> §7: <...> §8: <...> §13: <...> §18: <...>
|
||||
### Deviations / assumptions
|
||||
- <datastore concurrency model, how the race test is made deterministic, anything stubbed>
|
||||
### Suggested next slice
|
||||
- Phase 2 Slice 3: factory-agent integration — agent-queue.sh registers/heartbeats/
|
||||
claims/reports against this coordinator behind a flag, preserving offline mode;
|
||||
plus the tracker echo wired through fleet_events.
|
||||
@ -1,84 +0,0 @@
|
||||
---
|
||||
engine: devin
|
||||
cwd: /Users/sd9235/code/mygh/learning_ai_common_plat
|
||||
yolo: true
|
||||
lock: common-plat-scheduler
|
||||
timeout: 4h
|
||||
---
|
||||
|
||||
ROLE: Senior backend engineer. Implement the PHASE 2 SCHEDULER / ROUTER CORE (§7)
|
||||
for the fleet coordinator: a deterministic, fixed-weight scoring engine that picks
|
||||
WHICH job a claiming factory gets, and wire it into the atomic claim.
|
||||
|
||||
PARALLEL-SAFETY (two other Devins are running — DO NOT collide):
|
||||
- You OWN: services/platform-service/src/modules/fleet/scheduler.ts (NEW),
|
||||
scheduler.test.ts (NEW), and the candidate-ranking section of coordinator.ts +
|
||||
coordinator.test.ts.
|
||||
- You MUST NOT touch: types.ts, repository.ts, routes.ts, cosmos-init.ts, server.ts
|
||||
(another Devin is editing those for fleet_artifacts). If you need a new type, define
|
||||
it inside scheduler.ts. If wiring truly requires a types.ts change, instead re-export
|
||||
from scheduler.ts. Import existing FleetJobDoc/FleetFactoryDoc from types.ts (read-only).
|
||||
- A third Devin is in a different repo (agent-queue) — no overlap.
|
||||
|
||||
READ FIRST:
|
||||
- services/platform-service/src/modules/fleet/coordinator.ts — claimNextJob /
|
||||
tryClaimJob: today it selects "highest-priority, oldest, deps-satisfied, capability-
|
||||
subset". You will replace the SELECTION step with the scoring engine (keep the atomic
|
||||
tryClaimJob CAS exactly as-is).
|
||||
- types.ts (read-only) — FleetJobDoc (priority, capabilities, budget, createdAt, deps,
|
||||
stage), FleetFactoryDoc (capabilities, health, load, seatLimit).
|
||||
- ../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §7 (the formula
|
||||
+ tie-breaks + phasing note: Phase 2 = fixed weights; Phase 3 = tunable + preemption).
|
||||
|
||||
PREREQUISITE / BRANCHING: branch off CURRENT main → feat/gigafactory-p2-scheduler.
|
||||
Push + open PR. DO NOT merge.
|
||||
|
||||
DELIVERABLES
|
||||
1. scheduler.ts (pure, no I/O, fully unit-testable):
|
||||
- Weight config (fixed defaults, overridable via a passed-in object — NOT env here):
|
||||
score = w1·capabilityFit + w2·affinity(prefersEngine/repo-stickiness)
|
||||
+ w3·(1/(1+load)) + w4·costFit(budget) + w5·health − w6·starvationPenalty(age)
|
||||
- `scoreCandidate(job, factory, ctx, weights?) → { score, breakdown }` — return the
|
||||
per-term breakdown for explainability (§7/Phase-3 readiness).
|
||||
- `selectJob(candidates: FleetJobDoc[], factory, ctx, weights?) → FleetJobDoc | null` —
|
||||
filter to deps-satisfied + capability-subset (reuse the coordinator's existing
|
||||
predicates; if they're inline, extract pure helpers INTO scheduler.ts), then rank by
|
||||
score; deterministic tie-break: higher priority → older createdAt → lower cost class.
|
||||
- Pure, synchronous, no datastore calls. Health/load come from the factory doc; age
|
||||
from job.createdAt vs ctx.now (coordinator-authoritative time, passed in).
|
||||
2. Wire into coordinator.claimNextJob: replace the ad-hoc selection with
|
||||
`selectJob(...)`, passing the existing candidate set + the claiming factory + ctx.now.
|
||||
Keep tryClaimJob's rev/updateIfMatch CAS and lease/fence logic byte-for-byte unchanged.
|
||||
If the claim has no factory capabilities/health context today, thread the minimal fields
|
||||
through ClaimContext (additive, in coordinator.ts only).
|
||||
|
||||
TESTS (scheduler.test.ts + additions to coordinator.test.ts — tests are sacred):
|
||||
- capabilityFit: a factory missing a required cap → candidate filtered out (never selected).
|
||||
- priority dominates when all else equal; age breaks ties deterministically.
|
||||
- load: higher-load factory lowers score (1/(1+load)); health: degraded < ok.
|
||||
- starvation: an old low-priority job eventually outranks a fresh low-priority one.
|
||||
- costFit: a job exceeding the factory/budget cost class is penalized/last.
|
||||
- breakdown: scoreCandidate returns each weighted term (sums to score).
|
||||
- selectJob determinism: same inputs → same pick across runs; empty/no-eligible → null.
|
||||
- coordinator integration: claimNextJob still returns exactly one winner under the existing
|
||||
concurrency tests (all prior fleet tests stay green); selection now follows the score.
|
||||
|
||||
VERIFY GATE:
|
||||
- pnpm --filter @lysnrai/platform-service exec vitest run src/modules/fleet (all green)
|
||||
- pnpm --filter @lysnrai/platform-service build
|
||||
- pnpm build && pnpm test (no regression)
|
||||
|
||||
CONSTRAINTS: ESM .js imports; no any; no console.log; fixed weights this phase (tunable +
|
||||
preemption are Phase 3 — do NOT build them); pure scheduler (no I/O); conventional commits
|
||||
(feat(platform-service): ...); do not touch the files reserved above; do not edit the
|
||||
agent-queue repo.
|
||||
|
||||
FINAL OUTPUT — report in EXACTLY this format:
|
||||
## Implementation Report — Phase 2 Scheduler/Router Core (§7)
|
||||
### Branch & commits / PR
|
||||
### Files changed
|
||||
### What was implemented (scoring terms, tie-breaks, coordinator wiring)
|
||||
### Tests added (+ pnpm test summary)
|
||||
### Verify gate results
|
||||
### Deviations / assumptions (what ctx fields were threaded, weight defaults chosen)
|
||||
### Suggested next slice (Phase 3 tunable weights + preemption + explainability UI)
|
||||
@ -1,125 +0,0 @@
|
||||
---
|
||||
engine: devin
|
||||
cwd: /Users/sd9235/code/mygh/learning_ai_common_plat
|
||||
yolo: true
|
||||
lock: common-plat
|
||||
timeout: 4h
|
||||
---
|
||||
|
||||
ROLE: Senior backend/distributed-systems engineer. Implement Phase 2 — Slice 1:
|
||||
the FLEET DATA MODEL + REPOSITORIES as a new platform-service module. This is the
|
||||
durable backbone (§13) that supersedes the single-host stand-ins. NO atomic
|
||||
claim/lease/fencing logic yet — that is Phase 2 Slice 2. This slice is schemas,
|
||||
repositories, container registration, basic guarded CRUD, and tests.
|
||||
|
||||
NOTE: This runs in a DIFFERENT repo (learning_ai_common_plat), so it does NOT
|
||||
conflict with the agent-queue (devops-tools) slices and can run independently.
|
||||
|
||||
READ FIRST (this may NOT match the platform-service layout you assume — verify the conventions):
|
||||
- services/platform-service/src/modules/items/{types,repository,routes}.ts — copy
|
||||
this module pattern EXACTLY (types.ts -> repository.ts -> routes.ts, Zod schemas,
|
||||
the cloud-agnostic datastore, productId on every doc, req.log/app.log).
|
||||
- packages/cosmos (container registry) + how existing modules register containers.
|
||||
- The fleet container spec is roadmap §13 (agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md),
|
||||
which lives in the devops-tools repo at ../learning_ai_devops_tools — read it for
|
||||
the field lists (fleet_jobs incl. bodyMd + checkpoint; fleet_runs incl. token/
|
||||
cost/tool/diff insights; fleet_leases incl. leaseEpoch; fleet_factories;
|
||||
fleet_profiles; fleet_events; fleet_artifacts) and §25/§26.
|
||||
|
||||
PREREQUISITE / BRANCHING:
|
||||
- Branch off CURRENT `main` of learning_ai_common_plat.
|
||||
- New branch: feat/gigafactory-p2-slice1. Push + open a PR. DO NOT merge.
|
||||
|
||||
STRICT SCOPE:
|
||||
- Add a NEW module: services/platform-service/src/modules/fleet/ (+ its tests).
|
||||
- Register the new Cosmos containers via the existing registration path.
|
||||
- Do NOT modify unrelated modules. Do NOT hand-edit shared infra (.npmrc,
|
||||
docker-prep.sh, tsconfig.base, pnpm-workspace) — those are template-managed.
|
||||
- ESM everywhere ("type": "module", .js import suffixes). No `any` (Zod inference
|
||||
or explicit types). No console.log (use req.log/app.log). Every Cosmos doc has
|
||||
productId. Tests are sacred.
|
||||
|
||||
DELIVERABLES
|
||||
|
||||
1. types.ts — Zod schemas + inferred types for each container, each with productId:
|
||||
- FleetJobDoc (pk /productId): manifestSnapshot, bodyMd (verbatim instructions),
|
||||
stage, idempotencyKey, trackerItemId?, parentId?, kind ('leaf'|'composite',
|
||||
default 'leaf'), checkpoint? { wipBranch, wipBase, wipCommit }, priority,
|
||||
capabilities[], engineClass?, profile?, deps[], depsMode?, timestamps.
|
||||
- FleetRunDoc (pk /jobId): jobId, attempt, factoryId?, engine, profileSnapshot?,
|
||||
startedAt, endedAt?, exit?, verifyResult?, result?, and insights: model?,
|
||||
tokensIn?, tokensOut?, tokensCached?, costUsd?, estimated?, turns?, toolCalls?,
|
||||
filesChanged?, linesAdded?, linesDeleted?.
|
||||
- FleetLeaseDoc (pk /jobId): jobId, holderFactoryId?, expiresAt?, leaseEpoch
|
||||
(number, default 0), renewals, status. (Fields only — reclaim/claim logic is S2.)
|
||||
- FleetFactoryDoc (pk /productId): factoryId, descriptor, capabilities[], health,
|
||||
load, lastHeartbeatAt, seatLimit.
|
||||
- FleetProfileDoc (pk /productId): name, version, immutable snapshot (persona,
|
||||
defaults). FleetEventDoc (pk /jobId): append-only event { type, at, data }.
|
||||
FleetArtifactDoc (pk /jobId): pointers to blob-stored artifacts (no inline logs).
|
||||
- Define enums for stage and result that MATCH the agent-queue lifecycle.
|
||||
|
||||
2. repository.ts — one repository per container using the existing datastore
|
||||
abstraction (so DB_PROVIDER=memory works in tests, cosmos in prod):
|
||||
- CRUD: create, getById, list (by productId; jobs also by stage), update
|
||||
(optimistic via _etag where the datastore supports it — expose the etag,
|
||||
even though the ATOMIC claim flow is S2), delete where sensible.
|
||||
- appendEvent(jobId, event) for the append-only fleet_events stream.
|
||||
- All queries partition-aware; no cross-partition fan-out in hot paths.
|
||||
|
||||
3. container registration — register all fleet_* containers with correct partition
|
||||
keys via the existing cosmos container registry; memory provider auto-handles.
|
||||
|
||||
4. routes.ts — minimal guarded REST under the existing auth + productId middleware:
|
||||
- POST /fleet/jobs (create), GET /fleet/jobs (list by stage/productId),
|
||||
GET /fleet/jobs/:id, PATCH /fleet/jobs/:id (stage/fields), and read endpoints
|
||||
for runs (GET /fleet/jobs/:id/runs) + events. Keep it thin — claim/lease
|
||||
endpoints are S2. Validate all bodies with the Zod schemas.
|
||||
- Register the route module in the platform-service app the same way items does.
|
||||
|
||||
TESTS (Vitest — write alongside; memory provider; tests sacred):
|
||||
- schema validation: valid docs pass; missing productId / bad enum fail with
|
||||
precise errors; at least one invalid case per container.
|
||||
- repository CRUD round-trip per container (create→get→list→update→delete) on the
|
||||
memory provider; list filters by productId and by stage (jobs).
|
||||
- appendEvent produces an ordered, append-only stream for a jobId.
|
||||
- routes: create+get+list+patch a job via fastify inject (use the shared testing
|
||||
helpers); auth/productId enforced; invalid body rejected.
|
||||
- _etag surfaced on update (lost-update guard groundwork) — assert the etag flows.
|
||||
|
||||
VERIFY GATE (must pass):
|
||||
- pnpm --filter @lysnrai/platform-service typecheck
|
||||
- pnpm --filter @lysnrai/platform-service test (new tests green; none weakened)
|
||||
- pnpm --filter @lysnrai/platform-service build
|
||||
|
||||
DOCS:
|
||||
- Short module README or header docblock describing the containers + that
|
||||
claim/lease/fencing is Phase 2 Slice 2.
|
||||
- You may NOT edit the roadmap in ../learning_ai_devops_tools (different repo) — instead
|
||||
note in your report which §13 items are now satisfied so I can tick them.
|
||||
|
||||
CONSTRAINTS: follow the items-module conventions precisely; ESM .js imports; no any;
|
||||
no console.log; productId everywhere; conventional commits (feat(platform-service):
|
||||
...); do not touch template-managed infra files.
|
||||
|
||||
FINAL OUTPUT — print the report in EXACTLY this format:
|
||||
|
||||
## Implementation Report — Phase 2 Slice 1
|
||||
### Branch & commits
|
||||
- branch / based-on / PR
|
||||
- commits: <sha> <message>
|
||||
### Files changed
|
||||
- <path>: <one-line summary>
|
||||
### What was implemented (1-4)
|
||||
- containers + schemas + repos + routes; partition keys; etag handling
|
||||
### Tests added
|
||||
- <test name>: <assertion> (+ pnpm test summary: N passed)
|
||||
### Verify gate results
|
||||
- typecheck / test / build: <results>
|
||||
### §13 items now satisfied
|
||||
- <list which roadmap §13 boxes are done so the human can tick them>
|
||||
### Deviations / assumptions
|
||||
- <datastore/etag/provider choices>
|
||||
### Suggested next slice
|
||||
- Phase 2 Slice 2: atomic claim (_etag/If-Match) + lease renew/release + heartbeat
|
||||
+ reaper + fencing (leaseEpoch) — the concurrency core.
|
||||
@ -1,156 +0,0 @@
|
||||
---
|
||||
engine: devin
|
||||
cwd: /Users/sd9235/code/mygh/learning_ai_devops_tools
|
||||
yolo: true
|
||||
lock: agent-queue
|
||||
timeout: 4h
|
||||
---
|
||||
|
||||
ROLE: Senior bash + distributed-systems engineer. Implement PHASE 2 SLICE 3 —
|
||||
FACTORY-AGENT INTEGRATION: make the single-host `agent-queue.sh` runner act as a
|
||||
"factory" that registers / heartbeats / claims / reports against the already-merged
|
||||
`fleet` coordinator in platform-service, **behind a feature flag**, while keeping
|
||||
the existing offline git-queue path 100% intact when the flag is off.
|
||||
|
||||
NON-NEGOTIABLE DESIGN RULE (prevents merge churn + regressions):
|
||||
- Put ALL coordinator-client logic in a NEW separate file `agent-queue/lib/fleet-client.sh`
|
||||
that `agent-queue.sh` sources. Touch `agent-queue.sh` only at a few well-defined hook
|
||||
points (claim source, stage-transition reporting, dispatch/help). The offline git-queue
|
||||
code path MUST behave exactly as it does today when `AQ_FLEET` is unset/0.
|
||||
- Gate every coordinator interaction on `AQ_FLEET=1`. Default (unset) = today's offline
|
||||
behavior. All 53 existing selftest checks MUST still pass unchanged.
|
||||
|
||||
READ FIRST (verify the real contract — do not guess):
|
||||
- agent-queue/agent-queue.sh — the runner. Study: the manifest/lifecycle stages
|
||||
(queued→assigned→building→review→testing→shipped + blocked/failed/dead_letter),
|
||||
`run_worker`/`cmd_run`/`ship`/`promote`, the Slice-4 `tracker_api` curl wrapper +
|
||||
`_api_call` + awk JSON helpers (REUSE these patterns — POSIX awk, curl-only, no jq),
|
||||
and the Slice-4 auto-echo hooks. Mirror that style exactly.
|
||||
- agent-queue/selftest.sh — how stub-driven HTTP tests work (the tracker stub overrides
|
||||
the curl wrapper). Build the fleet stub the same way.
|
||||
- THE COORDINATOR CONTRACT (read-only, in the sibling repo
|
||||
../learning_ai_common_plat/services/platform-service/src/modules/fleet/routes.ts):
|
||||
all routes are registered under the `/api` prefix. Exact endpoints:
|
||||
POST /api/fleet/factories/heartbeat {factoryId, capabilities[], health, load}
|
||||
POST /api/fleet/claim {factoryId, capabilities[]} -> job + leaseEpoch + lease expiry (or empty)
|
||||
GET /api/fleet/jobs/:id
|
||||
PATCH /api/fleet/jobs/:id fenced stage transition: {stage, checkpoint?, leaseEpoch}
|
||||
POST /api/fleet/jobs/:id/lease/renew {leaseEpoch}
|
||||
POST /api/fleet/jobs/:id/lease/release {leaseEpoch}
|
||||
GET /api/fleet/jobs/:id/runs
|
||||
GET /api/fleet/jobs/:id/events
|
||||
Note: there is NO client-side "register factory" or "append event" endpoint — registration
|
||||
is the heartbeat upsert, and `fleet_events` are written SERVER-SIDE by the coordinator on
|
||||
each PATCH/claim. The coordinator owns `leaseEpoch` fencing: a PATCH/renew carrying a stale
|
||||
epoch is rejected (409/conflict).
|
||||
- ../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §7 (claim loop),
|
||||
§8 (factory/heartbeat/claim/report/drain), §9 (split-brain/offline-degrade), §18 (fencing).
|
||||
|
||||
PREREQUISITE / BRANCHING:
|
||||
- Branch off CURRENT `main` (Phase 1 complete; foundation + hardening merged).
|
||||
New branch: feat/gigafactory-p2-slice3. Commit in logical steps. Push + open a PR.
|
||||
DO NOT merge.
|
||||
|
||||
CONFIG BLOCK (env, in fleet-client.sh; document in README):
|
||||
- AQ_FLEET (0/1, default 0 — master switch; 0 = pure offline git-queue)
|
||||
- AQ_FLEET_API (default http://localhost:4003/api)
|
||||
- AQ_FLEET_TOKEN (bearer; never hardcode)
|
||||
- AQ_PRODUCT_ID (reuse the Slice-4 var; X-Product-Id header)
|
||||
- AQ_FACTORY_ID (default: hostname + short rand; stable per process)
|
||||
- AQ_FLEET_LEASE_RENEW_SEC (default 300), AQ_FLEET_CAPS (optional; overrides the auto-detected caps)
|
||||
|
||||
DELIVERABLES
|
||||
|
||||
1. `agent-queue/lib/fleet-client.sh` (new) — a sourced library, curl-only + POSIX awk
|
||||
(reuse Slice-4 helpers; do not add deps):
|
||||
- `fleet_enabled` — returns true iff AQ_FLEET=1 (guard for every other fn).
|
||||
- `fleet_api METHOD PATH [json]` — curl wrapper adding bearer + X-Product-Id; returns
|
||||
body; captures HTTP code; non-2xx is logged and surfaced (never crashes the runner).
|
||||
- `fleet_detect_caps` — reuse the runner's existing capability auto-detection (os, engines,
|
||||
tools) to build the capabilities array.
|
||||
- `fleet_heartbeat` — POST factories/heartbeat (registration == first heartbeat); call at
|
||||
loop start + every AQ_FLEET_LEASE_RENEW_SEC during long runs.
|
||||
- `fleet_claim` — POST /fleet/claim with caps; parse job id + bodyMd + leaseEpoch + lease
|
||||
expiry; materialize a transient local job file (reuse the Slice-4 from-tracker
|
||||
materialization) so the existing runner executes it unchanged. Store leaseEpoch in the
|
||||
job meta.
|
||||
- `fleet_report STAGE [checkpoint]` — PATCH /fleet/jobs/:id with {stage, checkpoint?,
|
||||
leaseEpoch}. **Fencing-aware:** if the coordinator returns conflict/409 (stale epoch),
|
||||
the worker MUST self-abort the job (stop work, do NOT ship/merge) and log a fenced-abort
|
||||
event — a reclaimed/zombie worker can never corrupt coordinator state.
|
||||
- `fleet_lease_renew` / `fleet_lease_release` — fenced; renew on a timer while building;
|
||||
release on terminal stages.
|
||||
- `fleet_checkpoint` — capture {wipBranch, wipCommit} and send via fleet_report so a
|
||||
reclaim can resume (durability, §25).
|
||||
|
||||
2. Wire `agent-queue.sh` at MINIMAL hook points (all guarded by `fleet_enabled`):
|
||||
- source `lib/fleet-client.sh` near the top.
|
||||
- claim: when AQ_FLEET=1 and the local inbox is empty, try `fleet_claim` before idling
|
||||
(coordinator jobs interleave with local `.md` files; local files still work).
|
||||
- stage transitions (building/review/testing/shipped/failed): call `fleet_report` +
|
||||
checkpoint — when AQ_FLEET=1 this REPLACES the Slice-4 direct tracker echo
|
||||
(the coordinator records `fleet_events`, becoming the audit source of truth → "tracker
|
||||
echo routed through fleet_events"); keep the direct tracker echo as the offline path.
|
||||
- heartbeat timer in the run loop; lease renew while a fleet job is building; release on done.
|
||||
- new subcommands: `aq fleet-status` (heartbeat + show claimable count) and surface
|
||||
factoryId/leaseEpoch in `status`; add to dispatch + help.
|
||||
|
||||
3. OFFLINE-DEGRADE + SPLIT-BRAIN (§9/§18): if the coordinator is unreachable mid-job, the
|
||||
runner finishes the in-flight job locally and reconciles on the next reachable call; on
|
||||
reconnect it presents its leaseEpoch — if the coordinator reports it stale (reclaimed),
|
||||
the local result is quarantined (marked, NOT auto-shipped) and surfaced for human triage.
|
||||
|
||||
TESTS — extend `agent-queue/selftest.sh` (stub the fleet API exactly like the tracker stub;
|
||||
tests are sacred, all 53 prior checks stay green):
|
||||
- flag off (default): AQ_FLEET unset → ZERO fleet API calls; existing offline flow identical
|
||||
(re-assert a couple of the offline cases under flag-off).
|
||||
- heartbeat/register: AQ_FLEET=1 loop start → stub receives POST factories/heartbeat with caps.
|
||||
- claim: stub returns a job → runner materializes a local job (bodyMd + leaseEpoch in meta)
|
||||
and executes it to review/.
|
||||
- report + checkpoint: building/review/testing → stub receives PATCH /fleet/jobs/:id with the
|
||||
correct stage + leaseEpoch (+ checkpoint on building).
|
||||
- FENCING: stub returns conflict on PATCH (stale epoch) → worker self-aborts, job NOT shipped,
|
||||
a fenced-abort is logged/surfaced.
|
||||
- lease renew: long-running stub → at least one renew call with current leaseEpoch.
|
||||
- offline-degrade: stub returns connection error mid-job → job still completes locally; on
|
||||
next call presenting a now-stale epoch → result quarantined (not auto-shipped).
|
||||
- no-leak: assert the prompt/bodyMd and the token are never sent in any report/comment payload
|
||||
where they do not belong (reuse the Slice-4 sentinel check).
|
||||
|
||||
VERIFY GATE (must all pass):
|
||||
- bash agent-queue/selftest.sh (all prior 53 + new fleet cases green; none weakened)
|
||||
- bash -n agent-queue/agent-queue.sh && bash -n agent-queue/lib/fleet-client.sh
|
||||
- node --check agent-queue/dashboard.mjs (if present/unchanged)
|
||||
- shellcheck --severity=error agent-queue/agent-queue.sh agent-queue/lib/fleet-client.sh
|
||||
|
||||
DOCS:
|
||||
- README: a "Fleet integration (Phase 2)" section — the AQ_FLEET flag, env table, the
|
||||
claim/heartbeat/report/fence/renew protocol, offline-degrade + quarantine behavior, and a
|
||||
one-paragraph "offline vs fleet mode" explainer.
|
||||
- Tick the relevant §8/§9/§14 Phase-2 boxes in GIGAFACTORY_ROADMAP.md with a P2-S3 slice note.
|
||||
|
||||
CONSTRAINTS: bash + curl + POSIX awk only (no jq, no new deps); reuse Slice-4 helpers; never
|
||||
hardcode tokens/secrets; offline path unchanged when AQ_FLEET unset; conventional commits
|
||||
(feat(agent-queue): ...); never weaken a test; do not edit the sibling common-plat repo.
|
||||
|
||||
FINAL OUTPUT — print the report in EXACTLY this format:
|
||||
|
||||
## Implementation Report — Phase 2 Slice 3 (factory-agent integration)
|
||||
### Branch & commits / PR
|
||||
### Files changed
|
||||
- <path>: <summary>
|
||||
### What was implemented
|
||||
- fleet-client.sh: <functions + flag gating>
|
||||
- agent-queue.sh hook points: <the few places touched + why minimal>
|
||||
- fencing + offline-degrade + quarantine: <how>
|
||||
- tracker echo via fleet_events: <how>
|
||||
### Tests added
|
||||
- <name>: <assertion> (esp. flag-off no-op, claim, fenced self-abort, offline quarantine)
|
||||
- selftest summary: <N checks = 53 prior + M new>
|
||||
### Verify gate results
|
||||
- selftest / bash -n / node --check / shellcheck: <results>
|
||||
### Deviations / assumptions
|
||||
- <claim/lease contract details, anything stubbed, how registration maps to heartbeat>
|
||||
### Suggested next slice
|
||||
- Phase 2 remaining: scheduler/router wiring, factory enrollment + scoped tokens, feature-flag
|
||||
shadow/dual-run, and the two-factory parallel demo (Phase 2 exit criteria).
|
||||
@ -1,120 +0,0 @@
|
||||
---
|
||||
engine: devin
|
||||
cwd: /Users/sd9235/code/mygh/learning_ai_common_plat
|
||||
yolo: true
|
||||
lock: common-plat-tracker
|
||||
timeout: 4h
|
||||
---
|
||||
|
||||
ROLE: Senior backend engineer. Implement the PHASE 2 DIRECT TRACKER -> MODULE WIRING
|
||||
(§10) for the fleet coordinator: a service-side bridge that turns a tracker Item into
|
||||
a fleet job (submitted through the coordinator, so it is routed by the §7 scheduler),
|
||||
and echoes the job's lifecycle back onto the Item — the full task<->job ROUND-TRIP,
|
||||
in-process, with no shell hop. This closes the §10 "direct tracker->module calls" box.
|
||||
|
||||
PARALLEL-SAFETY: One other Devin is running in a DIFFERENT repo (learning_ai_devops_tools,
|
||||
the two-factory demo). There is NO other Devin in this repo, so you may edit any fleet
|
||||
file you need. Do NOT edit the agent-queue repo.
|
||||
|
||||
READ FIRST (understand the contracts before writing):
|
||||
- services/platform-service/src/modules/fleet/coordinator.ts
|
||||
- submitJob(productId, SubmitJobInput) -> { job, outcome } : idempotent submit; the
|
||||
job already has a `trackerItemId` field (types.ts) — reuse it, do NOT add a new one.
|
||||
- claimNextJob(ctx) already routes candidates through the §7 scheduler (selectJob).
|
||||
You do NOT change claim/scheduler — tracker jobs flow through the SAME path.
|
||||
- patchJobFenced / stage transitions — the lifecycle you will mirror to the tracker.
|
||||
- services/platform-service/src/modules/fleet/types.ts — FleetJobDoc.stage values,
|
||||
SubmitJobSchema (trackerItemId, idempotencyKey, priority, capabilities, budget, kind).
|
||||
- services/platform-service/src/modules/fleet/routes.ts — existing fleet route patterns
|
||||
(auth, getRequestProductId(req), Zod parse, productId enforcement). Add new routes here
|
||||
in the SAME style.
|
||||
- services/platform-service/src/modules/items/{types,routes,repository}.ts — the Item
|
||||
API contract you mirror to: Item fields (id, productId, title/description, status,
|
||||
labels[]), the status vocabulary, and the comment/note mechanism. Call the items
|
||||
repository DIRECTLY in-process (no HTTP/curl) — this is the whole point of "direct wiring".
|
||||
- ../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §10 (tracker
|
||||
integration), §24.5 (echo rule), §14 Phase-2 checklist (the §10 box you will tick).
|
||||
|
||||
PREREQUISITE / BRANCHING: branch off CURRENT main -> feat/gigafactory-p2-tracker-wiring.
|
||||
Push + open a PR. DO NOT merge.
|
||||
|
||||
DELIVERABLES
|
||||
|
||||
1. tracker-bridge.ts (NEW) — pure-ish service module (it may call the items + fleet
|
||||
repositories, but no HTTP, no Fastify types inside it):
|
||||
- `ingestItemAsJob(productId, itemId, opts?) -> { job, outcome }`:
|
||||
* read the Item via the items repository (404 -> NotFoundError).
|
||||
* map Item -> SubmitJobInput: title/description -> bodyMd (verbatim instruction);
|
||||
labels carry manifest hints where present (engine-class:*, profile:*,
|
||||
priority:*, cap:* -> capabilities[]); otherwise sane defaults.
|
||||
* set trackerItemId = itemId and a STABLE idempotency-key (e.g. `tracker-<itemId>`),
|
||||
then call coordinator.submitJob — so re-ingest of the same Item dedupes (no
|
||||
duplicate job) and the job is scheduled by the §7 router like any other.
|
||||
- `echoJobToItem(productId, jobId) -> { echoed: status | null }`:
|
||||
* load the job; if it has no trackerItemId -> no-op (return null).
|
||||
* map stage -> Item status (FULL round-trip, both directions of the lifecycle):
|
||||
queued/assigned/building/review/testing -> in_progress
|
||||
shipped -> done
|
||||
failed -> blocked (+ note)
|
||||
* append a comment/note with metrics ONLY (attempts, duration, cost/tokens if
|
||||
present) — NEVER the prompt body / secrets.
|
||||
* IDEMPOTENT: persist the last-echoed status (on the job doc or a small bridge
|
||||
record) and make a re-echo of an unchanged outcome a no-op.
|
||||
- Echo is BEST-EFFORT and downstream: an items-write failure NEVER fails the job —
|
||||
surface it as a logged error / a `{ echoed: null, error }` shape, never throw into
|
||||
the job lifecycle.
|
||||
|
||||
2. Wire echo into stage transitions (server-side, opt-in, additive):
|
||||
- When the coordinator/route performs a stage transition for a job that has a
|
||||
trackerItemId, call echoJobToItem (guarded by a config flag, default OFF, e.g.
|
||||
FLEET_TRACKER_ECHO; OFF => behavior byte-for-byte unchanged). Do not block or fail
|
||||
the transition on echo error.
|
||||
|
||||
3. Routes (routes.ts, additive — match existing auth/productId style):
|
||||
- POST /fleet/tracker/ingest { itemId } -> ingestItemAsJob
|
||||
- POST /fleet/tracker/echo { jobId } -> echoJobToItem (manual echo)
|
||||
- All productId-scoped via getRequestProductId(req); a foreign productId cannot ingest
|
||||
or echo another product's Item/job.
|
||||
|
||||
TESTS (tracker-bridge.test.ts + route additions — tests are sacred; use @bytelyst/testing
|
||||
+ the in-memory providers; NO live HTTP):
|
||||
- ingest creates exactly one job: Item -> job with trackerItemId set, bodyMd = description,
|
||||
idempotency-key = tracker-<id>; the job is claimable via the normal claimNextJob path.
|
||||
- ingest label mapping: labels [engine-class:agentic-coder, priority:high, cap:os:mac]
|
||||
-> job priority/capabilities reflect them.
|
||||
- ingest idempotent: ingesting the same Item twice -> one job (dedupe), outcome reflects it.
|
||||
- echo round-trip: a job advancing queued->building->shipped drives the Item
|
||||
in_progress -> done, and a metrics-only comment is written (assert NO bodyMd/secret leaks).
|
||||
- echo failed -> Item blocked (+ note).
|
||||
- echo idempotent: re-echo of an unchanged stage -> no duplicate Item write.
|
||||
- echo non-fatal: items-write throws -> echoJobToItem returns { echoed:null,error }, the
|
||||
job state is untouched, the transition still succeeds.
|
||||
- echo OFF (default flag): a stage transition performs ZERO items writes.
|
||||
- productId isolation: ingest/echo for a foreign productId -> not found / rejected.
|
||||
- REGRESSION: every existing fleet + items test stays green.
|
||||
|
||||
VERIFY GATE:
|
||||
- pnpm --filter @lysnrai/platform-service exec vitest run src/modules/fleet src/modules/items
|
||||
- pnpm --filter @lysnrai/platform-service build
|
||||
- pnpm build && pnpm test (no consumer regression)
|
||||
|
||||
CONSTRAINTS: ESM `.js` import specifiers; no `any` (Zod inference / explicit types); no
|
||||
console.log (use app.log / req.log); every Cosmos doc keeps `productId`; reuse the
|
||||
existing `trackerItemId` field and items contract — do NOT fork a parallel schema;
|
||||
do NOT change claimNextJob or the scheduler; conventional commits
|
||||
(feat(platform-service): ...); do not edit the agent-queue repo.
|
||||
|
||||
DOCS: tick §10 "direct tracker->module calls" in
|
||||
../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §14 Phase-2 (note the
|
||||
flag name + that it is the in-process round-trip; the agent-queue shell adapter remains the
|
||||
single-host path).
|
||||
|
||||
FINAL OUTPUT — report in EXACTLY this format:
|
||||
## Implementation Report — Phase 2 Direct Tracker -> Module Wiring (§10)
|
||||
### Branch & commits / PR
|
||||
### Files changed
|
||||
### What was implemented (ingest mapping, round-trip status map, echo idempotency + flag)
|
||||
### Tests added (+ pnpm test summary)
|
||||
### Verify gate results
|
||||
### Deviations / assumptions (Item status vocabulary matched, flag name, where last-echoed is stored)
|
||||
### Suggested next slice (Phase 3 tracker-web fleet control plane)
|
||||
@ -1,91 +0,0 @@
|
||||
---
|
||||
engine: devin
|
||||
cwd: /Users/sd9235/code/mygh/learning_ai_devops_tools
|
||||
yolo: true
|
||||
lock: devops-tools-demo
|
||||
timeout: 4h
|
||||
---
|
||||
|
||||
ROLE: Senior engineer. Build the PHASE 2 TWO-FACTORY PARALLEL DEMO — the final
|
||||
Phase-2 EXIT-CRITERIA box (§14): >=2 factories executing jobs in parallel via the
|
||||
coordinator, proving conflict-free atomic claims, lease fencing, and reaper-reclaim
|
||||
end-to-end. This is a DEMO HARNESS + DOCS, not new runtime behavior — agent-queue.sh
|
||||
and lib/fleet-client.sh already implement everything; you orchestrate + observe them.
|
||||
|
||||
PARALLEL-SAFETY: One other Devin is running in a DIFFERENT repo (learning_ai_common_plat,
|
||||
the tracker-wiring slice) — no overlap. In THIS repo you OWN a NEW demo directory and the
|
||||
additive selftest/docs only:
|
||||
- You OWN (create/edit): agent-queue/demo/two-factory-demo.sh (NEW),
|
||||
agent-queue/demo/README.md (NEW), additive checks in agent-queue/selftest.sh,
|
||||
and the §14 Phase-2 demo/exit-criteria ticks in agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md.
|
||||
- You MUST NOT change the behavior of agent-queue.sh or lib/fleet-client.sh. You may READ
|
||||
them and CALL them; if a tiny additive hook is unavoidable, keep it flag-gated and prove
|
||||
all 68 existing selftest checks still pass byte-for-byte.
|
||||
- Leave the runtime agent-queue/queue/* working-tree artifacts ALONE (live-daemon state,
|
||||
not yours) — never stage or commit them.
|
||||
|
||||
READ FIRST:
|
||||
- agent-queue/agent-queue.sh — the run loop, AQ_FLEET / AQ_FLEET_ROUTE flags, claim path,
|
||||
fencing/quarantine, offline-degrade.
|
||||
- agent-queue/lib/fleet-client.sh — fleet_register/heartbeat, claim, lease renew, fenced
|
||||
PATCH, the coordinator HTTP wrappers and their env (AQ_FLEET_API, AQ_FLEET_TOKEN, factory id).
|
||||
- agent-queue/selftest.sh — how the EXISTING fleet tests STUB the coordinator (the canned
|
||||
responder pattern). Reuse that exact stub style so the demo's selftest needs NO live service.
|
||||
- ../learning_ai_common_plat/services/platform-service/src/modules/fleet/coordinator.ts —
|
||||
the claim/lease/fence/reaper contract you are demonstrating (read-only; do not edit).
|
||||
- agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §14 Phase-2 "Two-factory demo" + "Exit criteria".
|
||||
|
||||
DELIVERABLES
|
||||
|
||||
1. agent-queue/demo/two-factory-demo.sh — an orchestration script that:
|
||||
- Starts >=2 factories (distinct factoryIds, e.g. mac-1 + ubuntu-1) against ONE
|
||||
coordinator with AQ_FLEET=1 AQ_FLEET_ROUTE=1, each in its own working dir/queue so
|
||||
they do not share local inbox state — they compete ONLY through the coordinator.
|
||||
- Submits 3 jobs and lets the two factories drain them in parallel.
|
||||
- DEMONSTRATES + ASSERTS the Phase-2 exit guarantees:
|
||||
(a) no double-assign: each job is claimed/executed by exactly ONE factory.
|
||||
(b) fencing: kill a factory MID-JOB -> the reaper returns the job -> the OTHER
|
||||
factory reclaims and completes it AND the dead worker's late/zombie report is
|
||||
FENCED (rejected, never shipped).
|
||||
(c) parallelism: both factories make progress concurrently (not serialized).
|
||||
- Prints a clear PASS/FAIL summary (per-job winner, reclaim event, fence event).
|
||||
- DUAL MODE: works against a real coordinator when AQ_FLEET_API/AQ_FLEET_TOKEN are set;
|
||||
otherwise drives the SAME selftest coordinator STUB so the demo is runnable + CI-safe
|
||||
with zero external deps. Document both invocations.
|
||||
- bash, mac+linux safe, curl-only, no new runtime deps; style consistent with the repo.
|
||||
|
||||
2. agent-queue/demo/README.md — how to run the demo (stub mode + real-coordinator mode),
|
||||
the env vars, what each asserted guarantee proves, and a short "what to watch" guide
|
||||
(the kanban/log lines that show the reclaim + fence).
|
||||
|
||||
3. selftest.sh — ADD a small number of checks (do NOT modify the existing 68) that run the
|
||||
demo in STUB mode headlessly and assert: 3 jobs all reach a terminal state across the 2
|
||||
factories with no double-assignment; the kill -> reclaim -> fenced-zombie path fires;
|
||||
exit 0. Keep them fast + deterministic (seeded, no real sleeps where avoidable).
|
||||
|
||||
TESTS / VERIFY GATE:
|
||||
- bash agent-queue/selftest.sh -> all prior 68 + the new demo checks green, exit 0.
|
||||
- bash -n agent-queue/demo/two-factory-demo.sh && bash -n agent-queue/agent-queue.sh
|
||||
&& bash -n agent-queue/lib/fleet-client.sh -> OK.
|
||||
- shellcheck --severity=error on the new script + the two core scripts -> clean.
|
||||
- node --check agent-queue/dashboard.mjs -> OK (must remain unchanged).
|
||||
|
||||
CONSTRAINTS: do NOT alter agent-queue.sh / fleet-client.sh runtime behavior; reuse the
|
||||
existing coordinator stub pattern; never commit queue/* runtime artifacts; mac+linux safe;
|
||||
no emojis; conventional commits (feat(agent-queue): ...); tests sacred (the 68 stay green).
|
||||
|
||||
DOCS: tick the §14 Phase-2 "Two-factory demo" box and, once the demo asserts all three
|
||||
guarantees, the Phase-2 "Exit criteria" line in GIGAFACTORY_ROADMAP.md — set §0 Phase 2 ->
|
||||
complete (or note the exact remaining %). This is the box that closes Phase 2.
|
||||
|
||||
FINAL OUTPUT — report in EXACTLY this format:
|
||||
## Implementation Report — Phase 2 Two-Factory Parallel Demo (Exit Criteria)
|
||||
### Branch & commits / PR
|
||||
- branch / based-on: feat/gigafactory-p2-two-factory-demo off current main
|
||||
### Files changed
|
||||
### What was implemented (orchestration, the 3 asserted guarantees, stub vs real mode)
|
||||
### Tests added (+ selftest PASS/FAIL summary: prior 68 + new)
|
||||
### Verify gate results (selftest / bash -n / shellcheck / node --check)
|
||||
### Deviations / assumptions (how factories are isolated, how kill/reclaim is simulated in stub)
|
||||
### Phase 2 status (which §14 boxes now complete; exit criteria met Y/N; what (if anything) remains)
|
||||
### Suggested next slice (Phase 3 — tracker-web fleet control plane + DAG + budgets)
|
||||
@ -1,162 +0,0 @@
|
||||
---
|
||||
engine: devin
|
||||
cwd: /Users/sd9235/code/mygh/learning_ai_common_plat
|
||||
yolo: true
|
||||
lock: common-plat-phase3
|
||||
timeout: 10h
|
||||
---
|
||||
|
||||
ROLE: Senior full-stack engineer. Implement PHASE 3 of the Agent Gigafactory END-TO-END
|
||||
in `learning_ai_common_plat`, SEQUENTIALLY, over a long unattended run: smart routing
|
||||
(tunable weights + preemption), DAG job decomposition, per-product budgets, and the
|
||||
tracker-web fleet control plane. Work SLICE BY SLICE; each slice is self-contained,
|
||||
fully tested, and pushed before the next begins. This is an overnight run — favor
|
||||
correctness, small verifiable steps, and never leaving main/PR in a broken state.
|
||||
|
||||
================================================================================
|
||||
PREREQUISITE (the operator guarantees this before starting): Phase 2 is COMPLETE and
|
||||
merged to origin/main — fleet foundation, atomic claim, scheduler/router core, artifacts,
|
||||
enrollment+tokens, feature-flags/shadow, the in-process tracker->module wiring, and the
|
||||
two-factory demo are ALL on main. You branch off CURRENT origin/main.
|
||||
================================================================================
|
||||
|
||||
GLOBAL GUARDRAILS (unattended danger mode — obey strictly):
|
||||
- Branch: feat/gigafactory-phase3 off CURRENT origin/main. ONE long-lived branch; ONE
|
||||
commit per slice (conventional commits). Push after EVERY slice. Open ONE PR after
|
||||
Slice 1 and keep pushing to it. DO NOT MERGE anything. DO NOT touch origin/main.
|
||||
- Tests are SACRED: never delete, weaken, skip, or `.skip`/`.only` a test to go green.
|
||||
If you cannot make a slice pass honestly, see the FAILURE PROTOCOL below.
|
||||
- A slice is "done" only when its VERIFY GATE is fully green. Never start slice N+1 with
|
||||
slice N red.
|
||||
- Reserved / DO NOT TOUCH: the agent-queue repo (different repo), unrelated services
|
||||
(cowork-service, extraction-service), packages/* internals (consume, don't edit),
|
||||
and any backup/* or dependabot/* branches. Stay in services/platform-service +
|
||||
dashboards/tracker-web.
|
||||
- Conventions: ESM `.js` import specifiers; no `any`; no console.log (use app.log/req.log,
|
||||
and the tracker-web logger/telemetry pattern); every Cosmos doc carries `productId`;
|
||||
reuse @bytelyst/* packages and existing module patterns (types.ts -> repository.ts ->
|
||||
routes.ts). Do NOT hardcode colors/URLs/secrets.
|
||||
- CHECKPOINTING: maintain docs/GIGAFACTORY/gigafactory-phase3-progress.md on the branch. After each
|
||||
slice, record: slice name, status (DONE/WIP/FAILED), commit sha, verify-gate result,
|
||||
and any follow-ups. Commit it WITH the slice. If you resume after an interruption, read
|
||||
it first and continue from the first not-DONE slice.
|
||||
|
||||
FAILURE PROTOCOL (per slice): attempt the verify gate up to 3 times, fixing the ROOT
|
||||
cause each time (not the test). If still red after 3 honest attempts: commit the WIP with
|
||||
message `wip(<scope>): <slice> — BLOCKED: <one-line reason>`, mark it FAILED in
|
||||
progress.md with the exact failing output, and MOVE ON to the next slice that does NOT
|
||||
depend on it (dependencies noted per slice). Never thrash; never fake green.
|
||||
|
||||
READ FIRST:
|
||||
- ../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md — §7 (scoring;
|
||||
Phase-3 = tunable weights + preemption), §5/§6 (DAG/deps), §11/§13 (budgets), §14
|
||||
Phase-3 checklist + Exit criteria, §16 Definition-of-Done.
|
||||
- services/platform-service/src/modules/fleet/{scheduler,coordinator,repository,routes,
|
||||
types}.ts — the engine you extend (read the existing claim/lease/fence/selectJob).
|
||||
- dashboards/tracker-web/ — match its App-Router structure (src/app, src/app/api),
|
||||
data-fetching/auth pattern, @bytelyst/ui + design-tokens usage, vitest + Playwright
|
||||
(e2e/, playwright.config.ts) setup. The existing fleet HTTP API you consume:
|
||||
POST/GET /fleet/jobs, GET /fleet/jobs/:id, PATCH /fleet/jobs/:id, POST /fleet/claim,
|
||||
lease renew/release, POST /fleet/factories/heartbeat|enroll, token rotate/revoke,
|
||||
GET /fleet/jobs/:id/runs, GET /fleet/jobs/:id/events, artifacts upload/list/get/delete.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
SLICE 1 — Tunable scoring weights + preemption (backend; depends on: nothing)
|
||||
--------------------------------------------------------------------------------
|
||||
Extend the PURE scheduler (scheduler.ts) without breaking §7 Phase-2 behavior:
|
||||
- Weights become configurable per-product/per-request (passed in; fixed defaults preserved
|
||||
so existing tests stay green). Add a small typed FleetWeightConfig resolver (defaults ->
|
||||
optional product override). NO env reads inside the pure module.
|
||||
- Preemption: a `selectWithPreemption(candidates, runningJobs, factory, ctx, weights?)`
|
||||
that, when a CRITICAL job cannot be placed and only lower-priority jobs are running,
|
||||
returns a preemption decision { evict: jobId, reason, breakdown } — PURE, no I/O.
|
||||
- Wire preemption into the coordinator behind a flag (FLEET_PREEMPTION, default OFF; OFF =
|
||||
byte-for-byte current behavior). Eviction must checkpoint + requeue the victim via the
|
||||
EXISTING fenced-requeue path (bump leaseEpoch; the zombie's late report is fenced).
|
||||
TESTS: weight override changes ranking; defaults reproduce all prior picks; preemption
|
||||
evicts only a strictly-lower-priority running job, never an equal/higher; victim is
|
||||
requeued with checkpoint + bumped epoch and its stale report is fenced; flag OFF = no
|
||||
preemption. VERIFY GATE: pnpm --filter @lysnrai/platform-service exec vitest run
|
||||
src/modules/fleet && build && (pnpm build && pnpm test).
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
SLICE 2 — DAG job decomposition (backend; depends on: nothing; independent of S1)
|
||||
--------------------------------------------------------------------------------
|
||||
Parent/child jobs with dependency-gated execution (§5/§6):
|
||||
- types: a job may declare children (subtasks) and dependsOn[] (sibling/child ids). Reuse
|
||||
existing kind ('leaf'|...) + parentId; add child submission + a DAG edge model. Cycle
|
||||
detection at submit (extend the existing submit-time cycle check).
|
||||
- coordinator: a parent is not claimable until its children reach a terminal state (or its
|
||||
declared deps are satisfied); completing the last child unblocks the parent. claimNextJob
|
||||
only returns deps-satisfied jobs (extend the existing predicate). Fan-out: submitting a
|
||||
parent with children atomically creates the children.
|
||||
- routes (additive): POST /fleet/jobs/:id/children (submit children), GET /fleet/jobs/:id/dag
|
||||
(return the subtree + per-node stage). productId-scoped.
|
||||
TESTS: parent blocked until children done; last child completion unblocks parent; cycle at
|
||||
submit -> rejected; capability/priority still respected per node; DAG endpoint returns the
|
||||
correct subtree; all prior fleet tests green. VERIFY GATE as in Slice 1 (+ items unaffected).
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
SLICE 3 — Per-product budgets + pause/resume (backend; depends on: nothing)
|
||||
--------------------------------------------------------------------------------
|
||||
Cost ceilings that pause routing (§11/§13):
|
||||
- A FleetBudgetDoc per productId (ceilingUsd, window, spentUsd, status active|paused).
|
||||
Spend accrues from job run cost (reuse run/insights cost if present; else estimate from
|
||||
budget.usd at completion). Container partitioned by /productId.
|
||||
- Enforcement in claimNextJob: if the product's budget is paused or the next job would
|
||||
exceed the ceiling, that product's jobs are NOT claimed (other products unaffected).
|
||||
Behind FLEET_BUDGETS (default OFF = unchanged).
|
||||
- routes (additive): GET/PUT /fleet/budgets/:productId, POST /fleet/budgets/:productId/pause,
|
||||
POST /fleet/budgets/:productId/resume.
|
||||
TESTS: under ceiling -> claims proceed; crossing ceiling -> that product pauses, others
|
||||
still claim; manual pause blocks claims; resume restores; flag OFF = no enforcement;
|
||||
spend accounting is monotonic + idempotent per run. VERIFY GATE as above.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
SLICE 4 — tracker-web Fleet Control Plane UI (frontend; depends on: S1-S3 endpoints,
|
||||
but build defensively — feature-detect/degrade if an endpoint is absent)
|
||||
--------------------------------------------------------------------------------
|
||||
A new `/fleet` section in dashboards/tracker-web (App Router), matching existing patterns:
|
||||
- Typed fleet API client (src/lib or src/app/api proxy as the repo does it) wrapping the
|
||||
fleet endpoints with auth token injection (reuse the existing auth/client pattern).
|
||||
- Pages/components (use @bytelyst/ui + --*-tokens; every interactive element has an
|
||||
aria-label or visible label):
|
||||
* Fleet map: factories (id, caps, health, load, lease state) as live cards.
|
||||
* Job table: filter by product/stage/priority; submit-job modal; row -> job detail.
|
||||
* Job detail: stage timeline from /events, runs from /runs, artifacts list, a SHIP
|
||||
action (PATCH stage), and the DAG subtree (from /dag) when present.
|
||||
* Budget panel: per-product ceiling + spent + pause/resume controls.
|
||||
- Live updates via polling (simple, robust) unless an SSE/stream endpoint exists.
|
||||
TESTS: vitest component/unit tests for the client + key components (render, actions call
|
||||
the right endpoint, error/empty/degraded states); Playwright e2e for the core flow
|
||||
(see fleet map -> open a job -> ship; pause a budget -> resume). VERIFY GATE:
|
||||
the tracker-web `verify` script (typecheck + lint + test + e2e) green — run exactly what
|
||||
its package.json defines (e.g. pnpm --filter <tracker-web> run verify, or the documented
|
||||
equivalent). Do not weaken its lint/e2e config.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
SLICE 5 — Docs + roadmap + Phase-3 exit criteria (depends on: S1-S4 outcomes)
|
||||
--------------------------------------------------------------------------------
|
||||
- Update ../learning_ai_devops_tools/agent-queue/docs/GIGAFACTORY/GIGAFACTORY_ROADMAP.md §14 Phase-3
|
||||
checkboxes for every box you actually completed, with a one-line note + the flag names
|
||||
(FLEET_PREEMPTION/FLEET_BUDGETS) and which are default-OFF. Tick the Phase-3 Exit-criteria
|
||||
line ONLY if its conditions are genuinely met; otherwise note the exact remaining %.
|
||||
(This is a docs edit in the OTHER repo — make it as a separate small commit/PR in
|
||||
learning_ai_devops_tools, OR include the roadmap delta as a patch file under
|
||||
docs/ in THIS branch and note it for the operator — do NOT entangle the two repos'
|
||||
git history. Prefer the patch-file note if a clean cross-repo PR isn't trivial.)
|
||||
- Update dashboards/tracker-web/README + a short docs/GIGAFACTORY/FLEET_CONTROL_PLANE.md (how to use
|
||||
the new UI, the flags, the endpoints consumed).
|
||||
- Finalize docs/GIGAFACTORY/gigafactory-phase3-progress.md with the end-state of every slice.
|
||||
|
||||
FINAL OUTPUT — print ONE consolidated report in EXACTLY this format:
|
||||
## Implementation Report — Phase 3 (overnight)
|
||||
### Branch & PR
|
||||
### Per-slice results
|
||||
| slice | status (DONE/WIP/FAILED) | commit | verify gate | notes |
|
||||
### What was implemented (per slice: key files, flags, endpoints, UI surfaces)
|
||||
### Tests added (counts per area + the final verify-gate output per slice)
|
||||
### Deviations / assumptions (weight defaults, budget accounting source, polling vs SSE, any degraded UI paths)
|
||||
### Phase 3 status (which §14 boxes now complete; exit criteria met Y/N; remaining %)
|
||||
### Anything that needs a human decision (esp. risky majors, cross-repo roadmap tick)
|
||||
### Suggested next phase (Phase 4 — message bus + autoscaling + capability marketplace)
|
||||
@ -1,70 +0,0 @@
|
||||
# Boot-persistence: agent-queue as a macOS LaunchAgent
|
||||
|
||||
Auto-start the `agent-queue` run loop on login and keep it alive across
|
||||
**reboot / crash / logout** — the failure modes that `tmux` + `caffeinate`
|
||||
alone can't cover.
|
||||
|
||||
| Layer | Survives terminal close | Survives sleep | Survives reboot |
|
||||
| ----- | :---------------------: | :------------: | :-------------: |
|
||||
| plain shell | no | no | no |
|
||||
| `tmux` | yes | no | no |
|
||||
| `caffeinate` | n/a | yes | no |
|
||||
| **LaunchAgent (this)** | yes | yes (via caffeinate) | **yes** |
|
||||
|
||||
## Install
|
||||
|
||||
```bash
|
||||
bash launchd/install.sh # render plist, load, start now (RunAtLoad + KeepAlive)
|
||||
tail -f ~/Library/Logs/agent-queue/agent-queue.out.log
|
||||
```
|
||||
|
||||
It renders `~/Library/LaunchAgents/com.bytelyst.agent-queue.plist` from the
|
||||
resolved repo path (works on any clone) and bootstraps it into your GUI session.
|
||||
|
||||
## Use
|
||||
|
||||
The LaunchAgent runs `agent-queue-boot.sh`, which wraps `agent-queue run` in
|
||||
`caffeinate`. Just drop prompt `.md` files into `queue/inbox/` — they get picked
|
||||
up automatically, now or after the next reboot.
|
||||
|
||||
```bash
|
||||
aq add ~/jobs/phase3-overnight.md --engine codex # or drop the file in queue/inbox/
|
||||
aqs # status
|
||||
```
|
||||
|
||||
## Configure (no need to edit the plist)
|
||||
|
||||
Put overrides in `~/.agent-queue.env` (untracked — also the place for tokens):
|
||||
|
||||
```bash
|
||||
AGENT_QUEUE_ENGINE=codex # codex (recommended: local repo) | devin | claude
|
||||
AGENT_QUEUE_MAX=1 # concurrent jobs on this host (default 3)
|
||||
# AGENT_QUEUE_NO_CAFFEINATE=1 # allow the Mac to idle-sleep (NOT for overnight runs)
|
||||
# DEVIN_BIN=/custom/path/devin # if a CLI isn't on the default PATH
|
||||
```
|
||||
|
||||
## Stop / uninstall
|
||||
|
||||
```bash
|
||||
bash launchd/install.sh --uninstall # bootout + remove plist (queued jobs stay put)
|
||||
```
|
||||
|
||||
## Notes & gotchas
|
||||
|
||||
- **codex vs devin:** for a local monorepo overnight runner, **codex** is the
|
||||
default — it runs in-repo so `@bytelyst/*` workspace links resolve locally and
|
||||
logs/token-usage parsing already work. Use **devin** when you want a cloud
|
||||
sandbox doing the heavy lifting (and ACUs/network aren't a concern).
|
||||
- **Power:** caffeinate wraps the long-lived loop, so the Mac stays awake the
|
||||
whole time the LaunchAgent runs. That's intended for a dedicated runner. Set
|
||||
`AGENT_QUEUE_NO_CAFFEINATE=1` if you'd rather let it idle-sleep when no job is
|
||||
active. Keep it plugged in with the lid open for true overnight runs.
|
||||
- **PATH:** launchd starts processes with a minimal `PATH`. Both the plist
|
||||
(`EnvironmentVariables`) and the wrapper repair it, but if a CLI lives
|
||||
somewhere unusual, point at it explicitly via `~/.agent-queue.env`.
|
||||
- **Dangerous mode:** jobs run `--yolo` (auto-approve) by default. The safety net
|
||||
is the agent-queue lifecycle itself — jobs land in `review/` → `testing/` and
|
||||
**shipping is always a manual human gate**. Never let an unattended run touch
|
||||
`main`; push to a branch and open one PR.
|
||||
- **Auth:** cache `gh auth login` / git credentials and the agent CLI's auth
|
||||
before relying on the LaunchAgent overnight, or the first `push` will block forever.
|
||||
@ -1,105 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# install.sh — install (or remove) the macOS LaunchAgent that auto-starts the
|
||||
# agent-queue run loop on login and keeps it alive across reboot/crash.
|
||||
#
|
||||
# bash launchd/install.sh # render plist, load, and start now
|
||||
# bash launchd/install.sh --uninstall # stop, unload, and remove the plist
|
||||
#
|
||||
# The plist is generated from the resolved repo path so it works on any clone.
|
||||
# Logs land in ~/Library/Logs/agent-queue/.
|
||||
#
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd -P)"
|
||||
AQ_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd -P)"
|
||||
WRAPPER="$AQ_DIR/agent-queue-boot.sh"
|
||||
|
||||
LABEL="com.bytelyst.agent-queue"
|
||||
PLIST="$HOME/Library/LaunchAgents/$LABEL.plist"
|
||||
LOG_DIR="$HOME/Library/Logs/agent-queue"
|
||||
UID_NUM="$(id -u)"
|
||||
DOMAIN="gui/$UID_NUM"
|
||||
|
||||
if [ "$(uname -s)" != "Darwin" ]; then
|
||||
echo "install.sh: macOS only (LaunchAgents). On Linux use a systemd --user unit." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# uninstall — stop the LaunchAgent and delete its plist.
# A failing bootout is ignored (the service may simply not be loaded). Only the
# launchd registration and the plist file are touched here.
uninstall() {
  printf '%s\n' "[launchd] booting out $LABEL ..."
  if ! launchctl bootout "$DOMAIN/$LABEL" 2>/dev/null; then
    : # not loaded (or already gone) — nothing to stop
  fi
  rm -f "$PLIST"
  printf '%s\n' "[launchd] removed $PLIST"
  printf '%s\n' "[launchd] (the run loop is stopped; queued jobs stay in queue/inbox/)"
}
|
||||
|
||||
if [ "${1:-}" = "--uninstall" ] || [ "${1:-}" = "-u" ]; then
|
||||
uninstall
|
||||
exit 0
|
||||
fi
|
||||
|
||||
[ -f "$WRAPPER" ] || { echo "install.sh: missing $WRAPPER" >&2; exit 1; }
|
||||
chmod +x "$WRAPPER" "$AQ_DIR/agent-queue.sh" 2>/dev/null || true
|
||||
mkdir -p "$HOME/Library/LaunchAgents" "$LOG_DIR"
|
||||
|
||||
echo "[launchd] writing $PLIST"
|
||||
# _plist_xml_escape <text> — escape the XML metacharacters (& < >) so an
# arbitrary path can be embedded in a plist <string> element. `&` is handled
# first so the entities produced by the later rules are not re-escaped.
_plist_xml_escape() {
  printf '%s' "$1" | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
}

# Everything interpolated into the plist is escaped: a clone path or $HOME
# containing `&` or `<` would otherwise produce an invalid plist that launchd
# refuses to bootstrap.
plist_label="$(_plist_xml_escape "$LABEL")"
plist_wrapper="$(_plist_xml_escape "$WRAPPER")"
plist_aq_dir="$(_plist_xml_escape "$AQ_DIR")"
plist_log_dir="$(_plist_xml_escape "$LOG_DIR")"
plist_home="$(_plist_xml_escape "$HOME")"

cat > "$PLIST" <<EOF
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
  <key>Label</key>
  <string>$plist_label</string>

  <key>ProgramArguments</key>
  <array>
    <string>/bin/bash</string>
    <string>$plist_wrapper</string>
  </array>

  <!-- Start on login and restart if it ever exits non-zero (crash/reboot). -->
  <key>RunAtLoad</key>
  <true/>
  <key>KeepAlive</key>
  <dict>
    <key>SuccessfulExit</key>
    <false/>
  </dict>
  <!-- Guard against tight crash loops. -->
  <key>ThrottleInterval</key>
  <integer>30</integer>

  <key>WorkingDirectory</key>
  <string>$plist_aq_dir</string>

  <key>StandardOutPath</key>
  <string>$plist_log_dir/agent-queue.out.log</string>
  <key>StandardErrorPath</key>
  <string>$plist_log_dir/agent-queue.err.log</string>

  <!-- launchd's PATH is minimal; the wrapper also repairs PATH defensively. -->
  <key>EnvironmentVariables</key>
  <dict>
    <key>PATH</key>
    <string>$plist_home/.local/bin:/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin</string>
    <key>AGENT_QUEUE_ENGINE</key>
    <string>codex</string>
  </dict>
</dict>
</plist>
EOF
|
||||
|
||||
# Reload cleanly (bootout first so a re-run picks up plist changes).
launchctl bootout "$DOMAIN/$LABEL" 2>/dev/null || true

# bootout can return before the old instance is fully torn down; bootstrapping
# while it is still registered fails (typically "Bootstrap failed: 5:
# Input/output error"), which would abort this script under `set -e`.
# Poll until the service is gone, giving up after ~10s.
_reload_tries=0
while launchctl print "$DOMAIN/$LABEL" >/dev/null 2>&1 && [ "$_reload_tries" -lt 20 ]; do
  sleep 0.5
  _reload_tries=$((_reload_tries + 1))
done

# Clear any persisted "disabled" override BEFORE bootstrapping: a disabled
# service cannot be loaded, so enabling it afterwards is too late.
launchctl enable "$DOMAIN/$LABEL"
launchctl bootstrap "$DOMAIN" "$PLIST"
launchctl kickstart -k "$DOMAIN/$LABEL"
|
||||
|
||||
echo "[launchd] installed + started: $LABEL"
|
||||
echo "[launchd] status : launchctl print $DOMAIN/$LABEL | sed -n '1,20p'"
|
||||
echo "[launchd] logs : tail -f $LOG_DIR/agent-queue.out.log"
|
||||
echo "[launchd] stop : bash $SCRIPT_DIR/install.sh --uninstall"
|
||||
echo
|
||||
echo "Drop prompt .md files into: $AQ_DIR/queue/inbox/"
|
||||
echo "Override engine/concurrency/secrets in ~/.agent-queue.env (e.g. AGENT_QUEUE_MAX=1)."
|
||||
@ -1,578 +0,0 @@
|
||||
# shellcheck shell=bash
|
||||
# ── Fleet coordinator client (Phase 2, §7/§8/§9/§18) ────────────────
|
||||
#
|
||||
# Sourced by agent-queue.sh. Lets the single-host runner act as a "factory" that
|
||||
# registers / heartbeats / claims / reports against the platform-service `fleet`
|
||||
# coordinator — BEHIND the AQ_FLEET flag. When AQ_FLEET is unset/0, every function
|
||||
# here is an immediate no-op and the offline git-queue path is byte-for-byte
|
||||
# unchanged. curl-only + POSIX awk (reuses agent-queue.sh helpers: log/err,
|
||||
# _meta_val, _json_str, _json_escape, detect_capabilities, active_workers, CURL_BIN).
|
||||
#
|
||||
# Contract (routes under AQ_FLEET_API, which already includes /api):
|
||||
# POST /fleet/factories/heartbeat {factoryId, capabilities[], health, load}
|
||||
# POST /fleet/claim {factoryId, capabilities[], leaseSeconds}
|
||||
# -> {claimed, job{id,bodyMd,leaseEpoch}, lease{...}}
|
||||
# PATCH /fleet/jobs/:id {stage, leaseEpoch, checkpoint?} (409 = fenced)
|
||||
# POST /fleet/jobs/:id/lease/renew {leaseEpoch, leaseSeconds} (409 = fenced)
|
||||
# POST /fleet/jobs/:id/lease/release {leaseEpoch, stage?}
|
||||
# The coordinator owns leaseEpoch fencing + writes fleet_events server-side; there
|
||||
# is no client-side "register" or "append event" call (register == first heartbeat).
|
||||
|
||||
# ── Config (env-overridable) ────────────────────────────────────────
|
||||
AQ_FLEET="${AQ_FLEET:-0}" # master switch (0 = offline)
|
||||
AQ_FLEET_API="${AQ_FLEET_API:-http://localhost:4003/api}" # base URL incl. /api
|
||||
# Normalize: platform-service mounts the fleet routes under /api. Strip a trailing
|
||||
# slash and append /api unless already present, so AQ_FLEET_API=http://host:4003
|
||||
# (the natural form) works too instead of silently 404ing every fleet call.
|
||||
AQ_FLEET_API="${AQ_FLEET_API%/}"
|
||||
[[ "$AQ_FLEET_API" == */api ]] || AQ_FLEET_API="${AQ_FLEET_API}/api"
|
||||
AQ_FLEET_TOKEN="${AQ_FLEET_TOKEN:-}" # bearer; never hardcode
|
||||
# AQ_PRODUCT_ID is shared with the Slice-4 tracker config (X-Product-Id header).
|
||||
AQ_FACTORY_ID="${AQ_FACTORY_ID:-$( (hostname -s 2>/dev/null || hostname 2>/dev/null || echo factory) | tr -cd 'A-Za-z0-9._-')-$$}"
|
||||
AQ_FLEET_LEASE_RENEW_SEC="${AQ_FLEET_LEASE_RENEW_SEC:-300}" # heartbeat/renew cadence
|
||||
AQ_FLEET_LEASE_SECONDS="${AQ_FLEET_LEASE_SECONDS:-900}" # requested lease duration
|
||||
AQ_FLEET_CAPS="${AQ_FLEET_CAPS:-}" # override caps (comma/space list)
|
||||
AQ_FLEET_CWD="${AQ_FLEET_CWD:-$PWD}" # cwd for claimed fleet jobs
|
||||
AQ_FLEET_API_CMD="${AQ_FLEET_API_CMD:-}" # test seam (stub script)
|
||||
AQ_FLEET_HB_TS=0 # last heartbeat epoch (mutable)
|
||||
|
||||
# ── Slice 4: feature-flag levels (three explicit, independently-toggleable) ──
|
||||
# Precedence (documented in README §Cutover):
|
||||
# AQ_FLEET=0 ⇒ pure offline, ZERO coordinator calls (master switch).
|
||||
# AQ_FLEET_ROUTE=1 ⇒ route_via_service: coordinator is AUTHORITATIVE for claim
|
||||
# (default; preserves the P2-S3 behavior).
|
||||
# AQ_FLEET_ROUTE=0 ⇒ LOCAL inbox is authoritative (coordinator not used to
|
||||
# source work) — the pre-cutover state.
|
||||
# AQ_FLEET_SHADOW=1 ⇒ shadow/dual-run (requires AQ_FLEET=1 AND AQ_FLEET_ROUTE=0):
|
||||
# run the normal offline path as authoritative AND query the coordinator in
|
||||
# parallel WITHOUT acting on its responses, purely to record divergence.
|
||||
# If AQ_FLEET_ROUTE=1 AND AQ_FLEET_SHADOW=1, ROUTE WINS and shadow is disabled
|
||||
# (a one-shot warning is logged) — you never shadow and route at the same time.
|
||||
AQ_FLEET_ROUTE="${AQ_FLEET_ROUTE:-1}"
|
||||
# AQ_FLEET_AUTOSHIP=1 ⇒ when the factory's local verify gate passes, advance the
|
||||
# coordinator job testing -> shipped (the factory's verify IS the test phase).
|
||||
# Default 0 keeps the human review gate authoritative (job rests at testing).
|
||||
AQ_FLEET_AUTOSHIP="${AQ_FLEET_AUTOSHIP:-0}"
|
||||
# AQ_FLEET_PR=1 ⇒ for jobs that carry a `repo`, run the agent in an isolated
|
||||
# checkout on branch aq/job/<id>, then commit/push and open a PR; the PR URL is
|
||||
# reported back and recorded on the run. Checkouts are cached under AQ_FLEET_REPOS_DIR.
|
||||
AQ_FLEET_PR="${AQ_FLEET_PR:-0}"
|
||||
AQ_FLEET_REPOS_DIR="${AQ_FLEET_REPOS_DIR:-}" # default resolved to $STATE/repos at call time
|
||||
AQ_FLEET_SHADOW="${AQ_FLEET_SHADOW:-0}"
|
||||
# Isolated factory id for the read-only shadow claim (never the real factory id).
|
||||
AQ_FLEET_SHADOW_FACTORY_ID="${AQ_FLEET_SHADOW_FACTORY_ID:-${AQ_FACTORY_ID}-shadow}"
|
||||
# Shadow divergence log (default resolved to $STATE/fleet-shadow.log at call time).
|
||||
AQ_FLEET_SHADOW_LOG="${AQ_FLEET_SHADOW_LOG:-}"
|
||||
_AQ_FLEET_SHADOW_WARNED=0 # one-shot ROUTE>SHADOW precedence warning (per process)
|
||||
SHADOW_COORD_JOB="" # set by fleet_shadow_claim: would-be coordinator job id
|
||||
|
||||
# ── §M0 RU gate (docs/GIGAFACTORY/FLEET_DISPATCH_REDESIGN.md §8/§12) ──
|
||||
# When ON, the run loop point-reads a cheap per-product queue version
|
||||
# (GET /fleet/queue-state, ~1 RU) and SKIPS the expensive claim while nothing has
|
||||
# changed and we are not mid-drain — slashing idle Cosmos RU. Default OFF
|
||||
# (opt-in): behavior is byte-for-byte unchanged unless AQ_FLEET_GATE=1, and the
|
||||
# gate always FAILS OPEN (claims) on any read error so work is never stranded.
|
||||
AQ_FLEET_GATE="${AQ_FLEET_GATE:-0}"
|
||||
# Force a full claim at least this often even when the gate is unchanged (backstops
|
||||
# a missed/raced version bump). 0 disables the periodic backstop.
|
||||
AQ_FLEET_GATE_SAFETY_SEC="${AQ_FLEET_GATE_SAFETY_SEC:-300}"
|
||||
AQ_FLEET_GATE_SEEN="" # last-seen queue version (mutable, per process)
|
||||
AQ_FLEET_GATE_TS=0 # epoch of the last full (drained) claim attempt
|
||||
AQ_FLEET_GATE_DRAINING=1 # 1 = keep claiming (last claim got a job / startup)
|
||||
|
||||
# fleet_enabled — succeeds only when the master switch AQ_FLEET is exactly "1"
# (unset or any other value means the coordinator integration is off).
fleet_enabled() {
  case "${AQ_FLEET:-0}" in
    1) return 0 ;;
    *) return 1 ;;
  esac
}
|
||||
|
||||
# fleet_route_enabled — succeeds when fleet is on AND AQ_FLEET_ROUTE is "1"
# (its default), i.e. the coordinator is authoritative for claim/assignment.
fleet_route_enabled() {
  fleet_enabled || return 1
  [[ "${AQ_FLEET_ROUTE:-1}" == 1 ]]
}
|
||||
|
||||
# fleet_shadow_enabled — succeeds only for the exact shadow/dual-run combination:
# AQ_FLEET=1, AQ_FLEET_ROUTE=0 and AQ_FLEET_SHADOW=1. Pure predicate (never logs);
# with ROUTE=1 it fails because ROUTE takes precedence — the operator warning for
# that conflict is emitted separately by fleet_flags_warn_once.
fleet_shadow_enabled() {
  if ! fleet_enabled; then
    return 1
  fi
  if [[ "${AQ_FLEET_ROUTE:-1}" != 0 ]]; then
    return 1
  fi
  [[ "${AQ_FLEET_SHADOW:-0}" == 1 ]]
}
|
||||
|
||||
# fleet_flags_warn_once — tell the operator (at most once per process) that
# shadow mode is suppressed because AQ_FLEET_ROUTE=1 and AQ_FLEET_SHADOW=1 were
# both set. Silent for every other flag combination and when fleet is off.
# Always returns 0.
fleet_flags_warn_once() {
  fleet_enabled || return 0
  [[ "${AQ_FLEET_ROUTE:-1}" == 1 ]] || return 0
  [[ "${AQ_FLEET_SHADOW:-0}" == 1 ]] || return 0
  [[ "${_AQ_FLEET_SHADOW_WARNED:-0}" != 1 ]] || return 0
  err "fleet: AQ_FLEET_ROUTE=1 and AQ_FLEET_SHADOW=1 — ROUTE wins; shadow/dual-run is DISABLED. Set AQ_FLEET_ROUTE=0 to shadow."
  _AQ_FLEET_SHADOW_WARNED=1
  return 0
}
|
||||
|
||||
# fleet_flags_state — one-line resolved flag summary (for `status` / `fleet-status`).
# Prints (without a trailing newline) the actual AQ_FLEET value, the routing mode
# derived from AQ_FLEET_ROUTE, and whether shadow mode is effectively on.
# The AQ_FLEET value is read from the environment rather than hard-coded, so the
# summary stays truthful if it is printed while the master switch is off.
fleet_flags_state() {
  local route shadow
  if [[ "${AQ_FLEET_ROUTE:-1}" == 1 ]]; then
    route="route_via_service"
  else
    route="local-authoritative"
  fi
  # "ON" reflects the effective state (fleet on + ROUTE=0 + SHADOW=1), not the raw flag.
  if fleet_shadow_enabled; then
    shadow="shadow=ON"
  else
    shadow="shadow=off"
  fi
  printf 'AQ_FLEET=%s route=%s(AQ_FLEET_ROUTE=%s) %s(AQ_FLEET_SHADOW=%s)' \
    "${AQ_FLEET:-0}" "$route" "${AQ_FLEET_ROUTE:-1}" "$shadow" "${AQ_FLEET_SHADOW:-0}"
}
|
||||
|
||||
# ── HTTP (curl only; same output contract as the Slice-4 tracker_api) ──
# fleet_api <METHOD> <PATH> [JSON] -> response body, then a final HTTP-code line.
#   METHOD  HTTP verb passed to curl -X
#   PATH    route appended to AQ_FLEET_API (e.g. /fleet/claim)
#   JSON    optional request body
# When AQ_FLEET_API_CMD is set (test seam) the call is delegated to that command
# with the same three arguments and its exit status is returned. Otherwise curl
# is run; on a transport failure an extra "000" line is appended so the last
# line is always a status code.
fleet_api() {
  local method=$1 path=$2 body=${3:-}
  if [[ -n "${AQ_FLEET_API_CMD:-}" ]]; then
    "$AQ_FLEET_API_CMD" "$method" "$path" "$body"
    return $?
  fi
  local url="${AQ_FLEET_API}${path}"
  local -a args=(-sS -m "${AQ_FLEET_TIMEOUT:-30}" -X "$method"
    -H "Content-Type: application/json" -w '\n%{http_code}')
  # Optional headers. The ${VAR:-} guards keep this safe under `set -u` even when
  # the sourcing script never defined the variable (AQ_PRODUCT_ID has no default
  # in this file).
  if [[ -n "${AQ_FLEET_TOKEN:-}" ]]; then
    args+=(-H "Authorization: Bearer $AQ_FLEET_TOKEN")
  fi
  if [[ -n "${AQ_PRODUCT_ID:-}" ]]; then
    args+=(-H "X-Product-Id: $AQ_PRODUCT_ID")
  fi
  if [[ -n "$body" ]]; then
    args+=(--data "$body")
  fi
  # `|| rc=$?` (rather than a bare assignment followed by rc=$?) keeps a curl
  # failure from aborting the caller under `set -e` before the 000 fallback prints.
  local out rc=0
  out=$("${CURL_BIN:-curl}" "${args[@]}" "$url" 2>/dev/null) || rc=$?
  if [[ $rc -ne 0 ]]; then
    printf '%s\n000\n' "$out"
  else
    printf '%s\n' "$out"
  fi
}
|
||||
|
||||
# _fleet_call <METHOD> <PATH> [JSON] — run fleet_api and split its output into
# two globals: FLEET_CODE (the final line, i.e. the HTTP status) and FLEET_BODY
# (everything before that line).
_fleet_call() {
  local raw
  raw=$(fleet_api "$@")
  FLEET_CODE=$(tail -n1 <<< "$raw")
  FLEET_BODY=$(sed '$d' <<< "$raw")
}
|
||||
|
||||
# _fleet_json_num <key> — read JSON on stdin and print the integer value of the
# first `"key": <int>` occurrence (optionally negative). Prints nothing and
# returns non-zero when the key has no integer value.
_fleet_json_num() {
  local pair_re="\"$1\"[[:space:]]*:[[:space:]]*-?[0-9]+"
  grep -oE "$pair_re" \
    | head -n 1 \
    | grep -oE -- '-?[0-9]+$'
}
|
||||
|
||||
# _fleet_is_job <job> — succeeds when the job's meta file ($STATE/<job>.meta)
# carries a non-empty fleet_job_id, i.e. the job came from the coordinator.
_fleet_is_job() {
  local metaf="$STATE/$1.meta"
  [ -n "$(_meta_val "$metaf" fleet_job_id)" ]
}
|
||||
|
||||
# fleet_detect_caps — print a JSON array (no trailing newline) of capability
# tokens. Tokens come from AQ_FLEET_CAPS (comma- and/or space-separated) when it
# is set, otherwise from detect_capabilities (one token per line). Empty tokens
# are dropped and each token is JSON-escaped.
fleet_detect_caps() {
  local tokens
  if [[ -z "$AQ_FLEET_CAPS" ]]; then
    tokens=$(detect_capabilities)
  else
    # Turn both separators into newlines so the loop below sees one token per line.
    tokens=$(printf '%s' "$AQ_FLEET_CAPS" | tr ', ' '\n\n')
  fi
  local json="[" sep="" tok
  while IFS= read -r tok; do
    if [[ -z "$tok" ]]; then
      continue
    fi
    json+="$sep\"$(_json_escape "$tok")\""
    sep=","
  done <<< "$tokens"
  printf '%s]' "$json"
}
|
||||
|
||||
# ── Heartbeat (registration == first heartbeat) ─────────────────────
# fleet_heartbeat — POST this factory's id, capabilities, health and current
# load to /fleet/factories/heartbeat.
# Returns 0 on any 2xx (and records the time in AQ_FLEET_HB_TS), 1 otherwise
# (an error is logged). No-op returning 0 when fleet is disabled.
fleet_heartbeat() {
  fleet_enabled || return 0
  local caps load body
  caps=$(fleet_detect_caps)
  load=$(active_workers 2>/dev/null || echo 0)
  # `load` is spliced into the JSON unquoted, so anything but a plain integer
  # (empty, multi-line, or diagnostic text) would make the body invalid and the
  # heartbeat fail for a reason unrelated to the coordinator.
  [[ "$load" =~ ^[0-9]+$ ]] || load=0
  body="{\"factoryId\":\"$(_json_escape "$AQ_FACTORY_ID")\",\"capabilities\":$caps,\"health\":\"ok\",\"load\":$load}"
  _fleet_call POST "/fleet/factories/heartbeat" "$body"
  case "$FLEET_CODE" in
    2*)
      AQ_FLEET_HB_TS=$(date +%s)
      return 0
      ;;
    *)
      err "fleet: heartbeat failed (HTTP ${FLEET_CODE:-error}) — running degraded"
      return 1
      ;;
  esac
}
|
||||
|
||||
# fleet_heartbeat_maybe — send a heartbeat only when at least
# AQ_FLEET_LEASE_RENEW_SEC (default 300) seconds have passed since the last
# successful one (AQ_FLEET_HB_TS). Always returns 0; no-op when fleet is off.
fleet_heartbeat_maybe() {
  fleet_enabled || return 0
  local now elapsed
  now=$(date +%s)
  elapsed=$(( now - ${AQ_FLEET_HB_TS:-0} ))
  if [[ $elapsed -ge "${AQ_FLEET_LEASE_RENEW_SEC:-300}" ]]; then
    fleet_heartbeat
  fi
  return 0
}
|
||||
|
||||
# ── §M0 RU gate helpers ─────────────────────────────────────────────
# fleet_gate_enabled — succeeds when fleet is on AND AQ_FLEET_GATE is "1"
# (the cheap-poll gate is opt-in; default off).
fleet_gate_enabled() {
  fleet_enabled || return 1
  [[ "${AQ_FLEET_GATE:-0}" == 1 ]]
}
|
||||
|
||||
# fleet_queue_version — GET /fleet/queue-state and print the integer `version`
# field of the response. Returns non-zero on a non-2xx status or when no version
# can be extracted, so callers can fail open.
fleet_queue_version() {
  _fleet_call GET "/fleet/queue-state"
  if [[ "$FLEET_CODE" != 2* ]]; then
    return 1
  fi
  _fleet_json_num version <<< "$FLEET_BODY"
}
|
||||
|
||||
# fleet_gate_should_claim — decide whether this tick should run the claim.
# Returns 0 (claim) or 1 (skip). Read-only with respect to gate state.
# Claims whenever there is any doubt:
#   * gate off                         -> claim (pre-gate behavior)
#   * still draining                   -> claim
#   * safety interval elapsed          -> claim (periodic backstop)
#   * version unreadable / empty       -> claim (fail open)
#   * version differs from last seen   -> claim
# Only an unchanged version inside the safety window yields 1.
fleet_gate_should_claim() {
  if ! fleet_gate_enabled; then
    return 0
  fi
  if [[ "${AQ_FLEET_GATE_DRAINING:-1}" == 1 ]]; then
    return 0
  fi

  local now safety="${AQ_FLEET_GATE_SAFETY_SEC:-0}"
  now=$(date +%s)
  if [[ "$safety" -gt 0 ]] && [[ $(( now - ${AQ_FLEET_GATE_TS:-0} )) -ge "$safety" ]]; then
    return 0
  fi

  local version
  version=$(fleet_queue_version) || return 0
  if [[ -z "$version" ]]; then
    return 0
  fi
  if [[ "$version" == "${AQ_FLEET_GATE_SEEN:-}" ]]; then
    return 1
  fi
  return 0
}
|
||||
|
||||
# fleet_gate_note_claim <claim_rc> — fold the outcome of a claim attempt into
# the gate state. Always returns 0; no-op when the gate is off.
#   rc 0      a job was claimed: keep draining (more work may be waiting).
#   other rc  nothing claimable (2) or API error (1): stop draining, stamp the
#             attempt time, and remember the current queue version (when it can
#             be read) so later ticks skip until it changes.
fleet_gate_note_claim() {
  fleet_gate_enabled || return 0
  case "${1:-1}" in
    0)
      AQ_FLEET_GATE_DRAINING=1
      return 0
      ;;
  esac
  AQ_FLEET_GATE_DRAINING=0
  AQ_FLEET_GATE_TS=$(date +%s)
  local seen
  if seen=$(fleet_queue_version) && [[ -n "$seen" ]]; then
    AQ_FLEET_GATE_SEEN="$seen"
  fi
  return 0
}
|
||||
|
||||
# ── Claim — pull one job and materialize it as a local inbox .md ────
# _fleet_claim_release <jobId> <leaseEpoch> — best-effort release of a lease we
# just obtained but cannot use, so the coordinator can hand the job to another
# factory right away instead of waiting for the lease to expire.
_fleet_claim_release() {
  _fleet_call POST "/fleet/jobs/$1/lease/release" "{\"leaseEpoch\":${2:-0}}"
  return 0
}

# fleet_claim — ask the coordinator for one job and enqueue it locally.
# Returns 0 = claimed + materialized, 2 = nothing claimable, 1 = API error or
# the claimed job could not be materialized locally (its lease is released).
fleet_claim() {
  fleet_enabled || return 2
  local caps body
  caps=$(fleet_detect_caps)
  body="{\"factoryId\":\"$(_json_escape "$AQ_FACTORY_ID")\",\"capabilities\":$caps,\"leaseSeconds\":${AQ_FLEET_LEASE_SECONDS:-900}}"
  _fleet_call POST "/fleet/claim" "$body"
  case "$FLEET_CODE" in
    2*) : ;;
    *)
      err "fleet: claim failed (HTTP ${FLEET_CODE:-error})"
      return 1
      ;;
  esac
  printf '%s' "$FLEET_BODY" | grep -q '"claimed"[[:space:]]*:[[:space:]]*true' || return 2

  local jid body_md epoch repo base_branch verify automerge="" engine_pick
  jid=$(printf '%s' "$FLEET_BODY" | _json_str id)
  body_md=$(printf '%s' "$FLEET_BODY" | _json_str bodyMd)
  epoch=$(printf '%s' "$FLEET_BODY" | _fleet_json_num leaseEpoch)
  repo=$(printf '%s' "$FLEET_BODY" | _json_str repo)
  base_branch=$(printf '%s' "$FLEET_BODY" | _json_str baseBranch)
  verify=$(printf '%s' "$FLEET_BODY" | _json_str verify)
  printf '%s' "$FLEET_BODY" | grep -q '"autoMerge"[[:space:]]*:[[:space:]]*true' && automerge=true
  # Concrete engine the submitter picked (job.engine wins over engineClass via
  # resolve_engine). Only honor a KNOWN engine — never the run's 'unknown'/class
  # placeholder — so an engineless job still falls back to the factory default.
  engine_pick=$(printf '%s' "$FLEET_BODY" | _json_str engine)
  case "$engine_pick" in
    devin | claude | codex | copilot) ;;
    *) engine_pick="" ;;
  esac
  [[ -n "$jid" ]] || { err "fleet: claim returned no job id"; return 1; }

  # Materialize a transient local job .md (same approach as from-tracker) so the
  # existing runner executes a coordinator job unchanged. fleet-job-id +
  # fleet-lease-epoch travel in frontmatter -> the job meta (see cmd_run).
  local safe tmpdir tmp
  safe=$(printf '%s' "$jid" | tr -c 'A-Za-z0-9._-' '_')
  if ! tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/aq-fleet.XXXXXX"); then
    # Without a temp dir the job cannot be written; do not sit on the lease.
    err "fleet: cannot create temp dir for claimed job $jid — releasing lease"
    _fleet_claim_release "$jid" "${epoch:-0}"
    return 1
  fi
  tmp="$tmpdir/fleet-$safe.md"
  {
    echo "---"
    echo "cwd: $AQ_FLEET_CWD"
    echo "yolo: true"
    echo "fleet-job-id: $jid"
    echo "fleet-lease-epoch: ${epoch:-0}"
    if [[ -n "$engine_pick" ]]; then echo "engine: $engine_pick"; fi
    if [[ -n "$repo" ]]; then echo "fleet-repo: $repo"; fi
    if [[ -n "$base_branch" ]]; then echo "fleet-base-branch: $base_branch"; fi
    # Per-repo verify command (drives the existing verify gate) + auto-merge flag.
    if [[ -n "$verify" ]]; then echo "verify: $verify"; fi
    if [[ -n "$automerge" ]]; then echo "fleet-automerge: true"; fi
    echo "idempotency-key: fleet-$jid"
    echo "---"
    echo
    printf '%s\n' "$body_md"
  } > "$tmp"

  # The job only really exists locally once cmd_add accepts it. Previously its
  # status was ignored, so a failed enqueue was still logged as "claimed" and
  # the coordinator-side lease stayed held (job stranded until the lease TTL).
  # NOTE(review): assumes cmd_add returns non-zero only on a real failure to
  # enqueue — confirm how it reports an idempotency-key duplicate.
  local add_rc=0
  cmd_add "$tmp" >/dev/null 2>&1 || add_rc=$?
  rm -rf "$tmpdir"
  if [[ $add_rc -ne 0 ]]; then
    err "fleet: could not enqueue claimed job $jid locally (cmd_add rc=$add_rc) — releasing lease"
    _fleet_claim_release "$jid" "${epoch:-0}"
    return 1
  fi
  log "fleet: claimed job $C_BOLD$jid$C_RESET (leaseEpoch=${epoch:-0})"
  return 0
}
|
||||
|
||||
# ── Report a fenced stage transition ────────────────────────────────
# fleet_report <job> <stage> [with-checkpoint]
#   job              local job name (its meta file is $STATE/<job>.meta)
#   stage            coordinator stage to report
#   with-checkpoint  any non-empty value: attach wip_branch / wip_commit from meta
# Returns 0 ok (also when fleet is off or the job is not a fleet job),
#         2 FENCED (stale epoch: caller must self-abort),
#         1 degraded (coordinator unreachable: continue locally).
# The outcome is also appended to the meta file (fleet_reported / fleet_fenced /
# fleet_degraded).
fleet_report() {
  fleet_enabled || return 0
  local job=$1 stage=$2 with_ckpt=${3:-} metaf jid epoch
  metaf="$STATE/$job.meta"
  jid=$(_meta_val "$metaf" fleet_job_id)
  epoch=$(_meta_val "$metaf" fleet_lease_epoch)
  [[ -n "$jid" ]] || return 0

  local ckpt=""
  if [[ -n "$with_ckpt" ]]; then
    local wb wc
    wb=$(_meta_val "$metaf" wip_branch)
    wc=$(_meta_val "$metaf" wip_commit)
    # A checkpoint is only sent when there is a branch; the commit is optional.
    if [[ -n "$wb" ]]; then
      ckpt=",\"checkpoint\":{\"wipBranch\":\"$(_json_escape "$wb")\""
      if [[ -n "$wc" ]]; then
        ckpt+=",\"wipCommit\":\"$(_json_escape "$wc")\""
      fi
      ckpt+="}"
    fi
  fi

  # payload carries ONLY {stage, leaseEpoch, checkpoint} — never bodyMd/prompt/token.
  # stage is JSON-escaped like every other string field in this file (and like
  # fleet_shadow_report does) so an unexpected character cannot corrupt the body.
  _fleet_call PATCH "/fleet/jobs/$jid" "{\"stage\":\"$(_json_escape "$stage")\",\"leaseEpoch\":${epoch:-0}$ckpt}"
  case "$FLEET_CODE" in
    2*)
      echo "fleet_reported=$stage" >> "$metaf"
      return 0
      ;;
    409|412)
      err "fleet: FENCED reporting stage=$stage (stale leaseEpoch=$epoch) — self-aborting $job"
      echo "fleet_fenced=1" >> "$metaf"
      return 2
      ;;
    *)
      err "fleet: report stage=$stage failed (HTTP ${FLEET_CODE:-error}) — offline-degrade, continuing locally"
      echo "fleet_degraded=1" >> "$metaf"
      return 1
      ;;
  esac
}
|
||||
|
||||
# fleet_lease_renew <job> -> extend the lease; 0 ok, 2 fenced, 1 degraded.
# A renewal lost to a transient blip (timeout / 5xx / proxy) would let the lease
# expire and the coordinator reclaim a job that is still running, wasting the work.
# So retry a TRANSIENT failure a few times with a short backoff (well within the
# lease window); a 409/412 FENCE is terminal and never retried. Tunables:
# AQ_FLEET_RENEW_RETRIES (default 2 extra attempts), AQ_FLEET_RENEW_BACKOFF_SEC (2).
# Returns 0 immediately when fleet is off or the job has no fleet_job_id.
fleet_lease_renew() {
  fleet_enabled || return 0
  local job=$1 metaf jid epoch
  metaf="$STATE/$job.meta"
  jid=$(_meta_val "$metaf" fleet_job_id)
  epoch=$(_meta_val "$metaf" fleet_lease_epoch)
  [[ -n "$jid" ]] || return 0

  local retries="${AQ_FLEET_RENEW_RETRIES:-2}" backoff="${AQ_FLEET_RENEW_BACKOFF_SEC:-2}" i=0
  [[ "$retries" =~ ^[0-9]+$ ]] || retries=2
  # Validate the backoff the same way as the retry count: a malformed value
  # would make `sleep` fail, collapsing the retries into back-to-back requests.
  [[ "$backoff" =~ ^[0-9]+([.][0-9]+)?$ ]] || backoff=2

  while :; do
    _fleet_call POST "/fleet/jobs/$jid/lease/renew" "{\"leaseEpoch\":${epoch:-0},\"leaseSeconds\":${AQ_FLEET_LEASE_SECONDS:-900}}"
    case "$FLEET_CODE" in
      2*) return 0 ;;
      409|412)
        echo "fleet_fenced=1" >> "$metaf"
        return 2
        ;;
    esac
    # transient: retry up to $retries extra times before giving up degraded
    if [[ "$i" -ge "$retries" ]]; then
      return 1
    fi
    i=$((i + 1))
    sleep "$backoff"
  done
}
|
||||
|
||||
# fleet_lease_release <job> [stage] -> best-effort release on a terminal stage.
#   job    local job name (meta file $STATE/<job>.meta supplies id + epoch)
#   stage  optional stage to record with the release
# The coordinator's response is ignored; always returns 0. No-op when fleet is
# off or the job has no fleet_job_id.
fleet_lease_release() {
  fleet_enabled || return 0
  local job=$1 stage=${2:-} metaf jid epoch body
  metaf="$STATE/$job.meta"
  jid=$(_meta_val "$metaf" fleet_job_id)
  epoch=$(_meta_val "$metaf" fleet_lease_epoch)
  [[ -n "$jid" ]] || return 0

  body="{\"leaseEpoch\":${epoch:-0}"
  # Escape stage like the other string fields sent on this route (see `result`
  # in fleet_report_insights) so the body stays valid JSON for any input.
  if [[ -n "$stage" ]]; then
    body+=",\"stage\":\"$(_json_escape "$stage")\""
  fi
  body+="}"
  _fleet_call POST "/fleet/jobs/$jid/lease/release" "$body"
  return 0
}
|
||||
|
||||
# fleet_report_insights <job> [result] — send the run's usage metrics (read from
# the job meta) plus the optional result and PR details to the coordinator via
# the lease-release route, which also gives the lease back. Best-effort: the
# response is ignored and the function always returns 0. Fields that are missing
# or fail validation are simply left out of the payload. No-op when fleet is off
# or the job has no fleet_job_id.
fleet_report_insights() {
  fleet_enabled || return 0
  local job=$1 result=${2:-} metaf jid epoch
  metaf="$STATE/$job.meta"
  jid=$(_meta_val "$metaf" fleet_job_id)
  epoch=$(_meta_val "$metaf" fleet_lease_epoch)
  if [[ -z "$jid" ]]; then
    return 0
  fi

  local ins="" spec key val
  local int_re='^[0-9]+$' dec_re='^[0-9]+(\.[0-9]+)?$' re

  # String metrics: <jsonKey>:<metaKey>, emitted when non-empty.
  for spec in model:model engine:engine sessionId:session_id; do
    key=${spec%%:*}
    val=$(_meta_val "$metaf" "${spec#*:}")
    if [[ -n "$val" ]]; then
      ins+=",\"$key\":\"$(_json_escape "$val")\""
    fi
  done

  # Numeric metrics: emitted unquoted, only when they look like a number
  # (costUsd may carry a decimal part; the rest must be plain integers).
  for spec in tokensIn:tokens_in tokensOut:tokens_out tokensCached:tokens_cached \
              costUsd:cost_usd turns:turns toolCalls:tool_calls; do
    key=${spec%%:*}
    val=$(_meta_val "$metaf" "${spec#*:}")
    if [[ "$key" == costUsd ]]; then re=$dec_re; else re=$int_re; fi
    if [[ "$val" =~ $re ]]; then
      ins+=",\"$key\":$val"
    fi
  done

  val=$(_meta_val "$metaf" usage_estimated)
  if [[ "$val" == "true" || "$val" == "1" ]]; then
    ins+=",\"estimated\":true"
  fi

  local pr_url pr_branch pr_state
  pr_url=$(_meta_val "$metaf" pr_url)
  pr_branch=$(_meta_val "$metaf" pr_branch)
  pr_state=$(_meta_val "$metaf" pr_state)

  local body="{\"leaseEpoch\":${epoch:-0}"
  # ${ins#,} drops the leading comma left by the accumulation above.
  if [[ -n "$ins" ]]; then body+=",\"insights\":{${ins#,}}"; fi
  if [[ -n "$result" ]]; then body+=",\"result\":\"$(_json_escape "$result")\""; fi
  if [[ -n "$pr_url" ]]; then body+=",\"prUrl\":\"$(_json_escape "$pr_url")\""; fi
  if [[ -n "$pr_branch" ]]; then body+=",\"branch\":\"$(_json_escape "$pr_branch")\""; fi
  if [[ -n "$pr_state" ]]; then body+=",\"prState\":\"$(_json_escape "$pr_state")\""; fi
  body+="}"

  _fleet_call POST "/fleet/jobs/$jid/lease/release" "$body"
  return 0
}
|
||||
|
||||
# fleet_renew_active — walk every *.md in $BUILDING and renew the lease of each
# one that is a fleet job. Renewal output and failures are discarded; always
# returns 0. No-op when fleet is off.
fleet_renew_active() {
  fleet_enabled || return 0
  local path name
  for path in "$BUILDING"/*.md; do
    # An unmatched glob stays literal; skip it.
    if [[ ! -e "$path" ]]; then
      continue
    fi
    name=$(basename "$path")
    name=${name%.md}
    if _fleet_is_job "$name"; then
      fleet_lease_renew "$name" >/dev/null 2>&1 || true
    fi
  done
  return 0
}
|
||||
|
||||
# fleet_release_all_active — walk every *.md in $BUILDING and release the lease
# of each fleet job (e.g. on daemon shutdown) so the coordinator can reclaim
# them at once instead of waiting for the lease to time out. Output and failures
# are discarded; always returns 0. No-op when fleet is off. Same shape as
# fleet_renew_active, with release in place of renew.
fleet_release_all_active() {
  fleet_enabled || return 0
  local path name
  for path in "$BUILDING"/*.md; do
    # An unmatched glob stays literal; skip it.
    if [[ ! -e "$path" ]]; then
      continue
    fi
    name=$(basename "$path")
    name=${name%.md}
    if _fleet_is_job "$name"; then
      fleet_lease_release "$name" >/dev/null 2>&1 || true
    fi
  done
  return 0
}
|
||||
|
||||
# fleet_quarantine <job> <file> <metaf> <logf> — park a fenced (reclaimed) job
# for human triage instead of shipping it (§9 split-brain):
#   * appends an explanation to the job log <logf>,
#   * moves the job file <file> into $FAILED (if it still exists),
#   * records result=fenced_quarantine, fleet_quarantined=1 and the end time in <metaf>,
#   * logs an error naming the job.
fleet_quarantine() {
  local job=$1 file=$2 metaf=$3 logf=$4
  printf '%s\n' \
    "FLEET FENCED — the coordinator reclaimed this job (stale leaseEpoch)." \
    "Quarantining the local result — NOT shipping/merging. Needs human triage. ($(date))" \
    >> "$logf"
  if [[ -e "$file" ]]; then
    mv "$file" "$FAILED/" 2>/dev/null
  fi
  printf '%s\n' "result=fenced_quarantine" "fleet_quarantined=1" "ended=$(date +%s)" >> "$metaf"
  err "fleet: quarantined $job (fenced/reclaimed) — surfaced for human triage"
}
|
||||
|
||||
# _fleet_stage_for <result> — print the coordinator stage for a local job
# result/stage: shipped, testing and review map to themselves, every known
# failure result maps to "failed", and anything else maps to "building".
_fleet_stage_for() {
  local stage
  case "$1" in
    shipped | testing | review)
      stage=$1
      ;;
    failed | timeout | verify_failed | retries_exhausted | capability_mismatch | no_engine | rejected)
      stage=failed
      ;;
    *)
      stage=building
      ;;
  esac
  echo "$stage"
}
|
||||
|
||||
# ── Slice 4: shadow / dual-run (strictly side-effect-free on real job state) ──
|
||||
|
||||
# _fleet_shadow_log — print the shadow-divergence log path: AQ_FLEET_SHADOW_LOG
# when set and non-empty, otherwise $STATE/fleet-shadow.log.
_fleet_shadow_log() {
  local path="${AQ_FLEET_SHADOW_LOG:-$STATE/fleet-shadow.log}"
  printf '%s\n' "$path"
}
|
||||
|
||||
# fleet_shadow_claim — probe the coordinator for the job it WOULD assign to this
# factory's capabilities, without acting on the answer. Safeguards:
#   * the request uses the ISOLATED shadow factory id, never the real one;
#   * the request is flagged "dryRun":true,"shadow":true;
#   * if a job (and therefore possibly a real lease) comes back anyway, a release
#     is sent straight away so nothing stays assigned;
#   * nothing is materialized, run, reported or shipped locally.
# Result: SHADOW_COORD_JOB holds the would-be job id ("" when none). A non-2xx
# answer is appended to the shadow log as SHADOW_ERROR. Always returns 0 —
# shadow must never fail a real job. No-op unless shadow mode is active.
fleet_shadow_claim() {
  SHADOW_COORD_JOB=""
  if ! fleet_shadow_enabled; then
    return 0
  fi

  local caps payload
  caps=$(fleet_detect_caps)
  payload="{\"factoryId\":\"$(_json_escape "$AQ_FLEET_SHADOW_FACTORY_ID")\",\"capabilities\":$caps,\"leaseSeconds\":${AQ_FLEET_LEASE_SECONDS:-900},\"dryRun\":true,\"shadow\":true}"
  _fleet_call POST "/fleet/claim" "$payload"

  if [[ "$FLEET_CODE" != 2* ]]; then
    printf '%s\t%s\t%s\t%s\n' "$(date +%s)" "<none>" "<none>" "SHADOW_ERROR(claim:HTTP_${FLEET_CODE:-error})" \
      >> "$(_fleet_shadow_log)" 2>/dev/null || true
    return 0
  fi

  if ! printf '%s' "$FLEET_BODY" | grep -q '"claimed"[[:space:]]*:[[:space:]]*true'; then
    return 0
  fi

  local coord_id coord_epoch
  coord_id=$(printf '%s' "$FLEET_BODY" | _json_str id)
  coord_epoch=$(printf '%s' "$FLEET_BODY" | _fleet_json_num leaseEpoch)
  SHADOW_COORD_JOB="$coord_id"

  # A coordinator without dry-run support may have created a REAL lease; give it
  # back immediately so the probe leaves no residue. Response is ignored.
  if [[ -n "$coord_id" ]]; then
    _fleet_call POST "/fleet/jobs/$coord_id/lease/release" "{\"leaseEpoch\":${coord_epoch:-0},\"shadow\":true}" >/dev/null 2>&1 || true
  fi
  return 0
}
|
||||
|
||||
# fleet_shadow_compare <localJobId> <coordJobId> — classify the local
# (authoritative) pick against the coordinator's would-be pick and append one
# tab-separated line (ts, localJob, coordJob, verdict) to the shadow log.
#   only local set   -> COORD_EMPTY
#   only coord set   -> LOCAL_EMPTY
#   both, identical  -> AGREE
#   both, different  -> DIVERGE
#   neither          -> nothing is written
# Always returns 0; no-op unless shadow mode is active.
fleet_shadow_compare() {
  fleet_shadow_enabled || return 0
  local lj=${1:-} cj=${2:-} verdict
  # Encode which sides are present as "L", "C", "LC" or "".
  case "${lj:+L}${cj:+C}" in
    "") return 0 ;;
    L) verdict=COORD_EMPTY ;;
    C) verdict=LOCAL_EMPTY ;;
    *)
      if [[ "$lj" == "$cj" ]]; then
        verdict=AGREE
      else
        verdict=DIVERGE
      fi
      ;;
  esac
  printf '%s\t%s\t%s\t%s\n' "$(date +%s)" "${lj:-<none>}" "${cj:-<none>}" "$verdict" \
    >> "$(_fleet_shadow_log)" 2>/dev/null || true
  return 0
}
|
||||
|
||||
# fleet_shadow_report <localJobId> <coordJobId> [stage] — exercise the report
# path in shadow mode: PATCH the would-be coordinator job with the stage
# (default "building") flagged "shadow":true,"dryRun":true. The response is
# never acted on; a non-2xx status is only appended to the shadow log as
# SHADOW_REPORT_DIVERGE. Does nothing without a coordinator job id or outside
# shadow mode. Always returns 0.
fleet_shadow_report() {
  fleet_shadow_enabled || return 0
  local lj=${1:-} cj=${2:-} stage=${3:-building}
  if [[ -z "$cj" ]]; then
    return 0
  fi
  _fleet_call PATCH "/fleet/jobs/$cj" "{\"stage\":\"$(_json_escape "$stage")\",\"shadow\":true,\"dryRun\":true}"
  if [[ "${FLEET_CODE:-}" != 2* ]]; then
    printf '%s\t%s\t%s\t%s\n' "$(date +%s)" "${lj:-<none>}" "${cj:-<none>}" "SHADOW_REPORT_DIVERGE(HTTP_${FLEET_CODE:-error})" \
      >> "$(_fleet_shadow_log)" 2>/dev/null || true
  fi
  return 0
}
|
||||
|
||||
# fleet-shadow-report [N] — summarize the shadow log: a count per verdict, the
# total, the agreement rate over real decisions (AGREE / DIVERGE / COORD_EMPTY /
# LOCAL_EMPTY), then the last N (default 10) non-AGREE events. Only reads the
# log; when it is missing or empty, prints a hint on how to start recording.
# Always returns 0.
cmd_fleet_shadow_report() {
  ensure_dirs
  local n=10 logf
  logf=$(_fleet_shadow_log)
  if [[ "${1:-}" =~ ^[0-9]+$ ]]; then
    n=$1
  fi

  if [[ ! -s "$logf" ]]; then
    log "fleet shadow: no shadow log yet ($logf)."
    log "fleet shadow: run with AQ_FLEET=1 AQ_FLEET_ROUTE=0 AQ_FLEET_SHADOW=1 to record divergence."
    return 0
  fi

  log "fleet shadow report ($logf):"
  # The verdict is field 4; a "(...)" detail suffix is stripped before counting.
  awk -F'\t' '
    { v=$4; sub(/\(.*/, "", v); c[v]++; tot++
      if (v=="AGREE") ag++
      if (v=="AGREE"||v=="DIVERGE"||v=="COORD_EMPTY"||v=="LOCAL_EMPTY") dec++ }
    END {
      split("AGREE DIVERGE COORD_EMPTY LOCAL_EMPTY SHADOW_ERROR SHADOW_REPORT_DIVERGE", ord, " ")
      for (i=1; i<=6; i++) printf " %-22s %d\n", ord[i], c[ord[i]]+0
      printf " %-22s %d\n", "TOTAL", tot+0
      if (dec>0) printf " %-22s %d%%\n", "AGREEMENT", int(100*ag/dec)
    }' "$logf"

  log "last $n divergence/error events:"
  local tab
  tab=$(printf '\t')
  grep -E "${tab}(DIVERGE|COORD_EMPTY|LOCAL_EMPTY|SHADOW_ERROR|SHADOW_REPORT_DIVERGE)" "$logf" 2>/dev/null \
    | tail -n "$n" \
    | awk -F'\t' '{ printf " ts=%s local=%s coord=%s verdict=%s\n", $1, $2, $3, $4 }' || true
  return 0
}
|
||||
|
||||
# fleet-status — print this factory's identity, API base, resolved flags and
# capabilities, explain the active routing mode, then send one heartbeat (which
# doubles as registration) and report whether the coordinator answered.
# With AQ_FLEET off it only prints that the queue is running offline.
cmd_fleet_status() {
  ensure_dirs
  fleet_enabled || {
    log "fleet: AQ_FLEET is off — running in offline git-queue mode (no coordinator)."
    return 0
  }

  log "fleet: factory=$C_BOLD$AQ_FACTORY_ID$C_RESET api=$AQ_FLEET_API"
  log "fleet: flags=$(fleet_flags_state)"
  fleet_flags_warn_once
  log "fleet: capabilities=$(fleet_detect_caps)"

  # Shadow implies ROUTE=0, so it is checked first; the plain ROUTE-off note is
  # only shown when shadow is not active.
  if fleet_shadow_enabled; then
    log "fleet: SHADOW/dual-run mode — local inbox is authoritative; coordinator queried for comparison only (never acted on)."
  else
    if ! fleet_route_enabled; then
      log "fleet: ROUTE off — local inbox is authoritative; coordinator not used to source work."
    fi
  fi

  if ! fleet_heartbeat; then
    err "fleet: coordinator unreachable — would run in offline-degrade mode."
  else
    log "fleet: heartbeat OK (registered)."
  fi
}
|
||||
@ -1,218 +0,0 @@
|
||||
// fleet-dash.mjs — read/act adapter that re-points the agent-queue TUI dashboard
|
||||
// at the platform-service `/fleet` REST API (roadmap Phase 3: "TUI dashboard
|
||||
// re-pointed at /fleet API (parity)").
|
||||
//
|
||||
// This module is intentionally pure-ish and dependency-injectable (the HTTP
|
||||
// `fetchImpl` is a parameter) so it is unit-testable WITHOUT a live service.
|
||||
// dashboard.mjs uses it ONLY when AQ_FLEET_DASH=1; otherwise the dashboard's
|
||||
// local-queue behavior is byte-for-byte unchanged.
|
||||
//
|
||||
// Auth + scoping mirror agent-queue/lib/fleet-client.sh:
|
||||
// base URL AQ_FLEET_API (already includes /api)
|
||||
// bearer AQ_FLEET_TOKEN
|
||||
// product X-Product-Id: AQ_PRODUCT_ID (sent on every request)
|
||||
|
||||
const DEFAULT_TIMEOUT_MS = 8000;
|
||||
|
||||
// fleetConfig(env) — read the dashboard's fleet settings from an environment map.
//   enabled   true only when AQ_FLEET_DASH is exactly "1" (explicit opt-in)
//   api       AQ_FLEET_API with any trailing slashes removed
//   token     AQ_FLEET_TOKEN
//   productId AQ_PRODUCT_ID
//   missing   names of the required variables that are empty (only collected
//             when enabled, so the dashboard can fail visibly)
//   ok        enabled and nothing missing
export function fleetConfig(env = process.env) {
  const read = (key) => String(env[key] || '');

  const enabled = read('AQ_FLEET_DASH') === '1';
  const api = read('AQ_FLEET_API').replace(/\/+$/, '');
  const token = read('AQ_FLEET_TOKEN');
  const productId = read('AQ_PRODUCT_ID');

  const required = [
    ['AQ_FLEET_API', api],
    ['AQ_FLEET_TOKEN', token],
    ['AQ_PRODUCT_ID', productId],
  ];
  const missing = enabled
    ? required.filter(([, value]) => !value).map(([name]) => name)
    : [];

  return { enabled, api, token, productId, ok: enabled && missing.length === 0, missing };
}
|
||||
|
||||
// ── stage mapping ───────────────────────────────────────────────────────────
// Fleet stages collapse onto the local board's kanban buckets so the dashboard's
// existing layout/gating/STAGE_TAG logic can be reused unchanged.
const STAGE_BUCKET = {
  queued: 'inbox',
  assigned: 'building',
  building: 'building',
  review: 'review',
  testing: 'testing',
  shipped: 'shipped',
  failed: 'failed',
  dead_letter: 'failed',
};

// mapStage(s) — bucket name for a fleet stage; anything unrecognized lands in
// 'inbox'. Only STAGE_BUCKET's OWN keys count: a plain `STAGE_BUCKET[s]` lookup
// would also resolve inherited members (a stage called "constructor" or
// "toString" would yield a function instead of a bucket name).
export const mapStage = (s) =>
  (Object.prototype.hasOwnProperty.call(STAGE_BUCKET, s) ? STAGE_BUCKET[s] : 'inbox');
|
||||
|
||||
// Display rank of each actionable bucket on the board (lower sorts first).
const ITEM_ORDER = { review: 0, testing: 1, failed: 2, inbox: 3 };
// Buckets an operator can act on from the dashboard (parity with the local
// ACTION_STAGES = review · testing · failed · inbox).
const ACTIONABLE = new Set(Object.keys(ITEM_ORDER));
// Server stages that count as "currently running".
const RUNNING_STAGES = new Set(['assigned', 'building']);

// trackerOf(j) — first non-empty tracker reference on a job:
// trackerItemId, then trackerItem, then data.trackerItem; '' when none is set.
const trackerOf = (j) => {
  const direct = j.trackerItemId || j.trackerItem;
  if (direct) return direct;
  const nested = j.data && j.data.trackerItem;
  return nested || '';
};

// capsOf(j) — capabilities as display text: arrays are joined with ", ",
// anything else is stringified ('' when absent).
const capsOf = (j) => {
  const caps = j.capabilities;
  if (Array.isArray(caps)) return caps.join(', ');
  return String(caps || '');
};
|
||||
|
||||
// toBoard({jobs, factories, metrics}) — normalize the raw API payloads into the
|
||||
// shape the dashboard renders. Pure (no I/O), so it is fully unit-testable.
|
||||
export function toBoard({ jobs = [], factories = [], metrics = null } = {}) {
|
||||
const counts = { inbox: 0, building: 0, review: 0, testing: 0, shipped: 0, failed: 0 };
|
||||
const items = [];
|
||||
const running = [];
|
||||
const recent = [];
|
||||
for (const j of jobs) {
|
||||
const bucket = mapStage(j.stage);
|
||||
if (counts[bucket] !== undefined) counts[bucket] += 1;
|
||||
const norm = {
|
||||
// `stage` is the bucket so the dashboard's gate()/STAGE_TAG work unchanged;
|
||||
// `fleetStage` keeps the true server stage for display.
|
||||
stage: bucket,
|
||||
fleetStage: j.stage,
|
||||
id: j.id,
|
||||
priority: j.priority || 'medium',
|
||||
profile: j.profile || '',
|
||||
capabilities: capsOf(j),
|
||||
tracker_item: trackerOf(j),
|
||||
leaseEpoch: j.leaseEpoch,
|
||||
factoryId: j.factoryId || j.leaseFactoryId || '',
|
||||
attempts: j.attempts,
|
||||
updatedAt: j.updatedAt || j.createdAt || '',
|
||||
raw: j,
|
||||
};
|
||||
if (RUNNING_STAGES.has(j.stage)) running.push(norm);
|
||||
if (ACTIONABLE.has(bucket)) items.push(norm);
|
||||
if (bucket === 'shipped' || bucket === 'failed') recent.push(norm);
|
||||
}
|
||||
items.sort((a, b) => (ITEM_ORDER[a.stage] - ITEM_ORDER[b.stage]) || cmp(a.id, b.id));
|
||||
recent.sort((a, b) => cmp(String(b.updatedAt), String(a.updatedAt)));
|
||||
return { counts, items, running, recent: recent.slice(0, 5), factories, metrics };
|
||||
}
|
||||
|
||||
// cmp(a, b) — three-way comparison using `<` / `>`: -1, 1, or 0 when neither holds.
const cmp = (a, b) => {
  if (a < b) return -1;
  if (a > b) return 1;
  return 0;
};
|
||||
|
||||
// ── HTTP ──────────────────────────────────────────────────────────────────--
|
||||
// fleetFetch — a single request against the coordinator. NEVER throws: network
|
||||
// errors / timeouts / non-JSON bodies are returned as a structured result so the
|
||||
// TUI stays responsive. A timeout is enforced via AbortController.
|
||||
export async function fleetFetch(cfg, pathname, opts = {}, fetchImpl = globalThis.fetch) {
|
||||
const url = `${cfg.api}${pathname}`;
|
||||
const headers = {
|
||||
Authorization: `Bearer ${cfg.token}`,
|
||||
'X-Product-Id': cfg.productId,
|
||||
Accept: 'application/json',
|
||||
};
|
||||
const hasBody = opts.body !== undefined;
|
||||
if (hasBody) headers['Content-Type'] = 'application/json';
|
||||
const ac = new AbortController();
|
||||
const timer = setTimeout(() => ac.abort(), opts.timeoutMs || DEFAULT_TIMEOUT_MS);
|
||||
try {
|
||||
const res = await fetchImpl(url, {
|
||||
method: opts.method || 'GET',
|
||||
headers,
|
||||
body: hasBody ? JSON.stringify(opts.body) : undefined,
|
||||
signal: ac.signal,
|
||||
});
|
||||
let json = null;
|
||||
let text = '';
|
||||
try { text = await res.text(); } catch { /* ignore body read errors */ }
|
||||
if (text) { try { json = JSON.parse(text); } catch { json = null; } }
|
||||
return { ok: !!res.ok, status: res.status, json };
|
||||
} catch (e) {
|
||||
const timedOut = e && (e.name === 'AbortError' || e.code === 'ABORT_ERR');
|
||||
return { ok: false, status: 0, json: null, error: timedOut ? 'timeout' : (e && e.message) || 'network error' };
|
||||
} finally {
|
||||
clearTimeout(timer);
|
||||
}
|
||||
}
|
||||
|
||||
// fetchBoard — assemble the board model. Jobs are REQUIRED (failure ⇒ board
// fails, and nothing else is requested). Metrics + factories are best-effort:
// a missing/404/501 factories endpoint degrades to [] (it is optional
// server-side), and absent metrics simply omits the aggregate panel.
// Resolves to {ok:true, board} or {ok:false, error}.
export async function fetchBoard(cfg, fetchImpl = globalThis.fetch) {
  const jobsRes = await fleetFetch(cfg, '/fleet/jobs', {}, fetchImpl);
  if (!jobsRes.ok || !jobsRes.json) {
    return { ok: false, error: jobsRes.error || `jobs HTTP ${jobsRes.status}` };
  }
  const jobs = Array.isArray(jobsRes.json.jobs) ? jobsRes.json.jobs : [];

  // The two optional lookups do not depend on each other, so they run
  // concurrently: a slow or unreachable coordinator then costs one timeout
  // here instead of two back to back. fleetFetch resolves on every failure,
  // so Promise.all cannot reject because of it.
  const [metricsRes, facRes] = await Promise.all([
    fleetFetch(cfg, '/fleet/metrics', {}, fetchImpl),
    fleetFetch(cfg, '/fleet/factories', {}, fetchImpl),
  ]);
  const metrics = metricsRes.ok && metricsRes.json ? metricsRes.json : null;
  const factories = facRes.ok && facRes.json && Array.isArray(facRes.json.factories)
    ? facRes.json.factories
    : [];

  return { ok: true, board: toBoard({ jobs, factories, metrics }) };
}
|
||||
|
||||
// formatEvent — render one fleet event as a single TUI log line:
// "<time> <type> <actor> <json data>". The time is omitted without `at`, the
// type shows as "?" when missing, and actor / data are appended only when set
// (data only when it is a non-empty object).
export function formatEvent(e) {
  let line = e.at ? safeTime(e.at) : '';
  line += ` ${e.type || '?'}`;
  if (e.actor) line += ` ${e.actor}`;

  const payload = e.data;
  if (payload && typeof payload === 'object' && Object.keys(payload).length) {
    line += ` ${JSON.stringify(payload)}`;
  }
  return line.trim();
}
|
||||
|
||||
// safeTime(iso) — local time-of-day text for a timestamp; when the value does
// not parse as a date it is returned stringified instead.
const safeTime = (iso) => {
  const parsed = new Date(iso);
  if (Number.isNaN(parsed.getTime())) return String(iso);
  return parsed.toLocaleTimeString();
};
|
||||
|
||||
// fetchEvents — load a job's event stream and render it as log lines.
// Resolves to {ok:true, lines} on success (an absent/non-array `events` field
// gives no lines) or {ok:false, error, lines: []} when the request failed or
// returned no JSON.
export async function fetchEvents(cfg, jobId, fetchImpl = globalThis.fetch) {
  const path = `/fleet/jobs/${encodeURIComponent(jobId)}/events`;
  const { ok, status, json, error } = await fleetFetch(cfg, path, {}, fetchImpl);

  if (ok && json) {
    const events = Array.isArray(json.events) ? json.events : [];
    return { ok: true, lines: events.map(formatEvent) };
  }
  return { ok: false, error: error || `events HTTP ${status}`, lines: [] };
}
|
||||
|
||||
// Operator verbs the dashboard supports in fleet mode. `promote` has no safe
|
||||
// server contract (client-inferred stage transitions could violate workflow
|
||||
// invariants), so it is explicitly unavailable here.
|
||||
const FLEET_VERBS = new Set(['ship', 'requeue', 'reject']);
|
||||
|
||||
// jobAction — execute an operator verb against the coordinator.
|
||||
// ship → re-GET the job for a FRESH leaseEpoch, then PATCH stage=shipped
|
||||
// (a stale snapshot epoch would be fenced with 409).
|
||||
// requeue → POST /actions/requeue (lease-free operator action)
|
||||
// reject → POST /actions/reject
|
||||
// Returns {ok, message}; never throws.
|
||||
export async function jobAction(cfg, item, verb, fetchImpl = globalThis.fetch) {
|
||||
if (verb === 'promote') return { ok: false, message: 'promote is not available in fleet mode' };
|
||||
if (!FLEET_VERBS.has(verb)) return { ok: false, message: `${verb} not supported in fleet mode` };
|
||||
const id = item && item.id;
|
||||
if (!id) return { ok: false, message: 'no job selected' };
|
||||
|
||||
if (verb === 'ship') {
|
||||
const cur = await fleetFetch(cfg, `/fleet/jobs/${encodeURIComponent(id)}`, {}, fetchImpl);
|
||||
if (!cur.ok || !cur.json) return { ok: false, message: cur.error || `job HTTP ${cur.status}` };
|
||||
const res = await fleetFetch(
|
||||
cfg,
|
||||
`/fleet/jobs/${encodeURIComponent(id)}`,
|
||||
{ method: 'PATCH', body: { stage: 'shipped', leaseEpoch: cur.json.leaseEpoch } },
|
||||
fetchImpl,
|
||||
);
|
||||
if (res.status === 409) return { ok: false, message: 'job changed (fenced) — refresh and retry' };
|
||||
if (!res.ok) return { ok: false, message: res.error || `ship HTTP ${res.status}` };
|
||||
return { ok: true, message: `shipped ${id}` };
|
||||
}
|
||||
|
||||
const res = await fleetFetch(
|
||||
cfg,
|
||||
`/fleet/jobs/${encodeURIComponent(id)}/actions/${verb}`,
|
||||
{ method: 'POST', body: {} },
|
||||
fetchImpl,
|
||||
);
|
||||
if (res.status === 409) return { ok: false, message: 'job conflict/terminal — refresh and retry' };
|
||||
if (!res.ok) return { ok: false, message: res.error || `${verb} HTTP ${res.status}` };
|
||||
return { ok: true, message: `${verb} ${id}` };
|
||||
}
|
||||
@ -1,287 +0,0 @@
|
||||
// fleet-dash.test.mjs — dependency-light unit tests for the fleet-mode dashboard
|
||||
// adapter. Uses node:assert only (no test framework), matching the repo style.
|
||||
// Run: `node fleet-dash.test.mjs` (also wired into selftest.sh).
|
||||
//
|
||||
// These tests prove the dashboard's CONTRACT ASSUMPTIONS against the /fleet API
|
||||
// (request shaping, response mapping, graceful degradation, action semantics)
|
||||
// via an injected fetch stub. They do NOT prove live server compatibility.
|
||||
|
||||
import assert from 'node:assert/strict';
|
||||
import {
|
||||
fleetConfig,
|
||||
mapStage,
|
||||
toBoard,
|
||||
fleetFetch,
|
||||
fetchBoard,
|
||||
fetchEvents,
|
||||
formatEvent,
|
||||
jobAction,
|
||||
} from './fleet-dash.mjs';
|
||||
|
||||
let passed = 0;
|
||||
const t = (name, fn) => {
|
||||
try {
|
||||
const r = fn();
|
||||
if (r && typeof r.then === 'function') {
|
||||
return r.then(
|
||||
() => { passed += 1; },
|
||||
(e) => { console.error(` ✗ ${name}\n ${e && e.message}`); process.exitCode = 1; },
|
||||
);
|
||||
}
|
||||
passed += 1;
|
||||
} catch (e) {
|
||||
console.error(` ✗ ${name}\n ${e && e.message}`);
|
||||
process.exitCode = 1;
|
||||
}
|
||||
return undefined;
|
||||
};
|
||||
|
||||
// A recording fetch stub. `routes` maps a matcher → {status, body} (or a fn).
|
||||
function makeFetch(routes) {
|
||||
const calls = [];
|
||||
const fetchImpl = async (url, opts = {}) => {
|
||||
calls.push({ url, opts, headers: opts.headers || {}, method: opts.method || 'GET' });
|
||||
let entry = routes[url];
|
||||
if (!entry) {
|
||||
// try suffix match (path only)
|
||||
const key = Object.keys(routes).find((k) => url.endsWith(k));
|
||||
entry = key ? routes[key] : undefined;
|
||||
}
|
||||
if (typeof entry === 'function') entry = entry({ url, opts });
|
||||
if (entry === undefined) return mkRes(404, '{}');
|
||||
if (entry.throw) throw Object.assign(new Error(entry.throw), { name: entry.name || 'Error' });
|
||||
return mkRes(entry.status ?? 200, entry.body ?? '');
|
||||
};
|
||||
fetchImpl.calls = calls;
|
||||
return fetchImpl;
|
||||
}
|
||||
const mkRes = (status, body) => ({
|
||||
ok: status >= 200 && status < 300,
|
||||
status,
|
||||
text: async () => (typeof body === 'string' ? body : JSON.stringify(body)),
|
||||
});
|
||||
|
||||
const CFG = { enabled: true, ok: true, api: 'http://svc/api', token: 'tok', productId: 'prodX', missing: [] };
|
||||
|
||||
await (async () => {
|
||||
// ── fleetConfig ──
|
||||
t('fleetConfig: AQ_FLEET_DASH unset ⇒ disabled', () => {
|
||||
const c = fleetConfig({});
|
||||
assert.equal(c.enabled, false);
|
||||
assert.equal(c.ok, false);
|
||||
});
|
||||
t('fleetConfig: enabled but missing config ⇒ not ok, lists missing', () => {
|
||||
const c = fleetConfig({ AQ_FLEET_DASH: '1' });
|
||||
assert.equal(c.enabled, true);
|
||||
assert.equal(c.ok, false);
|
||||
assert.deepEqual(c.missing.sort(), ['AQ_FLEET_API', 'AQ_FLEET_TOKEN', 'AQ_PRODUCT_ID'].sort());
|
||||
});
|
||||
t('fleetConfig: enabled + complete ⇒ ok, trims trailing slash', () => {
|
||||
const c = fleetConfig({ AQ_FLEET_DASH: '1', AQ_FLEET_API: 'http://svc/api/', AQ_FLEET_TOKEN: 'k', AQ_PRODUCT_ID: 'p' });
|
||||
assert.equal(c.ok, true);
|
||||
assert.equal(c.api, 'http://svc/api');
|
||||
});
|
||||
|
||||
// ── mapStage ──
|
||||
t('mapStage: fleet stages collapse to board buckets', () => {
|
||||
assert.equal(mapStage('queued'), 'inbox');
|
||||
assert.equal(mapStage('assigned'), 'building');
|
||||
assert.equal(mapStage('building'), 'building');
|
||||
assert.equal(mapStage('review'), 'review');
|
||||
assert.equal(mapStage('testing'), 'testing');
|
||||
assert.equal(mapStage('shipped'), 'shipped');
|
||||
assert.equal(mapStage('failed'), 'failed');
|
||||
assert.equal(mapStage('dead_letter'), 'failed');
|
||||
assert.equal(mapStage('weird'), 'inbox');
|
||||
});
|
||||
|
||||
// ── toBoard ──
|
||||
t('toBoard: counts, actionable items, running, recent', () => {
|
||||
const jobs = [
|
||||
{ id: 'a', stage: 'queued', priority: 'high', capabilities: ['os:any'] },
|
||||
{ id: 'b', stage: 'building', priority: 'critical', factoryId: 'mac-1', leaseEpoch: 3 },
|
||||
{ id: 'c', stage: 'review', updatedAt: '2026-01-01T00:00:02Z' },
|
||||
{ id: 'd', stage: 'testing' },
|
||||
{ id: 'e', stage: 'shipped', updatedAt: '2026-01-01T00:00:09Z' },
|
||||
{ id: 'f', stage: 'failed', updatedAt: '2026-01-01T00:00:05Z' },
|
||||
{ id: 'g', stage: 'dead_letter', updatedAt: '2026-01-01T00:00:01Z' },
|
||||
];
|
||||
const b = toBoard({ jobs });
|
||||
assert.equal(b.counts.inbox, 1);
|
||||
assert.equal(b.counts.building, 1);
|
||||
assert.equal(b.counts.review, 1);
|
||||
assert.equal(b.counts.testing, 1);
|
||||
assert.equal(b.counts.shipped, 1);
|
||||
assert.equal(b.counts.failed, 2); // failed + dead_letter
|
||||
// running = assigned/building only
|
||||
assert.deepEqual(b.running.map((x) => x.id), ['b']);
|
||||
assert.equal(b.running[0].fleetStage, 'building');
|
||||
assert.equal(b.running[0].factoryId, 'mac-1');
|
||||
// actionable items exclude building/shipped, ordered review<testing<failed<inbox
|
||||
assert.deepEqual(b.items.map((x) => x.id), ['c', 'd', 'f', 'g', 'a']);
|
||||
// item.stage is the bucket (so dashboard gate()/STAGE_TAG reuse works)
|
||||
assert.equal(b.items[0].stage, 'review');
|
||||
assert.equal(b.items[4].stage, 'inbox');
|
||||
// recent = shipped+failed, newest first, capped at 5
|
||||
assert.deepEqual(b.recent.map((x) => x.id), ['e', 'f', 'g']);
|
||||
});
|
||||
|
||||
// ── fleetFetch: headers + product scoping on every request ──
|
||||
await t('fleetFetch: sends bearer + X-Product-Id; parses JSON', async () => {
|
||||
const f = makeFetch({ '/fleet/jobs': { status: 200, body: { jobs: [] } } });
|
||||
const r = await fleetFetch(CFG, '/fleet/jobs', {}, f);
|
||||
assert.equal(r.ok, true);
|
||||
assert.deepEqual(r.json, { jobs: [] });
|
||||
const h = f.calls[0].headers;
|
||||
assert.equal(h.Authorization, 'Bearer tok');
|
||||
assert.equal(h['X-Product-Id'], 'prodX');
|
||||
assert.equal(f.calls[0].url, 'http://svc/api/fleet/jobs');
|
||||
});
|
||||
await t('fleetFetch: network error ⇒ ok:false with message (no throw)', async () => {
|
||||
const f = makeFetch({ '/fleet/jobs': { throw: 'boom' } });
|
||||
const r = await fleetFetch(CFG, '/fleet/jobs', {}, f);
|
||||
assert.equal(r.ok, false);
|
||||
assert.equal(r.status, 0);
|
||||
assert.match(r.error, /boom/);
|
||||
});
|
||||
await t('fleetFetch: abort ⇒ timeout error', async () => {
|
||||
const f = makeFetch({ '/fleet/jobs': { throw: 'aborted', name: 'AbortError' } });
|
||||
const r = await fleetFetch(CFG, '/fleet/jobs', {}, f);
|
||||
assert.equal(r.ok, false);
|
||||
assert.equal(r.error, 'timeout');
|
||||
});
|
||||
await t('fleetFetch: non-JSON 500 body ⇒ ok:false, json null', async () => {
|
||||
const f = makeFetch({ '/fleet/jobs': { status: 500, body: '<html>err</html>' } });
|
||||
const r = await fleetFetch(CFG, '/fleet/jobs', {}, f);
|
||||
assert.equal(r.ok, false);
|
||||
assert.equal(r.status, 500);
|
||||
assert.equal(r.json, null);
|
||||
});
|
||||
|
||||
// ── fetchBoard: assembly + degradation ──
|
||||
await t('fetchBoard: jobs + metrics + factories assembled', async () => {
|
||||
const f = makeFetch({
|
||||
'/fleet/jobs': { body: { jobs: [{ id: 'a', stage: 'review' }] } },
|
||||
'/fleet/metrics': { body: { utilizationPct: 50, alerts: [] } },
|
||||
'/fleet/factories': { body: { factories: [{ factoryId: 'mac-1', health: 'ok' }] } },
|
||||
});
|
||||
const r = await fetchBoard(CFG, f);
|
||||
assert.equal(r.ok, true);
|
||||
assert.equal(r.board.items.length, 1);
|
||||
assert.equal(r.board.metrics.utilizationPct, 50);
|
||||
assert.equal(r.board.factories.length, 1);
|
||||
});
|
||||
await t('fetchBoard: factories 404 ⇒ degrades to []', async () => {
|
||||
const f = makeFetch({
|
||||
'/fleet/jobs': { body: { jobs: [] } },
|
||||
'/fleet/metrics': { body: {} },
|
||||
'/fleet/factories': { status: 404, body: {} },
|
||||
});
|
||||
const r = await fetchBoard(CFG, f);
|
||||
assert.equal(r.ok, true);
|
||||
assert.deepEqual(r.board.factories, []);
|
||||
});
|
||||
await t('fetchBoard: factories 501 ⇒ degrades to []', async () => {
|
||||
const f = makeFetch({
|
||||
'/fleet/jobs': { body: { jobs: [] } },
|
||||
'/fleet/metrics': { body: {} },
|
||||
'/fleet/factories': { status: 501, body: {} },
|
||||
});
|
||||
const r = await fetchBoard(CFG, f);
|
||||
assert.equal(r.ok, true);
|
||||
assert.deepEqual(r.board.factories, []);
|
||||
});
|
||||
await t('fetchBoard: metrics failure ⇒ board still ok, metrics null', async () => {
|
||||
const f = makeFetch({
|
||||
'/fleet/jobs': { body: { jobs: [] } },
|
||||
'/fleet/metrics': { status: 500, body: 'oops' },
|
||||
'/fleet/factories': { body: { factories: [] } },
|
||||
});
|
||||
const r = await fetchBoard(CFG, f);
|
||||
assert.equal(r.ok, true);
|
||||
assert.equal(r.board.metrics, null);
|
||||
});
|
||||
await t('fetchBoard: jobs failure ⇒ board fails with error', async () => {
|
||||
const f = makeFetch({ '/fleet/jobs': { status: 503, body: '{}' } });
|
||||
const r = await fetchBoard(CFG, f);
|
||||
assert.equal(r.ok, false);
|
||||
assert.match(r.error, /503/);
|
||||
});
|
||||
|
||||
// ── events ──
|
||||
t('formatEvent: renders type + actor + data', () => {
|
||||
const line = formatEvent({ type: 'claimed', actor: 'mac-1', at: '2026-01-01T00:00:00Z', data: { leaseEpoch: 2 } });
|
||||
assert.match(line, /claimed/);
|
||||
assert.match(line, /mac-1/);
|
||||
assert.match(line, /leaseEpoch/);
|
||||
});
|
||||
await t('fetchEvents: maps events to lines', async () => {
|
||||
const f = makeFetch({
|
||||
'/events': { body: { events: [{ type: 'queued', at: '2026-01-01T00:00:00Z', data: {} }, { type: 'claimed', data: {} }] } },
|
||||
});
|
||||
const r = await fetchEvents(CFG, 'job-1', f);
|
||||
assert.equal(r.ok, true);
|
||||
assert.equal(r.lines.length, 2);
|
||||
assert.match(r.lines[1], /claimed/);
|
||||
assert.match(f.calls[0].url, /\/fleet\/jobs\/job-1\/events$/);
|
||||
});
|
||||
await t('fetchEvents: failure ⇒ ok:false, empty lines', async () => {
|
||||
const f = makeFetch({ '/events': { status: 500, body: 'x' } });
|
||||
const r = await fetchEvents(CFG, 'job-1', f);
|
||||
assert.equal(r.ok, false);
|
||||
assert.deepEqual(r.lines, []);
|
||||
});
|
||||
|
||||
// ── jobAction ──
|
||||
await t('jobAction: ship re-GETs fresh leaseEpoch then PATCHes shipped', async () => {
|
||||
let patchBody = null;
|
||||
const f = makeFetch({
|
||||
'http://svc/api/fleet/jobs/j1': ({ opts }) => {
|
||||
if ((opts.method || 'GET') === 'PATCH') { patchBody = JSON.parse(opts.body); return { status: 200, body: { id: 'j1', stage: 'shipped' } }; }
|
||||
return { status: 200, body: { id: 'j1', stage: 'testing', leaseEpoch: 7 } }; // fresh epoch
|
||||
},
|
||||
});
|
||||
const r = await jobAction(CFG, { id: 'j1', leaseEpoch: 2 /* stale */ }, 'ship', f);
|
||||
assert.equal(r.ok, true);
|
||||
assert.equal(patchBody.stage, 'shipped');
|
||||
assert.equal(patchBody.leaseEpoch, 7); // used the freshly-fetched epoch, not the stale 2
|
||||
});
|
||||
await t('jobAction: ship 409 ⇒ actionable fenced message', async () => {
|
||||
const f = makeFetch({
|
||||
'http://svc/api/fleet/jobs/j1': ({ opts }) => (opts.method === 'PATCH'
|
||||
? { status: 409, body: '{}' }
|
||||
: { status: 200, body: { id: 'j1', leaseEpoch: 7 } }),
|
||||
});
|
||||
const r = await jobAction(CFG, { id: 'j1' }, 'ship', f);
|
||||
assert.equal(r.ok, false);
|
||||
assert.match(r.message, /fenced|refresh/i);
|
||||
});
|
||||
await t('jobAction: requeue ⇒ POST /actions/requeue', async () => {
|
||||
const f = makeFetch({ '/fleet/jobs/j1/actions/requeue': { status: 200, body: { id: 'j1', stage: 'queued' } } });
|
||||
const r = await jobAction(CFG, { id: 'j1' }, 'requeue', f);
|
||||
assert.equal(r.ok, true);
|
||||
assert.equal(f.calls[0].method, 'POST');
|
||||
assert.match(f.calls[0].url, /\/actions\/requeue$/);
|
||||
});
|
||||
await t('jobAction: reject 409 ⇒ conflict message', async () => {
|
||||
const f = makeFetch({ '/fleet/jobs/j1/actions/reject': { status: 409, body: '{}' } });
|
||||
const r = await jobAction(CFG, { id: 'j1' }, 'reject', f);
|
||||
assert.equal(r.ok, false);
|
||||
assert.match(r.message, /conflict|terminal|refresh/i);
|
||||
});
|
||||
t('jobAction: promote ⇒ explicitly unavailable in fleet mode', async () => {
|
||||
return jobAction(CFG, { id: 'j1' }, 'promote', makeFetch({})).then((r) => {
|
||||
assert.equal(r.ok, false);
|
||||
assert.match(r.message, /promote/i);
|
||||
});
|
||||
});
|
||||
})();
|
||||
|
||||
// Summary line (selftest greps for PASS). Printed when the process exits:
// FAIL on stderr if any case set a non-zero exit code, otherwise PASS with the
// value of the `passed` counter.
process.on('exit', () => {
  const failed = Boolean(process.exitCode) && process.exitCode !== 0;
  if (!failed) {
    console.log(`fleet-dash.test PASS (${passed} assertions)`);
    return;
  }
  console.error('fleet-dash.test FAIL');
});
|
||||
@ -1,19 +0,0 @@
|
||||
---
|
||||
name: backend-engineer
|
||||
persona: |
|
||||
You are a senior backend engineer. Favor minimal, well-tested changes. Respect
|
||||
service boundaries, validate inputs, handle errors explicitly, and never log
|
||||
secrets. Prefer existing libraries and patterns over new dependencies. Keep
|
||||
migrations and API changes backward-compatible unless the task says otherwise.
|
||||
capabilities: [os:any, node>=20, has:pnpm]
|
||||
default-verify: pnpm -s typecheck && pnpm -s test
|
||||
engine-class: agentic-coder
|
||||
prefers-engine: [devin, claude]
|
||||
allowed-scope: ["backend/**", "services/**", "packages/**"]
|
||||
review-policy: manual
|
||||
---
|
||||
|
||||
# backend-engineer
|
||||
|
||||
Server-side work. Inherits a typecheck+test verify gate and a scope limited to
|
||||
backend/service/package code.
|
||||
@ -1,20 +0,0 @@
|
||||
---
|
||||
name: developer
|
||||
persona: |
|
||||
You are a pragmatic senior software engineer. Make the smallest correct change
|
||||
that satisfies the task. Match the surrounding code style and existing patterns,
|
||||
keep diffs focused, and never commit secrets. Add or update tests when you change
|
||||
behavior, and explain non-obvious decisions briefly in the commit message.
|
||||
capabilities: [os:any, has:git]
|
||||
default-verify:
|
||||
engine-class: agentic-coder
|
||||
prefers-engine: [devin, claude, codex]
|
||||
allowed-scope: ["**"]
|
||||
review-policy: manual
|
||||
---
|
||||
|
||||
# developer
|
||||
|
||||
General-purpose engineering profile. No default verify (parks in review for a
|
||||
human gate) and an unrestricted scope — pick a more specific profile when you
|
||||
want a tighter blast radius or an automatic QA gate.
|
||||
@ -1,18 +0,0 @@
|
||||
---
|
||||
name: docs-writer
|
||||
persona: |
|
||||
You are a technical writer. Produce clear, accurate documentation that matches
|
||||
the repository's existing voice and structure. Update READMEs, guides, and
|
||||
references; keep examples runnable and links valid. Do not change source code
|
||||
beyond doc comments. Never include secrets in examples.
|
||||
capabilities: [os:any]
|
||||
default-verify:
|
||||
engine-class: agentic-coder
|
||||
prefers-engine: [claude, devin]
|
||||
allowed-scope: ["docs/**", "**/*.md", "**/*.mdx"]
|
||||
review-policy: manual
|
||||
---
|
||||
|
||||
# docs-writer
|
||||
|
||||
Documentation profile. Scoped to docs + markdown; parks in review for a human read.
|
||||
@ -1,18 +0,0 @@
|
||||
---
|
||||
name: frontend-engineer
|
||||
persona: |
|
||||
You are a senior frontend engineer. Build accessible, responsive UI that matches
|
||||
the existing component library and design tokens. Keep state management simple,
|
||||
avoid unnecessary dependencies, and ensure type-safety. Verify the build and
|
||||
tests pass before finishing; never hardcode secrets or API keys.
|
||||
capabilities: [os:any, node>=20, has:pnpm]
|
||||
default-verify: pnpm -s typecheck && pnpm -s build
|
||||
engine-class: agentic-coder
|
||||
prefers-engine: [claude, devin]
|
||||
allowed-scope: ["dashboards/**", "apps/**", "packages/ui/**", "src/**"]
|
||||
review-policy: manual
|
||||
---
|
||||
|
||||
# frontend-engineer
|
||||
|
||||
Client/UI work. Inherits a typecheck+build gate and a UI-oriented scope.
|
||||
@ -1,19 +0,0 @@
|
||||
---
|
||||
name: planner
|
||||
persona: |
|
||||
You are a planning agent. Break an objective into a dependency-ordered set of
|
||||
small, well-scoped tasks, each mappable to a job .md (with a profile, scope, and
|
||||
verify). Output the plan as markdown; do not implement the tasks yourself.
|
||||
capabilities: [os:any]
|
||||
default-verify:
|
||||
engine-class: agentic-coder
|
||||
prefers-engine: [claude]
|
||||
allowed-scope: ["docs/**", "**/*.md"]
|
||||
review-policy: manual
|
||||
---
|
||||
|
||||
# planner (reserved)
|
||||
|
||||
Reserved for a future planning/decomposition flow that emits child jobs with
|
||||
`deps:` wiring. Usable today as a docs-scoped persona; automatic job emission is
|
||||
a later slice.
|
||||
@ -1,18 +0,0 @@
|
||||
---
|
||||
name: qa
|
||||
persona: |
|
||||
You are a QA engineer. Write and strengthen tests; reproduce bugs with a failing
|
||||
test first, then confirm the fix. Cover edge cases, error paths, and regressions.
|
||||
Do not weaken or delete existing tests to make a suite pass — fix the cause.
|
||||
Keep tests deterministic and fast.
|
||||
capabilities: [os:any, node>=20, has:pnpm]
|
||||
default-verify: pnpm -s test
|
||||
engine-class: agentic-coder
|
||||
prefers-engine: [codex, claude]
|
||||
allowed-scope: ["**/*.test.*", "**/*.spec.*", "test/**", "tests/**", "e2e/**"]
|
||||
review-policy: manual
|
||||
---
|
||||
|
||||
# qa
|
||||
|
||||
Test-focused profile. Inherits a `pnpm -s test` gate and a test-files scope.
|
||||
@ -1,19 +0,0 @@
|
||||
---
|
||||
name: reviewer
|
||||
persona: |
|
||||
You are a code reviewer. Do NOT modify code. Read the diff/changes and produce a
|
||||
concise review: correctness, security, tests, readability, and scope adherence.
|
||||
Flag risky or out-of-scope changes and supply-chain concerns (edits to shared
|
||||
packages). Output findings as markdown with severity labels.
|
||||
capabilities: [os:any, has:git]
|
||||
default-verify:
|
||||
engine-class: review-only
|
||||
prefers-engine: [claude]
|
||||
allowed-scope: ["docs/**", "**/*.md"]
|
||||
review-policy: manual
|
||||
---
|
||||
|
||||
# reviewer
|
||||
|
||||
Read-only review profile. `engine-class: review-only` has no concrete runner
|
||||
mapping yet (reserved) — use an explicit `engine:` until a review engine lands.
|
||||
@ -1,19 +0,0 @@
|
||||
---
|
||||
name: ui-designer
|
||||
persona: |
|
||||
You are a UI/visual designer. Focus on visual hierarchy, spacing, color, and
|
||||
typography using the existing design tokens and component library. Keep changes
|
||||
consistent with the design system, ensure sufficient contrast, and respect
|
||||
light/dark themes. Prefer token references over hardcoded values.
|
||||
capabilities: [os:any, node>=20]
|
||||
default-verify:
|
||||
engine-class: agentic-coder
|
||||
prefers-engine: [claude, devin]
|
||||
allowed-scope: ["packages/ui/**", "packages/design-tokens/**", "**/*.css", "design/**"]
|
||||
review-policy: manual
|
||||
---
|
||||
|
||||
# ui-designer
|
||||
|
||||
Visual/design-system work scoped to UI + tokens + styles. Parks in review for a
|
||||
human visual check.
|
||||
@ -1,19 +0,0 @@
|
||||
---
|
||||
name: ux-designer
|
||||
persona: |
|
||||
You are a UX designer. Focus on user flows, information architecture, and
|
||||
interaction states (empty, loading, error, success). Produce wireframes,
|
||||
flow descriptions, and copy as markdown/specs. Justify decisions with usability
|
||||
heuristics and accessibility (WCAG) considerations. Do not change production code.
|
||||
capabilities: [os:any]
|
||||
default-verify:
|
||||
engine-class: agentic-coder
|
||||
prefers-engine: [claude]
|
||||
allowed-scope: ["docs/**", "design/**", "**/*.md"]
|
||||
review-policy: manual
|
||||
---
|
||||
|
||||
# ux-designer
|
||||
|
||||
Flows, IA, and interaction specs. Documentation-scoped; parks in review for human
|
||||
sign-off (no automatic verify gate).
|
||||
File diff suppressed because it is too large
Load Diff
@ -39,7 +39,7 @@ The loader can be sourced from any directory. It discovers the `aliases/` folder
|
||||
## Requirements
|
||||
|
||||
- Supported shells: Bash and Zsh
|
||||
- Optional commands used by aliases: `git`, `tmux`, `tree`, `vim` or `$EDITOR`, and `caffeinate` (macOS, for `awake`/`longrun`)
|
||||
- Optional commands used by aliases: `git`, `tmux`, `tree`, and `vim` or `$EDITOR`
|
||||
|
||||
## Examples
|
||||
|
||||
@ -49,17 +49,8 @@ gd # git diff
|
||||
tl # tmux list-sessions
|
||||
tn work # tmux new-session -s work
|
||||
ta work # tmux attach-session -t work
|
||||
aq <cmd> # agent-queue runner (init|add|run|status|watch|dash|stop|logs)
|
||||
aqs # agent-queue status
|
||||
aqd # agent-queue Node live dashboard
|
||||
awake <cmd> # macOS: run <cmd> while keeping the machine awake (caffeinate -dimsu)
|
||||
longrun phase3 codex --full-auto "<prompt>" # detached+awake+logged overnight run
|
||||
ta phase3 # reattach to the run; tail -f ~/longrun-phase3-*.log to follow output
|
||||
```
|
||||
|
||||
See [`AI.dev/CHEATSHEETS/long-running-jobs.md`](../../learning_ai_common_plat/AI.dev/CHEATSHEETS/long-running-jobs.md)
|
||||
(in `learning_ai_common_plat`) for the full overnight-run guide and best practices.
|
||||
|
||||
## Local Aliases
|
||||
|
||||
Keep machine- or org-specific shortcuts out of the portable default files. Start from `_local.example.alias` if you want private local aliases such as branch-specific git commands.
|
||||
|
||||
@ -1,5 +0,0 @@
|
||||
# agent-queue — folder kanban runner for devin/claude/codex CLIs
|
||||
# Resolved relative to the aliases dir so it works on any machine/clone.
|
||||
alias aq="$BYTELYST_ALIAS_DIR/../agent-queue/agent-queue.sh"
|
||||
alias aqs="$BYTELYST_ALIAS_DIR/../agent-queue/agent-queue.sh status"
|
||||
alias aqd="$BYTELYST_ALIAS_DIR/../agent-queue/agent-queue.sh dash"
|
||||
@ -1,90 +0,0 @@
|
||||
# Long-running / overnight agent runs — keep-awake + detachable tmux + logged output.
|
||||
# Full guide: AI.dev/CHEATSHEETS/long-running-jobs.md (in learning_ai_common_plat).
|
||||
|
||||
# macOS: keep the machine awake while a command runs (prevents sleep stalling the job).
|
||||
# `caffeinate` is normally absent on Linux, so this alias will not work there; use `systemd-inhibit` instead.
|
||||
alias awake='caffeinate -dimsu'
|
||||
|
||||
# longrun <session> <command> [args...]
|
||||
# Runs <command> in a DETACHED tmux session, wrapped in caffeinate (macOS) so the
|
||||
# machine won't sleep, teeing all output to ~/longrun-<session>-<timestamp>.log.
|
||||
# Survives closing the terminal; reattach with `ta <session>`, stop with
|
||||
# `tmux kill-session -t <session>`.
|
||||
# e.g. longrun phase3 codex --dangerously-bypass-approvals-and-sandbox "Read ... and execute it"
|
||||
longrun() {
# Start <command> in a detached tmux session named <session>, prefixed with
# `caffeinate -dimsu` when that binary is available, and tee its combined
# stdout/stderr to "$LONGRUN_LOG_DIR/longrun-<session>-<timestamp>.log"
# (LONGRUN_LOG_DIR defaults to $HOME).
#
# Arguments:
#   $1      tmux session name (must not already exist)
#   $2...   command and its arguments
# Returns:
#   0    session started and still present after launch
#   1    tmux missing, session name taken, log not writable, tmux failed to
#        start, or the session is already gone right after launch
#   2    fewer than two arguments (usage is printed to stderr)
#   127  the command is not found by `command -v`
|
||||
# --- usage ---
|
||||
if [ "$#" -lt 2 ]; then
|
||||
echo "usage: longrun <session> <command> [args...]" >&2
|
||||
echo " e.g. longrun phase3 codex --full-auto \"<the overnight prompt>\"" >&2
|
||||
echo " env: LONGRUN_LOG_DIR overrides the log directory (default: \$HOME)" >&2
|
||||
return 2
|
||||
fi
|
||||
|
||||
# --- required dependency: tmux ---
|
||||
if ! command -v tmux >/dev/null 2>&1; then
|
||||
echo "longrun: 'tmux' is required but not installed." >&2
|
||||
echo " install: macOS 'brew install tmux' | Debian/Ubuntu 'sudo apt-get install -y tmux'" >&2
|
||||
return 1
|
||||
fi
|
||||
|
||||
# After the shift, "$@" holds only the command and its arguments.
local sess="$1"; shift
|
||||
|
||||
# --- session name must be free ---
|
||||
if tmux has-session -t "$sess" 2>/dev/null; then
|
||||
echo "longrun: a tmux session named '$sess' already exists." >&2
|
||||
echo " attach: ta $sess | stop: tmux kill-session -t $sess | or choose another name" >&2
|
||||
return 1
|
||||
fi
|
||||
|
||||
# --- the command must be runnable ---
|
||||
# Checked in the calling shell; the error text below reminds the user that
# PATH must also be set for non-login shells.
if ! command -v "$1" >/dev/null 2>&1; then
|
||||
echo "longrun: command not found on PATH: '$1'" >&2
|
||||
echo " make sure the agent CLI is installed and your PATH is set for non-login shells." >&2
|
||||
return 127
|
||||
fi
|
||||
|
||||
# --- log file must be writable ---
|
||||
local ts log dir
|
||||
dir="${LONGRUN_LOG_DIR:-$HOME}"
|
||||
ts="$(date +%Y%m%d-%H%M%S)"
|
||||
log="$dir/longrun-${sess}-${ts}.log"
|
||||
# Create the directory and truncate/create the log in a subshell purely to
# prove it is writable before anything is launched.
if ! mkdir -p "$dir" 2>/dev/null || ! ( : > "$log" ) 2>/dev/null; then
|
||||
echo "longrun: cannot write log file: $log" >&2
|
||||
echo " set LONGRUN_LOG_DIR to a writable directory and retry." >&2
|
||||
return 1
|
||||
fi
|
||||
|
||||
# --- optional dependency: caffeinate (keep-awake) ---
|
||||
# `keep` is a command prefix (with trailing space) or empty; a missing
# caffeinate only produces a warning/note, never a failure.
local keep=""
|
||||
if command -v caffeinate >/dev/null 2>&1; then
|
||||
keep="caffeinate -dimsu "
|
||||
elif [ "$(uname -s 2>/dev/null)" = "Darwin" ]; then
|
||||
echo "longrun: WARNING — 'caffeinate' not found on macOS; the machine may sleep mid-run." >&2
|
||||
else
|
||||
echo "longrun: note — no 'caffeinate' (non-macOS). To prevent sleep, wrap with 'systemd-inhibit'." >&2
|
||||
fi
|
||||
|
||||
# --- launch (detached), capturing any tmux startup error ---
|
||||
local cmd inner errf
|
||||
# `inner` is handed to tmux as ONE command string containing a pipeline, so
# every word of the command is %q-quoted to survive being parsed again.
# The trailing space emitted by '%q ' separates the last word from "2>&1".
cmd="$(printf '%q ' "$@")"
|
||||
# NOTE(review): $log is only wrapped in double quotes here, not %q-escaped
# like the command words — confirm log paths never contain '"' or '$'.
inner="${keep}${cmd}2>&1 | tee \"$log\""
|
||||
# Temp file for tmux's stderr; falls back to a PID-based /tmp path if
# mktemp is unavailable or fails.
errf="$(mktemp 2>/dev/null || echo "/tmp/longrun-err.$$")"
|
||||
if ! tmux new-session -d -s "$sess" "$inner" 2>"$errf"; then
|
||||
echo "longrun: failed to start tmux session '$sess':" >&2
|
||||
[ -s "$errf" ] && sed 's/^/ /' "$errf" >&2
|
||||
# Nothing ran, so remove both the error capture and the empty log.
rm -f "$errf" "$log"
|
||||
return 1
|
||||
fi
|
||||
rm -f "$errf"
|
||||
|
||||
# --- confirm it is actually running (quick-exit detection) ---
|
||||
# The log is kept in this case so the user can inspect what was printed.
if ! tmux has-session -t "$sess" 2>/dev/null; then
|
||||
echo "longrun: WARNING — session '$sess' is not running; the command may have exited immediately." >&2
|
||||
echo " check the log: $log" >&2
|
||||
return 1
|
||||
fi
|
||||
|
||||
echo "[longrun] session=$sess"
|
||||
echo "[longrun] log=$log"
|
||||
echo "[longrun] attach: ta $sess | tail: tail -f \"$log\" | stop: tmux kill-session -t $sess"
|
||||
}
|
||||
@ -15,5 +15,3 @@ source "$BYTELYST_ALIAS_DIR/_cd.alias"
|
||||
source "$BYTELYST_ALIAS_DIR/_ls.alias"
|
||||
source "$BYTELYST_ALIAS_DIR/_general.alias"
|
||||
source "$BYTELYST_ALIAS_DIR/_shell.alias"
|
||||
source "$BYTELYST_ALIAS_DIR/_agent.alias"
|
||||
source "$BYTELYST_ALIAS_DIR/_longrun.alias"
|
||||
|
||||
83
bytelyst-cli.sh
Executable file → Normal file
83
bytelyst-cli.sh
Executable file → Normal file
@ -16,15 +16,6 @@ YELLOW=$(tput setaf 3)
|
||||
BLUE=$(tput setaf 4)
|
||||
RESET=$(tput sgr0)
|
||||
|
||||
CLI_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
# agent-queue delegates to the standalone tool (no GitHub token / jq / curl needed),
|
||||
# so handle it BEFORE the GITHUB_TOKEN + required-tools gates below.
|
||||
if [[ "${1:-}" == "agent-queue" || "${1:-}" == "aq" ]]; then
|
||||
shift
|
||||
exec "$CLI_DIR/agent-queue/agent-queue.sh" "$@"
|
||||
fi
|
||||
|
||||
REQUIRED_TOOLS=(jq curl)
|
||||
|
||||
# Check for required tools
|
||||
@ -35,49 +26,17 @@ for tool in "${REQUIRED_TOOLS[@]}"; do
|
||||
fi
|
||||
done
|
||||
|
||||
# Load .env if present. `set -a` exports everything sourced; this safely handles
|
||||
# quoted values and spaces, unlike `export $(grep ... | xargs)`.
|
||||
# Load .env if present
|
||||
if [[ -f .env ]]; then
|
||||
set -a
|
||||
# shellcheck disable=SC1091
|
||||
. ./.env
|
||||
set +a
|
||||
export $(grep -v '^#' .env | xargs)
|
||||
fi
|
||||
|
||||
# Validate GITHUB_TOKEN (printf so the newline renders, unlike echo "...\n...")
|
||||
if [[ -z "${GITHUB_TOKEN:-}" ]]; then
|
||||
printf '%s❌ Error: GITHUB_TOKEN is not set.\nSet it in your environment (e.g. export GITHUB_TOKEN=... in ~/.zshrc, ~/.bashrc, or .env).%s\n' "$RED" "$RESET" >&2
|
||||
# Validate GITHUB_TOKEN
|
||||
if [[ -z "$GITHUB_TOKEN" ]]; then
|
||||
echo "${RED}❌ Error: GITHUB_TOKEN is not set.\nSet it in your environment (e.g., export GITHUB_TOKEN=... in ~/.zshrc, ~/.bashrc, or .env).${RESET}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# gh_get_all <url> -> echo one JSON array combining ALL pages (per_page=100).
|
||||
# Verifies HTTP 200 on every page before parsing; returns non-zero on API error.
|
||||
#
# Arguments:
#   $1  base API URL, with or without an existing query string
# Reads:
#   GITHUB_TOKEN (Authorization header), RED / RESET (error colouring)
# Output:
#   stdout  the concatenated JSON array ("[]" when the first page is empty)
#   stderr  an error line plus the API's `.message` field, when present
# Returns:
#   1 as soon as any page answers with a status other than 200; otherwise
#   the status of the final printf.
gh_get_all() {
|
||||
local base="$1" page=1 combined="[]"
|
||||
# Append pagination params with '&' if the URL already has a query string,
# otherwise start one with '?'.
local joiner='&'; [[ "$base" == *'?'* ]] || joiner='?'
|
||||
while :; do
|
||||
local resp http body n
|
||||
# -w appends a newline and the HTTP status code after the response body,
# so the last line of $resp is the status and everything before it is the body.
resp=$(curl -sS -w $'\n%{http_code}' \
|
||||
-H "Authorization: token $GITHUB_TOKEN" \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
"${base}${joiner}per_page=100&page=${page}")
|
||||
# Text after the final newline = status code.
http="${resp##*$'\n'}"
|
||||
# Text before the final newline = body.
body="${resp%$'\n'*}"
|
||||
if [[ "$http" != "200" ]]; then
|
||||
printf '%s❌ GitHub API error (HTTP %s) for %s%s\n' "$RED" "$http" "$base" "$RESET" >&2
|
||||
# Best effort: show the API's message on stderr; jq's own parse errors are
# discarded and a jq failure does not change the return path.
printf '%s' "$body" | jq -r '.message? // empty' >&2 2>/dev/null || true
|
||||
return 1
|
||||
fi
|
||||
# Number of items on this page; falls back to 0 if jq fails, which ends the loop.
n=$(printf '%s' "$body" | jq 'length' 2>/dev/null || echo 0)
|
||||
# An empty page means there is nothing more to fetch.
[[ "$n" -eq 0 ]] && break
|
||||
# Slurp the accumulated array and this page, then concatenate them.
combined=$(jq -s 'add' <(printf '%s' "$combined") <(printf '%s' "$body"))
|
||||
# A short page (< per_page) is the last page.
[[ "$n" -lt 100 ]] && break
|
||||
page=$((page+1))
|
||||
# Hard stop after 100 pages so the loop always terminates.
[[ "$page" -gt 100 ]] && break
|
||||
done
|
||||
printf '%s' "$combined"
|
||||
}
|
||||
|
||||
usage() {
|
||||
echo "${BLUE}Bytelyst CLI - Unified GitHub DevOps Tool${RESET}"
|
||||
echo ""
|
||||
@ -88,7 +47,6 @@ usage() {
|
||||
echo " check-collaborators --input <input.json>"
|
||||
echo " export --type <repos|users> --output <file.json>"
|
||||
echo " remove-user-from-all-repos --user <username> [--input <file.json>]"
|
||||
echo " agent-queue (aq) <init|add|run|status|watch|dash|stop|logs|requeue|clean> — agent prompt queue runner"
|
||||
echo " help Show this help message"
|
||||
echo ""
|
||||
echo "If no command is given, an interactive menu will be shown."
|
||||
@ -107,9 +65,8 @@ list_public_repos() {
|
||||
echo "${RED}❌ Please provide --user <username>.${RESET}"; exit 1
|
||||
fi
|
||||
echo "${BLUE}🔍 Fetching all public repositories for user: $user...${RESET}"
|
||||
local json repos
|
||||
json=$(gh_get_all "https://api.github.com/users/$user/repos?type=public") || exit 1
|
||||
repos=$(printf '%s' "$json" | jq -r '.[].full_name')
|
||||
local response=$(curl -s -H "Authorization: token $GITHUB_TOKEN" "https://api.github.com/users/$user/repos?per_page=100&type=public")
|
||||
local repos=$(echo "$response" | jq -r '.[].full_name')
|
||||
if [[ -z "$repos" ]]; then
|
||||
echo "${YELLOW}🚫 No public repositories found for user.${RESET}"
|
||||
else
|
||||
@ -130,9 +87,8 @@ list_private_repos() {
|
||||
echo "${RED}❌ Please provide --org <orgname>.${RESET}"; exit 1
|
||||
fi
|
||||
echo "${BLUE}🔍 Fetching all private repositories for org: $org...${RESET}"
|
||||
local json repos
|
||||
json=$(gh_get_all "https://api.github.com/orgs/$org/repos?type=private") || exit 1
|
||||
repos=$(printf '%s' "$json" | jq -r '.[].full_name')
|
||||
local response=$(curl -s -H "Authorization: token $GITHUB_TOKEN" "https://api.github.com/orgs/$org/repos?per_page=100&type=private")
|
||||
local repos=$(echo "$response" | jq -r '.[].full_name')
|
||||
if [[ -z "$repos" ]]; then
|
||||
echo "${YELLOW}🚫 No private repositories found for org.${RESET}"
|
||||
else
|
||||
@ -161,19 +117,12 @@ check_collaborators() {
|
||||
fi
|
||||
for repo in "${repos[@]}"; do
|
||||
echo "${BLUE}🔍 Checking repo: $repo${RESET}"
|
||||
local cjson collaborators
|
||||
cjson=$(gh_get_all "https://api.github.com/repos/$org/$repo/collaborators") \
|
||||
|| { echo "${YELLOW}⚠️ Skipping $repo (API error).${RESET}"; continue; }
|
||||
collaborators=$(printf '%s' "$cjson" | jq -r '.[].login')
|
||||
local collaborators=$(curl -s -H "Authorization: token $GITHUB_TOKEN" "https://api.github.com/repos/$org/$repo/collaborators" | jq -r '.[].login')
|
||||
local non_whitelisted=()
|
||||
for collab in $collaborators; do
|
||||
# explicit membership test (avoids the array-concatenation pitfall of
|
||||
# [[ " ${whitelist[@]} " =~ " $collab " ]], which false-matches substrings)
|
||||
local is_white=false w
|
||||
for w in "${whitelist[@]}"; do
|
||||
[[ "$w" == "$collab" ]] && { is_white=true; break; }
|
||||
done
|
||||
$is_white || non_whitelisted+=("$collab")
|
||||
if [[ ! " ${whitelist[@]} " =~ " ${collab} " ]]; then
|
||||
non_whitelisted+=("$collab")
|
||||
fi
|
||||
done
|
||||
if [[ ${#non_whitelisted[@]} -gt 0 ]]; then
|
||||
echo "${YELLOW}🚨 Repository: $repo${RESET}"
|
||||
@ -260,15 +209,14 @@ remove_user_from_all_repos() {
|
||||
|
||||
interactive_menu() {
|
||||
echo "${BLUE}Bytelyst CLI Interactive Menu${RESET}"
|
||||
select opt in "List Public Repos" "List Private Repos" "Check Collaborators" "Export JSON" "Remove User from All Repos" "Agent Queue Status" "Exit"; do
|
||||
select opt in "List Public Repos" "List Private Repos" "Check Collaborators" "Export JSON" "Remove User from All Repos" "Exit"; do
|
||||
case $REPLY in
|
||||
1) read -p "Enter GitHub username: " user; list_public_repos --user "$user";;
|
||||
2) read -p "Enter GitHub org: " org; list_private_repos --org "$org";;
|
||||
3) read -p "Enter path to input.json: " input; check_collaborators --input "$input";;
|
||||
4) read -p "Export type (repos/users): " type; read -p "Output file: " output; export_json --type "$type" --output "$output";;
|
||||
5) read -p "Enter GitHub username: " user; remove_user_from_all_repos --user "$user";;
|
||||
6) "$CLI_DIR/agent-queue/agent-queue.sh" status;;
|
||||
7) exit 0;;
|
||||
6) exit 0;;
|
||||
*) echo "Invalid option.";;
|
||||
esac
|
||||
done
|
||||
@ -286,7 +234,6 @@ case $1 in
|
||||
check-collaborators) shift; check_collaborators "$@";;
|
||||
export) shift; export_json "$@";;
|
||||
remove-user-from-all-repos) shift; remove_user_from_all_repos "$@";;
|
||||
agent-queue|aq) shift; exec "$CLI_DIR/agent-queue/agent-queue.sh" "$@";;
|
||||
help|--help|-h) usage;;
|
||||
*) echo "${RED}Unknown command: $1${RESET}"; usage; exit 1;;
|
||||
esac
|
||||
@ -1,49 +0,0 @@
|
||||
# CLI install report (WSL) — final
|
||||
|
||||
Generated: 2026-05-29T21:20:00-07:00
|
||||
|
||||
System: WSL Ubuntu (user v-sadhandapa)
|
||||
|
||||
Installed CLIs (verified):
|
||||
|
||||
- Claude Code
|
||||
- Path: /home/v-sadhandapa/.local/bin/claude
|
||||
- Version: 2.1.158
|
||||
- Install method: official installer (https://claude.ai/install.sh)
|
||||
- Auth: claude auth login (or claude auth setup-token; ANTHROPIC_API_KEY env)
|
||||
|
||||
- OpenAI Codex
|
||||
- Path: /home/v-sadhandapa/.npm-global/bin/codex
|
||||
- Version: codex-cli 0.135.0
|
||||
- Install method: npm i -g @openai/codex (fallback installer used if necessary)
|
||||
- Auth: codex login (or: printenv OPENAI_API_KEY | codex login --with-api-key)
|
||||
|
||||
- Devin
|
||||
- Path: /home/v-sadhandapa/.local/bin/devin
|
||||
- Version: devin 2026.5.26-2
|
||||
- Install method: official installer (https://cli.devin.ai/install.sh)
|
||||
- Auth: devin auth login
|
||||
|
||||
- Antigravity (agy)
|
||||
- Path: /home/v-sadhandapa/.local/bin/agy
|
||||
- Version: 1.0.3
|
||||
- Install method: official installer (https://antigravity.google/cli/install.sh)
|
||||
- Auth: agy login
|
||||
|
||||
- GitHub Copilot CLI
|
||||
- Path: /snap/bin/copilot
|
||||
- Version: 1.0.56
|
||||
- Install method: sudo snap install copilot-cli
|
||||
- Auth: copilot auth login
|
||||
|
||||
PATH changes made:
|
||||
- ~/.npm-global/bin and ~/.local/bin were added to ~/.profile and ~/.bashrc (persisted)
|
||||
|
||||
Symlinks:
|
||||
- A helper script was added to the repo: ./make_symlinks_wsl.sh — run it to create /usr/local/bin symlinks (requires sudo).
|
||||
|
||||
Notes:
|
||||
- No API keys or credentials were added to any shell profiles.
|
||||
- For interactive logins, run the auth commands listed above; they may prompt or open device-flow URLs.
|
||||
- Logs: ~/cli-install-wsl.log
|
||||
|
||||
@ -11,86 +11,74 @@ on:
|
||||
- 'pnpm-lock.yaml'
|
||||
- 'pnpm-workspace.yaml'
|
||||
- '.pnpmfile.cjs'
|
||||
- '.gitea/workflows/ci.yml'
|
||||
pull_request:
|
||||
paths:
|
||||
- 'backend/**'
|
||||
- 'web/**'
|
||||
- 'shared/**'
|
||||
- 'package.json'
|
||||
- 'pnpm-lock.yaml'
|
||||
- 'pnpm-workspace.yaml'
|
||||
- '.pnpmfile.cjs'
|
||||
- '.gitea/workflows/ci.yml'
|
||||
|
||||
concurrency:
|
||||
group: ci-devops-dashboard-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
# Self-contained CI: resolve @bytelyst/* deps from the local Gitea registry
|
||||
# rather than a sibling learning_ai_common_plat checkout on the runner.
|
||||
BYTELYST_PACKAGE_SOURCE: gitea
|
||||
|
||||
jobs:
|
||||
build-and-test:
|
||||
name: Build, Test & Typecheck
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 15
|
||||
steps:
|
||||
# Check out into the runner workspace (${{ gitea.workspace }}) instead of
|
||||
# cd-ing into a hard-coded host path and `git reset --hard` on the live
|
||||
# checkout. CI must never mutate an operator's working tree.
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Node
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 22
|
||||
|
||||
- name: Enable pnpm
|
||||
- name: Pull latest
|
||||
run: |
|
||||
corepack enable
|
||||
corepack prepare pnpm@10.6.5 --activate
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
git fetch origin main
|
||||
git checkout main
|
||||
git reset --hard origin/main
|
||||
|
||||
- name: Secret scan
|
||||
run: pnpm secret-scan
|
||||
run: |
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
pnpm secret-scan
|
||||
|
||||
- name: Install dependencies
|
||||
run: pnpm install:gitea
|
||||
run: |
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
pnpm install:common-plat
|
||||
|
||||
- name: Build backend
|
||||
run: |
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
pnpm --filter @bytelyst/devops-backend build
|
||||
|
||||
- name: Build web
|
||||
run: |
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
pnpm --filter @bytelyst/devops-web build
|
||||
|
||||
- name: Typecheck backend
|
||||
run: |
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
pnpm --filter @bytelyst/devops-backend typecheck
|
||||
|
||||
- name: Typecheck web
|
||||
run: |
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
pnpm --filter @bytelyst/devops-web typecheck
|
||||
|
||||
- name: Test backend
|
||||
run: |
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
pnpm --filter @bytelyst/devops-backend test:run
|
||||
|
||||
- name: Test web
|
||||
run: |
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
pnpm --filter @bytelyst/devops-web test:run
|
||||
|
||||
- name: Lint
|
||||
run: pnpm lint
|
||||
|
||||
- name: Typecheck
|
||||
run: pnpm typecheck
|
||||
|
||||
- name: Build
|
||||
run: pnpm build
|
||||
|
||||
- name: Unit tests
|
||||
run: pnpm test:run
|
||||
|
||||
# Coverage gate for the backend's tested modules (auth, csrf, health,
|
||||
# hermes-ops, deployments/orchestrator, services). Thresholds live in
|
||||
# `backend/vitest.config.ts`. Add files there as they gain real tests
|
||||
# — ratchet up, never relax.
|
||||
- name: Coverage gate (backend)
|
||||
run: pnpm --filter @bytelyst/devops-backend test:coverage
|
||||
|
||||
# Playwright browsers are pulled per-CI-run. The web suite (`pnpm
|
||||
# test:e2e`) starts its own Next dev server via Playwright's
|
||||
# `webServer` config; the backend is intentionally NOT started — the
|
||||
# hermes spec intercepts `/api/hermes/ops` (which would otherwise
|
||||
# need to shell out to systemctl/git/ps on a live VM) and the
|
||||
# dashboard spec mocks every other backend route via `page.route`.
|
||||
# See `docs/prompts/ci-e2e-hardening.md` for the design.
|
||||
- name: Install Playwright browsers
|
||||
run: pnpm --filter @bytelyst/devops-web exec playwright install --with-deps chromium
|
||||
run: |
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
pnpm --filter @bytelyst/devops-backend lint
|
||||
pnpm --filter @bytelyst/devops-web lint
|
||||
|
||||
- name: E2E tests
|
||||
run: pnpm --filter @bytelyst/devops-web test:e2e
|
||||
run: |
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
pnpm --filter @bytelyst/devops-web test:e2e
|
||||
|
||||
docker-build:
|
||||
name: Build Docker Images
|
||||
@ -98,17 +86,26 @@ jobs:
|
||||
needs: [build-and-test]
|
||||
timeout-minutes: 20
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Pull latest
|
||||
run: |
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
git fetch origin main
|
||||
git checkout main
|
||||
git reset --hard origin/main
|
||||
|
||||
- name: Build backend Docker image
|
||||
run: docker build -f backend/Dockerfile -t devops-backend:latest .
|
||||
run: |
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
docker build -f backend/Dockerfile -t devops-backend:latest .
|
||||
|
||||
- name: Build web Docker image
|
||||
run: docker build -f web/Dockerfile -t devops-web:latest .
|
||||
run: |
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
docker build -f web/Dockerfile -t devops-web:latest .
|
||||
|
||||
- name: Test Docker Compose
|
||||
run: |
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
docker compose up -d
|
||||
sleep 10
|
||||
docker compose down
|
||||
|
||||
@ -1 +0,0 @@
|
||||
@bytelyst:registry=http://localhost:3300/api/packages/learning_ai_user/npm/
|
||||
@ -1,533 +1,111 @@
|
||||
# DevOps & Admin Dashboard Deployment Guide
|
||||
# DevOps Dashboard Deployment Guide
|
||||
|
||||
> Canonical deployment doc for `dashboard/`. The previous `DEPLOYMENT_GUIDE.md`
|
||||
> has been folded into this file; it remains as a one-line redirect for
|
||||
> backwards compatibility with `deploy.sh` and external links.
|
||||
## Current Status
|
||||
|
||||
## Overview
|
||||
The DevOps dashboard has been significantly enhanced with production-ready features, but deployment requires resolving workspace dependencies.
|
||||
|
||||
This guide covers deploying both the DevOps Dashboard and Platform Admin Dashboard using the existing Traefik gateway infrastructure, following the same pattern as the trading dashboard (https://invttrdg.bytelyst.com).
|
||||
## Dependency Issues
|
||||
|
||||
## Public URLs
|
||||
The dashboard currently depends on workspace packages from `learning_ai_common_plat`:
|
||||
- `@bytelyst/config` - Configuration management
|
||||
- `@bytelyst/auth` - Authentication utilities
|
||||
- `@bytelyst/cosmos` - Cosmos DB client
|
||||
- `@bytelyst/errors` - Error handling
|
||||
- `@bytelyst/react-auth` - React auth context
|
||||
- `@bytelyst/telemetry-client` - Telemetry
|
||||
|
||||
For the full living bookmark list across all ByteLyst apps, APIs, Hermes
|
||||
dashboards, and last deploy timestamps, see
|
||||
[`../docs/app-url-bookmarks.md`](../docs/app-url-bookmarks.md).
|
||||
## Deployment Options
|
||||
|
||||
- **DevOps Dashboard**: `https://devops.bytelyst.com`
|
||||
- **Admin Dashboard**: `https://admin.bytelyst.com`
|
||||
- **API Gateway**: `https://api.bytelyst.com`
|
||||
- Platform API: `https://api.bytelyst.com/platform/api`
|
||||
- DevOps API: `https://api.bytelyst.com/api/devops`
|
||||
### Option 1: Deploy with Common Platform (Recommended)
|
||||
|
||||
## Ports — quick reference
|
||||
|
||||
The web container always listens on **3000** internally; what changes is what
|
||||
the host exposes. Use the row for the deployment mode you're in:
|
||||
|
||||
| Mode | Web (host) | Backend (host) | Notes |
|
||||
|-------------------------------------|--------------------|-------------------|--------------------------------------------------------------------|
|
||||
| Local dev (`pnpm dev`) | `localhost:3000` | `localhost:4004` | Next listens directly on 3000. |
|
||||
| Docker Compose (this repo) | `localhost:3049` | `localhost:4004` | `docker-compose.yml` maps `127.0.0.1:3049:3000` (loopback only). |
|
||||
| Production (Traefik) | `https://devops.bytelyst.com` | `https://api.bytelyst.com/api/devops` | Traefik label `loadbalancer.server.port=3000` targets the container port. |
|
||||
|
||||
Whenever a doc says "the dashboard runs on port 3000", it means the **container
|
||||
port** seen by Traefik / Next dev mode — not the host port for the deployed
|
||||
stack. Use the table above instead of relying on prose.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
Internet → Traefik Gateway → Services
|
||||
├─ DevOps Web (container :3000, host :3049)
|
||||
├─ DevOps Backend (:4004)
|
||||
├─ Admin Web (:3001)
|
||||
├─ Platform Service (:4003)
|
||||
└─ Trading Dashboard (:3085)
|
||||
```
|
||||
|
||||
- **Traefik**: API gateway and reverse proxy.
|
||||
- **Docker network**: All services connect via `learning_ai_common_plat_default`.
|
||||
- **Domain routing**: Traefik routes by host header.
|
||||
- **SSL/TLS**: Managed by Traefik with Let's Encrypt.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. Platform stack running with Traefik gateway.
|
||||
2. Docker and Docker Compose installed.
|
||||
3. Domain names configured with DNS pointing to your server.
|
||||
4. Azure Cosmos DB account (shared with platform-service).
|
||||
5. Platform Service running and accessible.
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Start the platform stack (if not running)
|
||||
**Prerequisites:**
|
||||
1. Ensure `learning_ai_common_plat` packages are built and available
|
||||
2. Configure npm registry to point to local package registry
|
||||
3. Use the provided install scripts
|
||||
|
||||
**Steps:**
|
||||
```bash
|
||||
cd /opt/bytelyst/learning_ai_common_plat
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
|
||||
# Install dependencies with common platform
|
||||
pnpm install:common-plat
|
||||
|
||||
# Build both backend and web
|
||||
pnpm build
|
||||
|
||||
# Deploy with Docker Compose
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
### 2. Deploy the dashboards
|
||||
### Option 2: Deploy Standalone (Simplified)
|
||||
|
||||
```bash
|
||||
cd /opt/bytelyst/learning_ai_devops_tools/dashboard
|
||||
./deploy.sh
|
||||
```
|
||||
**Prerequisites:**
|
||||
1. Remove workspace dependencies
|
||||
2. Implement simplified auth/config/cosmos layers
|
||||
3. Set up environment variables
|
||||
|
||||
This will:
|
||||
- Deploy the DevOps Dashboard (backend + web)
|
||||
- Deploy the Admin Dashboard via the platform stack
|
||||
- Run health checks
|
||||
- Print deployment information
|
||||
|
||||
## Local development
|
||||
|
||||
If you only need a non-containerized iteration loop (no Traefik, no Docker):
|
||||
|
||||
```bash
|
||||
cd /opt/bytelyst/learning_ai_devops_tools/dashboard
|
||||
|
||||
# Resolve workspace deps
|
||||
pnpm install:common-plat # uses sibling learning_ai_common_plat checkout
|
||||
# or
|
||||
pnpm install:gitea # uses local Gitea registry at localhost:3300
|
||||
|
||||
pnpm dev # backend on 4004, web on 3000 (NOT 3049)
|
||||
```
|
||||
|
||||
Required env vars are documented under **Environment Configuration** below; for
|
||||
local dev a minimal `.env` with `JWT_SECRET`, `COSMOS_*`, and
|
||||
`PLATFORM_SERVICE_URL` is enough.
|
||||
|
||||
## Manual Docker deployment
|
||||
|
||||
### Deploy DevOps Dashboard
|
||||
|
||||
```bash
|
||||
cd /opt/bytelyst/learning_ai_devops_tools/dashboard
|
||||
docker-compose up -d --build
|
||||
```
|
||||
|
||||
### Deploy Admin Dashboard
|
||||
|
||||
```bash
|
||||
cd /opt/bytelyst/learning_ai_common_plat
|
||||
docker-compose up -d admin-web
|
||||
```
|
||||
|
||||
## Environment Configuration
|
||||
|
||||
### DevOps Dashboard (`.env`)
|
||||
|
||||
```bash
|
||||
# Backend
|
||||
**Environment Variables Required:**
|
||||
```env
|
||||
PORT=4004
|
||||
PLATFORM_SERVICE_URL=http://platform-service:4003
|
||||
PLATFORM_SERVICE_URL=http://localhost:4003
|
||||
COSMOS_ENDPOINT=https://your-cosmos-account.documents.azure.com:443/
|
||||
COSMOS_KEY=your-cosmos-primary-key
|
||||
COSMOS_DATABASE=bytelyst-platform
|
||||
JWT_SECRET=your-production-jwt-secret
|
||||
CSRF_SECRET=your-production-csrf-secret
|
||||
ENCRYPTION_KEY=your-production-encryption-key
|
||||
PRODUCT_ID=bytelyst-devops
|
||||
PRODUCT_NAME=ByteLyst DevOps Dashboard
|
||||
|
||||
# Azure Key Vault (optional)
|
||||
AZURE_TENANT_ID=your-tenant-id
|
||||
AZURE_CLIENT_ID=your-client-id
|
||||
AZURE_CLIENT_SECRET=your-client-secret
|
||||
AZURE_KEY_VAULT_URL=https://your-keyvault.vault.azure.net/
|
||||
|
||||
# Frontend
|
||||
NEXT_PUBLIC_DEVOPS_API_URL=https://api.bytelyst.com/api/devops
|
||||
NEXT_PUBLIC_PLATFORM_URL=https://api.bytelyst.com/platform/api
|
||||
NEXT_PUBLIC_ADMIN_WEB_URL=https://admin.bytelyst.com
|
||||
NEXT_PUBLIC_PRODUCT_ID=bytelyst-devops
|
||||
NEXT_PUBLIC_PRODUCT_NAME=ByteLyst DevOps Dashboard
|
||||
JWT_SECRET=your-jwt-signing-secret
|
||||
CSRF_SECRET=your-csrf-secret-change-in-production
|
||||
```
|
||||
|
||||
### Platform Dashboard (`.env`)
|
||||
|
||||
Add to your platform `.env`:
|
||||
|
||||
**Steps:**
|
||||
```bash
|
||||
# Admin Web Dashboard
|
||||
NEXT_PUBLIC_PLATFORM_URL=https://api.bytelyst.com/platform/api
|
||||
NEXT_PUBLIC_DEVOPS_WEB_URL=https://devops.bytelyst.com
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard/backend
|
||||
npm install
|
||||
npm run build
|
||||
npm start
|
||||
|
||||
# In another terminal:
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard/web
|
||||
npm install
|
||||
npm run build
|
||||
npm start
|
||||
```
|
||||
|
||||
## Traefik Configuration
|
||||
### Option 3: Deploy to Production Server
|
||||
|
||||
Both dashboards use Traefik labels for routing.
|
||||
**Prerequisites:**
|
||||
1. Production server with Node.js 22+
|
||||
2. Azure Cosmos DB account
|
||||
3. Platform service instance
|
||||
4. Docker installed
|
||||
|
||||
### DevOps Web
|
||||
|
||||
```yaml
|
||||
labels:
|
||||
- 'traefik.enable=true'
|
||||
- 'traefik.http.routers.devops-web.rule=Host(`devops.bytelyst.com`)'
|
||||
- 'traefik.http.services.devops-web.loadbalancer.server.port=3000' # container port
|
||||
```
|
||||
|
||||
### DevOps Backend API
|
||||
|
||||
```yaml
|
||||
labels:
|
||||
- 'traefik.enable=true'
|
||||
- 'traefik.http.routers.devops-api.rule=PathPrefix(`/api/devops`)'
|
||||
- 'traefik.http.services.devops-api.loadbalancer.server.port=4004'
|
||||
```
|
||||
|
||||
### Admin Web
|
||||
|
||||
```yaml
|
||||
labels:
|
||||
- 'traefik.enable=true'
|
||||
- 'traefik.http.routers.admin-web.rule=Host(`admin.bytelyst.com`)'
|
||||
- 'traefik.http.services.admin-web.loadbalancer.server.port=3001'
|
||||
```
|
||||
|
||||
## DNS Configuration
|
||||
|
||||
Add DNS records pointing to your Traefik gateway server:
|
||||
|
||||
```
|
||||
devops.bytelyst.com A <your-server-ip>
|
||||
admin.bytelyst.com A <your-server-ip>
|
||||
api.bytelyst.com A <your-server-ip>
|
||||
```
|
||||
|
||||
## SSL/TLS Configuration
|
||||
|
||||
Traefik can automatically handle SSL certificates with Let's Encrypt:
|
||||
|
||||
```yaml
|
||||
command:
|
||||
- '--certificatesresolvers.myresolver.acme.tlschallenge=true'
|
||||
- '--certificatesresolvers.myresolver.acme.email=admin@bytelyst.com'
|
||||
- '--certificatesresolvers.myresolver.acme.storage=/letsencrypt/acme.json'
|
||||
```
|
||||
|
||||
Then update router labels:
|
||||
|
||||
```yaml
|
||||
labels:
|
||||
- 'traefik.http.routers.devops-web.tls=true'
|
||||
- 'traefik.http.routers.devops-web.tls.certresolver=myresolver'
|
||||
```
|
||||
|
||||
## Cross-Navigation
|
||||
|
||||
### DevOps Dashboard → Admin Dashboard
|
||||
- Header includes a "Platform Admin" link with Shield icon.
|
||||
- Opens admin dashboard in a new tab.
|
||||
- Uses `NEXT_PUBLIC_ADMIN_WEB_URL`.
|
||||
|
||||
### Admin Dashboard → DevOps Dashboard
|
||||
- Sidebar includes a "DevOps Dashboard" link with Server icon.
|
||||
- Opens devops dashboard in a new tab.
|
||||
- Uses `NEXT_PUBLIC_DEVOPS_WEB_URL`.
|
||||
|
||||
## Shared Authentication
|
||||
|
||||
1. **Platform Service Auth**: Both authenticate against platform-service.
|
||||
2. **JWT Tokens**: Same `JWT_SECRET` validates tokens across services.
|
||||
3. **Per-Product Access**: Admin access is checked per-product via membership roles.
|
||||
4. **Single Sign-On**: Users stay logged in across both dashboards.
|
||||
|
||||
### Granting Access
|
||||
|
||||
To grant a user access to both dashboards:
|
||||
|
||||
1. Ensure user exists in platform-service.
|
||||
2. Add admin membership for both products:
|
||||
|
||||
```json
|
||||
{
|
||||
"memberships": [
|
||||
{ "productId": "bytelyst-devops", "role": "admin", "plan": "pro" },
|
||||
{ "productId": "bytelyst-platform", "role": "admin", "plan": "pro" }
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Health Checks
|
||||
|
||||
- DevOps Backend: `http://localhost:4004/health`
|
||||
- DevOps Web: `http://localhost:3049` (Docker Compose host port; container :3000)
|
||||
- Admin Web: `http://localhost:3001`
|
||||
- Traefik Dashboard: `http://localhost:8080`
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Network issues
|
||||
**Steps:**
|
||||
```bash
|
||||
# Check if the platform network exists
|
||||
docker network inspect learning_ai_common_plat_default
|
||||
# Build Docker images
|
||||
docker-compose build
|
||||
|
||||
# Check container connectivity
|
||||
docker network inspect learning_ai_common_plat_default | grep devops
|
||||
# Tag and push to registry
|
||||
docker tag devops-backend:latest your-registry/devops-backend:latest
|
||||
docker tag devops-web:latest your-registry/devops-web:latest
|
||||
docker push your-registry/devops-backend:latest
|
||||
docker push your-registry/devops-web:latest
|
||||
|
||||
# On production server:
|
||||
docker pull your-registry/devops-backend:latest
|
||||
docker pull your-registry/devops-web:latest
|
||||
docker-compose -f docker-compose.prod.yml up -d
|
||||
```
|
||||
|
||||
### Traefik routing
|
||||
```bash
|
||||
# Traefik dashboard
|
||||
http://localhost:8080
|
||||
|
||||
# Traefik logs
|
||||
docker logs $(docker ps -q -f name=gateway)
|
||||
|
||||
# Router config for the devops web container
|
||||
docker inspect devops-web | grep -A 10 Labels
|
||||
```
|
||||
|
||||
### Authentication failures
|
||||
- Verify `JWT_SECRET` matches across all services.
|
||||
- Check platform-service is accessible: `curl http://localhost:4003/health`.
|
||||
- Ensure the user has the right product memberships.
|
||||
|
||||
### Service not starting
|
||||
```bash
|
||||
docker logs devops-backend
|
||||
docker logs devops-web
|
||||
docker logs admin-web
|
||||
docker ps
|
||||
docker inspect devops-backend | grep -A 5 Health
|
||||
```
|
||||
|
||||
### Workspace dependency errors
|
||||
```bash
|
||||
pnpm install:common-plat # local sibling checkout
|
||||
pnpm install:gitea # local Gitea registry
|
||||
```
|
||||
|
||||
## Service Management
|
||||
|
||||
### Stop services
|
||||
```bash
|
||||
cd /opt/bytelyst/learning_ai_devops_tools/dashboard
|
||||
docker-compose down
|
||||
|
||||
cd /opt/bytelyst/learning_ai_common_plat
|
||||
docker-compose stop admin-web
|
||||
```
|
||||
|
||||
### Restart services
|
||||
```bash
|
||||
cd /opt/bytelyst/learning_ai_devops_tools/dashboard
|
||||
docker-compose restart
|
||||
|
||||
cd /opt/bytelyst/learning_ai_common_plat
|
||||
docker-compose restart admin-web
|
||||
```
|
||||
|
||||
### View logs
|
||||
```bash
|
||||
# DevOps
|
||||
docker logs -f devops-backend
|
||||
docker logs -f devops-web
|
||||
|
||||
# Admin
|
||||
docker logs -f admin-web
|
||||
|
||||
# Traefik
|
||||
docker logs -f gateway
|
||||
```
|
||||
|
||||
## Comparison with Trading Dashboard
|
||||
|
||||
| Feature | Trading | DevOps | Admin |
|
||||
|--------------|----------------------|-------------------------|------------------------|
|
||||
| Domain | invttrdg.bytelyst.com| devops.bytelyst.com | admin.bytelyst.com |
|
||||
| Web Port | 3085 (host) | 3049 (host) / 3000 (ctr)| 3001 (host) |
|
||||
| Backend Port | 4018 | 4004 | N/A |
|
||||
| Network | platform_net | platform_net | default |
|
||||
| Traefik | Yes | Yes | Yes |
|
||||
| Auth | Platform | Platform | Platform |
|
||||
|
||||
## Privilege Surface (Docker socket + host mounts)
|
||||
|
||||
The `devops-backend` container has root-equivalent access to the host. This
|
||||
section documents exactly what is mounted, which routes use each mount, and
|
||||
what the blast radius looks like if an admin token leaks. It exists so reviewers
|
||||
don't have to reverse-engineer this from `docker-compose.yml` and the route
|
||||
handlers — and so any future change to the mount set is reviewed against this
|
||||
list rather than slipped in.
|
||||
|
||||
### Mounts (from `docker-compose.yml`)
|
||||
|
||||
| Host path | Container path | Mode | Purpose |
|
||||
|------------------------------------|-----------------------------------|------|-------------------------------------------------------------------------|
|
||||
| `/var/run/docker.sock` | `/var/run/docker.sock` | rw | Allows `docker` CLI inside the container to control the host daemon. Used by the `system` and `vm` modules. **Equivalent to root on the host.** |
|
||||
| `/opt/bytelyst/learning_ai_devops_tools/scripts` | `/vm-scripts` | ro | Bash scripts the `vm` module shells out to (`HostingerVM/*.sh`). Read-only mount; the container cannot modify the script set. |
|
||||
| `/var/log/vm-cleanup.log` | `/host-logs/vm-cleanup.log` | rw | The `vm` cleanup script appends here; backend reads it via `/api/vm/cleanup-log`. |
|
||||
| `/var/log/vm-health-check.log` | `/host-logs/vm-health-check.log` | rw | Health-check probe output; backend reads it via `/api/vm/health`. |
|
||||
| `/var/log/docker-watchdog.log` | `/host-logs/docker-watchdog.log` | rw | Watchdog tail used by the VM panel. |
|
||||
| `extra_hosts: host-gateway` | `host.docker.internal`-equivalent | — | Lets the container reach `host:11434` (Ollama) and other host-only services. Not a filesystem mount, but a privilege-relevant capability — the container can talk to any host service listening on the Docker bridge gateway address or on all interfaces (`0.0.0.0`); on Linux, services bound strictly to `127.0.0.1` are not reachable this way. |
|
||||
|
||||
The container's listening port (`4004`) is bound to `127.0.0.1` only, so the
|
||||
API is **not** exposed to the public internet by this compose file — access is
|
||||
expected via Tailscale or an SSH tunnel. Any reverse proxy in front of it
|
||||
(Traefik in production) is responsible for its own auth + TLS.
|
||||
|
||||
### What shells out + which routes (auth column = effective gate)
|
||||
|
||||
| Route | Handler module | What it executes | Auth |
|
||||
|--------------------------------------------------|-------------------------------|-----------------------------------------------------------------------------------|-------------|
|
||||
| `GET /system/metrics` | `system/repository.ts` | `df -h ...` | `requireAdmin` |
|
||||
| `GET /docker/stats` | `system/repository.ts` | `docker images / ps / volume ls / system df` (read-only) | `requireAdmin` |
|
||||
| `POST /docker/cleanup` | `system/repository.ts` | `docker container prune -f`, `docker image prune -a -f`, `docker volume prune -f`, `docker builder prune -f` (a fixed allow-list — request body picks one of the four "types") | `requireAdmin` |
|
||||
| `GET /vm/health` | `vm/repository.ts` | `bash $VM_SCRIPTS_PATH/vm-health-check.sh --json` | `requireAdmin` |
|
||||
| `GET /vm/cleanup-log` | `vm/repository.ts` | reads `/host-logs/vm-cleanup.log` | `requireAdmin` |
|
||||
| `GET /vm/cron-status` | `vm/repository.ts` | `crontab -l` | `requireAdmin` |
|
||||
| `POST /vm/cleanup` | `vm/repository.ts` | `bash $VM_SCRIPTS_PATH/vm-cleanup.sh` | `requireAdmin` |
|
||||
| `GET /vm/containers`, `.../unhealthy`, `.../:name/logs` | `vm/repository.ts` | `docker ps`, `docker inspect`, `docker stats`, `docker logs` | `requireAdmin` |
|
||||
| `POST /vm/containers/:name/restart` | `vm/repository.ts` | `docker restart "<name>"` (name is a path param — see "Known sharp edges" below) | `requireAdmin` |
|
||||
| `GET /vm/ollama/models`, `DELETE /vm/ollama/models/:name` | `vm/repository.ts` | HTTP-only (talks to host Ollama via `host-gateway`). No shell-out. | `requireAdmin` |
|
||||
| `POST /code-quality/check` | `code-quality/repository.ts` | `npm run typecheck`, `npm run lint`, `npm run build`, `npm run test:run` in the request-supplied `projectPath`. | `requireAdmin` *(added concurrently with this doc; previously unauthenticated — see the Phase 5 P1 commit)* |
|
||||
| `POST /deployments/trigger/:serviceId` | `deployments/orchestrator.ts` | `bash <service.scriptPath>` from the registered service registry (paths are stored at create-time, not request-time). | `requireAdmin` |
|
||||
| `/hermes/ops` (snapshot) | `hermes-ops/repository.ts` | Read-only probes: `systemctl is-active/is-enabled`, `git status`, `du -sh`, `ps`, `tailscale ip`, `runuser -u uma -- systemctl --user ...`. No state-changing commands. | `requireAdmin` *(Phase 7 — private-only)* |
|
||||
| `/hermes/telemetry/:instance` | `hermes-telemetry/repository.ts` | Read-only: `runuser -u <user> -- hermes sessions/cron/memory/skills list --json`, `git -C <backup-repo> log`, tail of the watchdog log. No state-changing commands. | `requireAdmin` |
|
||||
|
||||
### Blast radius if an admin token is leaked
|
||||
|
||||
Anyone holding a valid admin JWT for this product can, today:
|
||||
|
||||
- Run any of the four pre-defined `docker prune` commands (data loss for
|
||||
containers/images/volumes), restart any container, read any container's logs.
|
||||
- Trigger the host VM cleanup script and list the crontab.
|
||||
- Trigger any deployment script registered in the service registry.
|
||||
- Run `npm run` lifecycle scripts in any directory the container can read
|
||||
(since `code-quality/check` takes a caller-supplied `projectPath`).
|
||||
- Read the three host logs that are mounted in.
|
||||
|
||||
In other words, an admin token is **equivalent to a host shell**, modulo the
|
||||
specific commands the codebase chooses to wrap. There is currently **no
|
||||
allow-list wrapper** between the backend and the docker socket; the backend
|
||||
constructs `docker ...` shell strings directly with `execAsync`.
|
||||
|
||||
### Known sharp edges (track and shrink)
|
||||
|
||||
1. **Container name is interpolated into a shell string.** `docker restart
|
||||
"${name}"` and similar paths in `vm/repository.ts` use `execAsync` with a
|
||||
template literal. The `:name` path parameter is admin-only but is not
|
||||
validated against a `^[a-zA-Z0-9._-]+$` allow-list. Lock this down before
|
||||
exposing the dashboard to a wider admin pool.
|
||||
2. **`projectPath` for `/code-quality/check` is unvalidated.** The handler
|
||||
passes the caller-supplied path straight into `execAsync({ cwd })`. Even
|
||||
with `requireAdmin` added, this should be constrained to a known set of
|
||||
project roots (or rejected if it escapes the workspace).
|
||||
3. **No per-route audit-log on shell-outs.** `audit/repository.ts` records
|
||||
deployment triggers but not `/docker/cleanup` or `/vm/cleanup`. A leaked
|
||||
token's actions are reconstructable only from container stdout + host logs.
|
||||
4. **The container runs as root.** Both the backend `Dockerfile` and the
|
||||
bind-mounts assume root. A non-root user with `docker` group membership would
|
||||
shrink the in-container blast radius without losing functionality (the
|
||||
socket is still root on the host); revisit when ready.
|
||||
5. **`fastify-rate-limit` is global, not per-route.** A leaked admin token
|
||||
currently isn't slowed down on the destructive endpoints any more than it
|
||||
is on read-only ones.
|
||||
|
||||
### Mitigation roadmap (incremental, not all at once)
|
||||
|
||||
- [x] **P1:** Allow-list wrapper around shell-outs. *(`lib/shell.ts` ships with
|
||||
`execAllowed` (no shell, just `execFile` with an explicit argv) plus
|
||||
per-command helpers — `dockerRestart(name)` validates against
|
||||
`[a-zA-Z0-9][a-zA-Z0-9._-]{0,127}`, `dockerPrune(kind, {all?})` validates
|
||||
kind ∈ {container,image,volume,builder} and rejects `--all` on non-image,
|
||||
`runBashScript(path, args, {allowedRoots})` and `runNpmScript(script,
|
||||
{cwd, allowedRoots})` lock both the script path and cwd to a configured
|
||||
set of roots. 17 unit tests cover the rejection paths; `vm/restartContainer`
|
||||
and `system/dockerCleanup` migrated. Module covered by the test:coverage
|
||||
gate (≥95% lines).)*
|
||||
- [x] **P1:** Validate `/code-quality/check`'s `projectPath` against a
|
||||
configured set of allowed roots. *(`runCodeQualityCheck` now calls
|
||||
`assertPathInAllowedRoots(projectPath, getAllowedRoots())` before any
|
||||
lifecycle script runs; `getAllowedRoots()` reads
|
||||
`CODE_QUALITY_ALLOWED_ROOTS` (colon-separated) with a default of
|
||||
`/opt/bytelyst`. The path is also re-resolved (normalised, `..`
|
||||
collapsed) before being passed to `runNpmScript`, which lifts it to its
|
||||
own argv slot — no shell interpolation.)*
|
||||
- [x] **P2:** Audit-log every shell-out (command + arg vector + actor + result).
|
||||
*(Audit schema extended with `action: 'shell-exec'` + `entityType: 'host'`.
|
||||
`POST /docker/cleanup`, `POST /vm/cleanup`, `POST /vm/containers/:name/restart`
|
||||
now write a Cosmos audit row including the actor (`authUserId`/`authRole`),
|
||||
entity id (`docker-cleanup:<type>` etc.), and a sanitized details payload.
|
||||
Audit writes are best-effort — a Cosmos hiccup logs a warn but never
|
||||
fails the request.)*
|
||||
- [x] **P2:** Run the backend container as a non-root user with `docker` group
|
||||
membership; rebuild the Dockerfile accordingly. *(Dockerfile scaffolds
|
||||
a non-root `app` user (uid 1001) with `docker` group membership at a
|
||||
build-arg-configurable GID. Default `BACKEND_USER=root` preserves the
|
||||
current behaviour so existing deployments don't break; set
|
||||
`BACKEND_USER=app` and `DOCKER_GID=$(getent group docker | cut -d: -f3)`
|
||||
to flip it on. Requires host-side prep on the bind-mounted log files —
|
||||
see "Running non-root" below for the exact `chmod`/`chgrp` recipe.)*
|
||||
- [ ] **P3:** Move from `docker.sock` to a thin daemon (`docker-proxy`-style)
|
||||
that exposes only the verbs the dashboard actually needs (`stats`,
|
||||
`restart`, `logs`, the four `prune` variants).
|
||||
|
||||
### Running non-root
|
||||
|
||||
Concrete recipe to flip the backend off root:
|
||||
|
||||
```bash
|
||||
# 1. Find the host's docker group GID
|
||||
DOCKER_GID=$(getent group docker | cut -d: -f3)
|
||||
|
||||
# 2. Make the bind-mounted log files group-owned by docker and group-writable
|
||||
# so the in-container `app` user (gid=$DOCKER_GID) can read/write them.
|
||||
sudo chgrp docker /var/log/vm-cleanup.log /var/log/vm-health-check.log /var/log/docker-watchdog.log
|
||||
sudo chmod g+rw /var/log/vm-cleanup.log /var/log/vm-health-check.log /var/log/docker-watchdog.log
|
||||
|
||||
# 3. Confirm the VM scripts mount is world-readable (it's read-only inside
|
||||
# the container, so mode 0755 on the directory is enough).
|
||||
sudo chmod -R o+rX /opt/bytelyst/learning_ai_devops_tools/scripts
|
||||
|
||||
# 4. Rebuild the backend image with BACKEND_USER=app and the host's GID.
|
||||
cd /opt/bytelyst/learning_ai_devops_tools/dashboard
|
||||
docker compose build --build-arg BACKEND_USER=app --build-arg DOCKER_GID=$DOCKER_GID backend
|
||||
|
||||
# 5. Restart and verify
|
||||
docker compose up -d backend
|
||||
docker exec devops-backend whoami # → app
|
||||
docker exec devops-backend id # uid=1001(app) gid=$DOCKER_GID(docker)
|
||||
curl -fsS http://localhost:4004/health
|
||||
```
|
||||
|
||||
If the backend can't reach the docker socket after the flip, double-check
|
||||
that the gid reported by the in-container `id` matches the GID that
|
||||
`getent group docker` prints on the host. The `docker.sock` bind-mount carries
|
||||
its host ownership into the container, so the in-container gid must match.
|
||||
|
||||
Operators reviewing whether to grant a new admin should read this whole section
|
||||
before doing so. Adding a new shell-out path in code is a **privilege change**
|
||||
and must update the shell-out routes table in this section in the same commit.
|
||||
|
||||
## Production Checklist
|
||||
|
||||
- [ ] Platform stack running with Traefik.
|
||||
- [ ] DNS records configured.
|
||||
- [ ] SSL/TLS certificates configured in Traefik.
|
||||
- [ ] Environment variables set for production.
|
||||
- [ ] Cosmos DB connection configured.
|
||||
- [ ] `JWT_SECRET` matches across all services.
|
||||
- [ ] User memberships configured for access.
|
||||
- [ ] Health checks passing.
|
||||
- [ ] Cross-navigation links working.
|
||||
- [ ] Monitoring and logging configured.
|
||||
|
||||
## Features Implemented
|
||||
|
||||
### Backend (port 4004)
|
||||
The dashboard includes these production-ready features:
|
||||
|
||||
### Backend (Port 4004)
|
||||
- ✅ CI/CD pipeline with Gitea Actions
|
||||
- ✅ E2E tests with Playwright (gated; see `.gitea/workflows/ci.yml`)
|
||||
- ✅ E2E tests with Playwright
|
||||
- ✅ Telemetry integration
|
||||
- ✅ Error boundary
|
||||
- ✅ CSRF protection with token refresh
|
||||
- ✅ Service CRUD operations
|
||||
- ✅ Deployment log retrieval (JSON polling — no SSE; see backend README)
|
||||
- ✅ Real-time log streaming (SSE)
|
||||
- ✅ Audit logging
|
||||
- ✅ Structured logging
|
||||
- ✅ Database migrations
|
||||
@ -537,13 +115,58 @@ and must update this table in the same commit.
|
||||
- ✅ Docker cleanup endpoints
|
||||
- ✅ OpenAPI/Swagger documentation at `/docs`
|
||||
|
||||
### Frontend (container :3000, host :3049 under Compose)
|
||||
### Frontend (Port 3000)
|
||||
- ✅ Service management UI
|
||||
- ✅ Deployment monitoring
|
||||
- ✅ Health dashboard
|
||||
- ✅ Metrics/charts page
|
||||
- ✅ System management page
|
||||
- ✅ Log viewer (poll-based)
|
||||
- ✅ Real-time log viewer
|
||||
- ✅ Accessibility features (ARIA, keyboard nav)
|
||||
- ✅ PWA manifest
|
||||
- ✅ Responsive design
|
||||
|
||||
## Services Configured
|
||||
|
||||
The dashboard can deploy:
|
||||
1. **Investment Trading** (`learning_ai_invt_trdg`)
|
||||
2. **Agentic Notes** (`learning_ai_notes`)
|
||||
3. **AI Clock** (`learning_ai_clock`)
|
||||
4. **Platform Services** (`learning_ai_common_plat`) - can be added
|
||||
|
||||
## Next Steps for Production Deployment
|
||||
|
||||
1. **Resolve Workspace Dependencies**: Ensure common platform packages are accessible
|
||||
2. **Configure Environment Variables**: Set production values for Cosmos, JWT, etc.
|
||||
3. **Set Up Infrastructure**: Azure Cosmos DB, platform service instance
|
||||
4. **Configure CI/CD**: Update Gitea Actions with production registry
|
||||
5. **Test Deployments**: Verify all deployment scripts work in production
|
||||
6. **Set Up Monitoring**: Configure logging, metrics, and alerting
|
||||
|
||||
## Access
|
||||
|
||||
- **Dashboard**: http://localhost:3000 (or production URL)
|
||||
- **API**: http://localhost:4004 (or production URL)
|
||||
- **API Docs**: http://localhost:4004/docs
|
||||
- **System Management**: Navigate to System page in dashboard
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**Workspace dependency errors:**
|
||||
```bash
|
||||
# Use the install scripts provided
|
||||
pnpm install:common-plat # For local development
|
||||
pnpm install:gitea # For Gitea environment
|
||||
```
|
||||
|
||||
**Docker build failures:**
|
||||
- Ensure Dockerfiles reference correct lock files
|
||||
- Check that all dependencies are in registry
|
||||
- Verify context paths in docker-compose.yml
|
||||
|
||||
**Port conflicts:**
|
||||
- Backend uses port 4004
|
||||
- Web uses port 3000
|
||||
- Ensure these ports are available
|
||||
|
||||
The dashboard is feature-complete and ready for production deployment once the dependency infrastructure is resolved.
|
||||
|
||||
@ -1,5 +1,339 @@
|
||||
# DevOps & Admin Dashboard Deployment Guide
|
||||
|
||||
This file is a redirect kept for backwards compatibility (e.g. `deploy.sh`).
|
||||
The canonical deployment guide is now [`DEPLOYMENT.md`](./DEPLOYMENT.md). Open
|
||||
that file for the current content; do not edit this stub.
|
||||
## Overview
|
||||
|
||||
This guide covers deploying both the DevOps Dashboard and Platform Admin Dashboard using the existing Traefik gateway infrastructure, following the same pattern as the trading dashboard (https://invttrdg.bytelyst.com).
|
||||
|
||||
## URLs
|
||||
|
||||
- **DevOps Dashboard**: `https://devops.bytelyst.com`
|
||||
- **Admin Dashboard**: `https://admin.bytelyst.com`
|
||||
- **API Gateway**: `https://api.bytelyst.com`
|
||||
- Platform API: `https://api.bytelyst.com/platform/api`
|
||||
- DevOps API: `https://api.bytelyst.com/api/devops`
|
||||
|
||||
## Architecture
|
||||
|
||||
Both dashboards follow the same pattern as the trading dashboard:
|
||||
|
||||
```
|
||||
Internet → Traefik Gateway → Services
|
||||
├─ DevOps Web (port 3049)
|
||||
├─ DevOps Backend (port 4004)
|
||||
├─ Admin Web (port 3001)
|
||||
├─ Platform Service (port 4003)
|
||||
└─ Trading Dashboard (port 3085)
|
||||
```
|
||||
|
||||
- **Traefik**: Acts as API gateway and reverse proxy
|
||||
- **Docker Network**: All services connect via `learning_ai_common_plat_default`
|
||||
- **Domain Routing**: Traefik routes based on host headers
|
||||
- **SSL/TLS**: Managed by Traefik with Let's Encrypt
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. Platform stack running with Traefik gateway
|
||||
2. Docker and Docker Compose installed
|
||||
3. Domain names configured with DNS pointing to your server
|
||||
4. Azure Cosmos DB account (shared with platform-service)
|
||||
5. Platform Service running and accessible
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Start Platform Stack (if not running)
|
||||
|
||||
```bash
|
||||
cd /opt/bytelyst/learning_ai_common_plat
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
### 2. Deploy Dashboards
|
||||
|
||||
```bash
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
./deploy.sh
|
||||
```
|
||||
|
||||
This will:
|
||||
- Deploy DevOps Dashboard (backend + web)
|
||||
- Deploy Admin Dashboard via platform stack
|
||||
- Run health checks
|
||||
- Show deployment information
|
||||
|
||||
## Manual Deployment
|
||||
|
||||
### Deploy DevOps Dashboard
|
||||
|
||||
```bash
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
docker-compose up -d --build
|
||||
```
|
||||
|
||||
### Deploy Admin Dashboard
|
||||
|
||||
```bash
|
||||
cd /opt/bytelyst/learning_ai_common_plat
|
||||
docker-compose up -d admin-web
|
||||
```
|
||||
|
||||
## Environment Configuration
|
||||
|
||||
### DevOps Dashboard (.env)
|
||||
|
||||
```bash
|
||||
# Backend
|
||||
PORT=4004
|
||||
PLATFORM_SERVICE_URL=http://platform-service:4003
|
||||
COSMOS_ENDPOINT=https://your-cosmos-account.documents.azure.com:443/
|
||||
COSMOS_KEY=your-cosmos-primary-key
|
||||
COSMOS_DATABASE=bytelyst-platform
|
||||
JWT_SECRET=your-production-jwt-secret
|
||||
CSRF_SECRET=your-production-csrf-secret
|
||||
ENCRYPTION_KEY=your-production-encryption-key
|
||||
PRODUCT_ID=bytelyst-devops
|
||||
PRODUCT_NAME=ByteLyst DevOps Dashboard
|
||||
|
||||
# Azure Key Vault (optional)
|
||||
AZURE_TENANT_ID=your-tenant-id
|
||||
AZURE_CLIENT_ID=your-client-id
|
||||
AZURE_CLIENT_SECRET=your-client-secret
|
||||
AZURE_KEY_VAULT_URL=https://your-keyvault.vault.azure.net/
|
||||
|
||||
# Frontend
|
||||
NEXT_PUBLIC_DEVOPS_API_URL=https://api.bytelyst.com/devops
|
||||
NEXT_PUBLIC_PLATFORM_URL=https://api.bytelyst.com/platform/api
|
||||
NEXT_PUBLIC_ADMIN_WEB_URL=https://admin.bytelyst.com
|
||||
NEXT_PUBLIC_PRODUCT_ID=bytelyst-devops
|
||||
NEXT_PUBLIC_PRODUCT_NAME=ByteLyst DevOps Dashboard
|
||||
```
|
||||
|
||||
### Platform Dashboard (.env)
|
||||
|
||||
Add to your platform `.env`:
|
||||
|
||||
```bash
|
||||
# Admin Web Dashboard
|
||||
NEXT_PUBLIC_PLATFORM_URL=https://api.bytelyst.com/platform/api
|
||||
NEXT_PUBLIC_DEVOPS_WEB_URL=https://devops.bytelyst.com
|
||||
```
|
||||
|
||||
## Traefik Configuration
|
||||
|
||||
Both dashboards use Traefik labels for routing:
|
||||
|
||||
### DevOps Web
|
||||
```yaml
|
||||
labels:
|
||||
- 'traefik.enable=true'
|
||||
- 'traefik.http.routers.devops-web.rule=Host(`devops.bytelyst.com`)'
|
||||
- 'traefik.http.services.devops-web.loadbalancer.server.port=3000'
|
||||
```
|
||||
|
||||
### DevOps Backend API
|
||||
```yaml
|
||||
labels:
|
||||
- 'traefik.enable=true'
|
||||
- 'traefik.http.routers.devops-api.rule=PathPrefix(`/api/devops`)'
|
||||
- 'traefik.http.services.devops-api.loadbalancer.server.port=4004'
|
||||
```
|
||||
|
||||
### Admin Web
|
||||
```yaml
|
||||
labels:
|
||||
- 'traefik.enable=true'
|
||||
- 'traefik.http.routers.admin-web.rule=Host(`admin.bytelyst.com`)'
|
||||
- 'traefik.http.services.admin-web.loadbalancer.server.port=3001'
|
||||
```
|
||||
|
||||
## DNS Configuration
|
||||
|
||||
Add DNS records pointing to your Traefik gateway server:
|
||||
|
||||
```
|
||||
devops.bytelyst.com A <your-server-ip>
|
||||
admin.bytelyst.com A <your-server-ip>
|
||||
api.bytelyst.com A <your-server-ip>
|
||||
```
|
||||
|
||||
## SSL/TLS Configuration
|
||||
|
||||
Traefik can automatically handle SSL certificates with Let's Encrypt. Add to your Traefik configuration:
|
||||
|
||||
```yaml
|
||||
command:
|
||||
- '--certificatesresolvers.myresolver.acme.tlschallenge=true'
|
||||
- '--certificatesresolvers.myresolver.acme.email=admin@bytelyst.com'
|
||||
- '--certificatesresolvers.myresolver.acme.storage=/letsencrypt/acme.json'
|
||||
```
|
||||
|
||||
Then update router labels:
|
||||
|
||||
```yaml
|
||||
labels:
|
||||
- 'traefik.http.routers.devops-web.tls=true'
|
||||
- 'traefik.http.routers.devops-web.tls.certresolver=myresolver'
|
||||
```
|
||||
|
||||
## Cross-Navigation Features
|
||||
|
||||
Both dashboards include cross-navigation links:
|
||||
|
||||
### DevOps Dashboard → Admin Dashboard
|
||||
- Header includes "Platform Admin" link with Shield icon
|
||||
- Opens admin dashboard in new tab
|
||||
- Uses configured `NEXT_PUBLIC_ADMIN_WEB_URL`
|
||||
|
||||
### Admin Dashboard → DevOps Dashboard
|
||||
- Sidebar includes "DevOps Dashboard" link with Server icon
|
||||
- Opens devops dashboard in new tab
|
||||
- Uses configured `NEXT_PUBLIC_DEVOPS_WEB_URL`
|
||||
|
||||
## Shared Authentication
|
||||
|
||||
Both dashboards use the same authentication system:
|
||||
|
||||
1. **Platform Service Auth**: Both authenticate against platform-service
|
||||
2. **JWT Tokens**: Same JWT secret validates tokens across services
|
||||
3. **Per-Product Access**: Admin access is checked per-product via membership roles
|
||||
4. **Single Sign-On**: Users stay logged in across both dashboards
|
||||
|
||||
### Granting Access
|
||||
|
||||
To grant a user access to both dashboards:
|
||||
|
||||
1. Ensure user exists in platform-service
|
||||
2. Add admin membership for both products:
|
||||
|
||||
```json
|
||||
{
|
||||
"memberships": [
|
||||
{
|
||||
"productId": "bytelyst-devops",
|
||||
"role": "admin",
|
||||
"plan": "pro"
|
||||
},
|
||||
{
|
||||
"productId": "bytelyst-platform",
|
||||
"role": "admin",
|
||||
"plan": "pro"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Health Checks
|
||||
|
||||
- DevOps Backend: `http://localhost:4004/health`
|
||||
- DevOps Web: `http://localhost:3049`
|
||||
- Admin Web: `http://localhost:3001`
|
||||
- Traefik Dashboard: `http://localhost:8080`
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Network Issues
|
||||
```bash
|
||||
# Check if platform network exists
|
||||
docker network inspect learning_ai_common_plat_default
|
||||
|
||||
# Check container connectivity
|
||||
docker network inspect learning_ai_common_plat_default | grep devops
|
||||
```
|
||||
|
||||
### Traefik Routing
|
||||
```bash
|
||||
# Check Traefik dashboard
|
||||
http://localhost:8080
|
||||
|
||||
# Check Traefik logs
|
||||
docker logs $(docker ps -q -f name=gateway)
|
||||
|
||||
# Check router configuration
|
||||
docker inspect devops-web | grep -A 10 Labels
|
||||
```
|
||||
|
||||
### Authentication Failures
|
||||
- Verify `JWT_SECRET` matches across all services
|
||||
- Check platform-service is accessible: `curl http://localhost:4003/health`
|
||||
- Ensure user has proper product memberships
|
||||
|
||||
### Service Not Starting
|
||||
```bash
|
||||
# Check service logs
|
||||
docker logs devops-backend
|
||||
docker logs devops-web
|
||||
docker logs admin-web
|
||||
|
||||
# Check health status
|
||||
docker ps
|
||||
docker inspect devops-backend | grep -A 5 Health
|
||||
```
|
||||
|
||||
## Monitoring
|
||||
|
||||
Both dashboards include:
|
||||
- Performance monitoring hooks
|
||||
- Audit logging
|
||||
- Health check endpoints
|
||||
- Error tracking
|
||||
|
||||
Monitor these through:
|
||||
- Traefik Dashboard: `http://localhost:8080`
|
||||
- Grafana (if configured): `http://localhost:3000`
|
||||
- Loki logs (if configured): `http://localhost:3100`
|
||||
|
||||
## Comparison with Trading Dashboard
|
||||
|
||||
| Feature | Trading | DevOps | Admin |
|
||||
|---------|---------|--------|-------|
|
||||
| Domain | invttrdg.bytelyst.com | devops.bytelyst.com | admin.bytelyst.com |
|
||||
| Web Port | 3085 | 3049 | 3001 |
|
||||
| Backend Port | 4018 | 4004 | N/A |
|
||||
| Network | platform_net | platform_net | default |
|
||||
| Traefik | Yes | Yes | Yes |
|
||||
| Auth | Platform | Platform | Platform |
|
||||
|
||||
## Service Management
|
||||
|
||||
### Stop Services
|
||||
```bash
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
docker-compose down
|
||||
|
||||
cd /opt/bytelyst/learning_ai_common_plat
|
||||
docker-compose stop admin-web
|
||||
```
|
||||
|
||||
### Restart Services
|
||||
```bash
|
||||
cd /opt/bytelyst/bytelyst-devops-tools/dashboard
|
||||
docker-compose restart
|
||||
|
||||
cd /opt/bytelyst/learning_ai_common_plat
|
||||
docker-compose restart admin-web
|
||||
```
|
||||
|
||||
### View Logs
|
||||
```bash
|
||||
# DevOps
|
||||
docker logs -f devops-backend
|
||||
docker logs -f devops-web
|
||||
|
||||
# Admin
|
||||
docker logs -f admin-web
|
||||
|
||||
# Traefik
|
||||
docker logs -f gateway
|
||||
```
|
||||
|
||||
## Production Checklist
|
||||
|
||||
- [ ] Platform stack running with Traefik
|
||||
- [ ] DNS records configured
|
||||
- [ ] SSL/TLS certificates configured in Traefik
|
||||
- [ ] Environment variables set for production
|
||||
- [ ] Cosmos DB connection configured
|
||||
- [ ] `JWT_SECRET` matches across all services
|
||||
- [ ] User memberships configured for access
|
||||
- [ ] Health checks passing
|
||||
- [ ] Cross-navigation links working
|
||||
- [ ] Monitoring and logging configured
|
||||
|
||||
@ -4,14 +4,6 @@ Canonical URL reference for the ByteLyst DevOps dashboard workspace.
|
||||
|
||||
Use this document when you need the dashboard website URL, browser routes, backend API endpoints, health checks, or the related integration URLs referenced by the dashboard.
|
||||
|
||||
> **Local port note:** every `http://localhost:3000` URL in this file refers to
|
||||
> the **`pnpm dev`** workflow, where Next listens directly on 3000. Under the
|
||||
> Docker Compose deployment in this repo, the same web container is exposed on
|
||||
> the host as **`http://localhost:3049`** (compose maps `127.0.0.1:3049:3000`).
|
||||
> Substitute `:3049` for `:3000` whenever you're hitting the dockerized stack.
|
||||
> Production traffic goes through Traefik on `https://devops.bytelyst.com` and
|
||||
> doesn't expose either port. See `DEPLOYMENT.md` for the full port table.
|
||||
|
||||
## Canonical Bases
|
||||
|
||||
| Surface | Local | Production | Notes |
|
||||
|
||||
@ -23,7 +23,7 @@ dashboard/
|
||||
- **Service Registry**: Manage all ByteLyst services (trading, notes, clock, etc.)
|
||||
- **Deployment Orchestration**: Trigger deployments via existing bash scripts
|
||||
- **Health Monitoring**: Real-time health checks for all services with caching
|
||||
- **Deployment History**: Audit trail of all deployments with captured logs (JSON-polled by the web client; no SSE)
|
||||
- **Deployment History**: Audit trail of all deployments with log streaming
|
||||
- **Cross-Navigation**: One-click link to Platform Admin dashboard
|
||||
- **Hermes Mission Control**: Read-only mock dashboard for portfolio-wide execution, task ledger, product health, history, agents, and settings
|
||||
- **Testing**: Vitest for backend, React Testing Library for frontend
|
||||
@ -50,9 +50,11 @@ dashboard/
|
||||
- Validated path parameters, query parameters, and request bodies
|
||||
- Strict validation on update operations to prevent accidental field changes
|
||||
|
||||
### Deployment Logs
|
||||
- Endpoint `GET /api/deployments/:id/logs` returns the full captured stdout/stderr + current status as a single JSON payload (admin only).
|
||||
- The web client polls this endpoint while a deployment is `running`. There is intentionally no SSE/WebSocket stream — the previous attempt with `fastify-sse-v2` was incompatible with Fastify 5 and was removed. If a real-time stream is needed later, implement it explicitly via `reply.raw` and update this section in the same change.
|
||||
### Deployment Log Streaming
|
||||
- Added SSE endpoint for real-time log streaming (`GET /api/deployments/:id/logs`)
|
||||
- Frontend EventSource integration with cleanup function
|
||||
- Automatic polling for running deployments (1-second interval)
|
||||
- Proper connection cleanup on client disconnect
|
||||
|
||||
### Security Enhancements
|
||||
- Added rate limiting: 100 requests per minute per IP
|
||||
@ -102,7 +104,7 @@ pnpm dev # Runs on port 4004
|
||||
```bash
|
||||
cd web
|
||||
cp .env.local.example .env.local # Add your URLs
|
||||
pnpm dev # Next dev server on http://localhost:3000 (no Docker)
|
||||
pnpm dev # Runs on port 3000
|
||||
```
|
||||
|
||||
### Running Both
|
||||
@ -161,7 +163,7 @@ Production deployments use `https://api.bytelyst.com/devops` for `NEXT_PUBLIC_DE
|
||||
- `GET /api/deployments` - Recent deployments (with `?limit=` query param)
|
||||
- `GET /api/deployments/service/:serviceId` - Deployments for specific service
|
||||
- `GET /api/deployments/:id` - Single deployment
|
||||
- `GET /api/deployments/:id/logs` - Get captured deployment logs as JSON (web client polls this; no SSE)
|
||||
- `GET /api/deployments/:id/logs` - Stream deployment logs via SSE
|
||||
- `POST /api/deployments/trigger/:serviceId` - Trigger deployment (admin only)
|
||||
|
||||
### Health
|
||||
@ -197,8 +199,8 @@ See [DEPLOYMENT.md](./DEPLOYMENT.md) for detailed deployment instructions.
|
||||
|
||||
Deploy as a ByteLyst product:
|
||||
- Product ID: `devops-internal`
|
||||
- Backend port: 4004 (host) / 4004 (container)
|
||||
- Web port: 3000 (container) — exposed on host as **`localhost:3049`** under Docker Compose; dev mode (`pnpm dev`) listens directly on `localhost:3000`. See [`DEPLOYMENT.md`](./DEPLOYMENT.md) for the full port table.
|
||||
- Backend port: 4004
|
||||
- Web port: 3000
|
||||
- Use existing deployment scripts in parent directory
|
||||
- Public API base: `https://api.bytelyst.com/devops`
|
||||
|
||||
|
||||
@ -1,91 +0,0 @@
|
||||
# Dashboard Repo Review — Top Actions
|
||||
|
||||
Reviewed: 2026-05-27. Scope: `/opt/bytelyst/learning_ai_devops_tools/dashboard` (the ByteLyst DevOps Dashboard pnpm workspace: `backend/` Fastify 5 + `web/` Next.js 16).
|
||||
|
||||
Baseline state (verified during review):
|
||||
- `pnpm typecheck` — passes for both backend and web.
|
||||
- `pnpm test:run` — passes (backend 9 tests / 1 file, web 11 tests / 2 files).
|
||||
- `pnpm secret-scan` — clean.
|
||||
- `.env` is gitignored; only `.env.example` files are tracked.
|
||||
|
||||
The dashboard is functional and well-structured, but several issues block CI, hide regressions, and create operational risk. Actions are ordered by priority.
|
||||
|
||||
---
|
||||
|
||||
## P0 — Broken / Urgent
|
||||
|
||||
### 1. CI workflow points at a non-existent path
|
||||
`.gitea/workflows/ci.yml` runs everything from `/opt/bytelyst/bytelyst-devops-tools/dashboard`, but the actual checkout lives at `/opt/bytelyst/learning_ai_devops_tools/dashboard`. The same wrong path is hard-coded in `DEPLOYMENT.md` and `scripts/deploy-hotcopy.sh`.
|
||||
|
||||
- Action: replace the hard-coded path with `${{ gitea.workspace }}` (or a single `WORKDIR` env var) in <ref_file file="/opt/bytelyst/learning_ai_devops_tools/dashboard/.gitea/workflows/ci.yml" />, then fix the two other references in <ref_file file="/opt/bytelyst/learning_ai_devops_tools/dashboard/DEPLOYMENT.md" /> and <ref_file file="/opt/bytelyst/learning_ai_devops_tools/dashboard/scripts/deploy-hotcopy.sh" />.
|
||||
- Verify: trigger a CI run on a throwaway branch and confirm green.
|
||||
|
||||
### 2. "Lint" steps are no-ops
|
||||
Both `backend/package.json` and `web/package.json` define `lint` as `echo 'No linting configured...'`. The CI step "Lint" therefore always passes regardless of code quality. There is no ESLint, Biome, or equivalent configured anywhere in the workspace.
|
||||
|
||||
- Action: pick one tool (recommend ESLint + `@typescript-eslint` for backend, Next.js's built-in ESLint config for web, since `next` already ships it). Wire `next lint` into `web/package.json` and add a minimal `.eslintrc` to backend.
|
||||
- Verify: `pnpm lint` returns a non-zero exit on a deliberately bad change.
|
||||
|
||||
---
|
||||
|
||||
## P1 — Important Gaps
|
||||
|
||||
### 3. Test coverage is extremely thin
|
||||
Backend has 12 modules (`services`, `deployments`, `health`, `audit`, `backup`, `system`, `env`, `azure-config`, `code-quality`, `cosmos-config`, `hermes-ops`, `vm`) but only `services` has a test file. The deployment orchestrator (`backend/src/modules/deployments/orchestrator.ts`), CSRF (`backend/src/lib/csrf.ts`), and auth (`backend/src/lib/auth.ts`) — the highest-risk surfaces — have no tests at all.
|
||||
|
||||
- Action: add `*.test.ts` for at least `auth`, `csrf`, `deployments/orchestrator`, and `health` repository before adding more features. Mirror the style of <ref_file file="/opt/bytelyst/learning_ai_devops_tools/dashboard/backend/src/modules/services/services.test.ts" />.
|
||||
- Add `pnpm test:coverage` to CI and fail under a threshold (start at 50%, raise over time).
|
||||
|
||||
### 4. SSE deployment-log streaming is disabled — RESOLVED (removed)
|
||||
The TODO has been resolved by **removing the SSE claim**, not by shipping it: the `fastify-sse-v2` dependency is gone from `backend/package.json`, the commented-out import + plugin registration are gone from `backend/src/server.ts`, and the deployment-log endpoint is now documented as JSON-polled. The web client never used `EventSource` (`web/src/lib/api.ts` already polls `/api/deployments/:id/logs` via the normal `apiRequest` helper), so no UI change was required. README/DEPLOYMENT.md updated to match. If a real-time stream is wanted later, ship it explicitly via `reply.raw` and update the docs in the same change.
|
||||
|
||||
### 5. Documentation drift — RESOLVED
|
||||
- `DEPLOYMENT.md` is now the single canonical deployment guide; `DEPLOYMENT_GUIDE.md` is reduced to a one-line redirect for backwards compat with `deploy.sh` and external links. `deploy.sh` updated to reference `DEPLOYMENT.md`.
|
||||
- `DEPLOYMENT.md` carries an explicit **Ports — quick reference** table that distinguishes container port (`:3000`), Compose host port (`:3049`), and the Traefik production URL — so the 3000-vs-3049 question has one truthful answer per deployment mode rather than three contradictory prose claims.
|
||||
- `README.md` "Web port: 3000" rewritten to call out container vs Compose host vs dev-mode explicitly and link to the port table.
|
||||
- `ENDPOINTS.md` got a top-of-file note: every `localhost:3000` URL in the file refers to `pnpm dev`; substitute `:3049` for the Dockerized stack. The `https://api.bytelyst.com/api/devops` vs `/devops` ambiguity was already resolved by the existing "URL Note" section (kept).
|
||||
|
||||
### 6. Docker socket + host log mounts are very privileged — RESOLVED (documented; allow-list wrapper queued)
|
||||
`DEPLOYMENT.md` now has a **Privilege Surface** section enumerating every host mount, every shell-outing route + the exact commands it runs, the auth gate on each, and an explicit blast-radius statement ("an admin token is equivalent to a host shell, modulo what the codebase wraps"). Concurrent fix: the `/code-quality/check` endpoint was missing `requireAdmin` and was therefore reachable unauthenticated even though it shells out to `npm run typecheck/lint/build/test:run` in a caller-supplied `projectPath` — that's been gated to admin in the same commit. Two follow-up P1s remain in the doc's mitigation roadmap (allow-list wrapper around shell-outs; validate `code-quality/check` `projectPath` against an allowed root set); P2/P3 cover audit-logging shell-outs, dropping root in the container, and moving off the raw docker socket.
|
||||
|
||||
---
|
||||
|
||||
## P2 — Hygiene
|
||||
|
||||
### 7. Backend module structure isn't enforced
|
||||
Most modules follow the `routes.ts / repository.ts / types.ts` triple, but a few have extras (`deployments/orchestrator.ts`). There is no architectural test, README, or generator. New contributors will diverge.
|
||||
|
||||
- Action: add a short `backend/src/modules/README.md` describing the convention, and (optionally) an architectural test using `dependency-cruiser` or a custom vitest.
|
||||
|
||||
### 8. README is unfocused
|
||||
`README.md` mixes "Recent Improvements" (a changelog), feature list, setup, env vars, and full API docs into one 219-line file. A `cat` of the file also shows two blank lines directly after the title, which makes the content that follows easy to miss.
|
||||
|
||||
- Action: trim README to: what / quickstart / pointers. Move "Recent Improvements" into `CHANGELOG.md` and keep API docs only in `ENDPOINTS.md` / Swagger.
|
||||
|
||||
### 9. `.pnpmfile.cjs` dual-mode install is undocumented in CI
|
||||
`pnpm install:common-plat` vs `pnpm install:gitea` is only mentioned in the README. The CI workflow uses `install:common-plat`, which only works if the runner has the sibling `learning_ai_common_plat` checkout available. That assumption isn't asserted anywhere.
|
||||
|
||||
- Action: add a pre-install check that fails fast with a clear message if the expected workspace path is missing, and document the runner prerequisites in the CI file.
|
||||
|
||||
### 10. No production logging / metrics story
|
||||
`backend/src/server.ts` uses Fastify's default logger only. There is a `web/src/lib/telemetry.ts` file but nothing wires it to a backend. The dashboard advertises "monitoring" but doesn't emit its own structured telemetry.
|
||||
|
||||
- Action: decide on a target (pino transports → stdout for container logs is enough for now) and write down the choice. If Prometheus / OpenTelemetry is in scope, file a tracked issue rather than leaving it implied.
|
||||
|
||||
### 11. E2E tests aren't wired into local workflow
|
||||
`web/e2e/dashboard.spec.ts` and `web/e2e/hermes.spec.ts` exist and `pnpm test:e2e` is defined, but nothing documents how to start the backend and web app before running them. CI's E2E step (visible in `.gitea/workflows/ci.yml`) was only partially visible during this review, so it still needs to be confirmed that it actually launches the stack.
|
||||
|
||||
- Action: read the bottom half of `ci.yml` and confirm the E2E job sets up backend+web; if not, fix it. Add a `pnpm test:e2e` recipe to README that explicitly says "run `pnpm dev` first" or use Playwright's `webServer` config.
|
||||
|
||||
---
|
||||
|
||||
## Suggested execution order
|
||||
|
||||
1. Fix the CI path (#1) — unblocks everything else.
|
||||
2. Reconcile the SSE TODO (#4) — either remove the claim or ship the feature.
|
||||
3. Add real linting (#2) and tighten test coverage on auth/csrf/orchestrator (#3).
|
||||
4. Documentation pass: ports, deployment docs, README trim (#5, #8).
|
||||
5. Privilege/operational hardening (#6, #10).
|
||||
6. Convention + DX polish (#7, #9, #11).
|
||||
|
||||
Each item above is small enough to land as a single PR.
|
||||
@ -10,8 +10,3 @@ AZURE_TENANT_ID=your-azure-tenant-id
|
||||
AZURE_CLIENT_ID=your-azure-client-id
|
||||
AZURE_CLIENT_SECRET=your-azure-client-secret
|
||||
AZURE_KEY_VAULT_URL=https://your-key-vault.vault.azure.net/
|
||||
|
||||
# Structured logging (pino → stdout). Override per environment as needed.
|
||||
# Levels: fatal | error | warn | info | debug | trace | silent
|
||||
# Default: debug in non-prod, info in prod (when NODE_ENV=production).
|
||||
LOG_LEVEL=info
|
||||
|
||||
@ -1,34 +1,14 @@
|
||||
# Build context: bytelyst-devops-tools/dashboard/ (monorepo root)
|
||||
#
|
||||
# Uses pnpm (matches `packageManager` field in package.json) and the
|
||||
# workspace `pnpm-lock.yaml` at the dashboard root. The previously-used
|
||||
# `npm ci` against `backend/package-lock.json` was broken because the
|
||||
# npm lockfile had been regenerated inside the pnpm workspace and
|
||||
# contained pnpm-store symlinks (e.g. node_modules/typescript pointing
|
||||
# at ../node_modules/.pnpm/typescript@5.9.3/...), which npm treated as
|
||||
# `link: true` and skipped installing — leaving `tsc` missing.
|
||||
#
|
||||
# BYTELYST_PACKAGE_SOURCE=gitea disables the `.pnpmfile.cjs` filesystem
|
||||
# lookup of `learning_ai_common_plat` (which isn't in the build context).
|
||||
# Backend has no `@bytelyst/*` deps so the pnpmfile is a no-op for it,
|
||||
# but we set the env explicitly for clarity.
|
||||
|
||||
# --- Stage 1: Build ---
|
||||
FROM node:20-alpine AS builder
|
||||
|
||||
ENV BYTELYST_PACKAGE_SOURCE=gitea
|
||||
RUN corepack enable && corepack prepare pnpm@10.6.5 --activate
|
||||
WORKDIR /app/backend
|
||||
|
||||
WORKDIR /app
|
||||
COPY backend/package.json backend/package-lock.json ./
|
||||
RUN npm ci --ignore-scripts
|
||||
|
||||
# Workspace metadata (pnpm needs the root files to resolve the workspace).
|
||||
COPY package.json pnpm-lock.yaml pnpm-workspace.yaml .pnpmfile.cjs ./
|
||||
COPY backend/package.json ./backend/
|
||||
|
||||
RUN pnpm install --frozen-lockfile --filter "@bytelyst/devops-backend..." --ignore-scripts
|
||||
|
||||
COPY backend/tsconfig.json ./backend/
|
||||
COPY backend/src/ ./backend/src/
|
||||
COPY backend/tsconfig.json ./
|
||||
COPY backend/src/ ./src/
|
||||
|
||||
# Build-time env vars (baked into the bundle)
|
||||
ARG BYTELYST_COMMIT_SHA=unknown
|
||||
@ -47,66 +27,22 @@ ENV BYTELYST_COMMIT_SHA=${BYTELYST_COMMIT_SHA} \
|
||||
BYTELYST_COMMIT_MESSAGE=${BYTELYST_COMMIT_MESSAGE} \
|
||||
BYTELYST_DOCKER_IMAGE=${BYTELYST_DOCKER_IMAGE}
|
||||
|
||||
WORKDIR /app/backend
|
||||
RUN pnpm run build
|
||||
|
||||
# Carve out a production-only deploy bundle (node_modules without devDeps).
|
||||
RUN pnpm --filter "@bytelyst/devops-backend" deploy --prod --legacy /deploy
|
||||
RUN npm run build
|
||||
|
||||
# --- Stage 2: Run ---
|
||||
# Use Debian slim (not Alpine) because vm-health-check.sh uses GNU df flags
|
||||
# (--output=pcent, --output=avail) that BusyBox df does not support.
|
||||
FROM node:20-slim AS runner
|
||||
FROM node:20-alpine AS runner
|
||||
|
||||
WORKDIR /app/backend
|
||||
|
||||
# Install tools needed by the VM management module:
|
||||
# bash — vm-health-check.sh and vm-cleanup.sh require bash
|
||||
# docker.io — docker CLI to communicate with the host daemon via socket
|
||||
# python3 — used in inline python3 -c snippets inside the scripts
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
curl bash docker.io python3 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
COPY backend/package.json backend/package-lock.json ./
|
||||
RUN npm ci --omit=dev --ignore-scripts
|
||||
RUN apk add --no-cache curl
|
||||
|
||||
# Non-root user setup (Phase 5 P2 mitigation roadmap, item #4).
|
||||
# The backend doesn't strictly need root — its only privileged action is
|
||||
# talking to the docker daemon, which group membership covers. We create
|
||||
# the user + a docker group at a build-arg-configurable GID so the GID
|
||||
# can match the host's docker group (`getent group docker` on the host).
|
||||
#
|
||||
# Default `BACKEND_USER=root` keeps the current behaviour so existing
|
||||
# deployments don't break. Set `BACKEND_USER=app` to run non-root; this
|
||||
# requires the bind-mounted log files in `/var/log/vm-*.log` and
|
||||
# `/var/log/docker-watchdog.log` to be group-readable+writable by the
|
||||
# matching docker GID (or world-readable for read-only paths). See
|
||||
# `dashboard/DEPLOYMENT.md` Privilege Surface → "Running non-root".
|
||||
ARG BACKEND_USER=root
|
||||
ARG DOCKER_GID=999
|
||||
# `docker.io` in debian:bookworm-slim creates a `docker` group at a
|
||||
# distro-chosen GID (commonly 101). Reconcile it to ${DOCKER_GID} so the
|
||||
# in-container group matches the host's docker GID. If no `docker` group
|
||||
# exists yet, create one at ${DOCKER_GID}.
|
||||
RUN if getent group docker >/dev/null; then \
|
||||
groupmod --gid "${DOCKER_GID}" docker; \
|
||||
else \
|
||||
groupadd --system --gid "${DOCKER_GID}" docker; \
|
||||
fi \
|
||||
&& useradd --system --create-home --uid 1001 --gid "${DOCKER_GID}" --shell /sbin/nologin app \
|
||||
&& chown -R app:"${DOCKER_GID}" /app
|
||||
|
||||
# Bring in the deploy bundle (package.json, prod node_modules) and compiled JS.
|
||||
COPY --from=builder --chown=app:${DOCKER_GID} /deploy/package.json ./package.json
|
||||
COPY --from=builder --chown=app:${DOCKER_GID} /deploy/node_modules ./node_modules
|
||||
COPY --from=builder --chown=app:${DOCKER_GID} /app/backend/dist ./dist
|
||||
COPY --from=builder /app/backend/dist ./dist
|
||||
|
||||
ENV NODE_ENV=production
|
||||
ENV PORT=4004
|
||||
|
||||
EXPOSE 4004
|
||||
|
||||
# Switch to non-root only when explicitly opted in via build arg. If the
|
||||
# arg is `app`, the next two layers actually drop privileges; if `root`,
|
||||
# they're a no-op.
|
||||
USER ${BACKEND_USER}
|
||||
|
||||
CMD ["node", "dist/server.js"]
|
||||
|
||||
@ -1,27 +0,0 @@
|
||||
import js from '@eslint/js';
|
||||
import tseslint from 'typescript-eslint';
|
||||
import globals from 'globals';
|
||||
|
||||
// Flat config (ESLint 9). Real linting — replaces the previous no-op `echo`.
|
||||
// Correctness rules from the recommended sets stay errors and fail CI;
|
||||
// stylistic/known-pattern rules are relaxed so the current tree is clean.
|
||||
export default tseslint.config(
|
||||
{ ignores: ['dist/**', 'coverage/**', 'node_modules/**'] },
|
||||
js.configs.recommended,
|
||||
...tseslint.configs.recommended,
|
||||
{
|
||||
files: ['**/*.{ts,mts,cts}'],
|
||||
languageOptions: {
|
||||
globals: { ...globals.node },
|
||||
},
|
||||
rules: {
|
||||
// Fastify request/reply are cast to `any` at framework boundaries.
|
||||
'@typescript-eslint/no-explicit-any': 'off',
|
||||
// Surface dead code without failing the build on work-in-progress.
|
||||
'@typescript-eslint/no-unused-vars': [
|
||||
'warn',
|
||||
{ argsIgnorePattern: '^_', varsIgnorePattern: '^_', caughtErrors: 'none' },
|
||||
],
|
||||
},
|
||||
},
|
||||
);
|
||||
2856
dashboard/backend/package-lock.json
generated
Normal file
2856
dashboard/backend/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
@ -13,7 +13,7 @@
|
||||
"test": "vitest",
|
||||
"test:run": "vitest run",
|
||||
"test:coverage": "vitest run --coverage",
|
||||
"lint": "eslint src",
|
||||
"lint": "echo 'No linting configured for backend'",
|
||||
"migrate": "tsx src/scripts/run-migrations.ts up",
|
||||
"migrate:rollback": "tsx src/scripts/run-migrations.ts down"
|
||||
},
|
||||
@ -26,20 +26,15 @@
|
||||
"@fastify/swagger-ui": "^5.2.1",
|
||||
"dotenv": "^16.4.5",
|
||||
"fastify": "^5.2.1",
|
||||
"fastify-sse-v2": "^4.2.2",
|
||||
"jose": "^6.1.2",
|
||||
"pino": "^10.3.1",
|
||||
"pino-pretty": "^13.1.3",
|
||||
"zod": "^3.24.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@eslint/js": "^9.18.0",
|
||||
"@types/node": "^25.0.3",
|
||||
"@vitest/coverage-v8": "3.2.4",
|
||||
"eslint": "^9.18.0",
|
||||
"globals": "^15.14.0",
|
||||
"tsx": "^4.21.0",
|
||||
"typescript": "^5.9.3",
|
||||
"typescript-eslint": "^8.20.0",
|
||||
"vitest": "^3.1.2"
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,135 +0,0 @@
|
||||
// `vi` must be imported explicitly: this suite calls `vi.mock(...)` below, and
// the sibling suites (csrf.test.ts, dashboard-alerts.test.ts) import it the
// same way rather than relying on vitest globals.
import { describe, it, expect, beforeEach, vi } from 'vitest';
import { SignJWT } from 'jose';
import type { FastifyRequest } from 'fastify';

// Mock config so the auth module sees a deterministic JWT secret + product id.
// Mocks must be declared before importing the SUT.
vi.mock('./config.js', () => ({
  config: { JWT_SECRET: 'test-jwt-secret-for-unit-tests' },
  productId: 'devops-internal',
}));
|
||||
|
||||
const { extractAuth, requireAdmin, AuthError } = await import('./auth.js');
|
||||
|
||||
const SECRET = new TextEncoder().encode('test-jwt-secret-for-unit-tests');
|
||||
|
||||
/**
 * Builds a signed HS256 JWT for the auth tests.
 *
 * @param payload Claims to embed; `payload.sub` (when present) is also used as the subject.
 * @param opts    Optional overrides; `issuer` defaults to `bytelyst-platform`.
 * @returns The compact serialized token, signed with the test `SECRET` and expiring in one hour.
 */
async function makeToken(payload: Record<string, unknown>, opts?: { issuer?: string }): Promise<string> {
  const issuer = opts?.issuer ?? 'bytelyst-platform';
  const subject = (payload.sub as string) ?? 'user-1';

  const unsigned = new SignJWT(payload)
    .setProtectedHeader({ alg: 'HS256' })
    .setIssuer(issuer)
    .setSubject(subject)
    .setExpirationTime('1h');

  return unsigned.sign(SECRET);
}
|
||||
|
||||
/**
 * Creates a minimal request stub that carries only the given headers.
 * The double cast is deliberate: the tests only need `headers`, not a full FastifyRequest.
 */
function reqWith(headers: Record<string, string>): FastifyRequest {
  const stub = { headers };
  return stub as unknown as FastifyRequest;
}
|
||||
|
||||
describe('extractAuth', () => {
|
||||
it('returns null when Authorization header is missing', async () => {
|
||||
expect(await extractAuth(reqWith({}))).toBeNull();
|
||||
});
|
||||
|
||||
it('returns null when Authorization is not a Bearer token', async () => {
|
||||
expect(await extractAuth(reqWith({ authorization: 'Basic abc' }))).toBeNull();
|
||||
});
|
||||
|
||||
it('returns null when the token is malformed', async () => {
|
||||
expect(await extractAuth(reqWith({ authorization: 'Bearer not-a-jwt' }))).toBeNull();
|
||||
});
|
||||
|
||||
it('returns null when issuer does not match', async () => {
|
||||
const token = await makeToken({ sub: 'u1', role: 'admin' }, { issuer: 'other-issuer' });
|
||||
expect(await extractAuth(reqWith({ authorization: `Bearer ${token}` }))).toBeNull();
|
||||
});
|
||||
|
||||
it('returns null when signature does not verify (wrong secret)', async () => {
|
||||
const wrong = await new SignJWT({ sub: 'u1', role: 'admin' })
|
||||
.setProtectedHeader({ alg: 'HS256' })
|
||||
.setIssuer('bytelyst-platform')
|
||||
.setExpirationTime('1h')
|
||||
.sign(new TextEncoder().encode('different-secret'));
|
||||
expect(await extractAuth(reqWith({ authorization: `Bearer ${wrong}` }))).toBeNull();
|
||||
});
|
||||
|
||||
it('elevates to admin when global role is admin', async () => {
|
||||
const token = await makeToken({ sub: 'u1', role: 'admin', email: 'a@b.com', productId: 'devops-internal' });
|
||||
const result = await extractAuth(reqWith({ authorization: `Bearer ${token}` }));
|
||||
expect(result).toEqual({
|
||||
userId: 'u1',
|
||||
role: 'admin',
|
||||
email: 'a@b.com',
|
||||
productId: 'devops-internal',
|
||||
});
|
||||
});
|
||||
|
||||
it('elevates to admin via per-product membership for the target productId', async () => {
|
||||
const token = await makeToken({
|
||||
sub: 'u2',
|
||||
role: 'user',
|
||||
products: [
|
||||
{ productId: 'other-product', role: 'admin', plan: 'pro' },
|
||||
{ productId: 'devops-internal', role: 'admin', plan: 'pro' },
|
||||
],
|
||||
});
|
||||
const result = await extractAuth(reqWith({ authorization: `Bearer ${token}` }));
|
||||
expect(result?.role).toBe('admin');
|
||||
expect(result?.userId).toBe('u2');
|
||||
});
|
||||
|
||||
it('does not elevate when product membership is for a different product', async () => {
|
||||
const token = await makeToken({
|
||||
sub: 'u3',
|
||||
role: 'user',
|
||||
products: [{ productId: 'other-product', role: 'admin', plan: 'pro' }],
|
||||
});
|
||||
const result = await extractAuth(reqWith({ authorization: `Bearer ${token}` }));
|
||||
expect(result?.role).toBe('user');
|
||||
});
|
||||
|
||||
it('does not elevate when product membership role is not admin', async () => {
|
||||
const token = await makeToken({
|
||||
sub: 'u4',
|
||||
role: 'user',
|
||||
products: [{ productId: 'devops-internal', role: 'viewer', plan: 'free' }],
|
||||
});
|
||||
const result = await extractAuth(reqWith({ authorization: `Bearer ${token}` }));
|
||||
expect(result?.role).toBe('user');
|
||||
});
|
||||
|
||||
it('defaults role to "user" when payload has no role', async () => {
|
||||
const token = await makeToken({ sub: 'u5' });
|
||||
const result = await extractAuth(reqWith({ authorization: `Bearer ${token}` }));
|
||||
expect(result?.role).toBe('user');
|
||||
});
|
||||
});
|
||||
|
||||
describe('requireAdmin', () => {
|
||||
let req: { authUserId?: string; authRole?: string };
|
||||
|
||||
beforeEach(() => {
|
||||
req = {};
|
||||
});
|
||||
|
||||
it('throws AuthError(403) when no auth context is present', () => {
|
||||
expect(() => requireAdmin(req as unknown as FastifyRequest)).toThrow(AuthError);
|
||||
try {
|
||||
requireAdmin(req as unknown as FastifyRequest);
|
||||
} catch (e) {
|
||||
expect((e as InstanceType<typeof AuthError>).statusCode).toBe(403);
|
||||
}
|
||||
});
|
||||
|
||||
it('throws AuthError(403) for non-admin role', () => {
|
||||
req.authUserId = 'u1';
|
||||
req.authRole = 'user';
|
||||
expect(() => requireAdmin(req as unknown as FastifyRequest)).toThrow(AuthError);
|
||||
});
|
||||
|
||||
it('returns userId when role is admin', () => {
|
||||
req.authUserId = 'u1';
|
||||
req.authRole = 'admin';
|
||||
expect(requireAdmin(req as unknown as FastifyRequest)).toEqual({ userId: 'u1' });
|
||||
});
|
||||
});
|
||||
@ -31,13 +31,5 @@ const envSchema = z.object({
|
||||
|
||||
export const config = envSchema.parse(process.env);
|
||||
|
||||
// Warn loudly when insecure default keys are in use
|
||||
if (config.CSRF_SECRET === 'default-csrf-secret-change-in-production') {
|
||||
console.warn('[config] WARNING: CSRF_SECRET is using the insecure default — set CSRF_SECRET in .env before deploying to production');
|
||||
}
|
||||
if (config.ENCRYPTION_KEY === 'default-encryption-key-change-in-production') {
|
||||
console.warn('[config] WARNING: ENCRYPTION_KEY is using the insecure default — set ENCRYPTION_KEY in .env before deploying to production');
|
||||
}
|
||||
|
||||
export const productId = productIdentity.productId;
|
||||
export const productName = productIdentity.name;
|
||||
|
||||
@ -1,77 +0,0 @@
|
||||
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
|
||||
|
||||
// Pin a deterministic CSRF secret. Mocks must be declared before importing the SUT.
|
||||
vi.mock('./config.js', () => ({
|
||||
config: { CSRF_SECRET: 'csrf-test-secret' },
|
||||
productId: 'devops-internal',
|
||||
}));
|
||||
|
||||
const { generateCsrfToken, validateCsrfToken, getSessionId } = await import('./csrf.js');
|
||||
|
||||
describe('generateCsrfToken / validateCsrfToken', () => {
|
||||
beforeEach(() => {
|
||||
vi.useFakeTimers();
|
||||
vi.setSystemTime(new Date('2026-01-01T00:00:00Z'));
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
vi.useRealTimers();
|
||||
});
|
||||
|
||||
it('produces a base64-encoded token that round-trips through validate', () => {
|
||||
const token = generateCsrfToken('session-1');
|
||||
expect(token).toMatch(/^[A-Za-z0-9+/=]+$/);
|
||||
expect(validateCsrfToken(token, 'session-1')).toBe(true);
|
||||
});
|
||||
|
||||
it('rejects when the session id does not match', () => {
|
||||
const token = generateCsrfToken('session-1');
|
||||
expect(validateCsrfToken(token, 'session-2')).toBe(false);
|
||||
});
|
||||
|
||||
it('rejects when the token has been tampered with (signature mismatch)', () => {
|
||||
const token = generateCsrfToken('session-1');
|
||||
const decoded = Buffer.from(token, 'base64').toString('utf-8');
|
||||
const [sid, ts] = decoded.split(':');
|
||||
// Replace the trailing hash with garbage of the same length.
|
||||
const tampered = Buffer.from(`${sid}:${ts}:${'0'.repeat(64)}`).toString('base64');
|
||||
expect(validateCsrfToken(tampered, 'session-1')).toBe(false);
|
||||
});
|
||||
|
||||
it('rejects when the token is older than the 1h window', () => {
|
||||
const token = generateCsrfToken('session-1');
|
||||
// Advance just past the 3_600_000ms cutoff.
|
||||
vi.setSystemTime(new Date(Date.now() + 3_600_001));
|
||||
expect(validateCsrfToken(token, 'session-1')).toBe(false);
|
||||
});
|
||||
|
||||
it('accepts when the token is just inside the 1h window', () => {
|
||||
const token = generateCsrfToken('session-1');
|
||||
vi.setSystemTime(new Date(Date.now() + 3_599_000));
|
||||
expect(validateCsrfToken(token, 'session-1')).toBe(true);
|
||||
});
|
||||
|
||||
it('rejects garbage input without throwing', () => {
|
||||
expect(validateCsrfToken('not-base64!!!', 'session-1')).toBe(false);
|
||||
expect(validateCsrfToken('', 'session-1')).toBe(false);
|
||||
});
|
||||
|
||||
it('produces different tokens for different sessions at the same instant', () => {
|
||||
const t1 = generateCsrfToken('session-a');
|
||||
const t2 = generateCsrfToken('session-b');
|
||||
expect(t1).not.toBe(t2);
|
||||
expect(validateCsrfToken(t1, 'session-b')).toBe(false);
|
||||
expect(validateCsrfToken(t2, 'session-a')).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe('getSessionId', () => {
|
||||
it('returns authUserId when present on the request', () => {
|
||||
expect(getSessionId({ authUserId: 'user-42' })).toBe('user-42');
|
||||
});
|
||||
|
||||
it('returns null when authUserId is absent', () => {
|
||||
expect(getSessionId({})).toBeNull();
|
||||
expect(getSessionId({ headers: {} })).toBeNull();
|
||||
});
|
||||
});
|
||||
@ -1,44 +0,0 @@
|
||||
import { beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
|
||||
const appendFileMock = vi.hoisted(() => vi.fn());
|
||||
vi.mock('fs/promises', () => ({ appendFile: appendFileMock }));
|
||||
|
||||
const { appendDashboardWarning, clearDashboardWarningDedupe } = await import('./dashboard-alerts.js');
|
||||
|
||||
describe('dashboard-alerts', () => {
|
||||
beforeEach(() => {
|
||||
vi.clearAllMocks();
|
||||
clearDashboardWarningDedupe();
|
||||
delete process.env.HERMES_DASHBOARD_ALERT_LOG;
|
||||
});
|
||||
|
||||
it('does nothing when the alert log is not configured', async () => {
|
||||
const wrote = await appendDashboardWarning({ severity: 'warn', instance: 'vijay', message: 'gateway down' });
|
||||
expect(wrote).toBe(false);
|
||||
expect(appendFileMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('writes a routed warning line when configured', async () => {
|
||||
process.env.HERMES_DASHBOARD_ALERT_LOG = '/tmp/hermes-dashboard-warnings.log';
|
||||
const wrote = await appendDashboardWarning(
|
||||
{ severity: 'critical', instance: 'bheem', message: 'backup missing' },
|
||||
Date.parse('2026-05-31T07:00:00Z'),
|
||||
);
|
||||
|
||||
expect(wrote).toBe(true);
|
||||
expect(appendFileMock).toHaveBeenCalledWith(
|
||||
'/tmp/hermes-dashboard-warnings.log',
|
||||
'2026-05-31T07:00:00.000Z CRITICAL instance=bheem backup missing\n',
|
||||
'utf8',
|
||||
);
|
||||
});
|
||||
|
||||
it('deduplicates for one hour and writes again after expiry', async () => {
|
||||
process.env.HERMES_DASHBOARD_ALERT_LOG = '/tmp/hermes-dashboard-warnings.log';
|
||||
const input = { severity: 'warn' as const, instance: 'all' as const, message: 'shared warning' };
|
||||
expect(await appendDashboardWarning(input, 1_000)).toBe(true);
|
||||
expect(await appendDashboardWarning(input, 2_000)).toBe(false);
|
||||
expect(await appendDashboardWarning(input, 3_602_000)).toBe(true);
|
||||
expect(appendFileMock).toHaveBeenCalledTimes(2);
|
||||
});
|
||||
});
|
||||
@ -1,48 +0,0 @@
|
||||
import { appendFile } from 'fs/promises';
|
||||
|
||||
type AlertSeverity = 'info' | 'warn' | 'critical';
|
||||
type AlertInstance = 'vijay' | 'bheem' | 'all';
|
||||
|
||||
interface DashboardWarningInput {
|
||||
severity: AlertSeverity;
|
||||
instance: AlertInstance;
|
||||
message: string;
|
||||
}
|
||||
|
||||
const DEDUPE_WINDOW_MS = 60 * 60 * 1000;
|
||||
const recent = new Map<string, number>();
|
||||
|
||||
/**
 * Map a severity to the upper-case token used in the log line.
 * Anything that is neither `critical` nor `warn` is reported as `INFO`.
 */
function severityToken(severity: AlertSeverity): string {
  switch (severity) {
    case 'critical':
      return 'CRITICAL';
    case 'warn':
      return 'WARNING';
    default:
      return 'INFO';
  }
}
|
||||
|
||||
/**
 * Build the dedupe key for a warning: severity, instance and message joined
 * by NUL characters.
 */
function alertKey(input: DashboardWarningInput): string {
  const { severity, instance, message } = input;
  return [severity, instance, message].join('\0');
}
|
||||
|
||||
/**
 * Drop every dedupe entry recorded more than `DEDUPE_WINDOW_MS` before `now`.
 * Entries exactly on the window boundary are kept.
 */
function purgeExpired(now: number): void {
  recent.forEach((recordedAt, key) => {
    const age = now - recordedAt;
    if (age > DEDUPE_WINDOW_MS) {
      recent.delete(key);
    }
  });
}
|
||||
|
||||
/**
 * Append one warning line to the file named by `HERMES_DASHBOARD_ALERT_LOG`.
 *
 * The line has the form `<ISO timestamp> <SEVERITY> instance=<instance> <message>\n`.
 * Identical warnings (same severity, instance and message) are written at most
 * once per `DEDUPE_WINDOW_MS`.
 *
 * @param input - The warning to record.
 * @param now - Timestamp in milliseconds; defaults to `Date.now()`.
 * @returns `true` when a line was appended; `false` when the log path is not
 *   configured or the warning was suppressed as a duplicate.
 * @throws Whatever `appendFile` rejects with. In that case the dedupe entry is
 *   rolled back so the next attempt is not silently suppressed.
 */
export async function appendDashboardWarning(input: DashboardWarningInput, now = Date.now()): Promise<boolean> {
  const logPath = process.env.HERMES_DASHBOARD_ALERT_LOG;
  if (!logPath) return false;

  purgeExpired(now);
  const key = alertKey(input);
  const previous = recent.get(key);
  // Compare against `undefined` rather than truthiness: a stored timestamp of
  // 0 is a real entry and must still suppress duplicates.
  if (previous !== undefined && now - previous <= DEDUPE_WINDOW_MS) return false;

  // Format first so an invalid `now` fails before any dedupe state is touched.
  const line = `${new Date(now).toISOString()} ${severityToken(input.severity)} instance=${input.instance} ${input.message}\n`;

  // Reserve the key before awaiting so overlapping calls for the same warning
  // are still deduplicated while the write is in flight.
  recent.set(key, now);
  try {
    await appendFile(logPath, line, 'utf8');
  } catch (error) {
    // The warning never reached the log; release the reservation (only if it
    // is still ours) so a later call can retry instead of being muted for an hour.
    if (recent.get(key) === now) recent.delete(key);
    throw error;
  }
  return true;
}
|
||||
|
||||
/**
 * Forget every remembered warning so the next call to
 * `appendDashboardWarning` is never treated as a duplicate.
 */
export function clearDashboardWarningDedupe(): void {
  recent.clear();
}
|
||||
@ -1,74 +0,0 @@
|
||||
// Centralized pino logger.
|
||||
//
|
||||
// Fastify already uses pino under the hood, but we want one configured pino
|
||||
// instance shared between Fastify (via `logger: <instance>` in `Fastify({...})`)
|
||||
// and any non-request code path (background tasks, repositories called outside
|
||||
// a request, scripts). Importing the same instance everywhere means uniform
|
||||
// formatting, redaction, and log level — and gives us one place to change
|
||||
// transport later.
|
||||
//
|
||||
// Env knobs:
|
||||
// LOG_LEVEL — pino level (`fatal|error|warn|info|debug|trace|silent`).
|
||||
// Default: `debug` in non-production, `info` in production.
|
||||
// NODE_ENV — `production` flips the default level.
|
||||
//
|
||||
// Redaction:
|
||||
// We strip Authorization headers and a small allow-list of secret-shaped
|
||||
// field names (`password`, `token`, `secret`, common Azure/JWT keys) from
|
||||
// any logged object so that an accidental `req.log.info(req.body)` or
|
||||
// `logger.error({ err, config }, ...)` doesn't leak credentials.
|
||||
|
||||
import pino from 'pino';
|
||||
|
||||
const isProd = process.env.NODE_ENV === 'production';
// An empty or whitespace-only LOG_LEVEL (e.g. `LOG_LEVEL=` in an env file) is
// not a usable level name, so treat it as unset and fall back to the
// environment default. `||` is deliberate here: `??` would let '' through.
const level = process.env.LOG_LEVEL?.trim() || (isProd ? 'info' : 'debug');
|
||||
|
||||
// Field paths we never want in logs. Pino's redact uses fast-redact's
// dot-path syntax, where `*` stands for exactly one path segment: `*.password`
// matches `{ config: { password } }` but NOT a top-level `{ password }`.
// Every secret-shaped key is therefore listed twice — bare (covers an
// accidental `req.log.info(req.body)`) and one level down (covers
// `logger.error({ err, config }, ...)`). This is a backstop, not the primary
// defense, so it does not try to be exhaustive.
const secretKeyNames = [
  'password',
  'token',
  'refreshToken',
  'refresh_token',
  'accessToken',
  'access_token',
  'csrfToken',
  'csrf_token',
  'JWT_SECRET',
  'CSRF_SECRET',
  'ENCRYPTION_KEY',
  'COSMOS_KEY',
  'AZURE_CLIENT_SECRET',
] as const;

const redactPaths: string[] = [
  // Headers (Fastify request log shape)
  'req.headers.authorization',
  'req.headers.cookie',
  'request.headers.authorization',
  'request.headers.cookie',
  'headers.authorization',
  'headers.cookie',
  // Secret-shaped keys at the top level and one level below it
  ...secretKeyNames.flatMap((key) => [key, `*.${key}`]),
];
|
||||
|
||||
/**
 * The single pino instance shared by the whole backend.
 *
 * Output is plain JSON in every environment. For pretty output locally, pipe
 * through `pino-pretty` from your shell — it is deliberately not bundled as a
 * runtime dependency.
 */
export const logger = pino({
  base: { service: 'devops-backend' },
  level,
  timestamp: pino.stdTimeFunctions.isoTime,
  redact: {
    censor: '[REDACTED]',
    paths: redactPaths,
  },
});
|
||||
|
||||
/**
 * Create a child of the shared logger whose lines carry a `module` field.
 * Intended for repositories and background workers so the origin of a log
 * line does not have to be repeated at every call site.
 *
 * @example
 * const log = childLogger('deployments/orchestrator');
 * log.error({ err, deploymentId }, 'background work failed');
 */
export function childLogger(module: string) {
  const bindings = { module };
  return logger.child(bindings);
}
|
||||
@ -1,165 +0,0 @@
|
||||
import { describe, it, expect, beforeEach, vi } from 'vitest';
|
||||
|
||||
const execFileMock = vi.hoisted(() => vi.fn());
|
||||
vi.mock('child_process', () => ({ execFile: execFileMock }));
|
||||
|
||||
const {
|
||||
assertPathInAllowedRoots,
|
||||
dockerPrune,
|
||||
dockerRestart,
|
||||
execAllowed,
|
||||
InvalidShellArgError,
|
||||
runBashScript,
|
||||
runNpmScript,
|
||||
} = await import('./shell.js');
|
||||
|
||||
/**
 * Program the mocked `execFile`. `handler` receives the command and argv of
 * each call and returns either an `error` (passed to the node-style callback
 * as its first argument) or the `stdout`/`stderr` to report; missing streams
 * default to the empty string.
 */
function setExec(handler: (cmd: string, args: string[]) => { error?: Error; stdout?: string; stderr?: string }) {
  type ExecCallback = (err: unknown, result?: { stdout: string; stderr: string }) => void;

  execFileMock.mockImplementation((command: string, args: string[], _opts: unknown, cb: ExecCallback) => {
    const outcome = handler(command, args);
    if (outcome.error) {
      cb(outcome.error);
      return;
    }
    cb(null, { stdout: outcome.stdout ?? '', stderr: outcome.stderr ?? '' });
  });
}
|
||||
|
||||
// execAllowed is the raw escape hatch: the command and argv must reach
// execFile exactly as given.
describe('execAllowed', () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it('passes argv through to execFile without a shell', async () => {
    setExec(() => ({ stdout: 'ok' }));

    const { stdout } = await execAllowed('docker', ['ps', '-a']);

    expect(stdout).toBe('ok');
    expect(execFileMock).toHaveBeenCalledTimes(1);
    const firstCall = execFileMock.mock.calls[0];
    expect(firstCall[0]).toBe('docker');
    expect(firstCall[1]).toEqual(['ps', '-a']);
  });
});
|
||||
|
||||
// dockerRestart must validate the container name before anything is spawned.
describe('dockerRestart', () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it('rejects names with shell metacharacters before reaching execFile', async () => {
    const hostileNames = ['foo; rm -rf /', 'foo bar', '$(whoami)', ''];
    for (const name of hostileNames) {
      await expect(dockerRestart(name)).rejects.toBeInstanceOf(InvalidShellArgError);
    }
    expect(execFileMock).not.toHaveBeenCalled();
  });

  it('accepts valid container names and forwards them as a single argv element', async () => {
    setExec(() => ({ stdout: 'restarted' }));

    await dockerRestart('hermes-gateway');

    // The verb and the name occupy separate argv slots; they are never joined
    // into one string that a shell could re-parse.
    const argv = execFileMock.mock.calls[0][1];
    expect(argv).toEqual(['restart', 'hermes-gateway']);
  });

  it('non-string input throws InvalidShellArgError', async () => {
    for (const notAString of [undefined, 123]) {
      // @ts-expect-error — testing runtime guard
      await expect(dockerRestart(notAString)).rejects.toBeInstanceOf(InvalidShellArgError);
    }
  });
});
|
||||
|
||||
// dockerPrune only accepts the known prune kinds and only allows `all` for images.
describe('dockerPrune', () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it('rejects unknown prune kinds', async () => {
    // @ts-expect-error — exercising the runtime check
    const attempt = dockerPrune('everything');
    await expect(attempt).rejects.toBeInstanceOf(InvalidShellArgError);
  });

  it('emits the documented argv per kind', async () => {
    setExec(() => ({ stdout: '' }));

    // Each row: the call to make and the argv it must hand to `docker`.
    // For images with `all`, the order is kind, verb, then the -a/-f flags.
    const table: Array<{ run: () => Promise<unknown>; argv: string[] }> = [
      { run: () => dockerPrune('container'), argv: ['container', 'prune', '-f'] },
      { run: () => dockerPrune('image', { all: true }), argv: ['image', 'prune', '-a', '-f'] },
      { run: () => dockerPrune('volume'), argv: ['volume', 'prune', '-f'] },
      { run: () => dockerPrune('builder'), argv: ['builder', 'prune', '-f'] },
    ];

    for (const { run, argv } of table) {
      await run();
      expect(execFileMock.mock.calls.at(-1)![1]).toEqual(argv);
    }
  });

  it('rejects --all on non-image kinds', async () => {
    const attempt = dockerPrune('container', { all: true });
    await expect(attempt).rejects.toBeInstanceOf(InvalidShellArgError);
  });
});
|
||||
|
||||
// Path allow-list: absolute paths inside a root pass (and are returned),
// everything else throws InvalidShellArgError.
describe('assertPathInAllowedRoots', () => {
  const roots = ['/opt/projects'];

  it('accepts paths inside an allowed root', () => {
    for (const inside of ['/opt/projects/foo', '/opt/projects/foo/bar/baz', '/opt/projects']) {
      expect(assertPathInAllowedRoots(inside, ['/opt/projects'])).toBe(inside);
    }
  });

  it('rejects relative paths', () => {
    for (const relativePath of ['relative/path', './foo']) {
      expect(() => assertPathInAllowedRoots(relativePath, roots)).toThrow(InvalidShellArgError);
    }
  });

  it('rejects ../ escape attempts even when prefix-matching the root', () => {
    for (const escaping of ['/opt/projects/../etc', '/opt/projects/../../etc']) {
      expect(() => assertPathInAllowedRoots(escaping, roots)).toThrow(InvalidShellArgError);
    }
  });

  it('rejects sibling directories that share a prefix string', () => {
    // Starting with the same characters as the root is not the same as being inside it.
    const sibling = '/opt/projects-evil/foo';
    expect(() => assertPathInAllowedRoots(sibling, roots)).toThrow(InvalidShellArgError);
  });

  it('checks every allowed root', () => {
    const resolved = assertPathInAllowedRoots('/srv/app', ['/opt/projects', '/srv/app']);
    expect(resolved).toBe('/srv/app');
  });
});
|
||||
|
||||
// runBashScript only runs scripts located inside an allowed root.
describe('runBashScript', () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it('rejects scripts outside allowed roots', async () => {
    const attempt = runBashScript('/etc/init.d/anything', [], { allowedRoots: ['/opt/projects'] });
    await expect(attempt).rejects.toBeInstanceOf(InvalidShellArgError);
  });

  it('runs a script that is inside an allowed root', async () => {
    setExec(() => ({ stdout: 'ok' }));

    const { stdout } = await runBashScript('/opt/projects/deploy.sh', ['--prod'], {
      allowedRoots: ['/opt/projects'],
    });

    expect(stdout).toBe('ok');
    const firstCall = execFileMock.mock.calls[0];
    expect(firstCall[0]).toBe('bash');
    expect(firstCall[1]).toEqual(['/opt/projects/deploy.sh', '--prod']);
  });
});
|
||||
|
||||
// runNpmScript: the script name must be allow-listed and the cwd must be
// inside an allowed root before `npm run` is invoked.
describe('runNpmScript', () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it('rejects npm scripts not in the lifecycle allow-list', async () => {
    const inRoot = { allowedRoots: ['/opt/projects'], cwd: '/opt/projects/foo' };
    // @ts-expect-error — exercising the runtime guard
    await expect(runNpmScript('publish', inRoot)).rejects.toBeInstanceOf(InvalidShellArgError);
  });

  it('rejects cwd outside allowed roots', async () => {
    const outsideRoot = { allowedRoots: ['/opt/projects'], cwd: '/etc' };
    await expect(runNpmScript('typecheck', outsideRoot)).rejects.toBeInstanceOf(InvalidShellArgError);
  });

  it('runs a whitelisted lifecycle script in an allowed cwd', async () => {
    setExec(() => ({ stdout: 'ok' }));

    await runNpmScript('typecheck', { allowedRoots: ['/opt/projects'], cwd: '/opt/projects/foo' });

    const firstCall = execFileMock.mock.calls[0];
    expect(firstCall[0]).toBe('npm');
    expect(firstCall[1]).toEqual(['run', 'typecheck']);
  });
});
|
||||
@ -1,170 +0,0 @@
|
||||
// Allow-list wrapper around shell-outs.
|
||||
//
|
||||
// Every privileged route in this backend ultimately runs `docker`, `bash`,
|
||||
// `npm`, etc. on the host. Historically those were issued as template-literal
|
||||
// strings passed through `child_process.exec`, which means a misvalidated
|
||||
// path param can become a shell-injection. This module fixes that by:
|
||||
//
|
||||
// 1. Always passing argv as a real array to `execFile` (no shell expansion,
|
||||
// no string templating). `execAllowed()` is the only escape hatch and it
|
||||
// still uses `execFile`, never `exec`.
|
||||
// 2. Exposing per-command helpers (`dockerRestart`, `dockerPrune`,
|
||||
// `runBashScript`, `runNpmScript`) that validate their inputs against
|
||||
// a per-command allow-list regex. Repos call these instead of building
|
||||
// `docker ...` strings directly.
|
||||
//
|
||||
// This is the "allow-list wrapper" item from the DEPLOYMENT.md privilege-
|
||||
// surface mitigation roadmap.
|
||||
|
||||
import { execFile } from 'child_process';
|
||||
import { isAbsolute, normalize, relative, resolve } from 'path';
|
||||
import { promisify } from 'util';
|
||||
import { childLogger } from './logger.js';
|
||||
|
||||
const execFileAsync = promisify(execFile);
|
||||
const log = childLogger('lib/shell');
|
||||
|
||||
/** Options accepted by `execAllowed` and the per-command wrappers built on it. */
export interface ShellExecOptions {
  /** Working directory for the child process. */
  cwd?: string;
  /** Environment handed to the child process. */
  env?: NodeJS.ProcessEnv;
  /** Timeout in milliseconds; each wrapper applies its own default when omitted. */
  timeoutMs?: number;
}
|
||||
|
||||
/** Captured output of a finished command, both streams as strings. */
export interface ShellResult {
  /** Everything the command wrote to standard output. */
  stdout: string;
  /** Everything the command wrote to standard error. */
  stderr: string;
}
|
||||
|
||||
/**
 * Execute one command via `execFile` with an explicit argv array — no shell,
 * no string interpolation. The per-command helpers in this module are the
 * preferred entry points; use this directly only for commands that do not
 * have a helper yet.
 *
 * @param command - Executable to run.
 * @param args - Arguments, one array element per argv slot.
 * @param options - Optional cwd, env and timeout (30 s when omitted). Output
 *   is capped at 10 MiB via `maxBuffer`.
 * @returns The command's stdout and stderr as strings.
 * @throws Errors from `execFile` propagate to the caller unchanged.
 */
export async function execAllowed(
  command: string,
  args: string[],
  options: ShellExecOptions = {},
): Promise<ShellResult> {
  log.debug({ command, args, cwd: options.cwd }, 'shell exec');

  const spawnOptions = {
    cwd: options.cwd,
    timeout: options.timeoutMs ?? 30_000,
    env: options.env,
    maxBuffer: 10 * 1024 * 1024,
  };
  const { stdout: rawOut, stderr: rawErr } = await execFileAsync(command, args, spawnOptions);

  // Normalise whatever came back (string, Buffer, or nothing) to plain strings.
  const stdout = rawOut?.toString?.() ?? String(rawOut ?? '');
  const stderr = rawErr?.toString?.() ?? String(rawErr ?? '');
  return { stdout, stderr };
}
|
||||
|
||||
// --- Docker allow-list ------------------------------------------------------
|
||||
|
||||
// Container/volume/image names from the docker daemon. Docker's own rule is
// `[a-zA-Z0-9][a-zA-Z0-9_.-]+`; we keep the same character classes, cap the
// total length at 128 characters, and also accept single-character names.
|
||||
const CONTAINER_NAME_RE = /^[a-zA-Z0-9][a-zA-Z0-9._-]{0,127}$/;
|
||||
|
||||
/**
 * Thrown when an argument fails allow-list validation, before any process is
 * spawned. `name` is set so the error can also be recognised without
 * `instanceof`.
 */
export class InvalidShellArgError extends Error {
  constructor(message: string) {
    super(message);
    this.name = 'InvalidShellArgError';
  }
}
|
||||
|
||||
/**
 * Throw `InvalidShellArgError` unless `name` is a string matching
 * `CONTAINER_NAME_RE`. The `typeof` guard covers callers that bypass the
 * type system.
 */
function assertContainerName(name: string): void {
  const isValid = typeof name === 'string' && CONTAINER_NAME_RE.test(name);
  if (isValid) return;
  throw new InvalidShellArgError(`Invalid container name: ${JSON.stringify(name)}`);
}
|
||||
|
||||
/**
 * Run `docker restart <name>` after validating the container name.
 *
 * @throws InvalidShellArgError (as a rejection) when the name is not valid;
 *   nothing is spawned in that case.
 */
export async function dockerRestart(name: string): Promise<ShellResult> {
  assertContainerName(name);
  const argv = ['restart', name];
  return execAllowed('docker', argv, { timeoutMs: 30_000 });
}
|
||||
|
||||
/** Docker object kinds that `dockerPrune` is allowed to prune. */
const PRUNE_KINDS = ['container', 'image', 'volume', 'builder'] as const;

/** Union of the entries of `PRUNE_KINDS`. */
export type PruneKind = (typeof PRUNE_KINDS)[number];
|
||||
|
||||
/**
 * Run `docker <kind> prune -f`, or `docker image prune -a -f` when
 * `opts.all` is set.
 *
 * @param kind - One of `PRUNE_KINDS`; also checked at runtime.
 * @param opts - `all` adds `-a` and is accepted for `image` only.
 * @throws InvalidShellArgError (as a rejection) for an unknown kind, or for
 *   `all` combined with a non-image kind.
 */
export async function dockerPrune(kind: PruneKind, opts: { all?: boolean } = {}): Promise<ShellResult> {
  if (!PRUNE_KINDS.includes(kind)) {
    throw new InvalidShellArgError(`Invalid prune kind: ${JSON.stringify(kind)}`);
  }

  const pruneAll = Boolean(opts.all);
  if (pruneAll && kind !== 'image') {
    throw new InvalidShellArgError('`all` is only valid for image prune');
  }

  const args: string[] = pruneAll ? [kind, 'prune', '-a', '-f'] : [kind, 'prune', '-f'];
  return execAllowed('docker', args, { timeoutMs: 60_000 });
}
|
||||
|
||||
// --- Filesystem-path allow-list --------------------------------------------
|
||||
|
||||
/**
 * Verify that `candidate` is an absolute path that resolves inside one of
 * the allowed roots. Used to lock down request-supplied `cwd` values
 * (e.g. `/code-quality/check`'s `projectPath`) so callers can't run
 * lifecycle scripts in arbitrary directories.
 *
 * The check is purely lexical (`normalize` + `resolve` + `relative`); it does
 * not touch the filesystem.
 * NOTE(review): symlinks are therefore not resolved — confirm whether the
 * allowed roots can contain links that point outside them.
 *
 * @param candidate - Path to validate; must be an absolute string.
 * @param allowedRoots - Directories the path may live in (a root itself is allowed).
 * @returns The normalised, resolved form of `candidate`.
 * @throws InvalidShellArgError when `candidate` is not an absolute string or
 *   lies outside every allowed root (including when `allowedRoots` is empty).
 */
export function assertPathInAllowedRoots(candidate: string, allowedRoots: string[]): string {
  if (typeof candidate !== 'string' || !isAbsolute(candidate)) {
    throw new InvalidShellArgError(`Path must be absolute: ${JSON.stringify(candidate)}`);
  }
  const resolved = resolve(normalize(candidate));
  for (const root of allowedRoots) {
    const resolvedRoot = resolve(normalize(root));
    const rel = relative(resolvedRoot, resolved);
    // The path escapes the root only when the FIRST SEGMENT of the relative
    // path is exactly `..`. A plain `startsWith('..')` would also reject
    // legitimate in-root entries whose name merely begins with two dots,
    // such as `..cache`.
    const escapesUpward = /^\.\.(?:[\\/]|$)/.test(rel);
    // An absolute `rel` means there is no relative route from the root at
    // all (e.g. a different drive on Windows).
    if (rel === '' || (!escapesUpward && !isAbsolute(rel))) {
      return resolved;
    }
  }
  throw new InvalidShellArgError(
    `Path is not inside an allowed root: ${JSON.stringify(candidate)}`,
  );
}
|
||||
|
||||
// --- bash / npm wrappers ----------------------------------------------------
|
||||
|
||||
/**
 * Run `bash <script> [...args]`. The script path must lie inside one of
 * `options.allowedRoots`, and so must `options.cwd` when one is given.
 *
 * @param scriptPath - Absolute path of the script; its resolved form is what
 *   gets passed to bash.
 * @param args - Extra argv entries appended after the script path.
 * @param options - Exec options plus the allowed roots. Timeout defaults to
 *   300 s. With the default (empty) `allowedRoots` every call is rejected.
 * @throws InvalidShellArgError (as a rejection) when either path is outside
 *   the allowed roots.
 */
export async function runBashScript(
  scriptPath: string,
  args: string[] = [],
  options: ShellExecOptions & { allowedRoots: string[] } = { allowedRoots: [] },
): Promise<ShellResult> {
  const { allowedRoots, cwd, env } = options;

  const safeScript = assertPathInAllowedRoots(scriptPath, allowedRoots);
  if (cwd) {
    assertPathInAllowedRoots(cwd, allowedRoots);
  }

  const argv = [safeScript, ...args];
  return execAllowed('bash', argv, {
    cwd,
    timeoutMs: options.timeoutMs ?? 300_000,
    env,
  });
}
|
||||
|
||||
/** npm script names that `runNpmScript` is willing to run. */
const NPM_LIFECYCLE = ['typecheck', 'lint', 'build', 'test', 'test:run', 'start'] as const;

/** Union of the entries of `NPM_LIFECYCLE`. */
export type NpmLifecycle = (typeof NPM_LIFECYCLE)[number];
|
||||
|
||||
/**
 * Run `npm run <script>` for an allow-listed script name, in a working
 * directory that lies inside one of `options.allowedRoots`. Used by
 * `/code-quality/check`.
 *
 * @param script - One of `NPM_LIFECYCLE`; also checked at runtime.
 * @param options - Exec options plus the allowed roots. `cwd` is required;
 *   timeout defaults to 120 s.
 * @throws InvalidShellArgError (as a rejection) when the script is not
 *   allow-listed, `cwd` is missing, or `cwd` is outside the allowed roots.
 */
export async function runNpmScript(
  script: NpmLifecycle,
  options: ShellExecOptions & { allowedRoots: string[] } = { allowedRoots: [] },
): Promise<ShellResult> {
  const scriptAllowed = NPM_LIFECYCLE.includes(script);
  if (!scriptAllowed) {
    throw new InvalidShellArgError(`npm script not in allow-list: ${JSON.stringify(script)}`);
  }

  const { cwd, env, allowedRoots } = options;
  if (!cwd) {
    throw new InvalidShellArgError('npm run requires a cwd');
  }
  assertPathInAllowedRoots(cwd, allowedRoots);

  return execAllowed('npm', ['run', script], {
    cwd,
    timeoutMs: options.timeoutMs ?? 120_000,
    env,
  });
}
|
||||
@ -2,11 +2,8 @@ import { z } from 'zod';
|
||||
|
||||
export const AuditLogSchema = z.object({
|
||||
id: z.string(),
|
||||
// `shell-exec` covers privileged shell-outs (docker prune, container
|
||||
// restart, code-quality npm runs) so a leaked admin token's actions are
|
||||
// reconstructable from cosmos rather than only from container stdout.
|
||||
action: z.enum(['create', 'update', 'delete', 'deploy', 'trigger', 'shell-exec']),
|
||||
entityType: z.enum(['service', 'deployment', 'user', 'host']),
|
||||
action: z.enum(['create', 'update', 'delete', 'deploy', 'trigger']),
|
||||
entityType: z.enum(['service', 'deployment', 'user']),
|
||||
entityId: z.string(),
|
||||
userId: z.string(),
|
||||
role: z.string(),
|
||||
|
||||
@ -1,10 +1,7 @@
|
||||
import { getContainer } from '../../lib/cosmos-init.js';
|
||||
import { productId } from '../../lib/config.js';
|
||||
import { childLogger } from '../../lib/logger.js';
|
||||
import type { Backup, BackupParams } from './types.js';
|
||||
|
||||
const log = childLogger('backup/repository');
|
||||
|
||||
const BACKUPS_CONTAINER = 'backups';
|
||||
|
||||
export async function createBackup(params: BackupParams = {}): Promise<Backup> {
|
||||
@ -24,7 +21,7 @@ export async function createBackup(params: BackupParams = {}): Promise<Backup> {
|
||||
backupData[containerName] = resources;
|
||||
totalItems += resources.length;
|
||||
} catch (error) {
|
||||
log.error({ err: error, containerName }, "failed to backup container");
|
||||
console.error(`Failed to backup container ${containerName}:`, error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
@ -57,7 +54,7 @@ export async function getBackups(): Promise<Backup[]> {
|
||||
|
||||
return resources as Backup[];
|
||||
} catch (error) {
|
||||
log.error({ err: error }, "failed to get backups");
|
||||
console.error('Failed to get backups:', error);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
@ -92,7 +89,7 @@ export async function restoreBackup(backupId: string): Promise<void> {
|
||||
try {
|
||||
await targetContainer.items.upsert(item);
|
||||
} catch (error) {
|
||||
log.error({ err: error, containerName }, "failed to restore backup item");
|
||||
console.error(`Failed to restore item in ${containerName}:`, error);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -102,7 +99,7 @@ export async function deleteBackup(backupId: string): Promise<void> {
|
||||
try {
|
||||
await getContainer(BACKUPS_CONTAINER).item(backupId).delete();
|
||||
} catch (error) {
|
||||
log.error({ err: error }, "failed to delete backup");
|
||||
console.error('Failed to delete backup:', error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
@ -2,61 +2,12 @@ import { exec } from 'child_process';
|
||||
import { promisify } from 'util';
|
||||
import { readFile } from 'fs/promises';
|
||||
import { join } from 'path';
|
||||
import { assertPathInAllowedRoots, InvalidShellArgError, runNpmScript, type NpmLifecycle } from '../../lib/shell.js';
|
||||
import type { CodeQualityReport, CodeQualityCheckParams, CodeQualityIssue } from './types.js';
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
// Allow-listed roots inside which `/code-quality/check` may run
|
||||
// `npm run typecheck/lint/build/test:run`. Anything outside these roots is
|
||||
// rejected before a subprocess is spawned. Configure via
|
||||
// `CODE_QUALITY_ALLOWED_ROOTS` (colon-separated) for non-default deployments.
|
||||
const DEFAULT_ALLOWED_ROOTS = ['/opt/bytelyst'];
|
||||
function getAllowedRoots(): string[] {
|
||||
const raw = process.env.CODE_QUALITY_ALLOWED_ROOTS?.trim();
|
||||
if (!raw) return DEFAULT_ALLOWED_ROOTS;
|
||||
return raw.split(':').map((s) => s.trim()).filter(Boolean);
|
||||
}
|
||||
|
||||
// Run an `npm run <script>` invocation through the shell allow-list and
|
||||
// always resolve, even on non-zero exit (the parsers downstream want to
|
||||
// inspect stdout+stderr regardless of exit code).
|
||||
async function runScriptCapturingOutput(
|
||||
script: NpmLifecycle,
|
||||
cwd: string,
|
||||
timeoutMs: number,
|
||||
): Promise<{ output: string; ok: boolean }> {
|
||||
try {
|
||||
const { stdout, stderr } = await runNpmScript(script, {
|
||||
allowedRoots: getAllowedRoots(),
|
||||
cwd,
|
||||
timeoutMs,
|
||||
});
|
||||
return { output: `${stdout}${stderr}`, ok: true };
|
||||
} catch (error) {
|
||||
if (error instanceof InvalidShellArgError) throw error;
|
||||
const e = error as { stdout?: string; stderr?: string; message?: string };
|
||||
return { output: `${e.stdout ?? ''}${e.stderr ?? ''}` || (e.message ?? ''), ok: false };
|
||||
}
|
||||
}
|
||||
|
||||
export async function runCodeQualityCheck(params: CodeQualityCheckParams): Promise<CodeQualityReport> {
|
||||
const { projectId, projectPath, checks } = params;
|
||||
|
||||
// Reject paths outside the allow-list before spawning anything.
|
||||
// `assertPathInAllowedRoots` returns the resolved absolute form so we
|
||||
// pass that into the npm wrapper rather than the raw input.
|
||||
let resolvedPath: string;
|
||||
try {
|
||||
resolvedPath = assertPathInAllowedRoots(projectPath, getAllowedRoots());
|
||||
} catch (error) {
|
||||
if (error instanceof InvalidShellArgError) {
|
||||
throw new Error(
|
||||
`projectPath is not inside an allowed root (${getAllowedRoots().join(', ')}); refusing to run lifecycle scripts there.`,
|
||||
);
|
||||
}
|
||||
throw error;
|
||||
}
|
||||
const issues: CodeQualityIssue[] = [];
|
||||
const summary = {
|
||||
totalIssues: 0,
|
||||
@ -76,35 +27,66 @@ export async function runCodeQualityCheck(params: CodeQualityCheckParams): Promi
|
||||
// TypeScript check
|
||||
if (checks.includes('typescript')) {
|
||||
const tsStart = Date.now();
|
||||
const { output } = await runScriptCapturingOutput('typecheck', resolvedPath, 60000);
|
||||
const tsIssues = parseTypeScriptOutput(output, resolvedPath);
|
||||
try {
|
||||
const { stdout, stderr } = await execAsync('npm run typecheck', {
|
||||
cwd: projectPath,
|
||||
timeout: 60000,
|
||||
});
|
||||
const output = stdout + stderr;
|
||||
const tsIssues = parseTypeScriptOutput(output, projectPath);
|
||||
issues.push(...tsIssues);
|
||||
categories.typescript.duration = Date.now() - tsStart;
|
||||
categories.typescript.errors = tsIssues.filter(i => i.type === 'error').length;
|
||||
categories.typescript.warnings = tsIssues.filter(i => i.type === 'warning').length;
|
||||
} catch (error: any) {
|
||||
categories.typescript.duration = Date.now() - tsStart;
|
||||
const output = error.stdout + error.stderr || error.message;
|
||||
const tsIssues = parseTypeScriptOutput(output, projectPath);
|
||||
issues.push(...tsIssues);
|
||||
categories.typescript.errors = tsIssues.filter(i => i.type === 'error').length;
|
||||
categories.typescript.warnings = tsIssues.filter(i => i.type === 'warning').length;
|
||||
}
|
||||
}
|
||||
|
||||
// ESLint check
|
||||
if (checks.includes('eslint')) {
|
||||
const eslintStart = Date.now();
|
||||
const { output } = await runScriptCapturingOutput('lint', resolvedPath, 60000);
|
||||
const eslintIssues = parseEslintOutput(output, resolvedPath);
|
||||
try {
|
||||
const { stdout, stderr } = await execAsync('npm run lint', {
|
||||
cwd: projectPath,
|
||||
timeout: 60000,
|
||||
});
|
||||
const output = stdout + stderr;
|
||||
const eslintIssues = parseEslintOutput(output, projectPath);
|
||||
issues.push(...eslintIssues);
|
||||
categories.eslint.duration = Date.now() - eslintStart;
|
||||
categories.eslint.errors = eslintIssues.filter(i => i.type === 'error').length;
|
||||
categories.eslint.warnings = eslintIssues.filter(i => i.type === 'warning').length;
|
||||
} catch (error: any) {
|
||||
categories.eslint.duration = Date.now() - eslintStart;
|
||||
const output = error.stdout + error.stderr || error.message;
|
||||
const eslintIssues = parseEslintOutput(output, projectPath);
|
||||
issues.push(...eslintIssues);
|
||||
categories.eslint.errors = eslintIssues.filter(i => i.type === 'error').length;
|
||||
categories.eslint.warnings = eslintIssues.filter(i => i.type === 'warning').length;
|
||||
}
|
||||
}
|
||||
|
||||
// Build check
|
||||
if (checks.includes('build')) {
|
||||
const buildStart = Date.now();
|
||||
const { output, ok } = await runScriptCapturingOutput('build', resolvedPath, 120000);
|
||||
categories.build.duration = Date.now() - buildStart;
|
||||
if (ok) {
|
||||
try {
|
||||
const { stdout, stderr } = await execAsync('npm run build', {
|
||||
cwd: projectPath,
|
||||
timeout: 120000,
|
||||
});
|
||||
categories.build.success = true;
|
||||
} else {
|
||||
categories.build.duration = Date.now() - buildStart;
|
||||
} catch (error: any) {
|
||||
categories.build.success = false;
|
||||
const buildIssues = parseBuildOutput(output, resolvedPath);
|
||||
categories.build.duration = Date.now() - buildStart;
|
||||
const output = error.stdout + error.stderr || error.message;
|
||||
const buildIssues = parseBuildOutput(output, projectPath);
|
||||
issues.push(...buildIssues);
|
||||
categories.build.errors = buildIssues.length;
|
||||
}
|
||||
@ -113,16 +95,25 @@ export async function runCodeQualityCheck(params: CodeQualityCheckParams): Promi
|
||||
// Test check
|
||||
if (checks.includes('test')) {
|
||||
const testStart = Date.now();
|
||||
const { output, ok } = await runScriptCapturingOutput('test:run', resolvedPath, 120000);
|
||||
try {
|
||||
const { stdout, stderr } = await execAsync('npm run test:run', {
|
||||
cwd: projectPath,
|
||||
timeout: 120000,
|
||||
});
|
||||
const output = stdout + stderr;
|
||||
const testResults = parseTestOutput(output);
|
||||
categories.test.duration = Date.now() - testStart;
|
||||
categories.test.success = testResults.failed === 0;
|
||||
categories.test.passed = testResults.passed;
|
||||
categories.test.failed = testResults.failed;
|
||||
if (ok) {
|
||||
categories.test.success = testResults.failed === 0;
|
||||
} else {
|
||||
categories.test.duration = Date.now() - testStart;
|
||||
} catch (error: any) {
|
||||
categories.test.success = false;
|
||||
const testIssues = parseTestOutputErrors(output, resolvedPath);
|
||||
categories.test.duration = Date.now() - testStart;
|
||||
const output = error.stdout + error.stderr || error.message;
|
||||
const testResults = parseTestOutput(output);
|
||||
categories.test.passed = testResults.passed;
|
||||
categories.test.failed = testResults.failed;
|
||||
const testIssues = parseTestOutputErrors(output, projectPath);
|
||||
issues.push(...testIssues);
|
||||
}
|
||||
}
|
||||
@ -133,13 +124,13 @@ export async function runCodeQualityCheck(params: CodeQualityCheckParams): Promi
|
||||
summary.warnings = issues.filter(i => i.type === 'warning').length;
|
||||
summary.infos = issues.filter(i => i.type === 'info').length;
|
||||
|
||||
const projectName = resolvedPath.split('/').pop() || resolvedPath;
|
||||
const projectName = projectPath.split('/').pop() || projectPath;
|
||||
|
||||
return {
|
||||
id: `cq-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`,
|
||||
projectId,
|
||||
projectName,
|
||||
projectPath: resolvedPath,
|
||||
projectPath,
|
||||
timestamp: new Date().toISOString(),
|
||||
summary,
|
||||
categories,
|
||||
@ -157,7 +148,7 @@ function parseTypeScriptOutput(output: string, projectPath: string): CodeQuality
|
||||
if (tsErrorMatch) {
|
||||
issues.push({
|
||||
id: `ts-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`,
|
||||
type: tsErrorMatch[4] as 'error' | 'warning', // group 4 = type; group 3 = column
|
||||
type: tsErrorMatch[3] as 'error' | 'warning',
|
||||
category: 'typescript',
|
||||
file: tsErrorMatch[1],
|
||||
line: parseInt(tsErrorMatch[2]),
|
||||
@ -176,12 +167,10 @@ function parseEslintOutput(output: string, projectPath: string): CodeQualityIssu
|
||||
const lines = output.split('\n');
|
||||
|
||||
for (const line of lines) {
|
||||
// ESLint unix format: file:line:col: message [rule]
|
||||
// Rule part in brackets may or may not be present depending on formatter
|
||||
const eslintMatch = line.match(/(.+\.tsx?):(\d+):(\d+)[:\s]+(.+?)(?:\s+\[([^\]]+)\])?$/);
|
||||
// ESLint format: file:line:col message [rule]
|
||||
const eslintMatch = line.match(/(.+\.tsx?):(\d+):(\d+)\s+(.+?)\s+\[(.+)\]/);
|
||||
if (eslintMatch) {
|
||||
const msgAndLevel = eslintMatch[4];
|
||||
const severity = /\berror\b/i.test(msgAndLevel) ? 'error' : 'warning';
|
||||
const severity = eslintMatch[4].includes('error') ? 'error' : 'warning';
|
||||
issues.push({
|
||||
id: `eslint-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`,
|
||||
type: severity,
|
||||
@ -189,8 +178,8 @@ function parseEslintOutput(output: string, projectPath: string): CodeQualityIssu
|
||||
file: eslintMatch[1],
|
||||
line: parseInt(eslintMatch[2]),
|
||||
column: parseInt(eslintMatch[3]),
|
||||
message: msgAndLevel,
|
||||
rule: eslintMatch[5] ?? 'unknown',
|
||||
message: eslintMatch[4],
|
||||
rule: eslintMatch[5],
|
||||
});
|
||||
}
|
||||
}
|
||||
@ -221,24 +210,18 @@ function parseTestOutput(output: string): { passed: number; failed: number } {
|
||||
let passed = 0;
|
||||
let failed = 0;
|
||||
|
||||
// Try to parse Vitest output — use "Tests" line (individual tests), not "Test Files" line
|
||||
// Format: " Tests 3 failed | 5 passed (8)" or " Tests 8 passed (8)"
|
||||
const vitestFailMatch = output.match(/\bTests\b\s+(\d+)\s+failed[^|]*\|\s*(\d+)\s+passed/);
|
||||
const vitestPassMatch = output.match(/\bTests\b\s+(\d+)\s+passed/);
|
||||
if (vitestFailMatch) {
|
||||
failed = parseInt(vitestFailMatch[1]);
|
||||
passed = parseInt(vitestFailMatch[2]);
|
||||
} else if (vitestPassMatch) {
|
||||
passed = parseInt(vitestPassMatch[1]);
|
||||
failed = 0;
|
||||
// Try to parse Vitest output
|
||||
const vitestMatch = output.match(/Test Files\s+(\d+)\s+\((\d+)\s+failed/);
|
||||
if (vitestMatch) {
|
||||
failed = parseInt(vitestMatch[2]);
|
||||
passed = parseInt(vitestMatch[1]) - failed;
|
||||
}
|
||||
|
||||
// Try to parse Jest output: "Tests: 5 passed, 2 failed" or "Tests: 2 failed, 5 passed"
|
||||
const jestPassMatch = output.match(/Tests:.*?(\d+)\s+passed/);
|
||||
const jestFailMatch = output.match(/Tests:.*?(\d+)\s+failed/);
|
||||
if (jestPassMatch || jestFailMatch) {
|
||||
passed = jestPassMatch ? parseInt(jestPassMatch[1]) : 0;
|
||||
failed = jestFailMatch ? parseInt(jestFailMatch[1]) : 0;
|
||||
// Try to parse Jest output
|
||||
const jestMatch = output.match(/Tests:\s+(\d+)\s+passed,?\s*(\d+)\s+failed/);
|
||||
if (jestMatch) {
|
||||
passed = parseInt(jestMatch[1]);
|
||||
failed = parseInt(jestMatch[2]);
|
||||
}
|
||||
|
||||
return { passed, failed };
|
||||
|
||||
@ -1,16 +1,10 @@
|
||||
import { FastifyInstance } from 'fastify';
|
||||
import { runCodeQualityCheck } from './repository.js';
|
||||
import { CodeQualityCheckParamsSchema } from './types.js';
|
||||
import { requireAdmin } from '../../lib/auth.js';
|
||||
|
||||
export async function codeQualityRoutes(fastify: FastifyInstance) {
|
||||
// Run code quality check.
|
||||
// Admin-only: this route shells out (`npm run typecheck/lint/build/test:run`)
|
||||
// in a caller-supplied `projectPath` and is therefore privileged. See the
|
||||
// "Privilege Surface" section in `dashboard/DEPLOYMENT.md`.
|
||||
fastify.post('/code-quality/check', {
|
||||
preHandler: async (req) => requireAdmin(req),
|
||||
}, async (request, reply) => {
|
||||
// Run code quality check
|
||||
fastify.post('/code-quality/check', async (request, reply) => {
|
||||
try {
|
||||
const params = CodeQualityCheckParamsSchema.parse(request.body);
|
||||
const report = await runCodeQualityCheck(params);
|
||||
|
||||
@ -1,143 +0,0 @@
|
||||
import { describe, it, expect, beforeEach, vi } from 'vitest';
|
||||
import type { Service } from '../services/types.js';
|
||||
|
||||
// --- I/O mocks -------------------------------------------------------------------
// All mock functions are created in a single vi.hoisted() call so they already
// exist when the vi.mock factories below are evaluated.
const {
  execMock,
  createDeploymentMock,
  updateDeploymentMock,
  getServiceByIdMock,
  updateServiceMock,
} = vi.hoisted(() => ({
  execMock: vi.fn(),
  createDeploymentMock: vi.fn(),
  updateDeploymentMock: vi.fn(),
  getServiceByIdMock: vi.fn(),
  updateServiceMock: vi.fn(),
}));

// Shell execution is replaced by execMock.
vi.mock('child_process', () => {
  return { exec: execMock };
});

// Deployment persistence.
vi.mock('./repository.js', () => {
  return {
    createDeployment: createDeploymentMock,
    updateDeployment: updateDeploymentMock,
  };
});

// Service persistence.
vi.mock('../services/repository.js', () => {
  return {
    getServiceById: getServiceByIdMock,
    updateService: updateServiceMock,
  };
});

// Static configuration with a fixed product id.
vi.mock('../../lib/config.js', () => {
  return {
    config: {},
    productId: 'devops-internal',
  };
});

// Import the module under test only after every mock has been registered.
const { triggerDeployment } = await import('./orchestrator.js');
|
||||
|
||||
/**
 * Builds a `Service` fixture for the orchestrator tests.
 *
 * @param overrides Fields that replace the fixture defaults.
 * @returns The default fixture with `overrides` spread on top.
 */
function makeService(overrides?: Partial<Service>): Service {
  const defaults: Service = {
    id: 'svc-1',
    name: 'Test Service',
    scriptPath: 'deploy.sh',
    healthUrl: 'https://example.com/health',
    repoPath: '../repo',
    status: 'up',
    version: '1.0.0',
    productId: 'devops-internal',
  };
  return { ...defaults, ...overrides };
}
|
||||
|
||||
// Installs the behaviour of the mocked `exec`. promisify(exec) invokes
// exec(cmd, options, cb) and expects cb(err, { stdout, stderr }); the callback is
// fired synchronously from the mock so the deferred script work settles before
// the awaited assertions run.
function setExec(handler: () => { error?: Error & { stdout?: string; stderr?: string }; stdout?: string; stderr?: string }) {
  type ExecError = Error & { stdout?: string; stderr?: string };
  type ExecCallback = (err: ExecError | null, result?: { stdout: string; stderr: string }) => void;

  execMock.mockImplementation((_cmd: string, _opts: unknown, cb: ExecCallback) => {
    const outcome = handler();
    if (outcome.error) {
      // Failure path: only the error is handed to the callback.
      cb(outcome.error);
      return;
    }
    // Success path: missing streams default to empty strings.
    cb(null, {
      stdout: outcome.stdout ?? '',
      stderr: outcome.stderr ?? '',
    });
  });
}
|
||||
|
||||
describe('triggerDeployment', () => {
  // The orchestrator deliberately fire-and-forgets the script run, so the inner
  // promise cannot be awaited directly. Yield microtask ticks (at most 50) until
  // an updateDeployment call has been observed.
  async function flushBackground(): Promise<void> {
    let ticks = 0;
    while (ticks < 50 && updateDeploymentMock.mock.calls.length === 0) {
      await Promise.resolve();
      ticks += 1;
    }
  }

  // Triggers a deployment of the default fixture and waits for the background work.
  async function deployAndSettle(): Promise<void> {
    await triggerDeployment(makeService(), 'tester');
    await flushBackground();
  }

  // Second argument (the patch) of the most recent updateDeployment call.
  function lastDeploymentUpdate() {
    return updateDeploymentMock.mock.calls.at(-1)![1];
  }

  beforeEach(() => {
    vi.clearAllMocks();
    createDeploymentMock.mockImplementation(async (data) => ({ id: 'dep-1', ...data }));
    updateDeploymentMock.mockResolvedValue({});
    getServiceByIdMock.mockImplementation(async (id) => makeService({ id, version: '0.9.0' }));
    updateServiceMock.mockResolvedValue({});
  });

  it('creates a pending deployment record and returns its id immediately', async () => {
    setExec(() => ({ stdout: 'deployed v1.2.3', stderr: '' }));

    const id = await triggerDeployment(makeService(), 'tester@bytelyst');

    expect(id).toBe('dep-1');
    expect(createDeploymentMock).toHaveBeenCalledWith({
      serviceId: 'svc-1',
      version: 'pending',
      triggeredBy: 'tester@bytelyst',
      productId: 'devops-internal',
    });
  });

  it('marks the deployment success and updates the service version on a clean run', async () => {
    setExec(() => ({ stdout: 'release version: 2.5.1\n', stderr: '' }));
    await deployAndSettle();

    const finalCall = lastDeploymentUpdate();
    expect(finalCall.status).toBe('success');
    expect(finalCall.version).toBe('2.5.1');
    expect(typeof finalCall.completedAt).toBe('string');

    // Service is moved to 'up' with the extracted version.
    expect(updateServiceMock).toHaveBeenCalledWith(
      'svc-1',
      expect.objectContaining({ status: 'up', version: '2.5.1' }),
    );
  });

  it('falls back to version "unknown" when the script logs no recognizable version', async () => {
    setExec(() => ({ stdout: 'all good, no numbers here', stderr: '' }));
    await deployAndSettle();

    const finalCall = lastDeploymentUpdate();
    expect(finalCall.status).toBe('success');
    expect(finalCall.version).toBe('unknown');
  });

  it('marks the deployment failed and the service down when the script throws', async () => {
    const err = Object.assign(new Error('exit 1'), { stdout: 'partial', stderr: 'boom' });
    setExec(() => ({ error: err }));
    await deployAndSettle();

    const finalCall = lastDeploymentUpdate();
    expect(finalCall.status).toBe('failed');
    expect(finalCall.logs).toContain('ERROR: exit 1');
    expect(finalCall.logs).toContain('STDERR:\nboom');
    expect(finalCall).not.toHaveProperty('version');

    expect(updateServiceMock).toHaveBeenCalledWith('svc-1', { status: 'down' });
  });

  it('does not crash when getServiceById returns null in the success path', async () => {
    getServiceByIdMock.mockResolvedValue(null);
    setExec(() => ({ stdout: 'version: 1.0.0', stderr: '' }));
    await deployAndSettle();

    expect(updateServiceMock).not.toHaveBeenCalled();
    const finalCall = lastDeploymentUpdate();
    expect(finalCall.status).toBe('success');
  });
});
|
||||
@ -4,10 +4,8 @@ import { join } from 'path';
|
||||
import type { Service } from '../services/types.js';
|
||||
import { createDeployment, updateDeployment } from './repository.js';
|
||||
import { productId } from '../../lib/config.js';
|
||||
import { childLogger } from '../../lib/logger.js';
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
const log = childLogger('deployments/orchestrator');
|
||||
|
||||
export async function triggerDeployment(service: Service, triggeredBy: string): Promise<string> {
|
||||
// Create deployment record
|
||||
@ -22,7 +20,7 @@ export async function triggerDeployment(service: Service, triggeredBy: string):
|
||||
|
||||
// Trigger bash script asynchronously
|
||||
runDeploymentScript(service, deploymentId).catch(error => {
|
||||
log.error({ err: error, deploymentId, serviceId: service.id }, 'background deployment failed');
|
||||
console.error(`Deployment ${deploymentId} failed:`, error);
|
||||
});
|
||||
|
||||
return deploymentId;
|
||||
@ -32,10 +30,6 @@ async function runDeploymentScript(service: Service, deploymentId: string) {
|
||||
const scriptDir = join(process.cwd(), '../../'); // Go to bytelyst-devops-tools root
|
||||
const scriptPath = join(scriptDir, service.scriptPath);
|
||||
|
||||
let finalStatus: 'success' | 'failed' = 'failed';
|
||||
let logs = '';
|
||||
let version: string | undefined;
|
||||
|
||||
try {
|
||||
const { stdout, stderr } = await execAsync(`bash ${scriptPath}`, {
|
||||
cwd: scriptDir,
|
||||
@ -46,9 +40,15 @@ async function runDeploymentScript(service: Service, deploymentId: string) {
|
||||
},
|
||||
});
|
||||
|
||||
logs = `STDOUT:\n${stdout}\n\nSTDERR:\n${stderr}`;
|
||||
finalStatus = 'success';
|
||||
version = extractVersion(stdout + stderr) || 'unknown';
|
||||
const logs = `STDOUT:\n${stdout}\n\nSTDERR:\n${stderr}`;
|
||||
|
||||
// Update deployment as success
|
||||
await updateDeployment(deploymentId, {
|
||||
status: 'success',
|
||||
logs,
|
||||
completedAt: new Date().toISOString(),
|
||||
version: extractVersion(stdout + stderr) || 'unknown',
|
||||
});
|
||||
|
||||
// Update service status
|
||||
const { getServiceById, updateService } = await import('../services/repository.js');
|
||||
@ -57,14 +57,21 @@ async function runDeploymentScript(service: Service, deploymentId: string) {
|
||||
await updateService(service.id, {
|
||||
status: 'up',
|
||||
lastDeployedAt: new Date().toISOString(),
|
||||
version: version || svc.version,
|
||||
version: extractVersion(stdout + stderr) || svc.version,
|
||||
});
|
||||
}
|
||||
} catch (error: any) {
|
||||
logs = error instanceof Error
|
||||
const logs = error instanceof Error
|
||||
? `ERROR: ${error.message}\n\n${(error as any).stdout ? `STDOUT:\n${(error as any).stdout}\n\n` : ''}${(error as any).stderr ? `STDERR:\n${(error as any).stderr}` : ''}`
|
||||
: String(error);
|
||||
|
||||
// Update deployment as failed
|
||||
await updateDeployment(deploymentId, {
|
||||
status: 'failed',
|
||||
logs,
|
||||
completedAt: new Date().toISOString(),
|
||||
});
|
||||
|
||||
// Update service status to down
|
||||
const { getServiceById, updateService } = await import('../services/repository.js');
|
||||
const svc = await getServiceById(service.id);
|
||||
@ -73,21 +80,6 @@ async function runDeploymentScript(service: Service, deploymentId: string) {
|
||||
status: 'down',
|
||||
});
|
||||
}
|
||||
} finally {
|
||||
// Always write final status — ensures the deployment never gets stuck in 'running'
|
||||
try {
|
||||
await updateDeployment(deploymentId, {
|
||||
status: finalStatus,
|
||||
logs,
|
||||
completedAt: new Date().toISOString(),
|
||||
...(version ? { version } : {}),
|
||||
});
|
||||
} catch (updateError) {
|
||||
log.error(
|
||||
{ err: updateError, deploymentId, finalStatus },
|
||||
'failed to persist final deployment status',
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -13,29 +13,23 @@ import { createAuditLog } from '../audit/repository.js';
|
||||
import { productId } from '../../lib/config.js';
|
||||
|
||||
export async function deploymentRoutes(fastify: FastifyInstance) {
|
||||
// Get recent deployments across all services (admin only)
|
||||
fastify.get('/deployments', {
|
||||
preHandler: async (req) => requireAdmin(req),
|
||||
}, async (req, reply) => {
|
||||
// Get recent deployments across all services
|
||||
fastify.get('/deployments', async (req, reply) => {
|
||||
const query = QueryParamsSchema.parse(req.query);
|
||||
const deployments = await getRecentDeployments(query.limit);
|
||||
return reply.send(deployments);
|
||||
});
|
||||
|
||||
// Get deployments for a specific service (admin only)
|
||||
fastify.get('/deployments/service/:serviceId', {
|
||||
preHandler: async (req) => requireAdmin(req),
|
||||
}, async (req, reply) => {
|
||||
// Get deployments for a specific service
|
||||
fastify.get('/deployments/service/:serviceId', async (req, reply) => {
|
||||
const params = TriggerDeploymentParamsSchema.parse(req.params);
|
||||
const query = QueryParamsSchema.parse(req.query);
|
||||
const deployments = await getDeploymentsByService(params.serviceId, query.limit);
|
||||
return reply.send(deployments);
|
||||
});
|
||||
|
||||
// Get single deployment (admin only)
|
||||
fastify.get('/deployments/:id', {
|
||||
preHandler: async (req) => requireAdmin(req),
|
||||
}, async (req, reply) => {
|
||||
// Get single deployment
|
||||
fastify.get('/deployments/:id', async (req, reply) => {
|
||||
const params = DeploymentParamsSchema.parse(req.params);
|
||||
const deployment = await getDeploymentById(params.id);
|
||||
if (!deployment) {
|
||||
@ -44,13 +38,9 @@ export async function deploymentRoutes(fastify: FastifyInstance) {
|
||||
return reply.send(deployment);
|
||||
});
|
||||
|
||||
// Get deployment logs (admin only). Returns the captured stdout/stderr +
|
||||
// current status as a single JSON payload. The web client polls this for
|
||||
// running deployments — there is intentionally no SSE/streaming variant
|
||||
// (see server.ts for the full rationale).
|
||||
fastify.get('/deployments/:id/logs', {
|
||||
preHandler: async (req) => requireAdmin(req),
|
||||
}, async (req, reply) => {
|
||||
// Get deployment logs (SSE disabled due to Fastify 5 compatibility)
|
||||
// TODO: Re-enable SSE when fastify-sse-v2 supports Fastify 5
|
||||
fastify.get('/deployments/:id/logs', async (req, reply) => {
|
||||
const params = DeploymentParamsSchema.parse(req.params);
|
||||
const deployment = await getDeploymentById(params.id);
|
||||
|
||||
|
||||
31
dashboard/backend/src/modules/env/repository.ts
vendored
31
dashboard/backend/src/modules/env/repository.ts
vendored
@ -1,31 +0,0 @@
|
||||
import type { EnvVar } from './types.js';
|
||||
|
||||
// In-memory store of environment variables, keyed by variable id.
// NOTE(review): this is a module-level Map, so contents presumably do not survive
// a process restart — confirm no persistence layer is expected here.
const envVars = new Map<string, EnvVar>();
|
||||
|
||||
/**
 * Lists every stored environment variable.
 *
 * @returns A new array of all entries, ordered by `name` via `localeCompare`.
 */
export async function getEnvVars(): Promise<EnvVar[]> {
  const all = [...envVars.values()];
  all.sort((left, right) => left.name.localeCompare(right.name));
  return all;
}
|
||||
|
||||
/**
 * Looks up a single environment variable.
 *
 * @param id Key of the variable in the store.
 * @returns The stored entry, or `null` when no entry exists for `id`.
 */
export async function getEnvVar(id: string): Promise<EnvVar | null> {
  const found = envVars.get(id);
  if (found == null) {
    return null;
  }
  return found;
}
|
||||
|
||||
/**
 * Creates or replaces an environment variable in the store.
 *
 * The entry is rebuilt from `input` alone and overwrites any entry already stored
 * under the same id; fields missing from `input` fall back to defaults rather than
 * to previously stored values.
 *
 * @param input Variable fields; only `name` is required.
 *   - `id` falls back to a slug of `name` (lower-cased, with runs of characters
 *     outside `[a-z0-9_]` replaced by `_`) when absent or empty.
 *   - `isSecret` defaults to `true`.
 *   - `source` defaults to `'local'`.
 * @returns The entry as stored. Secret entries carry the placeholder value
 *   `'REDACTED'` instead of the supplied value.
 */
export async function upsertEnvVar(input: Partial<EnvVar> & { name: string }): Promise<EnvVar> {
  const id = input.id || input.name.toLowerCase().replace(/[^a-z0-9_]+/g, '_');

  // Resolve the secret flag once so the stored flag and the redaction decision can
  // never disagree. Previously an omitted `isSecret` was stored as `true` while the
  // plaintext value was kept, because redaction tested the raw (undefined) input.
  const isSecret = input.isSecret ?? true;

  const envVar: EnvVar = {
    id,
    name: input.name,
    value: isSecret ? 'REDACTED' : input.value ?? '',
    isSecret,
    source: input.source ?? 'local',
    azureKeyVaultName: input.azureKeyVaultName,
    azureSecretName: input.azureSecretName,
    updatedAt: new Date().toISOString(),
  };

  envVars.set(id, envVar);
  return envVar;
}
|
||||
|
||||
/**
 * Removes an environment variable from the store.
 *
 * @param id Key of the variable to remove.
 * @returns `true` when an entry existed and was removed, `false` otherwise.
 */
export async function deleteEnvVar(id: string): Promise<boolean> {
  const removed = envVars.delete(id);
  return removed;
}
|
||||
61
dashboard/backend/src/modules/env/routes.ts
vendored
61
dashboard/backend/src/modules/env/routes.ts
vendored
@ -1,61 +0,0 @@
|
||||
import type { FastifyInstance } from 'fastify';
|
||||
import { BadRequestError, requireAdmin } from '../../lib/auth.js';
|
||||
import { deleteEnvVar, getEnvVar, getEnvVars, upsertEnvVar } from './repository.js';
|
||||
import { EnvVarInputSchema, EnvVarParamsSchema } from './types.js';
|
||||
|
||||
/**
 * Registers the `/env` routes on the given Fastify instance.
 *
 * Every route runs `requireAdmin` as a preHandler. Errors raised while parsing or
 * upserting in the POST and PUT handlers are rethrown as `BadRequestError` when
 * they are `Error` instances.
 */
export async function envRoutes(fastify: FastifyInstance) {
  const NOT_FOUND = 'Environment variable not found';

  // Builds fresh route options for each registration, gating the route on requireAdmin.
  const adminOnly = () => ({
    preHandler: async (req: Parameters<typeof requireAdmin>[0]) => requireAdmin(req),
  });

  // Maps any Error to a BadRequestError carrying the same message; other values pass through.
  const asBadRequest = (error: unknown): unknown =>
    error instanceof Error ? new BadRequestError(error.message) : error;

  // List all variables.
  fastify.get('/env', adminOnly(), async (req, reply) => {
    const all = await getEnvVars();
    return reply.send(all);
  });

  // Fetch one variable by id.
  fastify.get('/env/:id', adminOnly(), async (req, reply) => {
    const { id } = EnvVarParamsSchema.parse(req.params);
    const envVar = await getEnvVar(id);
    if (!envVar) {
      return reply.code(404).send({ error: NOT_FOUND });
    }
    return reply.send(envVar);
  });

  // Create a variable (or overwrite the one with the same derived id).
  fastify.post('/env', adminOnly(), async (req, reply) => {
    try {
      const input = EnvVarInputSchema.parse(req.body) as { name: string };
      const stored = await upsertEnvVar(input);
      return reply.code(201).send(stored);
    } catch (error) {
      throw asBadRequest(error);
    }
  });

  // Replace a variable; the id from the URL overrides any id in the body.
  fastify.put('/env/:id', adminOnly(), async (req, reply) => {
    try {
      const params = EnvVarParamsSchema.parse(req.params);
      const input = EnvVarInputSchema.parse({ ...(req.body as object), id: params.id }) as { name: string; id: string };
      const stored = await upsertEnvVar(input);
      return reply.send(stored);
    } catch (error) {
      throw asBadRequest(error);
    }
  });

  // Delete a variable.
  fastify.delete('/env/:id', adminOnly(), async (req, reply) => {
    const { id } = EnvVarParamsSchema.parse(req.params);
    const deleted = await deleteEnvVar(id);
    if (!deleted) {
      return reply.code(404).send({ error: NOT_FOUND });
    }
    return reply.code(204).send();
  });

  // Azure sync placeholder: always reports zero synced entries plus an explanatory error.
  fastify.post('/env/sync-azure', adminOnly(), async (req, reply) => {
    return reply.send({ synced: 0, errors: ['Azure Key Vault sync is not configured in this local dashboard build.'] });
  });
}
|
||||
22
dashboard/backend/src/modules/env/types.ts
vendored
22
dashboard/backend/src/modules/env/types.ts
vendored
@ -1,22 +0,0 @@
|
||||
import { z } from 'zod';
|
||||
|
||||
// Default factory for `updatedAt`: the current time as an ISO-8601 string.
const currentIsoTimestamp = (): string => new Date().toISOString();

/** Full shape of a stored environment variable. */
export const EnvVarSchema = z.object({
  id: z.string().min(1),
  name: z.string().min(1),
  value: z.string().default(''),
  isSecret: z.boolean().default(true),
  source: z.enum(['local', 'azure-key-vault']).default('local'),
  azureKeyVaultName: z.string().optional(),
  azureSecretName: z.string().optional(),
  updatedAt: z.string().datetime().default(currentIsoTimestamp),
});

/** Route parameters identifying a single environment variable. */
export const EnvVarParamsSchema = z.object({
  id: z.string().min(1),
});

// Every EnvVar field except `name`, each made optional.
const optionalEnvVarFields = EnvVarSchema.omit({ name: true }).partial();

/** Create/update payload: `name` is required, every other field is optional. */
export const EnvVarInputSchema = optionalEnvVarFields.extend({
  name: z.string().min(1),
});

/** Environment variable as inferred from `EnvVarSchema`. */
export type EnvVar = z.infer<typeof EnvVarSchema>;
|
||||
@ -1,127 +0,0 @@
|
||||
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
|
||||
import type { Service } from '../services/types.js';
|
||||
|
||||
const { checkServiceHealth, checkAllServices, clearHealthCache } = await import('./repository.js');
|
||||
|
||||
/**
 * Builds a `Service` fixture for the health-check tests.
 *
 * @param overrides Fields that replace the fixture defaults.
 * @returns The default fixture with `overrides` spread on top.
 */
function makeService(overrides?: Partial<Service>): Service {
  const defaults: Service = {
    id: 'svc-1',
    name: 'Test Service',
    scriptPath: '../deploy.sh',
    healthUrl: 'https://example.com/health',
    repoPath: '../repo',
    status: 'up',
    version: '1.0.0',
    productId: 'devops-internal',
  };
  return { ...defaults, ...overrides };
}
|
||||
|
||||
describe('checkServiceHealth', () => {
  type FetchMock = ReturnType<typeof vi.fn>;

  // The stubbed global fetch, viewed as a vitest mock.
  const stubbedFetch = (): FetchMock => globalThis.fetch as unknown as FetchMock;

  // Moves the faked system clock forward by `ms` milliseconds.
  const advanceClock = (ms: number): void => {
    vi.setSystemTime(new Date(Date.now() + ms));
  };

  beforeEach(() => {
    clearHealthCache();
    vi.useFakeTimers();
    vi.setSystemTime(new Date('2026-01-01T00:00:00Z'));
    // Each test configures this fetch stub as needed.
    vi.stubGlobal('fetch', vi.fn());
  });

  afterEach(() => {
    vi.useRealTimers();
    vi.unstubAllGlobals();
  });

  it('reports "up" for a fast 2xx response', async () => {
    stubbedFetch().mockResolvedValue({ ok: true });

    const result = await checkServiceHealth(makeService());

    expect(result.status).toBe('up');
    expect(result.serviceId).toBe('svc-1');
    expect(result.lastCheck).toBe('2026-01-01T00:00:00.000Z');
    expect(result.responseTime).toBeGreaterThanOrEqual(0);
  });

  it('reports "down" for a non-2xx response', async () => {
    stubbedFetch().mockResolvedValue({ ok: false });

    const result = await checkServiceHealth(makeService({ id: 'svc-down' }));

    expect(result.status).toBe('down');
  });

  it('reports "down" when fetch throws (network/timeout)', async () => {
    stubbedFetch().mockRejectedValue(new Error('boom'));

    const result = await checkServiceHealth(makeService({ id: 'svc-net' }));

    expect(result.status).toBe('down');
    // Failure path does not record a responseTime.
    expect(result.responseTime).toBeUndefined();
  });

  it('caches successful results within the 30s TTL window', async () => {
    const fetchMock = stubbedFetch();
    fetchMock.mockResolvedValue({ ok: true });

    // Three sequential checks of the same service id.
    for (let attempt = 0; attempt < 3; attempt++) {
      await checkServiceHealth(makeService({ id: 'svc-cache' }));
    }

    expect(fetchMock).toHaveBeenCalledTimes(1);
  });

  it('refetches after the cache TTL expires', async () => {
    const fetchMock = stubbedFetch();
    fetchMock.mockResolvedValue({ ok: true });

    await checkServiceHealth(makeService({ id: 'svc-ttl' }));
    advanceClock(31_000);
    await checkServiceHealth(makeService({ id: 'svc-ttl' }));

    expect(fetchMock).toHaveBeenCalledTimes(2);
  });

  it('caches failures for ~5s, not the full 30s', async () => {
    const fetchMock = stubbedFetch();
    fetchMock.mockRejectedValue(new Error('boom'));

    await checkServiceHealth(makeService({ id: 'svc-fail' }));
    expect(fetchMock).toHaveBeenCalledTimes(1);

    // Within the short failure-cache window: still served from cache.
    advanceClock(4_000);
    await checkServiceHealth(makeService({ id: 'svc-fail' }));
    expect(fetchMock).toHaveBeenCalledTimes(1);

    // Past the short failure window: refetch.
    advanceClock(2_000);
    await checkServiceHealth(makeService({ id: 'svc-fail' }));
    expect(fetchMock).toHaveBeenCalledTimes(2);
  });

  it('clearHealthCache forces a refetch', async () => {
    const fetchMock = stubbedFetch();
    fetchMock.mockResolvedValue({ ok: true });

    await checkServiceHealth(makeService({ id: 'svc-clear' }));
    clearHealthCache();
    await checkServiceHealth(makeService({ id: 'svc-clear' }));

    expect(fetchMock).toHaveBeenCalledTimes(2);
  });
});
|
||||
|
||||
describe('checkAllServices', () => {
  beforeEach(() => {
    clearHealthCache();
    vi.stubGlobal('fetch', vi.fn());
  });

  afterEach(() => {
    vi.unstubAllGlobals();
  });

  it('returns a result per input service in input order', async () => {
    // Any URL containing "bad" answers with a non-ok response; all others are ok.
    const fetchMock = globalThis.fetch as unknown as ReturnType<typeof vi.fn>;
    fetchMock.mockImplementation(async (url: string) => {
      return { ok: !url.includes('bad') };
    });

    const healthy1 = makeService({ id: 'a', healthUrl: 'https://a.example.com/health' });
    const failing = makeService({ id: 'b', healthUrl: 'https://bad.example.com/health' });
    const healthy2 = makeService({ id: 'c', healthUrl: 'https://c.example.com/health' });

    const out = await checkAllServices([healthy1, failing, healthy2]);

    expect(out).toHaveLength(3);
    const ids = out.map(entry => entry.serviceId);
    const statuses = out.map(entry => entry.status);
    expect(ids).toEqual(['a', 'b', 'c']);
    expect(statuses).toEqual(['up', 'down', 'up']);
  });
});
|
||||
@ -53,8 +53,16 @@ export async function healthRoutes(fastify: FastifyInstance) {
|
||||
// Clear health cache (admin only)
|
||||
fastify.delete('/health/cache', {
|
||||
preHandler: async (req) => requireAdmin(req),
|
||||
}, async (_req, reply) => {
|
||||
}, async (req, reply) => {
|
||||
try {
|
||||
requireAdmin(req);
|
||||
clearHealthCache();
|
||||
return reply.send({ message: 'Health cache cleared' });
|
||||
} catch (error) {
|
||||
if (error instanceof Error) {
|
||||
throw new BadRequestError(error.message);
|
||||
}
|
||||
throw error;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user