docs(devops): add Track A handoff and prep gateway changes

This commit is contained in:
root 2026-03-29 23:57:03 +00:00
parent eba6c7a641
commit c0cf80d96b
8 changed files with 352 additions and 48 deletions

View File

@ -42,11 +42,12 @@ ENV HUSKY=0
RUN addgroup --system --gid 1001 nodejs
RUN adduser --system --uid 1001 nextjs
COPY --from=builder --chown=nextjs:nodejs /app/deploy ./
COPY --from=builder --chown=nextjs:nodejs /app/dashboards/admin-web/.next/standalone ./
COPY --from=builder --chown=nextjs:nodejs /app/dashboards/admin-web/.next/static ./.next/static
COPY --from=builder --chown=nextjs:nodejs /app/dashboards/admin-web/public ./public
USER nextjs
EXPOSE 3001
CMD ["npm", "start"]
CMD ["node", "server.js"]

View File

@ -38,11 +38,12 @@ ENV HUSKY=0
RUN addgroup --system --gid 1001 nodejs
RUN adduser --system --uid 1001 nextjs
COPY --from=builder --chown=nextjs:nodejs /app/deploy ./
COPY --from=builder --chown=nextjs:nodejs /app/dashboards/tracker-web/.next/standalone ./
COPY --from=builder --chown=nextjs:nodejs /app/dashboards/tracker-web/.next/static ./.next/static
COPY --from=builder --chown=nextjs:nodejs /app/dashboards/tracker-web/public ./public
USER nextjs
EXPOSE 3003
CMD ["npm", "start"]
CMD ["node", "server.js"]

View File

@ -126,6 +126,8 @@ services:
gateway:
image: traefik:v3.3
profiles:
- legacy-gateway
command:
- '--api.insecure=true'
- '--providers.docker=true'
@ -143,6 +145,25 @@ services:
condition: service_started
restart: unless-stopped
# Caddy gateway service: publishes host ports 80 and 443 and loads its routing
# rules from the Caddyfile bind-mounted below.
caddy:
image: caddy:2-alpine
container_name: caddy
ports:
- '80:80'
- '443:443'
volumes:
# Read-only bind mount of the Caddyfile that sits one directory above this compose file.
- ../Caddyfile:/etc/caddy/Caddyfile:ro
# Named volumes so the container's /data and /config survive container recreation.
# NOTE(review): the official Caddy image keeps certificates/ACME state under /data - confirm.
- caddy-data:/data
- caddy-config:/config
# Start Caddy only after these three services report healthy.
depends_on:
platform-service:
condition: service_healthy
extraction-service:
condition: service_healthy
mcp-server:
condition: service_healthy
restart: unless-stopped
# ═════════════════════════════════════════════════════════════════
# PLATFORM SERVICES (from this repo)
# ═════════════════════════════════════════════════════════════════
@ -151,8 +172,6 @@ services:
build:
context: .
dockerfile: services/platform-service/Dockerfile
ports:
- '4003:4003'
env_file:
- .env.ecosystem
environment:
@ -184,8 +203,6 @@ services:
build:
context: .
dockerfile: services/extraction-service/Dockerfile
ports:
- '4005:4005'
env_file:
- .env.ecosystem
environment:
@ -209,8 +226,6 @@ services:
build:
context: .
dockerfile: services/mcp-server/Dockerfile
ports:
- '4007:4007'
env_file:
- .env.ecosystem
environment:
@ -289,8 +304,6 @@ services:
<<: *product-build
context: ../learning_ai_peakpulse
dockerfile: backend/Dockerfile
ports:
- '4010:4010'
env_file:
- .env.ecosystem
environment:
@ -312,8 +325,6 @@ services:
<<: *product-build
context: ../learning_ai_clock
dockerfile: backend/Dockerfile
ports:
- '4011:4011'
env_file:
- .env.ecosystem
environment:
@ -335,8 +346,6 @@ services:
<<: *product-build
context: ../learning_ai_jarvis_jr
dockerfile: backend/Dockerfile
ports:
- '4012:4012'
env_file:
- .env.ecosystem
environment:
@ -358,8 +367,6 @@ services:
<<: *product-build
context: ../learning_ai_fastgap
dockerfile: backend/Dockerfile
ports:
- '4013:4013'
env_file:
- .env.ecosystem
environment:
@ -381,8 +388,6 @@ services:
<<: *product-build
context: ../learning_multimodal_memory_agents
dockerfile: backend/Dockerfile
ports:
- '4014:4014'
env_file:
- .env.ecosystem
environment:
@ -404,8 +409,6 @@ services:
<<: *product-build
context: ../learning_voice_ai_agent
dockerfile: backend/Dockerfile
ports:
- '4015:4015'
env_file:
- .env.ecosystem
environment:
@ -427,8 +430,6 @@ services:
<<: *product-build
context: ../learning_ai_notes
dockerfile: backend/Dockerfile
ports:
- '4016:4016'
env_file:
- .env.ecosystem
environment:
@ -451,8 +452,6 @@ services:
<<: *product-build
context: ../learning_ai_flowmonk
dockerfile: backend/Dockerfile
ports:
- '4017:4017'
env_file:
- .env.ecosystem
environment:
@ -474,8 +473,6 @@ services:
<<: *product-build
context: ../learning_ai_trails
dockerfile: backend/Dockerfile
ports:
- '4018:4018'
env_file:
- .env.ecosystem
environment:
@ -497,8 +494,6 @@ services:
<<: *product-build
context: ../learning_ai_local_memory_gpt
dockerfile: backend/Dockerfile
ports:
- '4019:4019'
extra_hosts:
- 'host.docker.internal:host-gateway'
env_file:
@ -822,23 +817,6 @@ services:
retries: 3
restart: unless-stopped
# efforise-web: client image built from ../learning_ai_efforise (client/Dockerfile),
# published on host port 3080.
efforise-web:
build:
<<: *product-build
context: ../learning_ai_efforise
dockerfile: client/Dockerfile
ports:
- '3080:3080'
# Wait for the backend's healthcheck to pass before starting the web client.
depends_on:
efforise-backend:
condition: service_healthy
# Probe the app from inside the container; `wget --spider` requests the URL without saving the body.
healthcheck:
test: ['CMD', 'wget', '-q', '--spider', 'http://127.0.0.1:3080']
interval: 30s
timeout: 10s
retries: 3
restart: unless-stopped
# ═════════════════════════════════════════════════════════════════
# VOLUMES
# ═════════════════════════════════════════════════════════════════
@ -848,3 +826,5 @@ volumes:
loki-data:
grafana-data:
localmemgpt-data:
caddy-data:
caddy-config:

View File

@ -0,0 +1,87 @@
# Global options block.
# NOTE(review): `email` is the contact address Caddy uses for its ACME account - confirm it is monitored.
{
email admin@bytelyst.com
}
# Single public API host. Each backend is reached under its own path prefix.
# Pattern repeated for every backend below:
#   1. a named matcher that matches the bare prefix exactly (no trailing slash),
#   2. a 308 redirect from the bare prefix to the prefix with a trailing slash,
#   3. a handle_path block that strips the prefix and proxies the remainder
#      to the backend container on its internal port.
api.bytelyst.com {
encode gzip
# platform-service (port 4003)
@platform_root path /platform
redir @platform_root /platform/ 308
handle_path /platform/* {
reverse_proxy platform-service:4003
}
# extraction-service (port 4005)
@extraction_root path /extraction
redir @extraction_root /extraction/ 308
handle_path /extraction/* {
reverse_proxy extraction-service:4005
}
# mcp-server (port 4007)
@mcp_root path /mcp
redir @mcp_root /mcp/ 308
handle_path /mcp/* {
reverse_proxy mcp-server:4007
}
# peakpulse-backend (port 4010)
@peakpulse_root path /peakpulse
redir @peakpulse_root /peakpulse/ 308
handle_path /peakpulse/* {
reverse_proxy peakpulse-backend:4010
}
# chronomind-backend (port 4011)
@chronomind_root path /chronomind
redir @chronomind_root /chronomind/ 308
handle_path /chronomind/* {
reverse_proxy chronomind-backend:4011
}
# jarvisjr-backend (port 4012)
@jarvisjr_root path /jarvisjr
redir @jarvisjr_root /jarvisjr/ 308
handle_path /jarvisjr/* {
reverse_proxy jarvisjr-backend:4012
}
# nomgap-backend (port 4013)
@nomgap_root path /nomgap
redir @nomgap_root /nomgap/ 308
handle_path /nomgap/* {
reverse_proxy nomgap-backend:4013
}
# mindlyst-backend (port 4014)
@mindlyst_root path /mindlyst
redir @mindlyst_root /mindlyst/ 308
handle_path /mindlyst/* {
reverse_proxy mindlyst-backend:4014
}
# lysnrai-backend (port 4015)
@lysnrai_root path /lysnrai
redir @lysnrai_root /lysnrai/ 308
handle_path /lysnrai/* {
reverse_proxy lysnrai-backend:4015
}
# notelett-backend (port 4016)
@notelett_root path /notelett
redir @notelett_root /notelett/ 308
handle_path /notelett/* {
reverse_proxy notelett-backend:4016
}
# flowmonk-backend (port 4017)
@flowmonk_root path /flowmonk
redir @flowmonk_root /flowmonk/ 308
handle_path /flowmonk/* {
reverse_proxy flowmonk-backend:4017
}
# actiontrail-backend (port 4018)
@actiontrail_root path /actiontrail
redir @actiontrail_root /actiontrail/ 308
handle_path /actiontrail/* {
reverse_proxy actiontrail-backend:4018
}
# localmemgpt-backend (port 4019)
@localmemgpt_root path /localmemgpt
redir @localmemgpt_root /localmemgpt/ 308
handle_path /localmemgpt/* {
reverse_proxy localmemgpt-backend:4019
}
# Fallback: requests that matched none of the prefixes above get a bare 404.
respond 404
}

View File

@ -213,6 +213,23 @@ Impact:
- dashboard image became buildable
### 6. Dashboard standalone runtime fix
Problem:
- `admin-web` and `tracker-web` were built with Next.js standalone output enabled
- their runtime images still attempted to start from a deployed package layout that did not contain the expected standalone entrypoint
Fix:
- changed both dashboard Dockerfiles to copy `.next/standalone` and `.next/static`
- switched the runtime command to `node server.js`
Impact:
- aligns both images with the Next.js standalone artifact layout
- removes the immediate startup failure seen in the container logs
---
## Validation Results

View File

@ -194,6 +194,26 @@ All optional — defaults work for most setups:
- **CORS errors in browser:** The generated `.env.ecosystem` sets `CORS_ORIGIN=*` for dev/test. If you restrict it, update the value to match your access URL.
- **Services in development mode:** `.env.ecosystem` now sets `NODE_ENV=production` for all services. If you need debug logging, remove or change this value.
## HTTPS Gateway
- Public backend access is intended to flow through Caddy on `https://api.bytelyst.com`, not direct backend port exposure.
- The gateway config lives at `/opt/bytelyst/Caddyfile` and is mounted into the `caddy` container.
- Backend routes are path-based and strip their prefixes before proxying:
- `/platform/*` → `platform-service:4003`
- `/extraction/*` → `extraction-service:4005`
- `/mcp/*` → `mcp-server:4007`
- `/peakpulse/*` → `peakpulse-backend:4010`
- `/chronomind/*` → `chronomind-backend:4011`
- `/jarvisjr/*` → `jarvisjr-backend:4012`
- `/nomgap/*` → `nomgap-backend:4013`
- `/mindlyst/*` → `mindlyst-backend:4014`
- `/lysnrai/*` → `lysnrai-backend:4015`
- `/notelett/*` → `notelett-backend:4016`
- `/flowmonk/*` → `flowmonk-backend:4017`
- `/actiontrail/*` → `actiontrail-backend:4018`
- `/localmemgpt/*` → `localmemgpt-backend:4019`
- Keep backend ports closed publicly once DNS and NSG rules are aligned. Docker-internal service discovery remains unchanged.
## Known Limitations
- **Remote browser access:** Product web apps use `http://localhost:<port>` for browser-side API calls (baked at Next.js build time via `NEXT_PUBLIC_*` args). This works when browsing from the VM itself but **not from a remote browser** (e.g., laptop accessing `http://<vm-ip>:3060`). For remote access, use SSH port-forwarding:

View File

@ -15,7 +15,7 @@
| # | Prompt | Status | Commit SHA | Verified |
| --- | ------------------------ | :------------: | :--------: | :------: |
| A1 | Caddy Gateway Setup | ⬜ Not started | — | ⬜ |
| A1 | Caddy Gateway Setup | 🟨 In progress | — | ⬜ |
| A2 | Gitea HTTPS Exposure | ⬜ Not started | — | ⬜ |
| A3 | Dashboard Containers Fix | ⬜ Not started | — | ⬜ |
| A4 | NSG Lockdown | ⬜ Not started | — | ⬜ |

View File

@ -0,0 +1,198 @@
# Track A Handoff — 2026-03-29
This handoff captures the current state of Track A on the Azure VM at `/opt/bytelyst/`.
## What Was Completed
- Fixed a blocking parse error in [`docker-compose.ecosystem.yml`](/opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml) by removing a duplicate `efforise-web` service definition.
- Added [`/opt/bytelyst/Caddyfile`](/opt/bytelyst/Caddyfile) with path-based routing for the 13 backend services.
- Added a tracked template copy at [`docs/devops/single_azure_vm/docker/Caddyfile.bytelyst.example`](/opt/bytelyst/learning_ai_common_plat/docs/devops/single_azure_vm/docker/Caddyfile.bytelyst.example).
- Updated [`docker-compose.ecosystem.yml`](/opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml) to:
- add a `caddy` service
- place Traefik `gateway` behind a `legacy-gateway` profile
- remove published backend ports for `4003`, `4005`, `4007`, `4010`-`4019`
- Updated dashboard Dockerfiles to use the Next.js standalone runtime:
- [`dashboards/admin-web/Dockerfile`](/opt/bytelyst/learning_ai_common_plat/dashboards/admin-web/Dockerfile)
- [`dashboards/tracker-web/Dockerfile`](/opt/bytelyst/learning_ai_common_plat/dashboards/tracker-web/Dockerfile)
- Updated deployment docs:
- [`docs/devops/single_azure_vm/docker/README.md`](/opt/bytelyst/learning_ai_common_plat/docs/devops/single_azure_vm/docker/README.md)
- [`docs/devops/single_azure_vm/docker/DEPLOYMENT_STATUS_2026-03-29.md`](/opt/bytelyst/learning_ai_common_plat/docs/devops/single_azure_vm/docker/DEPLOYMENT_STATUS_2026-03-29.md)
- Updated the Track A progress table to mark A1 in progress:
- [`docs/devops/vercel/CODEX_PROMPTS_TRACK_A_AZURE_VM.md`](/opt/bytelyst/learning_ai_common_plat/docs/devops/vercel/CODEX_PROMPTS_TRACK_A_AZURE_VM.md)
## Blockers Found
### 1. DNS is not ready
From the VM, these records did not resolve:
- `api.bytelyst.com`
- `gitea.bytelyst.com`
- `admin.bytelyst.com`
- `tracker.bytelyst.com`
Without these, Caddy cannot obtain Let's Encrypt certificates and A1/A2/A3 HTTPS verification cannot pass.
### 2. Azure CLI is missing
`az account show` failed with:
```text
az: command not found
```
That blocks A4 because the NSG cannot be snapshotted or edited from this VM.
### 3. Linux build host is still using the wrong Gitea registry hostname
Broad compose rebuilds hit repeated package fetch failures such as:
```text
GET http://host.docker.internal:3300/api/packages/bytelyst/npm/... error (ENOTFOUND)
```
The VM is Linux, and many builds still default to `GITEA_NPM_HOST=host.docker.internal`. On this VM the Gitea registry is reachable at `localhost:3300` on the host and via the detected Docker host IP used by `setup.sh`.
This blocks reliable rebuilds for A3 and potentially other services.
## Required Preconditions Before Resuming
1. Create DNS A records for:
- `api.bytelyst.com`
- `gitea.bytelyst.com`
- `admin.bytelyst.com`
- `tracker.bytelyst.com`
2. Confirm they resolve to the VM public IP from the VM:
```bash
dig +short api.bytelyst.com
dig +short gitea.bytelyst.com
dig +short admin.bytelyst.com
dig +short tracker.bytelyst.com
curl -sf https://api.ipify.org && echo
```
3. Install Azure CLI and log in with permissions to manage the VM NSG.
4. Export a Linux-safe Gitea host before rebuilds, for example:
```bash
export GITEA_NPM_HOST=172.17.0.1
```
Use the actual Docker-reachable host IP if it differs on this VM.
## Resume Steps
Run from:
```bash
cd /opt/bytelyst/learning_ai_common_plat
```
### A1
1. Confirm backend health:
```bash
docker compose -f docker-compose.ecosystem.yml --env-file .env.ecosystem ps
# Count only containers whose status is "(healthy)". A bare `grep -c healthy`
# would also count "(unhealthy)" rows, because "unhealthy" contains "healthy".
docker compose -f docker-compose.ecosystem.yml --env-file .env.ecosystem ps | grep -cF '(healthy)'
```
2. Start or refresh only the gateway-related services first:
```bash
docker compose -f docker-compose.ecosystem.yml --env-file .env.ecosystem up -d caddy platform-service extraction-service mcp-server peakpulse-backend chronomind-backend jarvisjr-backend nomgap-backend mindlyst-backend lysnrai-backend notelett-backend flowmonk-backend actiontrail-backend localmemgpt-backend
docker logs caddy --tail 100
```
3. Verify A1:
```bash
curl -sI https://api.bytelyst.com/platform/health | head -5
# Probe every routed prefix through the gateway and print one "<svc>: <status>" line each.
for svc in platform extraction mcp peakpulse chronomind jarvisjr nomgap mindlyst lysnrai notelett flowmonk actiontrail localmemgpt; do
  # printf suppresses the newline portably (`echo -n` behaviour varies between shells).
  printf '%s: ' "$svc"
  # When curl fails (DNS, TLS, HTTP >= 400 with -f) it writes nothing, and jq prints
  # nothing for empty input, so the "FAIL" fallback would never fire. Feed jq an
  # empty object in that case so the line always ends with an explicit status.
  { curl -sf "https://api.bytelyst.com/$svc/health" || echo '{}'; } | jq -r '.status // "FAIL"'
done
curl -sf --max-time 3 http://<VM_PUBLIC_IP>:4003/health && echo "FAIL: port still open" || echo "PASS: port closed"
```
### A2
1. Reconfigure the standalone `gitea-npm-registry` container for HTTPS exposure. It is not managed by the ecosystem compose file.
2. Update its `ROOT_URL` to `https://gitea.bytelyst.com`.
3. Add a `gitea.bytelyst.com` block to `/opt/bytelyst/Caddyfile`.
4. Reload Caddy.
5. Verify:
```bash
curl -sI https://gitea.bytelyst.com | head -3
curl -sf https://gitea.bytelyst.com/api/packages/ByteLyst/npm/@bytelyst%2ferrors | jq '.name'
docker exec platform-service curl -sf http://gitea:3300/api/v1/version | jq '.version'
```
Note:
The current running container is:
```bash
docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Ports}}' | grep -i gitea
```
Observed name during this session:
```text
gitea-npm-registry
```
### A3
1. Keep `GITEA_NPM_HOST` exported to the Docker-reachable host IP before rebuilding.
2. Rebuild only the dashboard services first:
```bash
export GITEA_NPM_HOST=172.17.0.1
docker compose -f docker-compose.ecosystem.yml --env-file .env.ecosystem up -d --build admin-web tracker-web
docker compose -f docker-compose.ecosystem.yml --env-file .env.ecosystem logs admin-web --tail 100
docker compose -f docker-compose.ecosystem.yml --env-file .env.ecosystem logs tracker-web --tail 100
```
3. Add dashboard host blocks to `/opt/bytelyst/Caddyfile`:
```caddy
admin.bytelyst.com {
reverse_proxy admin-web:3001
}
tracker.bytelyst.com {
reverse_proxy tracker-web:3003
}
```
4. Reload Caddy and verify:
```bash
curl -sf http://127.0.0.1:3001 | head -5
curl -sf http://127.0.0.1:3003 | head -5
curl -sI https://admin.bytelyst.com | head -3
curl -sI https://tracker.bytelyst.com | head -3
docker compose -f docker-compose.ecosystem.yml --env-file .env.ecosystem ps | grep -E "(unhealthy|Exit)" | wc -l
```
### A4
1. Install Azure CLI if still missing.
2. Identify the resource group and NSG name for this VM.
3. Snapshot rules, then reduce inbound access to only `22`, `80`, and `443`.
4. Run the full A4 verification from the Track A prompt file.
## Suggested Commit Sequence
After each prompt passes, update the progress table in:
- [`docs/devops/vercel/CODEX_PROMPTS_TRACK_A_AZURE_VM.md`](/opt/bytelyst/learning_ai_common_plat/docs/devops/vercel/CODEX_PROMPTS_TRACK_A_AZURE_VM.md)
Suggested commits:
- `feat(gateway): replace Traefik with Caddy for HTTPS path routing`
- `feat(gateway): expose Gitea npm registry via HTTPS at gitea.bytelyst.com`
- `fix(deployment): resolve admin-web and tracker-web containers`
- `chore(security): lock down Azure NSG to 22/80/443 only`