learning_ai_common_plat/docker-compose.yml
Saravana Kumar 72fa2d297f fix(infra): switch cosmos-emulator from vnext-preview to stable :latest
The vnext-preview (Postgres-backed) image returned PGCosmosError plaintext
for cross-partition queryFeed calls, crashing @azure/cosmos at JSON.parse.
:latest is HTTPS-only with a self-signed cert, so consumers are gated by
NODE_TLS_REJECT_UNAUTHORIZED=0 (dev-prototype only). platform-service now
points at the real Azure Cosmos account (per .env), so its dependency on
the local emulator service is removed.
2026-05-30 09:59:21 +00:00

305 lines
11 KiB
YAML

services:
# ── Mailpit SMTP Sandbox (prototype only) ────────────────────
  # Local mail catcher for development. Other services reach its web UI at
  # http://mailpit:8025; both published ports are also bound on the host.
  mailpit:
    image: axllent/mailpit:v1.27.5
    restart: unless-stopped
    ports:
      - '1025:1025'
      - '8025:8025'
    healthcheck:
      # Healthy once the HTTP listener on 8025 answers a spider request.
      test:
        - 'CMD'
        - 'wget'
        - '-q'
        - '--spider'
        - 'http://127.0.0.1:8025'
      interval: 10s
      timeout: 5s
      retries: 6
# ── Azurite Blob Storage (prototype only) ─────────────────────
  # Runs only the blob endpoint (azurite-blob) on port 10000 and keeps its
  # data in the named volume `azurite-data`, mounted at /data.
  azurite:
    image: mcr.microsoft.com/azure-storage/azurite:3.35.0
    restart: unless-stopped
    # Folded scalar: the lines below are joined with single spaces into one
    # command string.
    command: >-
      azurite-blob
      --blobHost 0.0.0.0
      --blobPort 10000
      --location /data
      --skipApiVersionCheck
    volumes:
      - azurite-data:/data
    ports:
      - '10000:10000'
    healthcheck:
      # TCP probe: exits 0 when a connection to 127.0.0.1:10000 succeeds and
      # 1 when the socket reports an error.
      test:
        - 'CMD'
        - 'node'
        - '-e'
        - 'const net=require("net");const s=net.connect(10000,"127.0.0.1",()=>{s.end();process.exit(0)});s.on("error",()=>process.exit(1));'
      interval: 10s
      timeout: 5s
      retries: 6
# ── Azure Cosmos DB Emulator (prototype only) ─────────────────
  # 2026-05-30: switched off `:vnext-preview` (Postgres-backed experimental
  # image) because cross-partition `queryFeed` returned plain-text
  # `PGCosmosError(3, "Database query failed: PostgresError(SqlState(EXX000))...")`
  # for every query, which crashes the @azure/cosmos SDK at JSON.parse. It
  # affected every collection: point reads worked, but anything filtering by
  # non-PK fields failed (login, register, OAuth, feature-flag list, etc.).
  # `:latest` is the stable Linux port of the Windows Cosmos emulator and
  # returns proper JSON errors. It is HTTPS-only with a self-signed cert,
  # so consumers in this compose file are gated by NODE_TLS_REJECT_UNAUTHORIZED=0
  # (dev-prototype only — never set that in production).
  cosmos-emulator:
    image: mcr.microsoft.com/cosmosdb/linux/azure-cosmos-emulator:latest
    # NOTE(review): port 1234, ENABLE_EXPLORER and GATEWAY_PUBLIC_ENDPOINT look
    # like settings carried over from the vnext-preview image — confirm the
    # stable image still honours them before relying on them.
    ports:
      - '8081:8081'
      - '1234:1234'
    environment:
      - AZURE_COSMOS_EMULATOR_PARTITION_COUNT=10
      - AZURE_COSMOS_EMULATOR_ENABLE_DATA_PERSISTENCE=false
      - ENABLE_EXPLORER=true
      - GATEWAY_PUBLIC_ENDPOINT=cosmos-emulator
    healthcheck:
      # -f makes curl exit non-zero on an HTTP error status, so the container
      # is only reported healthy when the certificate endpoint actually
      # succeeds (without it any HTTP response, even a 5xx, counts as healthy).
      # -k is required because the emulator serves a self-signed certificate.
      test:
        - 'CMD-SHELL'
        - 'curl -fsk --max-time 3 https://127.0.0.1:8081/_explorer/emulator.pem > /dev/null'
      interval: 10s
      timeout: 5s
      retries: 30
      # Failures during the first 60s do not count against `retries`.
      start_period: 60s
    restart: unless-stopped
# ── Loki (Log Aggregation) ────────────────────────────────────
  # Configuration is bind-mounted from the repo; index and chunk data live in
  # the named volume `loki-data`.
  loki:
    image: grafana/loki:3.3.2
    restart: unless-stopped
    command: -config.file=/etc/loki/local-config.yaml
    volumes:
      - ./services/monitoring/loki/loki-config.yml:/etc/loki/local-config.yaml
      - loki-data:/loki
    ports:
      - '3100:3100'
    healthcheck:
      # Stick to -q/--spider: BusyBox wget (shipped in Alpine images) does not
      # support --no-verbose or --tries.
      test:
        - 'CMD'
        - 'wget'
        - '-q'
        - '--spider'
        - 'http://127.0.0.1:3100/ready'
      interval: 15s
      timeout: 5s
      retries: 3
# ── Grafana (Log Viewer + Dashboards) ─────────────────────────
  # Provisioning files and dashboards are bind-mounted from the repo; Grafana's
  # own state lives in the named volume `grafana-data`.
  grafana:
    image: grafana/grafana:11.4.0
    ports:
      - '3000:3000'
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      # Overridable so the admin password does not have to live in this
      # committed file: set GRAFANA_ADMIN_PASSWORD in the shell or in `.env`.
      # The fallback keeps the previous dev-only value.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-lysnrai}
      - GF_USERS_ALLOW_SIGN_UP=false
    volumes:
      - ./services/monitoring/grafana/provisioning:/etc/grafana/provisioning
      - ./services/monitoring/grafana/dashboards:/var/lib/grafana/dashboards
      - grafana-data:/var/lib/grafana
    depends_on:
      # Only waits for the Loki container to start, not for it to be healthy.
      loki:
        condition: service_started
    restart: unless-stopped
    healthcheck:
      test: ['CMD', 'wget', '-q', '--spider', 'http://127.0.0.1:3000/api/health']
      interval: 15s
      timeout: 5s
      retries: 3
# ── API Gateway (Traefik) ───────────────────────────────────
  # Routes are discovered from container labels through the Docker provider;
  # only containers that set `traefik.enable=true` are exposed.
  gateway:
    image: traefik:v3.3
    restart: unless-stopped
    # Short form: equivalent to `condition: service_started`.
    depends_on:
      - loki
    volumes:
      # Read-only Docker socket for the Docker provider.
      - /var/run/docker.sock:/var/run/docker.sock:ro
    ports:
      - '80:80'
      # Traefik dashboard
      - '8080:8080'
    command:
      - '--api.insecure=true'
      - '--providers.docker=true'
      - '--providers.docker.exposedbydefault=false'
      - '--entrypoints.web.address=:80'
      - '--accesslog=true'
      - '--accesslog.format=json'
# ── Platform Service (Fastify + TypeScript) ─────────────
  # Consolidated: auth, audit, notifications, flags, blob, invitations, referrals, promos,
  # subscriptions, usage, plans, licenses, stripe, items, comments, votes, public
  platform-service:
    build:
      context: .
      dockerfile: services/platform-service/Dockerfile
    ports:
      - '4003:4003'
    env_file:
      - .env
    environment:
      - PORT=4003
      # Local/dev convenience: ensure Cosmos DB + containers exist.
      - COSMOS_AUTO_INIT=true
      # 2026-05-30: platform-service no longer uses the local Cosmos emulator
      # (the Postgres-backed vnext-preview image broke `queryFeed` with
      # `PGCosmosError`; stable :latest crashed under load with a core dump).
      # It points at the real Azure Cosmos DB account (`cosmos-mywisprai`,
      # db `bytelyst`) instead; the connection values come from `.env`, which
      # is why this service has no depends_on entry for cosmos-emulator.
      # The cosmos-emulator service itself must stay in this file:
      # extraction-service and cowork-service still depend on it.
      - PLATFORM_SERVICE_URL=http://platform-service:4003
      - EXTRACTION_SERVICE_URL=http://extraction-service:4005
      - MCP_SERVER_URL=http://mcp-server:4007
      - MAILPIT_UI_URL=http://mailpit:8025
    depends_on:
      mailpit:
        condition: service_healthy
      azurite:
        condition: service_healthy
    labels:
      - 'traefik.enable=true'
      - 'traefik.http.routers.platform.rule=PathPrefix(`/api`) || PathPrefix(`/public`) || PathPrefix(`/health`)'
      # Catch-all router: Traefik's default priority is the rule length, and
      # this rule is longer than the extraction/mcp/cowork rules, so without an
      # explicit low priority it would win for /api/extract, /api/tasks,
      # /api/tools and /api/cowork as well. Priority 1 makes every more
      # specific router match first.
      - 'traefik.http.routers.platform.priority=1'
      - 'traefik.http.services.platform.loadbalancer.server.port=4003'
    restart: unless-stopped
    healthcheck:
      test: ['CMD', 'wget', '-q', '--spider', 'http://127.0.0.1:4003/health']
      interval: 30s
      timeout: 10s
      retries: 3
# ── Extraction Service (Fastify + TypeScript + Python sidecar) ──
  extraction-service:
    build:
      context: .
      dockerfile: services/extraction-service/Dockerfile
    ports:
      - '4005:4005'
    env_file:
      - .env
    environment:
      - PORT=4005
      # NOTE(review): localhost implies the Python sidecar listens inside this
      # same container — confirm against the Dockerfile.
      - PYTHON_SIDECAR_URL=http://localhost:4006
      # The stable cosmos-emulator image is HTTPS-only with a self-signed
      # cert, hence TLS verification is disabled. Dev-prototype only.
      - COSMOS_ENDPOINT=https://cosmos-emulator:8081
      - NODE_TLS_REJECT_UNAUTHORIZED=0
    depends_on:
      cosmos-emulator:
        condition: service_healthy
    labels:
      - 'traefik.enable=true'
      - 'traefik.http.routers.extraction.rule=PathPrefix(`/api/extract`) || PathPrefix(`/api/tasks`)'
      # Explicit priority: the platform router also matches PathPrefix(`/api`)
      # and Traefik's default priority is the rule length, so by default the
      # longer platform rule would be evaluated first and take these paths.
      - 'traefik.http.routers.extraction.priority=100'
      - 'traefik.http.services.extraction.loadbalancer.server.port=4005'
    restart: unless-stopped
    healthcheck:
      # Exits 0 only when GET /health returns a 2xx response (r.ok).
      test:
        - 'CMD'
        - 'node'
        - '-e'
        - 'fetch("http://127.0.0.1:4005/health").then(r => process.exit(r.ok ? 0 : 1)).catch(() => process.exit(1))'
      interval: 30s
      timeout: 10s
      retries: 3
# ── MCP Server (Fastify + TypeScript) ────────────────────────
  # Exposes tool namespaces: platform.telemetry.*, platform.diagnostics.*,
  # extraction.*, support.* — consumed by AI agents + admin tooling
  mcp-server:
    build:
      context: .
      dockerfile: services/mcp-server/Dockerfile
    ports:
      - '4007:4007'
    env_file:
      - .env
    environment:
      - PORT=4007
      - PLATFORM_SERVICE_URL=http://platform-service:4003
      - EXTRACTION_SERVICE_URL=http://extraction-service:4005
    depends_on:
      # Starts only after both upstream services report healthy.
      platform-service:
        condition: service_healthy
      extraction-service:
        condition: service_healthy
    labels:
      - 'traefik.enable=true'
      - 'traefik.http.routers.mcp.rule=PathPrefix(`/api/tools`)'
      # Explicit priority: the platform router also matches PathPrefix(`/api`)
      # and Traefik's default priority is the rule length, so by default the
      # longer platform rule would be evaluated first and take /api/tools.
      - 'traefik.http.routers.mcp.priority=100'
      - 'traefik.http.services.mcp.loadbalancer.server.port=4007'
    restart: unless-stopped
    healthcheck:
      test: ['CMD', 'wget', '-q', '--spider', 'http://127.0.0.1:4007/health']
      interval: 30s
      timeout: 10s
      retries: 3
# ── Cowork Service (Fastify bridge to Rust agent runtime) ──────
  # Bridges Tauri desktop / clients and cowork-orchestrator, delegating auth,
  # flags, audit, telemetry, and AI budgets to platform-service.
  cowork-service:
    build:
      context: .
      dockerfile: services/cowork-service/Dockerfile
    ports:
      - '4009:4009'
    env_file:
      - .env
    environment:
      - PORT=4009
      - NODE_ENV=development
      - PRODUCT_ID=clawcowork
      - COSMOS_ENDPOINT=https://cosmos-emulator:8081
      # Stable emulator's self-signed cert — dev-prototype only.
      - NODE_TLS_REJECT_UNAUTHORIZED=0
      - PLATFORM_SERVICE_URL=http://platform-service:4003
      - EXTRACTION_SERVICE_URL=http://extraction-service:4005
    depends_on:
      cosmos-emulator:
        condition: service_healthy
      platform-service:
        condition: service_healthy
    labels:
      - 'traefik.enable=true'
      - 'traefik.http.routers.cowork.rule=PathPrefix(`/api/cowork`)'
      # Explicit priority: the platform router also matches PathPrefix(`/api`)
      # and Traefik's default priority is the rule length, so by default the
      # longer platform rule would be evaluated first and take /api/cowork.
      - 'traefik.http.routers.cowork.priority=100'
      - 'traefik.http.services.cowork.loadbalancer.server.port=4009'
    restart: unless-stopped
    healthcheck:
      test: ['CMD', 'wget', '-q', '--spider', 'http://127.0.0.1:4009/health']
      interval: 10s
      timeout: 5s
      retries: 10
# ── Admin Web (Next.js Platform Admin Console) ─────────────────
  admin-web:
    build:
      context: .
      dockerfile: dashboards/admin-web/Dockerfile
      args:
        BYTELYST_PACKAGE_SOURCE: ${BYTELYST_PACKAGE_SOURCE:-vendor}
        NEXT_PUBLIC_PRODUCT_ID: ${NEXT_PUBLIC_PRODUCT_ID:-admin}
        # Overridable like the other build args; the default is unchanged.
        # NOTE(review): `platform-service` only resolves inside the Compose
        # network. If the app reads this value in browser code, set
        # NEXT_PUBLIC_PLATFORM_URL to a host-reachable URL at build time —
        # confirm how the dashboard consumes it.
        NEXT_PUBLIC_PLATFORM_URL: ${NEXT_PUBLIC_PLATFORM_URL:-http://platform-service:4003}
    container_name: admin-web
    ports:
      - '3001:3001'
    networks:
      - default
    restart: unless-stopped
    depends_on:
      platform-service:
        condition: service_healthy
    environment:
      - NODE_ENV=production
    healthcheck:
      test: ['CMD', 'wget', '-q', '--spider', 'http://127.0.0.1:3001']
      interval: 30s
      timeout: 5s
      retries: 3
      # Failures during the first 15s do not count against `retries`.
      start_period: 15s
# ── Volumes ───────────────────────────────────────────────────────
# Named volumes with default settings, written as explicit empty mappings
# rather than bare keys. Mounted by azurite (/data), loki (/loki) and
# grafana (/var/lib/grafana).
volumes:
  azurite-data: {}
  loki-data: {}
  grafana-data: {}