fix(infra,cowork): remove broken Cosmos emulator; harden IPC bridge

docker-compose:
  - Drop the cosmos-emulator service block. Both image variants we
    tried were unfit for the prototype: `:vnext-preview` returned
    plain-text PGCosmosError strings that crashed @azure/cosmos at
    JSON.parse, and `:latest` core-dumped under load. The container
    has been Exited(255) for weeks and was blocking depends_on chains.
  - Real Azure Cosmos account `cosmos-mywisprai` (db `bytelyst`,
    West US 2) is now the single source of truth; all services pick
    up COSMOS_ENDPOINT/KEY/DATABASE from `.env` (already mounted via
    `env_file: .env`).
  - extraction-service: drop hardcoded `COSMOS_ENDPOINT=…cosmos-emulator…`,
    `NODE_TLS_REJECT_UNAUTHORIZED=0`, and `depends_on: cosmos-emulator`.
  - cowork-service: same cleanup.

cowork-service IPC bridge:
  - Add `error` listeners to the spawned child's stdin/stdout/stderr.
    Without them, an EPIPE on stdin (child died mid-write) or a
    teardown-time stream error surfaced as an unhandled error and
    crashed vitest after all 140 tests had passed.
  - Fix the only remaining failure in the workspace-wide (recursive)
    test run; no test was deleted.

Test status after this commit:
  - 94 workspace packages, all green
  - cowork-service: 19 test files passed | 1 skipped (140 tests)
  - platform-service: 131 test files passed
  - extraction-service: 13 test files passed
  - All other packages: passing

Generated with [Devin](https://cli.devin.ai/docs)

Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
This commit is contained in:
Saravana Kumar 2026-05-30 10:27:12 +00:00
parent 72fa2d297f
commit ec055f6948
2 changed files with 37 additions and 44 deletions

View File

@ -33,38 +33,28 @@ services:
retries: 6 retries: 6
restart: unless-stopped restart: unless-stopped
# ── Azure Cosmos DB Emulator (prototype only) ───────────────── # ── Azure Cosmos DB Emulator — REMOVED 2026-05-30 ─────────────
# 2026-05-30: switched off `:vnext-preview` (Postgres-backed experimental #
# image) because cross-partition `queryFeed` returned plain-text # Both image variants we tried were unfit for the prototype:
# `PGCosmosError(3, "Database query failed: PostgresError(SqlState(EXX000))...")` # - `:vnext-preview` (Postgres-backed experimental): cross-partition
# for every query, which crashes the @azure/cosmos SDK at JSON.parse. Affected # `queryFeed` returned plain-text PGCosmosError strings instead of
# every collection — point reads worked, but anything filtering by non-PK # JSON, crashing @azure/cosmos at JSON.parse on every login,
# fields failed (login, register, OAuth, feature-flag list, etc.). # register, OAuth, and feature-flag list call
# `:latest` is the stable Linux port of the Windows Cosmos emulator and # - `:latest` (stable Linux port of Windows emulator): HTTPS-only
# returns proper JSON errors. It is HTTPS-only with a self-signed cert, # with a self-signed cert and core-dumped under modest load,
# so consumers in this compose file are gated by NODE_TLS_REJECT_UNAUTHORIZED=0 # leaving services hung waiting on never-resolving futures
# (dev-prototype only — never set that in production). #
cosmos-emulator: # Replacement: real Azure Cosmos DB account `cosmos-mywisprai` in
image: mcr.microsoft.com/cosmosdb/linux/azure-cosmos-emulator:latest # `rg-mywisprai` (West US 2), database `bytelyst`. All services pick
ports: # up the connection from `.env` (`COSMOS_ENDPOINT`, `COSMOS_KEY`,
- '8081:8081' # `COSMOS_DATABASE`) via their `env_file: .env` entries below.
- '1234:1234' #
environment: # If you need a local-only stack for offline development, prefer:
- AZURE_COSMOS_EMULATOR_PARTITION_COUNT=10 # 1. Mocked Cosmos in tests (already wired across the workspace), or
- AZURE_COSMOS_EMULATOR_ENABLE_DATA_PERSISTENCE=false # 2. A scoped Cosmos account on a free Azure subscription with a
- ENABLE_EXPLORER=true # throwaway database
- GATEWAY_PUBLIC_ENDPOINT=cosmos-emulator # Do NOT resurrect the emulator service block without verifying both
healthcheck: # of the above failure modes have been fixed upstream.
test:
[
'CMD-SHELL',
'curl -sk --max-time 3 https://127.0.0.1:8081/_explorer/emulator.pem > /dev/null',
]
interval: 10s
timeout: 5s
retries: 30
start_period: 60s
restart: unless-stopped
# ── Loki (Log Aggregation) ──────────────────────────────────── # ── Loki (Log Aggregation) ────────────────────────────────────
loki: loki:
@ -179,13 +169,7 @@ services:
environment: environment:
- PORT=4005 - PORT=4005
- PYTHON_SIDECAR_URL=http://localhost:4006 - PYTHON_SIDECAR_URL=http://localhost:4006
# See cosmos-emulator service block: stable image is HTTPS-only with # COSMOS_* come from `.env` (real Cosmos account; see top of file).
# a self-signed cert. Dev-prototype only.
- COSMOS_ENDPOINT=https://cosmos-emulator:8081
- NODE_TLS_REJECT_UNAUTHORIZED=0
depends_on:
cosmos-emulator:
condition: service_healthy
labels: labels:
- 'traefik.enable=true' - 'traefik.enable=true'
- 'traefik.http.routers.extraction.rule=PathPrefix(`/api/extract`) || PathPrefix(`/api/tasks`)' - 'traefik.http.routers.extraction.rule=PathPrefix(`/api/extract`) || PathPrefix(`/api/tasks`)'
@ -249,14 +233,10 @@ services:
- PORT=4009 - PORT=4009
- NODE_ENV=development - NODE_ENV=development
- PRODUCT_ID=clawcowork - PRODUCT_ID=clawcowork
- COSMOS_ENDPOINT=https://cosmos-emulator:8081 # COSMOS_* come from `.env` (real Cosmos account; see top of file).
# Stable emulator's self-signed cert — dev-prototype only.
- NODE_TLS_REJECT_UNAUTHORIZED=0
- PLATFORM_SERVICE_URL=http://platform-service:4003 - PLATFORM_SERVICE_URL=http://platform-service:4003
- EXTRACTION_SERVICE_URL=http://extraction-service:4005 - EXTRACTION_SERVICE_URL=http://extraction-service:4005
depends_on: depends_on:
cosmos-emulator:
condition: service_healthy
platform-service: platform-service:
condition: service_healthy condition: service_healthy
labels: labels:

View File

@ -77,6 +77,19 @@ export class IpcBridge {
this.log.error(`IPC child process error: ${err.message}`); this.log.error(`IPC child process error: ${err.message}`);
}); });
// Without these listeners, an EPIPE on stdin (child died mid-write) or
// a stdout/stderr error during teardown becomes an unhandled stream
// error and crashes the test runner / parent process.
this.child.stdin?.on('error', err => {
this.log.error(`IPC child stdin error: ${err.message}`);
});
this.child.stdout?.on('error', err => {
this.log.error(`IPC child stdout error: ${err.message}`);
});
this.child.stderr?.on('error', err => {
this.log.error(`IPC child stderr error: ${err.message}`);
});
this.child.on('exit', (code, signal) => { this.child.on('exit', (code, signal) => {
this.log.info(`IPC child process exited: code=${code} signal=${signal}`); this.log.info(`IPC child process exited: code=${code} signal=${signal}`);
this.rejectAllPending(new Error(`IPC child process exited (code=${code})`)); this.rejectAllPending(new Error(`IPC child process exited (code=${code})`));