diff --git a/.env.example b/.env.example index 9db2ea25..226ed6a9 100644 --- a/.env.example +++ b/.env.example @@ -99,3 +99,15 @@ RUST_RUNTIME_TIMEOUT_MS=300000 OLLAMA_URL=http://localhost:11434/v1 OLLAMA_MODELS= FEATURE_FLAGS_ENABLED=true + +# ── Fleet ops/observability ─────────────────────────────────── +# Bearer token Prometheus uses to scrape GET /api/fleet/metrics/prom. Must match +# the `credentials` in services/monitoring/prometheus/prometheus.yml. When unset, +# the endpoint requires an admin JWT instead (so it is never world-readable). +FLEET_METRICS_TOKEN=changeme-fleet-metrics-token +# Fleet feature flags (default OFF): cost/latency routing, per-engine breaker, +# per-product/-engine budget enforcement, and multi-tenant access enforcement. +FLEET_COST_ROUTING= +FLEET_ENGINE_BREAKER= +FLEET_BUDGETS= +FLEET_TENANT_ENFORCEMENT= diff --git a/docker-compose.yml b/docker-compose.yml index 4ebaacc2..cf2f37f7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -96,6 +96,23 @@ services: timeout: 5s retries: 3 + # ── Prometheus (fleet + infra metrics scrape) ───────────────── + prometheus: + image: prom/prometheus:v3.1.0 + ports: + - '9090:9090' + volumes: + - ./services/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + restart: unless-stopped + healthcheck: + test: ['CMD', 'wget', '-q', '--spider', 'http://127.0.0.1:9090/-/ready'] + interval: 15s + timeout: 5s + retries: 3 + # ── API Gateway (Traefik) ─────────────────────────────────── gateway: image: traefik:v3.3 @@ -282,3 +299,4 @@ volumes: azurite-data: loki-data: grafana-data: + prometheus-data: diff --git a/services/monitoring/grafana/dashboards/fleet-overview.json b/services/monitoring/grafana/dashboards/fleet-overview.json new file mode 100644 index 00000000..b0f577ee --- /dev/null +++ b/services/monitoring/grafana/dashboards/fleet-overview.json @@ -0,0 +1,184 @@ +{ + 
"annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "title": "Fleet Overview", + "tags": ["fleet", "gigafactory"], + "uid": "fleet-overview", + "schemaVersion": 38, + "version": 1, + "refresh": "30s", + "time": { "from": "now-6h", "to": "now" }, + "timepicker": { "refresh_intervals": ["15s", "30s", "1m", "5m", "15m", "1h"] }, + "templating": { + "list": [ + { + "name": "product", + "type": "query", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "query": "label_values(fleet_queue_depth, product)", + "refresh": 2, + "includeAll": true, + "multi": true, + "current": { "text": "All", "value": "$__all" } + } + ] + }, + "panels": [ + { + "title": "Open circuit breakers", + "type": "stat", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + } + } + }, + "options": { + "colorMode": "value", + "graphMode": "none", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "targets": [{ "refId": "A", "expr": "fleet_engine_breaker_open_count" }] + }, + { + "title": "Dead-letter jobs", + "type": "stat", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 } + ] + } + } + }, + "options": { + "colorMode": "value", + "graphMode": "none", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "targets": [ + { + "refId": "A", + "expr": "sum(fleet_jobs_by_stage{product=~\"$product\",stage=\"dead_letter\"})" + } + ] + }, + { + "title": "Stale factories", + "type": "stat", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 
4, "w": 6, "x": 12, "y": 0 }, + "options": { + "colorMode": "value", + "graphMode": "none", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "targets": [{ "refId": "A", "expr": "sum(fleet_factories_stale{product=~\"$product\"})" }] + }, + { + "title": "Active alerts", + "type": "stat", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 }, + "options": { + "colorMode": "value", + "graphMode": "none", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "targets": [ + { "refId": "A", "expr": "count(fleet_alert_active{product=~\"$product\"}) or vector(0)" } + ] + }, + { + "title": "Queue depth", + "type": "timeseries", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, + "fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "fillOpacity": 10 } } }, + "targets": [ + { + "refId": "A", + "expr": "fleet_queue_depth{product=~\"$product\"}", + "legendFormat": "{{product}}" + } + ] + }, + { + "title": "Seat utilization %", + "type": "timeseries", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, + "fieldConfig": { + "defaults": { "unit": "percent", "custom": { "drawStyle": "line", "fillOpacity": 10 } } + }, + "targets": [ + { + "refId": "A", + "expr": "fleet_utilization_pct{product=~\"$product\"}", + "legendFormat": "{{product}}" + } + ] + }, + { + "title": "Budget: spent vs ceiling (USD)", + "type": "timeseries", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, + "fieldConfig": { + "defaults": { "unit": "currencyUSD", "custom": { "drawStyle": "line", "fillOpacity": 10 } } + }, + "targets": [ + { + "refId": "A", + "expr": "fleet_budget_spent_usd{product=~\"$product\"}", + "legendFormat": "{{product}} spent" + }, + { + "refId": "B", + "expr": "fleet_budget_ceiling_usd{product=~\"$product\"}", + "legendFormat": 
"{{product}} ceiling" + }, + { + "refId": "C", + "expr": "fleet_budget_projected_usd{product=~\"$product\"}", + "legendFormat": "{{product}} projected" + } + ] + }, + { + "title": "Reaper reclaims (rate/5m)", + "type": "timeseries", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, + "fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "fillOpacity": 10 } } }, + "targets": [ + { + "refId": "A", + "expr": "rate(fleet_reaper_expired_reclaimed_total[5m])", + "legendFormat": "expired" + }, + { + "refId": "B", + "expr": "rate(fleet_reaper_stale_reclaimed_total[5m])", + "legendFormat": "stale" + } + ] + } + ] +} diff --git a/services/monitoring/grafana/provisioning/datasources/prometheus.yml b/services/monitoring/grafana/provisioning/datasources/prometheus.yml index c59df9f4..ff14e695 100644 --- a/services/monitoring/grafana/provisioning/datasources/prometheus.yml +++ b/services/monitoring/grafana/provisioning/datasources/prometheus.yml @@ -3,6 +3,7 @@ apiVersion: 1 datasources: - name: Prometheus type: prometheus + uid: prometheus access: proxy url: http://prometheus:9090 editable: false diff --git a/services/monitoring/prometheus/prometheus.yml b/services/monitoring/prometheus/prometheus.yml index 3463ddcf..580395d4 100644 --- a/services/monitoring/prometheus/prometheus.yml +++ b/services/monitoring/prometheus/prometheus.yml @@ -8,6 +8,21 @@ scrape_configs: - targets: - prometheus:9090 + # Fleet coordinator metrics (queue depth, factory health, reaper, breakers, + # budgets). The endpoint requires a bearer token — set `credentials` below to + # the same value as platform-service's FLEET_METRICS_TOKEN (.env). The default + # here is a non-secret placeholder for the local prototype; change it for any + # shared/remote deployment. 
+ - job_name: platform-service-fleet + metrics_path: /api/fleet/metrics/prom + scheme: http + authorization: + type: Bearer + credentials: changeme-fleet-metrics-token + static_configs: + - targets: + - platform-service:4003 + - job_name: node-exporter static_configs: - targets: diff --git a/services/platform-service/src/modules/fleet/prometheus.test.ts b/services/platform-service/src/modules/fleet/prometheus.test.ts new file mode 100644 index 00000000..e390874f --- /dev/null +++ b/services/platform-service/src/modules/fleet/prometheus.test.ts @@ -0,0 +1,180 @@ +/** + * Fleet Prometheus exposition (§4) — pure renderer unit tests. + */ + +import { describe, it, expect, afterEach } from 'vitest'; +import { renderFleetMetricsProm, scrapeTokenMatches, type FleetPromInput } from './prometheus.js'; +import type { FleetMetrics } from './coordinator.js'; +import type { ReaperStats } from './reaper.js'; + +/** Build a FleetMetrics fixture for product 'lysnrai'; top-level fields in `over` replace the defaults. */ +function metrics(over: Partial<FleetMetrics> = {}): FleetMetrics { + return { + productId: 'lysnrai', + generatedAt: '2026-06-01T00:00:00.000Z', + jobs: { + total: 3, + byStage: { queued: 2, shipped: 1 }, + queueDepth: 2, + blocked: 0, + active: 0, + oldestQueuedAgeMs: 5000, + }, + factories: { + total: 2, + live: 1, + stale: 1, + byHealth: { ok: 1, degraded: 0, down: 0 }, + seatsUsed: 1, + seatsTotal: 4, + utilizationPct: 25, + }, + budget: null, + alerts: [], + ...over, + } as FleetMetrics; +} + +const reaper: ReaperStats = { + running: true, + startedAt: null, + lastReclaimAt: null, + lastSweepAt: null, + totals: { + expiredReclaimed: 5, + staleReclaimed: 2, + leasesDeleted: 10, + factoriesDeleted: 0, + tokensDeleted: 0, + jobsDeleted: 3, + runsDeleted: 0, + eventsDeleted: 0, + artifactsDeleted: 0, + }, +}; + +const base: FleetPromInput = { + products: [{ productId: 'lysnrai', metrics: metrics() }], + reaper, + breakers: [], +}; + +describe('renderFleetMetricsProm', () => { + it('emits HELP/TYPE headers and product-labelled gauges', () => { + const out = 
renderFleetMetricsProm(base); + expect(out).toContain('# TYPE fleet_queue_depth gauge'); + expect(out).toContain('fleet_queue_depth{product="lysnrai"} 2'); + expect(out).toContain('fleet_utilization_pct{product="lysnrai"} 25'); + expect(out).toContain('fleet_jobs_by_stage{product="lysnrai",stage="queued"} 2'); + }); + + it('emits global reaper counters and a running gauge', () => { + const out = renderFleetMetricsProm(base); + expect(out).toContain('# TYPE fleet_reaper_expired_reclaimed_total counter'); + expect(out).toContain('fleet_reaper_expired_reclaimed_total 5'); + expect(out).toContain('fleet_reaper_jobs_deleted_total 3'); + expect(out).toContain('fleet_reaper_running 1'); + }); + + it('emits one alert series per active alert, labelled by code + level', () => { + const out = renderFleetMetricsProm({ + ...base, + products: [ + { + productId: 'lysnrai', + metrics: metrics({ + alerts: [{ level: 'warning', code: 'dead_letter', message: 'x' }], + }), + }, + ], + }); + expect(out).toContain( + 'fleet_alert_active{product="lysnrai",code="dead_letter",level="warning"} 1' + ); + }); + + it('emits budget gauges only when a budget is configured', () => { + const withBudget = renderFleetMetricsProm({ + ...base, + products: [ + { + productId: 'lysnrai', + metrics: metrics({ + budget: { + ceilingUsd: 100, + spentUsd: 40, + status: 'active', + window: 'monthly', + projectedUsd: 300, + engines: [], + }, + }), + }, + ], + }); + expect(withBudget).toContain('fleet_budget_spent_usd{product="lysnrai"} 40'); + expect(withBudget).toContain('fleet_budget_projected_usd{product="lysnrai"} 300'); + // no budget ⇒ no budget series + expect(renderFleetMetricsProm(base)).not.toContain('fleet_budget_spent_usd'); + }); + + it('counts and labels only tripped breakers', () => { + const out = renderFleetMetricsProm({ + ...base, + breakers: [ + { + factoryId: 'fac_1', + engine: 'codex', + state: 'OPEN', + failureCount: 3, + lastFailureAt: null, + }, + { + factoryId: 'fac_1', + engine: 
'devin', + state: 'CLOSED', + failureCount: 0, + lastFailureAt: null, + }, + ], + }); + expect(out).toContain( + 'fleet_engine_breaker_open{factory="fac_1",engine="codex",state="OPEN"} 1' + ); + expect(out).not.toContain('engine="devin"'); + expect(out).toContain('fleet_engine_breaker_open_count 1'); + }); + + it('escapes special characters in label values', () => { + const out = renderFleetMetricsProm({ + ...base, + breakers: [ + { + factoryId: 'fac"x', + engine: 'a\\b', + state: 'HALF_OPEN', + failureCount: 1, + lastFailureAt: null, + }, + ], + }); + expect(out).toContain('factory="fac\\"x",engine="a\\\\b"'); + }); +}); + +describe('scrapeTokenMatches', () => { + afterEach(() => { + delete process.env.FLEET_METRICS_TOKEN; + }); + + it('is false when no token is configured (forces admin-auth fallback)', () => { + expect(scrapeTokenMatches('Bearer anything')).toBe(false); + }); + + it('matches a correct bearer token and rejects a wrong / missing one', () => { + process.env.FLEET_METRICS_TOKEN = 'secret'; + expect(scrapeTokenMatches('Bearer secret')).toBe(true); + expect(scrapeTokenMatches('Bearer nope')).toBe(false); + expect(scrapeTokenMatches(undefined)).toBe(false); + }); +}); diff --git a/services/platform-service/src/modules/fleet/prometheus.ts b/services/platform-service/src/modules/fleet/prometheus.ts new file mode 100644 index 00000000..e9fe7f5f --- /dev/null +++ b/services/platform-service/src/modules/fleet/prometheus.ts @@ -0,0 +1,218 @@ +/** + * Prometheus exposition for fleet metrics (§4 ops export). + * + * Renders the per-product `FleetMetrics` + the process-wide reaper/GC counters + + * the engine circuit-breaker snapshot into the Prometheus text format + * (`text/plain; version=0.0.4`). PURE + synchronous — the route does the I/O + * (scan products, compute metrics) and hands the snapshot here, which keeps the + * formatting fully unit-testable. 
+ */ + +import { timingSafeEqual } from 'node:crypto'; + +import type { FleetMetrics } from './coordinator.js'; +import type { ReaperStats } from './reaper.js'; +import type { EngineBreakerEntry } from './engine-breaker.js'; + +/** One product's computed metrics, paired with its id for labelling. */ +export interface ProductMetrics { + productId: string; + metrics: FleetMetrics; +} + +/** Everything the renderer needs: per-product metrics, reaper stats and breaker entries. */ +export interface FleetPromInput { + products: ProductMetrics[]; + reaper: ReaperStats; + breakers: EngineBreakerEntry[]; +} + +/** Escape a Prometheus label value (backslash, double-quote, newline). */ +function esc(v: string): string { + return v.replace(/\\/g, '\\\\').replace(/"/g, '\\"').replace(/\n/g, '\\n'); +} + +/** Render one labelled series line. `labels` is an ordered [key,value][]. */ +function line(name: string, labels: [string, string][], value: number): string { + if (labels.length === 0) return `${name} ${value}`; + const inner = labels.map(([k, v]) => `${k}="${esc(v)}"`).join(','); + return `${name}{${inner}} ${value}`; +} + +/** One metric family: the HELP/TYPE header fields plus its rendered sample rows. */ +interface Metric { + name: string; + help: string; + type: 'gauge' | 'counter'; + rows: string[]; +} + +/** + * Render the full fleet exposition. Deterministic ordering (products as given, + * then global counters) so output diffs are stable and testable. 
+ * + * @param input Per-product metrics plus the process-wide reaper stats and breaker entries. + * @returns The exposition text: HELP/TYPE headers followed by each family's rows, newline-terminated. + */ +export function renderFleetMetricsProm(input: FleetPromInput): string { + // Metric families keyed by name; `def` registers a family the first time it is used. + const m: Record<string, Metric> = {}; + const def = (name: string, type: Metric['type'], help: string): Metric => + (m[name] ??= { name, help, type, rows: [] }); + + for (const { productId, metrics } of input.products) { + const p: [string, string][] = [['product', productId]]; + def('fleet_jobs_total', 'gauge', 'Total jobs for the product').rows.push( + line('fleet_jobs_total', p, metrics.jobs.total) + ); + def('fleet_queue_depth', 'gauge', 'Jobs currently queued').rows.push( + line('fleet_queue_depth', p, metrics.jobs.queueDepth) + ); + def('fleet_jobs_blocked', 'gauge', 'Jobs blocked on unmet dependencies').rows.push( + line('fleet_jobs_blocked', p, metrics.jobs.blocked) + ); + def('fleet_jobs_active', 'gauge', 'Jobs in an active (claimed/running) stage').rows.push( + line('fleet_jobs_active', p, metrics.jobs.active) + ); + def('fleet_oldest_queued_age_ms', 'gauge', 'Age of the oldest queued job (ms)').rows.push( + line('fleet_oldest_queued_age_ms', p, metrics.jobs.oldestQueuedAgeMs ?? 0) + ); + // Per-stage breakdown of the job lifecycle (one gauge series per stage). 
+ const stage = def('fleet_jobs_by_stage', 'gauge', 'Jobs by lifecycle stage'); + for (const [s, n] of Object.entries(metrics.jobs.byStage)) { + stage.rows.push( + line( + 'fleet_jobs_by_stage', + [ + ['product', productId], + ['stage', s], + ], + n + ) + ); + } + def('fleet_factories_total', 'gauge', 'Registered factories').rows.push( + line('fleet_factories_total', p, metrics.factories.total) + ); + def('fleet_factories_live', 'gauge', 'Factories seen within the stale window').rows.push( + line('fleet_factories_live', p, metrics.factories.live) + ); + def('fleet_factories_stale', 'gauge', 'Factories past the heartbeat stale window').rows.push( + line('fleet_factories_stale', p, metrics.factories.stale) + ); + def('fleet_seats_used', 'gauge', 'Occupied factory seats').rows.push( + line('fleet_seats_used', p, metrics.factories.seatsUsed) + ); + def('fleet_seats_total', 'gauge', 'Total advertised factory seats').rows.push( + line('fleet_seats_total', p, metrics.factories.seatsTotal) + ); + def('fleet_utilization_pct', 'gauge', 'Seat utilization percentage').rows.push( + line('fleet_utilization_pct', p, metrics.factories.utilizationPct) + ); + + // One active-alert series per alert code (value 1), labelled by level. 
+ const alert = def('fleet_alert_active', 'gauge', 'Active fleet alert (1 = firing)'); + for (const a of metrics.alerts) { + alert.rows.push( + line( + 'fleet_alert_active', + [ + ['product', productId], + ['code', a.code], + ['level', a.level], + ], + 1 + ) + ); + } + + if (metrics.budget) { + def('fleet_budget_spent_usd', 'gauge', 'Budget spent this window (USD)').rows.push( + line('fleet_budget_spent_usd', p, metrics.budget.spentUsd) + ); + def('fleet_budget_ceiling_usd', 'gauge', 'Budget ceiling (USD)').rows.push( + line('fleet_budget_ceiling_usd', p, metrics.budget.ceilingUsd) + ); + if (metrics.budget.projectedUsd !== null) { + def('fleet_budget_projected_usd', 'gauge', 'Projected end-of-window spend (USD)').rows.push( + line('fleet_budget_projected_usd', p, metrics.budget.projectedUsd) + ); + } + } + } + + // ── Process-wide reaper / GC counters ── + const r = input.reaper.totals; + const rc = (name: string, help: string, value: number): void => { + def(name, 'counter', help).rows.push(line(name, [], value)); + }; + rc( + 'fleet_reaper_expired_reclaimed_total', + 'Jobs reclaimed from expired leases', + r.expiredReclaimed + ); + rc('fleet_reaper_stale_reclaimed_total', 'Jobs reclaimed from stale factories', r.staleReclaimed); + rc('fleet_reaper_leases_deleted_total', 'Finished leases garbage-collected', r.leasesDeleted); + rc('fleet_reaper_jobs_deleted_total', 'Terminal jobs garbage-collected', r.jobsDeleted); + def('fleet_reaper_running', 'gauge', 'Reaper loop running (1/0)').rows.push( + line('fleet_reaper_running', [], input.reaper.running ? 
1 : 0) + ); + + // ── Engine circuit breakers (process-wide) ── + const open = def( + 'fleet_engine_breaker_open', + 'gauge', + 'Engine breaker tripped (1 = OPEN/HALF_OPEN)' + ); + let openCount = 0; + for (const b of input.breakers) { + if (b.state === 'CLOSED') continue; + openCount += 1; + open.rows.push( + line( + 'fleet_engine_breaker_open', + [ + ['factory', b.factoryId], + ['engine', b.engine], + ['state', b.state], + ], + 1 + ) + ); + } + def( + 'fleet_engine_breaker_open_count', + 'gauge', + 'Number of tripped (factory,engine) breakers' + ).rows.push(line('fleet_engine_breaker_open_count', [], openCount)); + + const out: string[] = []; + for (const metric of Object.values(m)) { + out.push(`# HELP ${metric.name} ${metric.help}`); + out.push(`# TYPE ${metric.name} ${metric.type}`); + out.push(...metric.rows); + } + out.push(''); + return out.join('\n'); +} + +/** + * Authorize a metrics scrape. When `FLEET_METRICS_TOKEN` is set, a matching + * `Authorization: Bearer <token>` is accepted (the Prometheus scrape path). + * Returns false when the token is unset (caller falls back to admin auth) or + * the bearer does not match — so the endpoint is never world-readable by default. + * The bearer is compared in constant time so response timing cannot leak the token. + * + * @param authHeader Raw `Authorization` header value, or undefined when absent. + * @returns true only when a token is configured and the presented bearer equals it. + */ +export function scrapeTokenMatches(authHeader: string | undefined): boolean { + const token = (process.env.FLEET_METRICS_TOKEN ?? 
'').trim(); + if (!token || !authHeader) return false; + const presented = /^Bearer\s+(.+)$/i.exec(authHeader)?.[1]; + if (presented === undefined) return false; + const expected = Buffer.from(token, 'utf8'); + const actual = Buffer.from(presented, 'utf8'); + // timingSafeEqual throws on unequal lengths, so reject a length mismatch first. + return actual.length === expected.length && timingSafeEqual(actual, expected); +} diff --git a/services/platform-service/src/modules/fleet/routes.test.ts b/services/platform-service/src/modules/fleet/routes.test.ts index 420764d6..2a3df8ef 100644 --- a/services/platform-service/src/modules/fleet/routes.test.ts +++ b/services/platform-service/src/modules/fleet/routes.test.ts @@ -116,6 +116,17 @@ describe('fleetRoutes', () => { expect(JSON.parse(metrics.body).jobs.byStage.shipped).toBe(1); }); + it('GET /fleet/metrics/prom renders Prometheus exposition (admin)', async () => { + const app = await buildApp(); + await submit(app, { idempotencyKey: 'prom-1', bodyMd: '# task' }); + const res = await app.inject({ method: 'GET', url: '/api/fleet/metrics/prom' }); + expect(res.statusCode).toBe(200); + expect(res.headers['content-type']).toContain('text/plain; version=0.0.4'); + expect(res.body).toContain('# TYPE fleet_queue_depth gauge'); + expect(res.body).toContain('fleet_reaper_running'); + expect(res.body).toContain('fleet_engine_breaker_open_count'); + }); + it('release with insights records run cost/tokens (factory reporting)', async () => { const app = await buildApp(); const sub = await submit(app, { idempotencyKey: 'ins-1', bodyMd: '# task' }); diff --git a/services/platform-service/src/modules/fleet/routes.ts b/services/platform-service/src/modules/fleet/routes.ts index 427f3e21..503305a7 100644 --- a/services/platform-service/src/modules/fleet/routes.ts +++ b/services/platform-service/src/modules/fleet/routes.ts @@ -35,6 +35,8 @@ import * as enrollment from './enrollment.js'; import * as trackerBridge from './tracker-bridge.js'; import { getReaperStats } from './reaper.js'; import { getEngineBreakerSnapshot } from './engine-breaker.js'; +import { renderFleetMetricsProm, scrapeTokenMatches } from './prometheus.js'; +import { getAllProducts } from '../products/cache.js'; import { SubmitJobSchema, 
ListJobsQuerySchema, @@ -455,6 +457,33 @@ export async function fleetRoutes(app: FastifyInstance) { return { ...metrics, reaper: getReaperStats(), engineBreakers: getEngineBreakerSnapshot() }; }); + // ── Prometheus exposition for fleet metrics (§4 ops export) ── + // GLOBAL (all products, labelled) so a single scrape target covers the fleet. + // Auth: a matching `FLEET_METRICS_TOKEN` bearer (the Prometheus scrape path), + // else an admin/super_admin JWT — never world-readable by default. + app.get('/fleet/metrics/prom', async (req, reply) => { + if (!scrapeTokenMatches(req.headers['authorization'])) { + const auth = await extractAuth(req); + if (auth.role !== 'admin' && auth.role !== 'super_admin') { + throw new ForbiddenError('admin role or a metrics scrape token is required'); + } + } + const products = getAllProducts(); + const perProduct = await Promise.all( + products.map(async p => ({ + productId: p.productId, + metrics: await coordinator.fleetMetrics(p.productId), + })) + ); + const body = renderFleetMetricsProm({ + products: perProduct, + reaper: getReaperStats(), + breakers: getEngineBreakerSnapshot(), + }); + reply.type('text/plain; version=0.0.4; charset=utf-8'); + return body; + }); + // ── M0 RU gate: per-product queue version (cheap ~1 RU point read) ── // A polling factory reads this each tick and only runs the expensive claim when // `version` changed since its last attempt. See