feat(fleet): Prometheus metrics export + Grafana dashboard (ops #4)
Exports fleet observability to Prometheus/Grafana (previously JSON-only). - GET /api/fleet/metrics/prom: global, product-labelled Prometheus exposition (queue depth, blocked/active, per-stage histogram, factory health/seats/ utilization, active alerts, budget spent/ceiling/projected) plus process-wide reaper/GC counters and engine circuit-breaker state. Pure renderer (renderFleetMetricsProm) is unit-tested; route auth accepts a FLEET_METRICS_TOKEN bearer (scrape path) or an admin JWT — never world-readable by default. - Infra: add a prometheus container to docker-compose + a platform-service-fleet scrape job; pin the Prometheus Grafana datasource uid; add a provisioned "Fleet Overview" dashboard (breakers, dead-letter, stale factories, alerts, queue depth, utilization, budget burn, reaper rate) with a product template var. - Document FLEET_METRICS_TOKEN + the fleet feature flags in .env.example. No default behavior change: the endpoint is additive and the new container is opt-in via the compose stack. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
This commit is contained in:
parent
acf7c36cda
commit
93d1caf4a2
12
.env.example
12
.env.example
@ -99,3 +99,15 @@ RUST_RUNTIME_TIMEOUT_MS=300000
|
|||||||
OLLAMA_URL=http://localhost:11434/v1
|
OLLAMA_URL=http://localhost:11434/v1
|
||||||
OLLAMA_MODELS=
|
OLLAMA_MODELS=
|
||||||
FEATURE_FLAGS_ENABLED=true
|
FEATURE_FLAGS_ENABLED=true
|
||||||
|
|
||||||
|
# ── Fleet ops/observability ───────────────────────────────────
|
||||||
|
# Bearer token Prometheus uses to scrape GET /api/fleet/metrics/prom. Must match
|
||||||
|
# the `credentials` in services/monitoring/prometheus/prometheus.yml. When unset,
|
||||||
|
# the endpoint requires an admin JWT instead (so it is never world-readable).
|
||||||
|
FLEET_METRICS_TOKEN=changeme-fleet-metrics-token
|
||||||
|
# Fleet feature flags (default OFF): cost/latency routing, per-engine breaker,
|
||||||
|
# per-product/-engine budget enforcement, and multi-tenant access enforcement.
|
||||||
|
FLEET_COST_ROUTING=
|
||||||
|
FLEET_ENGINE_BREAKER=
|
||||||
|
FLEET_BUDGETS=
|
||||||
|
FLEET_TENANT_ENFORCEMENT=
|
||||||
|
|||||||
@ -96,6 +96,23 @@ services:
|
|||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 3
|
retries: 3
|
||||||
|
|
||||||
|
# ── Prometheus (fleet + infra metrics scrape) ─────────────────
|
||||||
|
prometheus:
|
||||||
|
image: prom/prometheus:v3.1.0
|
||||||
|
ports:
|
||||||
|
- '9090:9090'
|
||||||
|
volumes:
|
||||||
|
- ./services/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||||
|
- prometheus-data:/prometheus
|
||||||
|
command:
|
||||||
|
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||||
|
restart: unless-stopped
|
||||||
|
healthcheck:
|
||||||
|
test: ['CMD', 'wget', '-q', '--spider', 'http://127.0.0.1:9090/-/ready']
|
||||||
|
interval: 15s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
# ── API Gateway (Traefik) ───────────────────────────────────
|
# ── API Gateway (Traefik) ───────────────────────────────────
|
||||||
gateway:
|
gateway:
|
||||||
image: traefik:v3.3
|
image: traefik:v3.3
|
||||||
@ -282,3 +299,4 @@ volumes:
|
|||||||
azurite-data:
|
azurite-data:
|
||||||
loki-data:
|
loki-data:
|
||||||
grafana-data:
|
grafana-data:
|
||||||
|
prometheus-data:
|
||||||
|
|||||||
184
services/monitoring/grafana/dashboards/fleet-overview.json
Normal file
184
services/monitoring/grafana/dashboards/fleet-overview.json
Normal file
@ -0,0 +1,184 @@
|
|||||||
|
{
|
||||||
|
"annotations": { "list": [] },
|
||||||
|
"editable": true,
|
||||||
|
"fiscalYearStartMonth": 0,
|
||||||
|
"graphTooltip": 1,
|
||||||
|
"id": null,
|
||||||
|
"links": [],
|
||||||
|
"title": "Fleet Overview",
|
||||||
|
"tags": ["fleet", "gigafactory"],
|
||||||
|
"uid": "fleet-overview",
|
||||||
|
"schemaVersion": 38,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-6h", "to": "now" },
|
||||||
|
"timepicker": { "refresh_intervals": ["15s", "30s", "1m", "5m", "15m", "1h"] },
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "product",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||||
|
"query": "label_values(fleet_queue_depth, product)",
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": true,
|
||||||
|
"current": { "text": "All", "value": "$__all" }
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"title": "Open circuit breakers",
|
||||||
|
"type": "stat",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||||
|
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] }
|
||||||
|
},
|
||||||
|
"targets": [{ "refId": "A", "expr": "fleet_engine_breaker_open_count" }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Dead-letter jobs",
|
||||||
|
"type": "stat",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||||
|
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] }
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"expr": "sum(fleet_jobs_by_stage{product=~\"$product\",stage=\"dead_letter\"})"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Stale factories",
|
||||||
|
"type": "stat",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||||
|
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
|
||||||
|
"options": {
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] }
|
||||||
|
},
|
||||||
|
"targets": [{ "refId": "A", "expr": "sum(fleet_factories_stale{product=~\"$product\"})" }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Active alerts",
|
||||||
|
"type": "stat",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||||
|
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
|
||||||
|
"options": {
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] }
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{ "refId": "A", "expr": "count(fleet_alert_active{product=~\"$product\"}) or vector(0)" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Queue depth",
|
||||||
|
"type": "timeseries",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
|
||||||
|
"fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "fillOpacity": 10 } } },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"expr": "fleet_queue_depth{product=~\"$product\"}",
|
||||||
|
"legendFormat": "{{product}}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Seat utilization %",
|
||||||
|
"type": "timeseries",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "unit": "percent", "custom": { "drawStyle": "line", "fillOpacity": 10 } }
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"expr": "fleet_utilization_pct{product=~\"$product\"}",
|
||||||
|
"legendFormat": "{{product}}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Budget: spent vs ceiling (USD)",
|
||||||
|
"type": "timeseries",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "unit": "currencyUSD", "custom": { "drawStyle": "line", "fillOpacity": 10 } }
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"expr": "fleet_budget_spent_usd{product=~\"$product\"}",
|
||||||
|
"legendFormat": "{{product}} spent"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"refId": "B",
|
||||||
|
"expr": "fleet_budget_ceiling_usd{product=~\"$product\"}",
|
||||||
|
"legendFormat": "{{product}} ceiling"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"refId": "C",
|
||||||
|
"expr": "fleet_budget_projected_usd{product=~\"$product\"}",
|
||||||
|
"legendFormat": "{{product}} projected"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Reaper reclaims (rate/5m)",
|
||||||
|
"type": "timeseries",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
|
||||||
|
"fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "fillOpacity": 10 } } },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"expr": "rate(fleet_reaper_expired_reclaimed_total[5m])",
|
||||||
|
"legendFormat": "expired"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"refId": "B",
|
||||||
|
"expr": "rate(fleet_reaper_stale_reclaimed_total[5m])",
|
||||||
|
"legendFormat": "stale"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@ -3,6 +3,7 @@ apiVersion: 1
|
|||||||
datasources:
|
datasources:
|
||||||
- name: Prometheus
|
- name: Prometheus
|
||||||
type: prometheus
|
type: prometheus
|
||||||
|
uid: prometheus
|
||||||
access: proxy
|
access: proxy
|
||||||
url: http://prometheus:9090
|
url: http://prometheus:9090
|
||||||
editable: false
|
editable: false
|
||||||
|
|||||||
@ -8,6 +8,21 @@ scrape_configs:
|
|||||||
- targets:
|
- targets:
|
||||||
- prometheus:9090
|
- prometheus:9090
|
||||||
|
|
||||||
|
# Fleet coordinator metrics (queue depth, factory health, reaper, breakers,
|
||||||
|
# budgets). The endpoint requires a bearer token — set `credentials` below to
|
||||||
|
# the same value as platform-service's FLEET_METRICS_TOKEN (.env). The default
|
||||||
|
# here is a non-secret placeholder for the local prototype; change it for any
|
||||||
|
# shared/remote deployment.
|
||||||
|
- job_name: platform-service-fleet
|
||||||
|
metrics_path: /api/fleet/metrics/prom
|
||||||
|
scheme: http
|
||||||
|
authorization:
|
||||||
|
type: Bearer
|
||||||
|
credentials: changeme-fleet-metrics-token
|
||||||
|
static_configs:
|
||||||
|
- targets:
|
||||||
|
- platform-service:4003
|
||||||
|
|
||||||
- job_name: node-exporter
|
- job_name: node-exporter
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets:
|
- targets:
|
||||||
|
|||||||
179
services/platform-service/src/modules/fleet/prometheus.test.ts
Normal file
179
services/platform-service/src/modules/fleet/prometheus.test.ts
Normal file
@ -0,0 +1,179 @@
|
|||||||
|
/**
|
||||||
|
* Fleet Prometheus exposition (§4) — pure renderer unit tests.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { describe, it, expect, afterEach } from 'vitest';
|
||||||
|
import { renderFleetMetricsProm, scrapeTokenMatches, type FleetPromInput } from './prometheus.js';
|
||||||
|
import type { FleetMetrics } from './coordinator.js';
|
||||||
|
import type { ReaperStats } from './reaper.js';
|
||||||
|
|
||||||
|
function metrics(over: Partial<FleetMetrics> = {}): FleetMetrics {
|
||||||
|
return {
|
||||||
|
productId: 'lysnrai',
|
||||||
|
generatedAt: '2026-06-01T00:00:00.000Z',
|
||||||
|
jobs: {
|
||||||
|
total: 3,
|
||||||
|
byStage: { queued: 2, shipped: 1 },
|
||||||
|
queueDepth: 2,
|
||||||
|
blocked: 0,
|
||||||
|
active: 0,
|
||||||
|
oldestQueuedAgeMs: 5000,
|
||||||
|
},
|
||||||
|
factories: {
|
||||||
|
total: 2,
|
||||||
|
live: 1,
|
||||||
|
stale: 1,
|
||||||
|
byHealth: { ok: 1, degraded: 0, down: 0 },
|
||||||
|
seatsUsed: 1,
|
||||||
|
seatsTotal: 4,
|
||||||
|
utilizationPct: 25,
|
||||||
|
},
|
||||||
|
budget: null,
|
||||||
|
alerts: [],
|
||||||
|
...over,
|
||||||
|
} as FleetMetrics;
|
||||||
|
}
|
||||||
|
|
||||||
|
const reaper: ReaperStats = {
|
||||||
|
running: true,
|
||||||
|
startedAt: null,
|
||||||
|
lastReclaimAt: null,
|
||||||
|
lastSweepAt: null,
|
||||||
|
totals: {
|
||||||
|
expiredReclaimed: 5,
|
||||||
|
staleReclaimed: 2,
|
||||||
|
leasesDeleted: 10,
|
||||||
|
factoriesDeleted: 0,
|
||||||
|
tokensDeleted: 0,
|
||||||
|
jobsDeleted: 3,
|
||||||
|
runsDeleted: 0,
|
||||||
|
eventsDeleted: 0,
|
||||||
|
artifactsDeleted: 0,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
const base: FleetPromInput = {
|
||||||
|
products: [{ productId: 'lysnrai', metrics: metrics() }],
|
||||||
|
reaper,
|
||||||
|
breakers: [],
|
||||||
|
};
|
||||||
|
|
||||||
|
describe('renderFleetMetricsProm', () => {
|
||||||
|
it('emits HELP/TYPE headers and product-labelled gauges', () => {
|
||||||
|
const out = renderFleetMetricsProm(base);
|
||||||
|
expect(out).toContain('# TYPE fleet_queue_depth gauge');
|
||||||
|
expect(out).toContain('fleet_queue_depth{product="lysnrai"} 2');
|
||||||
|
expect(out).toContain('fleet_utilization_pct{product="lysnrai"} 25');
|
||||||
|
expect(out).toContain('fleet_jobs_by_stage{product="lysnrai",stage="queued"} 2');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('emits global reaper counters and a running gauge', () => {
|
||||||
|
const out = renderFleetMetricsProm(base);
|
||||||
|
expect(out).toContain('# TYPE fleet_reaper_expired_reclaimed_total counter');
|
||||||
|
expect(out).toContain('fleet_reaper_expired_reclaimed_total 5');
|
||||||
|
expect(out).toContain('fleet_reaper_jobs_deleted_total 3');
|
||||||
|
expect(out).toContain('fleet_reaper_running 1');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('emits one alert series per active alert, labelled by code + level', () => {
|
||||||
|
const out = renderFleetMetricsProm({
|
||||||
|
...base,
|
||||||
|
products: [
|
||||||
|
{
|
||||||
|
productId: 'lysnrai',
|
||||||
|
metrics: metrics({
|
||||||
|
alerts: [{ level: 'warning', code: 'dead_letter', message: 'x' }],
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
],
|
||||||
|
});
|
||||||
|
expect(out).toContain(
|
||||||
|
'fleet_alert_active{product="lysnrai",code="dead_letter",level="warning"} 1'
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('emits budget gauges only when a budget is configured', () => {
|
||||||
|
const withBudget = renderFleetMetricsProm({
|
||||||
|
...base,
|
||||||
|
products: [
|
||||||
|
{
|
||||||
|
productId: 'lysnrai',
|
||||||
|
metrics: metrics({
|
||||||
|
budget: {
|
||||||
|
ceilingUsd: 100,
|
||||||
|
spentUsd: 40,
|
||||||
|
status: 'active',
|
||||||
|
window: 'monthly',
|
||||||
|
projectedUsd: 300,
|
||||||
|
engines: [],
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
],
|
||||||
|
});
|
||||||
|
expect(withBudget).toContain('fleet_budget_spent_usd{product="lysnrai"} 40');
|
||||||
|
expect(withBudget).toContain('fleet_budget_projected_usd{product="lysnrai"} 300');
|
||||||
|
// no budget ⇒ no budget series
|
||||||
|
expect(renderFleetMetricsProm(base)).not.toContain('fleet_budget_spent_usd');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('counts and labels only tripped breakers', () => {
|
||||||
|
const out = renderFleetMetricsProm({
|
||||||
|
...base,
|
||||||
|
breakers: [
|
||||||
|
{
|
||||||
|
factoryId: 'fac_1',
|
||||||
|
engine: 'codex',
|
||||||
|
state: 'OPEN',
|
||||||
|
failureCount: 3,
|
||||||
|
lastFailureAt: null,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
factoryId: 'fac_1',
|
||||||
|
engine: 'devin',
|
||||||
|
state: 'CLOSED',
|
||||||
|
failureCount: 0,
|
||||||
|
lastFailureAt: null,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
});
|
||||||
|
expect(out).toContain(
|
||||||
|
'fleet_engine_breaker_open{factory="fac_1",engine="codex",state="OPEN"} 1'
|
||||||
|
);
|
||||||
|
expect(out).not.toContain('engine="devin"');
|
||||||
|
expect(out).toContain('fleet_engine_breaker_open_count 1');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('escapes special characters in label values', () => {
|
||||||
|
const out = renderFleetMetricsProm({
|
||||||
|
...base,
|
||||||
|
breakers: [
|
||||||
|
{
|
||||||
|
factoryId: 'fac"x',
|
||||||
|
engine: 'a\\b',
|
||||||
|
state: 'HALF_OPEN',
|
||||||
|
failureCount: 1,
|
||||||
|
lastFailureAt: null,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
});
|
||||||
|
expect(out).toContain('factory="fac\\"x",engine="a\\\\b"');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('scrapeTokenMatches', () => {
|
||||||
|
afterEach(() => {
|
||||||
|
delete process.env.FLEET_METRICS_TOKEN;
|
||||||
|
});
|
||||||
|
|
||||||
|
it('is false when no token is configured (forces admin-auth fallback)', () => {
|
||||||
|
expect(scrapeTokenMatches('Bearer anything')).toBe(false);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('matches a correct bearer token and rejects a wrong / missing one', () => {
|
||||||
|
process.env.FLEET_METRICS_TOKEN = 'secret';
|
||||||
|
expect(scrapeTokenMatches('Bearer secret')).toBe(true);
|
||||||
|
expect(scrapeTokenMatches('Bearer nope')).toBe(false);
|
||||||
|
expect(scrapeTokenMatches(undefined)).toBe(false);
|
||||||
|
});
|
||||||
|
});
|
||||||
203
services/platform-service/src/modules/fleet/prometheus.ts
Normal file
203
services/platform-service/src/modules/fleet/prometheus.ts
Normal file
@ -0,0 +1,203 @@
|
|||||||
|
/**
|
||||||
|
* Prometheus exposition for fleet metrics (§4 ops export).
|
||||||
|
*
|
||||||
|
* Renders the per-product `FleetMetrics` + the process-wide reaper/GC counters +
|
||||||
|
* the engine circuit-breaker snapshot into the Prometheus text format
|
||||||
|
* (`text/plain; version=0.0.4`). PURE + synchronous — the route does the I/O
|
||||||
|
* (scan products, compute metrics) and hands the snapshot here, which keeps the
|
||||||
|
* formatting fully unit-testable.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import type { FleetMetrics } from './coordinator.js';
|
||||||
|
import type { ReaperStats } from './reaper.js';
|
||||||
|
import type { EngineBreakerEntry } from './engine-breaker.js';
|
||||||
|
|
||||||
|
/** One product's computed metrics, paired with its id for labelling. */
|
||||||
|
export interface ProductMetrics {
|
||||||
|
productId: string;
|
||||||
|
metrics: FleetMetrics;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface FleetPromInput {
|
||||||
|
products: ProductMetrics[];
|
||||||
|
reaper: ReaperStats;
|
||||||
|
breakers: EngineBreakerEntry[];
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Escape a Prometheus label value (backslash, double-quote, newline). */
|
||||||
|
function esc(v: string): string {
|
||||||
|
return v.replace(/\\/g, '\\\\').replace(/"/g, '\\"').replace(/\n/g, '\\n');
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Render one labelled series line. `labels` is an ordered [key,value][]. */
|
||||||
|
function line(name: string, labels: [string, string][], value: number): string {
|
||||||
|
if (labels.length === 0) return `${name} ${value}`;
|
||||||
|
const inner = labels.map(([k, v]) => `${k}="${esc(v)}"`).join(',');
|
||||||
|
return `${name}{${inner}} ${value}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface Metric {
|
||||||
|
name: string;
|
||||||
|
help: string;
|
||||||
|
type: 'gauge' | 'counter';
|
||||||
|
rows: string[];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Render the full fleet exposition. Deterministic ordering (products as given,
|
||||||
|
* then global counters) so output diffs are stable and testable.
|
||||||
|
*/
|
||||||
|
export function renderFleetMetricsProm(input: FleetPromInput): string {
|
||||||
|
const m: Record<string, Metric> = {};
|
||||||
|
const def = (name: string, type: Metric['type'], help: string): Metric =>
|
||||||
|
(m[name] ??= { name, help, type, rows: [] });
|
||||||
|
|
||||||
|
for (const { productId, metrics } of input.products) {
|
||||||
|
const p: [string, string][] = [['product', productId]];
|
||||||
|
def('fleet_jobs_total', 'gauge', 'Total jobs for the product').rows.push(
|
||||||
|
line('fleet_jobs_total', p, metrics.jobs.total)
|
||||||
|
);
|
||||||
|
def('fleet_queue_depth', 'gauge', 'Jobs currently queued').rows.push(
|
||||||
|
line('fleet_queue_depth', p, metrics.jobs.queueDepth)
|
||||||
|
);
|
||||||
|
def('fleet_jobs_blocked', 'gauge', 'Jobs blocked on unmet dependencies').rows.push(
|
||||||
|
line('fleet_jobs_blocked', p, metrics.jobs.blocked)
|
||||||
|
);
|
||||||
|
def('fleet_jobs_active', 'gauge', 'Jobs in an active (claimed/running) stage').rows.push(
|
||||||
|
line('fleet_jobs_active', p, metrics.jobs.active)
|
||||||
|
);
|
||||||
|
def('fleet_oldest_queued_age_ms', 'gauge', 'Age of the oldest queued job (ms)').rows.push(
|
||||||
|
line('fleet_oldest_queued_age_ms', p, metrics.jobs.oldestQueuedAgeMs ?? 0)
|
||||||
|
);
|
||||||
|
// Per-stage histogram of the job lifecycle.
|
||||||
|
const stage = def('fleet_jobs_by_stage', 'gauge', 'Jobs by lifecycle stage');
|
||||||
|
for (const [s, n] of Object.entries(metrics.jobs.byStage)) {
|
||||||
|
stage.rows.push(
|
||||||
|
line(
|
||||||
|
'fleet_jobs_by_stage',
|
||||||
|
[
|
||||||
|
['product', productId],
|
||||||
|
['stage', s],
|
||||||
|
],
|
||||||
|
n
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
def('fleet_factories_total', 'gauge', 'Registered factories').rows.push(
|
||||||
|
line('fleet_factories_total', p, metrics.factories.total)
|
||||||
|
);
|
||||||
|
def('fleet_factories_live', 'gauge', 'Factories seen within the stale window').rows.push(
|
||||||
|
line('fleet_factories_live', p, metrics.factories.live)
|
||||||
|
);
|
||||||
|
def('fleet_factories_stale', 'gauge', 'Factories past the heartbeat stale window').rows.push(
|
||||||
|
line('fleet_factories_stale', p, metrics.factories.stale)
|
||||||
|
);
|
||||||
|
def('fleet_seats_used', 'gauge', 'Occupied factory seats').rows.push(
|
||||||
|
line('fleet_seats_used', p, metrics.factories.seatsUsed)
|
||||||
|
);
|
||||||
|
def('fleet_seats_total', 'gauge', 'Total advertised factory seats').rows.push(
|
||||||
|
line('fleet_seats_total', p, metrics.factories.seatsTotal)
|
||||||
|
);
|
||||||
|
def('fleet_utilization_pct', 'gauge', 'Seat utilization percentage').rows.push(
|
||||||
|
line('fleet_utilization_pct', p, metrics.factories.utilizationPct)
|
||||||
|
);
|
||||||
|
|
||||||
|
// One active-alert series per alert code (value 1), labelled by level.
|
||||||
|
const alert = def('fleet_alert_active', 'gauge', 'Active fleet alert (1 = firing)');
|
||||||
|
for (const a of metrics.alerts) {
|
||||||
|
alert.rows.push(
|
||||||
|
line(
|
||||||
|
'fleet_alert_active',
|
||||||
|
[
|
||||||
|
['product', productId],
|
||||||
|
['code', a.code],
|
||||||
|
['level', a.level],
|
||||||
|
],
|
||||||
|
1
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (metrics.budget) {
|
||||||
|
def('fleet_budget_spent_usd', 'gauge', 'Budget spent this window (USD)').rows.push(
|
||||||
|
line('fleet_budget_spent_usd', p, metrics.budget.spentUsd)
|
||||||
|
);
|
||||||
|
def('fleet_budget_ceiling_usd', 'gauge', 'Budget ceiling (USD)').rows.push(
|
||||||
|
line('fleet_budget_ceiling_usd', p, metrics.budget.ceilingUsd)
|
||||||
|
);
|
||||||
|
if (metrics.budget.projectedUsd !== null) {
|
||||||
|
def('fleet_budget_projected_usd', 'gauge', 'Projected end-of-window spend (USD)').rows.push(
|
||||||
|
line('fleet_budget_projected_usd', p, metrics.budget.projectedUsd)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Process-wide reaper / GC counters ──
|
||||||
|
const r = input.reaper.totals;
|
||||||
|
const rc = (name: string, help: string, value: number): void => {
|
||||||
|
def(name, 'counter', help).rows.push(line(name, [], value));
|
||||||
|
};
|
||||||
|
rc(
|
||||||
|
'fleet_reaper_expired_reclaimed_total',
|
||||||
|
'Jobs reclaimed from expired leases',
|
||||||
|
r.expiredReclaimed
|
||||||
|
);
|
||||||
|
rc('fleet_reaper_stale_reclaimed_total', 'Jobs reclaimed from stale factories', r.staleReclaimed);
|
||||||
|
rc('fleet_reaper_leases_deleted_total', 'Finished leases garbage-collected', r.leasesDeleted);
|
||||||
|
rc('fleet_reaper_jobs_deleted_total', 'Terminal jobs garbage-collected', r.jobsDeleted);
|
||||||
|
def('fleet_reaper_running', 'gauge', 'Reaper loop running (1/0)').rows.push(
|
||||||
|
line('fleet_reaper_running', [], input.reaper.running ? 1 : 0)
|
||||||
|
);
|
||||||
|
|
||||||
|
// ── Engine circuit breakers (process-wide) ──
|
||||||
|
const open = def(
|
||||||
|
'fleet_engine_breaker_open',
|
||||||
|
'gauge',
|
||||||
|
'Engine breaker tripped (1 = OPEN/HALF_OPEN)'
|
||||||
|
);
|
||||||
|
let openCount = 0;
|
||||||
|
for (const b of input.breakers) {
|
||||||
|
if (b.state === 'CLOSED') continue;
|
||||||
|
openCount += 1;
|
||||||
|
open.rows.push(
|
||||||
|
line(
|
||||||
|
'fleet_engine_breaker_open',
|
||||||
|
[
|
||||||
|
['factory', b.factoryId],
|
||||||
|
['engine', b.engine],
|
||||||
|
['state', b.state],
|
||||||
|
],
|
||||||
|
1
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
def(
|
||||||
|
'fleet_engine_breaker_open_count',
|
||||||
|
'gauge',
|
||||||
|
'Number of tripped (factory,engine) breakers'
|
||||||
|
).rows.push(line('fleet_engine_breaker_open_count', [], openCount));
|
||||||
|
|
||||||
|
const out: string[] = [];
|
||||||
|
for (const metric of Object.values(m)) {
|
||||||
|
out.push(`# HELP ${metric.name} ${metric.help}`);
|
||||||
|
out.push(`# TYPE ${metric.name} ${metric.type}`);
|
||||||
|
out.push(...metric.rows);
|
||||||
|
}
|
||||||
|
out.push('');
|
||||||
|
return out.join('\n');
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Authorize a metrics scrape. When `FLEET_METRICS_TOKEN` is set, a matching
|
||||||
|
* `Authorization: Bearer <token>` is accepted (the Prometheus scrape path).
|
||||||
|
* Returns false when the token is unset (caller falls back to admin auth) or
|
||||||
|
* the bearer does not match — so the endpoint is never world-readable by default.
|
||||||
|
*/
|
||||||
|
export function scrapeTokenMatches(authHeader: string | undefined): boolean {
|
||||||
|
const token = (process.env.FLEET_METRICS_TOKEN ?? '').trim();
|
||||||
|
if (!token) return false;
|
||||||
|
if (!authHeader) return false;
|
||||||
|
const m = authHeader.match(/^Bearer\s+(.+)$/i);
|
||||||
|
return m?.[1] === token;
|
||||||
|
}
|
||||||
@ -116,6 +116,17 @@ describe('fleetRoutes', () => {
|
|||||||
expect(JSON.parse(metrics.body).jobs.byStage.shipped).toBe(1);
|
expect(JSON.parse(metrics.body).jobs.byStage.shipped).toBe(1);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('GET /fleet/metrics/prom renders Prometheus exposition (admin)', async () => {
|
||||||
|
const app = await buildApp();
|
||||||
|
await submit(app, { idempotencyKey: 'prom-1', bodyMd: '# task' });
|
||||||
|
const res = await app.inject({ method: 'GET', url: '/api/fleet/metrics/prom' });
|
||||||
|
expect(res.statusCode).toBe(200);
|
||||||
|
expect(res.headers['content-type']).toContain('text/plain; version=0.0.4');
|
||||||
|
expect(res.body).toContain('# TYPE fleet_queue_depth gauge');
|
||||||
|
expect(res.body).toContain('fleet_reaper_running');
|
||||||
|
expect(res.body).toContain('fleet_engine_breaker_open_count');
|
||||||
|
});
|
||||||
|
|
||||||
it('release with insights records run cost/tokens (factory reporting)', async () => {
|
it('release with insights records run cost/tokens (factory reporting)', async () => {
|
||||||
const app = await buildApp();
|
const app = await buildApp();
|
||||||
const sub = await submit(app, { idempotencyKey: 'ins-1', bodyMd: '# task' });
|
const sub = await submit(app, { idempotencyKey: 'ins-1', bodyMd: '# task' });
|
||||||
|
|||||||
@ -35,6 +35,8 @@ import * as enrollment from './enrollment.js';
|
|||||||
import * as trackerBridge from './tracker-bridge.js';
|
import * as trackerBridge from './tracker-bridge.js';
|
||||||
import { getReaperStats } from './reaper.js';
|
import { getReaperStats } from './reaper.js';
|
||||||
import { getEngineBreakerSnapshot } from './engine-breaker.js';
|
import { getEngineBreakerSnapshot } from './engine-breaker.js';
|
||||||
|
import { renderFleetMetricsProm, scrapeTokenMatches } from './prometheus.js';
|
||||||
|
import { getAllProducts } from '../products/cache.js';
|
||||||
import {
|
import {
|
||||||
SubmitJobSchema,
|
SubmitJobSchema,
|
||||||
ListJobsQuerySchema,
|
ListJobsQuerySchema,
|
||||||
@ -455,6 +457,33 @@ export async function fleetRoutes(app: FastifyInstance) {
|
|||||||
return { ...metrics, reaper: getReaperStats(), engineBreakers: getEngineBreakerSnapshot() };
|
return { ...metrics, reaper: getReaperStats(), engineBreakers: getEngineBreakerSnapshot() };
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// ── Prometheus exposition for fleet metrics (§4 ops export) ──
|
||||||
|
// GLOBAL (all products, labelled) so a single scrape target covers the fleet.
|
||||||
|
// Auth: a matching `FLEET_METRICS_TOKEN` bearer (the Prometheus scrape path),
|
||||||
|
// else an admin/super_admin JWT — never world-readable by default.
|
||||||
|
app.get('/fleet/metrics/prom', async (req, reply) => {
|
||||||
|
if (!scrapeTokenMatches(req.headers['authorization'])) {
|
||||||
|
const auth = await extractAuth(req);
|
||||||
|
if (auth.role !== 'admin' && auth.role !== 'super_admin') {
|
||||||
|
throw new ForbiddenError('admin role or a metrics scrape token is required');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const products = getAllProducts();
|
||||||
|
const perProduct = await Promise.all(
|
||||||
|
products.map(async p => ({
|
||||||
|
productId: p.productId,
|
||||||
|
metrics: await coordinator.fleetMetrics(p.productId),
|
||||||
|
}))
|
||||||
|
);
|
||||||
|
const body = renderFleetMetricsProm({
|
||||||
|
products: perProduct,
|
||||||
|
reaper: getReaperStats(),
|
||||||
|
breakers: getEngineBreakerSnapshot(),
|
||||||
|
});
|
||||||
|
reply.type('text/plain; version=0.0.4; charset=utf-8');
|
||||||
|
return body;
|
||||||
|
});
|
||||||
|
|
||||||
// ── M0 RU gate: per-product queue version (cheap ~1 RU point read) ──
|
// ── M0 RU gate: per-product queue version (cheap ~1 RU point read) ──
|
||||||
// A polling factory reads this each tick and only runs the expensive claim when
|
// A polling factory reads this each tick and only runs the expensive claim when
|
||||||
// `version` changed since its last attempt. See
|
// `version` changed since its last attempt. See
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user