feat(fleet): Prometheus metrics export + Grafana dashboard (ops #4)
Exports fleet observability to Prometheus/Grafana (previously JSON-only). - GET /api/fleet/metrics/prom: global, product-labelled Prometheus exposition (queue depth, blocked/active, per-stage histogram, factory health/seats/ utilization, active alerts, budget spent/ceiling/projected) plus process-wide reaper/GC counters and engine circuit-breaker state. Pure renderer (renderFleetMetricsProm) is unit-tested; route auth accepts a FLEET_METRICS_TOKEN bearer (scrape path) or an admin JWT — never world-readable by default. - Infra: add a prometheus container to docker-compose + a platform-service-fleet scrape job; pin the Prometheus Grafana datasource uid; add a provisioned "Fleet Overview" dashboard (breakers, dead-letter, stale factories, alerts, queue depth, utilization, budget burn, reaper rate) with a product template var. - Document FLEET_METRICS_TOKEN + the fleet feature flags in .env.example. No default behavior change: the endpoint is additive and the new container is opt-in via the compose stack. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
This commit is contained in:
parent
acf7c36cda
commit
93d1caf4a2
12
.env.example
12
.env.example
@ -99,3 +99,15 @@ RUST_RUNTIME_TIMEOUT_MS=300000
|
||||
OLLAMA_URL=http://localhost:11434/v1
|
||||
OLLAMA_MODELS=
|
||||
FEATURE_FLAGS_ENABLED=true
|
||||
|
||||
# ── Fleet ops/observability ───────────────────────────────────
|
||||
# Bearer token Prometheus uses to scrape GET /api/fleet/metrics/prom. Must match
|
||||
# the `credentials` in services/monitoring/prometheus/prometheus.yml. When unset,
|
||||
# the endpoint requires an admin JWT instead (so it is never world-readable).
|
||||
FLEET_METRICS_TOKEN=changeme-fleet-metrics-token
|
||||
# Fleet feature flags (default OFF): cost/latency routing, per-engine breaker,
|
||||
# per-product/-engine budget enforcement, and multi-tenant access enforcement.
|
||||
FLEET_COST_ROUTING=
|
||||
FLEET_ENGINE_BREAKER=
|
||||
FLEET_BUDGETS=
|
||||
FLEET_TENANT_ENFORCEMENT=
|
||||
|
||||
@ -96,6 +96,23 @@ services:
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
|
||||
# ── Prometheus (fleet + infra metrics scrape) ─────────────────
|
||||
prometheus:
|
||||
image: prom/prometheus:v3.1.0
|
||||
ports:
|
||||
- '9090:9090'
|
||||
volumes:
|
||||
- ./services/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
- prometheus-data:/prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
test: ['CMD', 'wget', '-q', '--spider', 'http://127.0.0.1:9090/-/ready']
|
||||
interval: 15s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
|
||||
# ── API Gateway (Traefik) ───────────────────────────────────
|
||||
gateway:
|
||||
image: traefik:v3.3
|
||||
@ -282,3 +299,4 @@ volumes:
|
||||
azurite-data:
|
||||
loki-data:
|
||||
grafana-data:
|
||||
prometheus-data:
|
||||
|
||||
184
services/monitoring/grafana/dashboards/fleet-overview.json
Normal file
184
services/monitoring/grafana/dashboards/fleet-overview.json
Normal file
@ -0,0 +1,184 @@
|
||||
{
|
||||
"annotations": { "list": [] },
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"title": "Fleet Overview",
|
||||
"tags": ["fleet", "gigafactory"],
|
||||
"uid": "fleet-overview",
|
||||
"schemaVersion": 38,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-6h", "to": "now" },
|
||||
"timepicker": { "refresh_intervals": ["15s", "30s", "1m", "5m", "15m", "1h"] },
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "product",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"query": "label_values(fleet_queue_depth, product)",
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"current": { "text": "All", "value": "$__all" }
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"title": "Open circuit breakers",
|
||||
"type": "stat",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"targets": [{ "refId": "A", "expr": "fleet_engine_breaker_open_count" }]
|
||||
},
|
||||
{
|
||||
"title": "Dead-letter jobs",
|
||||
"type": "stat",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(fleet_jobs_by_stage{product=~\"$product\",stage=\"dead_letter\"})"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Stale factories",
|
||||
"type": "stat",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"targets": [{ "refId": "A", "expr": "sum(fleet_factories_stale{product=~\"$product\"})" }]
|
||||
},
|
||||
{
|
||||
"title": "Active alerts",
|
||||
"type": "stat",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"targets": [
|
||||
{ "refId": "A", "expr": "count(fleet_alert_active{product=~\"$product\"}) or vector(0)" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Queue depth",
|
||||
"type": "timeseries",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
|
||||
"fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "fillOpacity": 10 } } },
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "fleet_queue_depth{product=~\"$product\"}",
|
||||
"legendFormat": "{{product}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Seat utilization %",
|
||||
"type": "timeseries",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "percent", "custom": { "drawStyle": "line", "fillOpacity": 10 } }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "fleet_utilization_pct{product=~\"$product\"}",
|
||||
"legendFormat": "{{product}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Budget: spent vs ceiling (USD)",
|
||||
"type": "timeseries",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "currencyUSD", "custom": { "drawStyle": "line", "fillOpacity": 10 } }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "fleet_budget_spent_usd{product=~\"$product\"}",
|
||||
"legendFormat": "{{product}} spent"
|
||||
},
|
||||
{
|
||||
"refId": "B",
|
||||
"expr": "fleet_budget_ceiling_usd{product=~\"$product\"}",
|
||||
"legendFormat": "{{product}} ceiling"
|
||||
},
|
||||
{
|
||||
"refId": "C",
|
||||
"expr": "fleet_budget_projected_usd{product=~\"$product\"}",
|
||||
"legendFormat": "{{product}} projected"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Reaper reclaims (rate/5m)",
|
||||
"type": "timeseries",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
|
||||
"fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "fillOpacity": 10 } } },
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "rate(fleet_reaper_expired_reclaimed_total[5m])",
|
||||
"legendFormat": "expired"
|
||||
},
|
||||
{
|
||||
"refId": "B",
|
||||
"expr": "rate(fleet_reaper_stale_reclaimed_total[5m])",
|
||||
"legendFormat": "stale"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -3,6 +3,7 @@ apiVersion: 1
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
uid: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
editable: false
|
||||
|
||||
@ -8,6 +8,21 @@ scrape_configs:
|
||||
- targets:
|
||||
- prometheus:9090
|
||||
|
||||
# Fleet coordinator metrics (queue depth, factory health, reaper, breakers,
|
||||
# budgets). The endpoint requires a bearer token — set `credentials` below to
|
||||
# the same value as platform-service's FLEET_METRICS_TOKEN (.env). The default
|
||||
# here is a non-secret placeholder for the local prototype; change it for any
|
||||
# shared/remote deployment.
|
||||
- job_name: platform-service-fleet
|
||||
metrics_path: /api/fleet/metrics/prom
|
||||
scheme: http
|
||||
authorization:
|
||||
type: Bearer
|
||||
credentials: changeme-fleet-metrics-token
|
||||
static_configs:
|
||||
- targets:
|
||||
- platform-service:4003
|
||||
|
||||
- job_name: node-exporter
|
||||
static_configs:
|
||||
- targets:
|
||||
|
||||
179
services/platform-service/src/modules/fleet/prometheus.test.ts
Normal file
179
services/platform-service/src/modules/fleet/prometheus.test.ts
Normal file
@ -0,0 +1,179 @@
|
||||
/**
|
||||
* Fleet Prometheus exposition (§4) — pure renderer unit tests.
|
||||
*/
|
||||
|
||||
import { describe, it, expect, afterEach } from 'vitest';
|
||||
import { renderFleetMetricsProm, scrapeTokenMatches, type FleetPromInput } from './prometheus.js';
|
||||
import type { FleetMetrics } from './coordinator.js';
|
||||
import type { ReaperStats } from './reaper.js';
|
||||
|
||||
function metrics(over: Partial<FleetMetrics> = {}): FleetMetrics {
|
||||
return {
|
||||
productId: 'lysnrai',
|
||||
generatedAt: '2026-06-01T00:00:00.000Z',
|
||||
jobs: {
|
||||
total: 3,
|
||||
byStage: { queued: 2, shipped: 1 },
|
||||
queueDepth: 2,
|
||||
blocked: 0,
|
||||
active: 0,
|
||||
oldestQueuedAgeMs: 5000,
|
||||
},
|
||||
factories: {
|
||||
total: 2,
|
||||
live: 1,
|
||||
stale: 1,
|
||||
byHealth: { ok: 1, degraded: 0, down: 0 },
|
||||
seatsUsed: 1,
|
||||
seatsTotal: 4,
|
||||
utilizationPct: 25,
|
||||
},
|
||||
budget: null,
|
||||
alerts: [],
|
||||
...over,
|
||||
} as FleetMetrics;
|
||||
}
|
||||
|
||||
const reaper: ReaperStats = {
|
||||
running: true,
|
||||
startedAt: null,
|
||||
lastReclaimAt: null,
|
||||
lastSweepAt: null,
|
||||
totals: {
|
||||
expiredReclaimed: 5,
|
||||
staleReclaimed: 2,
|
||||
leasesDeleted: 10,
|
||||
factoriesDeleted: 0,
|
||||
tokensDeleted: 0,
|
||||
jobsDeleted: 3,
|
||||
runsDeleted: 0,
|
||||
eventsDeleted: 0,
|
||||
artifactsDeleted: 0,
|
||||
},
|
||||
};
|
||||
|
||||
const base: FleetPromInput = {
|
||||
products: [{ productId: 'lysnrai', metrics: metrics() }],
|
||||
reaper,
|
||||
breakers: [],
|
||||
};
|
||||
|
||||
describe('renderFleetMetricsProm', () => {
|
||||
it('emits HELP/TYPE headers and product-labelled gauges', () => {
|
||||
const out = renderFleetMetricsProm(base);
|
||||
expect(out).toContain('# TYPE fleet_queue_depth gauge');
|
||||
expect(out).toContain('fleet_queue_depth{product="lysnrai"} 2');
|
||||
expect(out).toContain('fleet_utilization_pct{product="lysnrai"} 25');
|
||||
expect(out).toContain('fleet_jobs_by_stage{product="lysnrai",stage="queued"} 2');
|
||||
});
|
||||
|
||||
it('emits global reaper counters and a running gauge', () => {
|
||||
const out = renderFleetMetricsProm(base);
|
||||
expect(out).toContain('# TYPE fleet_reaper_expired_reclaimed_total counter');
|
||||
expect(out).toContain('fleet_reaper_expired_reclaimed_total 5');
|
||||
expect(out).toContain('fleet_reaper_jobs_deleted_total 3');
|
||||
expect(out).toContain('fleet_reaper_running 1');
|
||||
});
|
||||
|
||||
it('emits one alert series per active alert, labelled by code + level', () => {
|
||||
const out = renderFleetMetricsProm({
|
||||
...base,
|
||||
products: [
|
||||
{
|
||||
productId: 'lysnrai',
|
||||
metrics: metrics({
|
||||
alerts: [{ level: 'warning', code: 'dead_letter', message: 'x' }],
|
||||
}),
|
||||
},
|
||||
],
|
||||
});
|
||||
expect(out).toContain(
|
||||
'fleet_alert_active{product="lysnrai",code="dead_letter",level="warning"} 1'
|
||||
);
|
||||
});
|
||||
|
||||
it('emits budget gauges only when a budget is configured', () => {
|
||||
const withBudget = renderFleetMetricsProm({
|
||||
...base,
|
||||
products: [
|
||||
{
|
||||
productId: 'lysnrai',
|
||||
metrics: metrics({
|
||||
budget: {
|
||||
ceilingUsd: 100,
|
||||
spentUsd: 40,
|
||||
status: 'active',
|
||||
window: 'monthly',
|
||||
projectedUsd: 300,
|
||||
engines: [],
|
||||
},
|
||||
}),
|
||||
},
|
||||
],
|
||||
});
|
||||
expect(withBudget).toContain('fleet_budget_spent_usd{product="lysnrai"} 40');
|
||||
expect(withBudget).toContain('fleet_budget_projected_usd{product="lysnrai"} 300');
|
||||
// no budget ⇒ no budget series
|
||||
expect(renderFleetMetricsProm(base)).not.toContain('fleet_budget_spent_usd');
|
||||
});
|
||||
|
||||
it('counts and labels only tripped breakers', () => {
|
||||
const out = renderFleetMetricsProm({
|
||||
...base,
|
||||
breakers: [
|
||||
{
|
||||
factoryId: 'fac_1',
|
||||
engine: 'codex',
|
||||
state: 'OPEN',
|
||||
failureCount: 3,
|
||||
lastFailureAt: null,
|
||||
},
|
||||
{
|
||||
factoryId: 'fac_1',
|
||||
engine: 'devin',
|
||||
state: 'CLOSED',
|
||||
failureCount: 0,
|
||||
lastFailureAt: null,
|
||||
},
|
||||
],
|
||||
});
|
||||
expect(out).toContain(
|
||||
'fleet_engine_breaker_open{factory="fac_1",engine="codex",state="OPEN"} 1'
|
||||
);
|
||||
expect(out).not.toContain('engine="devin"');
|
||||
expect(out).toContain('fleet_engine_breaker_open_count 1');
|
||||
});
|
||||
|
||||
it('escapes special characters in label values', () => {
|
||||
const out = renderFleetMetricsProm({
|
||||
...base,
|
||||
breakers: [
|
||||
{
|
||||
factoryId: 'fac"x',
|
||||
engine: 'a\\b',
|
||||
state: 'HALF_OPEN',
|
||||
failureCount: 1,
|
||||
lastFailureAt: null,
|
||||
},
|
||||
],
|
||||
});
|
||||
expect(out).toContain('factory="fac\\"x",engine="a\\\\b"');
|
||||
});
|
||||
});
|
||||
|
||||
describe('scrapeTokenMatches', () => {
|
||||
afterEach(() => {
|
||||
delete process.env.FLEET_METRICS_TOKEN;
|
||||
});
|
||||
|
||||
it('is false when no token is configured (forces admin-auth fallback)', () => {
|
||||
expect(scrapeTokenMatches('Bearer anything')).toBe(false);
|
||||
});
|
||||
|
||||
it('matches a correct bearer token and rejects a wrong / missing one', () => {
|
||||
process.env.FLEET_METRICS_TOKEN = 'secret';
|
||||
expect(scrapeTokenMatches('Bearer secret')).toBe(true);
|
||||
expect(scrapeTokenMatches('Bearer nope')).toBe(false);
|
||||
expect(scrapeTokenMatches(undefined)).toBe(false);
|
||||
});
|
||||
});
|
||||
203
services/platform-service/src/modules/fleet/prometheus.ts
Normal file
203
services/platform-service/src/modules/fleet/prometheus.ts
Normal file
@ -0,0 +1,203 @@
|
||||
/**
|
||||
* Prometheus exposition for fleet metrics (§4 ops export).
|
||||
*
|
||||
* Renders the per-product `FleetMetrics` + the process-wide reaper/GC counters +
|
||||
* the engine circuit-breaker snapshot into the Prometheus text format
|
||||
* (`text/plain; version=0.0.4`). PURE + synchronous — the route does the I/O
|
||||
* (scan products, compute metrics) and hands the snapshot here, which keeps the
|
||||
* formatting fully unit-testable.
|
||||
*/
|
||||
|
||||
import type { FleetMetrics } from './coordinator.js';
|
||||
import type { ReaperStats } from './reaper.js';
|
||||
import type { EngineBreakerEntry } from './engine-breaker.js';
|
||||
|
||||
/** One product's computed metrics, paired with its id for labelling. */
|
||||
export interface ProductMetrics {
|
||||
productId: string;
|
||||
metrics: FleetMetrics;
|
||||
}
|
||||
|
||||
export interface FleetPromInput {
|
||||
products: ProductMetrics[];
|
||||
reaper: ReaperStats;
|
||||
breakers: EngineBreakerEntry[];
|
||||
}
|
||||
|
||||
/** Escape a Prometheus label value (backslash, double-quote, newline). */
|
||||
function esc(v: string): string {
|
||||
return v.replace(/\\/g, '\\\\').replace(/"/g, '\\"').replace(/\n/g, '\\n');
|
||||
}
|
||||
|
||||
/** Render one labelled series line. `labels` is an ordered [key,value][]. */
|
||||
function line(name: string, labels: [string, string][], value: number): string {
|
||||
if (labels.length === 0) return `${name} ${value}`;
|
||||
const inner = labels.map(([k, v]) => `${k}="${esc(v)}"`).join(',');
|
||||
return `${name}{${inner}} ${value}`;
|
||||
}
|
||||
|
||||
interface Metric {
|
||||
name: string;
|
||||
help: string;
|
||||
type: 'gauge' | 'counter';
|
||||
rows: string[];
|
||||
}
|
||||
|
||||
/**
|
||||
* Render the full fleet exposition. Deterministic ordering (products as given,
|
||||
* then global counters) so output diffs are stable and testable.
|
||||
*/
|
||||
export function renderFleetMetricsProm(input: FleetPromInput): string {
|
||||
const m: Record<string, Metric> = {};
|
||||
const def = (name: string, type: Metric['type'], help: string): Metric =>
|
||||
(m[name] ??= { name, help, type, rows: [] });
|
||||
|
||||
for (const { productId, metrics } of input.products) {
|
||||
const p: [string, string][] = [['product', productId]];
|
||||
def('fleet_jobs_total', 'gauge', 'Total jobs for the product').rows.push(
|
||||
line('fleet_jobs_total', p, metrics.jobs.total)
|
||||
);
|
||||
def('fleet_queue_depth', 'gauge', 'Jobs currently queued').rows.push(
|
||||
line('fleet_queue_depth', p, metrics.jobs.queueDepth)
|
||||
);
|
||||
def('fleet_jobs_blocked', 'gauge', 'Jobs blocked on unmet dependencies').rows.push(
|
||||
line('fleet_jobs_blocked', p, metrics.jobs.blocked)
|
||||
);
|
||||
def('fleet_jobs_active', 'gauge', 'Jobs in an active (claimed/running) stage').rows.push(
|
||||
line('fleet_jobs_active', p, metrics.jobs.active)
|
||||
);
|
||||
def('fleet_oldest_queued_age_ms', 'gauge', 'Age of the oldest queued job (ms)').rows.push(
|
||||
line('fleet_oldest_queued_age_ms', p, metrics.jobs.oldestQueuedAgeMs ?? 0)
|
||||
);
|
||||
// Per-stage histogram of the job lifecycle.
|
||||
const stage = def('fleet_jobs_by_stage', 'gauge', 'Jobs by lifecycle stage');
|
||||
for (const [s, n] of Object.entries(metrics.jobs.byStage)) {
|
||||
stage.rows.push(
|
||||
line(
|
||||
'fleet_jobs_by_stage',
|
||||
[
|
||||
['product', productId],
|
||||
['stage', s],
|
||||
],
|
||||
n
|
||||
)
|
||||
);
|
||||
}
|
||||
def('fleet_factories_total', 'gauge', 'Registered factories').rows.push(
|
||||
line('fleet_factories_total', p, metrics.factories.total)
|
||||
);
|
||||
def('fleet_factories_live', 'gauge', 'Factories seen within the stale window').rows.push(
|
||||
line('fleet_factories_live', p, metrics.factories.live)
|
||||
);
|
||||
def('fleet_factories_stale', 'gauge', 'Factories past the heartbeat stale window').rows.push(
|
||||
line('fleet_factories_stale', p, metrics.factories.stale)
|
||||
);
|
||||
def('fleet_seats_used', 'gauge', 'Occupied factory seats').rows.push(
|
||||
line('fleet_seats_used', p, metrics.factories.seatsUsed)
|
||||
);
|
||||
def('fleet_seats_total', 'gauge', 'Total advertised factory seats').rows.push(
|
||||
line('fleet_seats_total', p, metrics.factories.seatsTotal)
|
||||
);
|
||||
def('fleet_utilization_pct', 'gauge', 'Seat utilization percentage').rows.push(
|
||||
line('fleet_utilization_pct', p, metrics.factories.utilizationPct)
|
||||
);
|
||||
|
||||
// One active-alert series per alert code (value 1), labelled by level.
|
||||
const alert = def('fleet_alert_active', 'gauge', 'Active fleet alert (1 = firing)');
|
||||
for (const a of metrics.alerts) {
|
||||
alert.rows.push(
|
||||
line(
|
||||
'fleet_alert_active',
|
||||
[
|
||||
['product', productId],
|
||||
['code', a.code],
|
||||
['level', a.level],
|
||||
],
|
||||
1
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
if (metrics.budget) {
|
||||
def('fleet_budget_spent_usd', 'gauge', 'Budget spent this window (USD)').rows.push(
|
||||
line('fleet_budget_spent_usd', p, metrics.budget.spentUsd)
|
||||
);
|
||||
def('fleet_budget_ceiling_usd', 'gauge', 'Budget ceiling (USD)').rows.push(
|
||||
line('fleet_budget_ceiling_usd', p, metrics.budget.ceilingUsd)
|
||||
);
|
||||
if (metrics.budget.projectedUsd !== null) {
|
||||
def('fleet_budget_projected_usd', 'gauge', 'Projected end-of-window spend (USD)').rows.push(
|
||||
line('fleet_budget_projected_usd', p, metrics.budget.projectedUsd)
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Process-wide reaper / GC counters ──
|
||||
const r = input.reaper.totals;
|
||||
const rc = (name: string, help: string, value: number): void => {
|
||||
def(name, 'counter', help).rows.push(line(name, [], value));
|
||||
};
|
||||
rc(
|
||||
'fleet_reaper_expired_reclaimed_total',
|
||||
'Jobs reclaimed from expired leases',
|
||||
r.expiredReclaimed
|
||||
);
|
||||
rc('fleet_reaper_stale_reclaimed_total', 'Jobs reclaimed from stale factories', r.staleReclaimed);
|
||||
rc('fleet_reaper_leases_deleted_total', 'Finished leases garbage-collected', r.leasesDeleted);
|
||||
rc('fleet_reaper_jobs_deleted_total', 'Terminal jobs garbage-collected', r.jobsDeleted);
|
||||
def('fleet_reaper_running', 'gauge', 'Reaper loop running (1/0)').rows.push(
|
||||
line('fleet_reaper_running', [], input.reaper.running ? 1 : 0)
|
||||
);
|
||||
|
||||
// ── Engine circuit breakers (process-wide) ──
|
||||
const open = def(
|
||||
'fleet_engine_breaker_open',
|
||||
'gauge',
|
||||
'Engine breaker tripped (1 = OPEN/HALF_OPEN)'
|
||||
);
|
||||
let openCount = 0;
|
||||
for (const b of input.breakers) {
|
||||
if (b.state === 'CLOSED') continue;
|
||||
openCount += 1;
|
||||
open.rows.push(
|
||||
line(
|
||||
'fleet_engine_breaker_open',
|
||||
[
|
||||
['factory', b.factoryId],
|
||||
['engine', b.engine],
|
||||
['state', b.state],
|
||||
],
|
||||
1
|
||||
)
|
||||
);
|
||||
}
|
||||
def(
|
||||
'fleet_engine_breaker_open_count',
|
||||
'gauge',
|
||||
'Number of tripped (factory,engine) breakers'
|
||||
).rows.push(line('fleet_engine_breaker_open_count', [], openCount));
|
||||
|
||||
const out: string[] = [];
|
||||
for (const metric of Object.values(m)) {
|
||||
out.push(`# HELP ${metric.name} ${metric.help}`);
|
||||
out.push(`# TYPE ${metric.name} ${metric.type}`);
|
||||
out.push(...metric.rows);
|
||||
}
|
||||
out.push('');
|
||||
return out.join('\n');
|
||||
}
|
||||
|
||||
/**
|
||||
* Authorize a metrics scrape. When `FLEET_METRICS_TOKEN` is set, a matching
|
||||
* `Authorization: Bearer <token>` is accepted (the Prometheus scrape path).
|
||||
* Returns false when the token is unset (caller falls back to admin auth) or
|
||||
* the bearer does not match — so the endpoint is never world-readable by default.
|
||||
*/
|
||||
export function scrapeTokenMatches(authHeader: string | undefined): boolean {
|
||||
const token = (process.env.FLEET_METRICS_TOKEN ?? '').trim();
|
||||
if (!token) return false;
|
||||
if (!authHeader) return false;
|
||||
const m = authHeader.match(/^Bearer\s+(.+)$/i);
|
||||
return m?.[1] === token;
|
||||
}
|
||||
@ -116,6 +116,17 @@ describe('fleetRoutes', () => {
|
||||
expect(JSON.parse(metrics.body).jobs.byStage.shipped).toBe(1);
|
||||
});
|
||||
|
||||
it('GET /fleet/metrics/prom renders Prometheus exposition (admin)', async () => {
|
||||
const app = await buildApp();
|
||||
await submit(app, { idempotencyKey: 'prom-1', bodyMd: '# task' });
|
||||
const res = await app.inject({ method: 'GET', url: '/api/fleet/metrics/prom' });
|
||||
expect(res.statusCode).toBe(200);
|
||||
expect(res.headers['content-type']).toContain('text/plain; version=0.0.4');
|
||||
expect(res.body).toContain('# TYPE fleet_queue_depth gauge');
|
||||
expect(res.body).toContain('fleet_reaper_running');
|
||||
expect(res.body).toContain('fleet_engine_breaker_open_count');
|
||||
});
|
||||
|
||||
it('release with insights records run cost/tokens (factory reporting)', async () => {
|
||||
const app = await buildApp();
|
||||
const sub = await submit(app, { idempotencyKey: 'ins-1', bodyMd: '# task' });
|
||||
|
||||
@ -35,6 +35,8 @@ import * as enrollment from './enrollment.js';
|
||||
import * as trackerBridge from './tracker-bridge.js';
|
||||
import { getReaperStats } from './reaper.js';
|
||||
import { getEngineBreakerSnapshot } from './engine-breaker.js';
|
||||
import { renderFleetMetricsProm, scrapeTokenMatches } from './prometheus.js';
|
||||
import { getAllProducts } from '../products/cache.js';
|
||||
import {
|
||||
SubmitJobSchema,
|
||||
ListJobsQuerySchema,
|
||||
@ -455,6 +457,33 @@ export async function fleetRoutes(app: FastifyInstance) {
|
||||
return { ...metrics, reaper: getReaperStats(), engineBreakers: getEngineBreakerSnapshot() };
|
||||
});
|
||||
|
||||
// ── Prometheus exposition for fleet metrics (§4 ops export) ──
|
||||
// GLOBAL (all products, labelled) so a single scrape target covers the fleet.
|
||||
// Auth: a matching `FLEET_METRICS_TOKEN` bearer (the Prometheus scrape path),
|
||||
// else an admin/super_admin JWT — never world-readable by default.
|
||||
app.get('/fleet/metrics/prom', async (req, reply) => {
|
||||
if (!scrapeTokenMatches(req.headers['authorization'])) {
|
||||
const auth = await extractAuth(req);
|
||||
if (auth.role !== 'admin' && auth.role !== 'super_admin') {
|
||||
throw new ForbiddenError('admin role or a metrics scrape token is required');
|
||||
}
|
||||
}
|
||||
const products = getAllProducts();
|
||||
const perProduct = await Promise.all(
|
||||
products.map(async p => ({
|
||||
productId: p.productId,
|
||||
metrics: await coordinator.fleetMetrics(p.productId),
|
||||
}))
|
||||
);
|
||||
const body = renderFleetMetricsProm({
|
||||
products: perProduct,
|
||||
reaper: getReaperStats(),
|
||||
breakers: getEngineBreakerSnapshot(),
|
||||
});
|
||||
reply.type('text/plain; version=0.0.4; charset=utf-8');
|
||||
return body;
|
||||
});
|
||||
|
||||
// ── M0 RU gate: per-product queue version (cheap ~1 RU point read) ──
|
||||
// A polling factory reads this each tick and only runs the expensive claim when
|
||||
// `version` changed since its last attempt. See
|
||||
|
||||
Loading…
Reference in New Issue
Block a user