Exports fleet observability to Prometheus/Grafana (previously JSON-only).

- GET /api/fleet/metrics/prom: global, product-labelled Prometheus exposition (queue depth, blocked/active, per-stage histogram, factory health/seats/utilization, active alerts, budget spent/ceiling/projected) plus process-wide reaper/GC counters and engine circuit-breaker state. Pure renderer (renderFleetMetricsProm) is unit-tested; route auth accepts a FLEET_METRICS_TOKEN bearer (scrape path) or an admin JWT — never world-readable by default.
- Infra: add a prometheus container to docker-compose + a platform-service-fleet scrape job; pin the Prometheus Grafana datasource uid; add a provisioned "Fleet Overview" dashboard (breakers, dead-letter, stale factories, alerts, queue depth, utilization, budget burn, reaper rate) with a product template var.
- Document FLEET_METRICS_TOKEN + the fleet feature flags in .env.example.

No default behavior change: the endpoint is additive and the new container is opt-in via the compose stack.

Generated with [Devin](https://cli.devin.ai/docs)
Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
185 lines
5.4 KiB
JSON
185 lines
5.4 KiB
JSON
{
  "annotations": { "list": [] },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 1,
  "id": null,
  "links": [],
  "title": "Fleet Overview",
  "tags": ["fleet", "gigafactory"],
  "uid": "fleet-overview",
  "schemaVersion": 38,
  "version": 1,
  "refresh": "30s",
  "time": { "from": "now-6h", "to": "now" },
  "timepicker": { "refresh_intervals": ["15s", "30s", "1m", "5m", "15m", "1h"] },
  "templating": {
    "list": [
      {
        "name": "product",
        "type": "query",
        "datasource": { "type": "prometheus", "uid": "prometheus" },
        "query": "label_values(fleet_queue_depth, product)",
        "refresh": 2,
        "includeAll": true,
        "multi": true,
        "current": { "text": "All", "value": "$__all" }
      }
    ]
  },
  "panels": [
    {
      "title": "Open circuit breakers",
      "type": "stat",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
      "fieldConfig": {
        "defaults": {
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "red", "value": 1 }
            ]
          }
        }
      },
      "options": {
        "colorMode": "value",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"] }
      },
      "targets": [{ "refId": "A", "expr": "fleet_engine_breaker_open_count" }]
    },
    {
      "title": "Dead-letter jobs",
      "type": "stat",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
      "fieldConfig": {
        "defaults": {
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 1 }
            ]
          }
        }
      },
      "options": {
        "colorMode": "value",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"] }
      },
      "targets": [
        {
          "refId": "A",
          "expr": "sum(fleet_jobs_by_stage{product=~\"$product\",stage=\"dead_letter\"})"
        }
      ]
    },
    {
      "title": "Stale factories",
      "type": "stat",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
      "options": {
        "colorMode": "value",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"] }
      },
      "targets": [{ "refId": "A", "expr": "sum(fleet_factories_stale{product=~\"$product\"})" }]
    },
    {
      "title": "Active alerts",
      "type": "stat",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
      "options": {
        "colorMode": "value",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"] }
      },
      "targets": [
        { "refId": "A", "expr": "count(fleet_alert_active{product=~\"$product\"}) or vector(0)" }
      ]
    },
    {
      "title": "Queue depth",
      "type": "timeseries",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
      "fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "fillOpacity": 10 } } },
      "targets": [
        {
          "refId": "A",
          "expr": "fleet_queue_depth{product=~\"$product\"}",
          "legendFormat": "{{product}}"
        }
      ]
    },
    {
      "title": "Seat utilization %",
      "type": "timeseries",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
      "fieldConfig": {
        "defaults": { "unit": "percent", "custom": { "drawStyle": "line", "fillOpacity": 10 } }
      },
      "targets": [
        {
          "refId": "A",
          "expr": "fleet_utilization_pct{product=~\"$product\"}",
          "legendFormat": "{{product}}"
        }
      ]
    },
    {
      "title": "Budget: spent vs ceiling (USD)",
      "type": "timeseries",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
      "fieldConfig": {
        "defaults": { "unit": "currencyUSD", "custom": { "drawStyle": "line", "fillOpacity": 10 } }
      },
      "targets": [
        {
          "refId": "A",
          "expr": "fleet_budget_spent_usd{product=~\"$product\"}",
          "legendFormat": "{{product}} spent"
        },
        {
          "refId": "B",
          "expr": "fleet_budget_ceiling_usd{product=~\"$product\"}",
          "legendFormat": "{{product}} ceiling"
        },
        {
          "refId": "C",
          "expr": "fleet_budget_projected_usd{product=~\"$product\"}",
          "legendFormat": "{{product}} projected"
        }
      ]
    },
    {
      "title": "Reaper reclaims (rate/5m)",
      "type": "timeseries",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
      "fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "fillOpacity": 10 } } },
      "targets": [
        {
          "refId": "A",
          "expr": "rate(fleet_reaper_expired_reclaimed_total[5m])",
          "legendFormat": "expired"
        },
        {
          "refId": "B",
          "expr": "rate(fleet_reaper_stale_reclaimed_total[5m])",
          "legendFormat": "stale"
        }
      ]
    }
  ]
}