learning_ai_common_plat/services/monitoring/grafana/dashboards/fleet-overview.json
saravanakumardb1 93d1caf4a2 feat(fleet): Prometheus metrics export + Grafana dashboard (ops #4)
Exports fleet observability to Prometheus/Grafana (previously JSON-only).

- GET /api/fleet/metrics/prom: global, product-labelled Prometheus exposition
  (queue depth, blocked/active, per-stage histogram, factory health/seats/
  utilization, active alerts, budget spent/ceiling/projected) plus process-wide
  reaper/GC counters and engine circuit-breaker state. Pure renderer
  (renderFleetMetricsProm) is unit-tested; route auth accepts a FLEET_METRICS_TOKEN
  bearer (scrape path) or an admin JWT — never world-readable by default.
- Infra: add a prometheus container to docker-compose + a platform-service-fleet
  scrape job; pin the Prometheus Grafana datasource uid; add a provisioned
  "Fleet Overview" dashboard (breakers, dead-letter, stale factories, alerts,
  queue depth, utilization, budget burn, reaper rate) with a product template var.
- Document FLEET_METRICS_TOKEN + the fleet feature flags in .env.example.

No default behavior change: the endpoint is additive and the new container is
opt-in via the compose stack.

Generated with [Devin](https://cli.devin.ai/docs)

Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
2026-06-01 22:24:03 -07:00

185 lines
5.4 KiB
JSON

{
"uid": "fleet-overview",
"title": "Fleet Overview",
"tags": [
  "fleet",
  "gigafactory"
],
"id": null,
"version": 1,
"schemaVersion": 38,
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"links": [],
"annotations": {
  "list": []
},
"time": {
  "from": "now-6h",
  "to": "now"
},
"refresh": "30s",
"timepicker": {
  "refresh_intervals": [
    "15s",
    "30s",
    "1m",
    "5m",
    "15m",
    "1h"
  ]
},
"templating": {
"list": [
{
  "name": "product",
  "type": "query",
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "query": "label_values(fleet_queue_depth, product)",
  "definition": "label_values(fleet_queue_depth, product)",
  "refresh": 2,
  "includeAll": true,
  "multi": true,
  "current": { "text": ["All"], "value": ["$__all"] }
}
]
},
"panels": [
{
  "title": "Open circuit breakers",
  "description": "Sum of fleet_engine_breaker_open_count across all scraped processes. Turns red when one or more breakers are open.",
  "type": "stat",
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
  "fieldConfig": {
    "defaults": {
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "red", "value": 1 }
        ]
      }
    }
  },
  "options": {
    "colorMode": "value",
    "graphMode": "none",
    "reduceOptions": { "calcs": ["lastNotNull"] }
  },
  "targets": [{ "refId": "A", "expr": "sum(fleet_engine_breaker_open_count)" }]
},
{
  "title": "Dead-letter jobs",
  "description": "Sum of fleet_jobs_by_stage with stage=\"dead_letter\" for the selected products. Falls back to 0 when no matching series exists. Turns yellow at one or more.",
  "type": "stat",
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
  "fieldConfig": {
    "defaults": {
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 1 }
        ]
      }
    }
  },
  "options": {
    "colorMode": "value",
    "graphMode": "none",
    "reduceOptions": { "calcs": ["lastNotNull"] }
  },
  "targets": [
    {
      "refId": "A",
      "expr": "sum(fleet_jobs_by_stage{product=~\"$product\",stage=\"dead_letter\"}) or vector(0)"
    }
  ]
},
{
  "title": "Stale factories",
  "description": "Sum of fleet_factories_stale for the selected products. Turns yellow at one or more.",
  "type": "stat",
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
  "fieldConfig": {
    "defaults": {
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 1 }
        ]
      }
    }
  },
  "options": {
    "colorMode": "value",
    "graphMode": "none",
    "reduceOptions": { "calcs": ["lastNotNull"] }
  },
  "targets": [{ "refId": "A", "expr": "sum(fleet_factories_stale{product=~\"$product\"})" }]
},
{
  "title": "Active alerts",
  "description": "Number of fleet_alert_active series for the selected products, or 0 when none exist. Turns red at one or more.",
  "type": "stat",
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
  "fieldConfig": {
    "defaults": {
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "red", "value": 1 }
        ]
      }
    }
  },
  "options": {
    "colorMode": "value",
    "graphMode": "none",
    "reduceOptions": { "calcs": ["lastNotNull"] }
  },
  "targets": [
    { "refId": "A", "expr": "count(fleet_alert_active{product=~\"$product\"}) or vector(0)" }
  ]
},
{
  "type": "timeseries",
  "title": "Queue depth",
  "gridPos": {
    "x": 0,
    "y": 4,
    "w": 12,
    "h": 8
  },
  "datasource": {
    "uid": "prometheus",
    "type": "prometheus"
  },
  "targets": [
    {
      "expr": "fleet_queue_depth{product=~\"$product\"}",
      "legendFormat": "{{product}}",
      "refId": "A"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "custom": {
        "fillOpacity": 10,
        "drawStyle": "line"
      }
    }
  }
},
{
  "type": "timeseries",
  "title": "Seat utilization %",
  "gridPos": {
    "x": 12,
    "y": 4,
    "w": 12,
    "h": 8
  },
  "datasource": {
    "uid": "prometheus",
    "type": "prometheus"
  },
  "targets": [
    {
      "expr": "fleet_utilization_pct{product=~\"$product\"}",
      "legendFormat": "{{product}}",
      "refId": "A"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "custom": {
        "fillOpacity": 10,
        "drawStyle": "line"
      },
      "unit": "percent"
    }
  }
},
{
  "type": "timeseries",
  "title": "Budget: spent vs ceiling (USD)",
  "gridPos": {
    "x": 0,
    "y": 12,
    "w": 12,
    "h": 8
  },
  "datasource": {
    "uid": "prometheus",
    "type": "prometheus"
  },
  "targets": [
    {
      "expr": "fleet_budget_spent_usd{product=~\"$product\"}",
      "legendFormat": "{{product}} spent",
      "refId": "A"
    },
    {
      "expr": "fleet_budget_ceiling_usd{product=~\"$product\"}",
      "legendFormat": "{{product}} ceiling",
      "refId": "B"
    },
    {
      "expr": "fleet_budget_projected_usd{product=~\"$product\"}",
      "legendFormat": "{{product}} projected",
      "refId": "C"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "custom": {
        "fillOpacity": 10,
        "drawStyle": "line"
      },
      "unit": "currencyUSD"
    }
  }
},
{
  "title": "Reaper reclaims (rate/5m)",
  "description": "Per-second rate, over a 5m window, of the fleet_reaper_expired_reclaimed_total and fleet_reaper_stale_reclaimed_total counters, summed across all scraped processes.",
  "type": "timeseries",
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
  "fieldConfig": {
    "defaults": { "unit": "ops", "custom": { "drawStyle": "line", "fillOpacity": 10 } }
  },
  "targets": [
    {
      "refId": "A",
      "expr": "sum(rate(fleet_reaper_expired_reclaimed_total[5m]))",
      "legendFormat": "expired"
    },
    {
      "refId": "B",
      "expr": "sum(rate(fleet_reaper_stale_reclaimed_total[5m]))",
      "legendFormat": "stale"
    }
  ]
}
]
}