learning_ai_common_plat/services/monitoring/grafana/dashboards/fleet-overview.json
saravanakumardb1 c63736459b feat(fleet): anti-flap hysteresis + autoscale Prometheus series & dashboard (ops #5)
Make the capacity autoscaling signal safe to act on automatically and observable
in Grafana.

Anti-flap hysteresis:
- New pure applyHysteresis: suppresses a direction reversal (scale_in after
  scale_out, or vice versa) within a cooldown window so a consumer cannot thrash
  capacity. A critical scale-out (queued work, zero usable capacity) always
  bypasses the cooldown. Cooldown anchor only advances on an emitted action, so a
  suppressed signal keeps counting down from the real last action.
- Process-wide per-product cooldown state (mirrors reaper/breaker in-mem state)
  with a test seam; cooldown tunable via FLEET_AUTOSCALE_COOLDOWN_SEC (default 300).
- GET /fleet/autoscale[/all] now serve the debounced (stateful) recommendation.

Observability:
- Prometheus exposition emits the RAW recommendation per product
  (fleet_autoscale_recommended_seats/delta/pressure + one-hot fleet_autoscale_action
  {action}). RAW (not stateful) so a scrape never mutates the cooldown anchors.
- Grafana "Fleet Overview" gains two panels: products recommending scale-out
  (stat) + recommended seat delta vs backlog (timeseries).

Docs: FLEET_AUTOSCALE_COOLDOWN_SEC in .env.example.

Tests: +10 (hysteresis/stateful/cooldown + Prometheus autoscale series); full suite
(1856 tests) green; lint + tsc clean. Verified live: a throwaway Prometheus scraped the running
service and the dashboard PromQL returned real scale-out/scale-in recommendations
across products.

Generated with [Devin](https://cli.devin.ai/docs)

Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
2026-06-01 23:02:08 -07:00

235 lines
7.0 KiB
JSON

{
"annotations": { "list": [] },
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"title": "Fleet Overview",
"tags": ["fleet", "gigafactory"],
"uid": "fleet-overview",
"schemaVersion": 38,
"version": 2,
"refresh": "30s",
"time": { "from": "now-6h", "to": "now" },
"timepicker": { "refresh_intervals": ["15s", "30s", "1m", "5m", "15m", "1h"] },
"templating": {
"list": [
{
"name": "product",
"type": "query",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"query": "label_values(fleet_queue_depth, product)",
"refresh": 2,
"includeAll": true,
"multi": true,
"current": { "text": "All", "value": "$__all" }
}
]
},
"panels": [
{
"title": "Open circuit breakers",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "red", "value": 1 }
]
}
}
},
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [{ "refId": "A", "expr": "fleet_engine_breaker_open_count" }]
},
{
"title": "Dead-letter jobs",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 }
]
}
}
},
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"refId": "A",
"expr": "sum(fleet_jobs_by_stage{product=~\"$product\",stage=\"dead_letter\"})"
}
]
},
{
"title": "Stale factories",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [{ "refId": "A", "expr": "sum(fleet_factories_stale{product=~\"$product\"})" }]
},
{
"title": "Active alerts",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{ "refId": "A", "expr": "count(fleet_alert_active{product=~\"$product\"}) or vector(0)" }
]
},
{
"title": "Queue depth",
"type": "timeseries",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
"fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "fillOpacity": 10 } } },
"targets": [
{
"refId": "A",
"expr": "fleet_queue_depth{product=~\"$product\"}",
"legendFormat": "{{product}}"
}
]
},
{
"title": "Seat utilization %",
"type": "timeseries",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
"fieldConfig": {
"defaults": { "unit": "percent", "custom": { "drawStyle": "line", "fillOpacity": 10 } }
},
"targets": [
{
"refId": "A",
"expr": "fleet_utilization_pct{product=~\"$product\"}",
"legendFormat": "{{product}}"
}
]
},
{
"title": "Budget: spent vs ceiling (USD)",
"type": "timeseries",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
"fieldConfig": {
"defaults": { "unit": "currencyUSD", "custom": { "drawStyle": "line", "fillOpacity": 10 } }
},
"targets": [
{
"refId": "A",
"expr": "fleet_budget_spent_usd{product=~\"$product\"}",
"legendFormat": "{{product}} spent"
},
{
"refId": "B",
"expr": "fleet_budget_ceiling_usd{product=~\"$product\"}",
"legendFormat": "{{product}} ceiling"
},
{
"refId": "C",
"expr": "fleet_budget_projected_usd{product=~\"$product\"}",
"legendFormat": "{{product}} projected"
}
]
},
{
"title": "Reaper reclaims (rate/5m)",
"type": "timeseries",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
"fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "fillOpacity": 10 } } },
"targets": [
{
"refId": "A",
"expr": "rate(fleet_reaper_expired_reclaimed_total[5m])",
"legendFormat": "expired"
},
{
"refId": "B",
"expr": "rate(fleet_reaper_stale_reclaimed_total[5m])",
"legendFormat": "stale"
}
]
},
{
"title": "Autoscale: products recommending scale-out",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 20 },
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 }
]
}
}
},
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"refId": "A",
"expr": "sum(fleet_autoscale_action{product=~\"$product\",action=\"scale_out\"}) or vector(0)"
}
]
},
{
"title": "Autoscale: recommended seat delta",
"type": "timeseries",
"description": "Recommended change in factory seats per product (positive = scale out, negative = scale in), plotted alongside the per-product autoscale pressure series (shown in the legend as backlog). Raw signal, pre-hysteresis.",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 8, "w": 18, "x": 6, "y": 20 },
"fieldConfig": {
"defaults": { "custom": { "drawStyle": "line", "fillOpacity": 10 } }
},
"targets": [
{
"refId": "A",
"expr": "fleet_autoscale_delta{product=~\"$product\"}",
"legendFormat": "{{product}} delta"
},
{
"refId": "B",
"expr": "fleet_autoscale_pressure{product=~\"$product\"}",
"legendFormat": "{{product}} backlog"
}
]
}
]
}