Make the capacity autoscaling signal safe to act on automatically and observable
in Grafana.
Anti-flap hysteresis:
- New pure applyHysteresis: suppresses a direction reversal (scale_in after
scale_out, or vice versa) within a cooldown window so a consumer cannot thrash
capacity. A critical scale-out (queued work, zero usable capacity) always
bypasses the cooldown. Cooldown anchor only advances on an emitted action, so a
suppressed signal keeps counting down from the real last action.
- Process-wide per-product cooldown state (mirrors reaper/breaker in-mem state)
with a test seam; cooldown tunable via FLEET_AUTOSCALE_COOLDOWN_SEC (default 300).
- GET /fleet/autoscale[/all] now serve the debounced (stateful) recommendation.
Observability:
- Prometheus exposition emits the RAW recommendation per product
(fleet_autoscale_recommended_seats, fleet_autoscale_delta, fleet_autoscale_pressure,
plus a one-hot fleet_autoscale_action{action}). The series are RAW (not stateful) so
that a scrape never mutates the cooldown anchors.
- Grafana "Fleet Overview" gains two panels: products recommending scale-out
(stat) + recommended seat delta vs backlog (timeseries).
Docs: FLEET_AUTOSCALE_COOLDOWN_SEC in .env.example.
Tests: +10 (hysteresis/stateful/cooldown + prom autoscale series); full suite 1856
green; lint + tsc clean. Verified live: a throwaway Prometheus scraped the running
service and the dashboard PromQL returned real scale-out/scale-in recommendations
across products.
Generated with [Devin](https://cli.devin.ai/docs)
Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
235 lines
7.0 KiB
JSON
235 lines
7.0 KiB
JSON
{
  "annotations": { "list": [] },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 1,
  "id": null,
  "links": [],
  "title": "Fleet Overview",
  "tags": ["fleet", "gigafactory"],
  "uid": "fleet-overview",
  "schemaVersion": 38,
  "version": 2,
  "refresh": "30s",
  "time": { "from": "now-6h", "to": "now" },
  "timepicker": { "refresh_intervals": ["15s", "30s", "1m", "5m", "15m", "1h"] },
  "templating": {
    "list": [
      {
        "name": "product",
        "type": "query",
        "datasource": { "type": "prometheus", "uid": "prometheus" },
        "query": "label_values(fleet_queue_depth, product)",
        "refresh": 2,
        "includeAll": true,
        "multi": true,
        "current": { "text": "All", "value": "$__all" }
      }
    ]
  },
  "panels": [
    {
      "title": "Open circuit breakers",
      "type": "stat",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
      "fieldConfig": {
        "defaults": {
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "red", "value": 1 }
            ]
          }
        }
      },
      "options": {
        "colorMode": "value",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"] }
      },
      "targets": [{ "refId": "A", "expr": "fleet_engine_breaker_open_count" }]
    },
    {
      "title": "Dead-letter jobs",
      "type": "stat",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
      "fieldConfig": {
        "defaults": {
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 1 }
            ]
          }
        }
      },
      "options": {
        "colorMode": "value",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"] }
      },
      "targets": [
        {
          "refId": "A",
          "expr": "sum(fleet_jobs_by_stage{product=~\"$product\",stage=\"dead_letter\"})"
        }
      ]
    },
    {
      "title": "Stale factories",
      "type": "stat",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
      "options": {
        "colorMode": "value",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"] }
      },
      "targets": [{ "refId": "A", "expr": "sum(fleet_factories_stale{product=~\"$product\"})" }]
    },
    {
      "title": "Active alerts",
      "type": "stat",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
      "options": {
        "colorMode": "value",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"] }
      },
      "targets": [
        { "refId": "A", "expr": "count(fleet_alert_active{product=~\"$product\"}) or vector(0)" }
      ]
    },
    {
      "title": "Queue depth",
      "type": "timeseries",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
      "fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "fillOpacity": 10 } } },
      "targets": [
        {
          "refId": "A",
          "expr": "fleet_queue_depth{product=~\"$product\"}",
          "legendFormat": "{{product}}"
        }
      ]
    },
    {
      "title": "Seat utilization %",
      "type": "timeseries",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
      "fieldConfig": {
        "defaults": { "unit": "percent", "custom": { "drawStyle": "line", "fillOpacity": 10 } }
      },
      "targets": [
        {
          "refId": "A",
          "expr": "fleet_utilization_pct{product=~\"$product\"}",
          "legendFormat": "{{product}}"
        }
      ]
    },
    {
      "title": "Budget: spent vs ceiling (USD)",
      "type": "timeseries",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
      "fieldConfig": {
        "defaults": { "unit": "currencyUSD", "custom": { "drawStyle": "line", "fillOpacity": 10 } }
      },
      "targets": [
        {
          "refId": "A",
          "expr": "fleet_budget_spent_usd{product=~\"$product\"}",
          "legendFormat": "{{product}} spent"
        },
        {
          "refId": "B",
          "expr": "fleet_budget_ceiling_usd{product=~\"$product\"}",
          "legendFormat": "{{product}} ceiling"
        },
        {
          "refId": "C",
          "expr": "fleet_budget_projected_usd{product=~\"$product\"}",
          "legendFormat": "{{product}} projected"
        }
      ]
    },
    {
      "title": "Reaper reclaims (rate/5m)",
      "type": "timeseries",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
      "fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "fillOpacity": 10 } } },
      "targets": [
        {
          "refId": "A",
          "expr": "rate(fleet_reaper_expired_reclaimed_total[5m])",
          "legendFormat": "expired"
        },
        {
          "refId": "B",
          "expr": "rate(fleet_reaper_stale_reclaimed_total[5m])",
          "legendFormat": "stale"
        }
      ]
    },
    {
      "title": "Autoscale: products recommending scale-out",
      "type": "stat",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 20 },
      "fieldConfig": {
        "defaults": {
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 1 }
            ]
          }
        }
      },
      "options": {
        "colorMode": "value",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"] }
      },
      "targets": [
        {
          "refId": "A",
          "expr": "sum(fleet_autoscale_action{product=~\"$product\",action=\"scale_out\"}) or vector(0)"
        }
      ]
    },
    {
      "title": "Autoscale: recommended seat delta",
      "type": "timeseries",
      "description": "Recommended change in factory seats per product (positive = scale out, negative = scale in). Raw signal, pre-hysteresis.",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 8, "w": 18, "x": 6, "y": 20 },
      "fieldConfig": {
        "defaults": { "custom": { "drawStyle": "line", "fillOpacity": 10 } }
      },
      "targets": [
        {
          "refId": "A",
          "expr": "fleet_autoscale_delta{product=~\"$product\"}",
          "legendFormat": "{{product}} delta"
        },
        {
          "refId": "B",
          "expr": "fleet_autoscale_pressure{product=~\"$product\"}",
          "legendFormat": "{{product}} backlog"
        }
      ]
    }
  ]
}