learning_ai_common_plat/services/monitoring/grafana/dashboards/vm-overview.json
Saravana Kumar fe8338c2c5 feat(monitoring): add VM Overview Grafana dashboard
12-panel dashboard auto-provisioned via /var/lib/grafana/dashboards:
  - 4 stat tiles (disk %, RAM avail, swap used, CPU steal) with
    threshold colouring matching vm-health-check.sh
  - 4 time-series (disk %, RAM trend, steal, sda write GB/hr) — 7d default
  - 2 bargauge top-10 panels, by RAM and by CPU (cAdvisor metrics
    container_memory_working_set_bytes and container_cpu_usage_seconds_total)
  - Load average (1/5/15) + network throughput (RX/TX, host interfaces)

Dashboard uid: vm-overview. Grafana picks it up on its next boot.

Closes Phase 5: "Add Grafana" item from VM observability roadmap.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-29 21:26:35 +00:00

315 lines
8.8 KiB
JSON

{
"uid": "vm-overview",
"title": "VM Overview — srv1491630",
"tags": ["vm", "infrastructure"],
"id": null,
"version": 1,
"schemaVersion": 38,
"editable": true,
"graphTooltip": 1,
"fiscalYearStartMonth": 0,
"refresh": "1m",
"time": { "to": "now", "from": "now-7d" },
"timepicker": {
  "refresh_intervals": ["30s", "1m", "5m", "15m", "30m", "1h"]
},
"annotations": { "list": [] },
"links": [],
"templating": { "list": [] },
"panels": [
{
  "type": "stat",
  "title": "Disk Used %",
  "gridPos": { "x": 0, "y": 0, "w": 6, "h": 4 },
  "datasource": { "uid": "prometheus", "type": "prometheus" },
  "targets": [
    {
      "expr": "(1 - node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"}) * 100",
      "refId": "A"
    }
  ],
  "options": {
    "reduceOptions": { "calcs": ["lastNotNull"] },
    "colorMode": "value",
    "graphMode": "area"
  },
  "fieldConfig": {
    "defaults": {
      "decimals": 1,
      "unit": "percent",
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "value": null, "color": "green" },
          { "value": 55, "color": "yellow" },
          { "value": 70, "color": "red" }
        ]
      }
    }
  }
},
{
  "title": "RAM Available (GiB)",
  "description": "node_memory_MemAvailable_bytes divided by 1073741824, i.e. GiB. Thresholds are in the same unit: red below 3, yellow from 3, green from 6.",
  "type": "stat",
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
  "fieldConfig": {
    "defaults": {
      "unit": "gbytes",
      "decimals": 1,
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "red", "value": null },
          { "color": "yellow", "value": 3 },
          { "color": "green", "value": 6 }
        ]
      }
    }
  },
  "options": {
    "graphMode": "area",
    "colorMode": "value",
    "reduceOptions": { "calcs": ["lastNotNull"] }
  },
  "targets": [
    {
      "refId": "A",
      "expr": "node_memory_MemAvailable_bytes / 1073741824"
    }
  ]
},
{
  "title": "Swap Used (GiB)",
  "description": "SwapTotal minus SwapFree, divided by 1073741824, i.e. GiB. Thresholds are in the same unit: yellow from 1.5, red from 3.",
  "type": "stat",
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
  "fieldConfig": {
    "defaults": {
      "unit": "gbytes",
      "decimals": 2,
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 1.5 },
          { "color": "red", "value": 3 }
        ]
      }
    }
  },
  "options": {
    "graphMode": "area",
    "colorMode": "value",
    "reduceOptions": { "calcs": ["lastNotNull"] }
  },
  "targets": [
    {
      "refId": "A",
      "expr": "(node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / 1073741824"
    }
  ]
},
{
  "type": "stat",
  "title": "CPU Steal %",
  "gridPos": { "x": 18, "y": 0, "w": 6, "h": 4 },
  "datasource": { "uid": "prometheus", "type": "prometheus" },
  "targets": [
    {
      "expr": "avg(rate(node_cpu_seconds_total{mode=\"steal\"}[5m])) * 100",
      "refId": "A"
    }
  ],
  "options": {
    "reduceOptions": { "calcs": ["lastNotNull"] },
    "colorMode": "value",
    "graphMode": "area"
  },
  "fieldConfig": {
    "defaults": {
      "decimals": 2,
      "unit": "percent",
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "value": null, "color": "green" },
          { "value": 5, "color": "yellow" },
          { "value": 15, "color": "red" }
        ]
      }
    }
  }
},
{
  "type": "timeseries",
  "title": "Disk Used % — 7d trend",
  "gridPos": { "x": 0, "y": 4, "w": 12, "h": 8 },
  "datasource": { "uid": "prometheus", "type": "prometheus" },
  "targets": [
    {
      "legendFormat": "/",
      "expr": "(1 - node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"}) * 100",
      "refId": "A"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "custom": { "lineWidth": 2, "fillOpacity": 15 },
      "unit": "percent"
    }
  }
},
{
  "title": "RAM Available — 7d trend",
  "description": "MemAvailable and swap in use (SwapTotal minus SwapFree), both divided by 1073741824, i.e. GiB.",
  "type": "timeseries",
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
  "fieldConfig": {
    "defaults": {
      "unit": "gbytes",
      "custom": { "fillOpacity": 15, "lineWidth": 2 }
    }
  },
  "targets": [
    {
      "refId": "A",
      "expr": "node_memory_MemAvailable_bytes / 1073741824",
      "legendFormat": "Available"
    },
    {
      "refId": "B",
      "expr": "(node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / 1073741824",
      "legendFormat": "Swap used"
    }
  ]
},
{
  "type": "timeseries",
  "title": "CPU Steal % — 7d trend",
  "gridPos": { "x": 0, "y": 12, "w": 12, "h": 8 },
  "datasource": { "uid": "prometheus", "type": "prometheus" },
  "targets": [
    {
      "legendFormat": "steal",
      "expr": "avg(rate(node_cpu_seconds_total{mode=\"steal\"}[5m])) * 100",
      "refId": "A"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "custom": { "lineWidth": 2, "fillOpacity": 15 },
      "unit": "percent"
    }
  }
},
{
  "title": "Disk Writes (sda) — GiB/hr",
  "description": "5-minute write rate of device sda, scaled to one hour (x 3600) and divided by 1073741824, i.e. GiB per hour.",
  "type": "timeseries",
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
  "fieldConfig": {
    "defaults": {
      "unit": "gbytes",
      "custom": { "fillOpacity": 15, "lineWidth": 2 }
    }
  },
  "targets": [
    {
      "refId": "A",
      "expr": "rate(node_disk_written_bytes_total{device=\"sda\"}[5m]) * 3600 / 1073741824",
      "legendFormat": "GiB/hr"
    }
  ]
},
{
  "title": "Top 10 Containers by Memory",
  "description": "Working-set memory of named containers in MiB (bytes / 1048576). Runs as an instant query so topk() is evaluated once, at the end of the time range, and at most 10 bars are shown.",
  "type": "bargauge",
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "gridPos": { "h": 10, "w": 12, "x": 0, "y": 20 },
  "fieldConfig": {
    "defaults": {
      "unit": "mbytes",
      "decimals": 1
    }
  },
  "options": {
    "orientation": "horizontal",
    "displayMode": "gradient",
    "reduceOptions": { "calcs": ["lastNotNull"] }
  },
  "targets": [
    {
      "refId": "A",
      "expr": "topk(10, container_memory_working_set_bytes{name!=\"\"} / 1048576)",
      "legendFormat": "{{name}}",
      "instant": true,
      "range": false
    }
  ]
},
{
  "title": "Top 10 Containers by CPU",
  "description": "5-minute rate of container CPU seconds summed per container name, x 100 (100 = one CPU-second consumed per second). Runs as an instant query so topk() is evaluated once, at the end of the time range, and at most 10 bars are shown.",
  "type": "bargauge",
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "gridPos": { "h": 10, "w": 12, "x": 12, "y": 20 },
  "fieldConfig": {
    "defaults": {
      "unit": "percent",
      "decimals": 2
    }
  },
  "options": {
    "orientation": "horizontal",
    "displayMode": "gradient",
    "reduceOptions": { "calcs": ["lastNotNull"] }
  },
  "targets": [
    {
      "refId": "A",
      "expr": "topk(10, sum by (name) (rate(container_cpu_usage_seconds_total{name!=\"\"}[5m])) * 100)",
      "legendFormat": "{{name}}",
      "instant": true,
      "range": false
    }
  ]
},
{
  "type": "timeseries",
  "title": "Load Average (1m / 5m / 15m)",
  "gridPos": { "x": 0, "y": 30, "w": 12, "h": 8 },
  "datasource": { "uid": "prometheus", "type": "prometheus" },
  "targets": [
    {
      "legendFormat": "1m",
      "expr": "node_load1",
      "refId": "A"
    },
    {
      "legendFormat": "5m",
      "expr": "node_load5",
      "refId": "B"
    },
    {
      "legendFormat": "15m",
      "expr": "node_load15",
      "refId": "C"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "custom": { "lineWidth": 2, "fillOpacity": 10 }
    }
  }
},
{
  "title": "Network Throughput (sum of host interfaces)",
  "description": "5-minute receive and transmit byte rates, summed over every interface whose device name does not match lo, veth*, docker* or br-*.",
  "type": "timeseries",
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "gridPos": { "h": 8, "w": 12, "x": 12, "y": 30 },
  "fieldConfig": {
    "defaults": {
      "unit": "Bps",
      "custom": { "fillOpacity": 10, "lineWidth": 2 }
    }
  },
  "targets": [
    {
      "refId": "A",
      "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|veth.*|docker.*|br-.*\"}[5m]))",
      "legendFormat": "RX"
    },
    {
      "refId": "B",
      "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|docker.*|br-.*\"}[5m]))",
      "legendFormat": "TX"
    }
  ]
}
]
}