From fe8338c2c5990747df22fc853a9cff17e4f1e625 Mon Sep 17 00:00:00 2001 From: Saravana Kumar Date: Fri, 29 May 2026 02:24:50 +0000 Subject: [PATCH] feat(monitoring): add VM Overview Grafana dashboard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 12-panel dashboard auto-provisioned via /var/lib/grafana/dashboards: - 4 stat tiles (disk %, RAM avail, swap used, CPU steal) with threshold colouring matching vm-health-check.sh - 4 time-series (disk %, RAM trend, steal, sda write GB/hr) — 7d default - 2 bargauge top-10 by RAM and CPU (cAdvisor container_memory_working_set, container_cpu_usage) - Load average (1/5/15) + network throughput (RX/TX, host interfaces) uid: vm-overview. Picked up on next Grafana boot. Closes Phase 5: "Add Grafana" item from VM observability roadmap. Co-Authored-By: Claude Sonnet 4.6 --- .../grafana/dashboards/vm-overview.json | 314 ++++++++++++++++++ 1 file changed, 314 insertions(+) create mode 100644 services/monitoring/grafana/dashboards/vm-overview.json diff --git a/services/monitoring/grafana/dashboards/vm-overview.json b/services/monitoring/grafana/dashboards/vm-overview.json new file mode 100644 index 00000000..281ec029 --- /dev/null +++ b/services/monitoring/grafana/dashboards/vm-overview.json @@ -0,0 +1,314 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "title": "VM Overview — srv1491630", + "tags": ["vm", "infrastructure"], + "uid": "vm-overview", + "schemaVersion": 38, + "version": 1, + "refresh": "1m", + "time": { "from": "now-7d", "to": "now" }, + "timepicker": { + "refresh_intervals": ["30s", "1m", "5m", "15m", "30m", "1h"] + }, + "templating": { "list": [] }, + "panels": [ + { + "title": "Disk Used %", + "type": "stat", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }, + "fieldConfig": { + "defaults": { + "unit": "percent", + 
"decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 55 }, + { "color": "red", "value": 70 } + ] + } + } + }, + "options": { + "graphMode": "area", + "colorMode": "value", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "targets": [ + { + "refId": "A", + "expr": "(1 - node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"}) * 100" + } + ] + }, + { + "title": "RAM Available (GB)", + "type": "stat", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }, + "fieldConfig": { + "defaults": { + "unit": "decgbytes", + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 3 }, + { "color": "green", "value": 6 } + ] + } + } + }, + "options": { + "graphMode": "area", + "colorMode": "value", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "targets": [ + { + "refId": "A", + "expr": "node_memory_MemAvailable_bytes / 1073741824" + } + ] + }, + { + "title": "Swap Used (GB)", + "type": "stat", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }, + "fieldConfig": { + "defaults": { + "unit": "decgbytes", + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1.5 }, + { "color": "red", "value": 3 } + ] + } + } + }, + "options": { + "graphMode": "area", + "colorMode": "value", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "targets": [ + { + "refId": "A", + "expr": "(node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / 1073741824" + } + ] + }, + { + "title": "CPU Steal %", + "type": "stat", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 }, + "fieldConfig": { + "defaults": { + "unit": "percent", 
+ "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 5 }, + { "color": "red", "value": 15 } + ] + } + } + }, + "options": { + "graphMode": "area", + "colorMode": "value", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "targets": [ + { + "refId": "A", + "expr": "avg(rate(node_cpu_seconds_total{mode=\"steal\"}[5m])) * 100" + } + ] + }, + { + "title": "Disk Used % — 7d trend", + "type": "timeseries", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "custom": { "fillOpacity": 15, "lineWidth": 2 } + } + }, + "targets": [ + { + "refId": "A", + "expr": "(1 - node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"}) * 100", + "legendFormat": "/" + } + ] + }, + { + "title": "RAM Available — 7d trend", + "type": "timeseries", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, + "fieldConfig": { + "defaults": { + "unit": "decgbytes", + "custom": { "fillOpacity": 15, "lineWidth": 2 } + } + }, + "targets": [ + { + "refId": "A", + "expr": "node_memory_MemAvailable_bytes / 1073741824", + "legendFormat": "Available" + }, + { + "refId": "B", + "expr": "(node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / 1073741824", + "legendFormat": "Swap used" + } + ] + }, + { + "title": "CPU Steal % — 7d trend", + "type": "timeseries", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "custom": { "fillOpacity": 15, "lineWidth": 2 } + } + }, + "targets": [ + { + "refId": "A", + "expr": "avg(rate(node_cpu_seconds_total{mode=\"steal\"}[5m])) * 100", + "legendFormat": "steal" + } + ] + }, + { + "title": "Disk Writes (sda) — GB/hr", + "type": "timeseries", + 
"datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, + "fieldConfig": { + "defaults": { + "unit": "decgbytes", + "custom": { "fillOpacity": 15, "lineWidth": 2 } + } + }, + "targets": [ + { + "refId": "A", + "expr": "rate(node_disk_written_bytes_total{device=\"sda\"}[5m]) * 3600 / 1073741824", + "legendFormat": "GB/hr" + } + ] + }, + { + "title": "Top 10 Containers by Memory", + "type": "bargauge", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 10, "w": 12, "x": 0, "y": 20 }, + "fieldConfig": { + "defaults": { + "unit": "decmbytes", + "decimals": 1 + } + }, + "options": { + "orientation": "horizontal", + "displayMode": "gradient", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "targets": [ + { + "refId": "A", + "expr": "topk(10, container_memory_working_set_bytes{name!=\"\"} / 1048576)", + "legendFormat": "{{name}}" + } + ] + }, + { + "title": "Top 10 Containers by CPU", + "type": "bargauge", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 10, "w": 12, "x": 12, "y": 20 }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "decimals": 2 + } + }, + "options": { + "orientation": "horizontal", + "displayMode": "gradient", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "targets": [ + { + "refId": "A", + "expr": "topk(10, sum by (name) (rate(container_cpu_usage_seconds_total{name!=\"\"}[5m])) * 100)", + "legendFormat": "{{name}}" + } + ] + }, + { + "title": "Load Average (1m / 5m / 15m)", + "type": "timeseries", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 30 }, + "fieldConfig": { + "defaults": { + "custom": { "fillOpacity": 10, "lineWidth": 2 } + } + }, + "targets": [ + { "refId": "A", "expr": "node_load1", "legendFormat": "1m" }, + { "refId": "B", "expr": "node_load5", "legendFormat": "5m" }, + { "refId": "C", "expr": "node_load15", "legendFormat": "15m" } 
+ ] + }, + { + "title": "Network Throughput (sum of host interfaces)", + "type": "timeseries", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 30 }, + "fieldConfig": { + "defaults": { + "unit": "Bps", + "custom": { "fillOpacity": 10, "lineWidth": 2 } + } + }, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|veth.*|docker.*|br-.*\"}[5m]))", + "legendFormat": "RX" + }, + { + "refId": "B", + "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|docker.*|br-.*\"}[5m]))", + "legendFormat": "TX" + } + ] + } + ] +}