feat(observability): add phase 2 monitoring and valkey services

This commit is contained in:
root 2026-03-31 06:57:12 +00:00
parent d4d8c48a4c
commit b8661392c6
4 changed files with 130 additions and 0 deletions

View File

@ -16,6 +16,7 @@
# Port Map:
# Infrastructure: cosmos-emulator 8081, azurite 10000, mailpit 1025/8025,
# loki 3100, grafana 3000, traefik 80/8080
# Phase 2 (opt-in): prometheus 9090, valkey 6379
# Platform: platform-service 4003, extraction-service 4005, mcp-server 4007
# Dashboards: admin-web 3001, tracker-web 3003
# Products: peakpulse 4010, chronomind 4011, jarvisjr 4012,
@ -124,6 +125,86 @@ services:
retries: 3
restart: unless-stopped
# Prometheus server, opt-in through the phase2-observability profile.
# No host port is published; the health probe targets 9090 inside the container.
prometheus:
  image: prom/prometheus:v3.5.0
  profiles: [phase2-observability]
  restart: unless-stopped
  # Start-order only: both exporters are started first, but Prometheus
  # does not wait for their health checks to pass.
  depends_on:
    node-exporter:
      condition: service_started
    cadvisor:
      condition: service_started
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    - "--storage.tsdb.path=/prometheus"
    # Turns on the HTTP lifecycle endpoints (e.g. config reload).
    - "--web.enable-lifecycle"
  volumes:
    # Scrape configuration, mounted read-only.
    - "./services/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro"
    # Named volume backing the TSDB path given above.
    - "prometheus-data:/prometheus"
  healthcheck:
    test:
      - CMD
      - wget
      - "-q"
      - "--spider"
      - "http://127.0.0.1:9090/-/healthy"
    interval: 15s
    timeout: 5s
    retries: 3
# Host metrics exporter, opt-in through the phase2-observability profile.
node-exporter:
  image: prom/node-exporter:v1.9.1
  profiles:
    - phase2-observability
  command:
    # The host root filesystem is mounted at /host (see volumes below).
    - '--path.rootfs=/host'
    # `$$` is Compose's escape for a literal `$`. Without it, Compose treats
    # the `$` in the regex as the start of a variable interpolation. The
    # exporter still receives `($|/)`.
    - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
  volumes:
    # Read-only view of the host root; rslave propagates host mounts into it.
    - '/:/host:ro,rslave'
  healthcheck:
    # Probes the metrics endpoint on 9100 from inside the container.
    test: ['CMD', 'wget', '-q', '--spider', 'http://127.0.0.1:9100/metrics']
    interval: 15s
    timeout: 5s
    retries: 3
  restart: unless-stopped
# Container metrics exporter, opt-in through the phase2-observability profile.
cadvisor:
  image: gcr.io/cadvisor/cadvisor:v0.49.1
  profiles:
    - phase2-observability
  # NOTE(review): privileged grants broad host access; keep this service
  # internal-only and do not publish its port.
  privileged: true
  command:
    - '--housekeeping_interval=30s'
    - '--docker_only=true'
  volumes:
    - '/:/rootfs:ro'
    # Read-only, matching the upstream cAdvisor run instructions and the
    # other host mounts in this block.
    - '/var/run:/var/run:ro'
    - '/sys:/sys:ro'
    - '/var/lib/docker:/var/lib/docker:ro'
    - '/dev/disk:/dev/disk:ro'
  healthcheck:
    # Probes the health endpoint on 8080 from inside the container.
    test: ['CMD', 'wget', '-q', '--spider', 'http://127.0.0.1:8080/healthz']
    interval: 15s
    timeout: 5s
    retries: 3
  restart: unless-stopped
# Valkey server, opt-in through the phase2-shared profile.
# No host port is published by this block.
valkey:
  image: valkey/valkey:8-alpine
  profiles: [phase2-shared]
  restart: unless-stopped
  # Arguments are kept as strings: `--save 60 1` and `--loglevel warning`.
  command: ['valkey-server', '--save', '60', '1', '--loglevel', 'warning']
  volumes:
    # Named volume mounted at /data.
    - 'valkey-data:/data'
  healthcheck:
    test:
      - CMD
      - valkey-cli
      - ping
    interval: 10s
    timeout: 5s
    retries: 5
gateway:
image: traefik:v3.3
profiles:
@ -825,6 +906,8 @@ volumes:
azurite-data:
loki-data:
grafana-data:
prometheus-data:
valkey-data:
localmemgpt-data:
caddy-data:
caddy-config:

View File

@ -158,6 +158,26 @@ docker compose -f /opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem
docker compose -f /opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml down -v
```
### Optional Phase 2 profiles
The compose file includes opt-in profiles for the Phase 2 infrastructure services, all of which are internal-only:
```bash
# Metrics stack: Prometheus + node-exporter + cadvisor
docker compose -f /opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml \
--profile phase2-observability up -d prometheus node-exporter cadvisor
# Shared cache/pubsub layer: Valkey
docker compose -f /opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml \
--profile phase2-shared up -d valkey
```
Notes:
- These services are intended to stay internal-only on the VM.
- `prometheus` is provisioned as a Grafana datasource automatically through the Grafana datasource provisioning directory.
- Neither `prometheus` nor `valkey` needs a published host port for normal operation.
## Environment Variables
All optional — defaults work for most setups:

View File

@ -0,0 +1,8 @@
# Grafana datasource provisioning: registers the Prometheus server.
apiVersion: 1
datasources:
  - name: 'Prometheus'
    type: 'prometheus'
    access: 'proxy'
    # Addressed by service name on port 9090.
    url: 'http://prometheus:9090'
    # Not editable from the Grafana UI.
    editable: false

View File

@ -0,0 +1,19 @@
# Prometheus configuration: one static scrape job per metrics service.
global:
  # Both intervals apply to every job below unless a job overrides them.
  scrape_interval: '15s'
  evaluation_interval: '15s'
scrape_configs:
  # Prometheus scraping its own metrics endpoint.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['prometheus:9090']
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']