feat(observability): add phase 2 monitoring and valkey services
This commit is contained in:
parent
d4d8c48a4c
commit
b8661392c6
@ -16,6 +16,7 @@
|
||||
# Port Map:
|
||||
# Infrastructure: cosmos-emulator 8081, azurite 10000, mailpit 1025/8025,
|
||||
# loki 3100, grafana 3000, traefik 80/8080
|
||||
# Phase 2 (opt-in): prometheus 9090, valkey 6379
|
||||
# Platform: platform-service 4003, extraction-service 4005, mcp-server 4007
|
||||
# Dashboards: admin-web 3001, tracker-web 3003
|
||||
# Products: peakpulse 4010, chronomind 4011, jarvisjr 4012,
|
||||
@ -124,6 +125,86 @@ services:
|
||||
retries: 3
|
||||
restart: unless-stopped
|
||||
|
||||
prometheus:
  # Prometheus server; only started when the phase2-observability profile
  # is enabled.
  image: "prom/prometheus:v3.5.0"
  profiles: [phase2-observability]
  restart: unless-stopped
  # Wait for both exporter containers to be started first.
  depends_on:
    node-exporter: {condition: service_started}
    cadvisor: {condition: service_started}
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    - "--storage.tsdb.path=/prometheus"
    - "--web.enable-lifecycle"
  volumes:
    # Config file from the repo, mounted read-only at the path passed to
    # --config.file.
    - "./services/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro"
    # Named volume mounted at the path passed to --storage.tsdb.path.
    - "prometheus-data:/prometheus"
  # Probe the server's own /-/healthy endpoint from inside the container.
  healthcheck:
    test:
      - "CMD"
      - "wget"
      - "-q"
      - "--spider"
      - "http://127.0.0.1:9090/-/healthy"
    interval: 15s
    timeout: 5s
    retries: 3
|
||||
|
||||
node-exporter:
  # Node exporter; only started when the phase2-observability profile is
  # enabled.
  image: prom/node-exporter:v1.9.1
  profiles:
    - phase2-observability
  command:
    # The host root filesystem is bind-mounted at /host (see volumes).
    - '--path.rootfs=/host'
    # `$$` is the Compose escape for a literal `$`, so the exporter still
    # receives the regex `^/(sys|proc|dev|host|etc)($|/)`. A bare `$` is
    # Compose's variable-interpolation sigil and must not be relied on.
    - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
  volumes:
    # Read-only view of the host root.
    - /:/host:ro,rslave
  # Probe the exporter's metrics endpoint from inside the container.
  healthcheck:
    test: ['CMD', 'wget', '-q', '--spider', 'http://127.0.0.1:9100/metrics']
    interval: 15s
    timeout: 5s
    retries: 3
  restart: unless-stopped
|
||||
|
||||
cadvisor:
  # cAdvisor; only started when the phase2-observability profile is
  # enabled.
  image: gcr.io/cadvisor/cadvisor:v0.49.1
  profiles:
    - phase2-observability
  privileged: true
  command:
    - '--housekeeping_interval=30s'
    - '--docker_only=true'
  volumes:
    # All host paths are mounted read-only; the container is already
    # privileged, so no mount is given more access than it needs.
    - /:/rootfs:ro
    # Was `rw`; read-only matches the other mounts here.
    - /var/run:/var/run:ro
    - /sys:/sys:ro
    - /var/lib/docker:/var/lib/docker:ro
    - /dev/disk:/dev/disk:ro
  # Probe the /healthz endpoint from inside the container.
  healthcheck:
    test: ['CMD', 'wget', '-q', '--spider', 'http://127.0.0.1:8080/healthz']
    interval: 15s
    timeout: 5s
    retries: 3
  restart: unless-stopped
|
||||
|
||||
valkey:
  # Valkey server; only started when the phase2-shared profile is enabled.
  image: "valkey/valkey:8-alpine"
  profiles: [phase2-shared]
  restart: unless-stopped
  # Runs `valkey-server --save 60 1 --loglevel warning`; the numeric
  # arguments stay quoted so they are passed as strings.
  command: ["valkey-server", "--save", "60", "1", "--loglevel", "warning"]
  volumes:
    # Named volume mounted at /data.
    - "valkey-data:/data"
  # Health is determined by running `valkey-cli ping` in the container.
  healthcheck:
    test:
      - "CMD"
      - "valkey-cli"
      - "ping"
    interval: 10s
    timeout: 5s
    retries: 5
|
||||
|
||||
gateway:
|
||||
image: traefik:v3.3
|
||||
profiles:
|
||||
@ -825,6 +906,8 @@ volumes:
|
||||
azurite-data:
|
||||
loki-data:
|
||||
grafana-data:
|
||||
prometheus-data:
|
||||
valkey-data:
|
||||
localmemgpt-data:
|
||||
caddy-data:
|
||||
caddy-config:
|
||||
|
||||
@ -158,6 +158,26 @@ docker compose -f /opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem
|
||||
docker compose -f /opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml down -v
|
||||
```
|
||||
|
||||
### Optional Phase 2 profiles
|
||||
|
||||
The compose file includes opt-in profiles for the Phase 2 internal-only infrastructure services:
|
||||
|
||||
```bash
|
||||
# Metrics stack: Prometheus + node-exporter + cadvisor
|
||||
docker compose -f /opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml \
|
||||
--profile phase2-observability up -d prometheus node-exporter cadvisor
|
||||
|
||||
# Shared cache/pubsub layer: Valkey
|
||||
docker compose -f /opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml \
|
||||
--profile phase2-shared up -d valkey
|
||||
```
|
||||
|
||||
Notes:
|
||||
|
||||
- these services are intended to stay internal-only on the VM
|
||||
- `prometheus` is provisioned for Grafana automatically through the Grafana datasource directory
|
||||
- neither `prometheus` nor `valkey` needs a publicly exposed port for normal operation
|
||||
|
||||
## Environment Variables
|
||||
|
||||
All optional — defaults work for most setups:
|
||||
|
||||
@ -0,0 +1,8 @@
|
||||
# Grafana datasource provisioning: declares one non-editable datasource
# named "Prometheus" that points at the prometheus service on port 9090.
apiVersion: 1

datasources:
  - type: prometheus
    name: Prometheus
    url: "http://prometheus:9090"
    access: proxy
    editable: false
|
||||
19
services/monitoring/prometheus/prometheus.yml
Normal file
19
services/monitoring/prometheus/prometheus.yml
Normal file
@ -0,0 +1,19 @@
|
||||
# Prometheus configuration: 15s scrape and evaluation intervals, with one
# static scrape job per monitoring container.
global:
  evaluation_interval: 15s
  scrape_interval: 15s

scrape_configs:
  # Prometheus scraping its own endpoint.
  - job_name: prometheus
    static_configs:
      - targets: ["prometheus:9090"]

  # node-exporter container on port 9100.
  - job_name: node-exporter
    static_configs:
      - targets: ["node-exporter:9100"]

  # cadvisor container on port 8080.
  - job_name: cadvisor
    static_configs:
      - targets: ["cadvisor:8080"]
|
||||
Loading…
Reference in New Issue
Block a user