feat(observability): add phase 2 monitoring and valkey services
This commit is contained in:
parent
d4d8c48a4c
commit
b8661392c6
@ -16,6 +16,7 @@
|
|||||||
# Port Map:
|
# Port Map:
|
||||||
# Infrastructure: cosmos-emulator 8081, azurite 10000, mailpit 1025/8025,
|
# Infrastructure: cosmos-emulator 8081, azurite 10000, mailpit 1025/8025,
|
||||||
# loki 3100, grafana 3000, traefik 80/8080
|
# loki 3100, grafana 3000, traefik 80/8080
|
||||||
|
# Phase 2 (opt-in, container ports only — not published to the host): prometheus 9090, valkey 6379
|
||||||
# Platform: platform-service 4003, extraction-service 4005, mcp-server 4007
|
# Platform: platform-service 4003, extraction-service 4005, mcp-server 4007
|
||||||
# Dashboards: admin-web 3001, tracker-web 3003
|
# Dashboards: admin-web 3001, tracker-web 3003
|
||||||
# Products: peakpulse 4010, chronomind 4011, jarvisjr 4012,
|
# Products: peakpulse 4010, chronomind 4011, jarvisjr 4012,
|
||||||
@ -124,6 +125,86 @@ services:
|
|||||||
retries: 3
|
retries: 3
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
|
# Prometheus metrics server. Only started when the phase2-observability
# profile is requested; no host port is published for it here.
prometheus:
  image: 'prom/prometheus:v3.5.0'
  profiles: [phase2-observability]
  command:
    - '--config.file=/etc/prometheus/prometheus.yml'
    # Must match the container path of the prometheus-data volume below.
    - '--storage.tsdb.path=/prometheus'
    # Turns on the HTTP lifecycle endpoints (e.g. /-/reload).
    - '--web.enable-lifecycle'
  volumes:
    # Scrape configuration, mounted read-only from the repository checkout.
    - './services/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro'
    # Named volume that holds the time-series database.
    - 'prometheus-data:/prometheus'
  depends_on:
    # Both exporters only have to be started (not healthy) beforehand.
    node-exporter: {condition: service_started}
    cadvisor: {condition: service_started}
  healthcheck:
    # Probes the health endpoint from inside the container.
    test:
      - CMD
      - wget
      - '-q'
      - '--spider'
      - 'http://127.0.0.1:9090/-/healthy'
    interval: 15s
    timeout: 5s
    retries: 3
  restart: unless-stopped
|
||||||
|
|
||||||
|
# Host-level metrics exporter for the opt-in phase2-observability profile.
node-exporter:
  image: prom/node-exporter:v1.9.1
  profiles:
    - phase2-observability
  command:
    # The host root filesystem is mounted at /host (see volumes below).
    - '--path.rootfs=/host'
    # `$$` is the Compose escape for a literal `$`. A bare `$` goes through
    # variable interpolation and is rejected as an invalid interpolation by
    # some Compose implementations. The exporter still receives `($|/)`.
    - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
  volumes:
    # Read-only view of the host root with rslave mount propagation.
    - '/:/host:ro,rslave'
  healthcheck:
    # Succeeds when the exporter serves its metrics page inside the container.
    test: ['CMD', 'wget', '-q', '--spider', 'http://127.0.0.1:9100/metrics']
    interval: 15s
    timeout: 5s
    retries: 3
  restart: unless-stopped
|
||||||
|
|
||||||
|
# Per-container resource metrics for the opt-in phase2-observability profile.
cadvisor:
  image: gcr.io/cadvisor/cadvisor:v0.49.1
  profiles:
    - phase2-observability
  # Runs privileged; keep this service internal-only.
  privileged: true
  command:
    - '--housekeeping_interval=30s'
    - '--docker_only=true'
  volumes:
    - '/:/rootfs:ro'
    # Read-only like every other host mount in this service; upstream's
    # container example also mounts /var/run read-only.
    - '/var/run:/var/run:ro'
    - '/sys:/sys:ro'
    - '/var/lib/docker:/var/lib/docker:ro'
    - '/dev/disk:/dev/disk:ro'
  healthcheck:
    # Probes the health endpoint from inside the container.
    test: ['CMD', 'wget', '-q', '--spider', 'http://127.0.0.1:8080/healthz']
    interval: 15s
    timeout: 5s
    retries: 3
  restart: unless-stopped
|
||||||
|
|
||||||
|
# Valkey instance for the opt-in phase2-shared profile. No host port is
# published for it here.
valkey:
  image: 'valkey/valkey:8-alpine'
  profiles: [phase2-shared]
  command:
    - 'valkey-server'
    # Snapshot rule passed as `--save 60 1`; snapshots land in /data,
    # which is backed by the valkey-data volume below.
    - '--save'
    - '60'
    - '1'
    - '--loglevel'
    - 'warning'
  volumes:
    - 'valkey-data:/data'
  healthcheck:
    # Runs `valkey-cli ping` inside the container.
    test:
      - CMD
      - valkey-cli
      - ping
    interval: 10s
    timeout: 5s
    retries: 5
  restart: unless-stopped
|
||||||
|
|
||||||
gateway:
|
gateway:
|
||||||
image: traefik:v3.3
|
image: traefik:v3.3
|
||||||
profiles:
|
profiles:
|
||||||
@ -825,6 +906,8 @@ volumes:
|
|||||||
azurite-data:
|
azurite-data:
|
||||||
loki-data:
|
loki-data:
|
||||||
grafana-data:
|
grafana-data:
|
||||||
|
prometheus-data:
|
||||||
|
valkey-data:
|
||||||
localmemgpt-data:
|
localmemgpt-data:
|
||||||
caddy-data:
|
caddy-data:
|
||||||
caddy-config:
|
caddy-config:
|
||||||
|
|||||||
@ -158,6 +158,26 @@ docker compose -f /opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem
|
|||||||
docker compose -f /opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml down -v
|
docker compose -f /opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml down -v
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Optional Phase 2 profiles
|
||||||
|
|
||||||
|
The compose file includes two opt-in profiles for internal-only infrastructure; the services in each profile are started only when that profile is requested explicitly:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Metrics stack: Prometheus + node-exporter + cadvisor
|
||||||
|
docker compose -f /opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml \
|
||||||
|
--profile phase2-observability up -d prometheus node-exporter cadvisor
|
||||||
|
|
||||||
|
# Shared cache/pubsub layer: Valkey
|
||||||
|
docker compose -f /opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml \
|
||||||
|
--profile phase2-shared up -d valkey
|
||||||
|
```
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
|
||||||
|
- these services are intended to stay internal-only on the VM
|
||||||
|
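- `prometheus` is provisioned for Grafana automatically through the Grafana datasource directory; the datasource points at `http://prometheus:9090`, so it only resolves while the `phase2-observability` profile is running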
|
||||||
|
- neither `prometheus` nor `valkey` needs a raw public port exposure for normal operation
|
||||||
|
|
||||||
## Environment Variables
|
## Environment Variables
|
||||||
|
|
||||||
All optional — defaults work for most setups:
|
All optional — defaults work for most setups:
|
||||||
|
|||||||
@ -0,0 +1,8 @@
|
|||||||
|
# Grafana datasource provisioning: registers the Prometheus service.
apiVersion: 1

datasources:
  - name: 'Prometheus'
    type: 'prometheus'
    # Queries are proxied by the Grafana backend to the URL below.
    access: 'proxy'
    # Addressed by hostname `prometheus` on port 9090.
    url: 'http://prometheus:9090'
    # Not editable from the Grafana UI; change it in this file instead.
    editable: false
|
||||||
19
services/monitoring/prometheus/prometheus.yml
Normal file
19
services/monitoring/prometheus/prometheus.yml
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
# Prometheus configuration for the phase 2 metrics stack.
global:
  # Defaults applied to every job below unless a job overrides them.
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # One job per target; each target is addressed as host:port.
  - job_name: prometheus
    static_configs:
      - targets: ['prometheus:9090']

  - job_name: node-exporter
    static_configs:
      - targets: ['node-exporter:9100']

  - job_name: cadvisor
    static_configs:
      - targets: ['cadvisor:8080']
|
||||||
Loading…
Reference in New Issue
Block a user