From b8661392c66c987999947909acd57845c4beeb09 Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 31 Mar 2026 06:57:12 +0000
Subject: [PATCH] feat(observability): add phase 2 monitoring and valkey services

---
Reviewer notes (text between the '---' line and the first 'diff --git' is not
part of the commit message and is not applied):

- What the patch adds: four opt-in Compose services behind profiles
  (prometheus, node-exporter, cadvisor under 'phase2-observability'; valkey
  under 'phase2-shared'), two named volumes, a Prometheus scrape config for
  the three metrics endpoints, a Grafana datasource that points at
  http://prometheus:9090, and a README section describing the profiles.
  None of the new services publishes a host port.
- Line structure was restored from a copy in which every newline had been
  collapsed to a space. Indentation was rebuilt from the hunk line counts and
  from the YAML/Markdown structure. Spacing *inside* context lines (the
  "Port Map" comment in particular) could not be recovered and is a best
  guess; if a hunk is rejected, retry with `git apply --ignore-whitespace`
  or regenerate the patch from the commit.
- cadvisor: /var/run is now mounted read-only (was rw), matching the upstream
  cAdvisor quick-start. The blob ids on the 'index' lines still describe the
  original commit.

 docker-compose.ecosystem.yml                  | 83 +++++++++++++++++++
 docs/devops/single_azure_vm/docker/README.md  | 20 +++++
 .../provisioning/datasources/prometheus.yml   |  8 ++
 services/monitoring/prometheus/prometheus.yml | 19 +++++
 4 files changed, 130 insertions(+)
 create mode 100644 services/monitoring/grafana/provisioning/datasources/prometheus.yml
 create mode 100644 services/monitoring/prometheus/prometheus.yml

diff --git a/docker-compose.ecosystem.yml b/docker-compose.ecosystem.yml
index cb1a8f7a..7afa7205 100644
--- a/docker-compose.ecosystem.yml
+++ b/docker-compose.ecosystem.yml
@@ -16,6 +16,7 @@
 # Port Map:
 # Infrastructure: cosmos-emulator 8081, azurite 10000, mailpit 1025/8025,
 # loki 3100, grafana 3000, traefik 80/8080
+# Phase 2 (opt-in): prometheus 9090, valkey 6379
 # Platform: platform-service 4003, extraction-service 4005, mcp-server 4007
 # Dashboards: admin-web 3001, tracker-web 3003
 # Products: peakpulse 4010, chronomind 4011, jarvisjr 4012,
@@ -124,6 +125,86 @@ services:
       retries: 3
     restart: unless-stopped
 
+  prometheus:
+    image: prom/prometheus:v3.5.0
+    profiles:
+      - phase2-observability
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--web.enable-lifecycle'
+    volumes:
+      - ./services/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - prometheus-data:/prometheus
+    depends_on:
+      node-exporter:
+        condition: service_started
+      cadvisor:
+        condition: service_started
+    healthcheck:
+      test: ['CMD', 'wget', '-q', '--spider', 'http://127.0.0.1:9090/-/healthy']
+      interval: 15s
+      timeout: 5s
+      retries: 3
+    restart: unless-stopped
+
+  node-exporter:
+    image: prom/node-exporter:v1.9.1
+    profiles:
+      - phase2-observability
+    command:
+      - '--path.rootfs=/host'
+      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($|/)'
+    volumes:
+      - /:/host:ro,rslave
+    healthcheck:
+      test: ['CMD', 'wget', '-q', '--spider', 'http://127.0.0.1:9100/metrics']
+      interval: 15s
+      timeout: 5s
+      retries: 3
+    restart: unless-stopped
+
+  cadvisor:
+    image: gcr.io/cadvisor/cadvisor:v0.49.1
+    profiles:
+      - phase2-observability
+    privileged: true
+    command:
+      - '--housekeeping_interval=30s'
+      - '--docker_only=true'
+    volumes:
+      - /:/rootfs:ro
+      - /var/run:/var/run:ro
+      - /sys:/sys:ro
+      - /var/lib/docker:/var/lib/docker:ro
+      - /dev/disk:/dev/disk:ro
+    healthcheck:
+      test: ['CMD', 'wget', '-q', '--spider', 'http://127.0.0.1:8080/healthz']
+      interval: 15s
+      timeout: 5s
+      retries: 3
+    restart: unless-stopped
+
+  valkey:
+    image: valkey/valkey:8-alpine
+    profiles:
+      - phase2-shared
+    command:
+      - valkey-server
+      - --save
+      - '60'
+      - '1'
+      - --loglevel
+      - warning
+    volumes:
+      - valkey-data:/data
+    healthcheck:
+      test: ['CMD', 'valkey-cli', 'ping']
+      interval: 10s
+      timeout: 5s
+      retries: 5
+    restart: unless-stopped
+
   gateway:
     image: traefik:v3.3
     profiles:
@@ -825,6 +906,8 @@ volumes:
   azurite-data:
   loki-data:
   grafana-data:
+  prometheus-data:
+  valkey-data:
   localmemgpt-data:
   caddy-data:
   caddy-config:
diff --git a/docs/devops/single_azure_vm/docker/README.md b/docs/devops/single_azure_vm/docker/README.md
index d57d0651..b60d9d0f 100644
--- a/docs/devops/single_azure_vm/docker/README.md
+++ b/docs/devops/single_azure_vm/docker/README.md
@@ -158,6 +158,26 @@ docker compose -f /opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem
 docker compose -f /opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml down -v
 ```
 
+### Optional Phase 2 profiles
+
+The compose file now includes opt-in profiles for the next internal-only infrastructure additions:
+
+```bash
+# Metrics stack: Prometheus + node-exporter + cadvisor
+docker compose -f /opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml \
+  --profile phase2-observability up -d prometheus node-exporter cadvisor
+
+# Shared cache/pubsub layer: Valkey
+docker compose -f /opt/bytelyst/learning_ai_common_plat/docker-compose.ecosystem.yml \
+  --profile phase2-shared up -d valkey
+```
+
+Notes:
+
+- these services are intended to stay internal-only on the VM
+- `prometheus` is provisioned for Grafana automatically through the Grafana datasource directory
+- neither `prometheus` nor `valkey` needs a raw public port exposure for normal operation
+
 ## Environment Variables
 
 All optional — defaults work for most setups:
diff --git a/services/monitoring/grafana/provisioning/datasources/prometheus.yml b/services/monitoring/grafana/provisioning/datasources/prometheus.yml
new file mode 100644
index 00000000..c59df9f4
--- /dev/null
+++ b/services/monitoring/grafana/provisioning/datasources/prometheus.yml
@@ -0,0 +1,8 @@
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    type: prometheus
+    access: proxy
+    url: http://prometheus:9090
+    editable: false
diff --git a/services/monitoring/prometheus/prometheus.yml b/services/monitoring/prometheus/prometheus.yml
new file mode 100644
index 00000000..3463ddcf
--- /dev/null
+++ b/services/monitoring/prometheus/prometheus.yml
@@ -0,0 +1,19 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+scrape_configs:
+  - job_name: prometheus
+    static_configs:
+      - targets:
+          - prometheus:9090
+
+  - job_name: node-exporter
+    static_configs:
+      - targets:
+          - node-exporter:9100
+
+  - job_name: cadvisor
+    static_configs:
+      - targets:
+          - cadvisor:8080