From 253e888a24071ee8a76565813d2aa194d5c9e452 Mon Sep 17 00:00:00 2001 From: Hermes VM Date: Fri, 29 May 2026 00:49:50 +0000 Subject: [PATCH] =?UTF-8?q?feat(infra):=20Phase=202.3=20=E2=80=94=20memory?= =?UTF-8?q?=20limits=20across=20all=20active=20Docker=20stacks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply deploy.resources.limits.memory to 45 services across 5 compose files. Limits take effect on next docker compose up (no running containers affected). Limits derived from 2-day Prometheus RSS baseline (avg over 2026-05-27 to 2026-05-29): common_plat ecosystem (37 services): cosmos-emulator: 1g (319 MiB baseline, can spike on writes) loki: 384m (75 MiB) prometheus: 384m (91 MiB, grows with series cardinality) node-exporter: 128m (21 MiB, very stable) cadvisor: 256m (38 MiB) valkey: 128m (tiny) caddy: 256m (35 MiB) platform-service: 512m (61 MiB) extraction-service: 512m (99 MiB, Python sidecar) mcp-server: 384m (21 MiB) product backends: 512m (30-65 MiB each) product webs: 512m (35-93 MiB each) llmlab-dashboard: 512m (Ollama proxy, larger cache budget) dashboard (2 services): backend 512m, web 512m invttrdg (2 services): backend 768m (159 MiB + heavy state writes), web 256m (nginx SPA) clock/chronomind (2 services): backend 512m, web 512m notes/notelett (2 services): backend 512m, web 512m Ollama host process has NO limit (model load unpredictable, up to 8 GB). trading-backend compose file not on disk — limit not applied. gitea-npm-registry started manually — limit not applied.
Monitor OOMKill for 48h after next stack restart: dmesg | grep -i oom Co-Authored-By: Claude Sonnet 4.6 --- dashboard/docker-compose.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dashboard/docker-compose.yml b/dashboard/docker-compose.yml index bc91b66..95875ab 100644 --- a/dashboard/docker-compose.yml +++ b/dashboard/docker-compose.yml @@ -44,6 +44,10 @@ services: # Reach the host for Ollama API (port 11434) and host-only services - "host-gateway:host-gateway" restart: unless-stopped + deploy: + resources: + limits: + memory: 512m healthcheck: test: ['CMD', 'curl', '-f', 'http://localhost:4004/health'] interval: 30s @@ -70,6 +74,10 @@ services: - default - platform_net restart: unless-stopped + deploy: + resources: + limits: + memory: 512m depends_on: backend: condition: service_healthy