fix(infra): fix 5 bugs in setup.sh per-service build + resume logic

1. set -e + pipefail: docker compose up piped through tail would abort the
   script on a partial startup failure before printing the summary — now
   guarded with || true
2. Phase 7 marked done even with build failures, so --resume would skip
   it — now only marks done when all builds succeed
3. --phase=7 printed 'Phase 7 complete' even with failures — now exits
   with code 1 and points to build logs
4. docker compose config --format json called 30 times in build loop —
   now cached once (saves ~3s)
5. Build logs now saved per-service to STATE_DIR/builds/<svc>.log for
   post-failure debugging
This commit is contained in:
saravanakumardb1 2026-03-24 12:13:14 -07:00
parent 8ff9e42817
commit a9414218ba

View File

@ -543,11 +543,19 @@ setup_compose_env() {
}
#######################################
# Build a single compose service image.
# The build runs exactly once; its complete output (stdout + stderr) is
# captured in a per-service log file and only a short tail is echoed.
# Globals:   INSTALL_DIR, COMPOSE_FILE, STATE_DIR (read)
# Arguments: $1 - compose service name
# Outputs:   last 3 log lines on success, last 5 on failure (stdout);
#            full build log in ${STATE_DIR}/builds/<svc>.log
# Returns:   0 on success, 1 on failure
#######################################
build_one_service() {
  local svc="$1"
  local plat_dir="${INSTALL_DIR}/learning_ai_common_plat"
  local log_dir="${STATE_DIR}/builds"
  local log_file="${log_dir}/${svc}.log"

  # The redirection below fails if the log directory is missing, which would
  # be misreported as a build failure; create it up front (idempotent).
  mkdir -p -- "$log_dir" || return 1

  # Run the build inside the `if` condition so that a failing build is
  # handled here rather than tripping `set -e` / `pipefail` in the caller.
  if docker compose -f "${plat_dir}/${COMPOSE_FILE}" \
    --env-file "${plat_dir}/.env.ecosystem" \
    build "$svc" > "$log_file" 2>&1; then
    tail -n 3 "$log_file"
    return 0
  fi

  # Failure: show a little more context; the full log stays on disk.
  tail -n 5 "$log_file"
  return 1
}
phase7_deploy() {
@ -567,13 +575,19 @@ phase7_deploy() {
log " Building ${total} service images individually..."
echo ""
# Cache compose config JSON once (avoid calling config 30 times)
local compose_json
compose_json=$(docker compose -f "${plat_dir}/${COMPOSE_FILE}" \
--env-file "${plat_dir}/.env.ecosystem" config --format json 2>/dev/null || true)
for svc in "${all_services[@]}"; do
idx=$((idx + 1))
# Infrastructure services use pre-built images (no build step)
local has_build
has_build=$(docker compose -f "${plat_dir}/${COMPOSE_FILE}" config --format json 2>/dev/null \
| jq -r ".services.\"${svc}\".build // empty" 2>/dev/null || true)
local has_build=""
if [ -n "$compose_json" ]; then
has_build=$(echo "$compose_json" | jq -r ".services.\"${svc}\".build // empty" 2>/dev/null || true)
fi
if [ -z "$has_build" ] || [ "$has_build" = "null" ]; then
build_skip+=("$svc")
@ -611,15 +625,19 @@ phase7_deploy() {
fi
log " Starting ${#start_services[@]} services..."
# Use || true so set -e doesn't abort before we print the summary
docker compose \
-f "${plat_dir}/${COMPOSE_FILE}" \
--env-file "${plat_dir}/.env.ecosystem" \
up -d "${start_services[@]}" 2>&1 | tail -10
up -d "${start_services[@]}" 2>&1 | tail -10 || true
if [ ${#build_fail[@]} -gt 0 ]; then
# Signal to run_phase() that phase 7 should NOT be marked done
PHASE7_HAD_FAILURES=1
warn "Phase 7 complete with ${#build_fail[@]} failed builds: ${build_fail[*]}"
warn " Fix and re-run: sudo ./setup.sh --phase=7"
else
PHASE7_HAD_FAILURES=0
ok "Phase 7 complete. All ${#start_services[@]} services started."
fi
}
@ -711,6 +729,8 @@ HEALTH
# ═══════════════════════════════════════════════════════════════════════
# MAIN
# ═══════════════════════════════════════════════════════════════════════
PHASE7_HAD_FAILURES=0
run_phase() {
local phase_num="$1"
case "$phase_num" in
@ -724,6 +744,11 @@ run_phase() {
8) phase8_verify ;;
*) fail "Unknown phase: $phase_num" ;;
esac
# Don't mark phase 7 done if there were build failures (--resume should retry it)
if [ "$phase_num" -eq 7 ] && [ "$PHASE7_HAD_FAILURES" -eq 1 ]; then
warn "Phase 7 NOT marked done (build failures). --resume will retry it."
return
fi
mark_phase_done "$phase_num"
}
@ -811,6 +836,11 @@ main() {
log "Running ONLY phase ${only_phase}..."
restore_gitea_token
run_phase "$only_phase"
if [ "$only_phase" -eq 7 ] && [ "$PHASE7_HAD_FAILURES" -eq 1 ]; then
warn "Phase 7 finished with failures. Fix and re-run: sudo ./setup.sh --phase=7"
warn "Build logs: ${STATE_DIR}/builds/"
exit 1
fi
ok "Phase ${only_phase} complete."
exit 0
fi