fix(infra): fix 5 bugs in setup.sh per-service build + resume logic
1. set -e + pipefail: `docker compose up` piped through `tail` would abort the script on a partial startup failure, before the summary was printed. Added `|| true`.
2. Phase 7 was marked done even when builds failed, so `--resume` would skip it. It is now marked done only when all builds succeed.
3. `--phase=7` printed "Phase 7 complete" even when builds failed. It now exits with code 1 and points to the build logs.
4. `docker compose config --format json` was called 30 times inside the build loop. The output is now cached once (saves ~3s).
5. Build logs are now saved per service to STATE_DIR/builds/<svc>.log for post-failure debugging.
This commit is contained in:
parent
8ff9e42817
commit
a9414218ba
@ -543,11 +543,19 @@ setup_compose_env() {
|
||||
}
|
||||
|
||||
#######################################
# Build the image for a single compose service.
# The complete build output is written to ${STATE_DIR}/builds/<svc>.log so a
# failed build can be inspected afterwards; only the tail of it is echoed.
# Globals:   INSTALL_DIR, COMPOSE_FILE, STATE_DIR (read)
# Arguments: $1 - compose service name
# Outputs:   last 3 log lines on success, last 5 on failure (stdout)
# Returns:   0 if the build succeeded, 1 otherwise
#######################################
build_one_service() {
  local svc="$1"
  local plat_dir="${INSTALL_DIR}/learning_ai_common_plat"
  local log_file="${STATE_DIR}/builds/${svc}.log"

  # Without the log directory the redirect below fails and would be reported
  # as a build failure even though docker never ran.
  mkdir -p -- "${log_file%/*}" || return 1

  # Run the build exactly once, as the condition of `if`, so a non-zero exit
  # is handled here rather than tripping `set -e` / `pipefail`.
  if docker compose -f "${plat_dir}/${COMPOSE_FILE}" --env-file "${plat_dir}/.env.ecosystem" \
    build "$svc" >"$log_file" 2>&1; then
    tail -n 3 "$log_file"
    return 0
  fi

  tail -n 5 "$log_file"
  return 1
}
|
||||
|
||||
phase7_deploy() {
|
||||
@ -567,13 +575,19 @@ phase7_deploy() {
|
||||
log " Building ${total} service images individually..."
|
||||
echo ""
|
||||
|
||||
# Cache compose config JSON once (avoid calling config 30 times)
|
||||
local compose_json
|
||||
compose_json=$(docker compose -f "${plat_dir}/${COMPOSE_FILE}" \
|
||||
--env-file "${plat_dir}/.env.ecosystem" config --format json 2>/dev/null || true)
|
||||
|
||||
for svc in "${all_services[@]}"; do
|
||||
idx=$((idx + 1))
|
||||
|
||||
# Infrastructure services use pre-built images (no build step)
|
||||
local has_build
|
||||
has_build=$(docker compose -f "${plat_dir}/${COMPOSE_FILE}" config --format json 2>/dev/null \
|
||||
| jq -r ".services.\"${svc}\".build // empty" 2>/dev/null || true)
|
||||
local has_build=""
|
||||
if [ -n "$compose_json" ]; then
|
||||
has_build=$(echo "$compose_json" | jq -r ".services.\"${svc}\".build // empty" 2>/dev/null || true)
|
||||
fi
|
||||
|
||||
if [ -z "$has_build" ] || [ "$has_build" = "null" ]; then
|
||||
build_skip+=("$svc")
|
||||
@ -611,15 +625,19 @@ phase7_deploy() {
|
||||
fi
|
||||
|
||||
log " Starting ${#start_services[@]} services..."
|
||||
# Use || true so set -e doesn't abort before we print the summary
|
||||
docker compose \
|
||||
-f "${plat_dir}/${COMPOSE_FILE}" \
|
||||
--env-file "${plat_dir}/.env.ecosystem" \
|
||||
up -d "${start_services[@]}" 2>&1 | tail -10
|
||||
up -d "${start_services[@]}" 2>&1 | tail -10 || true
|
||||
|
||||
if [ ${#build_fail[@]} -gt 0 ]; then
|
||||
# Signal to run_phase() that phase 7 should NOT be marked done
|
||||
PHASE7_HAD_FAILURES=1
|
||||
warn "Phase 7 complete with ${#build_fail[@]} failed builds: ${build_fail[*]}"
|
||||
warn " Fix and re-run: sudo ./setup.sh --phase=7"
|
||||
else
|
||||
PHASE7_HAD_FAILURES=0
|
||||
ok "Phase 7 complete. All ${#start_services[@]} services started."
|
||||
fi
|
||||
}
|
||||
@ -711,6 +729,8 @@ HEALTH
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# MAIN
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
PHASE7_HAD_FAILURES=0
|
||||
|
||||
run_phase() {
|
||||
local phase_num="$1"
|
||||
case "$phase_num" in
|
||||
@ -724,6 +744,11 @@ run_phase() {
|
||||
8) phase8_verify ;;
|
||||
*) fail "Unknown phase: $phase_num" ;;
|
||||
esac
|
||||
# Don't mark phase 7 done if there were build failures (--resume should retry it)
|
||||
if [ "$phase_num" -eq 7 ] && [ "$PHASE7_HAD_FAILURES" -eq 1 ]; then
|
||||
warn "Phase 7 NOT marked done (build failures). --resume will retry it."
|
||||
return
|
||||
fi
|
||||
mark_phase_done "$phase_num"
|
||||
}
|
||||
|
||||
@ -811,6 +836,11 @@ main() {
|
||||
log "Running ONLY phase ${only_phase}..."
|
||||
restore_gitea_token
|
||||
run_phase "$only_phase"
|
||||
if [ "$only_phase" -eq 7 ] && [ "$PHASE7_HAD_FAILURES" -eq 1 ]; then
|
||||
warn "Phase 7 finished with failures. Fix and re-run: sudo ./setup.sh --phase=7"
|
||||
warn "Build logs: ${STATE_DIR}/builds/"
|
||||
exit 1
|
||||
fi
|
||||
ok "Phase ${only_phase} complete."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
Loading…
Reference in New Issue
Block a user