Patch for docs/devops/single_azure_vm/setup.sh (phase 7 build/deploy hardening).

Notes for whoever applies this:
  * Line breaks and the 4-space indentation were reconstructed from a
    whitespace-collapsed copy (runs of spaces inside lines may have been
    collapsed too); apply with `git apply --ignore-whitespace` or `patch -l`
    and confirm the result against the real file.
  * The post-image blob hash on the index line predates the latest edits.

diff --git a/docs/devops/single_azure_vm/setup.sh b/docs/devops/single_azure_vm/setup.sh
index 48c3aee6..eb46d6ba 100755
--- a/docs/devops/single_azure_vm/setup.sh
+++ b/docs/devops/single_azure_vm/setup.sh
@@ -543,11 +543,24 @@ setup_compose_env() {
 }
 
 # Build a single compose service image, return 0 on success.
+# Full build log saved to STATE_DIR/builds/<svc>.log for debugging;
+# only the last few lines of that log are echoed to the console.
 build_one_service() {
     local svc="$1"
     local plat_dir="${INSTALL_DIR}/learning_ai_common_plat"
-    docker compose -f "${plat_dir}/${COMPOSE_FILE}" --env-file "${plat_dir}/.env.ecosystem" \
-        build "$svc" 2>&1 | tail -5
+    local log_dir="${STATE_DIR}/builds"
+    local log_file="${log_dir}/${svc}.log"
+    # The redirect below fails if the directory is missing, which would be
+    # misreported as a build failure, so create it first.
+    mkdir -p "$log_dir" || return 1
+    if docker compose -f "${plat_dir}/${COMPOSE_FILE}" --env-file "${plat_dir}/.env.ecosystem" \
+        build "$svc" > "$log_file" 2>&1; then
+        tail -3 "$log_file"
+        return 0
+    else
+        tail -5 "$log_file"
+        return 1
+    fi
 }
 
 phase7_deploy() {
@@ -567,13 +580,22 @@ phase7_deploy() {
     log " Building ${total} service images individually..."
     echo ""
 
+    # Cache compose config JSON once (avoid calling config 30 times)
+    local compose_json
+    compose_json=$(docker compose -f "${plat_dir}/${COMPOSE_FILE}" \
+        --env-file "${plat_dir}/.env.ecosystem" config --format json 2>/dev/null || true)
+
     for svc in "${all_services[@]}"; do
         idx=$((idx + 1))
 
         # Infrastructure services use pre-built images (no build step)
-        local has_build
-        has_build=$(docker compose -f "${plat_dir}/${COMPOSE_FILE}" config --format json 2>/dev/null \
-            | jq -r ".services.\"${svc}\".build // empty" 2>/dev/null || true)
+        local has_build=""
+        if [ -n "$compose_json" ]; then
+            # Pass the service name with --arg rather than splicing it into
+            # the jq program, so unusual names cannot alter the filter.
+            has_build=$(jq -r --arg svc "$svc" '.services[$svc].build // empty' \
+                <<<"$compose_json" 2>/dev/null || true)
+        fi
 
         if [ -z "$has_build" ] || [ "$has_build" = "null" ]; then
             build_skip+=("$svc")
@@ -611,15 +633,30 @@ phase7_deploy() {
     fi
 
     log " Starting ${#start_services[@]} services..."
+    # Record the exit status rather than discarding it: the `||` list keeps
+    # set -e from aborting before the summary, and logging to a file instead
+    # of piping into tail makes the status independent of pipefail.
+    local up_log="${STATE_DIR}/builds/compose-up.log"
+    local up_rc=0
+    mkdir -p "${STATE_DIR}/builds"
     docker compose \
         -f "${plat_dir}/${COMPOSE_FILE}" \
         --env-file "${plat_dir}/.env.ecosystem" \
-        up -d "${start_services[@]}" 2>&1 | tail -10
+        up -d "${start_services[@]}" > "$up_log" 2>&1 || up_rc=$?
+    tail -10 "$up_log"
 
-    if [ ${#build_fail[@]} -gt 0 ]; then
-        warn "Phase 7 complete with ${#build_fail[@]} failed builds: ${build_fail[*]}"
+    if [ ${#build_fail[@]} -gt 0 ] || [ "$up_rc" -ne 0 ]; then
+        # Signal to run_phase() that phase 7 should NOT be marked done
+        PHASE7_HAD_FAILURES=1
+        if [ "$up_rc" -ne 0 ]; then
+            warn "docker compose up exited with status ${up_rc}; see ${up_log}"
+        fi
+        if [ ${#build_fail[@]} -gt 0 ]; then
+            warn "Phase 7 complete with ${#build_fail[@]} failed builds: ${build_fail[*]}"
+        fi
         warn " Fix and re-run: sudo ./setup.sh --phase=7"
     else
+        PHASE7_HAD_FAILURES=0
         ok "Phase 7 complete. All ${#start_services[@]} services started."
     fi
 }
@@ -711,6 +748,8 @@ HEALTH
 # ═══════════════════════════════════════════════════════════════════════
 # MAIN
 # ═══════════════════════════════════════════════════════════════════════
+PHASE7_HAD_FAILURES=0
+
 run_phase() {
     local phase_num="$1"
     case "$phase_num" in
@@ -724,6 +763,11 @@ run_phase() {
         8) phase8_verify ;;
         *) fail "Unknown phase: $phase_num" ;;
     esac
+    # Don't mark phase 7 done if a build or the compose start-up failed (--resume should retry it)
+    if [ "$phase_num" -eq 7 ] && [ "$PHASE7_HAD_FAILURES" -eq 1 ]; then
+        warn "Phase 7 NOT marked done (build or start-up failures). --resume will retry it."
+        return
+    fi
     mark_phase_done "$phase_num"
 }
 
@@ -811,6 +855,11 @@ main() {
         log "Running ONLY phase ${only_phase}..."
         restore_gitea_token
         run_phase "$only_phase"
+        if [ "$only_phase" -eq 7 ] && [ "$PHASE7_HAD_FAILURES" -eq 1 ]; then
+            warn "Phase 7 finished with failures. Fix and re-run: sudo ./setup.sh --phase=7"
+            warn "Logs: ${STATE_DIR}/builds/"
+            exit 1
+        fi
         ok "Phase ${only_phase} complete."
         exit 0
     fi