From 8ff9e42817d4e0cc3e7f17fe8f0fa2fc78cd7eea Mon Sep 17 00:00:00 2001
From: saravanakumardb1
Date: Tue, 24 Mar 2026 12:03:55 -0700
Subject: [PATCH] feat(infra): add resume/retry, per-service build, and fallback to setup.sh

- --resume: auto-detect last completed phase and continue from there
- --resume-from=N: resume from a specific phase
- --phase=N: run only one phase (e.g. --phase=7 to retry deploy)
- --reset: clear phase markers and start fresh
- --status: show completed phases
- Phase 7 now builds each of 27 services individually with progress
- Failed builds are skipped; remaining services still start
- Phase completion markers stored in /opt/bytelyst/.setup-state/
- GITEA_NPM_TOKEN auto-restored from saved state on resume
---
 docs/devops/single_azure_vm/setup.sh | 269 +++++++++++++++++++++++++--
 1 file changed, 250 insertions(+), 19 deletions(-)

diff --git a/docs/devops/single_azure_vm/setup.sh b/docs/devops/single_azure_vm/setup.sh
index aa9dcf07..48c3aee6 100755
--- a/docs/devops/single_azure_vm/setup.sh
+++ b/docs/devops/single_azure_vm/setup.sh
@@ -14,7 +14,13 @@
 # - All @bytelyst/* packages (built + published to Gitea)
 # - Full 27-service ecosystem (via docker-compose.ecosystem.yml)
 #
-# Usage: sudo ./setup.sh
+# Usage: sudo ./setup.sh [OPTIONS]
+#
+# Options:
+# --resume Auto-resume from last completed phase
+# --resume-from=N Resume from phase N (1-8)
+# --phase=N Run ONLY phase N (useful for retrying a single phase)
+# --reset Clear phase markers and start fresh
 #
 # Optional env vars:
 # GITHUB_USER — GitHub org/user to clone from (default: saravanakumardb1)
@@ -78,6 +84,40 @@ detect_docker_host_ip() {
   ip -4 addr show docker0 2>/dev/null | grep -oP '(?<=inet\s)\d+(\.\d+){3}' || echo "172.17.0.1"
 }
 
+# ── Phase tracking (resume/retry support) ──────────────────────────
+STATE_DIR="${INSTALL_DIR}/.setup-state"
+
+mark_phase_done() {
+  mkdir -p "$STATE_DIR"
+  date -Iseconds > "${STATE_DIR}/phase${1}.done"
+}
+
+is_phase_done() {
+  [ -f "${STATE_DIR}/phase${1}.done" ]
+}
+
+last_completed_phase() {
+  local last=0
+  for i in 1 2 3 4 5 6 7 8; do
+    is_phase_done "$i" && last=$i
+  done
+  echo "$last"
+}
+
+reset_phase_markers() {
+  rm -rf "$STATE_DIR"
+  ok "Phase markers cleared."
+}
+
+# Restore GITEA_NPM_TOKEN from saved state (needed when resuming after phase 2)
+restore_gitea_token() {
+  if [ -z "${GITEA_NPM_TOKEN:-}" ] && [ -f "${INSTALL_DIR}/.gitea_token" ]; then
+    GITEA_NPM_TOKEN=$(cat "${INSTALL_DIR}/.gitea_token")
+    export GITEA_NPM_TOKEN
+    log "Restored GITEA_NPM_TOKEN from saved state."
+  fi
+}
+
 # ═══════════════════════════════════════════════════════════════════════
 # PHASE 1: System Dependencies
 # ═══════════════════════════════════════════════════════════════════════
@@ -472,31 +512,116 @@ ENV
 # ═══════════════════════════════════════════════════════════════════════
 # PHASE 7: Deploy Ecosystem via Docker Compose
 # ═══════════════════════════════════════════════════════════════════════
-phase7_deploy() {
-  log "Phase 7: Deploying 27-service ecosystem..."
+# All 27 compose services, grouped for ordered build + reporting.
+INFRA_SERVICES=(cosmos-emulator azurite mailpit loki grafana gateway)
+PLATFORM_SERVICES=(platform-service extraction-service mcp-server)
+DASHBOARD_SERVICES=(admin-web tracker-web)
+BACKEND_SERVICES=(
+  peakpulse-backend chronomind-backend jarvisjr-backend nomgap-backend
+  mindlyst-backend lysnrai-backend notelett-backend flowmonk-backend
+  actiontrail-backend localmemgpt-backend
+)
+WEB_SERVICES=(
+  lysnrai-dashboard chronomind-web jarvisjr-web flowmonk-web notelett-web
+  mindlyst-web nomgap-web actiontrail-web localmemgpt-web
+)
 
+setup_compose_env() {
   local plat_dir="${INSTALL_DIR}/learning_ai_common_plat"
   cd "$plat_dir"
 
-  # Detect host IP for Docker builds to reach Gitea
+  restore_gitea_token
+
   local docker_host_ip
   docker_host_ip=$(detect_docker_host_ip)
   log " Docker host IP for Gitea access: ${docker_host_ip}"
 
-  # Export vars needed by compose
   export GITEA_NPM_TOKEN
   export GITEA_NPM_HOST="${docker_host_ip}"
   export DOCKER_BUILDKIT=1
   export COMPOSE_DOCKER_CLI_BUILD=1
+}
 
-  # Build and start all services
-  log " Building and starting services (this takes ~10-15 minutes)..."
+# Build a single compose service image, return 0 on success.
+build_one_service() {
+  local svc="$1"
+  local plat_dir="${INSTALL_DIR}/learning_ai_common_plat"
+  docker compose -f "${plat_dir}/${COMPOSE_FILE}" --env-file "${plat_dir}/.env.ecosystem" \
+    build "$svc" 2>&1 | tail -5
+}
+
+phase7_deploy() {
+  log "Phase 7: Deploying ecosystem (per-service build + fallback)..."
+
+  setup_compose_env
+
+  local plat_dir="${INSTALL_DIR}/learning_ai_common_plat"
+  local build_ok=() build_fail=() build_skip=()
+  mkdir -p "${STATE_DIR}/builds"
+
+  # ── 7a: Build each service individually ──────────────────────────
+  local all_services=("${INFRA_SERVICES[@]}" "${PLATFORM_SERVICES[@]}" "${DASHBOARD_SERVICES[@]}" "${BACKEND_SERVICES[@]}" "${WEB_SERVICES[@]}")
+  local total=${#all_services[@]}
+  local idx=0
+
+  log " Building ${total} service images individually..."
+  echo ""
+
+  for svc in "${all_services[@]}"; do
+    idx=$((idx + 1))
+
+    # Infrastructure services use pre-built images (no build step)
+    local has_build
+    has_build=$(docker compose -f "${plat_dir}/${COMPOSE_FILE}" config --format json 2>/dev/null \
+      | jq -r ".services.\"${svc}\".build // empty" 2>/dev/null || true)
+
+    if [ -z "$has_build" ] || [ "$has_build" = "null" ]; then
+      build_skip+=("$svc")
+      ok " [${idx}/${total}] ${svc} — pre-built image (skip build)"
+      continue
+    fi
+
+    log " [${idx}/${total}] Building ${svc}..."
+    if build_one_service "$svc"; then
+      build_ok+=("$svc")
+      date -Iseconds > "${STATE_DIR}/builds/${svc}.ok"
+      ok " [${idx}/${total}] ${svc} — build OK"
+    else
+      build_fail+=("$svc")
+      date -Iseconds > "${STATE_DIR}/builds/${svc}.fail"
+      warn " [${idx}/${total}] ${svc} — BUILD FAILED (will skip)"
+    fi
+  done
+
+  # ── Build summary ────────────────────────────────────────────────
+  echo ""
+  log " Build results: ${#build_ok[@]} OK, ${#build_fail[@]} FAILED, ${#build_skip[@]} pre-built"
+  if [ ${#build_fail[@]} -gt 0 ]; then
+    warn " Failed services: ${build_fail[*]}"
+  fi
+
+  # ── 7b: Start services (skip failed builds) ─────────────────────
+  # Compose up only the services that built successfully + pre-built infra
+  local start_services=()
+  for svc in "${build_skip[@]}"; do start_services+=("$svc"); done
+  for svc in "${build_ok[@]}"; do start_services+=("$svc"); done
+
+  if [ ${#start_services[@]} -eq 0 ]; then
+    fail "No services to start — all builds failed."
+  fi
+
+  log " Starting ${#start_services[@]} services..."
   docker compose \
-    -f "$COMPOSE_FILE" \
-    --env-file .env.ecosystem \
-    up --build -d 2>&1 | tail -20
+    -f "${plat_dir}/${COMPOSE_FILE}" \
+    --env-file "${plat_dir}/.env.ecosystem" \
+    up -d "${start_services[@]}" 2>&1 | tail -10
 
-  ok "Phase 7 complete. All containers started."
+  if [ ${#build_fail[@]} -gt 0 ]; then
+    warn "Phase 7 complete with ${#build_fail[@]} failed builds: ${build_fail[*]}"
+    warn " Fix and re-run: sudo ./setup.sh --phase=7"
+  else
+    ok "Phase 7 complete. All ${#start_services[@]} services started."
+  fi
 }
 
 # ═══════════════════════════════════════════════════════════════════════
@@ -586,7 +711,81 @@ HEALTH
 # ═══════════════════════════════════════════════════════════════════════
 # MAIN
 # ═══════════════════════════════════════════════════════════════════════
+run_phase() {
+  local phase_num="$1"
+  case "$phase_num" in
+    1) phase1_system ;;
+    2) phase2_gitea ;;
+    3) phase3_clone ;;
+    4) phase4_build ;;
+    5) phase5_publish ;;
+    6) phase6_env ;;
+    7) phase7_deploy ;;
+    8) phase8_verify ;;
+    *) fail "Unknown phase: $phase_num" ;;
+  esac
+  mark_phase_done "$phase_num"
+}
+
+usage() {
+  echo "Usage: sudo ./setup.sh [OPTIONS]"
+  echo ""
+  echo "Options:"
+  echo " --resume Auto-resume from last completed phase"
+  echo " --resume-from=N Resume starting at phase N (1-8)"
+  echo " --phase=N Run ONLY phase N"
+  echo " --reset Clear phase markers and start fresh"
+  echo " --status Show completed phases and exit"
+  echo " -h, --help Show this help"
+  echo ""
+  echo "Phases:"
+  echo " 1 System dependencies (Docker, Node, pnpm, Ollama)"
+  echo " 2 Gitea npm registry"
+  echo " 3 Clone repositories"
+  echo " 4 Build @bytelyst/* packages"
+  echo " 5 Publish packages to Gitea"
+  echo " 6 Generate .env.ecosystem"
+  echo " 7 Build + deploy Docker services (per-service, with fallback)"
+  echo " 8 Health check"
+}
+
 main() {
+  # Parse CLI arguments
+  local mode="full" start_phase=1 only_phase=0
+
+  for arg in "$@"; do
+    case "$arg" in
+      --resume)
+        mode="resume" ;;
+      --resume-from=*)
+        mode="resume-from"
+        start_phase="${arg#*=}" ;;
+      --phase=*)
+        mode="single"
+        only_phase="${arg#*=}" ;;
+      --reset)
+        mkdir -p "$INSTALL_DIR"
+        reset_phase_markers
+        exit 0 ;;
+      --status)
+        mkdir -p "$INSTALL_DIR"
+        echo "Phase completion status:"
+        for i in 1 2 3 4 5 6 7 8; do
+          if is_phase_done "$i"; then
+            echo " Phase $i: DONE ($(cat "${STATE_DIR}/phase${i}.done"))"
+          else
+            echo " Phase $i: pending"
+          fi
+        done
+        exit 0 ;;
+      -h|--help)
+        usage; exit 0 ;;
+      *)
+        warn "Unknown option: $arg"
+        usage; exit 1 ;;
+    esac
+  done
+
   # Tee all output to a log file so SSH disconnection doesn't lose context
   mkdir -p "$INSTALL_DIR"
   exec > >(tee -a "${INSTALL_DIR}/setup.log") 2>&1
@@ -606,16 +805,46 @@ main() {
 
   log "Target OS: $(lsb_release -ds 2>/dev/null || cat /etc/os-release | grep PRETTY_NAME | cut -d= -f2 | tr -d '"')"
   log "Target arch: $(uname -m)"
+
+  # ── Single-phase mode ────────────────────────────────────────────
+  if [ "$mode" = "single" ]; then
+    log "Running ONLY phase ${only_phase}..."
+    restore_gitea_token
+    run_phase "$only_phase"
+    ok "Phase ${only_phase} complete."
+    exit 0
+  fi
+
+  # ── Auto-resume mode ─────────────────────────────────────────────
+  if [ "$mode" = "resume" ]; then
+    local last
+    last=$(last_completed_phase)
+    if [ "$last" -eq 0 ]; then
+      log "No completed phases found. Starting from phase 1."
+      start_phase=1
+    elif [ "$last" -ge 8 ]; then
+      ok "All phases already completed. Use --reset to start over."
+      exit 0
+    else
+      start_phase=$((last + 1))
+      log "Resuming from phase ${start_phase} (phases 1-${last} already done)."
+    fi
+  elif [ "$mode" = "resume-from" ]; then
+    log "Resuming from phase ${start_phase} (as requested)."
+  fi
+
+  # Restore token if resuming past phase 2
+  if [ "$start_phase" -gt 2 ]; then
+    restore_gitea_token
+  fi
+
   echo ""
 
-  phase1_system
-  phase2_gitea
-  phase3_clone
-  phase4_build
-  phase5_publish
-  phase6_env
-  phase7_deploy
-  phase8_verify
+  # ── Run phases ───────────────────────────────────────────────────
+  for phase_num in 1 2 3 4 5 6 7 8; do
+    [ "$phase_num" -ge "$start_phase" ] || continue
+    run_phase "$phase_num"
+  done
 
   local elapsed=$(( $(date +%s) - start_time ))
   local minutes=$(( elapsed / 60 ))
@@ -627,6 +856,8 @@ main() {
   echo "║ ║"
   echo "║ Health check: /opt/bytelyst/check-health.sh ║"
   echo "║ Compose logs: docker compose -f ${COMPOSE_FILE} logs -f ║"
+  echo "║ Retry failed: sudo ./setup.sh --phase=7 ║"
+  echo "║ Resume: sudo ./setup.sh --resume ║"
   echo "║ Gitea UI: http://localhost:3300 ║"
   echo "║ Ollama API: http://localhost:11434 ║"
   echo "║ Grafana: http://localhost:3000 (admin / bytelyst) ║"