feat(infra): add resume/retry, per-service build, and fallback to setup.sh

- --resume: auto-detect last completed phase and continue from there
- --resume-from=N: resume from a specific phase
- --phase=N: run only one phase (e.g. --phase=7 to retry deploy)
- --reset: clear phase markers and start fresh
- --status: show completed phases
- Phase 7 now builds each of 27 services individually with progress
- Failed builds are skipped; remaining services still start
- Phase completion markers stored in /opt/bytelyst/.setup-state/
- GITEA_NPM_TOKEN auto-restored from saved state on resume
This commit is contained in:
saravanakumardb1 2026-03-24 12:03:55 -07:00
parent c0bc13e10a
commit 8ff9e42817

View File

@ -14,7 +14,13 @@
# - All @bytelyst/* packages (built + published to Gitea)
# - Full 27-service ecosystem (via docker-compose.ecosystem.yml)
#
# Usage: sudo ./setup.sh
# Usage: sudo ./setup.sh [OPTIONS]
#
# Options:
# --resume Auto-resume from last completed phase
# --resume-from=N Resume from phase N (1-8)
# --phase=N Run ONLY phase N (useful for retrying a single phase)
# --reset Clear phase markers and start fresh
# --status Show completed phases and exit
# -h, --help Show this help
#
# Optional env vars:
# GITHUB_USER — GitHub org/user to clone from (default: saravanakumardb1)
@ -78,6 +84,40 @@ detect_docker_host_ip() {
ip -4 addr show docker0 2>/dev/null | grep -oP '(?<=inet\s)\d+(\.\d+){3}' || echo "172.17.0.1"
}
# ── Phase tracking (resume/retry support) ──────────────────────────
# Directory that holds one "phase<N>.done" marker file per completed phase
# (written by mark_phase_done, containing a timestamp) and a builds/
# subdirectory of per-service build result markers written during phase 7.
STATE_DIR="${INSTALL_DIR}/.setup-state"
# Record that a phase finished by writing the current ISO-8601 timestamp
# into its marker file under STATE_DIR (created on demand).
# Globals:   STATE_DIR (read)
# Arguments: $1 - phase number
mark_phase_done() {
  mkdir -p "$STATE_DIR"
  local marker_file="${STATE_DIR}/phase${1}.done"
  date -Iseconds > "$marker_file"
}
# Succeed (status 0) when the marker file for the given phase exists as a
# regular file under STATE_DIR; fail otherwise.
# Globals:   STATE_DIR (read)
# Arguments: $1 - phase number
is_phase_done() {
  local marker_file="${STATE_DIR}/phase${1}.done"
  test -f "$marker_file"
}
# Print the highest phase number (1-8) whose completion marker exists,
# or 0 when no phase has been marked done.
# Note: this is the highest marked phase, not the end of a contiguous run;
# a gap (e.g. only phases 1 and 5 marked) still yields 5.
# Outputs: the phase number on stdout
last_completed_phase() {
  # Keep the loop counter local so callers that use their own $i are not clobbered.
  local phase highest=0
  for phase in 1 2 3 4 5 6 7 8; do
    if is_phase_done "$phase"; then
      highest=$phase
    fi
  done
  echo "$highest"
}
# Delete the whole state directory (phase markers and anything else stored
# under it) and report success.
# Globals: STATE_DIR (read)
reset_phase_markers() {
  # ':?' aborts instead of running 'rm -rf' on an empty/unset path;
  # '--' stops option parsing should the path ever start with '-'.
  rm -rf -- "${STATE_DIR:?STATE_DIR is not set}"
  ok "Phase markers cleared."
}
# Re-load GITEA_NPM_TOKEN from ${INSTALL_DIR}/.gitea_token and export it
# (needed when resuming after phase 2). Does nothing when the variable is
# already non-empty or the token file does not exist.
# Globals: INSTALL_DIR (read), GITEA_NPM_TOKEN (read, written, exported)
restore_gitea_token() {
  # A token already present in the environment always wins.
  if [ -n "${GITEA_NPM_TOKEN:-}" ]; then
    return 0
  fi
  local token_file="${INSTALL_DIR}/.gitea_token"
  if [ ! -f "$token_file" ]; then
    return 0
  fi
  GITEA_NPM_TOKEN=$(cat "$token_file")
  export GITEA_NPM_TOKEN
  log "Restored GITEA_NPM_TOKEN from saved state."
}
# ═══════════════════════════════════════════════════════════════════════
# PHASE 1: System Dependencies
# ═══════════════════════════════════════════════════════════════════════
@ -472,31 +512,116 @@ ENV
# ═══════════════════════════════════════════════════════════════════════
# PHASE 7: Deploy Ecosystem via Docker Compose
# ═══════════════════════════════════════════════════════════════════════
phase7_deploy() {
log "Phase 7: Deploying 27-service ecosystem..."
# Compose service names, grouped for ordered build + reporting.
# NOTE(review): 30 names are listed here (6 + 3 + 2 + 10 + 9) while the
# surrounding text speaks of 27 services — confirm against the compose file.
INFRA_SERVICES=(
  cosmos-emulator
  azurite
  mailpit
  loki
  grafana
  gateway
)
PLATFORM_SERVICES=(
  platform-service
  extraction-service
  mcp-server
)
DASHBOARD_SERVICES=(
  admin-web
  tracker-web
)
BACKEND_SERVICES=(
  peakpulse-backend
  chronomind-backend
  jarvisjr-backend
  nomgap-backend
  mindlyst-backend
  lysnrai-backend
  notelett-backend
  flowmonk-backend
  actiontrail-backend
  localmemgpt-backend
)
WEB_SERVICES=(
  lysnrai-dashboard
  chronomind-web
  jarvisjr-web
  flowmonk-web
  notelett-web
  mindlyst-web
  nomgap-web
  actiontrail-web
  localmemgpt-web
)
# Prepare the shell for the docker compose calls of phase 7: change into
# the platform checkout, restore the Gitea token, detect the Docker host
# IP, and export the variables compose reads.
# Globals:  INSTALL_DIR (read); GITEA_NPM_TOKEN, GITEA_NPM_HOST,
#           DOCKER_BUILDKIT, COMPOSE_DOCKER_CLI_BUILD (exported)
# Returns:  non-zero (via fail) when the platform directory cannot be entered
setup_compose_env() {
  local plat_dir="${INSTALL_DIR}/learning_ai_common_plat"
  # Stop here rather than continuing in whatever directory we happen to be
  # in; the 'return 1' only matters if fail does not terminate the script.
  cd "$plat_dir" || {
    fail "Cannot change directory to ${plat_dir}"
    return 1
  }
  restore_gitea_token
  # Detect host IP for Docker builds to reach Gitea
  local docker_host_ip
  docker_host_ip=$(detect_docker_host_ip)
  log " Docker host IP for Gitea access: ${docker_host_ip}"
  # Export vars needed by compose
  export GITEA_NPM_TOKEN
  export GITEA_NPM_HOST="${docker_host_ip}"
  export DOCKER_BUILDKIT=1
  export COMPOSE_DOCKER_CLI_BUILD=1
}
# Build and start all services
log " Building and starting services (this takes ~10-15 minutes)..."
# Build a single compose service image.
# Only the last 5 lines of the combined stdout/stderr are printed.
# Globals:   INSTALL_DIR, COMPOSE_FILE (read)
# Arguments: $1 - compose service name
# Returns:   the exit status of 'docker compose build' (0 on success)
build_one_service() {
  local svc="$1"
  local plat_dir="${INSTALL_DIR}/learning_ai_common_plat"
  local rc
  docker compose -f "${plat_dir}/${COMPOSE_FILE}" --env-file "${plat_dir}/.env.ecosystem" \
    build "$svc" 2>&1 | tail -5
  # Without pipefail a pipeline reports the status of its last stage (tail),
  # which would hide a failed build; read docker's own status explicitly.
  rc=${PIPESTATUS[0]}
  return "$rc"
}
# Phase 7: build every compose service image individually, then start the
# services that are usable.
#   7a  For each service in the five service group arrays: skip the build
#       when the compose config has no "build" section for it (pre-built
#       image); otherwise call build_one_service and record the outcome in
#       ${STATE_DIR}/builds/<service>.ok or .fail.
#   7b  'docker compose up -d' only the pre-built and successfully built
#       services; services whose build failed are left out.
# Globals: INSTALL_DIR, COMPOSE_FILE, STATE_DIR, *_SERVICES arrays (read)
# Returns: 0 when the selected services were started (even if some builds
#          failed); calls fail when nothing can be started or 'up' fails.
phase7_deploy() {
  log "Phase 7: Deploying ecosystem (per-service build + fallback)..."
  setup_compose_env
  local plat_dir="${INSTALL_DIR}/learning_ai_common_plat"
  local compose_file="${plat_dir}/${COMPOSE_FILE}"
  local env_file="${plat_dir}/.env.ecosystem"
  local build_ok=() build_fail=() build_skip=()
  local svc has_build config_json up_rc
  mkdir -p "${STATE_DIR}/builds"

  # ── 7a: Build each service individually ──────────────────────────
  local all_services=(
    "${INFRA_SERVICES[@]}"
    "${PLATFORM_SERVICES[@]}"
    "${DASHBOARD_SERVICES[@]}"
    "${BACKEND_SERVICES[@]}"
    "${WEB_SERVICES[@]}"
  )
  local total=${#all_services[@]}
  local idx=0

  # Resolve the compose config once instead of once per service, using the
  # same env file as the build/up calls. Best-effort: errors are swallowed
  # here and surfaced through the warning below.
  config_json=$(docker compose -f "$compose_file" --env-file "$env_file" \
    config --format json 2>/dev/null || true)
  if [ -z "$config_json" ]; then
    warn " Could not read compose config; every service will be treated as pre-built."
  fi

  log " Building ${total} service images individually..."
  echo ""
  for svc in "${all_services[@]}"; do
    idx=$((idx + 1))
    # Infrastructure services use pre-built images (no build step).
    # The name is passed with --arg so it is never parsed as jq syntax.
    has_build=$(jq -r --arg svc "$svc" '.services[$svc].build // empty' \
      <<<"$config_json" 2>/dev/null || true)
    if [ -z "$has_build" ] || [ "$has_build" = "null" ]; then
      build_skip+=("$svc")
      ok " [${idx}/${total}] ${svc} — pre-built image (skip build)"
      continue
    fi
    log " [${idx}/${total}] Building ${svc}..."
    if build_one_service "$svc"; then
      build_ok+=("$svc")
      # Drop a stale marker from an earlier run so only one result remains.
      rm -f -- "${STATE_DIR}/builds/${svc}.fail"
      date -Iseconds > "${STATE_DIR}/builds/${svc}.ok"
      ok " [${idx}/${total}] ${svc} — build OK"
    else
      build_fail+=("$svc")
      rm -f -- "${STATE_DIR}/builds/${svc}.ok"
      date -Iseconds > "${STATE_DIR}/builds/${svc}.fail"
      warn " [${idx}/${total}] ${svc} — BUILD FAILED (will skip)"
    fi
  done

  # ── Build summary ────────────────────────────────────────────────
  echo ""
  log " Build results: ${#build_ok[@]} OK, ${#build_fail[@]} FAILED, ${#build_skip[@]} pre-built"
  if [ ${#build_fail[@]} -gt 0 ]; then
    warn " Failed services: ${build_fail[*]}"
  fi

  # ── 7b: Start services (skip failed builds) ─────────────────────
  # Compose up only the services that built successfully + pre-built infra
  local start_services=("${build_skip[@]}" "${build_ok[@]}")
  if [ ${#start_services[@]} -eq 0 ]; then
    fail "No services to start — all builds failed."
  fi
  log " Starting ${#start_services[@]} services..."
  docker compose \
    -f "$compose_file" \
    --env-file "$env_file" \
    up -d "${start_services[@]}" 2>&1 | tail -10
  # tail's status would mask a failed 'up'; check docker's own status.
  up_rc=${PIPESTATUS[0]}
  if [ "$up_rc" -ne 0 ]; then
    fail "docker compose up failed (exit ${up_rc})"
    return "$up_rc"
  fi

  if [ ${#build_fail[@]} -gt 0 ]; then
    warn "Phase 7 complete with ${#build_fail[@]} failed builds: ${build_fail[*]}"
    warn " Fix and re-run: sudo ./setup.sh --phase=7"
  else
    ok "Phase 7 complete. All ${#start_services[@]} services started."
  fi
}
# ═══════════════════════════════════════════════════════════════════════
@ -586,7 +711,81 @@ HEALTH
# ═══════════════════════════════════════════════════════════════════════
# MAIN
# ═══════════════════════════════════════════════════════════════════════
# Run one setup phase by number and record its completion marker.
# Arguments: $1 - phase number (1-8)
# Returns:   0 after the phase succeeded and was marked done; otherwise
#            reports through fail and returns the phase's status without
#            marking it.
run_phase() {
  local phase_num="$1"
  local rc=0
  # The phase functions are called as plain commands (not inside 'if' or
  # '||') so that errexit, if enabled, still applies inside them.
  case "$phase_num" in
    1) phase1_system ;;
    2) phase2_gitea ;;
    3) phase3_clone ;;
    4) phase4_build ;;
    5) phase5_publish ;;
    6) phase6_env ;;
    7) phase7_deploy ;;
    8) phase8_verify ;;
    *)
      fail "Unknown phase: $phase_num"
      # Only reached if fail does not terminate the script.
      return 1
      ;;
  esac
  # A case statement's status is that of the phase function it ran.
  rc=$?
  if [ "$rc" -ne 0 ]; then
    # Never mark a phase done when it reported failure, otherwise --resume
    # would skip over it.
    fail "Phase ${phase_num} failed (exit ${rc})"
    return "$rc"
  fi
  mark_phase_done "$phase_num"
}
# Print the command-line help (options and the list of phases) to stdout.
usage() {
  # One printf call; each argument becomes one output line.
  printf '%s\n' \
    'Usage: sudo ./setup.sh [OPTIONS]' \
    '' \
    'Options:' \
    ' --resume Auto-resume from last completed phase' \
    ' --resume-from=N Resume starting at phase N (1-8)' \
    ' --phase=N Run ONLY phase N' \
    ' --reset Clear phase markers and start fresh' \
    ' --status Show completed phases and exit' \
    ' -h, --help Show this help' \
    '' \
    'Phases:' \
    ' 1 System dependencies (Docker, Node, pnpm, Ollama)' \
    ' 2 Gitea npm registry' \
    ' 3 Clone repositories' \
    ' 4 Build @bytelyst/* packages' \
    ' 5 Publish packages to Gitea' \
    ' 6 Generate .env.ecosystem' \
    ' 7 Build + deploy Docker services (per-service, with fallback)' \
    ' 8 Health check'
}
main() {
# Parse CLI arguments
local mode="full" start_phase=1 only_phase=0
for arg in "$@"; do
case "$arg" in
--resume)
mode="resume" ;;
--resume-from=*)
mode="resume-from"
start_phase="${arg#*=}" ;;
--phase=*)
mode="single"
only_phase="${arg#*=}" ;;
--reset)
mkdir -p "$INSTALL_DIR"
reset_phase_markers
exit 0 ;;
--status)
mkdir -p "$INSTALL_DIR"
echo "Phase completion status:"
for i in 1 2 3 4 5 6 7 8; do
if is_phase_done "$i"; then
echo " Phase $i: DONE ($(cat "${STATE_DIR}/phase${i}.done"))"
else
echo " Phase $i: pending"
fi
done
exit 0 ;;
-h|--help)
usage; exit 0 ;;
*)
warn "Unknown option: $arg"
usage; exit 1 ;;
esac
done
# Tee all output to a log file so SSH disconnection doesn't lose context
mkdir -p "$INSTALL_DIR"
exec > >(tee -a "${INSTALL_DIR}/setup.log") 2>&1
@ -606,16 +805,46 @@ main() {
log "Target OS: $(lsb_release -ds 2>/dev/null || cat /etc/os-release | grep PRETTY_NAME | cut -d= -f2 | tr -d '"')"
log "Target arch: $(uname -m)"
# ── Single-phase mode ────────────────────────────────────────────
if [ "$mode" = "single" ]; then
log "Running ONLY phase ${only_phase}..."
restore_gitea_token
run_phase "$only_phase"
ok "Phase ${only_phase} complete."
exit 0
fi
# ── Auto-resume mode ─────────────────────────────────────────────
if [ "$mode" = "resume" ]; then
local last
last=$(last_completed_phase)
if [ "$last" -eq 0 ]; then
log "No completed phases found. Starting from phase 1."
start_phase=1
elif [ "$last" -ge 8 ]; then
ok "All phases already completed. Use --reset to start over."
exit 0
else
start_phase=$((last + 1))
log "Resuming from phase ${start_phase} (phases 1-${last} already done)."
fi
elif [ "$mode" = "resume-from" ]; then
log "Resuming from phase ${start_phase} (as requested)."
fi
# Restore token if resuming past phase 2
if [ "$start_phase" -gt 2 ]; then
restore_gitea_token
fi
echo ""
phase1_system
phase2_gitea
phase3_clone
phase4_build
phase5_publish
phase6_env
phase7_deploy
phase8_verify
# ── Run phases ───────────────────────────────────────────────────
for phase_num in 1 2 3 4 5 6 7 8; do
[ "$phase_num" -ge "$start_phase" ] || continue
run_phase "$phase_num"
done
local elapsed=$(( $(date +%s) - start_time ))
local minutes=$(( elapsed / 60 ))
@ -627,6 +856,8 @@ main() {
echo "║ ║"
echo "║ Health check: /opt/bytelyst/check-health.sh ║"
echo "║ Compose logs: docker compose -f ${COMPOSE_FILE} logs -f ║"
echo "║ Retry failed: sudo ./setup.sh --phase=7 ║"
echo "║ Resume: sudo ./setup.sh --resume ║"
echo "║ Gitea UI: http://localhost:3300 ║"
echo "║ Ollama API: http://localhost:11434 ║"
echo "║ Grafana: http://localhost:3000 (admin / bytelyst) ║"