#!/bin/bash
# ============================================================
# TTS Setup - One-Shot Script for Fresh Laptop
#
# Sets up Orpheus TTS (via Ollama) and Qwen3-TTS (direct Python)
# on macOS (Apple Silicon) or Linux (CUDA GPU / WSL2).
#
# What this does:
#   1. Installs Python 3.12 (Homebrew on macOS, apt on Linux)
#   2. Creates Python venv with TTS packages (MPS on macOS, CUDA on Linux)
#   3. Pulls Orpheus TTS model via Ollama
#   4. Downloads SNAC audio decoder
#   5. (Optional) Downloads Qwen3-TTS 0.6B
#
# Prerequisites:
#   macOS: Homebrew (Ollama is installed via Homebrew if missing)
#   Linux: apt + Ollama accessible at localhost:11434
#
# Environment:
#   OLLAMA_HOST  Ollama API base URL (default: http://localhost:11434)
#   HF_MIRROR    HuggingFace base URL (default: https://hf-mirror.com)
#
# Usage:
#   bash setup-tts.sh
#   From WSL with Ollama on Windows:
#     OLLAMA_HOST=http://WINDOWS_HOST_IP:11434 bash setup-tts.sh
#
# After setup, test with:
#   .venv-qwen-tts/bin/python test_orpheus_tts.py
# ============================================================

# -e: stop on unhandled errors; -u: catch typos in variable names.
# pipefail is intentionally NOT enabled: several pipelines end in
# `grep -q` / `head -1`, which close the pipe early and would make the
# producer's SIGPIPE look like a failure.
set -eu

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
VENV="$SCRIPT_DIR/.venv-qwen-tts"
MODELS_DIR="$SCRIPT_DIR/models"

# HuggingFace mirror that works through corporate proxy.
# On personal machines, set HF_MIRROR=https://huggingface.co to download directly.
HF_MIRROR="${HF_MIRROR:-https://hf-mirror.com}"

# Ollama API base URL (for WSL2 with Ollama on Windows, set
# OLLAMA_HOST=http://WINDOWS_HOST_IP:11434). A trailing slash is stripped so
# that "$OLLAMA_BASE/api/..." never contains a double slash.
OLLAMA_BASE="${OLLAMA_HOST:-http://localhost:11434}"
OLLAMA_BASE="${OLLAMA_BASE%/}"

# -- Output helpers --------------------------------------------
# Colors are stored as real escape characters so messages can be printed
# with printf '%s' (backslashes inside a message are shown literally).
RED=$'\033[0;31m'
GREEN=$'\033[0;32m'
YELLOW=$'\033[1;33m'
NC=$'\033[0m'

# ok/warn/step print a status line for message $1; fail prints to stderr
# and terminates the script with exit status 1.
ok()   { printf '%s[OK]%s %s\n' "$GREEN" "$NC" "$1"; }
warn() { printf '%s[!!]%s %s\n' "$YELLOW" "$NC" "$1"; }
fail() { printf '%s[FAIL]%s %s\n' "$RED" "$NC" "$1" >&2; exit 1; }
step() { printf '\n%s=== %s ===%s\n' "$GREEN" "$1" "$NC"; }

# -- Generic helpers -------------------------------------------

# ollama_reachable [base-url]
# Returns 0 when the Ollama API answers on /api/tags within 2 seconds.
# Defaults to $OLLAMA_BASE.
ollama_reachable() {
  curl -s --max-time 2 "${1:-$OLLAMA_BASE}/api/tags" &>/dev/null
}

# file_size <path>
# Prints the size of <path> in bytes (0 if it cannot be read). Uses wc so
# the same code works with both GNU and BSD userlands.
file_size() {
  local bytes
  bytes=$({ wc -c < "$1"; } 2>/dev/null) || bytes=0
  printf '%s' "$(( bytes ))"
}

# looks_like_html <path>
# Returns 0 when the first KiB of <path> contains an HTML marker, which is
# what a proxy block page looks like when saved in place of a model file.
looks_like_html() {
  head -c 1024 "$1" 2>/dev/null | LC_ALL=C grep -qaiE '<!doctype html|<html'
}

# download <url> <dest> [curl-display-flag]
# Fetches <url> into <dest>. The data is written to "<dest>.part" first and
# only renamed on success, so an interrupted or HTTP-error download never
# leaves a file that a later run would mistake for a complete one.
# Returns non-zero on any curl failure (including HTTP >= 400 via -f).
# NOTE(review): -k disables TLS certificate verification; it is kept from the
# original because the mirror is used through a corporate proxy - consider
# making it opt-in.
download() {
  local url="$1" dest="$2" display="${3:--sS}"
  local tmp="$dest.part"
  if curl -k -fL "$display" -o "$tmp" "$url"; then
    mv -f "$tmp" "$dest"
  else
    rm -f "$tmp"
    return 1
  fi
}

# ollama_list
# Prints the locally available Ollama models. Uses the ollama CLI if
# available, otherwise the HTTP API (WSL2 where the CLI is on Windows).
# The API path needs the venv interpreter, so call it only after the
# venv has been created.
ollama_list() {
  if command -v ollama &>/dev/null; then
    ollama list 2>/dev/null
  else
    curl -s "$OLLAMA_BASE/api/tags" 2>/dev/null \
      | "$VENV/bin/python" -c '
import json, sys
for m in json.load(sys.stdin).get("models", []):
    print(m["name"])
' 2>/dev/null
  fi
}

# ollama_pull <model>
# Pulls <model> with the ollama CLI, or through the HTTP API when the CLI is
# not installed. Returns non-zero if the pull fails.
ollama_pull() {
  local model="$1"
  if command -v ollama &>/dev/null; then
    ollama pull "$model"
  else
    echo "  (Using Ollama API at $OLLAMA_BASE to pull model...)"
    curl -sf -X POST "$OLLAMA_BASE/api/pull" \
      -d "{\"name\":\"$model\",\"stream\":false}" --max-time 600 || return 1
    echo ""
  fi
}

# -- Ollama endpoint detection ---------------------------------
# WSL2: if localhost fails, try Windows host via default gateway
# (common when Ollama runs on Windows).
if [ "$OLLAMA_BASE" = "http://localhost:11434" ] && ! ollama_reachable; then
  if [ -r /proc/version ] && grep -qi microsoft /proc/version 2>/dev/null; then
    WIN_HOST=$(ip route show default 2>/dev/null | awk '{print $3; exit}')
    if [ -n "$WIN_HOST" ] && ollama_reachable "http://${WIN_HOST}:11434"; then
      OLLAMA_BASE="http://${WIN_HOST}:11434"
      export OLLAMA_HOST="${OLLAMA_BASE}"
    fi
  fi
fi
# Exported so the ollama CLI talks to the same endpoint as the curl calls.
export OLLAMA_HOST="${OLLAMA_HOST:-$OLLAMA_BASE}"

# Detect OS
OS_TYPE="$(uname -s)"
IS_MAC=false
if [ "$OS_TYPE" = "Darwin" ]; then
  IS_MAC=true
fi

BANNER_RULE=$(printf '%54s' '' | tr ' ' '-')
printf '+%s+\n' "$BANNER_RULE"
printf '| %-52s |\n' "TTS Setup - Local Speech Generation"
printf '| %-52s |\n' "Orpheus TTS (Ollama) + Qwen3-TTS (Python)"
printf '+%s+\n' "$BANNER_RULE"
echo ""

# -- 0. Check prerequisites -----------------------------------
step "Checking prerequisites"

if $IS_MAC; then
  # Homebrew
  if ! command -v brew &>/dev/null; then
    fail "Homebrew not found. Install: /bin/bash -c \"\$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)\""
  fi
  ok "Homebrew"

  # Ollama (install via Homebrew if missing)
  if ! command -v ollama &>/dev/null; then
    warn "Ollama not found. Installing..."
    brew install ollama
  fi
  ok "Ollama installed"
else
  # Linux / WSL2 - Ollama should be installed on host or via install script
  if command -v ollama &>/dev/null; then
    ok "Ollama installed"
  else
    # On WSL2 Ollama runs on the Windows side; check if reachable
    if ! ollama_reachable; then
      fail "Ollama not reachable at $OLLAMA_BASE. From WSL set OLLAMA_HOST=http://WINDOWS_HOST_IP:11434 (e.g. from /etc/resolv.conf nameserver)."
    fi
    ok "Ollama reachable at $OLLAMA_BASE (Windows host)"
  fi
fi

# Check if Ollama is running; start it when the CLI is available locally.
if ! ollama_reachable; then
  warn "Ollama not running at $OLLAMA_BASE. Starting..."
  if command -v ollama &>/dev/null; then
    ollama serve &>/dev/null &
    # Poll instead of a fixed sleep: continue as soon as the API answers,
    # but give slow machines up to ~10 seconds.
    for _ in {1..10}; do
      if ollama_reachable; then
        break
      fi
      sleep 1
    done
  fi
  if ! ollama_reachable; then
    fail "Could not start Ollama. Try manually: ollama serve (or from WSL set OLLAMA_HOST=http://WINDOWS_HOST_IP:11434)"
  fi
fi
ok "Ollama running at $OLLAMA_BASE"

# GPU check
ARCH=$(uname -m)
if $IS_MAC; then
  if [ "$ARCH" != "arm64" ]; then
    warn "Not Apple Silicon ($ARCH). MPS acceleration won't be available."
  else
    ok "Apple Silicon ($ARCH) - MPS acceleration available"
  fi
else
  if command -v nvidia-smi &>/dev/null; then
    GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1)
    ok "NVIDIA GPU detected: $GPU_NAME - CUDA acceleration available"
  else
    warn "nvidia-smi not found. CUDA acceleration won't be available (CPU fallback)."
  fi
fi

# -- 1. Install Python 3.12 -----------------------------------
step "Python 3.12"

PYTHON_CMD=""
# Check various Python 3.12 locations; a generic python3 is the last resort.
for cmd in python3.12 /opt/homebrew/bin/python3.12 /usr/local/bin/python3.12 python3; do
  if command -v "$cmd" &>/dev/null; then
    PYTHON_CMD="$cmd"
    break
  fi
done

if [ -z "$PYTHON_CMD" ]; then
  if $IS_MAC; then
    warn "Python 3.12 not found. Installing via Homebrew..."
    brew install python@3.12
    # Ask Homebrew for its prefix: /opt/homebrew on Apple Silicon,
    # /usr/local on Intel Macs.
    PYTHON_CMD="$(brew --prefix)/bin/python3.12"
  else
    warn "Python 3.12 not found. Installing via apt..."
    # Two separate statements so that set -e aborts if either one fails.
    sudo apt update
    sudo apt install -y python3.12 python3.12-venv python3-pip
    PYTHON_CMD="python3.12"
  fi
fi

PYTHON_VER=$("$PYTHON_CMD" --version 2>&1)
ok "$PYTHON_VER at $PYTHON_CMD"
case "$PYTHON_VER" in
  "Python 3.12"*) ;;
  *) warn "Using $PYTHON_VER instead of Python 3.12; package installation may fail." ;;
esac

# -- 2. Create venv --------------------------------------------
step "Python virtual environment"

if [ -f "$VENV/bin/python" ]; then
  ok "Venv exists at $VENV"
else
  echo "Creating venv..."
  "$PYTHON_CMD" -m venv "$VENV"
  ok "Venv created at $VENV"
fi

# -- 3. Install Python packages --------------------------------
step "Python packages"

# Check if snac is installed (quick proxy for all packages)
if "$VENV/bin/python" -c "import snac" &>/dev/null; then
  ok "Packages already installed (snac, torch, etc.)"
else
  echo "Installing packages (this may take a few minutes)..."
  "$VENV/bin/pip" install -U pip --quiet

  if $IS_MAC; then
    # macOS: default PyTorch includes MPS support
    "$VENV/bin/pip" install -U snac qwen-tts --quiet
  else
    # Linux: install PyTorch with CUDA first, then snac/qwen-tts
    echo "Installing PyTorch with CUDA support..."
    "$VENV/bin/pip" install -U torch torchvision torchaudio \
      --index-url https://download.pytorch.org/whl/cu124 --quiet
    "$VENV/bin/pip" install -U snac qwen-tts --quiet
  fi
  ok "Packages installed"
fi

# -- 4. Pull Orpheus TTS model ---------------------------------
step "Orpheus TTS model (Ollama)"

if ollama_list | grep -q "orpheus"; then
  ok "Orpheus TTS already downloaded"
else
  echo "Pulling sematre/orpheus:en (4 GB)..."
  if ! ollama_pull "sematre/orpheus:en"; then
    fail "Could not pull sematre/orpheus:en via $OLLAMA_BASE."
  fi
  ok "Orpheus TTS downloaded"
fi

# -- 5. Download SNAC decoder ----------------------------------
step "SNAC 24kHz audio decoder (~76 MB)"

SNAC_DIR="$MODELS_DIR/snac_24khz"
SNAC_WEIGHTS="$SNAC_DIR/pytorch_model.bin"
# Anything at or below this size is treated as a failed or blocked download.
SNAC_MIN_BYTES=1000000

mkdir -p "$SNAC_DIR"

if [ -f "$SNAC_WEIGHTS" ]; then
  SIZE=$(file_size "$SNAC_WEIGHTS")
  if [ "$SIZE" -gt "$SNAC_MIN_BYTES" ]; then
    ok "SNAC decoder already downloaded ($(awk -v b="$SIZE" 'BEGIN { printf "%.1f", b / 1048576 }') MB)"
  else
    warn "SNAC file looks corrupted (${SIZE} bytes). Re-downloading..."
    rm -f "$SNAC_WEIGHTS"
  fi
fi

if [ ! -f "$SNAC_WEIGHTS" ]; then
  echo "Downloading config.json..."
  if ! download "$HF_MIRROR/hubertsiuzdak/snac_24khz/raw/main/config.json" \
      "$SNAC_DIR/config.json"; then
    fail "Could not download config.json from $HF_MIRROR."
  fi

  # Verify config is JSON (not an HTML block page). The path is passed as an
  # argument so special characters in it cannot break the Python snippet.
  if ! "$VENV/bin/python" -c 'import json, sys; json.load(open(sys.argv[1]))' \
      "$SNAC_DIR/config.json" &>/dev/null; then
    fail "Downloaded config.json is not valid JSON. The mirror may be blocked. Try from home network."
  fi
  ok "config.json downloaded"

  echo "Downloading pytorch_model.bin (~76 MB)..."
  if ! download "$HF_MIRROR/hubertsiuzdak/snac_24khz/resolve/main/pytorch_model.bin" \
      "$SNAC_WEIGHTS" --progress-bar; then
    fail "Could not download pytorch_model.bin from $HF_MIRROR."
  fi

  # Verify it's a real model file (zip/pytorch format), not HTML
  if looks_like_html "$SNAC_WEIGHTS"; then
    rm -f "$SNAC_WEIGHTS"
    fail "Downloaded model is HTML (proxy block page). Try from home network."
  fi
  SIZE=$(file_size "$SNAC_WEIGHTS")
  if [ "$SIZE" -le "$SNAC_MIN_BYTES" ]; then
    rm -f "$SNAC_WEIGHTS"
    fail "Downloaded model is too small (${SIZE} bytes). Try from home network."
  fi
  ok "SNAC decoder downloaded"
fi

# Verify SNAC loads in Python
echo "Verifying SNAC decoder loads..."
if "$VENV/bin/python" -c '
import sys
import snac, torch
model = snac.SNAC.from_pretrained(sys.argv[1])
print(f"SNAC: {sum(p.numel() for p in model.parameters())/1e6:.1f}M parameters")
' "$SNAC_DIR" 2>/dev/null; then
  ok "SNAC decoder verified"
else
  fail "SNAC decoder failed to load. Delete models/snac_24khz/ and re-run."
fi

# -- 6. (Optional) Download Qwen3-TTS --------------------------
step "Qwen3-TTS 0.6B (optional, ~1.7 GB total)"

QWEN_TOKENIZER_DIR="$MODELS_DIR/Qwen3-TTS-Tokenizer-12Hz"
QWEN_MODEL_DIR="$MODELS_DIR/Qwen3-TTS-12Hz-0.6B-CustomVoice"

# qwen_installed
# Returns 0 only when both the config and the weights are present, so a
# run that fetched config.json but failed on the weights is retried.
qwen_installed() {
  [ -f "$QWEN_MODEL_DIR/config.json" ] && [ -f "$QWEN_MODEL_DIR/model.safetensors" ]
}

# qwen_fetch_optional <repo> <dest-dir> <file>...
# Best-effort download of small metadata files; a missing file only warns.
qwen_fetch_optional() {
  local repo="$1" dest_dir="$2" f
  shift 2
  for f in "$@"; do
    if ! download "$HF_MIRROR/Qwen/$repo/resolve/main/$f" "$dest_dir/$f" 2>/dev/null; then
      warn "Could not download $f (skipping)"
    fi
  done
}

if qwen_installed; then
  ok "Qwen3-TTS already downloaded"
else
  echo "Qwen3-TTS 0.6B requires ~1.7 GB download (tokenizer + model)."
  echo "This is optional - Orpheus TTS (above) works without it."
  # read fails on EOF (non-interactive runs); treat that as "no" instead of
  # letting set -e abort the script before the summary.
  REPLY=""
  read -p "Download Qwen3-TTS? [y/N] " -n 1 -r || REPLY=""
  echo
  if [[ $REPLY =~ ^[Yy]$ ]]; then
    # Tokenizer (~650 MB)
    echo "Downloading Qwen3-TTS Tokenizer (~650 MB)..."
    mkdir -p "$QWEN_TOKENIZER_DIR"
    qwen_fetch_optional "Qwen3-TTS-Tokenizer-12Hz" "$QWEN_TOKENIZER_DIR" \
      config.json configuration.json preprocessor_config.json
    if ! download "$HF_MIRROR/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/model.safetensors" \
        "$QWEN_TOKENIZER_DIR/model.safetensors" --progress-bar; then
      fail "Could not download the Qwen3-TTS tokenizer from $HF_MIRROR."
    fi
    if looks_like_html "$QWEN_TOKENIZER_DIR/model.safetensors"; then
      rm -f "$QWEN_TOKENIZER_DIR/model.safetensors"
      fail "Downloaded tokenizer is HTML (proxy block page). Try from home network."
    fi
    ok "Tokenizer downloaded"

    # Model
    echo "Downloading Qwen3-TTS 0.6B (~1.2 GB)..."
    mkdir -p "$QWEN_MODEL_DIR"
    qwen_fetch_optional "Qwen3-TTS-12Hz-0.6B-CustomVoice" "$QWEN_MODEL_DIR" \
      config.json generation_config.json
    if ! download "$HF_MIRROR/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/model.safetensors" \
        "$QWEN_MODEL_DIR/model.safetensors" --progress-bar; then
      fail "Could not download the Qwen3-TTS model from $HF_MIRROR."
    fi
    if looks_like_html "$QWEN_MODEL_DIR/model.safetensors"; then
      rm -f "$QWEN_MODEL_DIR/model.safetensors"
      fail "Downloaded model is HTML (proxy block page). Try from home network."
    fi
    ok "Qwen3-TTS 0.6B downloaded"
  else
    warn "Skipped. You can re-run this script later to download."
  fi
fi

# -- Summary ---------------------------------------------------
step "Setup Complete"

# Captured separately: inside a pipeline the exit status is head's, so an
# inline `|| echo ready` would never run when nothing matches.
ORPHEUS_ENTRY=$(ollama_list 2>/dev/null | grep orpheus | head -1) || true

echo ""
echo "Installed components:"
echo "  Orpheus TTS (Ollama): ${ORPHEUS_ENTRY:-ready}"
echo "  SNAC decoder:         $MODELS_DIR/snac_24khz/"
if qwen_installed; then
  echo "  Qwen3-TTS 0.6B:       $QWEN_MODEL_DIR/"
else
  echo "  Qwen3-TTS 0.6B:       (not installed - re-run setup to add)"
fi

echo ""
echo "Disk usage:"
du -sh "$MODELS_DIR"/* 2>/dev/null | sed 's/^/  /'

echo ""
echo "Test commands:"
echo "  $VENV/bin/python $SCRIPT_DIR/test_orpheus_tts.py"
if $IS_MAC; then
  echo "  afplay test_orpheus_tara.wav"
else
  echo "  aplay test_orpheus_tara.wav  (or: ffplay -nodisp -autoexit test_orpheus_tara.wav)"
fi
if qwen_installed; then
  echo "  $VENV/bin/python $SCRIPT_DIR/test_qwen_tts.py"
fi

echo ""
echo "Voices:  tara, leah, jess, leo, dan, mia, zac, zoe"
echo "Emotion: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>"