Stabilize Windows+WSL setup by fixing script line-ending pitfalls, WSL Ollama host detection, and dashboard startup behavior so models are detected reliably in Mission Control. Co-authored-by: Cursor <cursoragent@cursor.com>
343 lines
12 KiB
Bash
Executable File
343 lines
12 KiB
Bash
Executable File
#!/bin/bash
|
|
# ============================================================
|
|
# TTS Setup - One-Shot Script for Fresh Laptop
|
|
#
|
|
# Sets up Orpheus TTS (via Ollama) and Qwen3-TTS (direct Python)
|
|
# on macOS (Apple Silicon) or Linux (CUDA GPU / WSL2).
|
|
#
|
|
# What this does:
|
|
# 1. Installs Python 3.12 (Homebrew on macOS, apt on Linux)
|
|
# 2. Creates Python venv with TTS packages (MPS on macOS, CUDA on Linux)
|
|
# 3. Pulls Orpheus TTS model via Ollama
|
|
# 4. Downloads SNAC audio decoder
|
|
# 5. (Optional) Downloads Qwen3-TTS 0.6B
|
|
#
|
|
# Prerequisites:
|
|
# macOS: Homebrew + Ollama installed
|
|
# Linux: apt + Ollama accessible at localhost:11434
|
|
#
|
|
# Usage:
|
|
# bash setup-tts.sh
|
|
# From WSL with Ollama on Windows: OLLAMA_HOST=http://<Windows_IP>:11434 bash setup-tts.sh
|
|
#
|
|
# After setup, test with:
|
|
# .venv-qwen-tts/bin/python test_orpheus_tts.py
|
|
# ============================================================
|
|
# Abort on the first unhandled command failure.
set -e

# All artefacts (venv, model files) live next to this script.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
VENV="$SCRIPT_DIR/.venv-qwen-tts"
MODELS_DIR="$SCRIPT_DIR/models"

# HuggingFace mirror that works through corporate proxy.
# On personal machines, set HF_MIRROR=https://huggingface.co to download directly.
HF_MIRROR="${HF_MIRROR:-https://hf-mirror.com}"

# Ollama API base URL, without a trailing slash.
# (For WSL2 with Ollama on Windows, set OLLAMA_HOST=http://<Windows_IP>:11434.)
OLLAMA_BASE="${OLLAMA_HOST:-http://localhost:11434}"
OLLAMA_BASE="${OLLAMA_BASE%/}"

# WSL2 fallback: only when the default URL is in use, localhost does not
# answer, and the kernel identifies itself as Microsoft. In that case probe
# the default gateway and adopt it if Ollama responds there.
if [[ "$OLLAMA_BASE" == "http://localhost:11434" ]] \
  && ! curl -s --max-time 2 "$OLLAMA_BASE/api/tags" &>/dev/null \
  && [[ -r /proc/version ]] \
  && grep -qi microsoft /proc/version 2>/dev/null; then
  WIN_HOST=$(ip route show default 2>/dev/null | awk '{print $3}' | head -1)
  if [[ -n "$WIN_HOST" ]] \
    && curl -s --max-time 2 "http://${WIN_HOST}:11434/api/tags" &>/dev/null; then
    OLLAMA_BASE="http://${WIN_HOST}:11434"
    export OLLAMA_HOST="${OLLAMA_BASE}"
  fi
fi

# Export the effective endpoint for child processes; a caller-supplied
# OLLAMA_HOST is kept as-is.
export OLLAMA_HOST="${OLLAMA_HOST:-$OLLAMA_BASE}"
|
|
|
|
# Detect OS. IS_MAC holds the literal word `true` or `false`, so it can be
# executed directly as a condition (`if $IS_MAC; then ...`).
OS_TYPE="$(uname -s)"
case "$OS_TYPE" in
  Darwin) IS_MAC=true ;;
  *) IS_MAC=false ;;
esac
|
|
|
|
# ANSI colour codes, kept in backslash-escaped form; the helpers below expand
# them with printf's %b.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Status helpers. Each takes the message as $1 and writes one line to stdout.
# The message is printed with %s, so backslashes in it (e.g. Windows paths
# such as C:\new) are shown verbatim instead of being treated as escapes.
#   ok   - green [OK] prefix
#   warn - yellow [!!] prefix
#   fail - red [FAIL] prefix, then exits the script with status 1
#   step - blank line followed by a green "=== message ===" section header
ok() { printf '%b[OK]%b %s\n' "$GREEN" "$NC" "$1"; }
warn() { printf '%b[!!]%b %s\n' "$YELLOW" "$NC" "$1"; }
fail() { printf '%b[FAIL]%b %s\n' "$RED" "$NC" "$1"; exit 1; }
step() { printf '\n%b=== %s ===%b\n' "$GREEN" "$1" "$NC"; }
|
|
|
|
# Startup banner: four box lines followed by one blank line.
printf '%s\n' \
  "+------------------------------------------------------+" \
  "| TTS Setup - Local Speech Generation |" \
  "| Orpheus TTS (Ollama) + Qwen3-TTS (Python) |" \
  "+------------------------------------------------------+" \
  ""
|
|
|
|
# -- 0. Check prerequisites -----------------------------------
step "Checking prerequisites"

if $IS_MAC; then
  # Homebrew is mandatory on macOS: it is used to install anything missing.
  if ! command -v brew &>/dev/null; then
    fail "Homebrew not found. Install: /bin/bash -c \"\$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)\""
  fi
  ok "Homebrew"

  # Ollama (install via Homebrew if missing)
  if ! command -v ollama &>/dev/null; then
    warn "Ollama not found. Installing..."
    brew install ollama
  fi
elif ! command -v ollama &>/dev/null; then
  # Linux / WSL2 without a local CLI: Ollama is expected to be served from
  # elsewhere (on WSL2, the Windows side), so it must answer over HTTP.
  if ! curl -s --max-time 2 "$OLLAMA_BASE/api/tags" &>/dev/null; then
    fail "Ollama not reachable at $OLLAMA_BASE. From WSL set OLLAMA_HOST=http://<Windows_IP>:11434 (e.g. from /etc/resolv.conf nameserver)."
  fi
  ok "Ollama reachable at $OLLAMA_BASE (Windows host)"
fi

# Only report a local install when the CLI is actually on PATH; in the
# remote-only case the "reachable" line above is the accurate status.
if command -v ollama &>/dev/null; then
  ok "Ollama installed"
fi
|
|
|
|
# Check if Ollama is running; if not, try to start a local server.
if ! curl -s --max-time 2 "$OLLAMA_BASE/api/tags" &>/dev/null; then
  warn "Ollama not running at $OLLAMA_BASE. Starting..."
  if command -v ollama &>/dev/null; then
    ollama serve &>/dev/null &
    # Poll once per second (up to 10 attempts) instead of a single fixed
    # sleep: returns as soon as the API answers and tolerates a slow start.
    for attempt in 1 2 3 4 5 6 7 8 9 10; do
      sleep 1
      if curl -s --max-time 2 "$OLLAMA_BASE/api/tags" &>/dev/null; then
        break
      fi
    done
  fi
  # Final verdict; also covers the case where no local CLI exists to start.
  if ! curl -s --max-time 2 "$OLLAMA_BASE/api/tags" &>/dev/null; then
    fail "Could not start Ollama. Try manually: ollama serve (or from WSL set OLLAMA_HOST=http://<Windows_IP>:11434)"
  fi
fi
ok "Ollama running at $OLLAMA_BASE"
|
|
|
|
# GPU check (informational only: warnings here never stop the script).
ARCH=$(uname -m)
if ! $IS_MAC; then
  # Linux / WSL2: the presence of nvidia-smi is used as the CUDA indicator.
  if ! command -v nvidia-smi &>/dev/null; then
    warn "nvidia-smi not found. CUDA acceleration won't be available (CPU fallback)."
  else
    GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1)
    ok "NVIDIA GPU detected: $GPU_NAME - CUDA acceleration available"
  fi
elif [ "$ARCH" = "arm64" ]; then
  ok "Apple Silicon ($ARCH) - MPS acceleration available"
else
  warn "Not Apple Silicon ($ARCH). MPS acceleration won't be available."
fi
|
|
|
|
# -- 1. Install Python 3.12 -----------------------------------
step "Python 3.12"

PYTHON_CMD=""
# Take the first interpreter found among the usual Python 3.12 locations.
# NOTE(review): the last candidate, plain `python3`, accepts whatever version
# is on PATH, so the interpreter reported below may not be 3.12 - confirm
# this fallback is intended.
for cmd in python3.12 /opt/homebrew/bin/python3.12 /usr/local/bin/python3.12 python3; do
  if command -v "$cmd" &>/dev/null; then
    PYTHON_CMD="$cmd"
    break
  fi
done

if [ -z "$PYTHON_CMD" ]; then
  if $IS_MAC; then
    warn "Python 3.12 not found. Installing via Homebrew..."
    brew install python@3.12
    # Ask Homebrew for its prefix rather than assuming /opt/homebrew: that
    # is the Apple Silicon location, Intel Macs use /usr/local.
    PYTHON_CMD="$(brew --prefix)/bin/python3.12"
  else
    warn "Python 3.12 not found. Installing via apt..."
    sudo apt update && sudo apt install -y python3.12 python3.12-venv python3-pip
    PYTHON_CMD="python3.12"
  fi
fi

# Report the interpreter that the following steps will use.
PYTHON_VER=$("$PYTHON_CMD" --version 2>&1)
ok "$PYTHON_VER at $PYTHON_CMD"
|
|
|
|
# -- 2. Create venv --------------------------------------------
step "Python virtual environment"

# The venv counts as present when its python binary exists; otherwise it is
# created with the interpreter selected in the previous step.
if [ ! -f "$VENV/bin/python" ]; then
  echo "Creating venv..."
  "$PYTHON_CMD" -m venv "$VENV"
  ok "Venv created at $VENV"
else
  ok "Venv exists at $VENV"
fi
|
|
|
|
# -- 3. Install Python packages --------------------------------
step "Python packages"

# An importable `snac` is used as a quick proxy for "everything is installed".
if ! "$VENV/bin/python" -c "import snac" &>/dev/null; then
  echo "Installing packages (this may take a few minutes)..."
  "$VENV/bin/pip" install -U pip --quiet
  if ! $IS_MAC; then
    # Linux: install the CUDA build of PyTorch first, before snac/qwen-tts.
    # (On macOS no explicit PyTorch install is done; the default build
    # includes MPS support.)
    echo "Installing PyTorch with CUDA support..."
    "$VENV/bin/pip" install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 --quiet
  fi
  "$VENV/bin/pip" install -U snac qwen-tts --quiet
  ok "Packages installed"
else
  ok "Packages already installed (snac, torch, etc.)"
fi
|
|
|
|
# -- 4. Pull Orpheus TTS model ---------------------------------
step "Orpheus TTS model (Ollama)"

#######################################
# List the models known to Ollama.
# Uses the ollama CLI when it is on PATH; otherwise queries the HTTP API
# (e.g. WSL2, where the CLI lives on the Windows side) and prints one model
# name per line.
# Globals:   OLLAMA_BASE (read), PYTHON_CMD (read, optional)
# Arguments: none
# Outputs:   `ollama list` output, or model names from /api/tags, on stdout.
#            Prints nothing if the API call or JSON parsing fails.
#######################################
ollama_list() {
  if command -v ollama &>/dev/null; then
    ollama list 2>/dev/null
  else
    # --max-time keeps a stalled endpoint from hanging the script; the JSON
    # is parsed with the interpreter the script selected, falling back to
    # python3 when PYTHON_CMD is not set.
    curl -s --max-time 10 "$OLLAMA_BASE/api/tags" 2>/dev/null \
      | "${PYTHON_CMD:-python3}" -c 'import json, sys
for m in json.load(sys.stdin).get("models", []):
    print(m["name"])' 2>/dev/null
  fi
}
|
|
|
|
#######################################
# Pull a model into Ollama.
# Uses the ollama CLI when it is on PATH; otherwise issues a non-streaming
# POST to /api/pull and prints the response body.
# Globals:   OLLAMA_BASE (read)
#            OLLAMA_PULL_TIMEOUT (read, optional) - curl --max-time in
#            seconds for the API path; defaults to 600.
# Arguments: $1 - model name, e.g. sematre/orpheus:en
# Outputs:   CLI progress, or the API response body, on stdout
# Returns:   0 on success; the CLI's or curl's exit status on failure; 1 when
#            the API response contains an "error" field, so callers do not
#            report a failed pull as downloaded.
#######################################
ollama_pull() {
  local model="$1"
  local response
  if command -v ollama &>/dev/null; then
    ollama pull "$model"
  else
    echo " (Using Ollama API at $OLLAMA_BASE to pull model...)"
    response=$(curl -s -X POST "$OLLAMA_BASE/api/pull" \
      -d "{\"name\":\"$model\",\"stream\":false}" \
      --max-time "${OLLAMA_PULL_TIMEOUT:-600}") || return
    printf '%s\n' "$response"
    if [[ "$response" == *'"error"'* ]]; then
      return 1
    fi
  fi
}
|
|
|
|
# Pull the Orpheus model only when no listed model mentions "orpheus".
if ! ollama_list | grep -q "orpheus"; then
  echo "Pulling sematre/orpheus:en (4 GB)..."
  ollama_pull "sematre/orpheus:en"
  ok "Orpheus TTS downloaded"
else
  ok "Orpheus TTS already downloaded"
fi
|
|
|
|
# -- 5. Download SNAC decoder ----------------------------------
step "SNAC 24kHz audio decoder (~76 MB)"

SNAC_DIR="$MODELS_DIR/snac_24khz"
SNAC_MODEL="$SNAC_DIR/pytorch_model.bin"

mkdir -p "$SNAC_DIR"

# An existing model file is kept only if it is larger than 1,000,000 bytes;
# anything smaller is treated as a failed download and removed.
if [ -f "$SNAC_MODEL" ]; then
  # BSD stat (macOS) and GNU stat (Linux) use different size format flags.
  if $IS_MAC; then
    SIZE=$(stat -f%z "$SNAC_MODEL" 2>/dev/null)
  else
    SIZE=$(stat -c%s "$SNAC_MODEL" 2>/dev/null)
  fi
  if [ "$SIZE" -gt 1000000 ]; then
    # awk does the MB conversion so the script does not depend on bc.
    ok "SNAC decoder already downloaded ($(awk -v bytes="$SIZE" 'BEGIN { printf "%.1f", bytes / 1048576 }') MB)"
  else
    warn "SNAC file looks corrupted (${SIZE} bytes). Re-downloading..."
    rm -f "$SNAC_MODEL"
  fi
fi

if [ ! -f "$SNAC_MODEL" ]; then
  echo "Downloading config.json..."
  # NOTE(review): -k disables TLS certificate verification. Presumably this
  # is for the corporate proxy mentioned for HF_MIRROR - confirm it is still
  # required, since it lets the downloaded files be tampered with in transit.
  curl -k -sL -o "$SNAC_DIR/config.json" \
    "$HF_MIRROR/hubertsiuzdak/snac_24khz/raw/main/config.json"

  # Verify config is JSON (not an HTML block page). The path is passed as an
  # argument, not spliced into the Python source, so quotes in it are safe.
  if ! "${PYTHON_CMD:-python3}" -c 'import json, sys; json.load(open(sys.argv[1]))' \
    "$SNAC_DIR/config.json" &>/dev/null; then
    fail "Downloaded config.json is not valid JSON. The mirror may be blocked. Try from home network."
  fi
  ok "config.json downloaded"

  echo "Downloading pytorch_model.bin (~76 MB)..."
  curl -k -L --progress-bar -o "$SNAC_MODEL" \
    "$HF_MIRROR/hubertsiuzdak/snac_24khz/resolve/main/pytorch_model.bin"

  # Verify it's a real model file (zip/pytorch format), not HTML
  FILE_TYPE=$(file -b "$SNAC_MODEL" | head -c 20)
  if echo "$FILE_TYPE" | grep -qi "html"; then
    rm -f "$SNAC_MODEL"
    fail "Downloaded model is HTML (proxy block page). Try from home network."
  fi
  ok "SNAC decoder downloaded"
fi
|
|
|
|
# Verify SNAC loads in Python.
# The model directory is handed over as sys.argv[1] instead of being
# interpolated into the Python source, so a path containing a quote cannot
# break the snippet. Python's stderr is discarded; only success or failure
# is reported.
echo "Verifying SNAC decoder loads..."
if "$VENV/bin/python" -c '
import sys
import snac, torch
model = snac.SNAC.from_pretrained(sys.argv[1])
print(f"SNAC: {sum(p.numel() for p in model.parameters())/1e6:.1f}M parameters")
' "$MODELS_DIR/snac_24khz" 2>/dev/null; then
  ok "SNAC decoder verified"
else
  fail "SNAC decoder failed to load. Delete models/snac_24khz/ and re-run."
fi
|
|
|
|
# -- 6. (Optional) Download Qwen3-TTS --------------------------
step "Qwen3-TTS 0.6B (optional, ~1.7 GB total)"

QWEN_TOKENIZER_DIR="$MODELS_DIR/Qwen3-TTS-Tokenizer-12Hz"
QWEN_MODEL_DIR="$MODELS_DIR/Qwen3-TTS-12Hz-0.6B-CustomVoice"

# Require the weights as well as config.json: config.json is written first,
# so on its own it does not prove the large download finished.
if [ -f "$QWEN_MODEL_DIR/config.json" ] && [ -f "$QWEN_MODEL_DIR/model.safetensors" ]; then
  ok "Qwen3-TTS already downloaded"
else
  echo "Qwen3-TTS 0.6B requires ~1.7 GB download (tokenizer + model)."
  echo "This is optional - Orpheus TTS (above) works without it."
  # read returns non-zero on EOF (no terminal, closed stdin); without the
  # fallback, `set -e` would abort the script here. EOF is treated as "no".
  read -p "Download Qwen3-TTS? [y/N] " -n 1 -r || REPLY=""
  echo
  if [[ $REPLY =~ ^[Yy]$ ]]; then
    # Tokenizer (~650 MB)
    echo "Downloading Qwen3-TTS Tokenizer (~650 MB)..."
    mkdir -p "$QWEN_TOKENIZER_DIR"
    # Small metadata files are best-effort. -f makes curl fail on HTTP errors
    # instead of saving the error page under the target file name.
    for f in config.json configuration.json preprocessor_config.json; do
      curl -k -fsL -o "$QWEN_TOKENIZER_DIR/$f" \
        "$HF_MIRROR/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/$f" 2>/dev/null || true
    done
    # Large files go to a .part name and are renamed only after curl
    # succeeds, so an interrupted transfer is never taken for a complete one.
    curl -k -fL --progress-bar -o "$QWEN_TOKENIZER_DIR/model.safetensors.part" \
      "$HF_MIRROR/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/model.safetensors"
    mv -f "$QWEN_TOKENIZER_DIR/model.safetensors.part" "$QWEN_TOKENIZER_DIR/model.safetensors"
    ok "Tokenizer downloaded"

    # Model
    echo "Downloading Qwen3-TTS 0.6B (~1.2 GB)..."
    mkdir -p "$QWEN_MODEL_DIR"
    for f in config.json generation_config.json; do
      curl -k -fsL -o "$QWEN_MODEL_DIR/$f" \
        "$HF_MIRROR/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/$f" 2>/dev/null || true
    done
    curl -k -fL --progress-bar -o "$QWEN_MODEL_DIR/model.safetensors.part" \
      "$HF_MIRROR/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/model.safetensors"
    mv -f "$QWEN_MODEL_DIR/model.safetensors.part" "$QWEN_MODEL_DIR/model.safetensors"
    ok "Qwen3-TTS 0.6B downloaded"
  else
    warn "Skipped. You can re-run this script later to download."
  fi
fi
|
|
|
|
# -- Summary ---------------------------------------------------
step "Setup Complete"

# First listed model mentioning "orpheus". The pipeline's exit status is that
# of `head`, so an `|| echo` fallback would never run; the default is applied
# with ${...:-ready} instead.
ORPHEUS_ENTRY=$(ollama_list 2>/dev/null | grep orpheus | head -1 || true)

# One installed-check for Qwen3-TTS, shared by the component list and the
# test-command hints so the two cannot disagree.
QWEN_INSTALLED=false
if [ -d "$QWEN_MODEL_DIR" ] && [ -f "$QWEN_MODEL_DIR/config.json" ]; then
  QWEN_INSTALLED=true
fi

echo ""
echo "Installed components:"
echo " Orpheus TTS (Ollama): ${ORPHEUS_ENTRY:-ready}"
echo " SNAC decoder: $MODELS_DIR/snac_24khz/"
if $QWEN_INSTALLED; then
  echo " Qwen3-TTS 0.6B: $QWEN_MODEL_DIR/"
else
  echo " Qwen3-TTS 0.6B: (not installed - re-run setup to add)"
fi
echo ""
echo "Disk usage:"
du -sh "$MODELS_DIR"/* 2>/dev/null | sed 's/^/ /'
echo ""
echo "Test commands:"
echo " $VENV/bin/python $SCRIPT_DIR/test_orpheus_tts.py"
# Suggest an audio player that matches the platform.
if $IS_MAC; then
  echo " afplay test_orpheus_tara.wav"
else
  echo " aplay test_orpheus_tara.wav (or: ffplay -nodisp -autoexit test_orpheus_tara.wav)"
fi
if $QWEN_INSTALLED; then
  echo " $VENV/bin/python $SCRIPT_DIR/test_qwen_tts.py"
fi
echo ""
echo "Voices: tara, leah, jess, leo, dan, mia, zac, zoe"
echo "Emotion: <laugh>, <chuckle>, <sigh>, <cough>, <groan>, <yawn>, <gasp>"
|