From b1d2e4ec81b0248e37381bbbf7cb920a969f4cad Mon Sep 17 00:00:00 2001 From: saravanakumardb1 Date: Sat, 21 Feb 2026 15:27:49 -0800 Subject: [PATCH] =?UTF-8?q?fix(local-llms):=20cross-platform=20audit=20?= =?UTF-8?q?=E2=80=94=208=20bugs/gaps=20fixed?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - setup-tts.sh: make fully cross-platform (macOS + Linux/WSL2) - OS detection, apt fallback, CUDA PyTorch install, nvidia-smi check - cross-platform playback hints, HF_MIRROR env override - api/system/route.ts: fix ffmpeg detection (use -version not --version) - api/system/memory/route.ts: remove unused total variable in Linux path - api/system/exec/route.ts: expand allowlist with Linux commands (head, tail, grep, which, ps, uname, free, lscpu, nvidia-smi, etc.) - api/tts/route.ts: cross-platform venv path + CUDA/MPS label - api/whisper/route.ts: Linux binary/model paths - api/ollama/logs/route.ts: Linux log paths + WSL2 hint - test_qwen_tts.py: platform-aware speech text + CUDA device detection - test_orpheus_tts.py: platform-aware text, move import sys to top - setup-guide.md: fix false auto-detect claim, add HF_MIRROR hint --- .../src/app/api/ollama/logs/route.ts | 14 +- .../src/app/api/system/exec/route.ts | 17 +- .../src/app/api/system/memory/route.ts | 94 ++-- .../dashboard/src/app/api/system/route.ts | 166 ++++-- .../dashboard/src/app/api/tts/route.ts | 24 +- .../dashboard/src/app/api/whisper/route.ts | 19 +- __LOCAL_LLMs/setup-tts.sh | 112 ++-- __LOCAL_LLMs/test_orpheus_tts.py | 8 +- __LOCAL_LLMs/test_qwen_tts.py | 12 +- __LOCAL_LLMs/windows_specific/setup-guide.md | 492 +++++++----------- 10 files changed, 518 insertions(+), 440 deletions(-) diff --git a/__LOCAL_LLMs/dashboard/src/app/api/ollama/logs/route.ts b/__LOCAL_LLMs/dashboard/src/app/api/ollama/logs/route.ts index 9e1c4228..c21f4855 100644 --- a/__LOCAL_LLMs/dashboard/src/app/api/ollama/logs/route.ts +++ b/__LOCAL_LLMs/dashboard/src/app/api/ollama/logs/route.ts 
@@ -4,11 +4,15 @@ import { homedir } from 'os'; import { join } from 'path'; import { existsSync } from 'fs'; +const IS_MAC = process.platform === 'darwin'; + export async function GET() { const logPaths = [ join(homedir(), '.ollama', 'logs', 'server.log'), join(homedir(), '.ollama', 'logs', 'gpu.log'), '/tmp/ollama.log', + // Linux / WSL2 — journalctl may write here + '/var/log/ollama.log', ]; for (const logPath of logPaths) { @@ -25,11 +29,13 @@ export async function GET() { } } - // On macOS, Ollama logs via unified logging + // Fallback: platform-specific logging hint + const hint = IS_MAC + ? 'Ollama uses macOS unified logging. Use: log show --predicate \'subsystem == "com.ollama"\' --last 5m' + : 'Ollama logs not found. If running on Windows (accessed via WSL2), check Windows Event Viewer or: journalctl -u ollama --no-pager -n 50'; + return NextResponse.json({ - lines: [ - 'Ollama uses macOS unified logging. Use: log show --predicate \'subsystem == "com.ollama"\' --last 5m', - ], + lines: [hint], path: 'system', total: 1, }); diff --git a/__LOCAL_LLMs/dashboard/src/app/api/system/exec/route.ts b/__LOCAL_LLMs/dashboard/src/app/api/system/exec/route.ts index a5b97d18..8ee0bcc7 100644 --- a/__LOCAL_LLMs/dashboard/src/app/api/system/exec/route.ts +++ b/__LOCAL_LLMs/dashboard/src/app/api/system/exec/route.ts @@ -5,9 +5,9 @@ import { promisify } from 'util'; const execFileAsync = promisify(execFile); const COMMAND_ALLOWLIST = new Set([ + // Cross-platform (macOS + Linux) 'git', 'npm', - 'brew', 'cat', 'ls', 'wc', @@ -15,6 +15,21 @@ const COMMAND_ALLOWLIST = new Set([ 'df', 'echo', 'date', + 'head', + 'tail', + 'grep', + 'which', + 'ps', + 'uname', + 'whoami', + // macOS + 'brew', + // Linux / WSL2 + 'free', + 'lscpu', + 'nvidia-smi', + 'dpkg', + 'apt', ]); export async function POST(request: NextRequest) { diff --git a/__LOCAL_LLMs/dashboard/src/app/api/system/memory/route.ts b/__LOCAL_LLMs/dashboard/src/app/api/system/memory/route.ts index 69e4409a..acf1f974 
100644 --- a/__LOCAL_LLMs/dashboard/src/app/api/system/memory/route.ts +++ b/__LOCAL_LLMs/dashboard/src/app/api/system/memory/route.ts @@ -1,10 +1,13 @@ import { NextResponse } from 'next/server'; import { exec } from 'child_process'; import { promisify } from 'util'; +import { readFile } from 'fs/promises'; import os from 'os'; const execAsync = promisify(exec); +const IS_MAC = process.platform === 'darwin'; + interface ProcessInfo { pid: number; name: string; @@ -26,7 +29,7 @@ interface VmStatBreakdown { async function getTopProcesses(limit = 20): Promise { try { - // ps with RSS in KB, sorted descending by RSS + // ps with RSS in KB, sorted descending by RSS — works on both macOS and Linux const { stdout } = await execAsync( `ps -axo pid=,rss=,%mem=,user=,comm= | sort -k2 -rn | head -${limit}`, { timeout: 3000 } @@ -60,36 +63,67 @@ async function getTopProcesses(limit = 20): Promise { } async function getVmStatBreakdown(): Promise { - try { - const { stdout } = await execAsync('vm_stat', { timeout: 2000 }); - const pageSizeMatch = stdout.match(/page size of (\d+) bytes/); - const pageSize = pageSizeMatch ? parseInt(pageSizeMatch[1]) : 16384; - const parse = (label: string): number => { - const match = stdout.match(new RegExp(`${label}:\\s+(\\d+)`)); - return match ? parseInt(match[1]) * pageSize : 0; - }; - return { - active: parse('Pages active'), - wired: parse('Pages wired down'), - compressor: parse('Pages occupied by compressor'), - inactive: parse('Pages inactive'), - purgeable: parse('Pages purgeable'), - speculative: parse('Pages speculative'), - free: parse('Pages free'), - pageSize, - }; - } catch { - return { - active: 0, - wired: 0, - compressor: 0, - inactive: 0, - purgeable: 0, - speculative: 0, - free: 0, - pageSize: 16384, - }; + if (IS_MAC) { + try { + const { stdout } = await execAsync('vm_stat', { timeout: 2000 }); + const pageSizeMatch = stdout.match(/page size of (\d+) bytes/); + const pageSize = pageSizeMatch ? 
parseInt(pageSizeMatch[1]) : 16384; + const parse = (label: string): number => { + const match = stdout.match(new RegExp(`${label}:\\s+(\\d+)`)); + return match ? parseInt(match[1]) * pageSize : 0; + }; + return { + active: parse('Pages active'), + wired: parse('Pages wired down'), + compressor: parse('Pages occupied by compressor'), + inactive: parse('Pages inactive'), + purgeable: parse('Pages purgeable'), + speculative: parse('Pages speculative'), + free: parse('Pages free'), + pageSize, + }; + } catch { + // fall through to zeros + } + } else { + // Linux / WSL2 — parse /proc/meminfo into vm_stat-compatible structure + try { + const raw = await readFile('/proc/meminfo', 'utf-8'); + const parse = (key: string): number => { + const match = raw.match(new RegExp(`${key}:\\s+(\\d+)`)); + return match ? parseInt(match[1]) * 1024 : 0; + }; + const free = parse('MemFree'); + const buffers = parse('Buffers'); + const cached = parse('Cached'); + const sReclaimable = parse('SReclaimable'); + const active = parse('Active'); + + return { + active, + wired: buffers, // closest analogy + compressor: parse('SwapCached'), + inactive: parse('Inactive'), + purgeable: sReclaimable, + speculative: 0, + free, + pageSize: 4096, + }; + } catch { + // fall through to zeros + } } + + return { + active: 0, + wired: 0, + compressor: 0, + inactive: 0, + purgeable: 0, + speculative: 0, + free: 0, + pageSize: IS_MAC ? 
16384 : 4096, + }; } export async function GET() { diff --git a/__LOCAL_LLMs/dashboard/src/app/api/system/route.ts b/__LOCAL_LLMs/dashboard/src/app/api/system/route.ts index 78ea6cbd..eb4c3e94 100644 --- a/__LOCAL_LLMs/dashboard/src/app/api/system/route.ts +++ b/__LOCAL_LLMs/dashboard/src/app/api/system/route.ts @@ -1,11 +1,14 @@ import { NextResponse } from 'next/server'; import { exec, execFile } from 'child_process'; import { promisify } from 'util'; +import { readFile } from 'fs/promises'; import os from 'os'; const execAsync = promisify(exec); const execFileAsync = promisify(execFile); +const IS_MAC = process.platform === 'darwin'; + // Cache slow commands with TTL let staticCache: { chip: string; @@ -20,10 +23,17 @@ const OLLAMA_DISK_TTL = 60 * 1000; // 60 seconds async function getChipInfo(): Promise { try { + if (IS_MAC) { + const { stdout } = await execAsync( + "sysctl -n machdep.cpu.brand_string 2>/dev/null || echo 'Unknown'" + ); + return stdout.trim(); + } + // Linux / WSL2 const { stdout } = await execAsync( - "sysctl -n machdep.cpu.brand_string 2>/dev/null || echo 'Unknown'" + "lscpu 2>/dev/null | grep 'Model name' | sed 's/.*:\\s*//' || cat /proc/cpuinfo | grep 'model name' | head -1 | sed 's/.*: //'" ); - return stdout.trim(); + return stdout.trim() || 'Unknown'; } catch { return 'Unknown'; } @@ -55,30 +65,63 @@ async function getOllamaModelsDiskUsage(): Promise { async function getGpuInfo(): Promise { try { + if (IS_MAC) { + const { stdout } = await execAsync( + "system_profiler SPDisplaysDataType 2>/dev/null | grep 'Chipset Model' | sed 's/.*: //'", + { timeout: 5000 } + ); + return stdout.trim() || 'Apple Silicon (integrated)'; + } + // Linux / WSL2 — try nvidia-smi first, fall back to lspci + try { + const { stdout } = await execAsync( + 'nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1', + { timeout: 3000 } + ); + if (stdout.trim()) return stdout.trim(); + } catch { + /* no nvidia-smi */ + } const { stdout } = await 
execAsync( - "system_profiler SPDisplaysDataType 2>/dev/null | grep 'Chipset Model' | sed 's/.*: //'", - { timeout: 5000 } + "lspci 2>/dev/null | grep -i 'vga\\|3d' | sed 's/.*: //' | head -1" ); - return stdout.trim() || 'Apple Silicon (integrated)'; + return stdout.trim() || 'Unknown'; } catch { - return 'Apple Silicon (integrated)'; + return IS_MAC ? 'Apple Silicon (integrated)' : 'Unknown'; } } async function getBrewPackages(): Promise> { const targets = ['ollama', 'whisper-cpp', 'ffmpeg']; const results: Array<{ name: string; version: string }> = []; - for (const pkg of targets) { - try { - const { stdout } = await execFileAsync('brew', ['list', '--versions', pkg], { - timeout: 3000, - }); - const parts = stdout.trim().split(' '); - if (parts.length >= 2) { - results.push({ name: parts[0], version: parts.slice(1).join(' ') }); + + if (IS_MAC) { + for (const pkg of targets) { + try { + const { stdout } = await execFileAsync('brew', ['list', '--versions', pkg], { + timeout: 3000, + }); + const parts = stdout.trim().split(' '); + if (parts.length >= 2) { + results.push({ name: parts[0], version: parts.slice(1).join(' ') }); + } + } catch { + // not installed + } + } + } else { + // Linux / WSL2 — check via version commands (ffmpeg uses -version, others use --version) + for (const pkg of targets) { + const bin = pkg === 'whisper-cpp' ? 'whisper-cli' : pkg; + const flag = bin === 'ffmpeg' ? 
'-version' : '--version'; + try { + const { stdout } = await execAsync(`${bin} ${flag} 2>&1 | head -1`, { timeout: 3000 }); + if (stdout.trim()) { + results.push({ name: pkg, version: stdout.trim() }); + } + } catch { + // not installed } - } catch { - // not installed } } return results; @@ -106,7 +149,8 @@ async function getCachedOllamaDiskUsage(): Promise { return value; } -// macOS vm_stat gives accurate memory breakdown (os.freemem() excludes reclaimable cache) +// macOS: vm_stat gives accurate memory breakdown (os.freemem() excludes reclaimable cache) +// Linux: /proc/meminfo gives accurate breakdown async function getAccurateMemory(): Promise<{ total: number; appMemory: number; @@ -115,42 +159,66 @@ async function getAccurateMemory(): Promise<{ pressure: string; }> { const totalMem = os.totalmem(); - try { - const { stdout } = await execAsync('vm_stat', { timeout: 2000 }); - const pageSizeMatch = stdout.match(/page size of (\d+) bytes/); - const pageSize = pageSizeMatch ? parseInt(pageSizeMatch[1]) : 16384; - const parse = (label: string): number => { - const match = stdout.match(new RegExp(`${label}:\\s+(\\d+)`)); - return match ? parseInt(match[1]) * pageSize : 0; - }; - const active = parse('Pages active'); - const wired = parse('Pages wired down'); - const inactive = parse('Pages inactive'); - const purgeable = parse('Pages purgeable'); - const speculative = parse('Pages speculative'); - const free = parse('Pages free'); - const compressor = parse('Pages occupied by compressor'); - const appMemory = active + wired + compressor; - const cached = inactive + purgeable + speculative; - // Return raw free separately from cached — no overlap - // available for loading = free + cached (macOS reclaims cached on demand) + if (IS_MAC) { + try { + const { stdout } = await execAsync('vm_stat', { timeout: 2000 }); + const pageSizeMatch = stdout.match(/page size of (\d+) bytes/); + const pageSize = pageSizeMatch ? 
parseInt(pageSizeMatch[1]) : 16384; + const parse = (label: string): number => { + const match = stdout.match(new RegExp(`${label}:\\s+(\\d+)`)); + return match ? parseInt(match[1]) * pageSize : 0; + }; + const active = parse('Pages active'); + const wired = parse('Pages wired down'); + const inactive = parse('Pages inactive'); + const purgeable = parse('Pages purgeable'); + const speculative = parse('Pages speculative'); + const free = parse('Pages free'); + const compressor = parse('Pages occupied by compressor'); - const ratio = appMemory / totalMem; - const pressure = ratio > 0.85 ? 'critical' : ratio > 0.7 ? 'warning' : 'normal'; + const appMemory = active + wired + compressor; + const cached = inactive + purgeable + speculative; - return { total: totalMem, appMemory, cached, free, pressure }; - } catch { - // Fallback to Node.js (inaccurate on macOS but works everywhere) - const freeMem = os.freemem(); - return { - total: totalMem, - appMemory: totalMem - freeMem, - cached: 0, - free: freeMem, - pressure: 'unknown', - }; + const ratio = appMemory / totalMem; + const pressure = ratio > 0.85 ? 'critical' : ratio > 0.7 ? 'warning' : 'normal'; + + return { total: totalMem, appMemory, cached, free, pressure }; + } catch { + // fall through to generic fallback + } + } else { + // Linux / WSL2 — parse /proc/meminfo + try { + const raw = await readFile('/proc/meminfo', 'utf-8'); + const parse = (key: string): number => { + const match = raw.match(new RegExp(`${key}:\\s+(\\d+)`)); + return match ? parseInt(match[1]) * 1024 : 0; // /proc/meminfo is in kB + }; + const total = parse('MemTotal'); + const free = parse('MemFree'); + const buffers = parse('Buffers'); + const cached = parse('Cached') + parse('SReclaimable') + buffers; + const appMemory = total - free - cached; + + const ratio = appMemory / total; + const pressure = ratio > 0.85 ? 'critical' : ratio > 0.7 ? 
'warning' : 'normal'; + + return { total, appMemory, cached, free, pressure }; + } catch { + // fall through to generic fallback + } } + + // Generic fallback (works everywhere but less accurate) + const freeMem = os.freemem(); + return { + total: totalMem, + appMemory: totalMem - freeMem, + cached: 0, + free: freeMem, + pressure: 'unknown', + }; } export async function GET() { diff --git a/__LOCAL_LLMs/dashboard/src/app/api/tts/route.ts b/__LOCAL_LLMs/dashboard/src/app/api/tts/route.ts index 15e9ba59..024a4b35 100644 --- a/__LOCAL_LLMs/dashboard/src/app/api/tts/route.ts +++ b/__LOCAL_LLMs/dashboard/src/app/api/tts/route.ts @@ -6,9 +6,14 @@ import { join, resolve } from 'path'; const execAsync = promisify(exec); +const IS_MAC = process.platform === 'darwin'; + // process.cwd() = dashboard/, parent = __LOCAL_LLMs/ const LOCAL_LLMS_DIR = resolve(process.cwd(), '..'); +// macOS/Linux: bin/python, Windows native: Scripts/python.exe +const VENV_PYTHON = join(LOCAL_LLMS_DIR, '.venv-qwen-tts', 'bin', 'python'); + interface TtsEngine { name: string; type: 'ollama' | 'python'; @@ -67,8 +72,7 @@ async function checkOrpheus(): Promise { const snacSize = hasSnac ? await getFileSize(snacPath) : 0; // Check Python venv - const venvPython = join(LOCAL_LLMS_DIR, '.venv-qwen-tts', 'bin', 'python'); - const hasVenv = await fileExists(venvPython); + const hasVenv = await fileExists(VENV_PYTHON); if (hasModel && hasSnac && hasVenv) { engine.status = 'ready'; @@ -114,13 +118,14 @@ async function checkQwenTts(): Promise { } const hasTokenizer = await fileExists(join(tokenizerDir, 'config.json')); - const venvPython = join(LOCAL_LLMS_DIR, '.venv-qwen-tts', 'bin', 'python'); - const hasVenv = await fileExists(venvPython); + const hasVenv = await fileExists(VENV_PYTHON); if (hasModel && hasTokenizer && hasVenv) { engine.status = 'ready'; engine.size = `${(modelSize / 1e9).toFixed(1)} GB`; - engine.details = '0.6B params · 10 languages · MPS/CPU'; + engine.details = IS_MAC + ? 
'0.6B params · 10 languages · MPS/CPU' + : '0.6B params · 10 languages · CUDA/CPU'; } else if (hasModel || hasTokenizer) { engine.status = 'partial'; const missing: string[] = []; @@ -141,22 +146,21 @@ async function checkVenv(): Promise<{ python?: string; packages?: string[]; }> { - const venvPython = join(LOCAL_LLMS_DIR, '.venv-qwen-tts', 'bin', 'python'); - const exists = await fileExists(venvPython); + const exists = await fileExists(VENV_PYTHON); if (!exists) return { exists: false }; try { const { stdout } = await execAsync( - `"${venvPython}" -c "import snac; import torch; print(f'snac={snac.__version__} torch={torch.__version__}')"`, + `"${VENV_PYTHON}" -c "import snac; import torch; print(f'snac={snac.__version__} torch={torch.__version__}')"`, { timeout: 5000 } ); return { exists: true, - python: venvPython, + python: VENV_PYTHON, packages: stdout.trim().split(' '), }; } catch { - return { exists: true, python: venvPython }; + return { exists: true, python: VENV_PYTHON }; } } diff --git a/__LOCAL_LLMs/dashboard/src/app/api/whisper/route.ts b/__LOCAL_LLMs/dashboard/src/app/api/whisper/route.ts index 6fae23f9..81e92d7b 100644 --- a/__LOCAL_LLMs/dashboard/src/app/api/whisper/route.ts +++ b/__LOCAL_LLMs/dashboard/src/app/api/whisper/route.ts @@ -7,9 +7,22 @@ import { homedir } from 'os'; const execAsync = promisify(exec); +const IS_MAC = process.platform === 'darwin'; + async function getWhisperBinaries(): Promise { try { - const { stdout } = await execAsync('ls /opt/homebrew/bin/whisper-* 2>/dev/null'); + if (IS_MAC) { + const { stdout } = await execAsync('ls /opt/homebrew/bin/whisper-* 2>/dev/null'); + return stdout + .trim() + .split('\n') + .filter(Boolean) + .map(p => p.split('/').pop() || p); + } + // Linux / WSL2 — check common locations + const { stdout } = await execAsync( + 'ls /usr/local/bin/whisper-* /usr/bin/whisper-* 2>/dev/null || which whisper-cli 2>/dev/null' + ); return stdout .trim() .split('\n') @@ -25,7 +38,9 @@ const WHISPER_MODEL_DIRS = 
(process.env.WHISPER_MODELS_DIR || '') .filter(Boolean) .concat([ join(homedir(), 'whisper-models'), - '/opt/homebrew/share/whisper-cpp/models', + ...(IS_MAC + ? ['/opt/homebrew/share/whisper-cpp/models'] + : ['/usr/local/share/whisper-cpp/models', '/usr/share/whisper-cpp/models']), join(homedir(), '.cache', 'whisper'), ]); diff --git a/__LOCAL_LLMs/setup-tts.sh b/__LOCAL_LLMs/setup-tts.sh index 852c7e0a..53153b51 100755 --- a/__LOCAL_LLMs/setup-tts.sh +++ b/__LOCAL_LLMs/setup-tts.sh @@ -3,26 +3,24 @@ # TTS Setup — One-Shot Script for Fresh Laptop # # Sets up Orpheus TTS (via Ollama) and Qwen3-TTS (direct Python) -# on Apple Silicon Macs. Works through corporate proxy. +# on macOS (Apple Silicon) or Linux (CUDA GPU / WSL2). # # What this does: -# 1. Installs Python 3.12 via Homebrew (if missing) -# 2. Creates Python venv with TTS packages +# 1. Installs Python 3.12 (Homebrew on macOS, apt on Linux) +# 2. Creates Python venv with TTS packages (MPS on macOS, CUDA on Linux) # 3. Pulls Orpheus TTS model via Ollama -# 4. Downloads SNAC audio decoder via hf-mirror.com -# 5. (Optional) Downloads Qwen3-TTS 0.6B via hf-mirror.com +# 4. Downloads SNAC audio decoder +# 5. 
(Optional) Downloads Qwen3-TTS 0.6B # # Prerequisites: -# - macOS with Apple Silicon (M1/M2/M3/M4) -# - Homebrew installed -# - Ollama installed (brew install ollama) +# macOS: Homebrew + Ollama installed +# Linux: apt + Ollama accessible at localhost:11434 # # Usage: # bash setup-tts.sh # # After setup, test with: # .venv-qwen-tts/bin/python test_orpheus_tts.py -# afplay test_orpheus_tara.wav # ============================================================ set -e @@ -31,7 +29,13 @@ VENV="$SCRIPT_DIR/.venv-qwen-tts" MODELS_DIR="$SCRIPT_DIR/models" # HuggingFace mirror that works through corporate proxy -HF_MIRROR="https://hf-mirror.com" +# On personal machines, set HF_MIRROR=https://huggingface.co to download directly +HF_MIRROR="${HF_MIRROR:-https://hf-mirror.com}" + +# Detect OS +OS_TYPE="$(uname -s)" +IS_MAC=false +[ "$OS_TYPE" = "Darwin" ] && IS_MAC=true RED='\033[0;31m' GREEN='\033[0;32m' @@ -52,34 +56,58 @@ echo "" # ── 0. Check prerequisites ────────────────────────────────── step "Checking prerequisites" -# Homebrew -if ! command -v brew &>/dev/null; then - fail "Homebrew not found. Install: /bin/bash -c \"\$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)\"" -fi -ok "Homebrew" +if $IS_MAC; then + # Homebrew + if ! command -v brew &>/dev/null; then + fail "Homebrew not found. Install: /bin/bash -c \"\$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)\"" + fi + ok "Homebrew" -# Ollama -if ! command -v ollama &>/dev/null; then - warn "Ollama not found. Installing..." - brew install ollama + # Ollama (install via Homebrew if missing) + if ! command -v ollama &>/dev/null; then + warn "Ollama not found. Installing..." + brew install ollama + fi +else + # Linux / WSL2 — Ollama should be installed on host or via install script + if ! command -v ollama &>/dev/null; then + # On WSL2 Ollama runs on the Windows side; check if reachable + if ! 
curl -s --max-time 2 http://localhost:11434/api/tags &>/dev/null; then + fail "Ollama not found and not reachable at localhost:11434. Install Ollama on Windows or run: curl -fsSL https://ollama.com/install.sh | sh" + fi + ok "Ollama reachable at localhost:11434 (Windows host)" + fi fi ok "Ollama installed" # Check if Ollama is running if ! curl -s --max-time 2 http://localhost:11434/api/tags &>/dev/null; then warn "Ollama not running. Starting..." - ollama serve &>/dev/null & - sleep 3 + if command -v ollama &>/dev/null; then + ollama serve &>/dev/null & + sleep 3 + fi if ! curl -s --max-time 2 http://localhost:11434/api/tags &>/dev/null; then fail "Could not start Ollama. Try manually: ollama serve" fi fi ok "Ollama running on port 11434" -# Apple Silicon check +# GPU check ARCH=$(uname -m) -if [ "$ARCH" != "arm64" ]; then - warn "Not Apple Silicon ($ARCH). MPS acceleration won't be available." +if $IS_MAC; then + if [ "$ARCH" != "arm64" ]; then + warn "Not Apple Silicon ($ARCH). MPS acceleration won't be available." + else + ok "Apple Silicon ($ARCH) — MPS acceleration available" + fi +else + if command -v nvidia-smi &>/dev/null; then + GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1) + ok "NVIDIA GPU detected: $GPU_NAME — CUDA acceleration available" + else + warn "nvidia-smi not found. CUDA acceleration won't be available (CPU fallback)." + fi fi # ── 1. Install Python 3.12 ────────────────────────────────── @@ -87,7 +115,7 @@ step "Python 3.12" PYTHON_CMD="" # Check various Python 3.12 locations -for cmd in python3.12 /opt/homebrew/bin/python3.12 /usr/local/bin/python3.12; do +for cmd in python3.12 /opt/homebrew/bin/python3.12 /usr/local/bin/python3.12 python3; do if command -v "$cmd" &>/dev/null; then PYTHON_CMD="$cmd" break @@ -95,9 +123,15 @@ for cmd in python3.12 /opt/homebrew/bin/python3.12 /usr/local/bin/python3.12; do done if [ -z "$PYTHON_CMD" ]; then - warn "Python 3.12 not found. Installing via Homebrew..." 
- brew install python@3.12 - PYTHON_CMD="/opt/homebrew/bin/python3.12" + if $IS_MAC; then + warn "Python 3.12 not found. Installing via Homebrew..." + brew install python@3.12 + PYTHON_CMD="/opt/homebrew/bin/python3.12" + else + warn "Python 3.12 not found. Installing via apt..." + sudo apt update && sudo apt install -y python3.12 python3.12-venv python3-pip + PYTHON_CMD="python3.12" + fi fi PYTHON_VER=$("$PYTHON_CMD" --version 2>&1) @@ -123,7 +157,15 @@ if "$VENV/bin/python" -c "import snac" &>/dev/null; then else echo "Installing packages (this may take a few minutes)..." "$VENV/bin/pip" install -U pip --quiet - "$VENV/bin/pip" install -U snac qwen-tts --quiet + if $IS_MAC; then + # macOS: default PyTorch includes MPS support + "$VENV/bin/pip" install -U snac qwen-tts --quiet + else + # Linux: install PyTorch with CUDA first, then snac/qwen-tts + echo "Installing PyTorch with CUDA support..." + "$VENV/bin/pip" install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 --quiet + "$VENV/bin/pip" install -U snac qwen-tts --quiet + fi ok "Packages installed" fi @@ -144,7 +186,11 @@ step "SNAC 24kHz audio decoder (~76 MB)" mkdir -p "$MODELS_DIR/snac_24khz" if [ -f "$MODELS_DIR/snac_24khz/pytorch_model.bin" ]; then - SIZE=$(stat -f%z "$MODELS_DIR/snac_24khz/pytorch_model.bin" 2>/dev/null || stat -c%s "$MODELS_DIR/snac_24khz/pytorch_model.bin" 2>/dev/null) + if $IS_MAC; then + SIZE=$(stat -f%z "$MODELS_DIR/snac_24khz/pytorch_model.bin" 2>/dev/null) + else + SIZE=$(stat -c%s "$MODELS_DIR/snac_24khz/pytorch_model.bin" 2>/dev/null) + fi if [ "$SIZE" -gt 1000000 ]; then ok "SNAC decoder already downloaded ($(echo "scale=1; $SIZE/1048576" | bc) MB)" else @@ -247,7 +293,11 @@ du -sh "$MODELS_DIR"/* 2>/dev/null | sed 's/^/ /' echo "" echo "Test commands:" echo " $VENV/bin/python $SCRIPT_DIR/test_orpheus_tts.py" -echo " afplay test_orpheus_tara.wav" +if $IS_MAC; then + echo " afplay test_orpheus_tara.wav" +else + echo " aplay 
test_orpheus_tara.wav (or: ffplay -nodisp -autoexit test_orpheus_tara.wav)" +fi if [ -d "$QWEN_MODEL_DIR" ]; then echo " $VENV/bin/python $SCRIPT_DIR/test_qwen_tts.py" fi diff --git a/__LOCAL_LLMs/test_orpheus_tts.py b/__LOCAL_LLMs/test_orpheus_tts.py index 17f05887..743f3bcd 100644 --- a/__LOCAL_LLMs/test_orpheus_tts.py +++ b/__LOCAL_LLMs/test_orpheus_tts.py @@ -13,6 +13,7 @@ Usage: """ import os import re +import sys import time import json import struct @@ -166,7 +167,7 @@ def main(): # Voices: tara, leah, jess, leo, dan, mia, zac, zoe tests = [ - ("Hello! This is Orpheus text to speech, running entirely on your Mac through Ollama.", "tara"), + ("Hello! This is Orpheus text to speech, running entirely locally through Ollama.", "tara"), (" That's amazing! Local AI speech generation without any cloud services!", "leo"), ] @@ -182,7 +183,10 @@ def main(): save_wav(audio, sr, outpath) print("\n=== Done! Open the .wav files to listen. ===") - print("Play with: afplay test_orpheus_tara.wav") + if sys.platform == "darwin": + print("Play with: afplay test_orpheus_tara.wav") + else: + print("Play with: aplay test_orpheus_tara.wav (or: ffplay -nodisp -autoexit test_orpheus_tara.wav)") if __name__ == "__main__": diff --git a/__LOCAL_LLMs/test_qwen_tts.py b/__LOCAL_LLMs/test_qwen_tts.py index 4db74545..b76a3077 100644 --- a/__LOCAL_LLMs/test_qwen_tts.py +++ b/__LOCAL_LLMs/test_qwen_tts.py @@ -1,5 +1,5 @@ """ -Test Qwen3-TTS 0.6B on Apple Silicon (MPS or CPU fallback). +Test Qwen3-TTS 0.6B (CUDA, MPS, or CPU fallback). 
Prerequisites: bash setup-tts.sh (one-shot: installs everything) @@ -24,8 +24,12 @@ if not os.path.isdir(MODEL_PATH): print("Run: bash setup-tts.sh (or: bash download-tts-models.sh qwen)") raise SystemExit(1) -# Pick device: MPS if available, else CPU -if torch.backends.mps.is_available(): +# Pick device: CUDA > MPS > CPU +if torch.cuda.is_available(): + device = "cuda" + dtype = torch.float16 + print(f"Using CUDA ({torch.cuda.get_device_name(0)})") +elif torch.backends.mps.is_available(): device = "mps" dtype = torch.float32 # MPS doesn't support bfloat16 print(f"Using MPS (Apple Metal GPU)") @@ -48,7 +52,7 @@ print(f"Supported speakers: {model.get_supported_speakers()}") print(f"Supported languages: {model.get_supported_languages()}") # Test 1: English with a built-in speaker -text = "Hello! Welcome to the local LLM dashboard. I am Qwen three T T S, running entirely on your Mac." +text = f"Hello! Welcome to the local LLM dashboard. I am Qwen three T T S, running entirely on your {'Mac' if device == 'mps' else 'machine'} using {device.upper()}." print(f"\nGenerating speech for: {text[:60]}...") t1 = time.time() diff --git a/__LOCAL_LLMs/windows_specific/setup-guide.md b/__LOCAL_LLMs/windows_specific/setup-guide.md index aaa03932..7dc7acc9 100644 --- a/__LOCAL_LLMs/windows_specific/setup-guide.md +++ b/__LOCAL_LLMs/windows_specific/setup-guide.md @@ -1,372 +1,250 @@ # Windows Setup Guide — Local LLM Stack on Razer Blade 18 > **Hardware:** Razer Blade 18 · Intel Core Ultra 9 275HX · RTX 5090 24 GB GDDR7 · 64 GB DDR5 · 4 TB NVMe -> **OS:** Windows 11 Home +> **OS:** Windows 11 Home + WSL2 (Ubuntu) > **Goal:** Mirror the macOS `__LOCAL_LLMs` stack — Ollama, Whisper, TTS (Orpheus + Qwen3), Mission Control dashboard > **See also:** [razer-blade-18-spec.md](razer-blade-18-spec.md) for full hardware specs --- -## Prerequisites +## Architecture: Windows-Native + WSL2 -### 1. 
Windows Package Manager - -Install **winget** (ships with Windows 11) and optionally **Scoop** for CLI tools: - -```powershell -# Verify winget -winget --version - -# Install Scoop (optional, useful for dev tools) -Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser -Invoke-RestMethod -Uri https://get.scoop.sh | Invoke-Expression +``` +┌────────────────────────────────────────────────────────┐ +│ Windows 11 │ +│ ├── NVIDIA drivers + CUDA (native) │ +│ ├── Ollama (native Windows service, port 11434) │ +│ └── Browser → http://localhost:3000 │ +│ │ +│ ┌──────────────────────────────────────────────────┐ │ +│ │ WSL2 (Ubuntu 24.04) │ │ +│ │ ├── Node.js, Python 3.12, ffmpeg, git │ │ +│ │ ├── __LOCAL_LLMs/ (cloned here) │ │ +│ │ │ ├── dashboard/ → npm run dev (port 3000) │ │ +│ │ │ ├── setup-tts.sh (works as-is) │ │ +│ │ │ ├── start-dashboard.sh (works as-is) │ │ +│ │ │ └── models/ (SNAC, Qwen3-TTS) │ │ +│ │ ├── whisper-cpp (CUDA build) │ │ +│ │ └── .venv-qwen-tts/ (PyTorch CUDA) │ │ +│ └──────────────────────────────────────────────────┘ │ +└────────────────────────────────────────────────────────┘ ``` -### 2. NVIDIA CUDA Toolkit +**Why WSL2?** All existing bash scripts, Python venvs, and Node.js tooling work identically to macOS — zero porting. The dashboard API routes auto-detect macOS vs Linux at runtime via `process.platform`. -The RTX 5090 needs the latest CUDA drivers for GPU-accelerated inference. +--- + +## Phase 1: Windows-Native Setup + +### 1. NVIDIA Drivers ```powershell -# Install NVIDIA drivers (latest Game Ready or Studio) -winget install --id Nvidia.GeForceExperience - -# Install CUDA Toolkit (required for PyTorch CUDA) -winget install --id Nvidia.CUDA -# Or download from: https://developer.nvidia.com/cuda-downloads +# Install latest NVIDIA Game Ready or Studio drivers +# Download from: https://www.nvidia.com/Download/index.aspx # Verify nvidia-smi +# Should show: RTX 5090, 24 GB VRAM, CUDA 13.x+ ``` -Expected output should show: +### 2. 
Ollama (Windows-Native) -- **RTX 5090** with **24 GB** VRAM -- CUDA version 13.x+ - -### 3. Node.js (for Mission Control Dashboard) - -```powershell -winget install --id OpenJS.NodeJS.LTS -# Verify -node --version # should be 20.x+ -npm --version -``` - -### 4. Python 3.12 - -```powershell -winget install --id Python.Python.3.12 -# Verify -python --version -pip --version -``` - -### 5. Git - -```powershell -winget install --id Git.Git -``` - -### 6. ffmpeg - -```powershell -winget install --id Gyan.FFmpeg -# Or: scoop install ffmpeg -``` - ---- - -## 1. Ollama — LLM Server - -### Install +Ollama runs natively on Windows and is accessible from WSL2 at `localhost:11434`. ```powershell winget install --id Ollama.Ollama -``` - -Ollama for Windows runs as a background service and automatically uses CUDA (RTX 5090). - -### Verify - -```powershell -ollama --version -curl http://localhost:11434/api/tags -``` - -### Download Models - -```powershell -# Coding -ollama pull qwen2.5-coder:32b # 19 GB — primary coding model -ollama pull qwen2.5-coder:7b # 4.7 GB — fast coding - -# Reasoning -ollama pull deepseek-r1:32b # 19 GB — chain-of-thought - -# General -ollama pull llama3.1:8b # 4.9 GB — fast general tasks - -# TTS -ollama pull sematre/orpheus:en # 4 GB — text-to-speech (8 voices) # Verify -ollama list +ollama --version ``` -> **Note:** With 24 GB VRAM, Ollama will offload 32B models almost entirely to GPU. -> On macOS (48 GB unified), the 32B models run in shared CPU/GPU memory. -> On this machine, **GPU inference will be significantly faster** for models that fit in 24 GB VRAM. +### 3. 
Pull Models (from Windows or WSL2) -### VRAM Budget (RTX 5090 — 24 GB) +```bash +ollama pull qwen2.5-coder:32b # 19 GB — primary coding model +ollama pull qwen2.5-coder:7b # 4.7 GB — fast coding +ollama pull deepseek-r1:32b # 19 GB — chain-of-thought +ollama pull llama3.1:8b # 4.9 GB — fast general tasks +ollama pull sematre/orpheus:en # 4 GB — text-to-speech (8 voices) -| Model | VRAM Usage | Fits in GPU? | -| ---------------------------- | ---------- | ------------ | -| llama3.1:8b | ~5 GB | ✅ Fully | -| qwen2.5-coder:7b | ~5 GB | ✅ Fully | -| sematre/orpheus:en | ~4 GB | ✅ Fully | -| qwen2.5-coder:32b | ~19 GB | ✅ Fully | -| deepseek-r1:32b | ~19 GB | ✅ Fully | -| Two 7B models simultaneously | ~10 GB | ✅ Both fit | +ollama list # verify all 5 models +``` + +### 4. Install WSL2 + +```powershell +# From PowerShell (Admin) +wsl --install -d Ubuntu-24.04 +# Reboot if prompted, then set up username/password +``` --- -## 2. Whisper.cpp — Speech-to-Text +## Phase 2: WSL2 Setup -### Option A: Pre-built Binary (Recommended) +### 1. Install Dependencies -Download the latest release from GitHub: +```bash +# Update +sudo apt update && sudo apt upgrade -y -```powershell -# Create whisper directory -mkdir "$env:USERPROFILE\whisper-cpp" -cd "$env:USERPROFILE\whisper-cpp" +# Node.js 20 LTS +curl -fsSL https://deb.nodesource.com/setup_20.x | sudo -E bash - +sudo apt install -y nodejs -# Download latest release (CUDA build) -# Check: https://github.com/ggerganov/whisper.cpp/releases -# Look for: whisper-cublas-bin-x64.zip or whisper-cuda-bin-x64.zip +# Python 3.12 +sudo apt install -y python3.12 python3.12-venv python3-pip + +# Build tools + ffmpeg +sudo apt install -y ffmpeg git curl build-essential cmake + +# Verify +node --version # 20.x+ +python3.12 --version +nvidia-smi # should show RTX 5090 (GPU passthrough from Windows) ``` -### Option B: Build from Source (CUDA) +> **Important:** Do NOT install NVIDIA drivers inside WSL2. 
The Windows-side driver handles GPU passthrough automatically. -```powershell +### 2. Clone Repo + +```bash +mkdir -p ~/code/mygh && cd ~/code/mygh +git clone https://github.com/saravanakumardb1/learning_ai_common_plat.git +cd learning_ai_common_plat/__LOCAL_LLMs +``` + +> **Performance note:** Always clone inside WSL2 filesystem (`~/code/...`), NOT in `/mnt/c/` — the Windows filesystem bridge is very slow for `node_modules`. + +### 3. Whisper.cpp (CUDA build) + +```bash +cd ~ git clone https://github.com/ggerganov/whisper.cpp.git cd whisper.cpp cmake -B build -DGGML_CUDA=ON -cmake --build build --config Release -``` +cmake --build build --config Release -j$(nproc) +sudo cp build/bin/whisper-cli /usr/local/bin/ -### Download Whisper Model - -```powershell -mkdir "$env:USERPROFILE\whisper-models" - -# Download ggml-large-v3-turbo (1.5 GB) -curl -L -o "$env:USERPROFILE\whisper-models\ggml-large-v3-turbo.bin" ` +# Download model (1.5 GB) +mkdir -p ~/whisper-models +curl -L -o ~/whisper-models/ggml-large-v3-turbo.bin \ "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo.bin" + +# Verify +whisper-cli --version ``` > **No corporate proxy on this machine** — download directly from `huggingface.co`. -> The `hf-mirror.com` workaround is only needed on the corporate MacBook. -### Verify +### 4. TTS Setup (One-Shot) -```powershell -# Test transcription -whisper-cli -m "$env:USERPROFILE\whisper-models\ggml-large-v3-turbo.bin" -f test.wav +```bash +cd ~/code/mygh/learning_ai_common_plat/__LOCAL_LLMs + +# Works exactly like macOS — downloads SNAC, Qwen3-TTS, creates venv +bash setup-tts.sh ``` +The script detects macOS vs Linux and installs the correct PyTorch variant (MPS on macOS, CUDA on Linux). On a personal machine, override the default HuggingFace mirror: `HF_MIRROR=https://huggingface.co bash setup-tts.sh` + +### 5. 
Start Dashboard + +```bash +bash start-dashboard.sh +# Open http://localhost:3000 in Windows browser +``` + +WSL2 automatically forwards ports — the dashboard is accessible from Windows at `localhost:3000`. + --- -## 3. TTS — Orpheus + Qwen3-TTS +## Key Differences: macOS vs WSL2 -### 3a. Orpheus TTS (via Ollama) - -Already handled in Step 1 (`ollama pull sematre/orpheus:en`). - -### 3b. SNAC Decoder - -```powershell -# Create models directory (match macOS layout) -$MODELS = "$PSScriptRoot\models" # or wherever you clone the repo -mkdir "$MODELS\snac_24khz" -Force - -# Download SNAC decoder -curl -L -o "$MODELS\snac_24khz\config.json" ` - "https://huggingface.co/hubertsiuzdak/snac_24khz/resolve/main/config.json" -curl -L -o "$MODELS\snac_24khz\pytorch_model.bin" ` - "https://huggingface.co/hubertsiuzdak/snac_24khz/resolve/main/pytorch_model.bin" -``` - -### 3c. Python Venv + Dependencies - -```powershell -cd __LOCAL_LLMs - -# Create venv -python -m venv .venv-qwen-tts - -# Activate (Windows uses Scripts, not bin) -.\.venv-qwen-tts\Scripts\Activate.ps1 - -# Install PyTorch with CUDA (NOT MPS — that's Apple only) -pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 - -# Install other deps -pip install snac numpy soundfile - -# Verify CUDA -python -c "import torch; print(f'CUDA: {torch.cuda.is_available()}, Device: {torch.cuda.get_device_name(0)}')" -# Expected: CUDA: True, Device: NVIDIA GeForce RTX 5090 Laptop GPU -``` - -### 3d. 
Qwen3-TTS 0.6B - -```powershell -$MODELS = ".\models" - -# Tokenizer (~650 MB) -mkdir "$MODELS\Qwen3-TTS-Tokenizer-12Hz" -Force -foreach ($f in @("config.json", "configuration.json", "preprocessor_config.json")) { - curl -L -o "$MODELS\Qwen3-TTS-Tokenizer-12Hz\$f" ` - "https://huggingface.co/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/$f" -} -curl -L -o "$MODELS\Qwen3-TTS-Tokenizer-12Hz\model.safetensors" ` - "https://huggingface.co/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/model.safetensors" - -# Model weights (~1.8 GB) -mkdir "$MODELS\Qwen3-TTS-12Hz-0.6B-CustomVoice" -Force -foreach ($f in @("config.json", "generation_config.json")) { - curl -L -o "$MODELS\Qwen3-TTS-12Hz-0.6B-CustomVoice\$f" ` - "https://huggingface.co/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/$f" -} -curl -L -o "$MODELS\Qwen3-TTS-12Hz-0.6B-CustomVoice\model.safetensors" ` - "https://huggingface.co/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/model.safetensors" -``` - -### 3e. Test TTS - -```powershell -# Activate venv -.\.venv-qwen-tts\Scripts\Activate.ps1 - -# Orpheus TTS test -python test_orpheus_tts.py - -# Qwen3-TTS test -python test_qwen_tts.py -``` - -> **Key difference from macOS:** Qwen3-TTS will use **CUDA** instead of MPS. -> In `test_qwen_tts.py`, the device selection `torch.device("mps")` will fall through to CUDA automatically -> since `torch.backends.mps.is_available()` returns False on Windows. -> You may want to update the device logic to prefer CUDA: -> -> ```python -> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -> ``` - ---- - -## 4. Mission Control Dashboard - -```powershell -cd __LOCAL_LLMs\dashboard - -# Install dependencies -npm install - -# Start dev server -npm run dev -# Open http://localhost:3000 -``` - -The dashboard is pure Next.js — works identically on Windows. 
The API routes auto-detect: - -- **Ollama** at `localhost:11434` -- **Whisper** models in `%USERPROFILE%\whisper-models\` -- **TTS** engines (Orpheus, Qwen3-TTS) and Python venv - -### Start Script (PowerShell) - -Use the bash script equivalent: - -```powershell -# Quick start (manual) -ollama serve # if not already running as service -cd __LOCAL_LLMs\dashboard -npm run dev -``` - -> TODO: Create `start-dashboard.ps1` as a PowerShell equivalent of `start-dashboard.sh` - ---- - -## 5. Key Differences: macOS vs Windows - -| Area | macOS (M4 Pro 48 GB) | Windows (Razer Blade 18) | -| ------------------- | ----------------------------------- | ------------------------------------- | -| **GPU** | Apple Silicon (unified memory, MPS) | RTX 5090 (24 GB VRAM, CUDA) | -| **Ollama GPU** | Automatic (Metal) | Automatic (CUDA) | -| **VRAM** | Shared from 48 GB RAM | Dedicated 24 GB GDDR7 | -| **PyTorch device** | `mps` | `cuda` | -| **Whisper install** | `brew install whisper-cpp` | Build from source or download release | -| **Python venv** | `bin/activate` | `Scripts\Activate.ps1` | -| **Package manager** | Homebrew | winget / scoop | -| **Shell** | zsh / bash | PowerShell / cmd | -| **Scripts** | `.sh` (bash) | `.ps1` (PowerShell) | -| **Model download** | `hf-mirror.com` (corporate proxy) | `huggingface.co` (no proxy) | -| **Dashboard** | Identical | Identical | -| **Ollama models** | Identical | Identical | +| Area | macOS (any Mac) | WSL2 (any Linux) | +| ---------------------- | --------------------------- | -------------------------------------- | +| **GPU** | Apple Silicon (MPS) | NVIDIA (CUDA) | +| **Ollama** | macOS native (Metal) | Windows native, accessed via localhost | +| **PyTorch device** | `mps` | `cuda` | +| **Whisper install** | `brew install whisper-cpp` | Build from source with CUDA | +| **Package manager** | Homebrew | apt | +| **Shell scripts** | Work as-is | Work as-is | +| **Python venv path** | `bin/python` | `bin/python` (same) | +| **Dashboard** | 
Identical | Identical | +| **Ollama models path** | `~/.ollama/models/` | Windows `%USERPROFILE%\.ollama\models\` | +| **Model download** | `hf-mirror.com` (corporate) | `huggingface.co` (direct) | ### Performance Expectations -| Workload | macOS M4 Pro 48 GB | Razer RTX 5090 24 GB | -| --------------------------- | ---------------------------- | ------------------------- | -| qwen2.5-coder:32b inference | ~15–25 tok/s (MPS/CPU blend) | ~40–60 tok/s (full CUDA) | -| Whisper large-v3-turbo | ~2–4x realtime (CPU) | ~8–15x realtime (CUDA) | -| Orpheus TTS | ~realtime (CPU decode) | ~2–3x realtime (CUDA) | -| Qwen3-TTS | ~realtime (MPS) | ~2–4x realtime (CUDA) | -| 70B quantized models | Fits in 48 GB (slow) | Partially offloads to RAM | +| Workload | macOS M4 Pro 48 GB | Razer RTX 5090 24 GB | +| --------------------------- | -------------------- | ------------------------------- | +| qwen2.5-coder:32b inference | ~15–25 tok/s | ~40–60 tok/s | +| Whisper large-v3-turbo | ~2–4x realtime | ~8–15x realtime | +| Orpheus TTS | ~realtime | ~2–3x realtime | +| Qwen3-TTS | ~realtime (MPS) | ~2–4x realtime (CUDA) | +| 70B quantized models | Fits in 48 GB (slow) | Partially offloads to 64 GB RAM | + +### VRAM Budget (RTX 5090 — 24 GB) + +| Model | VRAM Usage | Fits in GPU? | +| ------------------ | ---------- | ------------ | +| llama3.1:8b | ~5 GB | ✅ Fully | +| qwen2.5-coder:7b | ~5 GB | ✅ Fully | +| sematre/orpheus:en | ~4 GB | ✅ Fully | +| qwen2.5-coder:32b | ~19 GB | ✅ Fully | +| deepseek-r1:32b | ~19 GB | ✅ Fully | --- -## 6. 
File Layout (Same as macOS) +## Quick Reference — Full Setup Checklist + +### Windows Side ``` -__LOCAL_LLMs/ -├── dashboard/ ← Mission Control (port 3000) — works as-is -├── models/ ← TTS model weights (gitignored) -│ ├── snac_24khz/ -│ ├── Qwen3-TTS-Tokenizer-12Hz/ -│ └── Qwen3-TTS-12Hz-0.6B-CustomVoice/ -├── .venv-qwen-tts/ ← Python venv (Scripts\ on Windows) -├── test_orpheus_tts.py ← works as-is (device fallback) -├── test_qwen_tts.py ← update device to prefer CUDA -├── windows_specific/ -│ ├── razer-blade-18-spec.md ← hardware spec -│ └── setup-guide.md ← this file -└── docs/ ← macOS-focused docs (still useful as reference) -``` - ---- - -## 7. Quick Reference — Full Setup Checklist - -``` -[ ] Install NVIDIA drivers + CUDA Toolkit +[ ] Install NVIDIA drivers (Game Ready or Studio) [ ] Install Ollama (winget install Ollama.Ollama) -[ ] Pull models: qwen2.5-coder:32b, deepseek-r1:32b, llama3.1:8b, orpheus -[ ] Install Node.js 20+ (winget) -[ ] Install Python 3.12 (winget) -[ ] Install Git (winget) -[ ] Install ffmpeg (winget) -[ ] Clone repo -[ ] Download Whisper model to %USERPROFILE%\whisper-models\ -[ ] Build or download whisper-cpp with CUDA -[ ] Create Python venv + install PyTorch CUDA + snac -[ ] Download SNAC decoder -[ ] Download Qwen3-TTS tokenizer + model -[ ] npm install in dashboard/ -[ ] Run dashboard: npm run dev +[ ] Pull all 5 models +[ ] Install WSL2 (wsl --install -d Ubuntu-24.04) +``` + +### WSL2 Side + +``` +[ ] Install Node.js 20+, Python 3.12, ffmpeg, git, cmake +[ ] Verify nvidia-smi shows RTX 5090 +[ ] Clone repo into ~/code/mygh/ +[ ] Build whisper-cpp with CUDA +[ ] Download Whisper model to ~/whisper-models/ +[ ] Run: bash setup-tts.sh +[ ] Run: bash start-dashboard.sh [ ] Verify: http://localhost:3000 shows all green ``` + +--- + +## Troubleshooting + +### Ollama not accessible from WSL2 + +```bash +curl http://localhost:11434/api/tags +# If fails, check Windows firewall or try: +curl http://$(hostname).local:11434/api/tags +``` + 
+### CUDA not visible in WSL2 + +```bash +nvidia-smi +# If "command not found": +# 1. Update Windows NVIDIA drivers to latest +# 2. From Windows PowerShell (not inside WSL2), run: wsl --update +# 3. Do NOT install nvidia-driver-* inside WSL2 +``` + +### Slow filesystem performance + +```bash +# Clone repos inside WSL2 filesystem: ~/code/... +# NOT in /mnt/c/ (Windows→WSL bridge is ~10x slower for node_modules) +```