fix(local-llms): cross-platform audit — 8 bugs/gaps fixed

- setup-tts.sh: make fully cross-platform (macOS + Linux/WSL2)
  - OS detection, apt fallback, CUDA PyTorch install, nvidia-smi check
  - cross-platform playback hints, HF_MIRROR env override
- api/system/route.ts: fix ffmpeg detection (use -version not --version)
- api/system/memory/route.ts: remove unused total variable in Linux path
- api/system/exec/route.ts: expand allowlist with Linux commands (head, tail, grep, which, ps, uname, free, lscpu, nvidia-smi, etc.)
- api/tts/route.ts: cross-platform venv path + CUDA/MPS label
- api/whisper/route.ts: Linux binary/model paths
- api/ollama/logs/route.ts: Linux log paths + WSL2 hint
- test_qwen_tts.py: platform-aware speech text + CUDA device detection
- test_orpheus_tts.py: platform-aware text, move import sys to top
- setup-guide.md: fix false auto-detect claim, add HF_MIRROR hint
2026-02-21 15:27:49 -08:00 · 2026-02-21 15:27:49 -08:00 · b1d2e4ec81
commit b1d2e4ec81
parent f85b455eb5
10 changed files with 518 additions and 440 deletions
--- a/__LOCAL_LLMs/dashboard/src/app/api/ollama/logs/route.ts
+++ b/__LOCAL_LLMs/dashboard/src/app/api/ollama/logs/route.ts
@ -4,11 +4,15 @@ import { homedir } from 'os';
 import { join } from 'path';
 import { existsSync } from 'fs';
 const IS_MAC = process.platform === 'darwin';
 export async function GET() {
  const logPaths = [
    join(homedir(), '.ollama', 'logs', 'server.log'),
    join(homedir(), '.ollama', 'logs', 'gpu.log'),
    '/tmp/ollama.log',
    // Linux / WSL2 — journalctl may write here
    '/var/log/ollama.log',
  ];
  for (const logPath of logPaths) {
@ -25,11 +29,13 @@ export async function GET() {
    }
  }
-  // On macOS, Ollama logs via unified logging
+  // Fallback: platform-specific logging hint
  const hint = IS_MAC
    ? 'Ollama uses macOS unified logging. Use: log show --predicate \'subsystem == "com.ollama"\' --last 5m'
    : 'Ollama logs not found. If running on Windows (accessed via WSL2), check Windows Event Viewer or: journalctl -u ollama --no-pager -n 50';
  return NextResponse.json({
-    lines: [
+    lines: [hint],
      'Ollama uses macOS unified logging. Use: log show --predicate \'subsystem == "com.ollama"\' --last 5m',
    ],
    path: 'system',
    total: 1,
  });
--- a/__LOCAL_LLMs/dashboard/src/app/api/system/exec/route.ts
+++ b/__LOCAL_LLMs/dashboard/src/app/api/system/exec/route.ts
@ -5,9 +5,9 @@ import { promisify } from 'util';
 const execFileAsync = promisify(execFile);
 const COMMAND_ALLOWLIST = new Set([
  // Cross-platform (macOS + Linux)
  'git',
  'npm',
  'brew',
  'cat',
  'ls',
  'wc',
@ -15,6 +15,21 @@ const COMMAND_ALLOWLIST = new Set([
  'df',
  'echo',
  'date',
  'head',
  'tail',
  'grep',
  'which',
  'ps',
  'uname',
  'whoami',
  // macOS
  'brew',
  // Linux / WSL2
  'free',
  'lscpu',
  'nvidia-smi',
  'dpkg',
  'apt',
 ]);
 export async function POST(request: NextRequest) {
--- a/__LOCAL_LLMs/dashboard/src/app/api/system/memory/route.ts
+++ b/__LOCAL_LLMs/dashboard/src/app/api/system/memory/route.ts
@ -1,10 +1,13 @@
 import { NextResponse } from 'next/server';
 import { exec } from 'child_process';
 import { promisify } from 'util';
 import { readFile } from 'fs/promises';
 import os from 'os';
 const execAsync = promisify(exec);
 const IS_MAC = process.platform === 'darwin';
 interface ProcessInfo {
  pid: number;
  name: string;
@ -26,7 +29,7 @@ interface VmStatBreakdown {
 async function getTopProcesses(limit = 20): Promise<ProcessInfo[]> {
  try {
-    // ps with RSS in KB, sorted descending by RSS
+    // ps with RSS in KB, sorted descending by RSS — works on both macOS and Linux
    const { stdout } = await execAsync(
      `ps -axo pid=,rss=,%mem=,user=,comm= | sort -k2 -rn | head -${limit}`,
      { timeout: 3000 }
@ -60,36 +63,67 @@ async function getTopProcesses(limit = 20): Promise<ProcessInfo[]> {
 }
 async function getVmStatBreakdown(): Promise<VmStatBreakdown> {
-  try {
+  if (IS_MAC) {
-    const { stdout } = await execAsync('vm_stat', { timeout: 2000 });
+    try {
-    const pageSizeMatch = stdout.match(/page size of (\d+) bytes/);
+      const { stdout } = await execAsync('vm_stat', { timeout: 2000 });
-    const pageSize = pageSizeMatch ? parseInt(pageSizeMatch[1]) : 16384;
+      const pageSizeMatch = stdout.match(/page size of (\d+) bytes/);
-    const parse = (label: string): number => {
+      const pageSize = pageSizeMatch ? parseInt(pageSizeMatch[1]) : 16384;
-      const match = stdout.match(new RegExp(`${label}:\\s+(\\d+)`));
+      const parse = (label: string): number => {
-      return match ? parseInt(match[1]) * pageSize : 0;
+        const match = stdout.match(new RegExp(`${label}:\\s+(\\d+)`));
-    };
+        return match ? parseInt(match[1]) * pageSize : 0;
-    return {
+      };
-      active: parse('Pages active'),
+      return {
-      wired: parse('Pages wired down'),
+        active: parse('Pages active'),
-      compressor: parse('Pages occupied by compressor'),
+        wired: parse('Pages wired down'),
-      inactive: parse('Pages inactive'),
+        compressor: parse('Pages occupied by compressor'),
-      purgeable: parse('Pages purgeable'),
+        inactive: parse('Pages inactive'),
-      speculative: parse('Pages speculative'),
+        purgeable: parse('Pages purgeable'),
-      free: parse('Pages free'),
+        speculative: parse('Pages speculative'),
-      pageSize,
+        free: parse('Pages free'),
-    };
+        pageSize,
-  } catch {
+      };
-    return {
+    } catch {
-      active: 0,
+      // fall through to zeros
-      wired: 0,
+    }
-      compressor: 0,
+  } else {
-      inactive: 0,
+    // Linux / WSL2 — parse /proc/meminfo into vm_stat-compatible structure
-      purgeable: 0,
+    try {
-      speculative: 0,
+      const raw = await readFile('/proc/meminfo', 'utf-8');
-      free: 0,
+      const parse = (key: string): number => {
-      pageSize: 16384,
+        const match = raw.match(new RegExp(`${key}:\\s+(\\d+)`));
-    };
+        return match ? parseInt(match[1]) * 1024 : 0;
      };
      const free = parse('MemFree');
      const buffers = parse('Buffers');
      const cached = parse('Cached');
      const sReclaimable = parse('SReclaimable');
      const active = parse('Active');
      return {
        active,
        wired: buffers, // closest analogy
        compressor: parse('SwapCached'),
        inactive: parse('Inactive'),
        purgeable: sReclaimable,
        speculative: 0,
        free,
        pageSize: 4096,
      };
    } catch {
      // fall through to zeros
    }
  }
  return {
    active: 0,
    wired: 0,
    compressor: 0,
    inactive: 0,
    purgeable: 0,
    speculative: 0,
    free: 0,
    pageSize: IS_MAC ? 16384 : 4096,
  };
 }
 export async function GET() {
--- a/__LOCAL_LLMs/dashboard/src/app/api/system/route.ts
+++ b/__LOCAL_LLMs/dashboard/src/app/api/system/route.ts
@ -1,11 +1,14 @@
 import { NextResponse } from 'next/server';
 import { exec, execFile } from 'child_process';
 import { promisify } from 'util';
 import { readFile } from 'fs/promises';
 import os from 'os';
 const execAsync = promisify(exec);
 const execFileAsync = promisify(execFile);
 const IS_MAC = process.platform === 'darwin';
 // Cache slow commands with TTL
 let staticCache: {
  chip: string;
@ -20,10 +23,17 @@ const OLLAMA_DISK_TTL = 60 * 1000; // 60 seconds
 async function getChipInfo(): Promise<string> {
  try {
    if (IS_MAC) {
      const { stdout } = await execAsync(
        "sysctl -n machdep.cpu.brand_string 2>/dev/null || echo 'Unknown'"
      );
      return stdout.trim();
    }
    // Linux / WSL2
    const { stdout } = await execAsync(
-      "sysctl -n machdep.cpu.brand_string 2>/dev/null || echo 'Unknown'"
+      "lscpu 2>/dev/null | grep 'Model name' | sed 's/.*:\\s*//' || cat /proc/cpuinfo | grep 'model name' | head -1 | sed 's/.*: //'"
    );
-    return stdout.trim();
+    return stdout.trim() || 'Unknown';
  } catch {
    return 'Unknown';
  }
@ -55,30 +65,63 @@ async function getOllamaModelsDiskUsage(): Promise<number> {
 async function getGpuInfo(): Promise<string> {
  try {
    if (IS_MAC) {
      const { stdout } = await execAsync(
        "system_profiler SPDisplaysDataType 2>/dev/null | grep 'Chipset Model' | sed 's/.*: //'",
        { timeout: 5000 }
      );
      return stdout.trim() || 'Apple Silicon (integrated)';
    }
    // Linux / WSL2 — try nvidia-smi first, fall back to lspci
    try {
      const { stdout } = await execAsync(
        'nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1',
        { timeout: 3000 }
      );
      if (stdout.trim()) return stdout.trim();
    } catch {
      /* no nvidia-smi */
    }
    const { stdout } = await execAsync(
-      "system_profiler SPDisplaysDataType 2>/dev/null | grep 'Chipset Model' | sed 's/.*: //'",
+      "lspci 2>/dev/null | grep -i 'vga\\|3d' | sed 's/.*: //' | head -1"
      { timeout: 5000 }
    );
-    return stdout.trim() || 'Apple Silicon (integrated)';
+    return stdout.trim() || 'Unknown';
  } catch {
-    return 'Apple Silicon (integrated)';
+    return IS_MAC ? 'Apple Silicon (integrated)' : 'Unknown';
  }
 }
 async function getBrewPackages(): Promise<Array<{ name: string; version: string }>> {
  const targets = ['ollama', 'whisper-cpp', 'ffmpeg'];
  const results: Array<{ name: string; version: string }> = [];
-  for (const pkg of targets) {
+
-    try {
+  if (IS_MAC) {
-      const { stdout } = await execFileAsync('brew', ['list', '--versions', pkg], {
+    for (const pkg of targets) {
-        timeout: 3000,
+      try {
-      });
+        const { stdout } = await execFileAsync('brew', ['list', '--versions', pkg], {
-      const parts = stdout.trim().split(' ');
+          timeout: 3000,
-      if (parts.length >= 2) {
+        });
-        results.push({ name: parts[0], version: parts.slice(1).join(' ') });
+        const parts = stdout.trim().split(' ');
        if (parts.length >= 2) {
          results.push({ name: parts[0], version: parts.slice(1).join(' ') });
        }
      } catch {
        // not installed
      }
    }
  } else {
    // Linux / WSL2 — check via version commands (ffmpeg uses -version, others use --version)
    for (const pkg of targets) {
      const bin = pkg === 'whisper-cpp' ? 'whisper-cli' : pkg;
      const flag = bin === 'ffmpeg' ? '-version' : '--version';
      try {
        const { stdout } = await execAsync(`${bin} ${flag} 2>&1 | head -1`, { timeout: 3000 });
        if (stdout.trim()) {
          results.push({ name: pkg, version: stdout.trim() });
        }
      } catch {
        // not installed
      }
    } catch {
      // not installed
    }
  }
  return results;
@ -106,7 +149,8 @@ async function getCachedOllamaDiskUsage(): Promise<number> {
  return value;
 }
-// macOS vm_stat gives accurate memory breakdown (os.freemem() excludes reclaimable cache)
+// macOS: vm_stat gives accurate memory breakdown (os.freemem() excludes reclaimable cache)
 // Linux: /proc/meminfo gives accurate breakdown
 async function getAccurateMemory(): Promise<{
  total: number;
  appMemory: number;
@ -115,42 +159,66 @@ async function getAccurateMemory(): Promise<{
  pressure: string;
 }> {
  const totalMem = os.totalmem();
  try {
    const { stdout } = await execAsync('vm_stat', { timeout: 2000 });
    const pageSizeMatch = stdout.match(/page size of (\d+) bytes/);
    const pageSize = pageSizeMatch ? parseInt(pageSizeMatch[1]) : 16384;
    const parse = (label: string): number => {
      const match = stdout.match(new RegExp(`${label}:\\s+(\\d+)`));
      return match ? parseInt(match[1]) * pageSize : 0;
    };
    const active = parse('Pages active');
    const wired = parse('Pages wired down');
    const inactive = parse('Pages inactive');
    const purgeable = parse('Pages purgeable');
    const speculative = parse('Pages speculative');
    const free = parse('Pages free');
    const compressor = parse('Pages occupied by compressor');
-    const appMemory = active + wired + compressor;
+  if (IS_MAC) {
-    const cached = inactive + purgeable + speculative;
+    try {
-    // Return raw free separately from cached — no overlap
+      const { stdout } = await execAsync('vm_stat', { timeout: 2000 });
-    // available for loading = free + cached (macOS reclaims cached on demand)
+      const pageSizeMatch = stdout.match(/page size of (\d+) bytes/);
      const pageSize = pageSizeMatch ? parseInt(pageSizeMatch[1]) : 16384;
      const parse = (label: string): number => {
        const match = stdout.match(new RegExp(`${label}:\\s+(\\d+)`));
        return match ? parseInt(match[1]) * pageSize : 0;
      };
      const active = parse('Pages active');
      const wired = parse('Pages wired down');
      const inactive = parse('Pages inactive');
      const purgeable = parse('Pages purgeable');
      const speculative = parse('Pages speculative');
      const free = parse('Pages free');
      const compressor = parse('Pages occupied by compressor');
-    const ratio = appMemory / totalMem;
+      const appMemory = active + wired + compressor;
-    const pressure = ratio > 0.85 ? 'critical' : ratio > 0.7 ? 'warning' : 'normal';
+      const cached = inactive + purgeable + speculative;
-    return { total: totalMem, appMemory, cached, free, pressure };
+      const ratio = appMemory / totalMem;
-  } catch {
+      const pressure = ratio > 0.85 ? 'critical' : ratio > 0.7 ? 'warning' : 'normal';
-    // Fallback to Node.js (inaccurate on macOS but works everywhere)
+
-    const freeMem = os.freemem();
+      return { total: totalMem, appMemory, cached, free, pressure };
-    return {
+    } catch {
-      total: totalMem,
+      // fall through to generic fallback
-      appMemory: totalMem - freeMem,
+    }
-      cached: 0,
+  } else {
-      free: freeMem,
+    // Linux / WSL2 — parse /proc/meminfo
-      pressure: 'unknown',
+    try {
-    };
+      const raw = await readFile('/proc/meminfo', 'utf-8');
      const parse = (key: string): number => {
        const match = raw.match(new RegExp(`${key}:\\s+(\\d+)`));
        return match ? parseInt(match[1]) * 1024 : 0; // /proc/meminfo is in kB
      };
      const total = parse('MemTotal');
      const free = parse('MemFree');
      const buffers = parse('Buffers');
      const cached = parse('Cached') + parse('SReclaimable') + buffers;
      const appMemory = total - free - cached;
      const ratio = appMemory / total;
      const pressure = ratio > 0.85 ? 'critical' : ratio > 0.7 ? 'warning' : 'normal';
      return { total, appMemory, cached, free, pressure };
    } catch {
      // fall through to generic fallback
    }
  }
  // Generic fallback (works everywhere but less accurate)
  const freeMem = os.freemem();
  return {
    total: totalMem,
    appMemory: totalMem - freeMem,
    cached: 0,
    free: freeMem,
    pressure: 'unknown',
  };
 }
 export async function GET() {
--- a/__LOCAL_LLMs/dashboard/src/app/api/tts/route.ts
+++ b/__LOCAL_LLMs/dashboard/src/app/api/tts/route.ts
@ -6,9 +6,14 @@ import { join, resolve } from 'path';
 const execAsync = promisify(exec);
 const IS_MAC = process.platform === 'darwin';
 // process.cwd() = dashboard/, parent = __LOCAL_LLMs/
 const LOCAL_LLMS_DIR = resolve(process.cwd(), '..');
 // macOS/Linux: bin/python, Windows native: Scripts/python.exe
 const VENV_PYTHON = join(LOCAL_LLMS_DIR, '.venv-qwen-tts', 'bin', 'python');
 interface TtsEngine {
  name: string;
  type: 'ollama' | 'python';
@ -67,8 +72,7 @@ async function checkOrpheus(): Promise<TtsEngine> {
  const snacSize = hasSnac ? await getFileSize(snacPath) : 0;
  // Check Python venv
-  const venvPython = join(LOCAL_LLMS_DIR, '.venv-qwen-tts', 'bin', 'python');
+  const hasVenv = await fileExists(VENV_PYTHON);
  const hasVenv = await fileExists(venvPython);
  if (hasModel && hasSnac && hasVenv) {
    engine.status = 'ready';
@ -114,13 +118,14 @@ async function checkQwenTts(): Promise<TtsEngine> {
  }
  const hasTokenizer = await fileExists(join(tokenizerDir, 'config.json'));
-  const venvPython = join(LOCAL_LLMS_DIR, '.venv-qwen-tts', 'bin', 'python');
+  const hasVenv = await fileExists(VENV_PYTHON);
  const hasVenv = await fileExists(venvPython);
  if (hasModel && hasTokenizer && hasVenv) {
    engine.status = 'ready';
    engine.size = `${(modelSize / 1e9).toFixed(1)} GB`;
-    engine.details = '0.6B params · 10 languages · MPS/CPU';
+    engine.details = IS_MAC
      ? '0.6B params · 10 languages · MPS/CPU'
      : '0.6B params · 10 languages · CUDA/CPU';
  } else if (hasModel || hasTokenizer) {
    engine.status = 'partial';
    const missing: string[] = [];
@ -141,22 +146,21 @@ async function checkVenv(): Promise<{
  python?: string;
  packages?: string[];
 }> {
-  const venvPython = join(LOCAL_LLMS_DIR, '.venv-qwen-tts', 'bin', 'python');
+  const exists = await fileExists(VENV_PYTHON);
  const exists = await fileExists(venvPython);
  if (!exists) return { exists: false };
  try {
    const { stdout } = await execAsync(
-      `"${venvPython}" -c "import snac; import torch; print(f'snac={snac.__version__} torch={torch.__version__}')"`,
+      `"${VENV_PYTHON}" -c "import snac; import torch; print(f'snac={snac.__version__} torch={torch.__version__}')"`,
      { timeout: 5000 }
    );
    return {
      exists: true,
-      python: venvPython,
+      python: VENV_PYTHON,
      packages: stdout.trim().split(' '),
    };
  } catch {
-    return { exists: true, python: venvPython };
+    return { exists: true, python: VENV_PYTHON };
  }
 }
--- a/__LOCAL_LLMs/dashboard/src/app/api/whisper/route.ts
+++ b/__LOCAL_LLMs/dashboard/src/app/api/whisper/route.ts
@ -7,9 +7,22 @@ import { homedir } from 'os';
 const execAsync = promisify(exec);
 const IS_MAC = process.platform === 'darwin';
 async function getWhisperBinaries(): Promise<string[]> {
  try {
-    const { stdout } = await execAsync('ls /opt/homebrew/bin/whisper-* 2>/dev/null');
+    if (IS_MAC) {
      const { stdout } = await execAsync('ls /opt/homebrew/bin/whisper-* 2>/dev/null');
      return stdout
        .trim()
        .split('\n')
        .filter(Boolean)
        .map(p => p.split('/').pop() || p);
    }
    // Linux / WSL2 — check common locations
    const { stdout } = await execAsync(
      'ls /usr/local/bin/whisper-* /usr/bin/whisper-* 2>/dev/null || which whisper-cli 2>/dev/null'
    );
    return stdout
      .trim()
      .split('\n')
@ -25,7 +38,9 @@ const WHISPER_MODEL_DIRS = (process.env.WHISPER_MODELS_DIR || '')
  .filter(Boolean)
  .concat([
    join(homedir(), 'whisper-models'),
-    '/opt/homebrew/share/whisper-cpp/models',
+    ...(IS_MAC
      ? ['/opt/homebrew/share/whisper-cpp/models']
      : ['/usr/local/share/whisper-cpp/models', '/usr/share/whisper-cpp/models']),
    join(homedir(), '.cache', 'whisper'),
  ]);
--- a/__LOCAL_LLMs/setup-tts.sh
+++ b/__LOCAL_LLMs/setup-tts.sh
@ -3,26 +3,24 @@
 # TTS Setup — One-Shot Script for Fresh Laptop
 #
 # Sets up Orpheus TTS (via Ollama) and Qwen3-TTS (direct Python)
-# on Apple Silicon Macs. Works through corporate proxy.
+# on macOS (Apple Silicon) or Linux (CUDA GPU / WSL2).
 #
 # What this does:
-#   1. Installs Python 3.12 via Homebrew (if missing)
+#   1. Installs Python 3.12 (Homebrew on macOS, apt on Linux)
-#   2. Creates Python venv with TTS packages
+#   2. Creates Python venv with TTS packages (MPS on macOS, CUDA on Linux)
 #   3. Pulls Orpheus TTS model via Ollama
-#   4. Downloads SNAC audio decoder via hf-mirror.com
+#   4. Downloads SNAC audio decoder
-#   5. (Optional) Downloads Qwen3-TTS 0.6B via hf-mirror.com
+#   5. (Optional) Downloads Qwen3-TTS 0.6B
 #
 # Prerequisites:
-#   - macOS with Apple Silicon (M1/M2/M3/M4)
+#   macOS: Homebrew + Ollama installed
-#   - Homebrew installed
+#   Linux: apt + Ollama accessible at localhost:11434
 #   - Ollama installed (brew install ollama)
 #
 # Usage:
 #   bash setup-tts.sh
 #
 # After setup, test with:
 #   .venv-qwen-tts/bin/python test_orpheus_tts.py
 #   afplay test_orpheus_tara.wav
 # ============================================================
 set -e
@ -31,7 +29,13 @@ VENV="$SCRIPT_DIR/.venv-qwen-tts"
 MODELS_DIR="$SCRIPT_DIR/models"
 # HuggingFace mirror that works through corporate proxy
-HF_MIRROR="https://hf-mirror.com"
+# On personal machines, set HF_MIRROR=https://huggingface.co to download directly
 HF_MIRROR="${HF_MIRROR:-https://hf-mirror.com}"
 # Detect OS
 OS_TYPE="$(uname -s)"
 IS_MAC=false
 [ "$OS_TYPE" = "Darwin" ] && IS_MAC=true
 RED='\033[0;31m'
 GREEN='\033[0;32m'
@ -52,34 +56,58 @@ echo ""
 # ── 0. Check prerequisites ──────────────────────────────────
 step "Checking prerequisites"
-# Homebrew
+if $IS_MAC; then
-if ! command -v brew &>/dev/null; then
+    # Homebrew
-    fail "Homebrew not found. Install: /bin/bash -c \"\$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)\""
+    if ! command -v brew &>/dev/null; then
-fi
+        fail "Homebrew not found. Install: /bin/bash -c \"\$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)\""
-ok "Homebrew"
+    fi
    ok "Homebrew"
-# Ollama
+    # Ollama (install via Homebrew if missing)
-if ! command -v ollama &>/dev/null; then
+    if ! command -v ollama &>/dev/null; then
-    warn "Ollama not found. Installing..."
+        warn "Ollama not found. Installing..."
-    brew install ollama
+        brew install ollama
    fi
 else
    # Linux / WSL2 — Ollama should be installed on host or via install script
    if ! command -v ollama &>/dev/null; then
        # On WSL2 Ollama runs on the Windows side; check if reachable
        if ! curl -s --max-time 2 http://localhost:11434/api/tags &>/dev/null; then
            fail "Ollama not found and not reachable at localhost:11434. Install Ollama on Windows or run: curl -fsSL https://ollama.com/install.sh | sh"
        fi
        ok "Ollama reachable at localhost:11434 (Windows host)"
    fi
 fi
 ok "Ollama installed"
 # Check if Ollama is running
 if ! curl -s --max-time 2 http://localhost:11434/api/tags &>/dev/null; then
    warn "Ollama not running. Starting..."
-    ollama serve &>/dev/null &
+    if command -v ollama &>/dev/null; then
-    sleep 3
+        ollama serve &>/dev/null &
        sleep 3
    fi
    if ! curl -s --max-time 2 http://localhost:11434/api/tags &>/dev/null; then
        fail "Could not start Ollama. Try manually: ollama serve"
    fi
 fi
 ok "Ollama running on port 11434"
-# Apple Silicon check
+# GPU check
 ARCH=$(uname -m)
-if [ "$ARCH" != "arm64" ]; then
+if $IS_MAC; then
-    warn "Not Apple Silicon ($ARCH). MPS acceleration won't be available."
+    if [ "$ARCH" != "arm64" ]; then
        warn "Not Apple Silicon ($ARCH). MPS acceleration won't be available."
    else
        ok "Apple Silicon ($ARCH) — MPS acceleration available"
    fi
 else
    if command -v nvidia-smi &>/dev/null; then
        GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1)
        ok "NVIDIA GPU detected: $GPU_NAME — CUDA acceleration available"
    else
        warn "nvidia-smi not found. CUDA acceleration won't be available (CPU fallback)."
    fi
 fi
 # ── 1. Install Python 3.12 ──────────────────────────────────
@ -87,7 +115,7 @@ step "Python 3.12"
 PYTHON_CMD=""
 # Check various Python 3.12 locations
-for cmd in python3.12 /opt/homebrew/bin/python3.12 /usr/local/bin/python3.12; do
+for cmd in python3.12 /opt/homebrew/bin/python3.12 /usr/local/bin/python3.12 python3; do
    if command -v "$cmd" &>/dev/null; then
        PYTHON_CMD="$cmd"
        break
@ -95,9 +123,15 @@ for cmd in python3.12 /opt/homebrew/bin/python3.12 /usr/local/bin/python3.12; do
 done
 if [ -z "$PYTHON_CMD" ]; then
-    warn "Python 3.12 not found. Installing via Homebrew..."
+    if $IS_MAC; then
-    brew install python@3.12
+        warn "Python 3.12 not found. Installing via Homebrew..."
-    PYTHON_CMD="/opt/homebrew/bin/python3.12"
+        brew install python@3.12
        PYTHON_CMD="/opt/homebrew/bin/python3.12"
    else
        warn "Python 3.12 not found. Installing via apt..."
        sudo apt update && sudo apt install -y python3.12 python3.12-venv python3-pip
        PYTHON_CMD="python3.12"
    fi
 fi
 PYTHON_VER=$("$PYTHON_CMD" --version 2>&1)
@ -123,7 +157,15 @@ if "$VENV/bin/python" -c "import snac" &>/dev/null; then
 else
    echo "Installing packages (this may take a few minutes)..."
    "$VENV/bin/pip" install -U pip --quiet
-    "$VENV/bin/pip" install -U snac qwen-tts --quiet
+    if $IS_MAC; then
        # macOS: default PyTorch includes MPS support
        "$VENV/bin/pip" install -U snac qwen-tts --quiet
    else
        # Linux: install PyTorch with CUDA first, then snac/qwen-tts
        echo "Installing PyTorch with CUDA support..."
        "$VENV/bin/pip" install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 --quiet
        "$VENV/bin/pip" install -U snac qwen-tts --quiet
    fi
    ok "Packages installed"
 fi
@ -144,7 +186,11 @@ step "SNAC 24kHz audio decoder (~76 MB)"
 mkdir -p "$MODELS_DIR/snac_24khz"
 if [ -f "$MODELS_DIR/snac_24khz/pytorch_model.bin" ]; then
-    SIZE=$(stat -f%z "$MODELS_DIR/snac_24khz/pytorch_model.bin" 2>/dev/null || stat -c%s "$MODELS_DIR/snac_24khz/pytorch_model.bin" 2>/dev/null)
+    if $IS_MAC; then
        SIZE=$(stat -f%z "$MODELS_DIR/snac_24khz/pytorch_model.bin" 2>/dev/null)
    else
        SIZE=$(stat -c%s "$MODELS_DIR/snac_24khz/pytorch_model.bin" 2>/dev/null)
    fi
    if [ "$SIZE" -gt 1000000 ]; then
        ok "SNAC decoder already downloaded ($(echo "scale=1; $SIZE/1048576" | bc) MB)"
    else
@ -247,7 +293,11 @@ du -sh "$MODELS_DIR"/* 2>/dev/null | sed 's/^/  /'
 echo ""
 echo "Test commands:"
 echo "  $VENV/bin/python $SCRIPT_DIR/test_orpheus_tts.py"
-echo "  afplay test_orpheus_tara.wav"
+if $IS_MAC; then
    echo "  afplay test_orpheus_tara.wav"
 else
    echo "  aplay test_orpheus_tara.wav  (or: ffplay -nodisp -autoexit test_orpheus_tara.wav)"
 fi
 if [ -d "$QWEN_MODEL_DIR" ]; then
    echo "  $VENV/bin/python $SCRIPT_DIR/test_qwen_tts.py"
 fi
--- a/__LOCAL_LLMs/test_orpheus_tts.py
+++ b/__LOCAL_LLMs/test_orpheus_tts.py
@ -13,6 +13,7 @@ Usage:
 """
 import os
 import re
 import sys
 import time
 import json
 import struct
@ -166,7 +167,7 @@ def main():
    # Voices: tara, leah, jess, leo, dan, mia, zac, zoe
    tests = [
-        ("Hello! This is Orpheus text to speech, running entirely on your Mac through Ollama.", "tara"),
+        ("Hello! This is Orpheus text to speech, running entirely locally through Ollama.", "tara"),
        ("<laugh> That's amazing! Local AI speech generation without any cloud services!", "leo"),
    ]
@ -182,7 +183,10 @@ def main():
            save_wav(audio, sr, outpath)
    print("\n=== Done! Open the .wav files to listen. ===")
-    print("Play with:  afplay test_orpheus_tara.wav")
+    if sys.platform == "darwin":
        print("Play with:  afplay test_orpheus_tara.wav")
    else:
        print("Play with:  aplay test_orpheus_tara.wav  (or: ffplay -nodisp -autoexit test_orpheus_tara.wav)")
 if __name__ == "__main__":
--- a/__LOCAL_LLMs/test_qwen_tts.py
+++ b/__LOCAL_LLMs/test_qwen_tts.py
@ -1,5 +1,5 @@
 """
-Test Qwen3-TTS 0.6B on Apple Silicon (MPS or CPU fallback).
+Test Qwen3-TTS 0.6B (CUDA, MPS, or CPU fallback).
 Prerequisites:
  bash setup-tts.sh              (one-shot: installs everything)
@ -24,8 +24,12 @@ if not os.path.isdir(MODEL_PATH):
    print("Run: bash setup-tts.sh   (or: bash download-tts-models.sh qwen)")
    raise SystemExit(1)
-# Pick device: MPS if available, else CPU
+# Pick device: CUDA > MPS > CPU
-if torch.backends.mps.is_available():
+if torch.cuda.is_available():
    device = "cuda"
    dtype = torch.float16
    print(f"Using CUDA ({torch.cuda.get_device_name(0)})")
 elif torch.backends.mps.is_available():
    device = "mps"
    dtype = torch.float32  # MPS doesn't support bfloat16
    print(f"Using MPS (Apple Metal GPU)")
@ -48,7 +52,7 @@ print(f"Supported speakers: {model.get_supported_speakers()}")
 print(f"Supported languages: {model.get_supported_languages()}")
 # Test 1: English with a built-in speaker
-text = "Hello! Welcome to the local LLM dashboard. I am Qwen three T T S, running entirely on your Mac."
+text = f"Hello! Welcome to the local LLM dashboard. I am Qwen three T T S, running entirely on your {'Mac' if device == 'mps' else 'machine'} using {device.upper()}."
 print(f"\nGenerating speech for: {text[:60]}...")
 t1 = time.time()
--- a/__LOCAL_LLMs/windows_specific/setup-guide.md
+++ b/__LOCAL_LLMs/windows_specific/setup-guide.md
@ -1,372 +1,250 @@
 # Windows Setup Guide — Local LLM Stack on Razer Blade 18
 > **Hardware:** Razer Blade 18 · Intel Core Ultra 9 275HX · RTX 5090 24 GB GDDR7 · 64 GB DDR5 · 4 TB NVMe
-> **OS:** Windows 11 Home
+> **OS:** Windows 11 Home + WSL2 (Ubuntu)
 > **Goal:** Mirror the macOS `__LOCAL_LLMs` stack — Ollama, Whisper, TTS (Orpheus + Qwen3), Mission Control dashboard
 > **See also:** [razer-blade-18-spec.md](razer-blade-18-spec.md) for full hardware specs
 ---
-## Prerequisites
+## Architecture: Windows-Native + WSL2
-### 1. Windows Package Manager
+```
-
+┌────────────────────────────────────────────────────────┐
-Install **winget** (ships with Windows 11) and optionally **Scoop** for CLI tools:
+│  Windows 11                                            │
-
+│  ├── NVIDIA drivers + CUDA (native)                    │
-```powershell
+│  ├── Ollama (native Windows service, port 11434)       │
-# Verify winget
+│  └── Browser → http://localhost:3000                   │
-winget --version
+│                                                        │
-
+│  ┌──────────────────────────────────────────────────┐  │
-# Install Scoop (optional, useful for dev tools)
+│  │  WSL2 (Ubuntu 24.04)                             │  │
-Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
+│  │  ├── Node.js, Python 3.12, ffmpeg, git           │  │
-Invoke-RestMethod -Uri https://get.scoop.sh | Invoke-Expression
+│  │  ├── __LOCAL_LLMs/ (cloned here)                 │  │
 │  │  │   ├── dashboard/ → npm run dev (port 3000)    │  │
 │  │  │   ├── setup-tts.sh    (works as-is)           │  │
 │  │  │   ├── start-dashboard.sh (works as-is)        │  │
 │  │  │   └── models/ (SNAC, Qwen3-TTS)              │  │
 │  │  ├── whisper-cpp (CUDA build)                    │  │
 │  │  └── .venv-qwen-tts/ (PyTorch CUDA)             │  │
 │  └──────────────────────────────────────────────────┘  │
 └────────────────────────────────────────────────────────┘
 ```
-### 2. NVIDIA CUDA Toolkit
+**Why WSL2?** All existing bash scripts, Python venvs, and Node.js tooling work identically to macOS — zero porting. The dashboard API routes auto-detect macOS vs Linux at runtime via `process.platform`.
-The RTX 5090 needs the latest CUDA drivers for GPU-accelerated inference.
+---
 ## Phase 1: Windows-Native Setup
 ### 1. NVIDIA Drivers
 ```powershell
-# Install NVIDIA drivers (latest Game Ready or Studio)
+# Install latest NVIDIA Game Ready or Studio drivers
-winget install --id Nvidia.GeForceExperience
+# Download from: https://www.nvidia.com/Download/index.aspx
 # Install CUDA Toolkit (required for PyTorch CUDA)
 winget install --id Nvidia.CUDA
 # Or download from: https://developer.nvidia.com/cuda-downloads
 # Verify
 nvidia-smi
 # Should show: RTX 5090, 24 GB VRAM, CUDA 13.x+
 ```
-Expected output should show:
+### 2. Ollama (Windows-Native)
- **RTX 5090** with **24 GB** VRAM
+Ollama runs natively on Windows and is accessible from WSL2 at `localhost:11434`.
 - CUDA version 13.x+
 ### 3. Node.js (for Mission Control Dashboard)
 ```powershell
 winget install --id OpenJS.NodeJS.LTS
 # Verify
 node --version   # should be 20.x+
 npm --version
 ```
 ### 4. Python 3.12
 ```powershell
 winget install --id Python.Python.3.12
 # Verify
 python --version
 pip --version
 ```
 ### 5. Git
 ```powershell
 winget install --id Git.Git
 ```
 ### 6. ffmpeg
 ```powershell
 winget install --id Gyan.FFmpeg
 # Or: scoop install ffmpeg
 ```
 ---
 ## 1. Ollama — LLM Server
 ### Install
 ```powershell
 winget install --id Ollama.Ollama
 ```
 Ollama for Windows runs as a background service and automatically uses CUDA (RTX 5090).
 ### Verify
 ```powershell
 ollama --version
 curl http://localhost:11434/api/tags
 ```
 ### Download Models
 ```powershell
 # Coding
 ollama pull qwen2.5-coder:32b     # 19 GB — primary coding model
 ollama pull qwen2.5-coder:7b      # 4.7 GB — fast coding
 # Reasoning
 ollama pull deepseek-r1:32b       # 19 GB — chain-of-thought
 # General
 ollama pull llama3.1:8b            # 4.9 GB — fast general tasks
 # TTS
 ollama pull sematre/orpheus:en    # 4 GB — text-to-speech (8 voices)
 # Verify
-ollama list
+ollama --version
 ```
-> **Note:** With 24 GB VRAM, Ollama will offload 32B models almost entirely to GPU.
+### 3. Pull Models (from Windows or WSL2)
 > On macOS (48 GB unified), the 32B models run in shared CPU/GPU memory.
 > On this machine, **GPU inference will be significantly faster** for models that fit in 24 GB VRAM.
-### VRAM Budget (RTX 5090 — 24 GB)
+```bash
 ollama pull qwen2.5-coder:32b     # 19 GB — primary coding model
 ollama pull qwen2.5-coder:7b      # 4.7 GB — fast coding
 ollama pull deepseek-r1:32b       # 19 GB — chain-of-thought
 ollama pull llama3.1:8b            # 4.9 GB — fast general tasks
 ollama pull sematre/orpheus:en    # 4 GB — text-to-speech (8 voices)
-| Model                        | VRAM Usage | Fits in GPU? |
+ollama list    # verify all 5 models
-| ---------------------------- | ---------- | ------------ |
+```
-| llama3.1:8b                  | ~5 GB      | ✅ Fully     |
+
-| qwen2.5-coder:7b             | ~5 GB      | ✅ Fully     |
+### 4. Install WSL2
-| sematre/orpheus:en           | ~4 GB      | ✅ Fully     |
+
-| qwen2.5-coder:32b            | ~19 GB     | ✅ Fully     |
+```powershell
-| deepseek-r1:32b              | ~19 GB     | ✅ Fully     |
+# From PowerShell (Admin)
-| Two 7B models simultaneously | ~10 GB     | ✅ Both fit  |
+wsl --install -d Ubuntu-24.04
 # Reboot if prompted, then set up username/password
 ```
 ---
-## 2. Whisper.cpp — Speech-to-Text
+## Phase 2: WSL2 Setup
-### Option A: Pre-built Binary (Recommended)
+### 1. Install Dependencies
-Download the latest release from GitHub:
+```bash
 # Update
 sudo apt update && sudo apt upgrade -y
-```powershell
+# Node.js 20 LTS
-# Create whisper directory
+curl -fsSL https://deb.nodesource.com/setup_20.x | sudo -E bash -
-mkdir "$env:USERPROFILE\whisper-cpp"
+sudo apt install -y nodejs
 cd "$env:USERPROFILE\whisper-cpp"
-# Download latest release (CUDA build)
+# Python 3.12
-# Check: https://github.com/ggerganov/whisper.cpp/releases
+sudo apt install -y python3.12 python3.12-venv python3-pip
-# Look for: whisper-cublas-bin-x64.zip or whisper-cuda-bin-x64.zip
+
 # Build tools + ffmpeg
 sudo apt install -y ffmpeg git curl build-essential cmake
 # Verify
 node --version        # 20.x+
 python3.12 --version
 nvidia-smi            # should show RTX 5090 (GPU passthrough from Windows)
 ```
-### Option B: Build from Source (CUDA)
+> **Important:** Do NOT install NVIDIA drivers inside WSL2. The Windows-side driver handles GPU passthrough automatically.
-```powershell
+### 2. Clone Repo
 ```bash
 mkdir -p ~/code/mygh && cd ~/code/mygh
 git clone https://github.com/saravanakumardb1/learning_ai_common_plat.git
 cd learning_ai_common_plat/__LOCAL_LLMs
 ```
 > **Performance note:** Always clone inside WSL2 filesystem (`~/code/...`), NOT in `/mnt/c/` — the Windows filesystem bridge is very slow for `node_modules`.
 ### 3. Whisper.cpp (CUDA build)
 ```bash
 cd ~
 git clone https://github.com/ggerganov/whisper.cpp.git
 cd whisper.cpp
 cmake -B build -DGGML_CUDA=ON
-cmake --build build --config Release
+cmake --build build --config Release -j$(nproc)
-```
+sudo cp build/bin/whisper-cli /usr/local/bin/
-### Download Whisper Model
+# Download model (1.5 GB)
-
+mkdir -p ~/whisper-models
-```powershell
+curl -L -o ~/whisper-models/ggml-large-v3-turbo.bin \
 mkdir "$env:USERPROFILE\whisper-models"
 # Download ggml-large-v3-turbo (1.5 GB)
 curl -L -o "$env:USERPROFILE\whisper-models\ggml-large-v3-turbo.bin" `
  "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo.bin"
 # Verify
 whisper-cli --version
 ```
 > **No corporate proxy on this machine** — download directly from `huggingface.co`.
 > The `hf-mirror.com` workaround is only needed on the corporate MacBook.
-### Verify
+### 4. TTS Setup (One-Shot)
-```powershell
+```bash
-# Test transcription
+cd ~/code/mygh/learning_ai_common_plat/__LOCAL_LLMs
-whisper-cli -m "$env:USERPROFILE\whisper-models\ggml-large-v3-turbo.bin" -f test.wav
+
 # Works exactly like macOS — downloads SNAC, Qwen3-TTS, creates venv
 bash setup-tts.sh
 ```
 The script detects macOS vs Linux and installs the correct PyTorch variant (MPS on macOS, CUDA on Linux). On a personal machine (no corporate proxy), override the default mirror so models download directly from Hugging Face: `HF_MIRROR=https://huggingface.co bash setup-tts.sh`
 ### 5. Start Dashboard
 ```bash
 bash start-dashboard.sh
 # Open http://localhost:3000 in Windows browser
 ```
 WSL2 automatically forwards ports — the dashboard is accessible from Windows at `localhost:3000`.
 ---
-## 3. TTS — Orpheus + Qwen3-TTS
+## Key Differences: macOS vs WSL2
-### 3a. Orpheus TTS (via Ollama)
+| Area                   | macOS (any Mac)             | WSL2 (any Linux)                       |
-
+| ---------------------- | --------------------------- | -------------------------------------- |
-Already handled in Step 1 (`ollama pull sematre/orpheus:en`).
+| **GPU**                | Apple Silicon (MPS)         | NVIDIA (CUDA)                          |
-
+| **Ollama**             | macOS native (Metal)        | Windows native, accessed via localhost |
-### 3b. SNAC Decoder
+| **PyTorch device**     | `mps`                       | `cuda`                                 |
-
+| **Whisper install**    | `brew install whisper-cpp`  | Build from source with CUDA            |
-```powershell
+| **Package manager**    | Homebrew                    | apt                                    |
-# Create models directory (match macOS layout)
+| **Shell scripts**      | Work as-is                  | Work as-is                             |
-$MODELS = "$PSScriptRoot\models"   # or wherever you clone the repo
+| **Python venv path**   | `bin/python`                | `bin/python` (same)                    |
-mkdir "$MODELS\snac_24khz" -Force
+| **Dashboard**          | Identical                   | Identical                              |
-
+| **Ollama models path** | `~/.ollama/models/`         | Windows `%USERPROFILE%\.ollama\models\` |
-# Download SNAC decoder
+| **Model download**     | `hf-mirror.com` (corporate) | `huggingface.co` (direct)              |
 curl -L -o "$MODELS\snac_24khz\config.json" `
  "https://huggingface.co/hubertsiuzdak/snac_24khz/resolve/main/config.json"
 curl -L -o "$MODELS\snac_24khz\pytorch_model.bin" `
  "https://huggingface.co/hubertsiuzdak/snac_24khz/resolve/main/pytorch_model.bin"
 ```
 ### 3c. Python Venv + Dependencies
 ```powershell
 cd __LOCAL_LLMs
 # Create venv
 python -m venv .venv-qwen-tts
 # Activate (Windows uses Scripts, not bin)
 .\.venv-qwen-tts\Scripts\Activate.ps1
 # Install PyTorch with CUDA (NOT MPS — that's Apple only)
 pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
 # Install other deps
 pip install snac numpy soundfile
 # Verify CUDA
 python -c "import torch; print(f'CUDA: {torch.cuda.is_available()}, Device: {torch.cuda.get_device_name(0)}')"
 # Expected: CUDA: True, Device: NVIDIA GeForce RTX 5090 Laptop GPU
 ```
 ### 3d. Qwen3-TTS 0.6B
 ```powershell
 $MODELS = ".\models"
 # Tokenizer (~650 MB)
 mkdir "$MODELS\Qwen3-TTS-Tokenizer-12Hz" -Force
 foreach ($f in @("config.json", "configuration.json", "preprocessor_config.json")) {
    curl -L -o "$MODELS\Qwen3-TTS-Tokenizer-12Hz\$f" `
      "https://huggingface.co/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/$f"
 }
 curl -L -o "$MODELS\Qwen3-TTS-Tokenizer-12Hz\model.safetensors" `
  "https://huggingface.co/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/model.safetensors"
 # Model weights (~1.8 GB)
 mkdir "$MODELS\Qwen3-TTS-12Hz-0.6B-CustomVoice" -Force
 foreach ($f in @("config.json", "generation_config.json")) {
    curl -L -o "$MODELS\Qwen3-TTS-12Hz-0.6B-CustomVoice\$f" `
      "https://huggingface.co/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/$f"
 }
 curl -L -o "$MODELS\Qwen3-TTS-12Hz-0.6B-CustomVoice\model.safetensors" `
  "https://huggingface.co/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/model.safetensors"
 ```
 ### 3e. Test TTS
 ```powershell
 # Activate venv
 .\.venv-qwen-tts\Scripts\Activate.ps1
 # Orpheus TTS test
 python test_orpheus_tts.py
 # Qwen3-TTS test
 python test_qwen_tts.py
 ```
 > **Key difference from macOS:** Qwen3-TTS will use **CUDA** instead of MPS.
 > In `test_qwen_tts.py`, `torch.backends.mps.is_available()` returns False on Windows, so a device
 > selection that only checks for MPS falls through to CPU, not CUDA.
 > Make sure the device logic checks for CUDA first:
 >
 > ```python
 > device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 > ```
 ---
 ## 4. Mission Control Dashboard
 ```powershell
 cd __LOCAL_LLMs\dashboard
 # Install dependencies
 npm install
 # Start dev server
 npm run dev
 # Open http://localhost:3000
 ```
 The dashboard is pure Next.js — works identically on Windows. The API routes auto-detect:
 - **Ollama** at `localhost:11434`
 - **Whisper** models in `%USERPROFILE%\whisper-models\`
 - **TTS** engines (Orpheus, Qwen3-TTS) and Python venv
 ### Start Script (PowerShell)
 There is no PowerShell start script yet; run the equivalent of `start-dashboard.sh` manually:
 ```powershell
 # Quick start (manual)
 ollama serve    # if not already running as service
 cd __LOCAL_LLMs\dashboard
 npm run dev
 ```
 > TODO: Create `start-dashboard.ps1` as a PowerShell equivalent of `start-dashboard.sh`
 ---
 ## 5. Key Differences: macOS vs Windows
 | Area                | macOS (M4 Pro 48 GB)                | Windows (Razer Blade 18)              |
 | ------------------- | ----------------------------------- | ------------------------------------- |
 | **GPU**             | Apple Silicon (unified memory, MPS) | RTX 5090 (24 GB VRAM, CUDA)           |
 | **Ollama GPU**      | Automatic (Metal)                   | Automatic (CUDA)                      |
 | **VRAM**            | Shared from 48 GB RAM               | Dedicated 24 GB GDDR7                 |
 | **PyTorch device**  | `mps`                               | `cuda`                                |
 | **Whisper install** | `brew install whisper-cpp`          | Build from source or download release |
 | **Python venv**     | `bin/activate`                      | `Scripts\Activate.ps1`                |
 | **Package manager** | Homebrew                            | winget / scoop                        |
 | **Shell**           | zsh / bash                          | PowerShell / cmd                      |
 | **Scripts**         | `.sh` (bash)                        | `.ps1` (PowerShell)                   |
 | **Model download**  | `hf-mirror.com` (corporate proxy)   | `huggingface.co` (no proxy)           |
 | **Dashboard**       | Identical                           | Identical                             |
 | **Ollama models**   | Identical                           | Identical                             |
 ### Performance Expectations
-| Workload                    | macOS M4 Pro 48 GB           | Razer RTX 5090 24 GB      |
+| Workload                    | macOS M4 Pro 48 GB   | Razer RTX 5090 24 GB            |
-| --------------------------- | ---------------------------- | ------------------------- |
+| --------------------------- | -------------------- | ------------------------------- |
-| qwen2.5-coder:32b inference | ~15–25 tok/s (MPS/CPU blend) | ~40–60 tok/s (full CUDA)  |
+| qwen2.5-coder:32b inference | ~15–25 tok/s         | ~40–60 tok/s                    |
-| Whisper large-v3-turbo      | ~2–4x realtime (CPU)         | ~8–15x realtime (CUDA)    |
+| Whisper large-v3-turbo      | ~2–4x realtime       | ~8–15x realtime                 |
-| Orpheus TTS                 | ~realtime (CPU decode)       | ~2–3x realtime (CUDA)     |
+| Orpheus TTS                 | ~realtime            | ~2–3x realtime                  |
-| Qwen3-TTS                   | ~realtime (MPS)              | ~2–4x realtime (CUDA)     |
+| Qwen3-TTS                   | ~realtime (MPS)      | ~2–4x realtime (CUDA)           |
-| 70B quantized models        | Fits in 48 GB (slow)         | Partially offloads to RAM |
+| 70B quantized models        | Fits in 48 GB (slow) | Partially offloads to 64 GB RAM |
 ### VRAM Budget (RTX 5090 — 24 GB)
 | Model              | VRAM Usage | Fits in GPU? |
 | ------------------ | ---------- | ------------ |
 | llama3.1:8b        | ~5 GB      | ✅ Fully     |
 | qwen2.5-coder:7b   | ~5 GB      | ✅ Fully     |
 | sematre/orpheus:en | ~4 GB      | ✅ Fully     |
 | qwen2.5-coder:32b  | ~19 GB     | ✅ Fully     |
 | deepseek-r1:32b    | ~19 GB     | ✅ Fully     |
 ---
-## 6. File Layout (Same as macOS)
+## Quick Reference — Full Setup Checklist
 ### Windows Side
 ```
-__LOCAL_LLMs/
+[ ] Install NVIDIA drivers (Game Ready or Studio)
 ├── dashboard/                       ← Mission Control (port 3000) — works as-is
 ├── models/                          ← TTS model weights (gitignored)
 │   ├── snac_24khz/
 │   ├── Qwen3-TTS-Tokenizer-12Hz/
 │   └── Qwen3-TTS-12Hz-0.6B-CustomVoice/
 ├── .venv-qwen-tts/                  ← Python venv (Scripts\ on Windows)
 ├── test_orpheus_tts.py              ← works as-is (device fallback)
 ├── test_qwen_tts.py                 ← update device to prefer CUDA
 ├── windows_specific/
 │   ├── razer-blade-18-spec.md       ← hardware spec
 │   └── setup-guide.md              ← this file
 └── docs/                            ← macOS-focused docs (still useful as reference)
 ```
 ---
 ## 7. Quick Reference — Full Setup Checklist
 ```
 [ ] Install NVIDIA drivers + CUDA Toolkit
 [ ] Install Ollama (winget install Ollama.Ollama)
-[ ] Pull models: qwen2.5-coder:32b, deepseek-r1:32b, llama3.1:8b, orpheus
+[ ] Pull all 5 models
-[ ] Install Node.js 20+ (winget)
+[ ] Install WSL2 (wsl --install -d Ubuntu-24.04)
-[ ] Install Python 3.12 (winget)
+```
-[ ] Install Git (winget)
+
-[ ] Install ffmpeg (winget)
+### WSL2 Side
-[ ] Clone repo
+
-[ ] Download Whisper model to %USERPROFILE%\whisper-models\
+```
-[ ] Build or download whisper-cpp with CUDA
+[ ] Install Node.js 20+, Python 3.12, ffmpeg, git, cmake
-[ ] Create Python venv + install PyTorch CUDA + snac
+[ ] Verify nvidia-smi shows RTX 5090
-[ ] Download SNAC decoder
+[ ] Clone repo into ~/code/mygh/
-[ ] Download Qwen3-TTS tokenizer + model
+[ ] Build whisper-cpp with CUDA
-[ ] npm install in dashboard/
+[ ] Download Whisper model to ~/whisper-models/
-[ ] Run dashboard: npm run dev
+[ ] Run: bash setup-tts.sh
 [ ] Run: bash start-dashboard.sh
 [ ] Verify: http://localhost:3000 shows all green
 ```
 ---
 ## Troubleshooting
 ### Ollama not accessible from WSL2
 ```bash
 curl http://localhost:11434/api/tags
 # If this fails, check the Windows firewall or try:
 curl http://$(hostname).local:11434/api/tags
 ```
 ### CUDA not visible in WSL2
 ```bash
 nvidia-smi
 # If "command not found":
 # 1. Update Windows NVIDIA drivers to latest
 # 2. Run: wsl --update
 # 3. Do NOT install nvidia-driver-* inside WSL2
 ```
 ### Slow filesystem performance
 ```bash
 # Clone repos inside WSL2 filesystem: ~/code/...
 # NOT in /mnt/c/ (Windows→WSL bridge is ~10x slower for node_modules)
 ```