fix(local-llms): cross-platform audit — 8 bugs/gaps fixed
- setup-tts.sh: make fully cross-platform (macOS + Linux/WSL2)
  - OS detection, apt fallback, CUDA PyTorch install, nvidia-smi check
  - cross-platform playback hints, HF_MIRROR env override
- api/system/route.ts: fix ffmpeg detection (use -version not --version)
- api/system/memory/route.ts: remove unused total variable in Linux path
- api/system/exec/route.ts: expand allowlist with Linux commands (head, tail, grep, which, ps, uname, free, lscpu, nvidia-smi, etc.)
- api/tts/route.ts: cross-platform venv path + CUDA/MPS label
- api/whisper/route.ts: Linux binary/model paths
- api/ollama/logs/route.ts: Linux log paths + WSL2 hint
- test_qwen_tts.py: platform-aware speech text + CUDA device detection
- test_orpheus_tts.py: platform-aware text, move import sys to top
- setup-guide.md: fix false auto-detect claim, add HF_MIRROR hint
This commit is contained in:
parent
f85b455eb5
commit
b1d2e4ec81
@ -4,11 +4,15 @@ import { homedir } from 'os';
|
||||
import { join } from 'path';
|
||||
import { existsSync } from 'fs';
|
||||
|
||||
const IS_MAC = process.platform === 'darwin';
|
||||
|
||||
export async function GET() {
|
||||
const logPaths = [
|
||||
join(homedir(), '.ollama', 'logs', 'server.log'),
|
||||
join(homedir(), '.ollama', 'logs', 'gpu.log'),
|
||||
'/tmp/ollama.log',
|
||||
// Linux / WSL2 — only present if Ollama's output is redirected to this file; systemd installs log to the journal (read it with journalctl)
|
||||
'/var/log/ollama.log',
|
||||
];
|
||||
|
||||
for (const logPath of logPaths) {
|
||||
@ -25,11 +29,13 @@ export async function GET() {
|
||||
}
|
||||
}
|
||||
|
||||
// On macOS, Ollama logs via unified logging
|
||||
// Fallback: platform-specific logging hint
|
||||
const hint = IS_MAC
|
||||
? 'Ollama uses macOS unified logging. Use: log show --predicate \'subsystem == "com.ollama"\' --last 5m'
|
||||
: 'Ollama logs not found. If running on Windows (accessed via WSL2), check Windows Event Viewer or: journalctl -u ollama --no-pager -n 50';
|
||||
|
||||
return NextResponse.json({
|
||||
lines: [
|
||||
'Ollama uses macOS unified logging. Use: log show --predicate \'subsystem == "com.ollama"\' --last 5m',
|
||||
],
|
||||
lines: [hint],
|
||||
path: 'system',
|
||||
total: 1,
|
||||
});
|
||||
|
||||
@ -5,9 +5,9 @@ import { promisify } from 'util';
|
||||
const execFileAsync = promisify(execFile);
|
||||
|
||||
const COMMAND_ALLOWLIST = new Set([
|
||||
// Cross-platform (macOS + Linux)
|
||||
'git',
|
||||
'npm',
|
||||
'brew',
|
||||
'cat',
|
||||
'ls',
|
||||
'wc',
|
||||
@ -15,6 +15,21 @@ const COMMAND_ALLOWLIST = new Set([
|
||||
'df',
|
||||
'echo',
|
||||
'date',
|
||||
'head',
|
||||
'tail',
|
||||
'grep',
|
||||
'which',
|
||||
'ps',
|
||||
'uname',
|
||||
'whoami',
|
||||
// macOS
|
||||
'brew',
|
||||
// Linux / WSL2
|
||||
'free',
|
||||
'lscpu',
|
||||
'nvidia-smi',
|
||||
'dpkg',
|
||||
'apt',
|
||||
]);
|
||||
|
||||
export async function POST(request: NextRequest) {
|
||||
|
||||
@ -1,10 +1,13 @@
|
||||
import { NextResponse } from 'next/server';
|
||||
import { exec } from 'child_process';
|
||||
import { promisify } from 'util';
|
||||
import { readFile } from 'fs/promises';
|
||||
import os from 'os';
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
const IS_MAC = process.platform === 'darwin';
|
||||
|
||||
interface ProcessInfo {
|
||||
pid: number;
|
||||
name: string;
|
||||
@ -26,7 +29,7 @@ interface VmStatBreakdown {
|
||||
|
||||
async function getTopProcesses(limit = 20): Promise<ProcessInfo[]> {
|
||||
try {
|
||||
// ps with RSS in KB, sorted descending by RSS
|
||||
// ps with RSS in KB, sorted descending by RSS — works on both macOS and Linux
|
||||
const { stdout } = await execAsync(
|
||||
`ps -axo pid=,rss=,%mem=,user=,comm= | sort -k2 -rn | head -${limit}`,
|
||||
{ timeout: 3000 }
|
||||
@ -60,36 +63,67 @@ async function getTopProcesses(limit = 20): Promise<ProcessInfo[]> {
|
||||
}
|
||||
|
||||
async function getVmStatBreakdown(): Promise<VmStatBreakdown> {
|
||||
try {
|
||||
const { stdout } = await execAsync('vm_stat', { timeout: 2000 });
|
||||
const pageSizeMatch = stdout.match(/page size of (\d+) bytes/);
|
||||
const pageSize = pageSizeMatch ? parseInt(pageSizeMatch[1]) : 16384;
|
||||
const parse = (label: string): number => {
|
||||
const match = stdout.match(new RegExp(`${label}:\\s+(\\d+)`));
|
||||
return match ? parseInt(match[1]) * pageSize : 0;
|
||||
};
|
||||
return {
|
||||
active: parse('Pages active'),
|
||||
wired: parse('Pages wired down'),
|
||||
compressor: parse('Pages occupied by compressor'),
|
||||
inactive: parse('Pages inactive'),
|
||||
purgeable: parse('Pages purgeable'),
|
||||
speculative: parse('Pages speculative'),
|
||||
free: parse('Pages free'),
|
||||
pageSize,
|
||||
};
|
||||
} catch {
|
||||
return {
|
||||
active: 0,
|
||||
wired: 0,
|
||||
compressor: 0,
|
||||
inactive: 0,
|
||||
purgeable: 0,
|
||||
speculative: 0,
|
||||
free: 0,
|
||||
pageSize: 16384,
|
||||
};
|
||||
if (IS_MAC) {
|
||||
try {
|
||||
const { stdout } = await execAsync('vm_stat', { timeout: 2000 });
|
||||
const pageSizeMatch = stdout.match(/page size of (\d+) bytes/);
|
||||
const pageSize = pageSizeMatch ? parseInt(pageSizeMatch[1]) : 16384;
|
||||
const parse = (label: string): number => {
|
||||
const match = stdout.match(new RegExp(`${label}:\\s+(\\d+)`));
|
||||
return match ? parseInt(match[1]) * pageSize : 0;
|
||||
};
|
||||
return {
|
||||
active: parse('Pages active'),
|
||||
wired: parse('Pages wired down'),
|
||||
compressor: parse('Pages occupied by compressor'),
|
||||
inactive: parse('Pages inactive'),
|
||||
purgeable: parse('Pages purgeable'),
|
||||
speculative: parse('Pages speculative'),
|
||||
free: parse('Pages free'),
|
||||
pageSize,
|
||||
};
|
||||
} catch {
|
||||
// fall through to zeros
|
||||
}
|
||||
} else {
|
||||
// Linux / WSL2 — parse /proc/meminfo into vm_stat-compatible structure
|
||||
try {
|
||||
const raw = await readFile('/proc/meminfo', 'utf-8');
|
||||
const parse = (key: string): number => {
|
||||
const match = raw.match(new RegExp(`${key}:\\s+(\\d+)`));
|
||||
return match ? parseInt(match[1]) * 1024 : 0;
|
||||
};
|
||||
const free = parse('MemFree');
|
||||
const buffers = parse('Buffers');
|
||||
const cached = parse('Cached');
|
||||
const sReclaimable = parse('SReclaimable');
|
||||
const active = parse('Active');
|
||||
|
||||
return {
|
||||
active,
|
||||
wired: buffers, // closest analogy
|
||||
compressor: parse('SwapCached'),
|
||||
inactive: parse('Inactive'),
|
||||
purgeable: sReclaimable,
|
||||
speculative: 0,
|
||||
free,
|
||||
pageSize: 4096,
|
||||
};
|
||||
} catch {
|
||||
// fall through to zeros
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
active: 0,
|
||||
wired: 0,
|
||||
compressor: 0,
|
||||
inactive: 0,
|
||||
purgeable: 0,
|
||||
speculative: 0,
|
||||
free: 0,
|
||||
pageSize: IS_MAC ? 16384 : 4096,
|
||||
};
|
||||
}
|
||||
|
||||
export async function GET() {
|
||||
|
||||
@ -1,11 +1,14 @@
|
||||
import { NextResponse } from 'next/server';
|
||||
import { exec, execFile } from 'child_process';
|
||||
import { promisify } from 'util';
|
||||
import { readFile } from 'fs/promises';
|
||||
import os from 'os';
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
const execFileAsync = promisify(execFile);
|
||||
|
||||
const IS_MAC = process.platform === 'darwin';
|
||||
|
||||
// Cache slow commands with TTL
|
||||
let staticCache: {
|
||||
chip: string;
|
||||
@ -20,10 +23,17 @@ const OLLAMA_DISK_TTL = 60 * 1000; // 60 seconds
|
||||
|
||||
async function getChipInfo(): Promise<string> {
|
||||
try {
|
||||
if (IS_MAC) {
|
||||
const { stdout } = await execAsync(
|
||||
"sysctl -n machdep.cpu.brand_string 2>/dev/null || echo 'Unknown'"
|
||||
);
|
||||
return stdout.trim();
|
||||
}
|
||||
// Linux / WSL2
|
||||
const { stdout } = await execAsync(
|
||||
"sysctl -n machdep.cpu.brand_string 2>/dev/null || echo 'Unknown'"
|
||||
"lscpu 2>/dev/null | grep 'Model name' | sed 's/.*:\\s*//' || cat /proc/cpuinfo | grep 'model name' | head -1 | sed 's/.*: //'"
|
||||
);
|
||||
return stdout.trim();
|
||||
return stdout.trim() || 'Unknown';
|
||||
} catch {
|
||||
return 'Unknown';
|
||||
}
|
||||
@ -55,30 +65,63 @@ async function getOllamaModelsDiskUsage(): Promise<number> {
|
||||
|
||||
async function getGpuInfo(): Promise<string> {
|
||||
try {
|
||||
if (IS_MAC) {
|
||||
const { stdout } = await execAsync(
|
||||
"system_profiler SPDisplaysDataType 2>/dev/null | grep 'Chipset Model' | sed 's/.*: //'",
|
||||
{ timeout: 5000 }
|
||||
);
|
||||
return stdout.trim() || 'Apple Silicon (integrated)';
|
||||
}
|
||||
// Linux / WSL2 — try nvidia-smi first, fall back to lspci
|
||||
try {
|
||||
const { stdout } = await execAsync(
|
||||
'nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1',
|
||||
{ timeout: 3000 }
|
||||
);
|
||||
if (stdout.trim()) return stdout.trim();
|
||||
} catch {
|
||||
/* no nvidia-smi */
|
||||
}
|
||||
const { stdout } = await execAsync(
|
||||
"system_profiler SPDisplaysDataType 2>/dev/null | grep 'Chipset Model' | sed 's/.*: //'",
|
||||
{ timeout: 5000 }
|
||||
"lspci 2>/dev/null | grep -i 'vga\\|3d' | sed 's/.*: //' | head -1"
|
||||
);
|
||||
return stdout.trim() || 'Apple Silicon (integrated)';
|
||||
return stdout.trim() || 'Unknown';
|
||||
} catch {
|
||||
return 'Apple Silicon (integrated)';
|
||||
return IS_MAC ? 'Apple Silicon (integrated)' : 'Unknown';
|
||||
}
|
||||
}
|
||||
|
||||
async function getBrewPackages(): Promise<Array<{ name: string; version: string }>> {
|
||||
const targets = ['ollama', 'whisper-cpp', 'ffmpeg'];
|
||||
const results: Array<{ name: string; version: string }> = [];
|
||||
for (const pkg of targets) {
|
||||
try {
|
||||
const { stdout } = await execFileAsync('brew', ['list', '--versions', pkg], {
|
||||
timeout: 3000,
|
||||
});
|
||||
const parts = stdout.trim().split(' ');
|
||||
if (parts.length >= 2) {
|
||||
results.push({ name: parts[0], version: parts.slice(1).join(' ') });
|
||||
|
||||
if (IS_MAC) {
|
||||
for (const pkg of targets) {
|
||||
try {
|
||||
const { stdout } = await execFileAsync('brew', ['list', '--versions', pkg], {
|
||||
timeout: 3000,
|
||||
});
|
||||
const parts = stdout.trim().split(' ');
|
||||
if (parts.length >= 2) {
|
||||
results.push({ name: parts[0], version: parts.slice(1).join(' ') });
|
||||
}
|
||||
} catch {
|
||||
// not installed
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Linux / WSL2 — check via version commands (ffmpeg uses -version, others use --version)
|
||||
for (const pkg of targets) {
|
||||
const bin = pkg === 'whisper-cpp' ? 'whisper-cli' : pkg;
|
||||
const flag = bin === 'ffmpeg' ? '-version' : '--version';
|
||||
try {
|
||||
const { stdout } = await execAsync(`${bin} ${flag} 2>&1 | head -1`, { timeout: 3000 });
|
||||
if (stdout.trim()) {
|
||||
results.push({ name: pkg, version: stdout.trim() });
|
||||
}
|
||||
} catch {
|
||||
// not installed
|
||||
}
|
||||
} catch {
|
||||
// not installed
|
||||
}
|
||||
}
|
||||
return results;
|
||||
@ -106,7 +149,8 @@ async function getCachedOllamaDiskUsage(): Promise<number> {
|
||||
return value;
|
||||
}
|
||||
|
||||
// macOS vm_stat gives accurate memory breakdown (os.freemem() excludes reclaimable cache)
|
||||
// macOS: vm_stat gives accurate memory breakdown (os.freemem() excludes reclaimable cache)
|
||||
// Linux: /proc/meminfo gives accurate breakdown
|
||||
async function getAccurateMemory(): Promise<{
|
||||
total: number;
|
||||
appMemory: number;
|
||||
@ -115,42 +159,66 @@ async function getAccurateMemory(): Promise<{
|
||||
pressure: string;
|
||||
}> {
|
||||
const totalMem = os.totalmem();
|
||||
try {
|
||||
const { stdout } = await execAsync('vm_stat', { timeout: 2000 });
|
||||
const pageSizeMatch = stdout.match(/page size of (\d+) bytes/);
|
||||
const pageSize = pageSizeMatch ? parseInt(pageSizeMatch[1]) : 16384;
|
||||
const parse = (label: string): number => {
|
||||
const match = stdout.match(new RegExp(`${label}:\\s+(\\d+)`));
|
||||
return match ? parseInt(match[1]) * pageSize : 0;
|
||||
};
|
||||
const active = parse('Pages active');
|
||||
const wired = parse('Pages wired down');
|
||||
const inactive = parse('Pages inactive');
|
||||
const purgeable = parse('Pages purgeable');
|
||||
const speculative = parse('Pages speculative');
|
||||
const free = parse('Pages free');
|
||||
const compressor = parse('Pages occupied by compressor');
|
||||
|
||||
const appMemory = active + wired + compressor;
|
||||
const cached = inactive + purgeable + speculative;
|
||||
// Return raw free separately from cached — no overlap
|
||||
// available for loading = free + cached (macOS reclaims cached on demand)
|
||||
if (IS_MAC) {
|
||||
try {
|
||||
const { stdout } = await execAsync('vm_stat', { timeout: 2000 });
|
||||
const pageSizeMatch = stdout.match(/page size of (\d+) bytes/);
|
||||
const pageSize = pageSizeMatch ? parseInt(pageSizeMatch[1]) : 16384;
|
||||
const parse = (label: string): number => {
|
||||
const match = stdout.match(new RegExp(`${label}:\\s+(\\d+)`));
|
||||
return match ? parseInt(match[1]) * pageSize : 0;
|
||||
};
|
||||
const active = parse('Pages active');
|
||||
const wired = parse('Pages wired down');
|
||||
const inactive = parse('Pages inactive');
|
||||
const purgeable = parse('Pages purgeable');
|
||||
const speculative = parse('Pages speculative');
|
||||
const free = parse('Pages free');
|
||||
const compressor = parse('Pages occupied by compressor');
|
||||
|
||||
const ratio = appMemory / totalMem;
|
||||
const pressure = ratio > 0.85 ? 'critical' : ratio > 0.7 ? 'warning' : 'normal';
|
||||
const appMemory = active + wired + compressor;
|
||||
const cached = inactive + purgeable + speculative;
|
||||
|
||||
return { total: totalMem, appMemory, cached, free, pressure };
|
||||
} catch {
|
||||
// Fallback to Node.js (inaccurate on macOS but works everywhere)
|
||||
const freeMem = os.freemem();
|
||||
return {
|
||||
total: totalMem,
|
||||
appMemory: totalMem - freeMem,
|
||||
cached: 0,
|
||||
free: freeMem,
|
||||
pressure: 'unknown',
|
||||
};
|
||||
const ratio = appMemory / totalMem;
|
||||
const pressure = ratio > 0.85 ? 'critical' : ratio > 0.7 ? 'warning' : 'normal';
|
||||
|
||||
return { total: totalMem, appMemory, cached, free, pressure };
|
||||
} catch {
|
||||
// fall through to generic fallback
|
||||
}
|
||||
} else {
|
||||
// Linux / WSL2 — parse /proc/meminfo
|
||||
try {
|
||||
const raw = await readFile('/proc/meminfo', 'utf-8');
|
||||
const parse = (key: string): number => {
|
||||
const match = raw.match(new RegExp(`${key}:\\s+(\\d+)`));
|
||||
return match ? parseInt(match[1]) * 1024 : 0; // /proc/meminfo is in kB
|
||||
};
|
||||
const total = parse('MemTotal');
|
||||
const free = parse('MemFree');
|
||||
const buffers = parse('Buffers');
|
||||
const cached = parse('Cached') + parse('SReclaimable') + buffers;
|
||||
const appMemory = total - free - cached;
|
||||
|
||||
const ratio = appMemory / total;
|
||||
const pressure = ratio > 0.85 ? 'critical' : ratio > 0.7 ? 'warning' : 'normal';
|
||||
|
||||
return { total, appMemory, cached, free, pressure };
|
||||
} catch {
|
||||
// fall through to generic fallback
|
||||
}
|
||||
}
|
||||
|
||||
// Generic fallback (works everywhere but less accurate)
|
||||
const freeMem = os.freemem();
|
||||
return {
|
||||
total: totalMem,
|
||||
appMemory: totalMem - freeMem,
|
||||
cached: 0,
|
||||
free: freeMem,
|
||||
pressure: 'unknown',
|
||||
};
|
||||
}
|
||||
|
||||
export async function GET() {
|
||||
|
||||
@ -6,9 +6,14 @@ import { join, resolve } from 'path';
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
const IS_MAC = process.platform === 'darwin';
|
||||
|
||||
// process.cwd() = dashboard/, parent = __LOCAL_LLMs/
|
||||
const LOCAL_LLMS_DIR = resolve(process.cwd(), '..');
|
||||
|
||||
// venv layout on macOS/Linux (incl. WSL2) is bin/python. NOTE: a Windows-native venv uses Scripts/python.exe, which this path does not handle.
|
||||
const VENV_PYTHON = join(LOCAL_LLMS_DIR, '.venv-qwen-tts', 'bin', 'python');
|
||||
|
||||
interface TtsEngine {
|
||||
name: string;
|
||||
type: 'ollama' | 'python';
|
||||
@ -67,8 +72,7 @@ async function checkOrpheus(): Promise<TtsEngine> {
|
||||
const snacSize = hasSnac ? await getFileSize(snacPath) : 0;
|
||||
|
||||
// Check Python venv
|
||||
const venvPython = join(LOCAL_LLMS_DIR, '.venv-qwen-tts', 'bin', 'python');
|
||||
const hasVenv = await fileExists(venvPython);
|
||||
const hasVenv = await fileExists(VENV_PYTHON);
|
||||
|
||||
if (hasModel && hasSnac && hasVenv) {
|
||||
engine.status = 'ready';
|
||||
@ -114,13 +118,14 @@ async function checkQwenTts(): Promise<TtsEngine> {
|
||||
}
|
||||
|
||||
const hasTokenizer = await fileExists(join(tokenizerDir, 'config.json'));
|
||||
const venvPython = join(LOCAL_LLMS_DIR, '.venv-qwen-tts', 'bin', 'python');
|
||||
const hasVenv = await fileExists(venvPython);
|
||||
const hasVenv = await fileExists(VENV_PYTHON);
|
||||
|
||||
if (hasModel && hasTokenizer && hasVenv) {
|
||||
engine.status = 'ready';
|
||||
engine.size = `${(modelSize / 1e9).toFixed(1)} GB`;
|
||||
engine.details = '0.6B params · 10 languages · MPS/CPU';
|
||||
engine.details = IS_MAC
|
||||
? '0.6B params · 10 languages · MPS/CPU'
|
||||
: '0.6B params · 10 languages · CUDA/CPU';
|
||||
} else if (hasModel || hasTokenizer) {
|
||||
engine.status = 'partial';
|
||||
const missing: string[] = [];
|
||||
@ -141,22 +146,21 @@ async function checkVenv(): Promise<{
|
||||
python?: string;
|
||||
packages?: string[];
|
||||
}> {
|
||||
const venvPython = join(LOCAL_LLMS_DIR, '.venv-qwen-tts', 'bin', 'python');
|
||||
const exists = await fileExists(venvPython);
|
||||
const exists = await fileExists(VENV_PYTHON);
|
||||
if (!exists) return { exists: false };
|
||||
|
||||
try {
|
||||
const { stdout } = await execAsync(
|
||||
`"${venvPython}" -c "import snac; import torch; print(f'snac={snac.__version__} torch={torch.__version__}')"`,
|
||||
`"${VENV_PYTHON}" -c "import snac; import torch; print(f'snac={snac.__version__} torch={torch.__version__}')"`,
|
||||
{ timeout: 5000 }
|
||||
);
|
||||
return {
|
||||
exists: true,
|
||||
python: venvPython,
|
||||
python: VENV_PYTHON,
|
||||
packages: stdout.trim().split(' '),
|
||||
};
|
||||
} catch {
|
||||
return { exists: true, python: venvPython };
|
||||
return { exists: true, python: VENV_PYTHON };
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -7,9 +7,22 @@ import { homedir } from 'os';
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
const IS_MAC = process.platform === 'darwin';
|
||||
|
||||
async function getWhisperBinaries(): Promise<string[]> {
|
||||
try {
|
||||
const { stdout } = await execAsync('ls /opt/homebrew/bin/whisper-* 2>/dev/null');
|
||||
if (IS_MAC) {
|
||||
const { stdout } = await execAsync('ls /opt/homebrew/bin/whisper-* 2>/dev/null');
|
||||
return stdout
|
||||
.trim()
|
||||
.split('\n')
|
||||
.filter(Boolean)
|
||||
.map(p => p.split('/').pop() || p);
|
||||
}
|
||||
// Linux / WSL2 — check common locations
|
||||
const { stdout } = await execAsync(
|
||||
'ls /usr/local/bin/whisper-* /usr/bin/whisper-* 2>/dev/null || which whisper-cli 2>/dev/null'
|
||||
);
|
||||
return stdout
|
||||
.trim()
|
||||
.split('\n')
|
||||
@ -25,7 +38,9 @@ const WHISPER_MODEL_DIRS = (process.env.WHISPER_MODELS_DIR || '')
|
||||
.filter(Boolean)
|
||||
.concat([
|
||||
join(homedir(), 'whisper-models'),
|
||||
'/opt/homebrew/share/whisper-cpp/models',
|
||||
...(IS_MAC
|
||||
? ['/opt/homebrew/share/whisper-cpp/models']
|
||||
: ['/usr/local/share/whisper-cpp/models', '/usr/share/whisper-cpp/models']),
|
||||
join(homedir(), '.cache', 'whisper'),
|
||||
]);
|
||||
|
||||
|
||||
@ -3,26 +3,24 @@
|
||||
# TTS Setup — One-Shot Script for Fresh Laptop
|
||||
#
|
||||
# Sets up Orpheus TTS (via Ollama) and Qwen3-TTS (direct Python)
|
||||
# on Apple Silicon Macs. Works through corporate proxy.
|
||||
# on macOS (Apple Silicon) or Linux (CUDA GPU / WSL2).
|
||||
#
|
||||
# What this does:
|
||||
# 1. Installs Python 3.12 via Homebrew (if missing)
|
||||
# 2. Creates Python venv with TTS packages
|
||||
# 1. Installs Python 3.12 (Homebrew on macOS, apt on Linux)
|
||||
# 2. Creates Python venv with TTS packages (MPS on macOS, CUDA on Linux)
|
||||
# 3. Pulls Orpheus TTS model via Ollama
|
||||
# 4. Downloads SNAC audio decoder via hf-mirror.com
|
||||
# 5. (Optional) Downloads Qwen3-TTS 0.6B via hf-mirror.com
|
||||
# 4. Downloads SNAC audio decoder
|
||||
# 5. (Optional) Downloads Qwen3-TTS 0.6B
|
||||
#
|
||||
# Prerequisites:
|
||||
# - macOS with Apple Silicon (M1/M2/M3/M4)
|
||||
# - Homebrew installed
|
||||
# - Ollama installed (brew install ollama)
|
||||
# macOS: Homebrew + Ollama installed
|
||||
# Linux: apt + Ollama accessible at localhost:11434
|
||||
#
|
||||
# Usage:
|
||||
# bash setup-tts.sh
|
||||
#
|
||||
# After setup, test with:
|
||||
# .venv-qwen-tts/bin/python test_orpheus_tts.py
|
||||
# afplay test_orpheus_tara.wav
|
||||
# ============================================================
|
||||
set -e
|
||||
|
||||
@ -31,7 +29,13 @@ VENV="$SCRIPT_DIR/.venv-qwen-tts"
|
||||
MODELS_DIR="$SCRIPT_DIR/models"
|
||||
|
||||
# HuggingFace mirror that works through corporate proxy
|
||||
HF_MIRROR="https://hf-mirror.com"
|
||||
# On personal machines, set HF_MIRROR=https://huggingface.co to download directly
|
||||
HF_MIRROR="${HF_MIRROR:-https://hf-mirror.com}"
|
||||
|
||||
# Detect OS
|
||||
OS_TYPE="$(uname -s)"
|
||||
IS_MAC=false
|
||||
[ "$OS_TYPE" = "Darwin" ] && IS_MAC=true
|
||||
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
@ -52,34 +56,58 @@ echo ""
|
||||
# ── 0. Check prerequisites ──────────────────────────────────
|
||||
step "Checking prerequisites"
|
||||
|
||||
# Homebrew
|
||||
if ! command -v brew &>/dev/null; then
|
||||
fail "Homebrew not found. Install: /bin/bash -c \"\$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)\""
|
||||
fi
|
||||
ok "Homebrew"
|
||||
if $IS_MAC; then
|
||||
# Homebrew
|
||||
if ! command -v brew &>/dev/null; then
|
||||
fail "Homebrew not found. Install: /bin/bash -c \"\$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)\""
|
||||
fi
|
||||
ok "Homebrew"
|
||||
|
||||
# Ollama
|
||||
if ! command -v ollama &>/dev/null; then
|
||||
warn "Ollama not found. Installing..."
|
||||
brew install ollama
|
||||
# Ollama (install via Homebrew if missing)
|
||||
if ! command -v ollama &>/dev/null; then
|
||||
warn "Ollama not found. Installing..."
|
||||
brew install ollama
|
||||
fi
|
||||
else
|
||||
# Linux / WSL2 — Ollama should be installed on host or via install script
|
||||
if ! command -v ollama &>/dev/null; then
|
||||
# On WSL2 Ollama runs on the Windows side; check if reachable
|
||||
if ! curl -s --max-time 2 http://localhost:11434/api/tags &>/dev/null; then
|
||||
fail "Ollama not found and not reachable at localhost:11434. Install Ollama on Windows or run: curl -fsSL https://ollama.com/install.sh | sh"
|
||||
fi
|
||||
ok "Ollama reachable at localhost:11434 (Windows host)"
|
||||
fi
|
||||
fi
|
||||
ok "Ollama installed"
|
||||
|
||||
# Check if Ollama is running
|
||||
if ! curl -s --max-time 2 http://localhost:11434/api/tags &>/dev/null; then
|
||||
warn "Ollama not running. Starting..."
|
||||
ollama serve &>/dev/null &
|
||||
sleep 3
|
||||
if command -v ollama &>/dev/null; then
|
||||
ollama serve &>/dev/null &
|
||||
sleep 3
|
||||
fi
|
||||
if ! curl -s --max-time 2 http://localhost:11434/api/tags &>/dev/null; then
|
||||
fail "Could not start Ollama. Try manually: ollama serve"
|
||||
fi
|
||||
fi
|
||||
ok "Ollama running on port 11434"
|
||||
|
||||
# Apple Silicon check
|
||||
# GPU check
|
||||
ARCH=$(uname -m)
|
||||
if [ "$ARCH" != "arm64" ]; then
|
||||
warn "Not Apple Silicon ($ARCH). MPS acceleration won't be available."
|
||||
if $IS_MAC; then
|
||||
if [ "$ARCH" != "arm64" ]; then
|
||||
warn "Not Apple Silicon ($ARCH). MPS acceleration won't be available."
|
||||
else
|
||||
ok "Apple Silicon ($ARCH) — MPS acceleration available"
|
||||
fi
|
||||
else
|
||||
if command -v nvidia-smi &>/dev/null; then
|
||||
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1)
|
||||
ok "NVIDIA GPU detected: $GPU_NAME — CUDA acceleration available"
|
||||
else
|
||||
warn "nvidia-smi not found. CUDA acceleration won't be available (CPU fallback)."
|
||||
fi
|
||||
fi
|
||||
|
||||
# ── 1. Install Python 3.12 ──────────────────────────────────
|
||||
@ -87,7 +115,7 @@ step "Python 3.12"
|
||||
|
||||
PYTHON_CMD=""
|
||||
# Check various Python 3.12 locations, then fall back to whatever python3 is on PATH
|
||||
for cmd in python3.12 /opt/homebrew/bin/python3.12 /usr/local/bin/python3.12; do
|
||||
for cmd in python3.12 /opt/homebrew/bin/python3.12 /usr/local/bin/python3.12 python3; do
|
||||
if command -v "$cmd" &>/dev/null; then
|
||||
PYTHON_CMD="$cmd"
|
||||
break
|
||||
@ -95,9 +123,15 @@ for cmd in python3.12 /opt/homebrew/bin/python3.12 /usr/local/bin/python3.12; do
|
||||
done
|
||||
|
||||
if [ -z "$PYTHON_CMD" ]; then
|
||||
warn "Python 3.12 not found. Installing via Homebrew..."
|
||||
brew install python@3.12
|
||||
PYTHON_CMD="/opt/homebrew/bin/python3.12"
|
||||
if $IS_MAC; then
|
||||
warn "Python 3.12 not found. Installing via Homebrew..."
|
||||
brew install python@3.12
|
||||
PYTHON_CMD="/opt/homebrew/bin/python3.12"
|
||||
else
|
||||
warn "Python 3.12 not found. Installing via apt..."
|
||||
sudo apt update && sudo apt install -y python3.12 python3.12-venv python3-pip
|
||||
PYTHON_CMD="python3.12"
|
||||
fi
|
||||
fi
|
||||
|
||||
PYTHON_VER=$("$PYTHON_CMD" --version 2>&1)
|
||||
@ -123,7 +157,15 @@ if "$VENV/bin/python" -c "import snac" &>/dev/null; then
|
||||
else
|
||||
echo "Installing packages (this may take a few minutes)..."
|
||||
"$VENV/bin/pip" install -U pip --quiet
|
||||
"$VENV/bin/pip" install -U snac qwen-tts --quiet
|
||||
if $IS_MAC; then
|
||||
# macOS: default PyTorch includes MPS support
|
||||
"$VENV/bin/pip" install -U snac qwen-tts --quiet
|
||||
else
|
||||
# Linux: install PyTorch with CUDA first, then snac/qwen-tts
|
||||
echo "Installing PyTorch with CUDA support..."
|
||||
"$VENV/bin/pip" install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 --quiet
|
||||
"$VENV/bin/pip" install -U snac qwen-tts --quiet
|
||||
fi
|
||||
ok "Packages installed"
|
||||
fi
|
||||
|
||||
@ -144,7 +186,11 @@ step "SNAC 24kHz audio decoder (~76 MB)"
|
||||
mkdir -p "$MODELS_DIR/snac_24khz"
|
||||
|
||||
if [ -f "$MODELS_DIR/snac_24khz/pytorch_model.bin" ]; then
|
||||
SIZE=$(stat -f%z "$MODELS_DIR/snac_24khz/pytorch_model.bin" 2>/dev/null || stat -c%s "$MODELS_DIR/snac_24khz/pytorch_model.bin" 2>/dev/null)
|
||||
if $IS_MAC; then
|
||||
SIZE=$(stat -f%z "$MODELS_DIR/snac_24khz/pytorch_model.bin" 2>/dev/null)
|
||||
else
|
||||
SIZE=$(stat -c%s "$MODELS_DIR/snac_24khz/pytorch_model.bin" 2>/dev/null)
|
||||
fi
|
||||
if [ "$SIZE" -gt 1000000 ]; then
|
||||
ok "SNAC decoder already downloaded ($(echo "scale=1; $SIZE/1048576" | bc) MB)"
|
||||
else
|
||||
@ -247,7 +293,11 @@ du -sh "$MODELS_DIR"/* 2>/dev/null | sed 's/^/ /'
|
||||
echo ""
|
||||
echo "Test commands:"
|
||||
echo " $VENV/bin/python $SCRIPT_DIR/test_orpheus_tts.py"
|
||||
echo " afplay test_orpheus_tara.wav"
|
||||
if $IS_MAC; then
|
||||
echo " afplay test_orpheus_tara.wav"
|
||||
else
|
||||
echo " aplay test_orpheus_tara.wav (or: ffplay -nodisp -autoexit test_orpheus_tara.wav)"
|
||||
fi
|
||||
if [ -d "$QWEN_MODEL_DIR" ]; then
|
||||
echo " $VENV/bin/python $SCRIPT_DIR/test_qwen_tts.py"
|
||||
fi
|
||||
|
||||
@ -13,6 +13,7 @@ Usage:
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
import struct
|
||||
@ -166,7 +167,7 @@ def main():
|
||||
|
||||
# Voices: tara, leah, jess, leo, dan, mia, zac, zoe
|
||||
tests = [
|
||||
("Hello! This is Orpheus text to speech, running entirely on your Mac through Ollama.", "tara"),
|
||||
("Hello! This is Orpheus text to speech, running entirely locally through Ollama.", "tara"),
|
||||
("<laugh> That's amazing! Local AI speech generation without any cloud services!", "leo"),
|
||||
]
|
||||
|
||||
@ -182,7 +183,10 @@ def main():
|
||||
save_wav(audio, sr, outpath)
|
||||
|
||||
print("\n=== Done! Open the .wav files to listen. ===")
|
||||
print("Play with: afplay test_orpheus_tara.wav")
|
||||
if sys.platform == "darwin":
|
||||
print("Play with: afplay test_orpheus_tara.wav")
|
||||
else:
|
||||
print("Play with: aplay test_orpheus_tara.wav (or: ffplay -nodisp -autoexit test_orpheus_tara.wav)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
"""
|
||||
Test Qwen3-TTS 0.6B on Apple Silicon (MPS or CPU fallback).
|
||||
Test Qwen3-TTS 0.6B (CUDA, MPS, or CPU fallback).
|
||||
|
||||
Prerequisites:
|
||||
bash setup-tts.sh (one-shot: installs everything)
|
||||
@ -24,8 +24,12 @@ if not os.path.isdir(MODEL_PATH):
|
||||
print("Run: bash setup-tts.sh (or: bash download-tts-models.sh qwen)")
|
||||
raise SystemExit(1)
|
||||
|
||||
# Pick device: MPS if available, else CPU
|
||||
if torch.backends.mps.is_available():
|
||||
# Pick device: CUDA > MPS > CPU
|
||||
if torch.cuda.is_available():
|
||||
device = "cuda"
|
||||
dtype = torch.float16
|
||||
print(f"Using CUDA ({torch.cuda.get_device_name(0)})")
|
||||
elif torch.backends.mps.is_available():
|
||||
device = "mps"
|
||||
dtype = torch.float32 # MPS doesn't support bfloat16
|
||||
print(f"Using MPS (Apple Metal GPU)")
|
||||
@ -48,7 +52,7 @@ print(f"Supported speakers: {model.get_supported_speakers()}")
|
||||
print(f"Supported languages: {model.get_supported_languages()}")
|
||||
|
||||
# Test 1: English with a built-in speaker
|
||||
text = "Hello! Welcome to the local LLM dashboard. I am Qwen three T T S, running entirely on your Mac."
|
||||
text = f"Hello! Welcome to the local LLM dashboard. I am Qwen three T T S, running entirely on your {'Mac' if device == 'mps' else 'machine'} using {device.upper()}."
|
||||
print(f"\nGenerating speech for: {text[:60]}...")
|
||||
|
||||
t1 = time.time()
|
||||
|
||||
@ -1,372 +1,250 @@
|
||||
# Windows Setup Guide — Local LLM Stack on Razer Blade 18
|
||||
|
||||
> **Hardware:** Razer Blade 18 · Intel Core Ultra 9 275HX · RTX 5090 24 GB GDDR7 · 64 GB DDR5 · 4 TB NVMe
|
||||
> **OS:** Windows 11 Home
|
||||
> **OS:** Windows 11 Home + WSL2 (Ubuntu)
|
||||
> **Goal:** Mirror the macOS `__LOCAL_LLMs` stack — Ollama, Whisper, TTS (Orpheus + Qwen3), Mission Control dashboard
|
||||
> **See also:** [razer-blade-18-spec.md](razer-blade-18-spec.md) for full hardware specs
|
||||
|
||||
---
|
||||
|
||||
## Prerequisites
|
||||
## Architecture: Windows-Native + WSL2
|
||||
|
||||
### 1. Windows Package Manager
|
||||
|
||||
Install **winget** (ships with Windows 11) and optionally **Scoop** for CLI tools:
|
||||
|
||||
```powershell
|
||||
# Verify winget
|
||||
winget --version
|
||||
|
||||
# Install Scoop (optional, useful for dev tools)
|
||||
Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
|
||||
Invoke-RestMethod -Uri https://get.scoop.sh | Invoke-Expression
|
||||
```
|
||||
┌────────────────────────────────────────────────────────┐
|
||||
│ Windows 11 │
|
||||
│ ├── NVIDIA drivers + CUDA (native) │
|
||||
│ ├── Ollama (native Windows service, port 11434) │
|
||||
│ └── Browser → http://localhost:3000 │
|
||||
│ │
|
||||
│ ┌──────────────────────────────────────────────────┐ │
|
||||
│ │ WSL2 (Ubuntu 24.04) │ │
|
||||
│ │ ├── Node.js, Python 3.12, ffmpeg, git │ │
|
||||
│ │ ├── __LOCAL_LLMs/ (cloned here) │ │
|
||||
│ │ │ ├── dashboard/ → npm run dev (port 3000) │ │
|
||||
│ │ │ ├── setup-tts.sh (works as-is) │ │
|
||||
│ │ │ ├── start-dashboard.sh (works as-is) │ │
|
||||
│ │ │ └── models/ (SNAC, Qwen3-TTS) │ │
|
||||
│ │ ├── whisper-cpp (CUDA build) │ │
|
||||
│ │ └── .venv-qwen-tts/ (PyTorch CUDA) │ │
|
||||
│ └──────────────────────────────────────────────────┘ │
|
||||
└────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### 2. NVIDIA CUDA Toolkit
|
||||
**Why WSL2?** All existing bash scripts, Python venvs, and Node.js tooling work identically to macOS — zero porting. The dashboard API routes auto-detect macOS vs Linux at runtime via `process.platform`.
|
||||
|
||||
The RTX 5090 needs the latest CUDA drivers for GPU-accelerated inference.
|
||||
---
|
||||
|
||||
## Phase 1: Windows-Native Setup
|
||||
|
||||
### 1. NVIDIA Drivers
|
||||
|
||||
```powershell
|
||||
# Install NVIDIA drivers (latest Game Ready or Studio)
|
||||
winget install --id Nvidia.GeForceExperience
|
||||
|
||||
# Install CUDA Toolkit (required for PyTorch CUDA)
|
||||
winget install --id Nvidia.CUDA
|
||||
# Or download from: https://developer.nvidia.com/cuda-downloads
|
||||
# Install latest NVIDIA Game Ready or Studio drivers
|
||||
# Download from: https://www.nvidia.com/Download/index.aspx
|
||||
|
||||
# Verify
|
||||
nvidia-smi
|
||||
# Should show: RTX 5090, 24 GB VRAM, CUDA 13.x+
|
||||
```
|
||||
|
||||
Expected output should show:
|
||||
### 2. Ollama (Windows-Native)
|
||||
|
||||
- **RTX 5090** with **24 GB** VRAM
|
||||
- CUDA version 13.x+
|
||||
|
||||
### 3. Node.js (for Mission Control Dashboard)
|
||||
|
||||
```powershell
|
||||
winget install --id OpenJS.NodeJS.LTS
|
||||
# Verify
|
||||
node --version # should be 20.x+
|
||||
npm --version
|
||||
```
|
||||
|
||||
### 4. Python 3.12
|
||||
|
||||
```powershell
|
||||
winget install --id Python.Python.3.12
|
||||
# Verify
|
||||
python --version
|
||||
pip --version
|
||||
```
|
||||
|
||||
### 5. Git
|
||||
|
||||
```powershell
|
||||
winget install --id Git.Git
|
||||
```
|
||||
|
||||
### 6. ffmpeg
|
||||
|
||||
```powershell
|
||||
winget install --id Gyan.FFmpeg
|
||||
# Or: scoop install ffmpeg
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 1. Ollama — LLM Server
|
||||
|
||||
### Install
|
||||
Ollama runs natively on Windows and is accessible from WSL2 at `localhost:11434`.
|
||||
|
||||
```powershell
|
||||
winget install --id Ollama.Ollama
|
||||
```
|
||||
|
||||
Ollama for Windows runs as a background service and automatically uses CUDA (RTX 5090).
|
||||
|
||||
### Verify
|
||||
|
||||
```powershell
|
||||
ollama --version
|
||||
curl http://localhost:11434/api/tags
|
||||
```
|
||||
|
||||
### Download Models
|
||||
|
||||
```powershell
|
||||
# Coding
|
||||
ollama pull qwen2.5-coder:32b # 19 GB — primary coding model
|
||||
ollama pull qwen2.5-coder:7b # 4.7 GB — fast coding
|
||||
|
||||
# Reasoning
|
||||
ollama pull deepseek-r1:32b # 19 GB — chain-of-thought
|
||||
|
||||
# General
|
||||
ollama pull llama3.1:8b # 4.9 GB — fast general tasks
|
||||
|
||||
# TTS
|
||||
ollama pull sematre/orpheus:en # 4 GB — text-to-speech (8 voices)
|
||||
|
||||
# Verify
|
||||
ollama list
|
||||
ollama --version
|
||||
```
|
||||
|
||||
> **Note:** With 24 GB VRAM, Ollama will offload 32B models almost entirely to GPU.
|
||||
> On macOS (48 GB unified), the 32B models run in shared CPU/GPU memory.
|
||||
> On this machine, **GPU inference will be significantly faster** for models that fit in 24 GB VRAM.
|
||||
### 3. Pull Models (from Windows or WSL2)
|
||||
|
||||
### VRAM Budget (RTX 5090 — 24 GB)
|
||||
```bash
|
||||
ollama pull qwen2.5-coder:32b # 19 GB — primary coding model
|
||||
ollama pull qwen2.5-coder:7b # 4.7 GB — fast coding
|
||||
ollama pull deepseek-r1:32b # 19 GB — chain-of-thought
|
||||
ollama pull llama3.1:8b # 4.9 GB — fast general tasks
|
||||
ollama pull sematre/orpheus:en # 4 GB — text-to-speech (8 voices)
|
||||
|
||||
| Model | VRAM Usage | Fits in GPU? |
|
||||
| ---------------------------- | ---------- | ------------ |
|
||||
| llama3.1:8b | ~5 GB | ✅ Fully |
|
||||
| qwen2.5-coder:7b | ~5 GB | ✅ Fully |
|
||||
| sematre/orpheus:en | ~4 GB | ✅ Fully |
|
||||
| qwen2.5-coder:32b | ~19 GB | ✅ Fully |
|
||||
| deepseek-r1:32b | ~19 GB | ✅ Fully |
|
||||
| Two 7B models simultaneously | ~10 GB | ✅ Both fit |
|
||||
ollama list # verify all 5 models
|
||||
```
|
||||
|
||||
### 4. Install WSL2
|
||||
|
||||
```powershell
|
||||
# From PowerShell (Admin)
|
||||
wsl --install -d Ubuntu-24.04
|
||||
# Reboot if prompted, then set up username/password
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2. Whisper.cpp — Speech-to-Text
|
||||
## Phase 2: WSL2 Setup
|
||||
|
||||
### Option A: Pre-built Binary (Recommended)
|
||||
### 1. Install Dependencies
|
||||
|
||||
Download the latest release from GitHub:
|
||||
```bash
|
||||
# Update
|
||||
sudo apt update && sudo apt upgrade -y
|
||||
|
||||
```powershell
|
||||
# Create whisper directory
|
||||
mkdir "$env:USERPROFILE\whisper-cpp"
|
||||
cd "$env:USERPROFILE\whisper-cpp"
|
||||
# Node.js 20 LTS
|
||||
curl -fsSL https://deb.nodesource.com/setup_20.x | sudo -E bash -
|
||||
sudo apt install -y nodejs
|
||||
|
||||
# Download latest release (CUDA build)
|
||||
# Check: https://github.com/ggerganov/whisper.cpp/releases
|
||||
# Look for: whisper-cublas-bin-x64.zip or whisper-cuda-bin-x64.zip
|
||||
# Python 3.12
|
||||
sudo apt install -y python3.12 python3.12-venv python3-pip
|
||||
|
||||
# Build tools + ffmpeg
|
||||
sudo apt install -y ffmpeg git curl build-essential cmake
|
||||
|
||||
# Verify
|
||||
node --version # 20.x+
|
||||
python3.12 --version
|
||||
nvidia-smi # should show RTX 5090 (GPU passthrough from Windows)
|
||||
```
|
||||
|
||||
### Option B: Build from Source (CUDA)
|
||||
> **Important:** Do NOT install NVIDIA drivers inside WSL2. The Windows-side driver handles GPU passthrough automatically.
|
||||
|
||||
```powershell
|
||||
### 2. Clone Repo
|
||||
|
||||
```bash
|
||||
mkdir -p ~/code/mygh && cd ~/code/mygh
|
||||
git clone https://github.com/saravanakumardb1/learning_ai_common_plat.git
|
||||
cd learning_ai_common_plat/__LOCAL_LLMs
|
||||
```
|
||||
|
||||
> **Performance note:** Always clone inside WSL2 filesystem (`~/code/...`), NOT in `/mnt/c/` — the Windows filesystem bridge is very slow for `node_modules`.
|
||||
|
||||
### 3. Whisper.cpp (CUDA build)
|
||||
|
||||
```bash
|
||||
cd ~
|
||||
git clone https://github.com/ggerganov/whisper.cpp.git
|
||||
cd whisper.cpp
|
||||
cmake -B build -DGGML_CUDA=ON
|
||||
cmake --build build --config Release
|
||||
```
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
sudo cp build/bin/whisper-cli /usr/local/bin/
|
||||
|
||||
### Download Whisper Model
|
||||
|
||||
```powershell
|
||||
mkdir "$env:USERPROFILE\whisper-models"
|
||||
|
||||
# Download ggml-large-v3-turbo (1.5 GB)
|
||||
curl -L -o "$env:USERPROFILE\whisper-models\ggml-large-v3-turbo.bin" `
|
||||
# Download model (1.5 GB)
|
||||
mkdir -p ~/whisper-models
|
||||
curl -L -o ~/whisper-models/ggml-large-v3-turbo.bin \
|
||||
"https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo.bin"
|
||||
|
||||
# Verify
|
||||
whisper-cli --version
|
||||
```
|
||||
|
||||
> **No corporate proxy on this machine** — download directly from `huggingface.co`.
|
||||
> The `hf-mirror.com` workaround is only needed on the corporate MacBook.
|
||||
|
||||
### Verify
|
||||
### 4. TTS Setup (One-Shot)
|
||||
|
||||
```powershell
|
||||
# Test transcription
|
||||
whisper-cli -m "$env:USERPROFILE\whisper-models\ggml-large-v3-turbo.bin" -f test.wav
|
||||
```bash
|
||||
cd ~/code/mygh/learning_ai_common_plat/__LOCAL_LLMs
|
||||
|
||||
# Works exactly like macOS — downloads SNAC, Qwen3-TTS, creates venv
|
||||
bash setup-tts.sh
|
||||
```
|
||||
|
||||
The script detects macOS vs Linux and installs the correct PyTorch variant (MPS on macOS, CUDA on Linux). On a personal machine, override the default HuggingFace mirror: `HF_MIRROR=https://huggingface.co bash setup-tts.sh`
|
||||
|
||||
### 5. Start Dashboard
|
||||
|
||||
```bash
|
||||
bash start-dashboard.sh
|
||||
# Open http://localhost:3000 in a browser on the Windows side
|
||||
```
|
||||
|
||||
WSL2 automatically forwards ports — the dashboard is accessible from Windows at `localhost:3000`.
|
||||
|
||||
---
|
||||
|
||||
## 3. TTS — Orpheus + Qwen3-TTS
|
||||
## Key Differences: macOS vs WSL2
|
||||
|
||||
### 3a. Orpheus TTS (via Ollama)
|
||||
|
||||
Already handled in Step 1 (`ollama pull sematre/orpheus:en`).
|
||||
|
||||
### 3b. SNAC Decoder
|
||||
|
||||
```powershell
|
||||
# Create models directory (match macOS layout)
|
||||
$MODELS = "$PSScriptRoot\models" # or wherever you clone the repo
|
||||
mkdir "$MODELS\snac_24khz" -Force
|
||||
|
||||
# Download SNAC decoder
|
||||
curl -L -o "$MODELS\snac_24khz\config.json" `
|
||||
"https://huggingface.co/hubertsiuzdak/snac_24khz/resolve/main/config.json"
|
||||
curl -L -o "$MODELS\snac_24khz\pytorch_model.bin" `
|
||||
"https://huggingface.co/hubertsiuzdak/snac_24khz/resolve/main/pytorch_model.bin"
|
||||
```
|
||||
|
||||
### 3c. Python Venv + Dependencies
|
||||
|
||||
```powershell
|
||||
cd __LOCAL_LLMs
|
||||
|
||||
# Create venv
|
||||
python -m venv .venv-qwen-tts
|
||||
|
||||
# Activate (Windows uses Scripts, not bin)
|
||||
.\.venv-qwen-tts\Scripts\Activate.ps1
|
||||
|
||||
# Install PyTorch with CUDA (NOT MPS — that's Apple only)
|
||||
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
|
||||
|
||||
# Install other deps
|
||||
pip install snac numpy soundfile
|
||||
|
||||
# Verify CUDA
|
||||
python -c "import torch; print(f'CUDA: {torch.cuda.is_available()}, Device: {torch.cuda.get_device_name(0)}')"
|
||||
# Expected: CUDA: True, Device: NVIDIA GeForce RTX 5090 Laptop GPU
|
||||
```
|
||||
|
||||
### 3d. Qwen3-TTS 0.6B
|
||||
|
||||
```powershell
|
||||
$MODELS = ".\models"
|
||||
|
||||
# Tokenizer (~650 MB)
|
||||
mkdir "$MODELS\Qwen3-TTS-Tokenizer-12Hz" -Force
|
||||
foreach ($f in @("config.json", "configuration.json", "preprocessor_config.json")) {
|
||||
curl -L -o "$MODELS\Qwen3-TTS-Tokenizer-12Hz\$f" `
|
||||
"https://huggingface.co/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/$f"
|
||||
}
|
||||
curl -L -o "$MODELS\Qwen3-TTS-Tokenizer-12Hz\model.safetensors" `
|
||||
"https://huggingface.co/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/model.safetensors"
|
||||
|
||||
# Model weights (~1.8 GB)
|
||||
mkdir "$MODELS\Qwen3-TTS-12Hz-0.6B-CustomVoice" -Force
|
||||
foreach ($f in @("config.json", "generation_config.json")) {
|
||||
curl -L -o "$MODELS\Qwen3-TTS-12Hz-0.6B-CustomVoice\$f" `
|
||||
"https://huggingface.co/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/$f"
|
||||
}
|
||||
curl -L -o "$MODELS\Qwen3-TTS-12Hz-0.6B-CustomVoice\model.safetensors" `
|
||||
"https://huggingface.co/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/model.safetensors"
|
||||
```
|
||||
|
||||
### 3e. Test TTS
|
||||
|
||||
```powershell
|
||||
# Activate venv
|
||||
.\.venv-qwen-tts\Scripts\Activate.ps1
|
||||
|
||||
# Orpheus TTS test
|
||||
python test_orpheus_tts.py
|
||||
|
||||
# Qwen3-TTS test
|
||||
python test_qwen_tts.py
|
||||
```
|
||||
|
||||
> **Key difference from macOS:** Qwen3-TTS will use **CUDA** instead of MPS.
|
||||
> In `test_qwen_tts.py`, the MPS-only device selection does not fall through to CUDA automatically; it falls back to CPU
|
||||
> since `torch.backends.mps.is_available()` returns False on Windows.
|
||||
> You may want to update the device logic to prefer CUDA:
|
||||
>
|
||||
> ```python
|
||||
> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
> ```
|
||||
|
||||
---
|
||||
|
||||
## 4. Mission Control Dashboard
|
||||
|
||||
```powershell
|
||||
cd __LOCAL_LLMs\dashboard
|
||||
|
||||
# Install dependencies
|
||||
npm install
|
||||
|
||||
# Start dev server
|
||||
npm run dev
|
||||
# Open http://localhost:3000
|
||||
```
|
||||
|
||||
The dashboard is pure Next.js — works identically on Windows. The API routes auto-detect:
|
||||
|
||||
- **Ollama** at `localhost:11434`
|
||||
- **Whisper** models in `%USERPROFILE%\whisper-models\`
|
||||
- **TTS** engines (Orpheus, Qwen3-TTS) and Python venv
|
||||
|
||||
### Start Script (PowerShell)
|
||||
|
||||
There is no PowerShell start script yet, so start the stack manually with the equivalent commands:
|
||||
|
||||
```powershell
|
||||
# Quick start (manual)
|
||||
ollama serve # if not already running as service
|
||||
cd __LOCAL_LLMs\dashboard
|
||||
npm run dev
|
||||
```
|
||||
|
||||
> TODO: Create `start-dashboard.ps1` as a PowerShell equivalent of `start-dashboard.sh`
|
||||
|
||||
---
|
||||
|
||||
## 5. Key Differences: macOS vs Windows
|
||||
|
||||
| Area | macOS (M4 Pro 48 GB) | Windows (Razer Blade 18) |
|
||||
| ------------------- | ----------------------------------- | ------------------------------------- |
|
||||
| **GPU** | Apple Silicon (unified memory, MPS) | RTX 5090 (24 GB VRAM, CUDA) |
|
||||
| **Ollama GPU** | Automatic (Metal) | Automatic (CUDA) |
|
||||
| **VRAM** | Shared from 48 GB RAM | Dedicated 24 GB GDDR7 |
|
||||
| **PyTorch device** | `mps` | `cuda` |
|
||||
| **Whisper install** | `brew install whisper-cpp` | Build from source or download release |
|
||||
| **Python venv** | `bin/activate` | `Scripts\Activate.ps1` |
|
||||
| **Package manager** | Homebrew | winget / scoop |
|
||||
| **Shell** | zsh / bash | PowerShell / cmd |
|
||||
| **Scripts** | `.sh` (bash) | `.ps1` (PowerShell) |
|
||||
| **Model download** | `hf-mirror.com` (corporate proxy) | `huggingface.co` (no proxy) |
|
||||
| **Dashboard** | Identical | Identical |
|
||||
| **Ollama models** | Identical | Identical |
|
||||
| Area | macOS (any Mac) | WSL2 (any Linux) |
|
||||
| ---------------------- | --------------------------- | -------------------------------------- |
|
||||
| **GPU** | Apple Silicon (MPS) | NVIDIA (CUDA) |
|
||||
| **Ollama** | macOS native (Metal) | Windows native, accessed via localhost |
|
||||
| **PyTorch device** | `mps` | `cuda` |
|
||||
| **Whisper install** | `brew install whisper-cpp` | Build from source with CUDA |
|
||||
| **Package manager** | Homebrew | apt |
|
||||
| **Shell scripts** | Work as-is | Work as-is |
|
||||
| **Python venv path** | `bin/python` | `bin/python` (same) |
|
||||
| **Dashboard** | Identical | Identical |
|
||||
| **Ollama models path** | `~/.ollama/models/` | Windows `%USERPROFILE%\.ollama\models\` |
|
||||
| **Model download** | `hf-mirror.com` (corporate) | `huggingface.co` (direct) |
|
||||
|
||||
### Performance Expectations
|
||||
|
||||
| Workload | macOS M4 Pro 48 GB | Razer RTX 5090 24 GB |
|
||||
| --------------------------- | ---------------------------- | ------------------------- |
|
||||
| qwen2.5-coder:32b inference | ~15–25 tok/s (MPS/CPU blend) | ~40–60 tok/s (full CUDA) |
|
||||
| Whisper large-v3-turbo | ~2–4x realtime (CPU) | ~8–15x realtime (CUDA) |
|
||||
| Orpheus TTS | ~realtime (CPU decode) | ~2–3x realtime (CUDA) |
|
||||
| Qwen3-TTS | ~realtime (MPS) | ~2–4x realtime (CUDA) |
|
||||
| 70B quantized models | Fits in 48 GB (slow) | Partially offloads to RAM |
|
||||
| Workload | macOS M4 Pro 48 GB | Razer RTX 5090 24 GB |
|
||||
| --------------------------- | -------------------- | ------------------------------- |
|
||||
| qwen2.5-coder:32b inference | ~15–25 tok/s | ~40–60 tok/s |
|
||||
| Whisper large-v3-turbo | ~2–4x realtime | ~8–15x realtime |
|
||||
| Orpheus TTS | ~realtime | ~2–3x realtime |
|
||||
| Qwen3-TTS | ~realtime (MPS) | ~2–4x realtime (CUDA) |
|
||||
| 70B quantized models | Fits in 48 GB (slow) | Partially offloads to 64 GB RAM |
|
||||
|
||||
### VRAM Budget (RTX 5090 — 24 GB)
|
||||
|
||||
| Model | VRAM Usage | Fits in GPU? |
|
||||
| ------------------ | ---------- | ------------ |
|
||||
| llama3.1:8b | ~5 GB | ✅ Fully |
|
||||
| qwen2.5-coder:7b | ~5 GB | ✅ Fully |
|
||||
| sematre/orpheus:en | ~4 GB | ✅ Fully |
|
||||
| qwen2.5-coder:32b | ~19 GB | ✅ Fully |
|
||||
| deepseek-r1:32b | ~19 GB | ✅ Fully |
|
||||
|
||||
---
|
||||
|
||||
## 6. File Layout (Same as macOS)
|
||||
## Quick Reference — Full Setup Checklist
|
||||
|
||||
### Windows Side
|
||||
|
||||
```
|
||||
__LOCAL_LLMs/
|
||||
├── dashboard/ ← Mission Control (port 3000) — works as-is
|
||||
├── models/ ← TTS model weights (gitignored)
|
||||
│ ├── snac_24khz/
|
||||
│ ├── Qwen3-TTS-Tokenizer-12Hz/
|
||||
│ └── Qwen3-TTS-12Hz-0.6B-CustomVoice/
|
||||
├── .venv-qwen-tts/ ← Python venv (Scripts\ on Windows)
|
||||
├── test_orpheus_tts.py ← works as-is (device fallback)
|
||||
├── test_qwen_tts.py ← update device to prefer CUDA
|
||||
├── windows_specific/
|
||||
│ ├── razer-blade-18-spec.md ← hardware spec
|
||||
│ └── setup-guide.md ← this file
|
||||
└── docs/ ← macOS-focused docs (still useful as reference)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 7. Quick Reference — Full Setup Checklist
|
||||
|
||||
```
|
||||
[ ] Install NVIDIA drivers + CUDA Toolkit
|
||||
[ ] Install NVIDIA drivers (Game Ready or Studio)
|
||||
[ ] Install Ollama (winget install Ollama.Ollama)
|
||||
[ ] Pull models: qwen2.5-coder:32b, deepseek-r1:32b, llama3.1:8b, orpheus
|
||||
[ ] Install Node.js 20+ (winget)
|
||||
[ ] Install Python 3.12 (winget)
|
||||
[ ] Install Git (winget)
|
||||
[ ] Install ffmpeg (winget)
|
||||
[ ] Clone repo
|
||||
[ ] Download Whisper model to %USERPROFILE%\whisper-models\
|
||||
[ ] Build or download whisper-cpp with CUDA
|
||||
[ ] Create Python venv + install PyTorch CUDA + snac
|
||||
[ ] Download SNAC decoder
|
||||
[ ] Download Qwen3-TTS tokenizer + model
|
||||
[ ] npm install in dashboard/
|
||||
[ ] Run dashboard: npm run dev
|
||||
[ ] Pull all 5 models
|
||||
[ ] Install WSL2 (wsl --install -d Ubuntu-24.04)
|
||||
```
|
||||
|
||||
### WSL2 Side
|
||||
|
||||
```
|
||||
[ ] Install Node.js 20+, Python 3.12, ffmpeg, git, cmake
|
||||
[ ] Verify nvidia-smi shows RTX 5090
|
||||
[ ] Clone repo into ~/code/mygh/
|
||||
[ ] Build whisper-cpp with CUDA
|
||||
[ ] Download Whisper model to ~/whisper-models/
|
||||
[ ] Run: bash setup-tts.sh
|
||||
[ ] Run: bash start-dashboard.sh
|
||||
[ ] Verify: http://localhost:3000 shows all green
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Ollama not accessible from WSL2
|
||||
|
||||
```bash
|
||||
curl http://localhost:11434/api/tags
|
||||
# If this fails, check the Windows firewall or try:
|
||||
curl http://$(hostname).local:11434/api/tags
|
||||
```
|
||||
|
||||
### CUDA not visible in WSL2
|
||||
|
||||
```bash
|
||||
nvidia-smi
|
||||
# If "command not found":
|
||||
# 1. Update Windows NVIDIA drivers to latest
|
||||
# 2. Run: wsl --update
|
||||
# 3. Do NOT install nvidia-driver-* inside WSL2
|
||||
```
|
||||
|
||||
### Slow filesystem performance
|
||||
|
||||
```bash
|
||||
# Clone repos inside WSL2 filesystem: ~/code/...
|
||||
# NOT in /mnt/c/ (Windows→WSL bridge is ~10x slower for node_modules)
|
||||
```
|
||||
|
||||
Loading…
Reference in New Issue
Block a user