diff --git a/.gitignore b/.gitignore index f6b6f99e..d7696fdf 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,8 @@ coverage/ *.key kv.txt kv_azure.txt + +# Local LLM models & venvs +__LOCAL_LLMs/models/ +__LOCAL_LLMs/.venv-*/ +__LOCAL_LLMs/*.wav diff --git a/__LOCAL_LLMs/dashboard/src/app/(mission-control)/mission-control/components/MemoryDrilldown.tsx b/__LOCAL_LLMs/dashboard/src/app/(mission-control)/mission-control/components/MemoryDrilldown.tsx new file mode 100644 index 00000000..7682bfc2 --- /dev/null +++ b/__LOCAL_LLMs/dashboard/src/app/(mission-control)/mission-control/components/MemoryDrilldown.tsx @@ -0,0 +1,267 @@ +'use client'; + +import { useState, useEffect } from 'react'; +import { RefreshCw, Cpu, HardDrive, Archive, Layers, Zap } from 'lucide-react'; +import { formatBytes } from '../../../lib/format'; +import { ProgressBar } from '../../../components/ProgressBar'; + +interface VmCategory { + active: number; + wired: number; + compressor: number; + inactive: number; + purgeable: number; + speculative: number; + free: number; +} + +interface GroupedProcess { + name: string; + rss: number; + pctMem: number; + count: number; + pids: number[]; +} + +interface MemoryDrilldownData { + totalRam: number; + categories: VmCategory; + processes: GroupedProcess[]; +} + +const CATEGORY_META: Record< + keyof VmCategory, + { label: string; color: string; description: string } +> = { + active: { + label: 'Active', + color: 'var(--accent-primary)', + description: 'Pages recently used by apps', + }, + wired: { + label: 'Wired', + color: 'var(--danger)', + description: 'Kernel & drivers — cannot be paged out', + }, + compressor: { + label: 'Compressed', + color: 'var(--warning)', + description: 'Pages compressed to save RAM (still counts as used)', + }, + inactive: { + label: 'Inactive', + color: 'var(--accent-secondary)', + description: 'Recently freed — reclaimable on demand', + }, + purgeable: { + label: 'Purgeable', + color: 'var(--purple)', + description: 
'Cache that macOS can discard immediately', + }, + speculative: { + label: 'Speculative', + color: 'var(--text-tertiary)', + description: 'Pre-fetched pages — reclaimable', + }, + free: { + label: 'Free', + color: 'var(--success)', + description: 'Unused pages — immediately available', + }, +}; + +export function MemoryDrilldown() { + const [data, setData] = useState(null); + const [loading, setLoading] = useState(true); + + const fetchData = async () => { + setLoading(true); + try { + const res = await fetch('/api/system/memory'); + if (res.ok) setData(await res.json()); + } catch { + // ignore + } + setLoading(false); + }; + + useEffect(() => { + fetchData(); + }, []); + + if (loading && !data) { + return ( +
+ +
+ ); + } + if (!data) return null; + + const total = data.totalRam; + const cats = data.categories; + const appMemory = cats.active + cats.wired + cats.compressor; + + return ( +
+ {/* Category breakdown header */} +
+ + Memory Categories (vm_stat) + + +
+ + {/* Stacked bar */} +
+ {(Object.keys(CATEGORY_META) as (keyof VmCategory)[]).map(key => { + const bytes = cats[key]; + const pct = (bytes / total) * 100; + if (pct < 0.3) return null; + const meta = CATEGORY_META[key]; + return ( +
+ {pct > 6 ? meta.label : ''} +
+ ); + })} +
+ + {/* Legend grid */} +
+ {(Object.keys(CATEGORY_META) as (keyof VmCategory)[]).map(key => { + const bytes = cats[key]; + const pct = (bytes / total) * 100; + const meta = CATEGORY_META[key]; + const isApp = key === 'active' || key === 'wired' || key === 'compressor'; + return ( +
+
+ + + {meta.label} + +
+ + {formatBytes(bytes)} + ({pct.toFixed(1)}%) + +
+ ); + })} +
+ + {/* Summary line */} +
+ + App memory (active + wired + compressed) + + + {formatBytes(appMemory)} + +
+ + {/* Top processes */} +
+ + Top Processes by Memory + +
+
+ {data.processes.slice(0, 15).map((proc, i) => { + const pct = (proc.rss / total) * 100; + const isOllama = proc.name.toLowerCase().includes('ollama'); + const isNode = + proc.name.toLowerCase().includes('node') || proc.name.toLowerCase().includes('next'); + return ( +
+
+
+ {isOllama ? ( + + ) : isNode ? ( + + ) : ( + + )} + + {proc.name} + {proc.count > 1 && ( + ×{proc.count} + )} + +
+ + {formatBytes(proc.rss)} + ({pct.toFixed(1)}%) + +
+
+
+
+
+ ); + })} +
+
+ ); +} diff --git a/__LOCAL_LLMs/dashboard/src/app/(mission-control)/mission-control/page.tsx b/__LOCAL_LLMs/dashboard/src/app/(mission-control)/mission-control/page.tsx index b9ae8913..a8eeae33 100644 --- a/__LOCAL_LLMs/dashboard/src/app/(mission-control)/mission-control/page.tsx +++ b/__LOCAL_LLMs/dashboard/src/app/(mission-control)/mission-control/page.tsx @@ -36,6 +36,7 @@ import { Star, MessageSquare, Settings, + Volume2, } from 'lucide-react'; import type { OllamaData, @@ -57,6 +58,7 @@ import { ProgressBar } from '../../components/ProgressBar'; import { Sparkline } from '../../components/Sparkline'; import { RamBudgetBar } from './components/RamBudgetBar'; import { MarkdownResponse } from './components/MarkdownResponse'; +import { MemoryDrilldown } from './components/MemoryDrilldown'; export default function Dashboard() { const [ollama, setOllama] = useState(null); @@ -129,6 +131,19 @@ export default function Dashboard() { >([]); const [showInferenceLog, setShowInferenceLog] = useState(false); const [inferenceSearch, setInferenceSearch] = useState(''); + const [showMemoryDrilldown, setShowMemoryDrilldown] = useState(false); + const [ttsData, setTtsData] = useState<{ + engines: Array<{ + name: string; + type: 'ollama' | 'python'; + status: 'ready' | 'partial' | 'missing'; + model: string; + size?: string; + voices?: string[]; + details: string; + }>; + venv: { exists: boolean; packages?: string[] }; + } | null>(null); const responseRef = useRef(null); const abortRef = useRef(null); const compareAbortRef = useRef(null); @@ -158,6 +173,13 @@ export default function Dashboard() { setMemoryHistory(prev => [...prev.slice(-29), sRes.value.memory.appMemory]); } } + // TTS engine status + try { + const tRes = await fetch('/api/tts'); + if (tRes.ok) setTtsData(await tRes.json()); + } catch { + /* ignore */ + } // F15: Check extraction service health via server-side proxy (avoids browser CORS/console errors) try { const eRes = await fetch('/api/extraction/health'); @@ 
-1143,21 +1165,33 @@ export default function Dashboard() {

-
+
setShowMemoryDrilldown(prev => !prev)} + style={{ + outline: showMemoryDrilldown ? '2px solid var(--warning)' : 'none', + outlineOffset: '-1px', + }} + title="Click to see memory drilldown" + >
MEMORY + + {showMemoryDrilldown ? '▲ hide' : '▼ drilldown'} +
{formatBytes(system?.memory.appMemory || 0)} - / {formatBytes(system?.memory.total || 0)} + used / {formatBytes(system?.memory.total || 0)} -

- {formatBytes(system?.memory.cached || 0)} cached (reclaimable) +

+ {formatBytes((system?.memory.free || 0) + (system?.memory.cached || 0) * 0.9)}{' '} + available for models

+ {/* Memory Drilldown Panel */} + {showMemoryDrilldown && ( +
+

+ + Memory Drilldown +

+ +
+ )} + {/* Main Grid */}
{/* Ollama Models — 2 cols */} @@ -1351,7 +1396,7 @@ export default function Dashboard() { totalRam={system.memory.total} appMemory={system.memory.appMemory} runningModels={ollama.running} - freeRam={system.memory.free} + freeRam={system.memory.free + system.memory.cached} /> )} {ollama.models @@ -1456,20 +1501,36 @@ export default function Dashboard() { )}
+ {/* Metrics row */}
- {formatBytes(model.size)} + + + {formatBytes(model.size)} + {model.details?.parameter_size && ( - {model.details.parameter_size} + + + {model.details.parameter_size} + )} {model.details?.quantization_level && ( - {model.details.quantization_level} + + {model.details.quantization_level} + )} - - ~{formatBytes(estRam)} RAM - {(() => { const ctx = modelMetadata[model.name]?.contextLength; return ctx ? ( @@ -1486,7 +1547,86 @@ export default function Dashboard() { ~{modelBenchmarks[model.name].tokPerSec.toFixed(1)} tok/s )} + {(() => { + const ps = parseFloat(model.details?.parameter_size || '0'); + const tier = + ps <= 3 + ? { label: 'Tiny · Instant', color: 'var(--success)' } + : ps <= 8 + ? { label: 'Small · Fast', color: 'var(--accent-secondary)' } + : ps <= 14 + ? { label: 'Medium', color: 'var(--accent-primary)' } + : ps <= 34 + ? { label: 'Large · Slow', color: 'var(--warning)' } + : { label: 'XL · Very Slow', color: 'var(--danger)' }; + return ( + + {tier.label} + + ); + })()}
+ {/* Memory fit — only for non-running models */} + {!running && + system && + (() => { + const avail = system.memory.free + system.memory.cached * 0.9; + const gap = avail - estRam; + const fitColor = + fitStatus === 'fits' + ? 'var(--success)' + : fitStatus === 'tight' + ? 'var(--warning)' + : 'var(--danger)'; + return ( +
+
+ + Needs ~{formatBytes(estRam)} · {formatBytes(avail)}{' '} + available + + + {fitStatus === 'fits' + ? `✓ ${formatBytes(gap)} to spare` + : fitStatus === 'tight' + ? `⚠ Tight — ${formatBytes(gap)} to spare` + : `✗ ${formatBytes(Math.abs(gap))} short`} + +
+
+
+
+
+ ); + })()} {running && (() => { const rm = ollama?.running.find(r => r.name === model.name); @@ -1547,26 +1687,6 @@ export default function Dashboard() { ) : (
- {fitStatus && !running && ( - - )}
- - {formatBytes(system?.memory.free || 0)} avail + + {formatBytes((system?.memory.free || 0) + (system?.memory.cached || 0) * 0.9)}{' '} + avail
- App: {formatBytes(system?.memory.appMemory || 0)} - Cache: {formatBytes(system?.memory.cached || 0)} + Used: {formatBytes(system?.memory.appMemory || 0)} + Total: {formatBytes(system?.memory.total || 0)}
@@ -2024,6 +2145,116 @@ export default function Dashboard() { )}
+ {/* Speech — TTS Engines */} +
+

+ + Speech (TTS) +

+ {ttsData ? ( +
+ {ttsData.engines.map(engine => ( +
+
+
+ + {engine.name} + + {engine.type === 'ollama' ? 'Ollama' : 'Python'} + +
+ {engine.size && ( + + {engine.size} + + )} +
+

+ {engine.model} +

+

+ {engine.details} +

+ {engine.voices && engine.status === 'ready' && ( +
+ {engine.voices.map(v => ( + + {v} + + ))} +
+ )} +
+ ))} + {/* Venv status */} +
+ Python venv + + {ttsData.venv.exists ? ( + <>✓ {ttsData.venv.packages?.join(' · ') || 'installed'} + ) : ( + 'Not found — run setup-tts.sh' + )} + +
+
+ ) : ( +
+

+ Loading TTS status... +

+
+ )} +
+ {/* Extraction Service (F15) */}

diff --git a/__LOCAL_LLMs/dashboard/src/app/api/system/memory/route.ts b/__LOCAL_LLMs/dashboard/src/app/api/system/memory/route.ts
new file mode 100644
index 00000000..69e4409a
--- /dev/null
+++ b/__LOCAL_LLMs/dashboard/src/app/api/system/memory/route.ts
@@ -0,0 +1,176 @@
+import { NextResponse } from 'next/server';
+import { exec } from 'child_process';
+import { promisify } from 'util';
+import os from 'os';
+
+const execAsync = promisify(exec);
+
+/** Page size (bytes) assumed when the vm_stat header cannot be parsed. */
+const DEFAULT_PAGE_SIZE = 16384;
+/** Number of individual processes sampled from ps before grouping by name. */
+const PROCESS_SAMPLE_LIMIT = 500;
+/** Number of grouped process entries returned to the client. */
+const MAX_PROCESS_GROUPS = 25;
+
+/** One process row parsed from `ps` output. */
+interface ProcessInfo {
+  pid: number;
+  name: string;
+  rss: number; // bytes
+  pctMem: number;
+  user: string;
+}
+
+/** Byte counts per vm_stat category, plus the page size used to compute them. */
+interface VmStatBreakdown {
+  active: number;
+  wired: number;
+  compressor: number;
+  inactive: number;
+  purgeable: number;
+  speculative: number;
+  free: number;
+  pageSize: number;
+}
+
+/** Running totals for all sampled processes that share one name. */
+interface ProcessGroup {
+  rss: number;
+  pctMem: number;
+  count: number;
+  pids: number[];
+}
+
+/**
+ * Lists the processes with the largest resident set size (RSS).
+ *
+ * Shells out to `ps | sort | head` with a 3 s timeout. Best-effort: any
+ * failure yields an empty list.
+ *
+ * @param limit Maximum number of processes to return; values that are not
+ *   positive integers fall back to 20.
+ * @returns Processes with RSS converted from KB to bytes.
+ */
+async function getTopProcesses(limit = 20): Promise<ProcessInfo[]> {
+  // `limit` is interpolated into a shell command, so only accept a positive integer.
+  const count = Number.isInteger(limit) && limit > 0 ? limit : 20;
+  try {
+    // ps with RSS in KB, sorted descending by RSS
+    const { stdout } = await execAsync(
+      `ps -axo pid=,rss=,%mem=,user=,comm= | sort -k2 -rn | head -${count}`,
+      { timeout: 3000 }
+    );
+    return stdout
+      .trim()
+      .split('\n')
+      .filter(Boolean)
+      .map(line => {
+        const parts = line.trim().split(/\s+/);
+        // comm can have spaces/slashes — take everything after user
+        const rawName = parts.slice(4).join(' ');
+        return {
+          pid: parseInt(parts[0], 10),
+          // Extract just the process name from the full path
+          name: rawName.split('/').pop() || rawName,
+          rss: parseInt(parts[1], 10) * 1024,
+          pctMem: parseFloat(parts[2]),
+          user: parts[3],
+        };
+      })
+      // Also drops rows whose RSS did not parse (NaN > 0 is false).
+      .filter(p => p.rss > 0);
+  } catch {
+    return [];
+  }
+}
+
+/**
+ * Parses `vm_stat` output into byte counts per memory category.
+ *
+ * Page counts are multiplied by the page size from the vm_stat header, or by
+ * DEFAULT_PAGE_SIZE when the header is not matched. Best-effort: if vm_stat
+ * cannot be run within 2 s, every category is reported as 0.
+ */
+async function getVmStatBreakdown(): Promise<VmStatBreakdown> {
+  try {
+    const { stdout } = await execAsync('vm_stat', { timeout: 2000 });
+    const pageSizeMatch = stdout.match(/page size of (\d+) bytes/);
+    const pageSize = pageSizeMatch ? parseInt(pageSizeMatch[1], 10) : DEFAULT_PAGE_SIZE;
+    // Looks up "<label>: <pages>" and converts the page count to bytes.
+    const parse = (label: string): number => {
+      const match = stdout.match(new RegExp(`${label}:\\s+(\\d+)`));
+      return match ? parseInt(match[1], 10) * pageSize : 0;
+    };
+    return {
+      active: parse('Pages active'),
+      wired: parse('Pages wired down'),
+      compressor: parse('Pages occupied by compressor'),
+      inactive: parse('Pages inactive'),
+      purgeable: parse('Pages purgeable'),
+      speculative: parse('Pages speculative'),
+      free: parse('Pages free'),
+      pageSize,
+    };
+  } catch {
+    return {
+      active: 0,
+      wired: 0,
+      compressor: 0,
+      inactive: 0,
+      purgeable: 0,
+      speculative: 0,
+      free: 0,
+      pageSize: DEFAULT_PAGE_SIZE,
+    };
+  }
+}
+
+/**
+ * GET handler: reports total RAM, the vm_stat category breakdown, and the
+ * largest process groups by summed RSS.
+ */
+export async function GET() {
+  const [processes, vmstat] = await Promise.all([
+    getTopProcesses(PROCESS_SAMPLE_LIMIT),
+    getVmStatBreakdown(),
+  ]);
+
+  // Group by process name and sum RSS (e.g. multiple Chrome helpers).
+  // Grouping runs over the wide sample and is truncated afterwards, so helpers
+  // that are individually small still count toward their group's total.
+  // A Map keeps names such as "constructor" from colliding with Object.prototype.
+  const grouped = new Map<string, ProcessGroup>();
+  for (const p of processes) {
+    const group: ProcessGroup = grouped.get(p.name) ?? { rss: 0, pctMem: 0, count: 0, pids: [] };
+    group.rss += p.rss;
+    group.pctMem += p.pctMem;
+    group.count += 1;
+    group.pids.push(p.pid);
+    grouped.set(p.name, group);
+  }
+
+  const groupedProcesses = Array.from(grouped, ([name, info]) => ({
+    name,
+    rss: info.rss,
+    pctMem: Math.round(info.pctMem * 10) / 10,
+    count: info.count,
+    pids: info.pids,
+  }))
+    .sort((a, b) => b.rss - a.rss)
+    .slice(0, MAX_PROCESS_GROUPS);
+
+  return NextResponse.json({
+    totalRam: os.totalmem(),
+    vmstat,
+    categories: {
+      active: vmstat.active,
+      wired: vmstat.wired,
+      compressor: vmstat.compressor,
+      inactive: vmstat.inactive,
+      purgeable: vmstat.purgeable,
+      speculative: vmstat.speculative,
+      free: vmstat.free,
+    },
+    processes: groupedProcesses,
+  });
+}
diff --git a/__LOCAL_LLMs/dashboard/src/app/api/system/route.ts b/__LOCAL_LLMs/dashboard/src/app/api/system/route.ts
index b58f1170..78ea6cbd 100644
--- a/__LOCAL_LLMs/dashboard/src/app/api/system/route.ts
+++ b/__LOCAL_LLMs/dashboard/src/app/api/system/route.ts
@@ -133,12 +133,13 @@ 
async function getAccurateMemory(): Promise<{ const appMemory = active + wired + compressor; const cached = inactive + purgeable + speculative; - const trueFree = free + cached; // macOS reclaims cached on demand + // Return raw free separately from cached — no overlap + // available for loading = free + cached (macOS reclaims cached on demand) const ratio = appMemory / totalMem; const pressure = ratio > 0.85 ? 'critical' : ratio > 0.7 ? 'warning' : 'normal'; - return { total: totalMem, appMemory, cached, free: trueFree, pressure }; + return { total: totalMem, appMemory, cached, free, pressure }; } catch { // Fallback to Node.js (inaccurate on macOS but works everywhere) const freeMem = os.freemem();
diff --git a/__LOCAL_LLMs/dashboard/src/app/api/tts/route.ts b/__LOCAL_LLMs/dashboard/src/app/api/tts/route.ts
new file mode 100644
index 00000000..15e9ba59
--- /dev/null
+++ b/__LOCAL_LLMs/dashboard/src/app/api/tts/route.ts
@@ -0,0 +1,201 @@
+import { NextResponse } from 'next/server';
+import { execFile } from 'child_process';
+import { promisify } from 'util';
+import { access, stat, readdir } from 'fs/promises';
+import { join, resolve } from 'path';
+
+const execFileAsync = promisify(execFile);
+
+// process.cwd() = dashboard/, parent = __LOCAL_LLMs/
+const LOCAL_LLMS_DIR = resolve(process.cwd(), '..');
+// Python interpreter of the shared TTS venv; checked by both engines and checkVenv().
+const VENV_PYTHON = join(LOCAL_LLMS_DIR, '.venv-qwen-tts', 'bin', 'python');
+// Python one-liner that prints "snac=<version> torch=<version>".
+const VERSION_PROBE =
+  "import snac; import torch; print(f'snac={snac.__version__} torch={torch.__version__}')";
+
+/** Status of one TTS engine as reported to the dashboard. */
+interface TtsEngine {
+  name: string;
+  type: 'ollama' | 'python';
+  status: 'ready' | 'partial' | 'missing';
+  model: string;
+  size?: string;
+  voices?: string[];
+  details: string;
+}
+
+/** Resolves to true when `path` is accessible, false otherwise. */
+async function fileExists(path: string): Promise<boolean> {
+  try {
+    await access(path);
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+/** Resolves to the size of `path` in bytes, or 0 when it cannot be stat'ed. */
+async function getFileSize(path: string): Promise<number> {
+  try {
+    const s = await stat(path);
+    return s.size;
+  } catch {
+    return 0;
+  }
+}
+
+/**
+ * Reports whether Orpheus TTS is usable.
+ *
+ * 'ready' needs all three parts: an Ollama model whose name contains
+ * "orpheus", the SNAC decoder weights, and the Python venv. 'partial' means
+ * the Ollama model is present but something else is missing.
+ */
+async function checkOrpheus(): Promise<TtsEngine> {
+  const engine: TtsEngine = {
+    name: 'Orpheus TTS',
+    type: 'ollama',
+    status: 'missing',
+    model: 'sematre/orpheus:en',
+    voices: ['tara', 'leah', 'jess', 'leo', 'dan', 'mia', 'zac', 'zoe'],
+    details: '',
+  };
+
+  // Check if Orpheus model is in Ollama
+  let hasModel = false;
+  try {
+    const res = await fetch('http://localhost:11434/api/tags', {
+      signal: AbortSignal.timeout(2000),
+    });
+    if (res.ok) {
+      const data: { models?: Array<{ name?: string }> } = await res.json();
+      hasModel = data.models?.some(m => m.name?.includes('orpheus')) ?? false;
+    }
+  } catch {
+    // Ollama not running
+  }
+
+  // Check SNAC decoder
+  const snacPath = join(LOCAL_LLMS_DIR, 'models', 'snac_24khz', 'pytorch_model.bin');
+  const hasSnac = await fileExists(snacPath);
+  const snacSize = hasSnac ? await getFileSize(snacPath) : 0;
+
+  // Check Python venv
+  const hasVenv = await fileExists(VENV_PYTHON);
+
+  if (hasModel && hasSnac && hasVenv) {
+    engine.status = 'ready';
+    engine.size = `${(snacSize / 1e6).toFixed(0)} MB decoder`;
+    engine.details = 'Ollama model + SNAC decoder + Python venv';
+  } else if (hasModel) {
+    engine.status = 'partial';
+    const missing: string[] = [];
+    if (!hasSnac) missing.push('SNAC decoder');
+    if (!hasVenv) missing.push('Python venv');
+    engine.details = `Missing: ${missing.join(', ')}`;
+  } else {
+    engine.status = 'missing';
+    engine.details = 'Run: bash setup-tts.sh';
+  }
+
+  return engine;
+}
+
+/**
+ * Reports whether Qwen3-TTS is usable.
+ *
+ * 'ready' needs a .safetensors file in the model directory, the tokenizer's
+ * config.json, and the Python venv. 'partial' means the weights or the
+ * tokenizer are present but something else is missing.
+ */
+async function checkQwenTts(): Promise<TtsEngine> {
+  const engine: TtsEngine = {
+    name: 'Qwen3-TTS',
+    type: 'python',
+    status: 'missing',
+    model: 'Qwen3-TTS-12Hz-0.6B-CustomVoice',
+    details: '',
+  };
+
+  const modelDir = join(LOCAL_LLMS_DIR, 'models', 'Qwen3-TTS-12Hz-0.6B-CustomVoice');
+  const tokenizerDir = join(LOCAL_LLMS_DIR, 'models', 'Qwen3-TTS-Tokenizer-12Hz');
+
+  let hasModel = false;
+  let modelSize = 0;
+  try {
+    const files = await readdir(modelDir);
+    const safetensors = files.find(f => f.endsWith('.safetensors'));
+    if (safetensors) {
+      hasModel = true;
+      modelSize = await getFileSize(join(modelDir, safetensors));
+    }
+  } catch {
+    // dir doesn't exist
+  }
+
+  const hasTokenizer = await fileExists(join(tokenizerDir, 'config.json'));
+  const hasVenv = await fileExists(VENV_PYTHON);
+
+  if (hasModel && hasTokenizer && hasVenv) {
+    engine.status = 'ready';
+    engine.size = `${(modelSize / 1e9).toFixed(1)} GB`;
+    engine.details = '0.6B params · 10 languages · MPS/CPU';
+  } else if (hasModel || hasTokenizer) {
+    engine.status = 'partial';
+    const missing: string[] = [];
+    if (!hasModel) missing.push('model weights');
+    if (!hasTokenizer) missing.push('tokenizer');
+    if (!hasVenv) missing.push('Python venv');
+    engine.details = `Missing: ${missing.join(', ')}`;
+  } else {
+    engine.status = 'missing';
+    engine.details = 'Run: bash setup-tts.sh';
+  }
+
+  return engine;
+}
+
+/**
+ * Checks the TTS venv and, when present, probes its snac/torch versions.
+ *
+ * `packages` is omitted when the probe fails or exceeds its 5 s timeout.
+ */
+async function checkVenv(): Promise<{
+  exists: boolean;
+  python?: string;
+  packages?: string[];
+}> {
+  const exists = await fileExists(VENV_PYTHON);
+  if (!exists) return { exists: false };
+
+  try {
+    // execFile passes the interpreter path and script as argv (no shell), so a
+    // checkout path containing quotes or `$` cannot break or alter the command.
+    const { stdout } = await execFileAsync(VENV_PYTHON, ['-c', VERSION_PROBE], {
+      timeout: 5000,
+    });
+    return {
+      exists: true,
+      python: VENV_PYTHON,
+      packages: stdout.trim().split(/\s+/).filter(Boolean),
+    };
+  } catch {
+    return { exists: true, python: VENV_PYTHON };
+  }
+}
+
+/** GET handler: reports the status of both TTS engines and the shared venv. */
+export async function GET() {
+  const [orpheus, qwenTts, venv] = await Promise.all([checkOrpheus(), checkQwenTts(), checkVenv()]);
+
+  return NextResponse.json({
+    engines: [orpheus, qwenTts],
+    venv,
+    setupScript: 'bash setup-tts.sh',
+    testCommands: {
+      orpheus: '.venv-qwen-tts/bin/python test_orpheus_tts.py',
+      qwenTts: '.venv-qwen-tts/bin/python test_qwen_tts.py',
+    },
+  });
+}
diff --git a/__LOCAL_LLMs/dashboard/src/app/lib/format.ts b/__LOCAL_LLMs/dashboard/src/app/lib/format.ts
index 89bdc8dc..10f9ef91 100644
--- a/__LOCAL_LLMs/dashboard/src/app/lib/format.ts
+++ 
b/__LOCAL_LLMs/dashboard/src/app/lib/format.ts @@ -19,13 +19,15 @@ export function estimateRam(diskSize: number, quant?: string): number { } // N2: Check if model fits in available memory +// free = raw free pages, cached = inactive+purgeable+speculative (no overlap) +// macOS reclaims ~90% of cached on demand for large allocations (model mmaps) export type FitStatus = 'fits' | 'tight' | 'no'; export function checkMemoryFit( estimatedRam: number, freeMemory: number, cachedMemory: number ): FitStatus { - const available = freeMemory + cachedMemory * 0.5; + const available = freeMemory + cachedMemory * 0.9; const ratio = estimatedRam / available; if (ratio < 0.7) return 'fits'; if (ratio <= 1.0) return 'tight'; diff --git a/__LOCAL_LLMs/docs/00-developer-guide.md b/__LOCAL_LLMs/docs/00-developer-guide.md index c861fe7e..b2243983 100644 --- a/__LOCAL_LLMs/docs/00-developer-guide.md +++ b/__LOCAL_LLMs/docs/00-developer-guide.md @@ -10,10 +10,13 @@ This machine runs a local LLM server via [Ollama](https://ollama.com), exposing **Models installed:** -| Model | Size | Best For | -| ------------------- | ------- | ----------------------------------------- | -| `qwen2.5-coder:32b` | 18.5 GB | Code (TS, Python, Swift), structured JSON | -| `llama3.1:8b` | 4.7 GB | Fast evals, general tasks | +| Model | Size | Best For | +| -------------------- | ------ | -------------------------------------------- | +| `qwen2.5-coder:32b` | 19 GB | Code (TS, Python, Swift), structured JSON | +| `qwen2.5-coder:7b` | 4.7 GB | Fast code tasks, fits alongside other models | +| `deepseek-r1:32b` | 19 GB | Complex reasoning, chain-of-thought | +| `llama3.1:8b` | 4.9 GB | Fast evals, general tasks | +| `sematre/orpheus:en` | 4 GB | Text-to-speech (8 voices, emotion tags) | --- diff --git a/__LOCAL_LLMs/docs/05-mission-control-dashboard.md b/__LOCAL_LLMs/docs/05-mission-control-dashboard.md index f8f70ae4..95e85fc6 100644 --- a/__LOCAL_LLMs/docs/05-mission-control-dashboard.md +++ 
b/__LOCAL_LLMs/docs/05-mission-control-dashboard.md @@ -1,17 +1,103 @@ # 05 — Mission Control Dashboard -> **Documentation has moved.** All dashboard docs now live in the dashboard directory. - -- **PRD:** [`__LOCAL_LLMs/dashboard/docs/DASHBOARD_PRD.md`](../dashboard/docs/DASHBOARD_PRD.md) -- **Review (39 items):** [`__LOCAL_LLMs/dashboard/docs/DASHBOARD_REVIEW.md`](../dashboard/docs/DASHBOARD_REVIEW.md) -- **Roadmap (N1–N15):** [`__LOCAL_LLMs/dashboard/docs/DASHBOARD_ROADMAP.md`](../dashboard/docs/DASHBOARD_ROADMAP.md) +> Next.js 16 dashboard for managing local LLM models, system resources, and inference. +> Last updated: 2026-02-21 ## Quick Start ```bash cd __LOCAL_LLMs/dashboard npm install # first time only -npm run dev -- -p 3100 +npm run dev # runs on port 3000 ``` -Open: **http://localhost:3100** +Open: **http://localhost:3000** + +--- + +## Recent Changes (Feb 2026) + +### Memory Calculation Fix + +**Root cause:** The system API (`/api/system`) computed `trueFree = free + cached` and returned it as `free`. This made `free` and `cached` overlap. The UI then did `available = free + cached * 0.5`, which **double-counted** cached memory and inflated available RAM by ~8 GB. + +**Fix (4 files):** + +- `src/app/api/system/route.ts` — Return raw `Pages free` separately from `cached` (no overlap) +- `src/app/lib/format.ts` — Updated `checkMemoryFit()` to use `cached × 0.9` (macOS reclaims ~90% on demand) +- `src/app/(mission-control)/mission-control/page.tsx` — All UI memory references fixed +- `src/app/(mission-control)/mission-control/components/RamBudgetBar.tsx` — Receives corrected `free + cached` + +**Memory formula:** `available for models = rawFree + cached × 0.9` + +### Memory Drilldown + +Click the **MEMORY** card in the status bar to toggle a drilldown panel showing: + +1. **Stacked bar** — vm_stat categories (Active, Wired, Compressed, Inactive, Purgeable, Free) +2. **Legend grid** — exact bytes + percentage for each category +3. 
**App memory summary** — Active + Wired + Compressed = total used +4. **Top 15 processes by RSS** — grouped by name, Ollama highlighted in green + +**New files:** + +- `src/app/api/system/memory/route.ts` — Process memory API (`ps` + `vm_stat`) +- `src/app/(mission-control)/mission-control/components/MemoryDrilldown.tsx` — Drilldown UI + +### Simplified Memory UI + +All memory displays now use consistent, plain language: + +| Element | Before (confusing) | After (clear) | +| -------------------- | ---------------------------------- | ------------------------------------------- | +| **MEMORY card** | "10.5 GB / 48 GB" (ambiguous) | **"35.6 GB used / 48 GB"** | +| **Subtitle** | "App: 35.6 GB · Cache: 11.6 GB" | **"10.5 GB available for models"** (green) | +| **Model fit** | "76 MB free + 10.5 GB reclaimable" | **"Needs ~22 GB · 10.5 GB available"** | +| **Fit badge** | "✗ Won't fit" | **"✗ 11.6 GB short"** (with exact gap) | +| **System panel RAM** | "76 MB avail" | **"10.5 GB avail"** (green, matches header) | + +--- + +## Detailed Documentation + +- **PRD:** [`dashboard/docs/DASHBOARD_PRD.md`](../dashboard/docs/DASHBOARD_PRD.md) +- **Review (39 items):** [`dashboard/docs/DASHBOARD_REVIEW.md`](../dashboard/docs/DASHBOARD_REVIEW.md) +- **Roadmap (N1–N15):** [`dashboard/docs/DASHBOARD_ROADMAP.md`](../dashboard/docs/DASHBOARD_ROADMAP.md) +- **Rich Features Roadmap (A–G):** [`dashboard/docs/RICH_FEATURES_ROADMAP.md`](../dashboard/docs/RICH_FEATURES_ROADMAP.md) + +--- + +## API Routes + +| Route | Method | Description | +| -------------------- | -------- | ---------------------------------------------------- | +| `/api/ollama` | GET/POST | Ollama proxy (list, load, unload, generate) | +| `/api/whisper` | GET | Whisper binary/model discovery | +| `/api/system` | GET | System info (chip, RAM, disk, brew, pressure) | +| `/api/system/memory` | GET | Memory drilldown (vm_stat breakdown + top processes) | +| `/api/system/exec` | POST | Safe shell command execution | + +--- + 
+## Key Components + +``` +dashboard/src/app/ +├── (mission-control)/mission-control/ +│ ├── page.tsx # Main Mission Control page +│ └── components/ +│ ├── RamBudgetBar.tsx # Stacked RAM budget visualization +│ ├── MemoryDrilldown.tsx # Process-level memory breakdown +│ └── MarkdownResponse.tsx # Markdown renderer for LLM output +├── (workspace)/components/ # Chat workspace (conversations, messages) +├── api/ +│ ├── ollama/route.ts +│ ├── whisper/route.ts +│ ├── system/route.ts +│ └── system/memory/route.ts +└── lib/ + ├── format.ts # formatBytes, estimateRam, checkMemoryFit + ├── db.ts # IndexedDB CRUD (conversations, projects, tasks) + ├── cron.ts # Cron expression parser + └── scheduled-tasks.ts # Built-in task templates +``` diff --git a/__LOCAL_LLMs/docs/08-troubleshooting.md b/__LOCAL_LLMs/docs/08-troubleshooting.md index b70bdade..596f7dd8 100644 --- a/__LOCAL_LLMs/docs/08-troubleshooting.md +++ b/__LOCAL_LLMs/docs/08-troubleshooting.md @@ -19,19 +19,41 @@ This machine is behind an AT&T Forcepoint proxy that performs SSL deep packet in ### What Works Through Proxy -| Tool | Status | Notes | -| -------------------------- | ---------- | ------------------------------------- | -| `ollama pull` | ✅ Works | Ollama handles proxy natively | -| `brew install` | ✅ Works | Homebrew handles proxy | -| `npm install` | ✅ Works | With `NODE_TLS_REJECT_UNAUTHORIZED=0` | -| `curl` to Hugging Face | ❌ Blocked | Returns 19 KB HTML redirect page | -| `curl -k` to Hugging Face | ❌ Blocked | Still intercepted even with `-k` | -| `python requests` to HF | ❌ Blocked | SSL_CERTIFICATE_VERIFY_FAILED | -| `huggingface_hub` download | ❌ Blocked | Falls back to cached (broken) files | +| Tool | Status | Notes | +| -------------------------- | ---------- | ------------------------------------------- | +| `ollama pull` | ✅ Works | Ollama handles proxy natively | +| `brew install` | ✅ Works | Homebrew handles proxy | +| `npm install` | ✅ Works | With `NODE_TLS_REJECT_UNAUTHORIZED=0` | +| 
`git clone` (GitHub) | ✅ Works | With `GIT_SSL_NO_VERIFY=1` | +| `pip install` (PyPI) | ✅ Works | Via corporate Artifactory mirror | +| **`hf-mirror.com`** | ✅ Works | Chinese HuggingFace mirror, **not blocked** | +| `curl` to Hugging Face | ❌ Blocked | Returns 19 KB HTML redirect page | +| `curl -k` to Hugging Face | ❌ Blocked | Still intercepted even with `-k` | +| `python requests` to HF | ❌ Blocked | SSL_CERTIFICATE_VERIFY_FAILED | +| `huggingface_hub` download | ❌ Blocked | Falls back to cached (broken) files | -### Workaround: Download Off-Network +### Workaround 1: Use hf-mirror.com (recommended) -For Hugging Face model downloads (e.g., Whisper GGML files): +`hf-mirror.com` is a Chinese mirror of HuggingFace that **is NOT blocked** by Forcepoint. Replace `huggingface.co` with `hf-mirror.com` in any download URL: + +```bash +# Instead of: https://huggingface.co/org/model/resolve/main/file.bin +# Use: https://hf-mirror.com/org/model/resolve/main/file.bin + +# Example: download SNAC decoder (TTS) +curl -k -L -o models/snac_24khz/pytorch_model.bin \ + "https://hf-mirror.com/hubertsiuzdak/snac_24khz/resolve/main/pytorch_model.bin" + +# Example: download Whisper model +curl -k -L -o ~/whisper-models/ggml-large-v3-turbo.bin \ + "https://hf-mirror.com/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo.bin" +``` + +The TTS scripts (`setup-tts.sh`, `download-tts-models.sh`) use this mirror automatically. + +### Workaround 2: Download Off-Network + +If the mirror is also blocked, use a non-corporate network: 1. **Disconnect** from corporate VPN/Wi-Fi 2. **Connect** to personal hotspot or home Wi-Fi diff --git a/__LOCAL_LLMs/docs/10-text-to-speech.md b/__LOCAL_LLMs/docs/10-text-to-speech.md new file mode 100644 index 00000000..6df2669f --- /dev/null +++ b/__LOCAL_LLMs/docs/10-text-to-speech.md @@ -0,0 +1,230 @@ +# 10 — Text-to-Speech (TTS) — Local Setup + +> Local TTS on Apple Silicon: Orpheus TTS via Ollama + Qwen3-TTS 0.6B direct. 
+> Works through corporate proxy via `hf-mirror.com`. +> Last updated: 2026-02-21 + +--- + +## Overview + +Two TTS engines for local speech generation — both run fully offline after initial setup. + +| Engine | Model | Size | How It Runs | Quality | Speed | +| --------------- | --------------------------------- | ------ | ----------------------- | ------------------------------------------ | ------------------------ | +| **Orpheus TTS** | `sematre/orpheus:en` | 4 GB | Via Ollama (Metal GPU) | Great — expressive, 8 voices, emotion tags | ~11s for short sentences | +| **Qwen3-TTS** | `Qwen3-TTS-12Hz-0.6B-CustomVoice` | 1.2 GB | Direct Python (MPS/CPU) | Excellent — 10 languages, voice design | ~10-20s on MPS | + +### Architecture + +``` +Text → Ollama (Orpheus 3B) → Audio Tokens → SNAC Decoder → WAV file +Text → Qwen3-TTS 0.6B (PyTorch MPS) → WAV file +``` + +--- + +## Quick Start (Fresh Laptop) + +The **one-shot setup script** handles everything — works on any Apple Silicon Mac, including through corporate proxy: + +```bash +cd __LOCAL_LLMs +bash setup-tts.sh +``` + +This installs: Python 3.12, venv, pip packages, Orpheus model (Ollama), SNAC decoder (hf-mirror.com), and optionally Qwen3-TTS 0.6B. + +After setup: + +```bash +.venv-qwen-tts/bin/python test_orpheus_tts.py +afplay test_orpheus_tara.wav +``` + +--- + +## Prerequisites + +| Component | How to Install | Notes | +| ------------------------- | ---------------------------------- | ------------------------------ | +| **macOS + Apple Silicon** | — | M1/M2/M3/M4 (MPS acceleration) | +| **Homebrew** | `/bin/bash -c "$(curl -fsSL ...)"` | Package manager | +| **Ollama** | `brew install ollama` | Local LLM server | +| **Python 3.12** | `brew install python@3.12` | TTS packages need 3.12 | + +All of the above are installed automatically by `setup-tts.sh`. + +--- + +## Manual Setup (step by step) + +If you prefer to run each step yourself instead of `setup-tts.sh`: + +### 1. 
Python Environment + +```bash +cd __LOCAL_LLMs + +# Install Python 3.12 +brew install python@3.12 + +# Create isolated venv +/opt/homebrew/bin/python3.12 -m venv .venv-qwen-tts + +# Install packages +.venv-qwen-tts/bin/pip install -U snac qwen-tts +``` + +### 2. Orpheus TTS Model (via Ollama) + +```bash +ollama serve & # start Ollama if not running +ollama pull sematre/orpheus:en # 4 GB, via Ollama registry (works through proxy) +``` + +### 3. SNAC Audio Decoder + +Downloads via `hf-mirror.com` — **works through corporate proxy**: + +```bash +bash download-tts-models.sh snac # just SNAC (~76 MB) +``` + +Or manually: + +```bash +mkdir -p models/snac_24khz +curl -k -sL -o models/snac_24khz/config.json \ + "https://hf-mirror.com/hubertsiuzdak/snac_24khz/raw/main/config.json" +curl -k -L --progress-bar -o models/snac_24khz/pytorch_model.bin \ + "https://hf-mirror.com/hubertsiuzdak/snac_24khz/resolve/main/pytorch_model.bin" +``` + +### 4. Qwen3-TTS 0.6B (optional) + +```bash +bash download-tts-models.sh qwen # tokenizer + model (~1.7 GB) +``` + +After download everything runs **fully offline**. + +--- + +## Usage + +### Orpheus TTS (via Ollama) + +```bash +# Make sure Ollama is running +ollama serve & + +# Run test +.venv-qwen-tts/bin/python test_orpheus_tts.py + +# Play output +afplay test_orpheus_tara.wav +``` + +**Voices:** `tara`, `leah`, `jess`, `leo`, `dan`, `mia`, `zac`, `zoe` + +**Emotion tags:** `<laugh>`, `<chuckle>`, `<sigh>`, `<cough>`, `<sniffle>`, `<groan>`, `<yawn>`, `<gasp>` + +```python +# Example prompt format +voice = "tara" +text = "<laugh> That's hilarious! Tell me more." +prompt = f"<|begin_of_text|>{voice}: {text}<|eot_id|>" +``` + +### Qwen3-TTS (direct Python) + +```bash +.venv-qwen-tts/bin/python test_qwen_tts.py +afplay test_output_english.wav +``` + +**Features:** + +- 10 languages (Chinese, English, Japanese, Korean, German, French, Russian, Portuguese, Spanish, Italian) +- Built-in speaker voices (Chelsie, Vivian, Ryan, etc.) 
+- Natural language emotion control: `instruct="Speak with excitement"` +- Voice cloning from a short audio sample (with Base model variant) + +--- + +## File Inventory + +``` +__LOCAL_LLMs/ +├── setup-tts.sh # ← START HERE — one-shot setup for fresh laptop +├── download-tts-models.sh # Download model weights (uses hf-mirror.com) +├── test_orpheus_tts.py # Orpheus TTS test (Ollama + SNAC) +├── test_qwen_tts.py # Qwen3-TTS test (direct Python) +├── .venv-qwen-tts/ # Python 3.12 venv (gitignored, created by setup) +├── models/ # Downloaded model weights (gitignored) +│ ├── snac_24khz/ # SNAC audio decoder (~76 MB) +│ ├── Qwen3-TTS-Tokenizer-12Hz/ # Qwen3-TTS tokenizer (optional) +│ └── Qwen3-TTS-12Hz-0.6B-CustomVoice/ # Qwen3-TTS model (~1.2 GB, optional) +└── *.wav # Generated audio output (gitignored) +``` + +--- + +## OSS TTS Landscape (as of Feb 2026) + +### Speech-to-Text (STT) + +| Model | By | Notes | +| ------------------------- | ------------------ | --------------------------------------------------- | +| **Whisper / whisper-cpp** | OpenAI / ggerganov | Gold standard, already installed, Metal-accelerated | +| **Faster Whisper** | SYSTRAN | 4× faster via CTranslate2 | +| **Distil-Whisper** | Hugging Face | 6× faster, 49% fewer params | + +### Text-to-Speech (TTS) + +| Model | By | Size | Notes | +| ---------------- | ------------ | --------- | ------------------------------------------------------- | +| **Qwen3-TTS** ⭐ | Alibaba | 0.6B–1.7B | Best quality, 10 languages, voice cloning, Jan 2026 | +| **Orpheus TTS** | Canopy AI | 3B | Expressive, 8 voices, emotion tags, available on Ollama | +| **Kokoro** | HF Community | 82M | Very fast, near-commercial quality, Apache 2.0 | +| **Piper** | Rhasspy | ONNX | Lightweight, runs on Raspberry Pi | +| **F5-TTS** | SWivid | — | Zero-shot voice cloning, flow matching | +| **StyleTTS 2** | Columbia U | — | Human-level quality, style diffusion | +| **OuteTTS** | Community | — | Pure LLM-based TTS, runs via llama.cpp | 
+| **Bark** | Suno | — | Speech + music + sound effects | + +--- + +## Corporate Proxy Notes + +| Source | Status | Workaround | +| ------------------------------------------ | ---------- | --------------------------------------------------- | +| **Ollama registry** (`registry.ollama.ai`) | ✅ Works | Ollama pull uses its own CDN | +| **PyPI** (via `artifact.it.att.com`) | ✅ Works | Corporate Artifactory mirror | +| **GitHub releases** | ✅ Works | Direct download | +| **HuggingFace** (`huggingface.co`) | ❌ Blocked | Use `hf-mirror.com` as mirror (works through proxy) | +| **hf-mirror.com** (HF mirror) | ✅ Works | Chinese HF mirror, not blocked by Forcepoint | + +Forcepoint CSO intercepts HTTPS and serves a block page for HuggingFace. No SSL workaround works for `huggingface.co`. However, **`hf-mirror.com`** (a Chinese mirror of HuggingFace) is **not blocked** and can be used to download model weights: + +```bash +# Download SNAC config + weights via mirror +curl -k -L -o models/snac_24khz/config.json "https://hf-mirror.com/hubertsiuzdak/snac_24khz/raw/main/config.json" +curl -k -L -o models/snac_24khz/pytorch_model.bin "https://hf-mirror.com/hubertsiuzdak/snac_24khz/resolve/main/pytorch_model.bin" +``` + +All other sources (Ollama, pip, GitHub) also work fine through the proxy. + +--- + +## Troubleshooting + +| Problem | Fix | +| --------------------------------------------- | ----------------------------------------------------------------------------- | +| `OSError: couldn't connect to huggingface.co` | Use `hf-mirror.com` or run `bash setup-tts.sh` | +| `SNAC decoder not found` | Run `bash setup-tts.sh` or `bash download-tts-models.sh snac` | +| `Model not found at models/Qwen3-TTS-*` | Run `bash setup-tts.sh` or `bash download-tts-models.sh qwen` | +| Orpheus generates no audio tokens | Ensure `ollama serve` is running and `ollama list` shows `sematre/orpheus:en` | +| MPS out of memory for Qwen3-TTS | Close other apps (Windsurf uses ~18 GB). 
Or use `device="cpu"` in test script | +| Slow generation on CPU | Expected for 0.6B model. MPS should be ~2-3× faster | diff --git a/__LOCAL_LLMs/docs/DASHBOARD_REVIEW.md b/__LOCAL_LLMs/docs/DASHBOARD_REVIEW.md deleted file mode 100644 index 6948dae1..00000000 --- a/__LOCAL_LLMs/docs/DASHBOARD_REVIEW.md +++ /dev/null @@ -1,310 +0,0 @@ -# Mission Control Dashboard — Bug & Improvement Review - -> Systematic code review of `__LOCAL_LLMs/dashboard/` (6 source files, 1,395 lines) -> Last updated: Feb 19, 2026 - ---- - -## File Inventory - -| File | Lines | Purpose | -| ------------------------------------ | ----- | -------------------------------------------------------------------- | -| `src/app/page.tsx` | 1,079 | Main dashboard UI (single component) | -| `src/app/globals.css` | 91 | Design tokens, animations, base styles | -| `src/app/layout.tsx` | 20 | Root layout (metadata, dark mode) | -| `src/app/api/ollama/route.ts` | 117 | Ollama REST proxy (list, load, unload, pull, delete, show, generate) | -| `src/app/api/ollama/stream/route.ts` | 38 | Ollama streaming generate proxy (NDJSON) | -| `src/app/api/whisper/route.ts` | 66 | Whisper binary + GGML model discovery | -| `src/app/api/system/route.ts` | 162 | System info (chip, memory via vm_stat, disk, brew) | - -**Stack:** Next.js 16, React 19, TailwindCSS v4, Lucide icons, TypeScript - ---- - -## 1. Bugs - -- [x] **B1. Hardcoded machine specs in header** — `page.tsx:317` - Subtitle reads `Apple M4 Pro · 48 GB · {system?.platform}` — should use `system?.chip` and `formatBytes(system?.memory.total)` dynamically so it works on any machine. - -- [x] **B2. Pull model blocks UI — no progress feedback** — `api/ollama/route.ts:84-92` - `handlePull` calls Ollama with `stream: false`. Large models (20+ GB) block for 30+ minutes. The Next.js API route will likely timeout. Must use `stream: true` and pipe progress events to the client. _(Combined with F1.)_ - -- [x] **B3. 
Dead code: non-streaming `generate` action** — `api/ollama/route.ts:69-82` - The `action === 'generate'` handler is unused — UI only uses `/api/ollama/stream`. Remove or keep as fallback with a comment. - -- [x] **B4. Escape key closes modal during active streaming** — `page.tsx:188-197` - Global `keydown` handler calls `setPromptModel(null)` unconditionally. Backdrop click correctly checks `!promptLoading`. Escape should also respect `promptLoading` to prevent discarding an in-flight response. - -- [x] **B5. Auto-refresh (15s) fires during streaming/pull** — `page.tsx:182-185` - `setInterval(fetchAll, 15000)` runs unconditionally. During streaming this causes background churn and potential UI flicker. Should pause while `promptLoading` or `pullLoading` is true. - -- [x] **B6. Toast ID collision on HMR remount** — `page.tsx:156-159` - `toastId.current` resets to 0 on component remount during dev. Use `Date.now()` or `crypto.randomUUID()` for robust uniqueness. - -- [x] **B7. vm_stat page size hardcoded** — `api/system/route.ts:103` - Hardcoded `16384`. Should parse from vm_stat's first line: `"(page size of NNNNN bytes)"` for portability. - -- [x] **B8. Whisper models dir not configurable** — `api/whisper/route.ts:24` - Hardcoded to `~/whisper-models`. Should scan multiple known paths (`/opt/homebrew/share/whisper-cpp/models/`, `~/whisper-models`, `~/.cache/whisper/`) or accept `WHISPER_MODELS_DIR` env var. - -- [x] **B9. No AbortController for streaming fetch** — `page.tsx:250-289` - Closing the prompt modal doesn't cancel the underlying fetch. The `reader.read()` loop continues in the background wasting CPU/bandwidth until the model finishes generating. - -- [x] **B10. Brew shows "Loading..." when array is empty** — `page.tsx:936-940` - When `system.brewPackages` is `[]` (all uninstalled), displays "Loading..." instead of "No packages found". Needs to distinguish "still fetching" vs "fetched but empty". - -- [x] **B11. 
Prompt text not cleared on close without send** — `page.tsx:951-957` - Backdrop click clears `promptText`, but Escape handler (B4 fix) should also clear it. Otherwise stale text persists when re-opening. - ---- - -## 2. Code Quality - -- [x] **CQ1. Monolithic 1,079-line single component** — `page.tsx` - All interfaces, utilities, sub-components, and 900+ lines of JSX in one file. Extract to: - - `components/` — StatusDot, ProgressBar, ToastContainer, PromptModal, OllamaModelsPanel, SystemPanel, WhisperPanel, BrewPanel - - `lib/types.ts` — interfaces (OllamaModel, SystemData, etc.) - - `lib/format.ts` — formatBytes, formatUptime - - `lib/hooks.ts` — useAutoRefresh, useToasts, useOllamaActions - -- [x] **CQ2. Pervasive inline styles instead of CSS/Tailwind classes** — `page.tsx` (100+ occurrences) - Every `style={{ color: 'var(--text-tertiary)' }}` should be a utility class. Options: custom Tailwind theme mapping, or CSS utility classes in `globals.css` (e.g., `.text-muted`). - -- [x] **CQ3. OLLAMA_URL duplicated** — `api/ollama/route.ts:3` + `api/ollama/stream/route.ts:3` - Same `process.env.OLLAMA_URL || 'http://localhost:11434'` in two files. Extract to `lib/ollama-config.ts`. - -- [x] **CQ4. No React Error Boundary** — `page.tsx` - Unexpected API response shape crashes the entire dashboard. Add an `error.tsx` (Next.js App Router convention) for graceful recovery. - -- [x] **CQ5. No loading skeleton / shimmer UI** - Initial load shows "..." placeholders. Skeleton cards would be more polished. - -- [x] **CQ6. No TypeScript strict null checks in API responses** - API route handlers catch errors but return loosely typed JSON. Add Zod validation on the Ollama/system responses to prevent runtime surprises. - ---- - -## 3. Features - -- [x] **F1. Streaming pull with progress bar** _(fixes B2)_ - Use Ollama `stream: true` for `/api/pull`. Create `/api/ollama/pull/route.ts` that pipes NDJSON progress. UI shows progress bar with `completed/total` bytes, speed, and ETA. 
- -- [x] **F2. Model search/filter** - Search input above models list. Filter by name, family, quantization. Useful when 10+ models are installed. - -- [x] **F3. Prompt history (localStorage)** - Store last 20 prompts with model name + timestamp. Dropdown in prompt modal to re-run previous prompts. - -- [x] **F4. Chat mode (multi-turn conversation)** - Use Ollama `/api/chat` instead of `/api/generate`. Chat bubble layout with message history. System prompt input field. - -- [x] **F5. Model comparison (side-by-side)** - Send same prompt to 2 models simultaneously. Display responses side-by-side with latency/quality comparison. - -- [x] **F6. Token/s metrics after generation** - Parse `eval_count` and `eval_duration` from the final NDJSON chunk. Display tokens/second, total tokens, and latency in the response footer. - -- [x] **F7. System resource sparklines (time-series)** - Ring buffer of memory/CPU snapshots (localStorage). Render mini sparkline charts in the System panel. Spot trends over time. - -- [x] **F8. Ollama server logs viewer** - Read `~/.ollama/logs/` and display in a collapsible terminal-style panel. Filter by level. Auto-scroll. - -- [x] **F9. Modelfile / template viewer** - The `show` action already fetches Modelfile, template, and system prompt. Display in a collapsible code block in expanded model details. - -- [x] **F10. Dark/light theme toggle** - Add `:root.light` CSS variable overrides. Theme toggle with localStorage persistence. Current architecture supports this natively. - -- [x] **F11. Keyboard shortcuts panel (`?` key)** - Show all shortcuts in a modal: ⌘+Enter (send), Esc (close), R (refresh), / (search models), ? (help). - -- [x] **F12. Whisper transcription test** - Upload/record a short audio clip, transcribe locally via whisper-cli, display result with latency. Tests the full local STT pipeline. - -- [x] **F13. Responsive mobile layout** - Better breakpoints for the 4-column stats row and 3-column main grid. 
Collapsible sidebar on mobile. - -- [x] **F14. Model tags/labels (localStorage)** - User-defined tags (coding, fast, vision) with colored badges. Persisted in localStorage. - -- [x] **F15. Extraction service integration panel** - Show extraction-service (port 4005) health status. Run test extractions against loaded Ollama models. Bridges dashboard to LysnrAI pipeline. - -- [x] **F16. Auto-load preferred model** - Mark a model as "auto-load" (stored in localStorage). When Ollama is online but no models loaded, auto-load the preferred model. - ---- - -## 4. Performance & Reliability - -- [x] **P1. No request deduplication on Refresh** — `page.tsx:164-176` - Rapid clicks on Refresh fire duplicate `fetchAll()` calls. Add a `fetchingRef` guard or disable the button during fetch (partially done for `actionLoading` but not for `fetchAll`). - -- [x] **P2. Static cache never expires** — `api/system/route.ts:81-90` - `staticCache` (chip, GPU, brew) lives forever in the server process. Brew package upgrades won't reflect. Add 5-minute TTL. - -- [x] **P3. `du -sk ~/.ollama/models` on every refresh** — `api/system/route.ts:41` - Traverses entire models directory every 15 seconds. Cache with 60-second TTL. - -- [x] **P4. No fetch timeout on Ollama calls** — `api/ollama/route.ts:5-12` - `fetchOllama` has no `AbortSignal` or timeout. If Ollama hangs, the dashboard hangs. Add 5-second timeout. - -- [x] **P5. `system_profiler` slow on first load** — `api/system/route.ts:52-53` - Takes ~2-3 seconds. Cached after first call, but first dashboard load waits. Consider eager background fetch on server start or return placeholder. - ---- - -## 5. Security & Hardening - -- [x] **S1. No input validation on model names** — `api/ollama/route.ts:50-51` - `model` from request body passed directly to Ollama. Add regex validation: `^[a-zA-Z0-9._:/-]{1,256}$`. - -- [x] **S2. 
Shell command interpolation pattern** — `api/system/route.ts:67` - `execAsync(\`brew list --versions ${pkg}\`)`— safe today (hardcoded targets) but fragile. Use`execFile('brew', ['list', '--versions', pkg])` for safety. - -- [x] **S3. No CORS or auth** _(acceptable for local-only, documented)_ - Any local process can call API routes. Fine for dev tool; document the assumption. - ---- - -## 6. Implementation Tracker - -### Sprint 1 — Critical Bug Fixes _(est. 1–2 hrs)_ - -| # | ID | Task | Effort | Commit | -| --- | --------- | ----------------------------------------- | ------ | --------- | -| 1 | - [x] B4 | Guard Escape key during streaming | 5 min | `2da67c2` | -| 2 | - [x] B5 | Pause auto-refresh during prompt/pull | 10 min | `2da67c2` | -| 3 | - [x] B9 | Add AbortController to streaming fetch | 15 min | `2da67c2` | -| 4 | - [x] B1 | Dynamic chip/RAM in header | 5 min | `2da67c2` | -| 5 | - [x] B11 | Clear prompt text on Escape close | 5 min | `2da67c2` | -| 6 | - [x] P4 | Add timeout to Ollama fetch calls | 10 min | `2da67c2` | -| 7 | - [x] B3 | Remove dead generate action (or document) | 5 min | `2da67c2` | -| 8 | - [x] B6 | Use Date.now() for toast IDs | 2 min | `2da67c2` | -| 9 | - [x] B10 | Fix brew "Loading..." vs "empty" state | 5 min | `2da67c2` | - -### Sprint 2 — Pull Progress + Metrics _(est. 2–3 hrs)_ - -| # | ID | Task | Effort | Commit | -| --- | ----------- | ----------------------------------- | ------ | --------- | -| 10 | - [x] B2+F1 | Streaming pull with progress bar | 60 min | `2d9475b` | -| 11 | - [x] F6 | Display tokens/s after generation | 30 min | `2d9475b` | -| 12 | - [x] B7 | Parse vm_stat page size dynamically | 10 min | `2d9475b` | -| 13 | - [x] B8 | Multi-path whisper model discovery | 15 min | `2d9475b` | - -### Sprint 3 — Component Refactor _(est. 
2–3 hrs)_ - -| # | ID | Task | Effort | Commit | -| --- | --------- | --------------------------------------- | ------ | --------- | -| 14 | - [x] CQ1 | Extract components into separate files | 90 min | `75a3cd0` | -| 15 | - [x] CQ4 | Add error.tsx Error Boundary | 15 min | `75a3cd0` | -| 16 | - [x] CQ3 | Shared ollama-config.ts | 10 min | `75a3cd0` | -| 17 | - [x] CQ2 | Consolidate inline styles → CSS classes | 45 min | `ed93a6f` | -| 18 | - [x] S1 | Add model name input validation | 10 min | `75a3cd0` | -| 19 | - [x] S2 | Replace exec → execFile for brew | 10 min | `75a3cd0` | - -### Sprint 4 — UX Enhancements _(est. 3–4 hrs)_ - -| # | ID | Task | Effort | Commit | -| --- | --------- | ------------------------------------ | ------ | --------- | -| 20 | - [x] F3 | Prompt history (localStorage) | 45 min | `9c2f5f3` | -| 21 | - [x] F9 | Modelfile viewer in expanded details | 30 min | `9c2f5f3` | -| 22 | - [x] F4 | Chat mode (multi-turn via /api/chat) | 90 min | `ed93a6f` | -| 23 | - [x] F2 | Model search/filter | 30 min | `9c2f5f3` | -| 24 | - [x] F11 | Keyboard shortcuts panel | 20 min | `9c2f5f3` | - -### Sprint 5 — Integration & Polish _(est. 
2–3 hrs)_ - -| # | ID | Task | Effort | Commit | -| --- | ----------- | -------------------------- | ------ | --------- | -| 25 | - [x] F15 | Extraction service panel | 60 min | `8bdd5ee` | -| 26 | - [x] F12 | Whisper transcription test | 45 min | `8bdd5ee` | -| 27 | - [x] F7 | System resource sparklines | 45 min | `8bdd5ee` | -| 28 | - [x] CQ5 | Loading skeleton UI | 20 min | `8bdd5ee` | -| 29 | - [x] P1-P3 | Request dedup + cache TTLs | 30 min | `b1fda3a` | -| 30 | - [x] F16 | Auto-load preferred model | 20 min | `ed93a6f` | - -### Deferred (nice-to-have) - -| ID | Task | Notes | -| --------- | ------------------------------- | --------- | -| - [x] F5 | Model comparison (side-by-side) | `8bdd5ee` | -| - [x] F10 | Dark/light theme toggle | `ed93a6f` | -| - [x] F13 | Responsive mobile layout | `8bdd5ee` | -| - [x] F14 | Model tags/labels | `ed93a6f` | -| - [x] CQ6 | Zod validation on API responses | `ed93a6f` | -| - [x] F8 | Ollama server logs viewer | `8bdd5ee` | -| - [x] S3 | CORS / auth (documented) | `8bdd5ee` | - ---- - -## 7. Commit Log - -_Commits will be added here as work progresses._ - -| # | Date | Commit | Sprint | Items Completed | -| --- | ------ | --------- | -------- | ------------------------------------ | -| 1 | Feb 19 | `2da67c2` | Sprint 1 | B1, B3, B4, B5, B6, B9, B10, B11, P4 | -| 2 | Feb 19 | `2d9475b` | Sprint 2 | B2, B7, B8, F1, F6 | -| 3 | Feb 19 | `75a3cd0` | Sprint 3 | CQ1, CQ3, CQ4, S1, S2 | -| 4 | Feb 19 | `9c2f5f3` | Sprint 4 | F2, F3, F9, F11 | -| 5 | Feb 19 | `b1fda3a` | Sprint 5 | P1, P2, P3 | -| 6 | Feb 19 | `ed93a6f` | Sprint 6 | CQ2, CQ6, P5, F4, F10, F14, F16 | -| 7 | Feb 19 | `8bdd5ee` | Sprint 7 | F5, F7, F8, F12, F13, F15, CQ5, S3 | - ---- - -> **39 items total:** 11 bugs, 6 code quality, 16 features, 5 performance, 3 security -> **All 39 items completed** across 7 sprints (9 code commits + doc updates) -> **Actual total effort:** ~8 hours across 7 sprints - ---- - -## 8. 
Next Wave — Model Intelligence & Pre-Load Metrics - -> Proposed improvements focused on helping users make informed decisions **before** loading a model. - -### Tier A — Pre-Load Decision Metrics _(est. 45 min)_ - -| ID | Feature | Description | -| --- | ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------- | -| N1 | **Estimated RAM per model** | Approximate from disk size: Q4_K_M ≈ 1.2×disk in RAM. Show on every model card (e.g., `~22 GB RAM`), not just running models. | -| N2 | **"Will it fit?" indicator** | Compare estimated RAM vs `system.memory.free + cached`. Color-code: 🟢 Fits, 🟡 Tight (80–100%), 🔴 Won't fit. Show on Load button or as badge. | -| N3 | **Aggregate loaded model RAM** | Sum VRAM of all running models. Display at top of models panel: "3 models loaded · 28.5 GB VRAM". | - -### Tier B — Rich Model Metadata _(est. 60 min)_ - -| ID | Feature | Description | -| --- | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------- | ---------------- | ------------------------------------------ | -| N4 | **RAM budget bar** | Horizontal stacked bar: `[OS+Apps | Model A (loaded) | Model B (loaded) | Free]`. Instant visual of memory headroom. | -| N5 | **Context window size** | Fetch `context_length` from Ollama `/api/show` → `model_info`. Display on card (e.g., `128k ctx`). Critical for knowing max prompt length. | - -### Tier C — Model Intelligence Badges _(est. 45 min)_ - -| ID | Feature | Description | -| --- | --------------------------- | --------------------------------------------------------------------------------------------------------------------------------- | -| N6 | **`<think>` warning badge** | If model is DeepSeek R1 family, show ⚠️ badge: "Emits `<think>` traces — strip before JSON.parse". 
Prevents silent JSON failures. | -| N7 | **Vision model indicator** | If model is multimodal (llava, qwen2.5vl), show 👁 badge. These need image input — text-only prompts are suboptimal. | -| N8 | **Architecture badge** | Show model arch (llama, qwen2, phi3, deepseek2) as subtle pill on the card. Currently buried in expanded details. | -| N9 | **Sort/order models** | Dropdown to sort by: name, size, parameters, running status, last modified. Currently uses Ollama's default order. | -| N10 | **Ollama version display** | Call `/api/version`. Show in Ollama status card. Useful for debugging model compatibility. | - -### Tier D — Runtime Metrics & UX _(est. 30 min)_ - -| ID | Feature | Description | -| --- | --------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| N11 | **Last known tok/s per model** | Persist `StreamMetrics.tokensPerSec` in localStorage keyed by model. Show on card (e.g., `~45 tok/s`). Compare speeds without re-benchmarking. | -| N12 | **Auto-unload countdown** | Replace static `Expires: 3:45 PM` with live countdown: `Unloads in 4m 32s`. More actionable. | -| N13 | **Session stats per model** | Track prompts sent + tokens generated per model in session. Show in expanded details. | -| N14 | **Delete confirmation + reclaim** | Show "Delete qwen2.5-coder:32b? Reclaim 18.5 GB disk." before deleting. Currently no confirmation. | -| N15 | **Simultaneous load suggestions** | Based on available RAM, suggest which models can be co-loaded. E.g., "Can co-load llama3.1:8b + qwen2.5-coder:32b (28 GB, 20 GB free)". 
| - -### Implementation Plan - -| Sprint | Items | Focus | Effort | -| ------ | ----------------------- | ------------------------ | ------- | -| 8 | N1, N2, N3 | Pre-load RAM estimates | ~45 min | -| 9 | N4, N5 | RAM bar + context window | ~60 min | -| 10 | N6, N7, N8, N9, N10 | Badges + sort + version | ~45 min | -| 11 | N11, N12, N13, N14, N15 | Runtime metrics + UX | ~30 min | diff --git a/__LOCAL_LLMs/docs/README.md b/__LOCAL_LLMs/docs/README.md index 95d76514..9c1cb8e1 100644 --- a/__LOCAL_LLMs/docs/README.md +++ b/__LOCAL_LLMs/docs/README.md @@ -2,7 +2,7 @@ > Complete guide for the local AI inference stack on the ByteLyst development machine. > Hardware: **Apple M4 Pro · 48 GB LPDDR5 · macOS Tahoe** -> Last updated: 2026-02-19 +> Last updated: 2026-02-21 --- @@ -16,8 +16,11 @@ ollama serve # or: brew services start ollama ollama run qwen2.5-coder:32b # best coding model for this hardware # 3. Launch Mission Control dashboard -cd __LOCAL_LLMs/dashboard && npm run dev -- -p 3100 -# Open http://localhost:3100 +cd __LOCAL_LLMs/dashboard && npm run dev +# Open http://localhost:3000 + +# 4. 
(Optional) Set up TTS +cd __LOCAL_LLMs && bash setup-tts.sh ``` --- @@ -35,6 +38,7 @@ cd __LOCAL_LLMs/dashboard && npm run dev -- -p 3100 | 07 | [Model Recommendations](07-model-recommendations.md) | Tiered model guide by use case, size, and quality for M4 Pro 48GB | | 08 | [Troubleshooting & Corporate Proxy](08-troubleshooting.md) | Common issues, Forcepoint proxy workarounds, MLX warnings | | 09 | [Environment Variables](09-environment-variables.md) | All config vars for Ollama, Whisper, dashboard, evals | +| 10 | [Text-to-Speech](10-text-to-speech.md) | Orpheus TTS via Ollama, Qwen3-TTS 0.6B, setup, corporate proxy | --- @@ -53,28 +57,42 @@ __LOCAL_LLMs/ │ ├── 06-extraction-service-evals.md │ ├── 07-model-recommendations.md │ ├── 08-troubleshooting.md -│ └── 09-environment-variables.md -├── dashboard/ ← Next.js Mission Control app (port 3100) -│ ├── src/app/page.tsx ← main dashboard UI +│ ├── 09-environment-variables.md +│ └── 10-text-to-speech.md +├── dashboard/ ← Next.js Mission Control app (port 3000) +│ ├── src/app/(mission-control)/ ← Mission Control page + memory drilldown │ ├── src/app/api/ollama/route.ts ← Ollama API proxy (list, load, unload, generate) │ ├── src/app/api/whisper/route.ts ← Whisper binary/model discovery -│ └── src/app/api/system/route.ts ← System info (chip, RAM, disk, brew) +│ ├── src/app/api/system/route.ts ← System info (chip, RAM, disk, brew) +│ └── src/app/api/system/memory/route.ts ← Memory drilldown (vm_stat + top processes) +├── setup-tts.sh ← One-shot TTS setup for fresh laptop +├── download-tts-models.sh ← Download model weights (uses hf-mirror.com) +├── test_orpheus_tts.py ← Orpheus TTS test (Ollama + SNAC decoder) +├── test_qwen_tts.py ← Qwen3-TTS 0.6B test (direct Python, MPS/CPU) +├── .venv-qwen-tts/ ← Python 3.12 venv for TTS (gitignored) +├── models/ ← Downloaded TTS model weights (gitignored) └── LOCAL_LLMs_setup_mac_m4_48gb.md ← original doc (preserved, see docs/ for latest) ``` --- -## Current Installation Status 
(2026-02-19) +## Current Installation Status (2026-02-21) -| Component | Version | Status | Disk Usage | -| ----------------------------------- | ---------- | ----------------------------- | ---------- | -| Ollama | 0.16.2 | ✅ Installed via brew | — | -| qwen2.5-coder:32b | — | ✅ Downloaded | 19 GB | -| llama3.1:8b | — | ✅ Downloaded | 4.9 GB | -| whisper-cpp | 1.8.3 | ✅ Installed via brew | 9.6 MB | -| whisper model (ggml-large-v3-turbo) | — | ❌ Blocked by corporate proxy | — | -| ffmpeg | 8.0.1 | ✅ Installed via brew | 53.3 MB | -| Mission Control Dashboard | Next.js 16 | ✅ Built, runs on :3100 | — | +| Component | Version | Status | Disk Usage | +| ----------------------------------- | ---------- | ------------------------------------------ | ---------- | +| Ollama | 0.16.2 | ✅ Installed via brew | — | +| qwen2.5-coder:32b | — | ✅ Downloaded | 19 GB | +| qwen2.5-coder:7b | — | ✅ Downloaded | 4.7 GB | +| deepseek-r1:32b | — | ✅ Downloaded | 19 GB | +| llama3.1:8b | — | ✅ Downloaded | 4.9 GB | +| sematre/orpheus:en (TTS) | — | ✅ Downloaded via Ollama | 4 GB | +| whisper-cpp | 1.8.3 | ✅ Installed via brew | 9.6 MB | +| whisper model (ggml-large-v3-turbo) | — | ✅ Downloaded via hf-mirror.com | 1.5 GB | +| ffmpeg | 8.0.1 | ✅ Installed via brew | 53.3 MB | +| Python 3.12 (TTS venv) | 3.12.12 | ✅ Installed via brew + venv created | ~2 GB | +| SNAC decoder (TTS) | — | ✅ Downloaded via hf-mirror.com | 76 MB | +| Qwen3-TTS 0.6B | — | ✅ Downloaded via hf-mirror.com | 1.7 GB | +| Mission Control Dashboard | Next.js 16 | ✅ Built, runs on :3000 (memory drilldown) | — | --- diff --git a/__LOCAL_LLMs/download-tts-models.sh b/__LOCAL_LLMs/download-tts-models.sh new file mode 100755 index 00000000..d150a50f --- /dev/null +++ b/__LOCAL_LLMs/download-tts-models.sh @@ -0,0 +1,174 @@ +#!/bin/bash +# ============================================================ +# Download TTS Model Weights +# +# Downloads SNAC decoder + Qwen3-TTS from HuggingFace. 
+# Uses hf-mirror.com which works through corporate proxy. +# Falls back to huggingface.co if mirror is unreachable. +# +# No Python venv required — uses curl only. +# +# Usage: +# bash download-tts-models.sh # download all +# bash download-tts-models.sh snac # SNAC decoder only +# bash download-tts-models.sh qwen # Qwen3-TTS only +# ============================================================ +set -e + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +MODELS_DIR="$SCRIPT_DIR/models" + +GREEN='\033[0;32m' +RED='\033[0;31m' +NC='\033[0m' +ok() { echo -e "${GREEN}✓${NC} $1"; } +fail() { echo -e "${RED}✗${NC} $1"; exit 1; } + +echo "=== TTS Model Downloader ===" +echo "" + +# ── Pick HuggingFace source ───────────────────────────────── +# Try hf-mirror.com first (works through corporate proxy) +# Fall back to huggingface.co (requires non-corporate network) +HF_BASE="" +echo "Testing hf-mirror.com..." +if curl -k -s --max-time 5 "https://hf-mirror.com/hubertsiuzdak/snac_24khz/raw/main/config.json" | python3 -c "import sys,json; json.load(sys.stdin)" &>/dev/null; then + HF_BASE="https://hf-mirror.com" + ok "Using hf-mirror.com (works through corporate proxy)" +else + echo "Mirror unavailable. Testing huggingface.co..." + if curl -s --max-time 5 "https://huggingface.co/api/models/hubertsiuzdak/snac_24khz" -o /dev/null 2>/dev/null; then + HF_BASE="https://huggingface.co" + ok "Using huggingface.co directly" + else + fail "Cannot reach hf-mirror.com or huggingface.co. If on corporate network, try from home WiFi." + fi +fi +echo "" + +mkdir -p "$MODELS_DIR" + +# ── Helper: download with validation ──────────────────────── +download_file() { + local URL="$1" + local DEST="$2" + local DESC="$3" + + echo " Downloading $DESC..." 
+ curl -k -L --progress-bar -o "$DEST" "$URL" + + # Verify not an HTML block page + FILE_HEAD=$(head -c 50 "$DEST" 2>/dev/null) + if echo "$FILE_HEAD" | grep -qi "/dev/null || stat -c%s "$MODELS_DIR/snac_24khz/pytorch_model.bin" 2>/dev/null || echo 0) + if [ "$SIZE" -gt 1000000 ]; then + ok "Already downloaded ($(echo "scale=1; $SIZE/1048576" | bc) MB)" + echo "" + return + fi + fi + + download_file "$HF_BASE/hubertsiuzdak/snac_24khz/raw/main/config.json" \ + "$MODELS_DIR/snac_24khz/config.json" "config.json" + + download_file "$HF_BASE/hubertsiuzdak/snac_24khz/resolve/main/pytorch_model.bin" \ + "$MODELS_DIR/snac_24khz/pytorch_model.bin" "pytorch_model.bin (~76 MB)" + + ok "SNAC decoder downloaded" + echo "" +} + +# ── 2. Qwen3-TTS Tokenizer ────────────────────────────────── +download_qwen_tokenizer() { + echo "=== [Qwen3-TTS] Tokenizer (~650 MB) ===" + local DIR="$MODELS_DIR/Qwen3-TTS-Tokenizer-12Hz" + mkdir -p "$DIR" + + if [ -f "$DIR/model.safetensors" ]; then + SIZE=$(stat -f%z "$DIR/model.safetensors" 2>/dev/null || stat -c%s "$DIR/model.safetensors" 2>/dev/null || echo 0) + if [ "$SIZE" -gt 100000000 ]; then + ok "Already downloaded ($(echo "scale=1; $SIZE/1048576" | bc) MB)" + echo "" + return + fi + fi + + for f in config.json configuration.json preprocessor_config.json; do + download_file "$HF_BASE/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/$f" \ + "$DIR/$f" "$f" + done + + download_file "$HF_BASE/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/model.safetensors" \ + "$DIR/model.safetensors" "model.safetensors (~650 MB)" + + ok "Qwen3-TTS Tokenizer downloaded" + echo "" +} + +# ── 3. 
Qwen3-TTS 0.6B model ───────────────────────────────── +download_qwen_model() { + echo "=== [Qwen3-TTS] 0.6B CustomVoice (~1.2 GB) ===" + local DIR="$MODELS_DIR/Qwen3-TTS-12Hz-0.6B-CustomVoice" + mkdir -p "$DIR" + + if [ -f "$DIR/model.safetensors" ]; then + SIZE=$(stat -f%z "$DIR/model.safetensors" 2>/dev/null || stat -c%s "$DIR/model.safetensors" 2>/dev/null || echo 0) + if [ "$SIZE" -gt 100000000 ]; then + ok "Already downloaded ($(echo "scale=1; $SIZE/1048576" | bc) MB)" + echo "" + return + fi + fi + + for f in config.json generation_config.json; do + download_file "$HF_BASE/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/$f" \ + "$DIR/$f" "$f" + done + + download_file "$HF_BASE/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/model.safetensors" \ + "$DIR/model.safetensors" "model.safetensors (~1.2 GB)" + + ok "Qwen3-TTS 0.6B downloaded" + echo "" +} + +# ── Run downloads ──────────────────────────────────────────── +case "${1:-all}" in + snac) + download_snac + ;; + qwen) + download_qwen_tokenizer + download_qwen_model + ;; + all) + download_snac + download_qwen_tokenizer + download_qwen_model + ;; + *) + echo "Usage: bash download-tts-models.sh [snac|qwen|all]" + exit 1 + ;; +esac + +# ── Summary ────────────────────────────────────────────────── +echo "=== Downloads complete ===" +echo "" +echo "Disk usage:" +du -sh "$MODELS_DIR"/* 2>/dev/null | sed 's/^/ /' +echo "" +echo "Test commands:" +echo " .venv-qwen-tts/bin/python test_orpheus_tts.py # Orpheus via Ollama" +echo " .venv-qwen-tts/bin/python test_qwen_tts.py # Qwen3-TTS direct" diff --git a/__LOCAL_LLMs/setup-tts.sh b/__LOCAL_LLMs/setup-tts.sh new file mode 100755 index 00000000..852c7e0a --- /dev/null +++ b/__LOCAL_LLMs/setup-tts.sh @@ -0,0 +1,256 @@ +#!/bin/bash +# ============================================================ +# TTS Setup — One-Shot Script for Fresh Laptop +# +# Sets up Orpheus TTS (via Ollama) and Qwen3-TTS (direct Python) +# on Apple Silicon Macs. Works through corporate proxy. 
+# +# What this does: +# 1. Installs Python 3.12 via Homebrew (if missing) +# 2. Creates Python venv with TTS packages +# 3. Pulls Orpheus TTS model via Ollama +# 4. Downloads SNAC audio decoder via hf-mirror.com +# 5. (Optional) Downloads Qwen3-TTS 0.6B via hf-mirror.com +# +# Prerequisites: +# - macOS with Apple Silicon (M1/M2/M3/M4) +# - Homebrew installed +# - Ollama installed (brew install ollama) +# +# Usage: +# bash setup-tts.sh +# +# After setup, test with: +# .venv-qwen-tts/bin/python test_orpheus_tts.py +# afplay test_orpheus_tara.wav +# ============================================================ +set -e + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +VENV="$SCRIPT_DIR/.venv-qwen-tts" +MODELS_DIR="$SCRIPT_DIR/models" + +# HuggingFace mirror that works through corporate proxy +HF_MIRROR="https://hf-mirror.com" + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +ok() { echo -e "${GREEN}✓${NC} $1"; } +warn() { echo -e "${YELLOW}⚠${NC} $1"; } +fail() { echo -e "${RED}✗${NC} $1"; exit 1; } +step() { echo -e "\n${GREEN}=== $1 ===${NC}"; } + +echo "╔══════════════════════════════════════════════╗" +echo "║ TTS Setup — Local Speech Generation ║" +echo "║ Orpheus TTS (Ollama) + Qwen3-TTS (Python) ║" +echo "╚══════════════════════════════════════════════╝" +echo "" + +# ── 0. Check prerequisites ────────────────────────────────── +step "Checking prerequisites" + +# Homebrew +if ! command -v brew &>/dev/null; then + fail "Homebrew not found. Install: /bin/bash -c \"\$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)\"" +fi +ok "Homebrew" + +# Ollama +if ! command -v ollama &>/dev/null; then + warn "Ollama not found. Installing..." + brew install ollama +fi +ok "Ollama installed" + +# Check if Ollama is running +if ! curl -s --max-time 2 http://localhost:11434/api/tags &>/dev/null; then + warn "Ollama not running. Starting..." + ollama serve &>/dev/null & + sleep 3 + if ! 
curl -s --max-time 2 http://localhost:11434/api/tags &>/dev/null; then + fail "Could not start Ollama. Try manually: ollama serve" + fi +fi +ok "Ollama running on port 11434" + +# Apple Silicon check +ARCH=$(uname -m) +if [ "$ARCH" != "arm64" ]; then + warn "Not Apple Silicon ($ARCH). MPS acceleration won't be available." +fi + +# ── 1. Install Python 3.12 ────────────────────────────────── +step "Python 3.12" + +PYTHON_CMD="" +# Check various Python 3.12 locations +for cmd in python3.12 /opt/homebrew/bin/python3.12 /usr/local/bin/python3.12; do + if command -v "$cmd" &>/dev/null; then + PYTHON_CMD="$cmd" + break + fi +done + +if [ -z "$PYTHON_CMD" ]; then + warn "Python 3.12 not found. Installing via Homebrew..." + brew install python@3.12 + PYTHON_CMD="/opt/homebrew/bin/python3.12" +fi + +PYTHON_VER=$("$PYTHON_CMD" --version 2>&1) +ok "$PYTHON_VER at $PYTHON_CMD" + +# ── 2. Create venv ────────────────────────────────────────── +step "Python virtual environment" + +if [ -f "$VENV/bin/python" ]; then + ok "Venv exists at $VENV" +else + echo "Creating venv..." + "$PYTHON_CMD" -m venv "$VENV" + ok "Venv created at $VENV" +fi + +# ── 3. Install Python packages ────────────────────────────── +step "Python packages" + +# Check if snac is installed (quick proxy for all packages) +if "$VENV/bin/python" -c "import snac" &>/dev/null; then + ok "Packages already installed (snac, torch, etc.)" +else + echo "Installing packages (this may take a few minutes)..." + "$VENV/bin/pip" install -U pip --quiet + "$VENV/bin/pip" install -U snac qwen-tts --quiet + ok "Packages installed" +fi + +# ── 4. Pull Orpheus TTS model ─────────────────────────────── +step "Orpheus TTS model (Ollama)" + +if ollama list 2>/dev/null | grep -q "orpheus"; then + ok "Orpheus TTS already downloaded" +else + echo "Pulling sematre/orpheus:en (4 GB)..." + NO_PROXY="ollama.com,registry.ollama.ai" ollama pull sematre/orpheus:en + ok "Orpheus TTS downloaded" +fi + +# ── 5. 
Download SNAC decoder ──────────────────────────────── +step "SNAC 24kHz audio decoder (~76 MB)" + +mkdir -p "$MODELS_DIR/snac_24khz" + +if [ -f "$MODELS_DIR/snac_24khz/pytorch_model.bin" ]; then + SIZE=$(stat -f%z "$MODELS_DIR/snac_24khz/pytorch_model.bin" 2>/dev/null || stat -c%s "$MODELS_DIR/snac_24khz/pytorch_model.bin" 2>/dev/null) + if [ "$SIZE" -gt 1000000 ]; then + ok "SNAC decoder already downloaded ($(echo "scale=1; $SIZE/1048576" | bc) MB)" + else + warn "SNAC file looks corrupted (${SIZE} bytes). Re-downloading..." + rm -f "$MODELS_DIR/snac_24khz/pytorch_model.bin" + fi +fi + +if [ ! -f "$MODELS_DIR/snac_24khz/pytorch_model.bin" ]; then + echo "Downloading config.json..." + curl -k -sL -o "$MODELS_DIR/snac_24khz/config.json" \ + "$HF_MIRROR/hubertsiuzdak/snac_24khz/raw/main/config.json" + + # Verify config is JSON (not an HTML block page) + if ! python3 -c "import json; json.load(open('$MODELS_DIR/snac_24khz/config.json'))" &>/dev/null; then + fail "Downloaded config.json is not valid JSON. The mirror may be blocked. Try from home network." + fi + ok "config.json downloaded" + + echo "Downloading pytorch_model.bin (~76 MB)..." + curl -k -L --progress-bar -o "$MODELS_DIR/snac_24khz/pytorch_model.bin" \ + "$HF_MIRROR/hubertsiuzdak/snac_24khz/resolve/main/pytorch_model.bin" + + # Verify it's a real model file (zip/pytorch format), not HTML + FILE_TYPE=$(file -b "$MODELS_DIR/snac_24khz/pytorch_model.bin" | head -c 20) + if echo "$FILE_TYPE" | grep -qi "html"; then + rm -f "$MODELS_DIR/snac_24khz/pytorch_model.bin" + fail "Downloaded model is HTML (proxy block page). Try from home network." + fi + ok "SNAC decoder downloaded" +fi + +# Verify SNAC loads in Python +echo "Verifying SNAC decoder loads..." 
+if "$VENV/bin/python" -c " +import snac, torch +model = snac.SNAC.from_pretrained('$MODELS_DIR/snac_24khz') +print(f'SNAC: {sum(p.numel() for p in model.parameters())/1e6:.1f}M parameters') +" 2>/dev/null; then + ok "SNAC decoder verified" +else + fail "SNAC decoder failed to load. Delete models/snac_24khz/ and re-run." +fi + +# ── 6. (Optional) Download Qwen3-TTS ──────────────────────── +step "Qwen3-TTS 0.6B (optional, ~1.7 GB total)" + +QWEN_TOKENIZER_DIR="$MODELS_DIR/Qwen3-TTS-Tokenizer-12Hz" +QWEN_MODEL_DIR="$MODELS_DIR/Qwen3-TTS-12Hz-0.6B-CustomVoice" + +if [ -d "$QWEN_MODEL_DIR" ] && [ -f "$QWEN_MODEL_DIR/config.json" ]; then + ok "Qwen3-TTS already downloaded" +else + echo "Qwen3-TTS 0.6B requires ~1.7 GB download (tokenizer + model)." + echo "This is optional — Orpheus TTS (above) works without it." + read -p "Download Qwen3-TTS? [y/N] " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + # Tokenizer (~650 MB) + echo "Downloading Qwen3-TTS Tokenizer (~650 MB)..." + mkdir -p "$QWEN_TOKENIZER_DIR" + for f in config.json configuration.json preprocessor_config.json; do + curl -k -sL -o "$QWEN_TOKENIZER_DIR/$f" \ + "$HF_MIRROR/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/$f" 2>/dev/null || true + done + curl -k -L --progress-bar -o "$QWEN_TOKENIZER_DIR/model.safetensors" \ + "$HF_MIRROR/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/model.safetensors" + ok "Tokenizer downloaded" + + # Model + echo "Downloading Qwen3-TTS 0.6B (~1.2 GB)..." + mkdir -p "$QWEN_MODEL_DIR" + for f in config.json generation_config.json; do + curl -k -sL -o "$QWEN_MODEL_DIR/$f" \ + "$HF_MIRROR/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/$f" 2>/dev/null || true + done + curl -k -L --progress-bar -o "$QWEN_MODEL_DIR/model.safetensors" \ + "$HF_MIRROR/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/model.safetensors" + ok "Qwen3-TTS 0.6B downloaded" + else + warn "Skipped. You can re-run this script later to download." 
+ fi +fi + +# ── Summary ────────────────────────────────────────────────── +step "Setup Complete" + +echo "" +echo "Installed components:" +echo " Orpheus TTS (Ollama): $(ollama list 2>/dev/null | grep orpheus | awk '{print $NF}' || echo 'ready')" +echo " SNAC decoder: $MODELS_DIR/snac_24khz/" +if [ -d "$QWEN_MODEL_DIR" ] && [ -f "$QWEN_MODEL_DIR/config.json" ]; then + echo " Qwen3-TTS 0.6B: $QWEN_MODEL_DIR/" +else + echo " Qwen3-TTS 0.6B: (not installed — re-run setup to add)" +fi +echo "" +echo "Disk usage:" +du -sh "$MODELS_DIR"/* 2>/dev/null | sed 's/^/ /' +echo "" +echo "Test commands:" +echo " $VENV/bin/python $SCRIPT_DIR/test_orpheus_tts.py" +echo " afplay test_orpheus_tara.wav" +if [ -d "$QWEN_MODEL_DIR" ]; then + echo " $VENV/bin/python $SCRIPT_DIR/test_qwen_tts.py" +fi +echo "" +echo "Voices: tara, leah, jess, leo, dan, mia, zac, zoe" +echo "Emotion: , , , , , , " diff --git a/__LOCAL_LLMs/start-dashboard.sh b/__LOCAL_LLMs/start-dashboard.sh new file mode 100755 index 00000000..f4404126 --- /dev/null +++ b/__LOCAL_LLMs/start-dashboard.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# ============================================================ +# Start Mission Control Dashboard + Ollama +# +# Usage: +# bash start-dashboard.sh # start dashboard + ensure Ollama running +# bash start-dashboard.sh stop # stop dashboard +# bash start-dashboard.sh status # check status +# ============================================================ + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +DASHBOARD_DIR="$SCRIPT_DIR/dashboard" +PORT=3000 +OLLAMA_URL="http://localhost:11434" + +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +NC='\033[0m' +ok() { echo -e "${GREEN}✓${NC} $1"; } +warn() { echo -e "${YELLOW}⚠${NC} $1"; } +fail() { echo -e "${RED}✗${NC} $1"; } + +case "${1:-start}" in + stop) + echo "Stopping dashboard..." 
+ PID=$(lsof -ti :$PORT 2>/dev/null) + if [ -n "$PID" ]; then + kill "$PID" 2>/dev/null + ok "Dashboard stopped (PID $PID)" + else + warn "Dashboard not running on port $PORT" + fi + exit 0 + ;; + + status) + echo "=== Status ===" + # Ollama + if curl -s --max-time 2 "$OLLAMA_URL/api/tags" &>/dev/null; then + MODELS=$(curl -s "$OLLAMA_URL/api/tags" | python3 -c "import sys,json; print(len(json.load(sys.stdin).get('models',[])))" 2>/dev/null || echo "?") + ok "Ollama running ($MODELS models)" + else + fail "Ollama not running" + fi + # Dashboard + if curl -s --max-time 2 "http://localhost:$PORT" &>/dev/null; then + ok "Dashboard running at http://localhost:$PORT" + else + fail "Dashboard not running" + fi + exit 0 + ;; + + start) + echo "=== Starting Mission Control ===" + echo "" + + # 1. Ensure Ollama is running + if curl -s --max-time 2 "$OLLAMA_URL/api/tags" &>/dev/null; then + ok "Ollama already running" + else + echo "Starting Ollama..." + ollama serve &>/dev/null & + sleep 2 + if curl -s --max-time 2 "$OLLAMA_URL/api/tags" &>/dev/null; then + ok "Ollama started" + else + fail "Could not start Ollama. Try: ollama serve" + fi + fi + + # 2. Check if dashboard already running + if curl -s --max-time 2 "http://localhost:$PORT" &>/dev/null; then + ok "Dashboard already running at http://localhost:$PORT" + exit 0 + fi + + # 3. Install deps if needed + if [ ! -d "$DASHBOARD_DIR/node_modules" ]; then + echo "Installing dependencies..." + (cd "$DASHBOARD_DIR" && npm install --silent) + ok "Dependencies installed" + fi + + # 4. Start dashboard + echo "Starting dashboard on port $PORT..." 
+ (cd "$DASHBOARD_DIR" && npm run dev &>/dev/null &) + + # Wait for it to be ready + for i in $(seq 1 15); do + if curl -s --max-time 1 "http://localhost:$PORT" &>/dev/null; then + ok "Dashboard ready at http://localhost:$PORT" + echo "" + echo "Open: http://localhost:$PORT" + echo "Stop: bash start-dashboard.sh stop" + exit 0 + fi + sleep 1 + done + + fail "Dashboard did not start within 15s. Check: cd dashboard && npm run dev" + exit 1 + ;; + + *) + echo "Usage: bash start-dashboard.sh [start|stop|status]" + exit 1 + ;; +esac diff --git a/__LOCAL_LLMs/test_orpheus_tts.py b/__LOCAL_LLMs/test_orpheus_tts.py new file mode 100644 index 00000000..17f05887 --- /dev/null +++ b/__LOCAL_LLMs/test_orpheus_tts.py @@ -0,0 +1,189 @@ +""" +Test Orpheus TTS via Ollama + SNAC decoder. + +Prerequisites: + 1. bash setup-tts.sh (one-shot: installs everything) + -- OR manually -- + 1. ollama pull sematre/orpheus:en + 2. bash download-tts-models.sh snac (downloads SNAC via hf-mirror.com) + 3. ollama serve (must be running) + +Usage: + .venv-qwen-tts/bin/python test_orpheus_tts.py +""" +import os +import re +import time +import json +import struct +import wave +import urllib.request + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +SNAC_MODEL_DIR = os.path.join(SCRIPT_DIR, "models", "snac_24khz") +OLLAMA_URL = "http://localhost:11434" +MODEL = "sematre/orpheus:en" + +AUDIO_TOKEN_RE = re.compile(r"") + + +def check_ollama(): + """Verify Ollama is running and model is available.""" + try: + req = urllib.request.Request(f"{OLLAMA_URL}/api/tags") + with urllib.request.urlopen(req, timeout=3) as resp: + data = json.loads(resp.read()) + names = [m["name"] for m in data.get("models", [])] + if not any(MODEL in n for n in names): + print(f"ERROR: Model '{MODEL}' not found. 
Run: ollama pull {MODEL}") + return False + return True + except Exception as e: + print(f"ERROR: Cannot connect to Ollama at {OLLAMA_URL}: {e}") + print("Run: ollama serve") + return False + + +def check_snac(): + """Verify SNAC model is downloaded.""" + if not os.path.isdir(SNAC_MODEL_DIR): + print(f"ERROR: SNAC decoder not found at {SNAC_MODEL_DIR}") + print("Run: bash setup-tts.sh (or: bash download-tts-models.sh snac)") + return False + return True + + +def load_snac(): + """Load SNAC audio codec.""" + import torch + import snac + + print(f"Loading SNAC decoder from {SNAC_MODEL_DIR}...") + model = snac.SNAC.from_pretrained(SNAC_MODEL_DIR) + model.eval() + return model + + +def generate_tokens(text: str, voice: str = "tara") -> str: + """Call Ollama to generate audio tokens from text.""" + prompt = f"<|begin_of_text|>{voice}: {text}<|eot_id|>" + + payload = json.dumps({ + "model": MODEL, + "prompt": prompt, + "stream": False, + "options": { + "temperature": 0.6, + "top_p": 0.9, + "repeat_penalty": 1.1, + "num_predict": 10240, + "stop": ["<|end_of_text|>"], + }, + }).encode() + + req = urllib.request.Request( + f"{OLLAMA_URL}/api/generate", + data=payload, + headers={"Content-Type": "application/json"}, + ) + + print("Generating audio tokens via Ollama...") + t0 = time.time() + with urllib.request.urlopen(req, timeout=120) as resp: + result = json.loads(resp.read()) + + elapsed = time.time() - t0 + response_text = result.get("response", "") + token_count = len(AUDIO_TOKEN_RE.findall(response_text)) + print(f"Generated {token_count} audio tokens in {elapsed:.1f}s") + return response_text + + +def decode_tokens(response_text: str, snac_model) -> tuple: + """Convert audio tokens to WAV audio.""" + import torch + + tokens = AUDIO_TOKEN_RE.findall(response_text) + if not tokens: + print("ERROR: No audio tokens found in response") + return None, 0 + + audio_ids = [ + int(tok) - 10 - ((idx % 7) * 4096) + for idx, tok in enumerate(tokens) + ] + + # Trim to multiple of 7 
+ audio_ids = audio_ids[: len(audio_ids) // 7 * 7] + if len(audio_ids) == 0: + print("ERROR: Not enough audio tokens to decode") + return None, 0 + + audio_tensor = torch.tensor(audio_ids, dtype=torch.int32).reshape(-1, 7) + codes_0 = audio_tensor[:, 0].unsqueeze(0) + codes_1 = torch.stack((audio_tensor[:, 1], audio_tensor[:, 4])).t().flatten().unsqueeze(0) + codes_2 = ( + torch.stack((audio_tensor[:, 2], audio_tensor[:, 3], audio_tensor[:, 5], audio_tensor[:, 6])) + .t() + .flatten() + .unsqueeze(0) + ) + + print("Decoding audio...") + with torch.inference_mode(): + audio_hat = snac_model.decode([codes_0, codes_1, codes_2]) + + audio_np = audio_hat[0].squeeze().numpy() + return audio_np, 24000 + + +def save_wav(audio_np, sample_rate: int, path: str): + """Save numpy audio array as 16-bit WAV.""" + import numpy as np + + # Normalize to int16 + audio_int16 = (audio_np * 32767).clip(-32768, 32767).astype(np.int16) + + with wave.open(path, "w") as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(sample_rate) + wf.writeframes(audio_int16.tobytes()) + + duration = len(audio_int16) / sample_rate + print(f"Saved {path} ({duration:.1f}s, {sample_rate} Hz)") + + +def main(): + print("=== Orpheus TTS Test (Ollama + SNAC) ===\n") + + if not check_ollama(): + return + if not check_snac(): + return + + snac_model = load_snac() + + # Voices: tara, leah, jess, leo, dan, mia, zac, zoe + tests = [ + ("Hello! This is Orpheus text to speech, running entirely on your Mac through Ollama.", "tara"), + (" That's amazing! Local AI speech generation without any cloud services!", "leo"), + ] + + for i, (text, voice) in enumerate(tests): + print(f"\n--- Test {i+1}: voice={voice} ---") + print(f"Text: {text[:80]}...") + + response = generate_tokens(text, voice) + audio, sr = decode_tokens(response, snac_model) + + if audio is not None: + outpath = os.path.join(SCRIPT_DIR, f"test_orpheus_{voice}.wav") + save_wav(audio, sr, outpath) + + print("\n=== Done! 
Open the .wav files to listen. ===") + print("Play with: afplay test_orpheus_tara.wav") + + +if __name__ == "__main__": + main() diff --git a/__LOCAL_LLMs/test_qwen_tts.py b/__LOCAL_LLMs/test_qwen_tts.py new file mode 100644 index 00000000..4db74545 --- /dev/null +++ b/__LOCAL_LLMs/test_qwen_tts.py @@ -0,0 +1,84 @@ +""" +Test Qwen3-TTS 0.6B on Apple Silicon (MPS or CPU fallback). + +Prerequisites: + bash setup-tts.sh (one-shot: installs everything) + -- OR manually -- + bash download-tts-models.sh (downloads models via hf-mirror.com) + +Usage: + .venv-qwen-tts/bin/python test_qwen_tts.py +""" +import os +import time +import torch +import soundfile as sf +from qwen_tts import Qwen3TTSModel + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +MODEL_PATH = os.path.join(SCRIPT_DIR, "models", "Qwen3-TTS-12Hz-0.6B-CustomVoice") + +# Check model exists locally +if not os.path.isdir(MODEL_PATH): + print(f"ERROR: Model not found at {MODEL_PATH}") + print("Run: bash setup-tts.sh (or: bash download-tts-models.sh qwen)") + raise SystemExit(1) + +# Pick device: MPS if available, else CPU +if torch.backends.mps.is_available(): + device = "mps" + dtype = torch.float32 # MPS doesn't support bfloat16 + print(f"Using MPS (Apple Metal GPU)") +else: + device = "cpu" + dtype = torch.float32 + print(f"Using CPU") + +print(f"Loading Qwen3-TTS-12Hz-0.6B-CustomVoice on {device}...") +t0 = time.time() + +model = Qwen3TTSModel.from_pretrained( + MODEL_PATH, + device_map=device, + dtype=dtype, +) + +print(f"Model loaded in {time.time() - t0:.1f}s") +print(f"Supported speakers: {model.get_supported_speakers()}") +print(f"Supported languages: {model.get_supported_languages()}") + +# Test 1: English with a built-in speaker +text = "Hello! Welcome to the local LLM dashboard. I am Qwen three T T S, running entirely on your Mac." 
+print(f"\nGenerating speech for: {text[:60]}...") + +t1 = time.time() +wavs, sr = model.generate_custom_voice( + text=text, + language="English", + speaker="Chelsie", +) +elapsed = time.time() - t1 +print(f"Generated in {elapsed:.1f}s, sample rate={sr}, audio length={len(wavs[0])/sr:.1f}s") + +output_path = "test_output_english.wav" +sf.write(output_path, wavs[0], sr) +print(f"Saved to {output_path}") + +# Test 2: English with emotion instruction +text2 = "This is absolutely incredible! I can't believe how well this works on a local machine!" +print(f"\nGenerating with emotion: {text2[:60]}...") + +t2 = time.time() +wavs2, sr2 = model.generate_custom_voice( + text=text2, + language="English", + speaker="Chelsie", + instruct="Speak with excitement and enthusiasm", +) +elapsed2 = time.time() - t2 +print(f"Generated in {elapsed2:.1f}s, audio length={len(wavs2[0])/sr2:.1f}s") + +sf.write("test_output_excited.wav", wavs2[0], sr2) +print("Saved to test_output_excited.wav") + +print("\nDone! Open the .wav files to listen.") diff --git a/__LOCAL_LLMs/windows_specific/razer-blade-18-spec.md b/__LOCAL_LLMs/windows_specific/razer-blade-18-spec.md new file mode 100644 index 00000000..3919e6df --- /dev/null +++ b/__LOCAL_LLMs/windows_specific/razer-blade-18-spec.md @@ -0,0 +1,387 @@ +Here is a complete engineering-grade specification document for the exact configuration you shared: + +⸻ + +Razer Blade 18 (Model: RZ09-05299ER9-R3U1) — Detailed Specification Document + +Manufacturer: Razer Inc. +Product Line: Blade Series +Model Number: RZ09-05299ER9-R3U1 +Form Factor: High-performance desktop-class gaming & workstation laptop +Release Generation: RTX 50-series era (2026) + +⸻ + +1. System Overview + +The Razer Blade 18 is positioned as a flagship desktop-replacement laptop, integrating Intel Core Ultra HX processors, NVIDIA RTX 50-series GPUs, ultra-high refresh displays, and workstation-level memory/storage configurations.  
+ +Primary Target Use Cases +• AAA gaming at maximum settings (4K, ray tracing) +• AI / ML model development (local inference, CUDA workloads) +• Software development & compilation +• 3D rendering, Unreal Engine, Blender +• Video editing (8K workflows) +• Desktop replacement workstation + +⸻ + +2. CPU (Processor) + +Processor: Intel® Core™ Ultra 9 275HX  + +Architecture + +Attribute Specification +CPU family Intel Core Ultra HX Series +Architecture Intel Arrow Lake-HX (Core Ultra 200HX series) +Core design Hybrid architecture +Core types Performance cores + Efficient cores +Target TDP ~55W base (HX class), scalable to ~160W turbo +Fabrication TSMC N3B compute tile (multi-tile Foveros package) +Integrated AI accelerator Intel NPU (Neural Processing Unit) + +Estimated core configuration (typical for Ultra 9 HX class) + +Core type Count +Performance cores 8 +Efficient cores 16 +Total cores 24 +Threads 24 + +AI acceleration + +Integrated: +• Intel NPU +• AVX2 support (AVX-512 is not available on Arrow Lake) +• AVX-VNNI instructions +• Hardware AI acceleration support + +Use cases: +• Local AI inference +• Background Copilot AI tasks +• AI-assisted workflows + +⸻ + +3. 
GPU (Graphics) + +Discrete GPU: NVIDIA GeForce RTX 5090 Laptop GPU  +VRAM: 24 GB GDDR7 VRAM  + +⸻ + +GPU Architecture + +Attribute Specification +Architecture NVIDIA Blackwell (RTX 50-series) +Memory type GDDR7 +VRAM size 24 GB +CUDA cores Estimated ~18,000–20,000 +Ray tracing cores 4th or 5th Gen RT cores +Tensor cores 5th or 6th Gen +PCIe interface PCIe Gen 5 +DirectX support DirectX 12 Ultimate +Vulkan support Yes +OpenCL support Yes +CUDA support Yes + +⸻ + +GPU Compute Capability + +Feature Support +CUDA compute Yes +Tensor acceleration Yes +DLSS DLSS 4 +Ray tracing Hardware accelerated +AI inference Excellent +Stable diffusion Excellent +Local LLM inference Excellent + +⸻ + +AI / ML Capability Estimate + +Model Expected Performance +Llama 3 8B Real-time +Llama 3 70B quantized Usable +Stable Diffusion XL Very fast +Whisper large Very fast +TensorRT inference Excellent + +⸻ + +4. RAM (Memory) + +Installed memory: 64 GB RAM  +Memory speed: 5600 MHz  + +⸻ + +Memory Details + +Attribute Specification +Capacity 64 GB +Type DDR5 +Speed 5600 MHz +Channels Dual channel +ECC No +Upgradeability Yes (depends on configuration) + +⸻ + +Memory bandwidth estimate + +~90–120 GB/sec + +⸻ + +5. Storage + +Installed storage: 4 TB SSD (2 TB + 2 TB)  + +⸻ + +Storage configuration + +Attribute Specification +Total capacity 4 TB +Drive type NVMe SSD +Interface PCIe Gen 4 or Gen 5 +Configuration Dual SSD +RAID support Possible +Upgradeable Yes + +⸻ + +Storage performance estimate + +Metric Expected +Sequential read 7,000–14,000 MB/sec +Sequential write 6,000–12,000 MB/sec +Random IOPS >1 million + +⸻ + +6. 
Display + +Display size: 18 inches  +Display modes: Dual mode UHD+ 240 Hz / FHD+ 440 Hz  + +⸻ + +Display detailed specifications + +Attribute Specification +Size 18 inches +Mode 1 resolution UHD+ (3840×2400) +Mode 2 resolution FHD+ (1920×1200) +Refresh rate (UHD+) 240 Hz +Refresh rate (FHD+) 440 Hz +Aspect ratio 16:10 +Panel type IPS or Mini-LED +Adaptive sync Yes +Response time <3 ms (estimated) +HDR support Likely HDR 600–1000 +Color gamut 100% DCI-P3 + +⸻ + +Dual-mode display explanation + +Switchable between: + +Mode Use case +UHD+ 240 Hz Visual quality, editing +FHD+ 440 Hz Competitive gaming + +⸻ + +7. Operating System + +OS: Windows 11 Home  + +Supports: +• DirectX 12 Ultimate +• WSL2 +• CUDA +• AI frameworks + +⸻ + +8. Cooling System + +Advanced vapor chamber cooling system. + +Expected features: +• Vapor chamber cooling +• Dual fan cooling +• Liquid metal thermal interface +• Advanced heat pipe network + +Supports sustained: +• CPU ~120W+ +• GPU ~175W+ + +⸻ + +9. Connectivity & Ports (Expected for Blade 18) + +Typical Blade 18 includes: + +USB +• 3× USB-A 3.2 Gen 2 +• 2× USB-C (Thunderbolt 4 / USB4) + +Video +• HDMI 2.1 +• Thunderbolt video output + +Network +• 2.5 Gb Ethernet + +Audio +• 3.5 mm combo jack + +Storage expansion +• Dual NVMe slots + +⸻ + +10. Wireless Connectivity + +Expected: + +Technology Support +Wi-Fi Wi-Fi 7 +Bluetooth Bluetooth 5.4 + +⸻ + +11. Power System + +Estimated: + +Attribute Specification +Power adapter 330W–400W +Battery ~90–99 Wh +Charging Fast charging supported + +⸻ + +12. Keyboard + • Per-key RGB lighting + • Razer Chroma support + • Anti-ghosting + • Full keyboard with numpad + +⸻ + +13. Build and Physical + +Expected typical Blade 18 chassis: + +Attribute Specification +Material CNC aluminum +Color Black +Thickness ~21–24 mm +Weight ~3.1 kg + +⸻ + +14. 
AI / ML Workstation Capability Assessment + +This laptop is excellent for: + +Supported workloads + +Workload Capability +CUDA development Excellent +Stable diffusion Excellent +LLM inference Excellent +PyTorch training Very good +TensorFlow Excellent +Unreal Engine Excellent +Blender Excellent + +⸻ + +15. Gaming Performance Tier + +Expected performance: + +Resolution Performance +1080p Extreme +1440p Extreme +4K Excellent + +Ray tracing: +• Excellent + +DLSS 4: +• Supported + +⸻ + +16. Storage Expansion Capability + +Supports: +• Dual NVMe SSD +• RAID configuration +• Upgradable storage + +⸻ + +17. Price + +Configured model price: + +$5,199.99 USD  + +⸻ + +18. Performance Tier Classification + +Category Rating +Gaming Extreme +AI / ML Extreme +Software development Extreme +Video editing Extreme +Desktop replacement Extreme + +⸻ + +19. Ideal Use Cases for You Specifically (Saravana) + +Based on your AI engineering work: + +Perfect for: +• Running multiple local LLMs +• CUDA inference pipelines +• Running LangGraph agents locally +• Running Stable Diffusion +• Running Whisper locally +• Running multiple Docker containers +• Unreal Engine / 3D + +⸻ + +20. 
Summary + +This configuration is: + +One of the most powerful laptops available in the world currently + +Equivalent to: +• High-end workstation +• High-end desktop GPU +• Enterprise-grade development machine + +⸻ + +If you want, I can also create: + +• Full performance comparison vs desktop RTX 4090 +• Or benchmark estimates for LLM / Stable Diffusion / coding workloads +• Or recommended optimal configuration for your home lab and multi-model setup diff --git a/__LOCAL_LLMs/windows_specific/setup-guide.md b/__LOCAL_LLMs/windows_specific/setup-guide.md new file mode 100644 index 00000000..aaa03932 --- /dev/null +++ b/__LOCAL_LLMs/windows_specific/setup-guide.md @@ -0,0 +1,372 @@ +# Windows Setup Guide — Local LLM Stack on Razer Blade 18 + +> **Hardware:** Razer Blade 18 · Intel Core Ultra 9 275HX · RTX 5090 24 GB GDDR7 · 64 GB DDR5 · 4 TB NVMe +> **OS:** Windows 11 Home +> **Goal:** Mirror the macOS `__LOCAL_LLMs` stack — Ollama, Whisper, TTS (Orpheus + Qwen3), Mission Control dashboard +> **See also:** [razer-blade-18-spec.md](razer-blade-18-spec.md) for full hardware specs + +--- + +## Prerequisites + +### 1. Windows Package Manager + +Install **winget** (ships with Windows 11) and optionally **Scoop** for CLI tools: + +```powershell +# Verify winget +winget --version + +# Install Scoop (optional, useful for dev tools) +Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser +Invoke-RestMethod -Uri https://get.scoop.sh | Invoke-Expression +``` + +### 2. NVIDIA CUDA Toolkit + +The RTX 5090 needs the latest CUDA drivers for GPU-accelerated inference. + +```powershell +# Install NVIDIA drivers (latest Game Ready or Studio) +winget install --id Nvidia.GeForceExperience + +# Install CUDA Toolkit (required for PyTorch CUDA) +winget install --id Nvidia.CUDA +# Or download from: https://developer.nvidia.com/cuda-downloads + +# Verify +nvidia-smi +``` + +Expected output should show: + +- **RTX 5090** with **24 GB** VRAM +- CUDA version 13.x+ + +### 3. 
Node.js (for Mission Control Dashboard) + +```powershell +winget install --id OpenJS.NodeJS.LTS +# Verify +node --version # should be 20.x+ +npm --version +``` + +### 4. Python 3.12 + +```powershell +winget install --id Python.Python.3.12 +# Verify +python --version +pip --version +``` + +### 5. Git + +```powershell +winget install --id Git.Git +``` + +### 6. ffmpeg + +```powershell +winget install --id Gyan.FFmpeg +# Or: scoop install ffmpeg +``` + +--- + +## 1. Ollama — LLM Server + +### Install + +```powershell +winget install --id Ollama.Ollama +``` + +Ollama for Windows runs as a background service and automatically uses CUDA (RTX 5090). + +### Verify + +```powershell +ollama --version +curl http://localhost:11434/api/tags +``` + +### Download Models + +```powershell +# Coding +ollama pull qwen2.5-coder:32b # 19 GB — primary coding model +ollama pull qwen2.5-coder:7b # 4.7 GB — fast coding + +# Reasoning +ollama pull deepseek-r1:32b # 19 GB — chain-of-thought + +# General +ollama pull llama3.1:8b # 4.9 GB — fast general tasks + +# TTS +ollama pull sematre/orpheus:en # 4 GB — text-to-speech (8 voices) + +# Verify +ollama list +``` + +> **Note:** With 24 GB VRAM, Ollama will offload 32B models almost entirely to GPU. +> On macOS (48 GB unified), the 32B models run in shared CPU/GPU memory. +> On this machine, **GPU inference will be significantly faster** for models that fit in 24 GB VRAM. + +### VRAM Budget (RTX 5090 — 24 GB) + +| Model | VRAM Usage | Fits in GPU? | +| ---------------------------- | ---------- | ------------ | +| llama3.1:8b | ~5 GB | ✅ Fully | +| qwen2.5-coder:7b | ~5 GB | ✅ Fully | +| sematre/orpheus:en | ~4 GB | ✅ Fully | +| qwen2.5-coder:32b | ~19 GB | ✅ Fully | +| deepseek-r1:32b | ~19 GB | ✅ Fully | +| Two 7B models simultaneously | ~10 GB | ✅ Both fit | + +--- + +## 2. 
Whisper.cpp — Speech-to-Text + +### Option A: Pre-built Binary (Recommended) + +Download the latest release from GitHub: + +```powershell +# Create whisper directory +mkdir "$env:USERPROFILE\whisper-cpp" +cd "$env:USERPROFILE\whisper-cpp" + +# Download latest release (CUDA build) +# Check: https://github.com/ggerganov/whisper.cpp/releases +# Look for: whisper-cublas-bin-x64.zip or whisper-cuda-bin-x64.zip +``` + +### Option B: Build from Source (CUDA) + +```powershell +git clone https://github.com/ggerganov/whisper.cpp.git +cd whisper.cpp +cmake -B build -DGGML_CUDA=ON +cmake --build build --config Release +``` + +### Download Whisper Model + +```powershell +mkdir "$env:USERPROFILE\whisper-models" + +# Download ggml-large-v3-turbo (1.5 GB) +curl -L -o "$env:USERPROFILE\whisper-models\ggml-large-v3-turbo.bin" ` + "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo.bin" +``` + +> **No corporate proxy on this machine** — download directly from `huggingface.co`. +> The `hf-mirror.com` workaround is only needed on the corporate MacBook. + +### Verify + +```powershell +# Test transcription +whisper-cli -m "$env:USERPROFILE\whisper-models\ggml-large-v3-turbo.bin" -f test.wav +``` + +--- + +## 3. TTS — Orpheus + Qwen3-TTS + +### 3a. Orpheus TTS (via Ollama) + +Already handled in Step 1 (`ollama pull sematre/orpheus:en`). + +### 3b. SNAC Decoder + +```powershell +# Create models directory (match macOS layout) +$MODELS = "$PSScriptRoot\models" # or wherever you clone the repo +mkdir "$MODELS\snac_24khz" -Force + +# Download SNAC decoder +curl -L -o "$MODELS\snac_24khz\config.json" ` + "https://huggingface.co/hubertsiuzdak/snac_24khz/resolve/main/config.json" +curl -L -o "$MODELS\snac_24khz\pytorch_model.bin" ` + "https://huggingface.co/hubertsiuzdak/snac_24khz/resolve/main/pytorch_model.bin" +``` + +### 3c. 
Python Venv + Dependencies + +```powershell +cd __LOCAL_LLMs + +# Create venv +python -m venv .venv-qwen-tts + +# Activate (Windows uses Scripts, not bin) +.\.venv-qwen-tts\Scripts\Activate.ps1 + +# Install PyTorch with CUDA 12.8 wheels (RTX 50-series / Blackwell is not supported by cu124 builds; NOT MPS — that's Apple only) +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 + +# Install other deps (qwen-tts provides the `qwen_tts` module imported by test_qwen_tts.py) +pip install snac numpy soundfile qwen-tts + +# Verify CUDA +python -c "import torch; print(f'CUDA: {torch.cuda.is_available()}, Device: {torch.cuda.get_device_name(0)}')" +# Expected: CUDA: True, Device: NVIDIA GeForce RTX 5090 Laptop GPU +``` + +### 3d. Qwen3-TTS 0.6B + +```powershell +$MODELS = ".\models" + +# Tokenizer (~650 MB) +mkdir "$MODELS\Qwen3-TTS-Tokenizer-12Hz" -Force +foreach ($f in @("config.json", "configuration.json", "preprocessor_config.json")) { + curl -L -o "$MODELS\Qwen3-TTS-Tokenizer-12Hz\$f" ` + "https://huggingface.co/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/$f" +} +curl -L -o "$MODELS\Qwen3-TTS-Tokenizer-12Hz\model.safetensors" ` + "https://huggingface.co/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/model.safetensors" + +# Model weights (~1.8 GB) +mkdir "$MODELS\Qwen3-TTS-12Hz-0.6B-CustomVoice" -Force +foreach ($f in @("config.json", "generation_config.json")) { + curl -L -o "$MODELS\Qwen3-TTS-12Hz-0.6B-CustomVoice\$f" ` + "https://huggingface.co/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/$f" +} +curl -L -o "$MODELS\Qwen3-TTS-12Hz-0.6B-CustomVoice\model.safetensors" ` + "https://huggingface.co/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/model.safetensors" +``` + +### 3e. Test TTS + +```powershell +# Activate venv +.\.venv-qwen-tts\Scripts\Activate.ps1 + +# Orpheus TTS test +python test_orpheus_tts.py + +# Qwen3-TTS test +python test_qwen_tts.py +``` + +> **Key difference from macOS:** Qwen3-TTS will use **CUDA** instead of MPS. 
+> As written, however, the device selection in `test_qwen_tts.py` only checks for MPS, so on Windows it falls back to **CPU** (not CUDA) +> since `torch.backends.mps.is_available()` returns False on Windows. +> Update the device logic to prefer CUDA before running the test: +> +> ```python +> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +> ``` + +--- + +## 4. Mission Control Dashboard + +```powershell +cd __LOCAL_LLMs\dashboard + +# Install dependencies +npm install + +# Start dev server +npm run dev +# Open http://localhost:3000 +``` + +The dashboard is pure Next.js — works identically on Windows. The API routes auto-detect: + +- **Ollama** at `localhost:11434` +- **Whisper** models in `%USERPROFILE%\whisper-models\` +- **TTS** engines (Orpheus, Qwen3-TTS) and Python venv + +### Start Script (PowerShell) + +Use the bash script equivalent: + +```powershell +# Quick start (manual) +ollama serve # if not already running as service +cd __LOCAL_LLMs\dashboard +npm run dev +``` + +> TODO: Create `start-dashboard.ps1` as a PowerShell equivalent of `start-dashboard.sh` + +--- + +## 5. 
Key Differences: macOS vs Windows + +| Area | macOS (M4 Pro 48 GB) | Windows (Razer Blade 18) | +| ------------------- | ----------------------------------- | ------------------------------------- | +| **GPU** | Apple Silicon (unified memory, MPS) | RTX 5090 (24 GB VRAM, CUDA) | +| **Ollama GPU** | Automatic (Metal) | Automatic (CUDA) | +| **VRAM** | Shared from 48 GB RAM | Dedicated 24 GB GDDR7 | +| **PyTorch device** | `mps` | `cuda` | +| **Whisper install** | `brew install whisper-cpp` | Build from source or download release | +| **Python venv** | `bin/activate` | `Scripts\Activate.ps1` | +| **Package manager** | Homebrew | winget / scoop | +| **Shell** | zsh / bash | PowerShell / cmd | +| **Scripts** | `.sh` (bash) | `.ps1` (PowerShell) | +| **Model download** | `hf-mirror.com` (corporate proxy) | `huggingface.co` (no proxy) | +| **Dashboard** | Identical | Identical | +| **Ollama models** | Identical | Identical | + +### Performance Expectations + +| Workload | macOS M4 Pro 48 GB | Razer RTX 5090 24 GB | +| --------------------------- | ---------------------------- | ------------------------- | +| qwen2.5-coder:32b inference | ~15–25 tok/s (MPS/CPU blend) | ~40–60 tok/s (full CUDA) | +| Whisper large-v3-turbo | ~2–4x realtime (CPU) | ~8–15x realtime (CUDA) | +| Orpheus TTS | ~realtime (CPU decode) | ~2–3x realtime (CUDA) | +| Qwen3-TTS | ~realtime (MPS) | ~2–4x realtime (CUDA) | +| 70B quantized models | Fits in 48 GB (slow) | Partially offloads to RAM | + +--- + +## 6. 
File Layout (Same as macOS) + +``` +__LOCAL_LLMs/ +├── dashboard/ ← Mission Control (port 3000) — works as-is +├── models/ ← TTS model weights (gitignored) +│ ├── snac_24khz/ +│ ├── Qwen3-TTS-Tokenizer-12Hz/ +│ └── Qwen3-TTS-12Hz-0.6B-CustomVoice/ +├── .venv-qwen-tts/ ← Python venv (Scripts\ on Windows) +├── test_orpheus_tts.py ← works as-is (device fallback) +├── test_qwen_tts.py ← update device to prefer CUDA +├── windows_specific/ +│ ├── razer-blade-18-spec.md ← hardware spec +│ └── setup-guide.md ← this file +└── docs/ ← macOS-focused docs (still useful as reference) +``` + +--- + +## 7. Quick Reference — Full Setup Checklist + +``` +[ ] Install NVIDIA drivers + CUDA Toolkit +[ ] Install Ollama (winget install Ollama.Ollama) +[ ] Pull models: qwen2.5-coder:32b, deepseek-r1:32b, llama3.1:8b, orpheus +[ ] Install Node.js 20+ (winget) +[ ] Install Python 3.12 (winget) +[ ] Install Git (winget) +[ ] Install ffmpeg (winget) +[ ] Clone repo +[ ] Download Whisper model to %USERPROFILE%\whisper-models\ +[ ] Build or download whisper-cpp with CUDA +[ ] Create Python venv + install PyTorch CUDA + snac +[ ] Download SNAC decoder +[ ] Download Qwen3-TTS tokenizer + model +[ ] npm install in dashboard/ +[ ] Run dashboard: npm run dev +[ ] Verify: http://localhost:3000 shows all green +```