ci: update CI/CD configuration

This commit is contained in:
saravanakumardb1 2026-02-21 14:13:07 -08:00
parent 14c7883d2a
commit f85b455eb5
20 changed files with 2827 additions and 389 deletions

5
.gitignore vendored
View File

@ -14,3 +14,8 @@ coverage/
*.key
kv.txt
kv_azure.txt
# Local LLM models & venvs
__LOCAL_LLMs/models/
__LOCAL_LLMs/.venv-*/
__LOCAL_LLMs/*.wav

View File

@ -0,0 +1,267 @@
'use client';
import { useState, useEffect } from 'react';
import { RefreshCw, Cpu, HardDrive, Archive, Layers, Zap } from 'lucide-react';
import { formatBytes } from '../../../lib/format';
import { ProgressBar } from '../../../components/ProgressBar';
interface VmCategory {
active: number;
wired: number;
compressor: number;
inactive: number;
purgeable: number;
speculative: number;
free: number;
}
interface GroupedProcess {
name: string;
rss: number;
pctMem: number;
count: number;
pids: number[];
}
interface MemoryDrilldownData {
totalRam: number;
categories: VmCategory;
processes: GroupedProcess[];
}
const CATEGORY_META: Record<
keyof VmCategory,
{ label: string; color: string; description: string }
> = {
active: {
label: 'Active',
color: 'var(--accent-primary)',
description: 'Pages recently used by apps',
},
wired: {
label: 'Wired',
color: 'var(--danger)',
description: 'Kernel & drivers — cannot be paged out',
},
compressor: {
label: 'Compressed',
color: 'var(--warning)',
description: 'Pages compressed to save RAM (still counts as used)',
},
inactive: {
label: 'Inactive',
color: 'var(--accent-secondary)',
description: 'Recently freed — reclaimable on demand',
},
purgeable: {
label: 'Purgeable',
color: 'var(--purple)',
description: 'Cache that macOS can discard immediately',
},
speculative: {
label: 'Speculative',
color: 'var(--text-tertiary)',
description: 'Pre-fetched pages — reclaimable',
},
free: {
label: 'Free',
color: 'var(--success)',
description: 'Unused pages — immediately available',
},
};
export function MemoryDrilldown() {
const [data, setData] = useState<MemoryDrilldownData | null>(null);
const [loading, setLoading] = useState(true);
const fetchData = async () => {
setLoading(true);
try {
const res = await fetch('/api/system/memory');
if (res.ok) setData(await res.json());
} catch {
// ignore
}
setLoading(false);
};
useEffect(() => {
fetchData();
}, []);
if (loading && !data) {
return (
<div className="flex items-center justify-center py-6">
<RefreshCw className="w-4 h-4 animate-spin" style={{ color: 'var(--text-tertiary)' }} />
</div>
);
}
if (!data) return null;
const total = data.totalRam;
const cats = data.categories;
const appMemory = cats.active + cats.wired + cats.compressor;
return (
<div className="space-y-4">
{/* Category breakdown header */}
<div className="flex items-center justify-between">
<span className="text-xs font-semibold" style={{ color: 'var(--text-secondary)' }}>
Memory Categories (vm_stat)
</span>
<button
onClick={fetchData}
disabled={loading}
className="p-1 rounded transition-colors"
style={{ color: 'var(--text-tertiary)' }}
title="Refresh"
>
<RefreshCw className={`w-3.5 h-3.5 ${loading ? 'animate-spin' : ''}`} />
</button>
</div>
{/* Stacked bar */}
<div
className="flex w-full h-6 rounded-md overflow-hidden"
style={{ background: 'var(--surface-muted)' }}
>
{(Object.keys(CATEGORY_META) as (keyof VmCategory)[]).map(key => {
const bytes = cats[key];
const pct = (bytes / total) * 100;
if (pct < 0.3) return null;
const meta = CATEGORY_META[key];
return (
<div
key={key}
className="h-full flex items-center justify-center text-[9px] font-medium overflow-hidden shrink-0"
style={{
width: `${pct}%`,
background: meta.color,
color: 'var(--bg-canvas)',
opacity: 0.85,
}}
title={`${meta.label}: ${formatBytes(bytes)} (${pct.toFixed(1)}%)`}
>
{pct > 6 ? meta.label : ''}
</div>
);
})}
</div>
{/* Legend grid */}
<div className="grid grid-cols-2 gap-x-4 gap-y-1.5">
{(Object.keys(CATEGORY_META) as (keyof VmCategory)[]).map(key => {
const bytes = cats[key];
const pct = (bytes / total) * 100;
const meta = CATEGORY_META[key];
const isApp = key === 'active' || key === 'wired' || key === 'compressor';
return (
<div key={key} className="flex items-center justify-between" title={meta.description}>
<div className="flex items-center gap-1.5">
<span
className="w-2.5 h-2.5 rounded-sm inline-block shrink-0"
style={{ background: meta.color, opacity: 0.85 }}
/>
<span
className="text-[11px]"
style={{ color: isApp ? 'var(--text-secondary)' : 'var(--text-tertiary)' }}
>
{meta.label}
</span>
</div>
<span className="text-[11px] font-mono" style={{ color: 'var(--text-tertiary)' }}>
{formatBytes(bytes)}
<span className="ml-1 text-[9px]">({pct.toFixed(1)}%)</span>
</span>
</div>
);
})}
</div>
{/* Summary line */}
<div
className="flex items-center justify-between px-2 py-1.5 rounded-md text-[11px]"
style={{ background: 'var(--surface-muted)' }}
>
<span style={{ color: 'var(--text-secondary)' }}>
<strong>App memory</strong> (active + wired + compressed)
</span>
<span className="font-mono font-semibold" style={{ color: 'var(--text-primary)' }}>
{formatBytes(appMemory)}
</span>
</div>
{/* Top processes */}
<div>
<span className="text-xs font-semibold" style={{ color: 'var(--text-secondary)' }}>
Top Processes by Memory
</span>
</div>
<div className="space-y-1.5">
{data.processes.slice(0, 15).map((proc, i) => {
const pct = (proc.rss / total) * 100;
const isOllama = proc.name.toLowerCase().includes('ollama');
const isNode =
proc.name.toLowerCase().includes('node') || proc.name.toLowerCase().includes('next');
return (
<div key={`${proc.name}-${i}`}>
<div className="flex items-center justify-between mb-0.5">
<div className="flex items-center gap-1.5 min-w-0">
{isOllama ? (
<Zap className="w-3 h-3 shrink-0" style={{ color: 'var(--success)' }} />
) : isNode ? (
<Layers
className="w-3 h-3 shrink-0"
style={{ color: 'var(--accent-secondary)' }}
/>
) : (
<Cpu className="w-3 h-3 shrink-0" style={{ color: 'var(--text-tertiary)' }} />
)}
<span
className="text-[11px] font-mono truncate"
style={{
color: isOllama
? 'var(--success)'
: isNode
? 'var(--accent-secondary)'
: 'var(--text-secondary)',
}}
>
{proc.name}
{proc.count > 1 && (
<span style={{ color: 'var(--text-tertiary)' }}> ×{proc.count}</span>
)}
</span>
</div>
<span
className="text-[11px] font-mono shrink-0 ml-2"
style={{ color: 'var(--text-tertiary)' }}
>
{formatBytes(proc.rss)}
<span className="ml-1 text-[9px]">({pct.toFixed(1)}%)</span>
</span>
</div>
<div
className="h-1.5 rounded-full overflow-hidden"
style={{ background: 'var(--surface-muted)' }}
>
<div
className="h-full rounded-full"
style={{
width: `${Math.max(0.5, pct)}%`,
background: isOllama
? 'var(--success)'
: isNode
? 'var(--accent-secondary)'
: 'var(--accent-primary)',
opacity: 0.7,
}}
/>
</div>
</div>
);
})}
</div>
</div>
);
}

View File

@ -36,6 +36,7 @@ import {
Star,
MessageSquare,
Settings,
Volume2,
} from 'lucide-react';
import type {
OllamaData,
@ -57,6 +58,7 @@ import { ProgressBar } from '../../components/ProgressBar';
import { Sparkline } from '../../components/Sparkline';
import { RamBudgetBar } from './components/RamBudgetBar';
import { MarkdownResponse } from './components/MarkdownResponse';
import { MemoryDrilldown } from './components/MemoryDrilldown';
export default function Dashboard() {
const [ollama, setOllama] = useState<OllamaData | null>(null);
@ -129,6 +131,19 @@ export default function Dashboard() {
>([]);
const [showInferenceLog, setShowInferenceLog] = useState(false);
const [inferenceSearch, setInferenceSearch] = useState('');
const [showMemoryDrilldown, setShowMemoryDrilldown] = useState(false);
const [ttsData, setTtsData] = useState<{
engines: Array<{
name: string;
type: 'ollama' | 'python';
status: 'ready' | 'partial' | 'missing';
model: string;
size?: string;
voices?: string[];
details: string;
}>;
venv: { exists: boolean; packages?: string[] };
} | null>(null);
const responseRef = useRef<HTMLDivElement>(null);
const abortRef = useRef<AbortController | null>(null);
const compareAbortRef = useRef<AbortController | null>(null);
@ -158,6 +173,13 @@ export default function Dashboard() {
setMemoryHistory(prev => [...prev.slice(-29), sRes.value.memory.appMemory]);
}
}
// TTS engine status
try {
const tRes = await fetch('/api/tts');
if (tRes.ok) setTtsData(await tRes.json());
} catch {
/* ignore */
}
// F15: Check extraction service health via server-side proxy (avoids browser CORS/console errors)
try {
const eRes = await fetch('/api/extraction/health');
@ -1143,21 +1165,33 @@ export default function Dashboard() {
</p>
</div>
<div className="card p-4">
<div
className="card p-4 cursor-pointer transition-all"
onClick={() => setShowMemoryDrilldown(prev => !prev)}
style={{
outline: showMemoryDrilldown ? '2px solid var(--warning)' : 'none',
outlineOffset: '-1px',
}}
title="Click to see memory drilldown"
>
<div className="flex items-center gap-2 mb-2">
<MemoryStick className="w-4 h-4" style={{ color: 'var(--warning)' }} />
<span className="text-xs font-medium" style={{ color: 'var(--text-tertiary)' }}>
MEMORY
</span>
<span className="text-[9px] ml-auto" style={{ color: 'var(--text-tertiary)' }}>
{showMemoryDrilldown ? '▲ hide' : '▼ drilldown'}
</span>
</div>
<span className="text-lg font-bold">
{formatBytes(system?.memory.appMemory || 0)}
</span>
<span className="text-sm ml-1" style={{ color: 'var(--text-tertiary)' }}>
/ {formatBytes(system?.memory.total || 0)}
used / {formatBytes(system?.memory.total || 0)}
</span>
<p className="text-[10px] mt-0.5" style={{ color: 'var(--text-tertiary)' }}>
{formatBytes(system?.memory.cached || 0)} cached (reclaimable)
<p className="text-[10px] mt-0.5 font-medium" style={{ color: 'var(--success)' }}>
{formatBytes((system?.memory.free || 0) + (system?.memory.cached || 0) * 0.9)}{' '}
available for models
</p>
<div className="mt-2">
<ProgressBar
@ -1189,6 +1223,17 @@ export default function Dashboard() {
)}
</div>
{/* Memory Drilldown Panel */}
{showMemoryDrilldown && (
<div className="card p-6">
<h2 className="text-lg font-semibold flex items-center gap-2 mb-4">
<MemoryStick className="w-5 h-5" style={{ color: 'var(--warning)' }} />
Memory Drilldown
</h2>
<MemoryDrilldown />
</div>
)}
{/* Main Grid */}
<div className="grid grid-cols-1 lg:grid-cols-3 gap-6">
{/* Ollama Models — 2 cols */}
@ -1351,7 +1396,7 @@ export default function Dashboard() {
totalRam={system.memory.total}
appMemory={system.memory.appMemory}
runningModels={ollama.running}
freeRam={system.memory.free}
freeRam={system.memory.free + system.memory.cached}
/>
)}
{ollama.models
@ -1456,20 +1501,36 @@ export default function Dashboard() {
</span>
)}
</div>
{/* Metrics row */}
<div
className="flex items-center gap-3 text-xs mt-0.5 flex-wrap"
className="flex items-center gap-2 text-xs mt-1 flex-wrap"
style={{ color: 'var(--text-tertiary)' }}
>
<span>{formatBytes(model.size)}</span>
<span className="inline-flex items-center gap-1" title="Disk size">
<HardDrive className="w-3 h-3" />
{formatBytes(model.size)}
</span>
{model.details?.parameter_size && (
<span>{model.details.parameter_size}</span>
<span
className="inline-flex items-center gap-1"
title="Parameter count"
>
<Cpu className="w-3 h-3" />
{model.details.parameter_size}
</span>
)}
{model.details?.quantization_level && (
<span>{model.details.quantization_level}</span>
<span
className="px-1.5 py-0.5 rounded font-mono text-[10px]"
style={{
background: 'var(--surface-card)',
color: 'var(--text-tertiary)',
}}
title="Quantization level — lower bits = smaller & faster but less accurate"
>
{model.details.quantization_level}
</span>
)}
<span title="Estimated RAM when loaded (Apple Silicon unified memory)">
~{formatBytes(estRam)} RAM
</span>
{(() => {
const ctx = modelMetadata[model.name]?.contextLength;
return ctx ? (
@ -1486,7 +1547,86 @@ export default function Dashboard() {
~{modelBenchmarks[model.name].tokPerSec.toFixed(1)} tok/s
</span>
)}
{(() => {
const ps = parseFloat(model.details?.parameter_size || '0');
const tier =
ps <= 3
? { label: 'Tiny · Instant', color: 'var(--success)' }
: ps <= 8
? { label: 'Small · Fast', color: 'var(--accent-secondary)' }
: ps <= 14
? { label: 'Medium', color: 'var(--accent-primary)' }
: ps <= 34
? { label: 'Large · Slow', color: 'var(--warning)' }
: { label: 'XL · Very Slow', color: 'var(--danger)' };
return (
<span
className="text-[10px] px-1.5 py-0.5 rounded font-medium"
style={{
background: `color-mix(in srgb, ${tier.color} 12%, transparent)`,
color: tier.color,
}}
title="Speed tier based on parameter count"
>
{tier.label}
</span>
);
})()}
</div>
{/* Memory fit — only for non-running models */}
{!running &&
system &&
(() => {
const avail = system.memory.free + system.memory.cached * 0.9;
const gap = avail - estRam;
const fitColor =
fitStatus === 'fits'
? 'var(--success)'
: fitStatus === 'tight'
? 'var(--warning)'
: 'var(--danger)';
return (
<div
className="mt-2 p-2 rounded-md"
style={{ background: 'var(--surface-card)' }}
>
<div className="flex items-center justify-between mb-1">
<span
className="text-[11px]"
style={{ color: 'var(--text-tertiary)' }}
>
Needs ~{formatBytes(estRam)} · {formatBytes(avail)}{' '}
available
</span>
<span
className="text-[10px] px-1.5 py-0.5 rounded-full font-medium"
style={{
background: `color-mix(in srgb, ${fitColor} 15%, transparent)`,
color: fitColor,
}}
>
{fitStatus === 'fits'
? `${formatBytes(gap)} to spare`
: fitStatus === 'tight'
? `⚠ Tight — ${formatBytes(gap)} to spare`
: `${formatBytes(Math.abs(gap))} short`}
</span>
</div>
<div
className="h-1.5 rounded-full overflow-hidden"
style={{ background: 'var(--surface-muted)' }}
>
<div
className="h-full rounded-full transition-all"
style={{
width: `${Math.min(100, Math.round((estRam / avail) * 100))}%`,
background: fitColor,
}}
/>
</div>
</div>
);
})()}
{running &&
(() => {
const rm = ollama?.running.find(r => r.name === model.name);
@ -1547,26 +1687,6 @@ export default function Dashboard() {
</>
) : (
<div className="flex items-center gap-2">
{fitStatus && !running && (
<span
className="w-2 h-2 rounded-full shrink-0"
title={
fitStatus === 'fits'
? 'Fits comfortably in available memory'
: fitStatus === 'tight'
? 'Tight — may cause swap pressure'
: "Won't fit — will swap heavily"
}
style={{
background:
fitStatus === 'fits'
? 'var(--success)'
: fitStatus === 'tight'
? 'var(--warning)'
: 'var(--danger)',
}}
/>
)}
<button
onClick={() => handleModelAction('load', model.name)}
disabled={actionLoading === `load-${model.name}`}
@ -1757,7 +1877,7 @@ export default function Dashboard() {
(() => {
const usedVram = ollama.running.reduce((sum, r) => sum + r.size_vram, 0);
const freeForModels =
system.memory.free + system.memory.cached * 0.5 - usedVram * 0.1;
system.memory.free + system.memory.cached * 0.9 - usedVram * 0.1;
const suggestions = ollama.models
.filter(m => !isRunning(m.name))
.map(m => ({
@ -1831,8 +1951,9 @@ export default function Dashboard() {
RAM
</span>
</div>
<span className="text-xs font-mono" style={{ color: 'var(--text-tertiary)' }}>
{formatBytes(system?.memory.free || 0)} avail
<span className="text-xs font-mono" style={{ color: 'var(--success)' }}>
{formatBytes((system?.memory.free || 0) + (system?.memory.cached || 0) * 0.9)}{' '}
avail
</span>
</div>
<ProgressBar
@ -1850,8 +1971,8 @@ export default function Dashboard() {
className="flex justify-between mt-1 text-[10px]"
style={{ color: 'var(--text-tertiary)' }}
>
<span>App: {formatBytes(system?.memory.appMemory || 0)}</span>
<span>Cache: {formatBytes(system?.memory.cached || 0)}</span>
<span>Used: {formatBytes(system?.memory.appMemory || 0)}</span>
<span>Total: {formatBytes(system?.memory.total || 0)}</span>
</div>
</div>
<div>
@ -2024,6 +2145,116 @@ export default function Dashboard() {
)}
</div>
{/* Speech — TTS Engines */}
<div className="card p-6">
<h2 className="text-lg font-semibold flex items-center gap-2 mb-4">
<Volume2 className="w-5 h-5" style={{ color: 'var(--accent-primary)' }} />
Speech (TTS)
</h2>
{ttsData ? (
<div className="space-y-3">
{ttsData.engines.map(engine => (
<div
key={engine.name}
className="p-3 rounded-lg"
style={{ background: 'var(--surface-muted)' }}
>
<div className="flex items-center justify-between mb-1">
<div className="flex items-center gap-2">
<StatusDot
status={
engine.status === 'ready'
? 'online'
: engine.status === 'partial'
? 'warning'
: 'offline'
}
/>
<span className="text-sm font-semibold">{engine.name}</span>
<span
className="text-[10px] px-1.5 py-0.5 rounded font-mono"
style={{
background:
engine.type === 'ollama' ? 'var(--accent-primary)' : 'var(--purple)',
color: '#fff',
opacity: 0.85,
}}
>
{engine.type === 'ollama' ? 'Ollama' : 'Python'}
</span>
</div>
{engine.size && (
<span
className="text-[11px] font-mono"
style={{ color: 'var(--text-tertiary)' }}
>
{engine.size}
</span>
)}
</div>
<p className="text-xs ml-5" style={{ color: 'var(--text-tertiary)' }}>
{engine.model}
</p>
<p
className="text-xs ml-5 mt-0.5"
style={{
color:
engine.status === 'ready'
? 'var(--success)'
: engine.status === 'partial'
? 'var(--warning)'
: 'var(--text-tertiary)',
}}
>
{engine.details}
</p>
{engine.voices && engine.status === 'ready' && (
<div className="flex flex-wrap gap-1 mt-2 ml-5">
{engine.voices.map(v => (
<span
key={v}
className="text-[10px] px-1.5 py-0.5 rounded font-mono"
style={{
background: 'var(--bg-elevated)',
color: 'var(--text-secondary)',
}}
>
{v}
</span>
))}
</div>
)}
</div>
))}
{/* Venv status */}
<div
className="flex items-center justify-between text-xs pt-2"
style={{ borderTop: '1px solid var(--border-subtle)' }}
>
<span style={{ color: 'var(--text-tertiary)' }}>Python venv</span>
<span
style={{ color: ttsData.venv.exists ? 'var(--success)' : 'var(--warning)' }}
>
{ttsData.venv.exists ? (
<> {ttsData.venv.packages?.join(' · ') || 'installed'}</>
) : (
'Not found — run setup-tts.sh'
)}
</span>
</div>
</div>
) : (
<div
className="p-3 rounded-lg text-center"
style={{ background: 'var(--surface-muted)' }}
>
<p className="text-xs" style={{ color: 'var(--text-tertiary)' }}>
Loading TTS status...
</p>
</div>
)}
</div>
{/* Extraction Service (F15) */}
<div className="card p-6">
<h2 className="text-lg font-semibold flex items-center gap-2 mb-4">

View File

@ -0,0 +1,136 @@
import { NextResponse } from 'next/server';
import { exec } from 'child_process';
import { promisify } from 'util';
import os from 'os';
const execAsync = promisify(exec);
interface ProcessInfo {
pid: number;
name: string;
rss: number; // bytes
pctMem: number;
user: string;
}
interface VmStatBreakdown {
active: number;
wired: number;
compressor: number;
inactive: number;
purgeable: number;
speculative: number;
free: number;
pageSize: number;
}
async function getTopProcesses(limit = 20): Promise<ProcessInfo[]> {
try {
// ps with RSS in KB, sorted descending by RSS
const { stdout } = await execAsync(
`ps -axo pid=,rss=,%mem=,user=,comm= | sort -k2 -rn | head -${limit}`,
{ timeout: 3000 }
);
return stdout
.trim()
.split('\n')
.filter(Boolean)
.map(line => {
const parts = line.trim().split(/\s+/);
const pid = parseInt(parts[0]);
const rssKb = parseInt(parts[1]);
const pctMem = parseFloat(parts[2]);
const user = parts[3];
// comm can have spaces/slashes — take everything after user
const rawName = parts.slice(4).join(' ');
// Extract just the process name from the full path
const name = rawName.split('/').pop() || rawName;
return {
pid,
name,
rss: rssKb * 1024,
pctMem,
user,
};
})
.filter(p => p.rss > 0);
} catch {
return [];
}
}
async function getVmStatBreakdown(): Promise<VmStatBreakdown> {
try {
const { stdout } = await execAsync('vm_stat', { timeout: 2000 });
const pageSizeMatch = stdout.match(/page size of (\d+) bytes/);
const pageSize = pageSizeMatch ? parseInt(pageSizeMatch[1]) : 16384;
const parse = (label: string): number => {
const match = stdout.match(new RegExp(`${label}:\\s+(\\d+)`));
return match ? parseInt(match[1]) * pageSize : 0;
};
return {
active: parse('Pages active'),
wired: parse('Pages wired down'),
compressor: parse('Pages occupied by compressor'),
inactive: parse('Pages inactive'),
purgeable: parse('Pages purgeable'),
speculative: parse('Pages speculative'),
free: parse('Pages free'),
pageSize,
};
} catch {
return {
active: 0,
wired: 0,
compressor: 0,
inactive: 0,
purgeable: 0,
speculative: 0,
free: 0,
pageSize: 16384,
};
}
}
export async function GET() {
const [processes, vmstat] = await Promise.all([getTopProcesses(25), getVmStatBreakdown()]);
// Group by process name and sum RSS (e.g. multiple Chrome helpers)
const grouped: Record<string, { rss: number; pctMem: number; count: number; pids: number[] }> =
{};
for (const p of processes) {
const key = p.name;
if (!grouped[key]) {
grouped[key] = { rss: 0, pctMem: 0, count: 0, pids: [] };
}
grouped[key].rss += p.rss;
grouped[key].pctMem += p.pctMem;
grouped[key].count += 1;
grouped[key].pids.push(p.pid);
}
const groupedProcesses = Object.entries(grouped)
.map(([name, info]) => ({
name,
rss: info.rss,
pctMem: Math.round(info.pctMem * 10) / 10,
count: info.count,
pids: info.pids,
}))
.sort((a, b) => b.rss - a.rss);
return NextResponse.json({
totalRam: os.totalmem(),
vmstat,
categories: {
active: vmstat.active,
wired: vmstat.wired,
compressor: vmstat.compressor,
inactive: vmstat.inactive,
purgeable: vmstat.purgeable,
speculative: vmstat.speculative,
free: vmstat.free,
},
processes: groupedProcesses,
});
}

View File

@ -133,12 +133,13 @@ async function getAccurateMemory(): Promise<{
const appMemory = active + wired + compressor;
const cached = inactive + purgeable + speculative;
const trueFree = free + cached; // macOS reclaims cached on demand
// Return raw free separately from cached — no overlap
// available for loading = free + cached (macOS reclaims cached on demand)
const ratio = appMemory / totalMem;
const pressure = ratio > 0.85 ? 'critical' : ratio > 0.7 ? 'warning' : 'normal';
return { total: totalMem, appMemory, cached, free: trueFree, pressure };
return { total: totalMem, appMemory, cached, free, pressure };
} catch {
// Fallback to Node.js (inaccurate on macOS but works everywhere)
const freeMem = os.freemem();

View File

@ -0,0 +1,175 @@
import { NextResponse } from 'next/server';
import { exec } from 'child_process';
import { promisify } from 'util';
import { access, stat, readdir } from 'fs/promises';
import { join, resolve } from 'path';
const execAsync = promisify(exec);
// process.cwd() = dashboard/, parent = __LOCAL_LLMs/
const LOCAL_LLMS_DIR = resolve(process.cwd(), '..');
interface TtsEngine {
name: string;
type: 'ollama' | 'python';
status: 'ready' | 'partial' | 'missing';
model: string;
size?: string;
voices?: string[];
details: string;
}
async function fileExists(path: string): Promise<boolean> {
try {
await access(path);
return true;
} catch {
return false;
}
}
async function getFileSize(path: string): Promise<number> {
try {
const s = await stat(path);
return s.size;
} catch {
return 0;
}
}
async function checkOrpheus(): Promise<TtsEngine> {
const engine: TtsEngine = {
name: 'Orpheus TTS',
type: 'ollama',
status: 'missing',
model: 'sematre/orpheus:en',
voices: ['tara', 'leah', 'jess', 'leo', 'dan', 'mia', 'zac', 'zoe'],
details: '',
};
// Check if Orpheus model is in Ollama
let hasModel = false;
try {
const res = await fetch('http://localhost:11434/api/tags', {
signal: AbortSignal.timeout(2000),
});
if (res.ok) {
const data = await res.json();
hasModel = data.models?.some((m: { name: string }) => m.name.includes('orpheus')) ?? false;
}
} catch {
// Ollama not running
}
// Check SNAC decoder
const snacPath = join(LOCAL_LLMS_DIR, 'models', 'snac_24khz', 'pytorch_model.bin');
const hasSnac = await fileExists(snacPath);
const snacSize = hasSnac ? await getFileSize(snacPath) : 0;
// Check Python venv
const venvPython = join(LOCAL_LLMS_DIR, '.venv-qwen-tts', 'bin', 'python');
const hasVenv = await fileExists(venvPython);
if (hasModel && hasSnac && hasVenv) {
engine.status = 'ready';
engine.size = `${(snacSize / 1e6).toFixed(0)} MB decoder`;
engine.details = 'Ollama model + SNAC decoder + Python venv';
} else if (hasModel) {
engine.status = 'partial';
const missing: string[] = [];
if (!hasSnac) missing.push('SNAC decoder');
if (!hasVenv) missing.push('Python venv');
engine.details = `Missing: ${missing.join(', ')}`;
} else {
engine.status = 'missing';
engine.details = 'Run: bash setup-tts.sh';
}
return engine;
}
async function checkQwenTts(): Promise<TtsEngine> {
const engine: TtsEngine = {
name: 'Qwen3-TTS',
type: 'python',
status: 'missing',
model: 'Qwen3-TTS-12Hz-0.6B-CustomVoice',
details: '',
};
const modelDir = join(LOCAL_LLMS_DIR, 'models', 'Qwen3-TTS-12Hz-0.6B-CustomVoice');
const tokenizerDir = join(LOCAL_LLMS_DIR, 'models', 'Qwen3-TTS-Tokenizer-12Hz');
let hasModel = false;
let modelSize = 0;
try {
const files = await readdir(modelDir);
const safetensors = files.find(f => f.endsWith('.safetensors'));
if (safetensors) {
hasModel = true;
modelSize = await getFileSize(join(modelDir, safetensors));
}
} catch {
// dir doesn't exist
}
const hasTokenizer = await fileExists(join(tokenizerDir, 'config.json'));
const venvPython = join(LOCAL_LLMS_DIR, '.venv-qwen-tts', 'bin', 'python');
const hasVenv = await fileExists(venvPython);
if (hasModel && hasTokenizer && hasVenv) {
engine.status = 'ready';
engine.size = `${(modelSize / 1e9).toFixed(1)} GB`;
engine.details = '0.6B params · 10 languages · MPS/CPU';
} else if (hasModel || hasTokenizer) {
engine.status = 'partial';
const missing: string[] = [];
if (!hasModel) missing.push('model weights');
if (!hasTokenizer) missing.push('tokenizer');
if (!hasVenv) missing.push('Python venv');
engine.details = `Missing: ${missing.join(', ')}`;
} else {
engine.status = 'missing';
engine.details = 'Run: bash setup-tts.sh';
}
return engine;
}
async function checkVenv(): Promise<{
exists: boolean;
python?: string;
packages?: string[];
}> {
const venvPython = join(LOCAL_LLMS_DIR, '.venv-qwen-tts', 'bin', 'python');
const exists = await fileExists(venvPython);
if (!exists) return { exists: false };
try {
const { stdout } = await execAsync(
`"${venvPython}" -c "import snac; import torch; print(f'snac={snac.__version__} torch={torch.__version__}')"`,
{ timeout: 5000 }
);
return {
exists: true,
python: venvPython,
packages: stdout.trim().split(' '),
};
} catch {
return { exists: true, python: venvPython };
}
}
export async function GET() {
const [orpheus, qwenTts, venv] = await Promise.all([checkOrpheus(), checkQwenTts(), checkVenv()]);
return NextResponse.json({
engines: [orpheus, qwenTts],
venv,
setupScript: 'bash setup-tts.sh',
testCommands: {
orpheus: '.venv-qwen-tts/bin/python test_orpheus_tts.py',
qwenTts: '.venv-qwen-tts/bin/python test_qwen_tts.py',
},
});
}

View File

@ -19,13 +19,15 @@ export function estimateRam(diskSize: number, quant?: string): number {
}
// N2: Check if model fits in available memory
// free = raw free pages, cached = inactive+purgeable+speculative (no overlap)
// macOS reclaims ~90% of cached on demand for large allocations (model mmaps)
export type FitStatus = 'fits' | 'tight' | 'no';
export function checkMemoryFit(
estimatedRam: number,
freeMemory: number,
cachedMemory: number
): FitStatus {
const available = freeMemory + cachedMemory * 0.5;
const available = freeMemory + cachedMemory * 0.9;
const ratio = estimatedRam / available;
if (ratio < 0.7) return 'fits';
if (ratio <= 1.0) return 'tight';

View File

@ -10,10 +10,13 @@ This machine runs a local LLM server via [Ollama](https://ollama.com), exposing
**Models installed:**
| Model | Size | Best For |
| ------------------- | ------- | ----------------------------------------- |
| `qwen2.5-coder:32b` | 18.5 GB | Code (TS, Python, Swift), structured JSON |
| `llama3.1:8b` | 4.7 GB | Fast evals, general tasks |
| Model | Size | Best For |
| -------------------- | ------ | -------------------------------------------- |
| `qwen2.5-coder:32b` | 19 GB | Code (TS, Python, Swift), structured JSON |
| `qwen2.5-coder:7b` | 4.7 GB | Fast code tasks, fits alongside other models |
| `deepseek-r1:32b` | 19 GB | Complex reasoning, chain-of-thought |
| `llama3.1:8b` | 4.9 GB | Fast evals, general tasks |
| `sematre/orpheus:en` | 4 GB | Text-to-speech (8 voices, emotion tags) |
---

View File

@ -1,17 +1,103 @@
# 05 — Mission Control Dashboard
> **Documentation has moved.** All dashboard docs now live in the dashboard directory.
- **PRD:** [`__LOCAL_LLMs/dashboard/docs/DASHBOARD_PRD.md`](../dashboard/docs/DASHBOARD_PRD.md)
- **Review (39 items):** [`__LOCAL_LLMs/dashboard/docs/DASHBOARD_REVIEW.md`](../dashboard/docs/DASHBOARD_REVIEW.md)
- **Roadmap (N1N15):** [`__LOCAL_LLMs/dashboard/docs/DASHBOARD_ROADMAP.md`](../dashboard/docs/DASHBOARD_ROADMAP.md)
> Next.js 16 dashboard for managing local LLM models, system resources, and inference.
> Last updated: 2026-02-21
## Quick Start
```bash
cd __LOCAL_LLMs/dashboard
npm install # first time only
npm run dev -- -p 3100
npm run dev # runs on port 3000
```
Open: **http://localhost:3100**
Open: **http://localhost:3000**
---
## Recent Changes (Feb 2026)
### Memory Calculation Fix
**Root cause:** The system API (`/api/system`) computed `trueFree = free + cached` and returned it as `free`. This made `free` and `cached` overlap. The UI then did `available = free + cached * 0.5`, which **double-counted** cached memory and inflated available RAM by ~8 GB.
**Fix (4 files):**
- `src/app/api/system/route.ts` — Return raw `Pages free` separately from `cached` (no overlap)
- `src/app/lib/format.ts` — Updated `checkMemoryFit()` to use `cached × 0.9` (macOS reclaims ~90% on demand)
- `src/app/(mission-control)/mission-control/page.tsx` — All UI memory references fixed
- `src/app/(mission-control)/mission-control/components/RamBudgetBar.tsx` — Receives corrected `free + cached`
**Memory formula:** `available for models = rawFree + cached × 0.9`
### Memory Drilldown
Click the **MEMORY** card in the status bar to toggle a drilldown panel showing:
1. **Stacked bar** — vm_stat categories (Active, Wired, Compressed, Inactive, Purgeable, Free)
2. **Legend grid** — exact bytes + percentage for each category
3. **App memory summary** — Active + Wired + Compressed = total used
4. **Top 15 processes by RSS** — grouped by name, Ollama highlighted in green
**New files:**
- `src/app/api/system/memory/route.ts` — Process memory API (`ps` + `vm_stat`)
- `src/app/(mission-control)/mission-control/components/MemoryDrilldown.tsx` — Drilldown UI
### Simplified Memory UI
All memory displays now use consistent, plain language:
| Element | Before (confusing) | After (clear) |
| -------------------- | ---------------------------------- | ------------------------------------------- |
| **MEMORY card** | "10.5 GB / 48 GB" (ambiguous) | **"35.6 GB used / 48 GB"** |
| **Subtitle** | "App: 35.6 GB · Cache: 11.6 GB" | **"10.5 GB available for models"** (green) |
| **Model fit** | "76 MB free + 10.5 GB reclaimable" | **"Needs ~22 GB · 10.5 GB available"** |
| **Fit badge** | "✗ Won't fit" | **"✗ 11.6 GB short"** (with exact gap) |
| **System panel RAM** | "76 MB avail" | **"10.5 GB avail"** (green, matches header) |
---
## Detailed Documentation
- **PRD:** [`dashboard/docs/DASHBOARD_PRD.md`](../dashboard/docs/DASHBOARD_PRD.md)
- **Review (39 items):** [`dashboard/docs/DASHBOARD_REVIEW.md`](../dashboard/docs/DASHBOARD_REVIEW.md)
- **Roadmap (N1N15):** [`dashboard/docs/DASHBOARD_ROADMAP.md`](../dashboard/docs/DASHBOARD_ROADMAP.md)
- **Rich Features Roadmap (AG):** [`dashboard/docs/RICH_FEATURES_ROADMAP.md`](../dashboard/docs/RICH_FEATURES_ROADMAP.md)
---
## API Routes
| Route | Method | Description |
| -------------------- | -------- | ---------------------------------------------------- |
| `/api/ollama` | GET/POST | Ollama proxy (list, load, unload, generate) |
| `/api/whisper` | GET | Whisper binary/model discovery |
| `/api/system` | GET | System info (chip, RAM, disk, brew, pressure) |
| `/api/system/memory` | GET | Memory drilldown (vm_stat breakdown + top processes) |
| `/api/system/exec` | POST | Safe shell command execution |
---
## Key Components
```
dashboard/src/app/
├── (mission-control)/mission-control/
│ ├── page.tsx # Main Mission Control page
│ └── components/
│ ├── RamBudgetBar.tsx # Stacked RAM budget visualization
│ ├── MemoryDrilldown.tsx # Process-level memory breakdown
│ └── MarkdownResponse.tsx # Markdown renderer for LLM output
├── (workspace)/components/ # Chat workspace (conversations, messages)
├── api/
│ ├── ollama/route.ts
│ ├── whisper/route.ts
│ ├── system/route.ts
│ └── system/memory/route.ts
└── lib/
├── format.ts # formatBytes, estimateRam, checkMemoryFit
├── db.ts # IndexedDB CRUD (conversations, projects, tasks)
├── cron.ts # Cron expression parser
└── scheduled-tasks.ts # Built-in task templates
```

View File

@ -19,19 +19,41 @@ This machine is behind an AT&T Forcepoint proxy that performs SSL deep packet in
### What Works Through Proxy
| Tool | Status | Notes |
| -------------------------- | ---------- | ------------------------------------- |
| `ollama pull` | ✅ Works | Ollama handles proxy natively |
| `brew install` | ✅ Works | Homebrew handles proxy |
| `npm install` | ✅ Works | With `NODE_TLS_REJECT_UNAUTHORIZED=0` |
| `curl` to Hugging Face | ❌ Blocked | Returns 19 KB HTML redirect page |
| `curl -k` to Hugging Face | ❌ Blocked | Still intercepted even with `-k` |
| `python requests` to HF | ❌ Blocked | SSL_CERTIFICATE_VERIFY_FAILED |
| `huggingface_hub` download | ❌ Blocked | Falls back to cached (broken) files |
| Tool | Status | Notes |
| -------------------------- | ---------- | ------------------------------------------- |
| `ollama pull` | ✅ Works | Ollama handles proxy natively |
| `brew install` | ✅ Works | Homebrew handles proxy |
| `npm install` | ✅ Works | With `NODE_TLS_REJECT_UNAUTHORIZED=0` |
| `git clone` (GitHub) | ✅ Works | With `GIT_SSL_NO_VERIFY=1` |
| `pip install` (PyPI) | ✅ Works | Via corporate Artifactory mirror |
| **`hf-mirror.com`** | ✅ Works | Chinese HuggingFace mirror, **not blocked** |
| `curl` to Hugging Face | ❌ Blocked | Returns 19 KB HTML redirect page |
| `curl -k` to Hugging Face | ❌ Blocked | Still intercepted even with `-k` |
| `python requests` to HF | ❌ Blocked | SSL_CERTIFICATE_VERIFY_FAILED |
| `huggingface_hub` download | ❌ Blocked | Falls back to cached (broken) files |
### Workaround: Download Off-Network
### Workaround 1: Use hf-mirror.com (recommended)
For Hugging Face model downloads (e.g., Whisper GGML files):
`hf-mirror.com` is a Chinese mirror of HuggingFace that **is NOT blocked** by Forcepoint. Replace `huggingface.co` with `hf-mirror.com` in any download URL:
```bash
# Instead of: https://huggingface.co/org/model/resolve/main/file.bin
# Use: https://hf-mirror.com/org/model/resolve/main/file.bin
# Example: download SNAC decoder (TTS)
curl -k -L -o models/snac_24khz/pytorch_model.bin \
"https://hf-mirror.com/hubertsiuzdak/snac_24khz/resolve/main/pytorch_model.bin"
# Example: download Whisper model
curl -k -L -o ~/whisper-models/ggml-large-v3-turbo.bin \
"https://hf-mirror.com/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo.bin"
```
The TTS scripts (`setup-tts.sh`, `download-tts-models.sh`) use this mirror automatically.
### Workaround 2: Download Off-Network
If the mirror is also blocked, use a non-corporate network:
1. **Disconnect** from corporate VPN/Wi-Fi
2. **Connect** to personal hotspot or home Wi-Fi

View File

@ -0,0 +1,230 @@
# 10 — Text-to-Speech (TTS) — Local Setup
> Local TTS on Apple Silicon: Orpheus TTS via Ollama + Qwen3-TTS 0.6B direct.
> Works through corporate proxy via `hf-mirror.com`.
> Last updated: 2026-02-21
---
## Overview
Two TTS engines for local speech generation — both run fully offline after initial setup.
| Engine | Model | Size | How It Runs | Quality | Speed |
| --------------- | --------------------------------- | ------ | ----------------------- | ------------------------------------------ | ------------------------ |
| **Orpheus TTS** | `sematre/orpheus:en` | 4 GB | Via Ollama (Metal GPU) | Great — expressive, 8 voices, emotion tags | ~11s for short sentences |
| **Qwen3-TTS** | `Qwen3-TTS-12Hz-0.6B-CustomVoice` | 1.2 GB | Direct Python (MPS/CPU) | Excellent — 10 languages, voice design | ~10-20s on MPS |
### Architecture
```
Text → Ollama (Orpheus 3B) → Audio Tokens → SNAC Decoder → WAV file
Text → Qwen3-TTS 0.6B (PyTorch MPS) → WAV file
```
---
## Quick Start (Fresh Laptop)
The **one-shot setup script** handles everything — works on any Apple Silicon Mac, including through corporate proxy:
```bash
cd __LOCAL_LLMs
bash setup-tts.sh
```
This installs: Python 3.12, venv, pip packages, Orpheus model (Ollama), SNAC decoder (hf-mirror.com), and optionally Qwen3-TTS 0.6B.
After setup:
```bash
.venv-qwen-tts/bin/python test_orpheus_tts.py
afplay test_orpheus_tara.wav
```
---
## Prerequisites
| Component | How to Install | Notes |
| ------------------------- | ---------------------------------- | ------------------------------ |
| **macOS + Apple Silicon** | — | M1/M2/M3/M4 (MPS acceleration) |
| **Homebrew** | `/bin/bash -c "$(curl -fsSL ...)"` | Package manager |
| **Ollama** | `brew install ollama` | Local LLM server |
| **Python 3.12** | `brew install python@3.12` | TTS packages need 3.12 |
All of the above are installed automatically by `setup-tts.sh`.
---
## Manual Setup (step by step)
If you prefer to run each step yourself instead of `setup-tts.sh`:
### 1. Python Environment
```bash
cd __LOCAL_LLMs
# Install Python 3.12
brew install python@3.12
# Create isolated venv
/opt/homebrew/bin/python3.12 -m venv .venv-qwen-tts
# Install packages
.venv-qwen-tts/bin/pip install -U snac qwen-tts
```
### 2. Orpheus TTS Model (via Ollama)
```bash
ollama serve & # start Ollama if not running
ollama pull sematre/orpheus:en # 4 GB, via Ollama registry (works through proxy)
```
### 3. SNAC Audio Decoder
Downloads via `hf-mirror.com`**works through corporate proxy**:
```bash
bash download-tts-models.sh snac # just SNAC (~76 MB)
```
Or manually:
```bash
mkdir -p models/snac_24khz
curl -k -sL -o models/snac_24khz/config.json \
"https://hf-mirror.com/hubertsiuzdak/snac_24khz/raw/main/config.json"
curl -k -L --progress-bar -o models/snac_24khz/pytorch_model.bin \
"https://hf-mirror.com/hubertsiuzdak/snac_24khz/resolve/main/pytorch_model.bin"
```
### 4. Qwen3-TTS 0.6B (optional)
```bash
bash download-tts-models.sh qwen # tokenizer + model (~1.7 GB)
```
After download everything runs **fully offline**.
---
## Usage
### Orpheus TTS (via Ollama)
```bash
# Make sure Ollama is running
ollama serve &
# Run test
.venv-qwen-tts/bin/python test_orpheus_tts.py
# Play output
afplay test_orpheus_tara.wav
```
**Voices:** `tara`, `leah`, `jess`, `leo`, `dan`, `mia`, `zac`, `zoe`
**Emotion tags:** `<laugh>`, `<chuckle>`, `<sigh>`, `<cough>`, `<sniffle>`, `<groan>`, `<yawn>`, `<gasp>`
```python
# Example prompt format
voice = "tara"
text = "<laugh> That's hilarious! Tell me more."
prompt = f"<custom_token_3><|begin_of_text|>{voice}: {text}<|eot_id|><custom_token_4><custom_token_5><custom_token_1>"
```
### Qwen3-TTS (direct Python)
```bash
.venv-qwen-tts/bin/python test_qwen_tts.py
afplay test_output_english.wav
```
**Features:**
- 10 languages (Chinese, English, Japanese, Korean, German, French, Russian, Portuguese, Spanish, Italian)
- Built-in speaker voices (Chelsie, Vivian, Ryan, etc.)
- Natural language emotion control: `instruct="Speak with excitement"`
- Voice cloning from a short audio sample (with Base model variant)
---
## File Inventory
```
__LOCAL_LLMs/
├── setup-tts.sh # ← START HERE — one-shot setup for fresh laptop
├── download-tts-models.sh # Download model weights (uses hf-mirror.com)
├── test_orpheus_tts.py # Orpheus TTS test (Ollama + SNAC)
├── test_qwen_tts.py # Qwen3-TTS test (direct Python)
├── .venv-qwen-tts/ # Python 3.12 venv (gitignored, created by setup)
├── models/ # Downloaded model weights (gitignored)
│ ├── snac_24khz/ # SNAC audio decoder (~76 MB)
│ ├── Qwen3-TTS-Tokenizer-12Hz/ # Qwen3-TTS tokenizer (optional)
│ └── Qwen3-TTS-12Hz-0.6B-CustomVoice/ # Qwen3-TTS model (~1.2 GB, optional)
└── *.wav # Generated audio output (gitignored)
```
---
## OSS TTS Landscape (as of Feb 2026)
### Speech-to-Text (STT)
| Model | By | Notes |
| ------------------------- | ------------------ | --------------------------------------------------- |
| **Whisper / whisper-cpp** | OpenAI / ggerganov | Gold standard, already installed, Metal-accelerated |
| **Faster Whisper** | SYSTRAN | 4× faster via CTranslate2 |
| **Distil-Whisper** | Hugging Face | 6× faster, 49% fewer params |
### Text-to-Speech (TTS)
| Model | By | Size | Notes |
| ---------------- | ------------ | --------- | ------------------------------------------------------- |
| **Qwen3-TTS** ⭐ | Alibaba | 0.6B1.7B | Best quality, 10 languages, voice cloning, Jan 2026 |
| **Orpheus TTS** | Canopy AI | 3B | Expressive, 8 voices, emotion tags, available on Ollama |
| **Kokoro** | HF Community | 82M | Very fast, near-commercial quality, Apache 2.0 |
| **Piper** | Rhasspy | ONNX | Lightweight, runs on Raspberry Pi |
| **F5-TTS** | SWivid | — | Zero-shot voice cloning, flow matching |
| **StyleTTS 2** | Columbia U | — | Human-level quality, style diffusion |
| **OuteTTS** | Community | — | Pure LLM-based TTS, runs via llama.cpp |
| **Bark** | Suno | — | Speech + music + sound effects |
---
## Corporate Proxy Notes
| Source | Status | Workaround |
| ------------------------------------------ | ---------- | --------------------------------------------------- |
| **Ollama registry** (`registry.ollama.ai`) | ✅ Works | Ollama pull uses its own CDN |
| **PyPI** (via `artifact.it.att.com`) | ✅ Works | Corporate Artifactory mirror |
| **GitHub releases** | ✅ Works | Direct download |
| **HuggingFace** (`huggingface.co`) | ❌ Blocked | Use `hf-mirror.com` as mirror (works through proxy) |
| **hf-mirror.com** (HF mirror) | ✅ Works | Chinese HF mirror, not blocked by Forcepoint |
Forcepoint CSO intercepts HTTPS and serves a block page for HuggingFace. No SSL workaround works for `huggingface.co`. However, **`hf-mirror.com`** (a Chinese mirror of HuggingFace) is **not blocked** and can be used to download model weights:
```bash
# Download SNAC config + weights via mirror
curl -k -L -o models/snac_24khz/config.json "https://hf-mirror.com/hubertsiuzdak/snac_24khz/raw/main/config.json"
curl -k -L -o models/snac_24khz/pytorch_model.bin "https://hf-mirror.com/hubertsiuzdak/snac_24khz/resolve/main/pytorch_model.bin"
```
All other sources (Ollama, pip, GitHub) also work fine through the proxy.
---
## Troubleshooting
| Problem | Fix |
| --------------------------------------------- | ----------------------------------------------------------------------------- |
| `OSError: couldn't connect to huggingface.co` | Use `hf-mirror.com` or run `bash setup-tts.sh` |
| `SNAC decoder not found` | Run `bash setup-tts.sh` or `bash download-tts-models.sh snac` |
| `Model not found at models/Qwen3-TTS-*` | Run `bash setup-tts.sh` or `bash download-tts-models.sh qwen` |
| Orpheus generates no audio tokens | Ensure `ollama serve` is running and `ollama list` shows `sematre/orpheus:en` |
| MPS out of memory for Qwen3-TTS | Close other apps (Windsurf uses ~18 GB). Or use `device="cpu"` in test script |
| Slow generation on CPU | Expected for 0.6B model. MPS should be ~2-3× faster |

View File

@ -1,310 +0,0 @@
# Mission Control Dashboard — Bug & Improvement Review
> Systematic code review of `__LOCAL_LLMs/dashboard/` (6 source files, 1,395 lines)
> Last updated: Feb 19, 2026
---
## File Inventory
| File | Lines | Purpose |
| ------------------------------------ | ----- | -------------------------------------------------------------------- |
| `src/app/page.tsx` | 1,079 | Main dashboard UI (single component) |
| `src/app/globals.css` | 91 | Design tokens, animations, base styles |
| `src/app/layout.tsx` | 20 | Root layout (metadata, dark mode) |
| `src/app/api/ollama/route.ts` | 117 | Ollama REST proxy (list, load, unload, pull, delete, show, generate) |
| `src/app/api/ollama/stream/route.ts` | 38 | Ollama streaming generate proxy (NDJSON) |
| `src/app/api/whisper/route.ts` | 66 | Whisper binary + GGML model discovery |
| `src/app/api/system/route.ts` | 162 | System info (chip, memory via vm_stat, disk, brew) |
**Stack:** Next.js 16, React 19, TailwindCSS v4, Lucide icons, TypeScript
---
## 1. Bugs
- [x] **B1. Hardcoded machine specs in header**`page.tsx:317`
Subtitle reads `Apple M4 Pro · 48 GB · {system?.platform}` — should use `system?.chip` and `formatBytes(system?.memory.total)` dynamically so it works on any machine.
- [x] **B2. Pull model blocks UI — no progress feedback**`api/ollama/route.ts:84-92`
`handlePull` calls Ollama with `stream: false`. Large models (20+ GB) block for 30+ minutes. The Next.js API route will likely timeout. Must use `stream: true` and pipe progress events to the client. _(Combined with F1.)_
- [x] **B3. Dead code: non-streaming `generate` action**`api/ollama/route.ts:69-82`
The `action === 'generate'` handler is unused — UI only uses `/api/ollama/stream`. Remove or keep as fallback with a comment.
- [x] **B4. Escape key closes modal during active streaming**`page.tsx:188-197`
Global `keydown` handler calls `setPromptModel(null)` unconditionally. Backdrop click correctly checks `!promptLoading`. Escape should also respect `promptLoading` to prevent discarding an in-flight response.
- [x] **B5. Auto-refresh (15s) fires during streaming/pull**`page.tsx:182-185`
`setInterval(fetchAll, 15000)` runs unconditionally. During streaming this causes background churn and potential UI flicker. Should pause while `promptLoading` or `pullLoading` is true.
- [x] **B6. Toast ID collision on HMR remount**`page.tsx:156-159`
`toastId.current` resets to 0 on component remount during dev. Use `Date.now()` or `crypto.randomUUID()` for robust uniqueness.
- [x] **B7. vm_stat page size hardcoded**`api/system/route.ts:103`
Hardcoded `16384`. Should parse from vm_stat's first line: `"(page size of NNNNN bytes)"` for portability.
- [x] **B8. Whisper models dir not configurable**`api/whisper/route.ts:24`
Hardcoded to `~/whisper-models`. Should scan multiple known paths (`/opt/homebrew/share/whisper-cpp/models/`, `~/whisper-models`, `~/.cache/whisper/`) or accept `WHISPER_MODELS_DIR` env var.
- [x] **B9. No AbortController for streaming fetch**`page.tsx:250-289`
Closing the prompt modal doesn't cancel the underlying fetch. The `reader.read()` loop continues in the background wasting CPU/bandwidth until the model finishes generating.
- [x] **B10. Brew shows "Loading..." when array is empty**`page.tsx:936-940`
When `system.brewPackages` is `[]` (all uninstalled), displays "Loading..." instead of "No packages found". Needs to distinguish "still fetching" vs "fetched but empty".
- [x] **B11. Prompt text not cleared on close without send**`page.tsx:951-957`
Backdrop click clears `promptText`, but Escape handler (B4 fix) should also clear it. Otherwise stale text persists when re-opening.
---
## 2. Code Quality
- [x] **CQ1. Monolithic 1,079-line single component**`page.tsx`
All interfaces, utilities, sub-components, and 900+ lines of JSX in one file. Extract to:
- `components/` — StatusDot, ProgressBar, ToastContainer, PromptModal, OllamaModelsPanel, SystemPanel, WhisperPanel, BrewPanel
- `lib/types.ts` — interfaces (OllamaModel, SystemData, etc.)
- `lib/format.ts` — formatBytes, formatUptime
- `lib/hooks.ts` — useAutoRefresh, useToasts, useOllamaActions
- [x] **CQ2. Pervasive inline styles instead of CSS/Tailwind classes**`page.tsx` (100+ occurrences)
Every `style={{ color: 'var(--text-tertiary)' }}` should be a utility class. Options: custom Tailwind theme mapping, or CSS utility classes in `globals.css` (e.g., `.text-muted`).
- [x] **CQ3. OLLAMA_URL duplicated**`api/ollama/route.ts:3` + `api/ollama/stream/route.ts:3`
Same `process.env.OLLAMA_URL || 'http://localhost:11434'` in two files. Extract to `lib/ollama-config.ts`.
- [x] **CQ4. No React Error Boundary**`page.tsx`
Unexpected API response shape crashes the entire dashboard. Add an `error.tsx` (Next.js App Router convention) for graceful recovery.
- [x] **CQ5. No loading skeleton / shimmer UI**
Initial load shows "..." placeholders. Skeleton cards would be more polished.
- [x] **CQ6. No TypeScript strict null checks in API responses**
API route handlers catch errors but return loosely typed JSON. Add Zod validation on the Ollama/system responses to prevent runtime surprises.
---
## 3. Features
- [x] **F1. Streaming pull with progress bar** _(fixes B2)_
Use Ollama `stream: true` for `/api/pull`. Create `/api/ollama/pull/route.ts` that pipes NDJSON progress. UI shows progress bar with `completed/total` bytes, speed, and ETA.
- [x] **F2. Model search/filter**
Search input above models list. Filter by name, family, quantization. Useful when 10+ models are installed.
- [x] **F3. Prompt history (localStorage)**
Store last 20 prompts with model name + timestamp. Dropdown in prompt modal to re-run previous prompts.
- [x] **F4. Chat mode (multi-turn conversation)**
Use Ollama `/api/chat` instead of `/api/generate`. Chat bubble layout with message history. System prompt input field.
- [x] **F5. Model comparison (side-by-side)**
Send same prompt to 2 models simultaneously. Display responses side-by-side with latency/quality comparison.
- [x] **F6. Token/s metrics after generation**
Parse `eval_count` and `eval_duration` from the final NDJSON chunk. Display tokens/second, total tokens, and latency in the response footer.
- [x] **F7. System resource sparklines (time-series)**
Ring buffer of memory/CPU snapshots (localStorage). Render mini sparkline charts in the System panel. Spot trends over time.
- [x] **F8. Ollama server logs viewer**
Read `~/.ollama/logs/` and display in a collapsible terminal-style panel. Filter by level. Auto-scroll.
- [x] **F9. Modelfile / template viewer**
The `show` action already fetches Modelfile, template, and system prompt. Display in a collapsible code block in expanded model details.
- [x] **F10. Dark/light theme toggle**
Add `:root.light` CSS variable overrides. Theme toggle with localStorage persistence. Current architecture supports this natively.
- [x] **F11. Keyboard shortcuts panel (`?` key)**
Show all shortcuts in a modal: ⌘+Enter (send), Esc (close), R (refresh), / (search models), ? (help).
- [x] **F12. Whisper transcription test**
Upload/record a short audio clip, transcribe locally via whisper-cli, display result with latency. Tests the full local STT pipeline.
- [x] **F13. Responsive mobile layout**
Better breakpoints for the 4-column stats row and 3-column main grid. Collapsible sidebar on mobile.
- [x] **F14. Model tags/labels (localStorage)**
User-defined tags (coding, fast, vision) with colored badges. Persisted in localStorage.
- [x] **F15. Extraction service integration panel**
Show extraction-service (port 4005) health status. Run test extractions against loaded Ollama models. Bridges dashboard to LysnrAI pipeline.
- [x] **F16. Auto-load preferred model**
Mark a model as "auto-load" (stored in localStorage). When Ollama is online but no models loaded, auto-load the preferred model.
---
## 4. Performance & Reliability
- [x] **P1. No request deduplication on Refresh**`page.tsx:164-176`
Rapid clicks on Refresh fire duplicate `fetchAll()` calls. Add a `fetchingRef` guard or disable the button during fetch (partially done for `actionLoading` but not for `fetchAll`).
- [x] **P2. Static cache never expires**`api/system/route.ts:81-90`
`staticCache` (chip, GPU, brew) lives forever in the server process. Brew package upgrades won't reflect. Add 5-minute TTL.
- [x] **P3. `du -sk ~/.ollama/models` on every refresh**`api/system/route.ts:41`
Traverses entire models directory every 15 seconds. Cache with 60-second TTL.
- [x] **P4. No fetch timeout on Ollama calls**`api/ollama/route.ts:5-12`
`fetchOllama` has no `AbortSignal` or timeout. If Ollama hangs, the dashboard hangs. Add 5-second timeout.
- [x] **P5. `system_profiler` slow on first load**`api/system/route.ts:52-53`
Takes ~2-3 seconds. Cached after first call, but first dashboard load waits. Consider eager background fetch on server start or return placeholder.
---
## 5. Security & Hardening
- [x] **S1. No input validation on model names**`api/ollama/route.ts:50-51`
`model` from request body passed directly to Ollama. Add regex validation: `^[a-zA-Z0-9._:/-]{1,256}$`.
- [x] **S2. Shell command interpolation pattern**`api/system/route.ts:67`
`execAsync(\`brew list --versions ${pkg}\`)`— safe today (hardcoded targets) but fragile. Use`execFile('brew', ['list', '--versions', pkg])` for safety.
- [x] **S3. No CORS or auth** _(acceptable for local-only, documented)_
Any local process can call API routes. Fine for dev tool; document the assumption.
---
## 6. Implementation Tracker
### Sprint 1 — Critical Bug Fixes _(est. 12 hrs)_
| # | ID | Task | Effort | Commit |
| --- | --------- | ----------------------------------------- | ------ | --------- |
| 1 | - [x] B4 | Guard Escape key during streaming | 5 min | `2da67c2` |
| 2 | - [x] B5 | Pause auto-refresh during prompt/pull | 10 min | `2da67c2` |
| 3 | - [x] B9 | Add AbortController to streaming fetch | 15 min | `2da67c2` |
| 4 | - [x] B1 | Dynamic chip/RAM in header | 5 min | `2da67c2` |
| 5 | - [x] B11 | Clear prompt text on Escape close | 5 min | `2da67c2` |
| 6 | - [x] P4 | Add timeout to Ollama fetch calls | 10 min | `2da67c2` |
| 7 | - [x] B3 | Remove dead generate action (or document) | 5 min | `2da67c2` |
| 8 | - [x] B6 | Use Date.now() for toast IDs | 2 min | `2da67c2` |
| 9 | - [x] B10 | Fix brew "Loading..." vs "empty" state | 5 min | `2da67c2` |
### Sprint 2 — Pull Progress + Metrics _(est. 23 hrs)_
| # | ID | Task | Effort | Commit |
| --- | ----------- | ----------------------------------- | ------ | --------- |
| 10 | - [x] B2+F1 | Streaming pull with progress bar | 60 min | `2d9475b` |
| 11 | - [x] F6 | Display tokens/s after generation | 30 min | `2d9475b` |
| 12 | - [x] B7 | Parse vm_stat page size dynamically | 10 min | `2d9475b` |
| 13 | - [x] B8 | Multi-path whisper model discovery | 15 min | `2d9475b` |
### Sprint 3 — Component Refactor _(est. 23 hrs)_
| # | ID | Task | Effort | Commit |
| --- | --------- | --------------------------------------- | ------ | --------- |
| 14 | - [x] CQ1 | Extract components into separate files | 90 min | `75a3cd0` |
| 15 | - [x] CQ4 | Add error.tsx Error Boundary | 15 min | `75a3cd0` |
| 16 | - [x] CQ3 | Shared ollama-config.ts | 10 min | `75a3cd0` |
| 17 | - [x] CQ2 | Consolidate inline styles → CSS classes | 45 min | `ed93a6f` |
| 18 | - [x] S1 | Add model name input validation | 10 min | `75a3cd0` |
| 19 | - [x] S2 | Replace exec → execFile for brew | 10 min | `75a3cd0` |
### Sprint 4 — UX Enhancements _(est. 34 hrs)_
| # | ID | Task | Effort | Commit |
| --- | --------- | ------------------------------------ | ------ | --------- |
| 20 | - [x] F3 | Prompt history (localStorage) | 45 min | `9c2f5f3` |
| 21 | - [x] F9 | Modelfile viewer in expanded details | 30 min | `9c2f5f3` |
| 22 | - [x] F4 | Chat mode (multi-turn via /api/chat) | 90 min | `ed93a6f` |
| 23 | - [x] F2 | Model search/filter | 30 min | `9c2f5f3` |
| 24 | - [x] F11 | Keyboard shortcuts panel | 20 min | `9c2f5f3` |
### Sprint 5 — Integration & Polish _(est. 23 hrs)_
| # | ID | Task | Effort | Commit |
| --- | ----------- | -------------------------- | ------ | --------- |
| 25 | - [x] F15 | Extraction service panel | 60 min | `8bdd5ee` |
| 26 | - [x] F12 | Whisper transcription test | 45 min | `8bdd5ee` |
| 27 | - [x] F7 | System resource sparklines | 45 min | `8bdd5ee` |
| 28 | - [x] CQ5 | Loading skeleton UI | 20 min | `8bdd5ee` |
| 29 | - [x] P1-P3 | Request dedup + cache TTLs | 30 min | `b1fda3a` |
| 30 | - [x] F16 | Auto-load preferred model | 20 min | `ed93a6f` |
### Deferred (nice-to-have)
| ID | Task | Notes |
| --------- | ------------------------------- | --------- |
| - [x] F5 | Model comparison (side-by-side) | `8bdd5ee` |
| - [x] F10 | Dark/light theme toggle | `ed93a6f` |
| - [x] F13 | Responsive mobile layout | `8bdd5ee` |
| - [x] F14 | Model tags/labels | `ed93a6f` |
| - [x] CQ6 | Zod validation on API responses | `ed93a6f` |
| - [x] F8 | Ollama server logs viewer | `8bdd5ee` |
| - [x] S3 | CORS / auth (documented) | `8bdd5ee` |
---
## 7. Commit Log
_Commits will be added here as work progresses._
| # | Date | Commit | Sprint | Items Completed |
| --- | ------ | --------- | -------- | ------------------------------------ |
| 1 | Feb 19 | `2da67c2` | Sprint 1 | B1, B3, B4, B5, B6, B9, B10, B11, P4 |
| 2 | Feb 19 | `2d9475b` | Sprint 2 | B2, B7, B8, F1, F6 |
| 3 | Feb 19 | `75a3cd0` | Sprint 3 | CQ1, CQ3, CQ4, S1, S2 |
| 4 | Feb 19 | `9c2f5f3` | Sprint 4 | F2, F3, F9, F11 |
| 5 | Feb 19 | `b1fda3a` | Sprint 5 | P1, P2, P3 |
| 6 | Feb 19 | `ed93a6f` | Sprint 6 | CQ2, CQ6, P5, F4, F10, F14, F16 |
| 7 | Feb 19 | `8bdd5ee` | Sprint 7 | F5, F7, F8, F12, F13, F15, CQ5, S3 |
---
> **39 items total:** 11 bugs, 6 code quality, 16 features, 5 performance, 3 security
> **All 39 items completed** across 7 sprints (9 code commits + doc updates)
> **Actual total effort:** ~8 hours across 7 sprints
---
## 8. Next Wave — Model Intelligence & Pre-Load Metrics
> Proposed improvements focused on helping users make informed decisions **before** loading a model.
### Tier A — Pre-Load Decision Metrics _(est. 45 min)_
| ID | Feature | Description |
| --- | ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------- |
| N1 | **Estimated RAM per model** | Approximate from disk size: Q4_K_M ≈ 1.2×disk in RAM. Show on every model card (e.g., `~22 GB RAM`), not just running models. |
| N2 | **"Will it fit?" indicator** | Compare estimated RAM vs `system.memory.free + cached`. Color-code: 🟢 Fits, 🟡 Tight (80100%), 🔴 Won't fit. Show on Load button or as badge. |
| N3 | **Aggregate loaded model RAM** | Sum VRAM of all running models. Display at top of models panel: "3 models loaded · 28.5 GB VRAM". |
### Tier B — Rich Model Metadata _(est. 60 min)_
| ID | Feature | Description |
| --- | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------- | ---------------- | ------------------------------------------ |
| N4 | **RAM budget bar** | Horizontal stacked bar: `[OS+Apps | Model A (loaded) | Model B (loaded) | Free]`. Instant visual of memory headroom. |
| N5 | **Context window size** | Fetch `context_length` from Ollama `/api/show``model_info`. Display on card (e.g., `128k ctx`). Critical for knowing max prompt length. |
### Tier C — Model Intelligence Badges _(est. 45 min)_
| ID | Feature | Description |
| --- | --------------------------- | --------------------------------------------------------------------------------------------------------------------------------- |
| N6 | **`<think>` warning badge** | If model is DeepSeek R1 family, show ⚠️ badge: "Emits `<think>` traces — strip before JSON.parse". Prevents silent JSON failures. |
| N7 | **Vision model indicator** | If model is multimodal (llava, qwen2.5vl), show 👁 badge. These need image input — text-only prompts are suboptimal. |
| N8 | **Architecture badge** | Show model arch (llama, qwen2, phi3, deepseek2) as subtle pill on the card. Currently buried in expanded details. |
| N9 | **Sort/order models** | Dropdown to sort by: name, size, parameters, running status, last modified. Currently uses Ollama's default order. |
| N10 | **Ollama version display** | Call `/api/version`. Show in Ollama status card. Useful for debugging model compatibility. |
### Tier D — Runtime Metrics & UX _(est. 30 min)_
| ID | Feature | Description |
| --- | --------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| N11 | **Last known tok/s per model** | Persist `StreamMetrics.tokensPerSec` in localStorage keyed by model. Show on card (e.g., `~45 tok/s`). Compare speeds without re-benchmarking. |
| N12 | **Auto-unload countdown** | Replace static `Expires: 3:45 PM` with live countdown: `Unloads in 4m 32s`. More actionable. |
| N13 | **Session stats per model** | Track prompts sent + tokens generated per model in session. Show in expanded details. |
| N14 | **Delete confirmation + reclaim** | Show "Delete qwen2.5-coder:32b? Reclaim 18.5 GB disk." before deleting. Currently no confirmation. |
| N15 | **Simultaneous load suggestions** | Based on available RAM, suggest which models can be co-loaded. E.g., "Can co-load llama3.1:8b + qwen2.5-coder:32b (28 GB, 20 GB free)". |
### Implementation Plan
| Sprint | Items | Focus | Effort |
| ------ | ----------------------- | ------------------------ | ------- |
| 8 | N1, N2, N3 | Pre-load RAM estimates | ~45 min |
| 9 | N4, N5 | RAM bar + context window | ~60 min |
| 10 | N6, N7, N8, N9, N10 | Badges + sort + version | ~45 min |
| 11 | N11, N12, N13, N14, N15 | Runtime metrics + UX | ~30 min |

View File

@ -2,7 +2,7 @@
> Complete guide for the local AI inference stack on the ByteLyst development machine.
> Hardware: **Apple M4 Pro · 48 GB LPDDR5 · macOS Tahoe**
> Last updated: 2026-02-19
> Last updated: 2026-02-21
---
@ -16,8 +16,11 @@ ollama serve # or: brew services start ollama
ollama run qwen2.5-coder:32b # best coding model for this hardware
# 3. Launch Mission Control dashboard
cd __LOCAL_LLMs/dashboard && npm run dev -- -p 3100
# Open http://localhost:3100
cd __LOCAL_LLMs/dashboard && npm run dev
# Open http://localhost:3000
# 4. (Optional) Set up TTS
cd __LOCAL_LLMs && bash setup-tts.sh
```
---
@ -35,6 +38,7 @@ cd __LOCAL_LLMs/dashboard && npm run dev -- -p 3100
| 07 | [Model Recommendations](07-model-recommendations.md) | Tiered model guide by use case, size, and quality for M4 Pro 48GB |
| 08 | [Troubleshooting & Corporate Proxy](08-troubleshooting.md) | Common issues, Forcepoint proxy workarounds, MLX warnings |
| 09 | [Environment Variables](09-environment-variables.md) | All config vars for Ollama, Whisper, dashboard, evals |
| 10 | [Text-to-Speech](10-text-to-speech.md) | Orpheus TTS via Ollama, Qwen3-TTS 0.6B, setup, corporate proxy |
---
@ -53,28 +57,42 @@ __LOCAL_LLMs/
│ ├── 06-extraction-service-evals.md
│ ├── 07-model-recommendations.md
│ ├── 08-troubleshooting.md
│ └── 09-environment-variables.md
├── dashboard/ ← Next.js Mission Control app (port 3100)
│ ├── src/app/page.tsx ← main dashboard UI
│ ├── 09-environment-variables.md
│ └── 10-text-to-speech.md
├── dashboard/ ← Next.js Mission Control app (port 3000)
│ ├── src/app/(mission-control)/ ← Mission Control page + memory drilldown
│ ├── src/app/api/ollama/route.ts ← Ollama API proxy (list, load, unload, generate)
│ ├── src/app/api/whisper/route.ts ← Whisper binary/model discovery
│ └── src/app/api/system/route.ts ← System info (chip, RAM, disk, brew)
│ ├── src/app/api/system/route.ts ← System info (chip, RAM, disk, brew)
│ └── src/app/api/system/memory/route.ts ← Memory drilldown (vm_stat + top processes)
├── setup-tts.sh ← One-shot TTS setup for fresh laptop
├── download-tts-models.sh ← Download model weights (uses hf-mirror.com)
├── test_orpheus_tts.py ← Orpheus TTS test (Ollama + SNAC decoder)
├── test_qwen_tts.py ← Qwen3-TTS 0.6B test (direct Python, MPS/CPU)
├── .venv-qwen-tts/ ← Python 3.12 venv for TTS (gitignored)
├── models/ ← Downloaded TTS model weights (gitignored)
└── LOCAL_LLMs_setup_mac_m4_48gb.md ← original doc (preserved, see docs/ for latest)
```
---
## Current Installation Status (2026-02-19)
## Current Installation Status (2026-02-21)
| Component | Version | Status | Disk Usage |
| ----------------------------------- | ---------- | ----------------------------- | ---------- |
| Ollama | 0.16.2 | ✅ Installed via brew | — |
| qwen2.5-coder:32b | — | ✅ Downloaded | 19 GB |
| llama3.1:8b | — | ✅ Downloaded | 4.9 GB |
| whisper-cpp | 1.8.3 | ✅ Installed via brew | 9.6 MB |
| whisper model (ggml-large-v3-turbo) | — | ❌ Blocked by corporate proxy | — |
| ffmpeg | 8.0.1 | ✅ Installed via brew | 53.3 MB |
| Mission Control Dashboard | Next.js 16 | ✅ Built, runs on :3100 | — |
| Component | Version | Status | Disk Usage |
| ----------------------------------- | ---------- | ------------------------------------------ | ---------- |
| Ollama | 0.16.2 | ✅ Installed via brew | — |
| qwen2.5-coder:32b | — | ✅ Downloaded | 19 GB |
| qwen2.5-coder:7b | — | ✅ Downloaded | 4.7 GB |
| deepseek-r1:32b | — | ✅ Downloaded | 19 GB |
| llama3.1:8b | — | ✅ Downloaded | 4.9 GB |
| sematre/orpheus:en (TTS) | — | ✅ Downloaded via Ollama | 4 GB |
| whisper-cpp | 1.8.3 | ✅ Installed via brew | 9.6 MB |
| whisper model (ggml-large-v3-turbo) | — | ✅ Downloaded via hf-mirror.com | 1.5 GB |
| ffmpeg | 8.0.1 | ✅ Installed via brew | 53.3 MB |
| Python 3.12 (TTS venv) | 3.12.12 | ✅ Installed via brew + venv created | ~2 GB |
| SNAC decoder (TTS) | — | ✅ Downloaded via hf-mirror.com | 76 MB |
| Qwen3-TTS 0.6B | — | ✅ Downloaded via hf-mirror.com | 1.7 GB |
| Mission Control Dashboard | Next.js 16 | ✅ Built, runs on :3000 (memory drilldown) | — |
---

View File

@ -0,0 +1,174 @@
#!/bin/bash
# ============================================================
# Download TTS Model Weights
#
# Downloads SNAC decoder + Qwen3-TTS from HuggingFace.
# Uses hf-mirror.com which works through corporate proxy.
# Falls back to huggingface.co if mirror is unreachable.
#
# No Python venv required — uses curl only.
#
# Usage:
# bash download-tts-models.sh # download all
# bash download-tts-models.sh snac # SNAC decoder only
# bash download-tts-models.sh qwen # Qwen3-TTS only
# ============================================================
set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
MODELS_DIR="$SCRIPT_DIR/models"
GREEN='\033[0;32m'
RED='\033[0;31m'
NC='\033[0m'
ok() { echo -e "${GREEN}${NC} $1"; }
fail() { echo -e "${RED}${NC} $1"; exit 1; }
echo "=== TTS Model Downloader ==="
echo ""
# ── Pick HuggingFace source ─────────────────────────────────
# Try hf-mirror.com first (works through corporate proxy)
# Fall back to huggingface.co (requires non-corporate network)
HF_BASE=""
echo "Testing hf-mirror.com..."
if curl -k -s --max-time 5 "https://hf-mirror.com/hubertsiuzdak/snac_24khz/raw/main/config.json" | python3 -c "import sys,json; json.load(sys.stdin)" &>/dev/null; then
HF_BASE="https://hf-mirror.com"
ok "Using hf-mirror.com (works through corporate proxy)"
else
echo "Mirror unavailable. Testing huggingface.co..."
if curl -s --max-time 5 "https://huggingface.co/api/models/hubertsiuzdak/snac_24khz" -o /dev/null 2>/dev/null; then
HF_BASE="https://huggingface.co"
ok "Using huggingface.co directly"
else
fail "Cannot reach hf-mirror.com or huggingface.co. If on corporate network, try from home WiFi."
fi
fi
echo ""
mkdir -p "$MODELS_DIR"
# ── Helper: download with validation ────────────────────────
download_file() {
local URL="$1"
local DEST="$2"
local DESC="$3"
echo " Downloading $DESC..."
curl -k -L --progress-bar -o "$DEST" "$URL"
# Verify not an HTML block page
FILE_HEAD=$(head -c 50 "$DEST" 2>/dev/null)
if echo "$FILE_HEAD" | grep -qi "<!DOCTYPE\|<html"; then
rm -f "$DEST"
fail "Downloaded $DESC is HTML (proxy block page). Try from non-corporate network."
fi
}
# ── 1. SNAC 24kHz decoder ───────────────────────────────────
download_snac() {
echo "=== [SNAC] 24kHz Audio Decoder (~76 MB) ==="
mkdir -p "$MODELS_DIR/snac_24khz"
if [ -f "$MODELS_DIR/snac_24khz/pytorch_model.bin" ]; then
SIZE=$(stat -f%z "$MODELS_DIR/snac_24khz/pytorch_model.bin" 2>/dev/null || stat -c%s "$MODELS_DIR/snac_24khz/pytorch_model.bin" 2>/dev/null || echo 0)
if [ "$SIZE" -gt 1000000 ]; then
ok "Already downloaded ($(echo "scale=1; $SIZE/1048576" | bc) MB)"
echo ""
return
fi
fi
download_file "$HF_BASE/hubertsiuzdak/snac_24khz/raw/main/config.json" \
"$MODELS_DIR/snac_24khz/config.json" "config.json"
download_file "$HF_BASE/hubertsiuzdak/snac_24khz/resolve/main/pytorch_model.bin" \
"$MODELS_DIR/snac_24khz/pytorch_model.bin" "pytorch_model.bin (~76 MB)"
ok "SNAC decoder downloaded"
echo ""
}
# ── 2. Qwen3-TTS Tokenizer ──────────────────────────────────
download_qwen_tokenizer() {
echo "=== [Qwen3-TTS] Tokenizer (~650 MB) ==="
local DIR="$MODELS_DIR/Qwen3-TTS-Tokenizer-12Hz"
mkdir -p "$DIR"
if [ -f "$DIR/model.safetensors" ]; then
SIZE=$(stat -f%z "$DIR/model.safetensors" 2>/dev/null || stat -c%s "$DIR/model.safetensors" 2>/dev/null || echo 0)
if [ "$SIZE" -gt 100000000 ]; then
ok "Already downloaded ($(echo "scale=1; $SIZE/1048576" | bc) MB)"
echo ""
return
fi
fi
for f in config.json configuration.json preprocessor_config.json; do
download_file "$HF_BASE/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/$f" \
"$DIR/$f" "$f"
done
download_file "$HF_BASE/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/model.safetensors" \
"$DIR/model.safetensors" "model.safetensors (~650 MB)"
ok "Qwen3-TTS Tokenizer downloaded"
echo ""
}
# ── 3. Qwen3-TTS 0.6B model ─────────────────────────────────
download_qwen_model() {
echo "=== [Qwen3-TTS] 0.6B CustomVoice (~1.2 GB) ==="
local DIR="$MODELS_DIR/Qwen3-TTS-12Hz-0.6B-CustomVoice"
mkdir -p "$DIR"
if [ -f "$DIR/model.safetensors" ]; then
SIZE=$(stat -f%z "$DIR/model.safetensors" 2>/dev/null || stat -c%s "$DIR/model.safetensors" 2>/dev/null || echo 0)
if [ "$SIZE" -gt 100000000 ]; then
ok "Already downloaded ($(echo "scale=1; $SIZE/1048576" | bc) MB)"
echo ""
return
fi
fi
for f in config.json generation_config.json; do
download_file "$HF_BASE/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/$f" \
"$DIR/$f" "$f"
done
download_file "$HF_BASE/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/model.safetensors" \
"$DIR/model.safetensors" "model.safetensors (~1.2 GB)"
ok "Qwen3-TTS 0.6B downloaded"
echo ""
}
# ── Run downloads ────────────────────────────────────────────
case "${1:-all}" in
snac)
download_snac
;;
qwen)
download_qwen_tokenizer
download_qwen_model
;;
all)
download_snac
download_qwen_tokenizer
download_qwen_model
;;
*)
echo "Usage: bash download-tts-models.sh [snac|qwen|all]"
exit 1
;;
esac
# ── Summary ──────────────────────────────────────────────────
echo "=== Downloads complete ==="
echo ""
echo "Disk usage:"
du -sh "$MODELS_DIR"/* 2>/dev/null | sed 's/^/ /'
echo ""
echo "Test commands:"
echo " .venv-qwen-tts/bin/python test_orpheus_tts.py # Orpheus via Ollama"
echo " .venv-qwen-tts/bin/python test_qwen_tts.py # Qwen3-TTS direct"

256
__LOCAL_LLMs/setup-tts.sh Executable file
View File

@ -0,0 +1,256 @@
#!/bin/bash
# ============================================================
# TTS Setup — One-Shot Script for Fresh Laptop
#
# Sets up Orpheus TTS (via Ollama) and Qwen3-TTS (direct Python)
# on Apple Silicon Macs. Works through corporate proxy.
#
# What this does:
# 1. Installs Python 3.12 via Homebrew (if missing)
# 2. Creates Python venv with TTS packages
# 3. Pulls Orpheus TTS model via Ollama
# 4. Downloads SNAC audio decoder via hf-mirror.com
# 5. (Optional) Downloads Qwen3-TTS 0.6B via hf-mirror.com
#
# Prerequisites:
# - macOS with Apple Silicon (M1/M2/M3/M4)
# - Homebrew installed
# - Ollama installed (brew install ollama)
#
# Usage:
# bash setup-tts.sh
#
# After setup, test with:
# .venv-qwen-tts/bin/python test_orpheus_tts.py
# afplay test_orpheus_tara.wav
# ============================================================
set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
VENV="$SCRIPT_DIR/.venv-qwen-tts"
MODELS_DIR="$SCRIPT_DIR/models"
# HuggingFace mirror that works through corporate proxy
HF_MIRROR="https://hf-mirror.com"
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
ok() { echo -e "${GREEN}${NC} $1"; }
warn() { echo -e "${YELLOW}${NC} $1"; }
fail() { echo -e "${RED}${NC} $1"; exit 1; }
step() { echo -e "\n${GREEN}=== $1 ===${NC}"; }
echo "╔══════════════════════════════════════════════╗"
echo "║ TTS Setup — Local Speech Generation ║"
echo "║ Orpheus TTS (Ollama) + Qwen3-TTS (Python) ║"
echo "╚══════════════════════════════════════════════╝"
echo ""
# ── 0. Check prerequisites ──────────────────────────────────
step "Checking prerequisites"
# Homebrew
if ! command -v brew &>/dev/null; then
fail "Homebrew not found. Install: /bin/bash -c \"\$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)\""
fi
ok "Homebrew"
# Ollama
if ! command -v ollama &>/dev/null; then
warn "Ollama not found. Installing..."
brew install ollama
fi
ok "Ollama installed"
# Check if Ollama is running
if ! curl -s --max-time 2 http://localhost:11434/api/tags &>/dev/null; then
warn "Ollama not running. Starting..."
ollama serve &>/dev/null &
sleep 3
if ! curl -s --max-time 2 http://localhost:11434/api/tags &>/dev/null; then
fail "Could not start Ollama. Try manually: ollama serve"
fi
fi
ok "Ollama running on port 11434"
# Apple Silicon check
ARCH=$(uname -m)
if [ "$ARCH" != "arm64" ]; then
warn "Not Apple Silicon ($ARCH). MPS acceleration won't be available."
fi
# ── 1. Install Python 3.12 ──────────────────────────────────
step "Python 3.12"
PYTHON_CMD=""
# Check various Python 3.12 locations
for cmd in python3.12 /opt/homebrew/bin/python3.12 /usr/local/bin/python3.12; do
if command -v "$cmd" &>/dev/null; then
PYTHON_CMD="$cmd"
break
fi
done
if [ -z "$PYTHON_CMD" ]; then
warn "Python 3.12 not found. Installing via Homebrew..."
brew install python@3.12
PYTHON_CMD="/opt/homebrew/bin/python3.12"
fi
PYTHON_VER=$("$PYTHON_CMD" --version 2>&1)
ok "$PYTHON_VER at $PYTHON_CMD"
# ── 2. Create venv ──────────────────────────────────────────
step "Python virtual environment"
if [ -f "$VENV/bin/python" ]; then
ok "Venv exists at $VENV"
else
echo "Creating venv..."
"$PYTHON_CMD" -m venv "$VENV"
ok "Venv created at $VENV"
fi
# ── 3. Install Python packages ──────────────────────────────
step "Python packages"
# Check if snac is installed (quick proxy for all packages)
if "$VENV/bin/python" -c "import snac" &>/dev/null; then
ok "Packages already installed (snac, torch, etc.)"
else
echo "Installing packages (this may take a few minutes)..."
"$VENV/bin/pip" install -U pip --quiet
"$VENV/bin/pip" install -U snac qwen-tts --quiet
ok "Packages installed"
fi
# ── 4. Pull Orpheus TTS model ───────────────────────────────
step "Orpheus TTS model (Ollama)"
if ollama list 2>/dev/null | grep -q "orpheus"; then
ok "Orpheus TTS already downloaded"
else
echo "Pulling sematre/orpheus:en (4 GB)..."
NO_PROXY="ollama.com,registry.ollama.ai" ollama pull sematre/orpheus:en
ok "Orpheus TTS downloaded"
fi
# ── 5. Download SNAC decoder ────────────────────────────────
step "SNAC 24kHz audio decoder (~76 MB)"
mkdir -p "$MODELS_DIR/snac_24khz"
if [ -f "$MODELS_DIR/snac_24khz/pytorch_model.bin" ]; then
SIZE=$(stat -f%z "$MODELS_DIR/snac_24khz/pytorch_model.bin" 2>/dev/null || stat -c%s "$MODELS_DIR/snac_24khz/pytorch_model.bin" 2>/dev/null)
if [ "$SIZE" -gt 1000000 ]; then
ok "SNAC decoder already downloaded ($(echo "scale=1; $SIZE/1048576" | bc) MB)"
else
warn "SNAC file looks corrupted (${SIZE} bytes). Re-downloading..."
rm -f "$MODELS_DIR/snac_24khz/pytorch_model.bin"
fi
fi
if [ ! -f "$MODELS_DIR/snac_24khz/pytorch_model.bin" ]; then
echo "Downloading config.json..."
curl -k -sL -o "$MODELS_DIR/snac_24khz/config.json" \
"$HF_MIRROR/hubertsiuzdak/snac_24khz/raw/main/config.json"
# Verify config is JSON (not an HTML block page)
if ! python3 -c "import json; json.load(open('$MODELS_DIR/snac_24khz/config.json'))" &>/dev/null; then
fail "Downloaded config.json is not valid JSON. The mirror may be blocked. Try from home network."
fi
ok "config.json downloaded"
echo "Downloading pytorch_model.bin (~76 MB)..."
curl -k -L --progress-bar -o "$MODELS_DIR/snac_24khz/pytorch_model.bin" \
"$HF_MIRROR/hubertsiuzdak/snac_24khz/resolve/main/pytorch_model.bin"
# Verify it's a real model file (zip/pytorch format), not HTML
FILE_TYPE=$(file -b "$MODELS_DIR/snac_24khz/pytorch_model.bin" | head -c 20)
if echo "$FILE_TYPE" | grep -qi "html"; then
rm -f "$MODELS_DIR/snac_24khz/pytorch_model.bin"
fail "Downloaded model is HTML (proxy block page). Try from home network."
fi
ok "SNAC decoder downloaded"
fi
# Verify SNAC loads in Python
echo "Verifying SNAC decoder loads..."
if "$VENV/bin/python" -c "
import snac, torch
model = snac.SNAC.from_pretrained('$MODELS_DIR/snac_24khz')
print(f'SNAC: {sum(p.numel() for p in model.parameters())/1e6:.1f}M parameters')
" 2>/dev/null; then
ok "SNAC decoder verified"
else
fail "SNAC decoder failed to load. Delete models/snac_24khz/ and re-run."
fi
# ── 6. (Optional) Download Qwen3-TTS ────────────────────────
step "Qwen3-TTS 0.6B (optional, ~1.7 GB total)"
QWEN_TOKENIZER_DIR="$MODELS_DIR/Qwen3-TTS-Tokenizer-12Hz"
QWEN_MODEL_DIR="$MODELS_DIR/Qwen3-TTS-12Hz-0.6B-CustomVoice"
if [ -d "$QWEN_MODEL_DIR" ] && [ -f "$QWEN_MODEL_DIR/config.json" ]; then
ok "Qwen3-TTS already downloaded"
else
echo "Qwen3-TTS 0.6B requires ~1.7 GB download (tokenizer + model)."
echo "This is optional — Orpheus TTS (above) works without it."
read -p "Download Qwen3-TTS? [y/N] " -n 1 -r
echo
if [[ $REPLY =~ ^[Yy]$ ]]; then
# Tokenizer (~650 MB)
echo "Downloading Qwen3-TTS Tokenizer (~650 MB)..."
mkdir -p "$QWEN_TOKENIZER_DIR"
for f in config.json configuration.json preprocessor_config.json; do
curl -k -sL -o "$QWEN_TOKENIZER_DIR/$f" \
"$HF_MIRROR/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/$f" 2>/dev/null || true
done
curl -k -L --progress-bar -o "$QWEN_TOKENIZER_DIR/model.safetensors" \
"$HF_MIRROR/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/model.safetensors"
ok "Tokenizer downloaded"
# Model
echo "Downloading Qwen3-TTS 0.6B (~1.2 GB)..."
mkdir -p "$QWEN_MODEL_DIR"
for f in config.json generation_config.json; do
curl -k -sL -o "$QWEN_MODEL_DIR/$f" \
"$HF_MIRROR/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/$f" 2>/dev/null || true
done
curl -k -L --progress-bar -o "$QWEN_MODEL_DIR/model.safetensors" \
"$HF_MIRROR/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/model.safetensors"
ok "Qwen3-TTS 0.6B downloaded"
else
warn "Skipped. You can re-run this script later to download."
fi
fi
# ── Summary ──────────────────────────────────────────────────
step "Setup Complete"
echo ""
echo "Installed components:"
echo " Orpheus TTS (Ollama): $(ollama list 2>/dev/null | grep orpheus | awk '{print $NF}' || echo 'ready')"
echo " SNAC decoder: $MODELS_DIR/snac_24khz/"
if [ -d "$QWEN_MODEL_DIR" ] && [ -f "$QWEN_MODEL_DIR/config.json" ]; then
echo " Qwen3-TTS 0.6B: $QWEN_MODEL_DIR/"
else
echo " Qwen3-TTS 0.6B: (not installed — re-run setup to add)"
fi
echo ""
echo "Disk usage:"
du -sh "$MODELS_DIR"/* 2>/dev/null | sed 's/^/ /'
echo ""
echo "Test commands:"
echo " $VENV/bin/python $SCRIPT_DIR/test_orpheus_tts.py"
echo " afplay test_orpheus_tara.wav"
if [ -d "$QWEN_MODEL_DIR" ]; then
echo " $VENV/bin/python $SCRIPT_DIR/test_qwen_tts.py"
fi
echo ""
echo "Voices: tara, leah, jess, leo, dan, mia, zac, zoe"
echo "Emotion: <laugh>, <chuckle>, <sigh>, <cough>, <groan>, <yawn>, <gasp>"

110
__LOCAL_LLMs/start-dashboard.sh Executable file
View File

@ -0,0 +1,110 @@
#!/bin/bash
# ============================================================
# Start Mission Control Dashboard + Ollama
#
# Usage:
# bash start-dashboard.sh # start dashboard + ensure Ollama running
# bash start-dashboard.sh stop # stop dashboard
# bash start-dashboard.sh status # check status
# ============================================================
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
DASHBOARD_DIR="$SCRIPT_DIR/dashboard"
PORT=3000
OLLAMA_URL="http://localhost:11434"
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m'
ok() { echo -e "${GREEN}${NC} $1"; }
warn() { echo -e "${YELLOW}${NC} $1"; }
fail() { echo -e "${RED}${NC} $1"; }
case "${1:-start}" in
stop)
echo "Stopping dashboard..."
PID=$(lsof -ti :$PORT 2>/dev/null)
if [ -n "$PID" ]; then
kill "$PID" 2>/dev/null
ok "Dashboard stopped (PID $PID)"
else
warn "Dashboard not running on port $PORT"
fi
exit 0
;;
status)
echo "=== Status ==="
# Ollama
if curl -s --max-time 2 "$OLLAMA_URL/api/tags" &>/dev/null; then
MODELS=$(curl -s "$OLLAMA_URL/api/tags" | python3 -c "import sys,json; print(len(json.load(sys.stdin).get('models',[])))" 2>/dev/null || echo "?")
ok "Ollama running ($MODELS models)"
else
fail "Ollama not running"
fi
# Dashboard
if curl -s --max-time 2 "http://localhost:$PORT" &>/dev/null; then
ok "Dashboard running at http://localhost:$PORT"
else
fail "Dashboard not running"
fi
exit 0
;;
start)
echo "=== Starting Mission Control ==="
echo ""
# 1. Ensure Ollama is running
if curl -s --max-time 2 "$OLLAMA_URL/api/tags" &>/dev/null; then
ok "Ollama already running"
else
echo "Starting Ollama..."
ollama serve &>/dev/null &
sleep 2
if curl -s --max-time 2 "$OLLAMA_URL/api/tags" &>/dev/null; then
ok "Ollama started"
else
fail "Could not start Ollama. Try: ollama serve"
fi
fi
# 2. Check if dashboard already running
if curl -s --max-time 2 "http://localhost:$PORT" &>/dev/null; then
ok "Dashboard already running at http://localhost:$PORT"
exit 0
fi
# 3. Install deps if needed
if [ ! -d "$DASHBOARD_DIR/node_modules" ]; then
echo "Installing dependencies..."
(cd "$DASHBOARD_DIR" && npm install --silent)
ok "Dependencies installed"
fi
# 4. Start dashboard
echo "Starting dashboard on port $PORT..."
(cd "$DASHBOARD_DIR" && npm run dev &>/dev/null &)
# Wait for it to be ready
for i in $(seq 1 15); do
if curl -s --max-time 1 "http://localhost:$PORT" &>/dev/null; then
ok "Dashboard ready at http://localhost:$PORT"
echo ""
echo "Open: http://localhost:$PORT"
echo "Stop: bash start-dashboard.sh stop"
exit 0
fi
sleep 1
done
fail "Dashboard did not start within 15s. Check: cd dashboard && npm run dev"
exit 1
;;
*)
echo "Usage: bash start-dashboard.sh [start|stop|status]"
exit 1
;;
esac

View File

@ -0,0 +1,189 @@
"""
Test Orpheus TTS via Ollama + SNAC decoder.
Prerequisites:
1. bash setup-tts.sh (one-shot: installs everything)
-- OR manually --
1. ollama pull sematre/orpheus:en
2. bash download-tts-models.sh snac (downloads SNAC via hf-mirror.com)
3. ollama serve (must be running)
Usage:
.venv-qwen-tts/bin/python test_orpheus_tts.py
"""
import os
import re
import time
import json
import struct
import wave
import urllib.request
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
SNAC_MODEL_DIR = os.path.join(SCRIPT_DIR, "models", "snac_24khz")
OLLAMA_URL = "http://localhost:11434"
MODEL = "sematre/orpheus:en"
AUDIO_TOKEN_RE = re.compile(r"<custom_token_(\d+)>")
def check_ollama():
"""Verify Ollama is running and model is available."""
try:
req = urllib.request.Request(f"{OLLAMA_URL}/api/tags")
with urllib.request.urlopen(req, timeout=3) as resp:
data = json.loads(resp.read())
names = [m["name"] for m in data.get("models", [])]
if not any(MODEL in n for n in names):
print(f"ERROR: Model '{MODEL}' not found. Run: ollama pull {MODEL}")
return False
return True
except Exception as e:
print(f"ERROR: Cannot connect to Ollama at {OLLAMA_URL}: {e}")
print("Run: ollama serve")
return False
def check_snac():
"""Verify SNAC model is downloaded."""
if not os.path.isdir(SNAC_MODEL_DIR):
print(f"ERROR: SNAC decoder not found at {SNAC_MODEL_DIR}")
print("Run: bash setup-tts.sh (or: bash download-tts-models.sh snac)")
return False
return True
def load_snac():
"""Load SNAC audio codec."""
import torch
import snac
print(f"Loading SNAC decoder from {SNAC_MODEL_DIR}...")
model = snac.SNAC.from_pretrained(SNAC_MODEL_DIR)
model.eval()
return model
def generate_tokens(text: str, voice: str = "tara") -> str:
"""Call Ollama to generate audio tokens from text."""
prompt = f"<custom_token_3><|begin_of_text|>{voice}: {text}<|eot_id|><custom_token_4><custom_token_5><custom_token_1>"
payload = json.dumps({
"model": MODEL,
"prompt": prompt,
"stream": False,
"options": {
"temperature": 0.6,
"top_p": 0.9,
"repeat_penalty": 1.1,
"num_predict": 10240,
"stop": ["<|end_of_text|>"],
},
}).encode()
req = urllib.request.Request(
f"{OLLAMA_URL}/api/generate",
data=payload,
headers={"Content-Type": "application/json"},
)
print("Generating audio tokens via Ollama...")
t0 = time.time()
with urllib.request.urlopen(req, timeout=120) as resp:
result = json.loads(resp.read())
elapsed = time.time() - t0
response_text = result.get("response", "")
token_count = len(AUDIO_TOKEN_RE.findall(response_text))
print(f"Generated {token_count} audio tokens in {elapsed:.1f}s")
return response_text
def decode_tokens(response_text: str, snac_model) -> tuple:
"""Convert audio tokens to WAV audio."""
import torch
tokens = AUDIO_TOKEN_RE.findall(response_text)
if not tokens:
print("ERROR: No audio tokens found in response")
return None, 0
audio_ids = [
int(tok) - 10 - ((idx % 7) * 4096)
for idx, tok in enumerate(tokens)
]
# Trim to multiple of 7
audio_ids = audio_ids[: len(audio_ids) // 7 * 7]
if len(audio_ids) == 0:
print("ERROR: Not enough audio tokens to decode")
return None, 0
audio_tensor = torch.tensor(audio_ids, dtype=torch.int32).reshape(-1, 7)
codes_0 = audio_tensor[:, 0].unsqueeze(0)
codes_1 = torch.stack((audio_tensor[:, 1], audio_tensor[:, 4])).t().flatten().unsqueeze(0)
codes_2 = (
torch.stack((audio_tensor[:, 2], audio_tensor[:, 3], audio_tensor[:, 5], audio_tensor[:, 6]))
.t()
.flatten()
.unsqueeze(0)
)
print("Decoding audio...")
with torch.inference_mode():
audio_hat = snac_model.decode([codes_0, codes_1, codes_2])
audio_np = audio_hat[0].squeeze().numpy()
return audio_np, 24000
def save_wav(audio_np, sample_rate: int, path: str):
"""Save numpy audio array as 16-bit WAV."""
import numpy as np
# Normalize to int16
audio_int16 = (audio_np * 32767).clip(-32768, 32767).astype(np.int16)
with wave.open(path, "w") as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(sample_rate)
wf.writeframes(audio_int16.tobytes())
duration = len(audio_int16) / sample_rate
print(f"Saved {path} ({duration:.1f}s, {sample_rate} Hz)")
def main():
print("=== Orpheus TTS Test (Ollama + SNAC) ===\n")
if not check_ollama():
return
if not check_snac():
return
snac_model = load_snac()
# Voices: tara, leah, jess, leo, dan, mia, zac, zoe
tests = [
("Hello! This is Orpheus text to speech, running entirely on your Mac through Ollama.", "tara"),
("<laugh> That's amazing! Local AI speech generation without any cloud services!", "leo"),
]
for i, (text, voice) in enumerate(tests):
print(f"\n--- Test {i+1}: voice={voice} ---")
print(f"Text: {text[:80]}...")
response = generate_tokens(text, voice)
audio, sr = decode_tokens(response, snac_model)
if audio is not None:
outpath = os.path.join(SCRIPT_DIR, f"test_orpheus_{voice}.wav")
save_wav(audio, sr, outpath)
print("\n=== Done! Open the .wav files to listen. ===")
print("Play with: afplay test_orpheus_tara.wav")
if __name__ == "__main__":
main()

View File

@ -0,0 +1,84 @@
"""
Test Qwen3-TTS 0.6B on Apple Silicon (MPS or CPU fallback).
Prerequisites:
bash setup-tts.sh (one-shot: installs everything)
-- OR manually --
bash download-tts-models.sh (downloads models via hf-mirror.com)
Usage:
.venv-qwen-tts/bin/python test_qwen_tts.py
"""
import os
import time
import torch
import soundfile as sf
from qwen_tts import Qwen3TTSModel
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_PATH = os.path.join(SCRIPT_DIR, "models", "Qwen3-TTS-12Hz-0.6B-CustomVoice")
# Check model exists locally
if not os.path.isdir(MODEL_PATH):
print(f"ERROR: Model not found at {MODEL_PATH}")
print("Run: bash setup-tts.sh (or: bash download-tts-models.sh qwen)")
raise SystemExit(1)
# Pick device: MPS if available, else CPU
if torch.backends.mps.is_available():
device = "mps"
dtype = torch.float32 # MPS doesn't support bfloat16
print(f"Using MPS (Apple Metal GPU)")
else:
device = "cpu"
dtype = torch.float32
print(f"Using CPU")
print(f"Loading Qwen3-TTS-12Hz-0.6B-CustomVoice on {device}...")
t0 = time.time()
model = Qwen3TTSModel.from_pretrained(
MODEL_PATH,
device_map=device,
dtype=dtype,
)
print(f"Model loaded in {time.time() - t0:.1f}s")
print(f"Supported speakers: {model.get_supported_speakers()}")
print(f"Supported languages: {model.get_supported_languages()}")
# Test 1: English with a built-in speaker
text = "Hello! Welcome to the local LLM dashboard. I am Qwen three T T S, running entirely on your Mac."
print(f"\nGenerating speech for: {text[:60]}...")
t1 = time.time()
wavs, sr = model.generate_custom_voice(
text=text,
language="English",
speaker="Chelsie",
)
elapsed = time.time() - t1
print(f"Generated in {elapsed:.1f}s, sample rate={sr}, audio length={len(wavs[0])/sr:.1f}s")
output_path = "test_output_english.wav"
sf.write(output_path, wavs[0], sr)
print(f"Saved to {output_path}")
# Test 2: English with emotion instruction
text2 = "This is absolutely incredible! I can't believe how well this works on a local machine!"
print(f"\nGenerating with emotion: {text2[:60]}...")
t2 = time.time()
wavs2, sr2 = model.generate_custom_voice(
text=text2,
language="English",
speaker="Chelsie",
instruct="Speak with excitement and enthusiasm",
)
elapsed2 = time.time() - t2
print(f"Generated in {elapsed2:.1f}s, audio length={len(wavs2[0])/sr2:.1f}s")
sf.write("test_output_excited.wav", wavs2[0], sr2)
print("Saved to test_output_excited.wav")
print("\nDone! Open the .wav files to listen.")

View File

@ -0,0 +1,387 @@
Here is a complete engineering-grade specification document for the exact configuration you shared:
Razer Blade 18 (Model: RZ09-05299ER9-R3U1) — Detailed Specification Document
Manufacturer: Razer Inc.
Product Line: Blade Series
Model Number: RZ09-05299ER9-R3U1
Form Factor: High-performance desktop-class gaming & workstation laptop
Release Generation: RTX 50-series era (2026)
1. System Overview
The Razer Blade 18 is positioned as a flagship desktop-replacement laptop, integrating Intel Core Ultra HX processors, NVIDIA RTX 50-series GPUs, ultra-high refresh displays, and workstation-level memory/storage configurations.
Primary Target Use Cases
• AAA gaming at maximum settings (4K, ray tracing)
• AI / ML model development (local inference, CUDA workloads)
• Software development & compilation
• 3D rendering, Unreal Engine, Blender
• Video editing (8K workflows)
• Desktop replacement workstation
2. CPU (Processor)
Processor: Intel® Core™ Ultra 9 275HX
Architecture
Attribute Specification
CPU family Intel Core Ultra HX Series
Architecture Intel Meteor Lake / Arrow Lake HX class
Core design Hybrid architecture
Core types Performance cores + Efficient cores
Target TDP ~55W base (HX class), scalable to ~157W turbo
Fabrication Intel 3 / advanced node
Integrated AI accelerator Intel NPU (Neural Processing Unit)
Estimated core configuration (typical for Ultra 9 HX class)
Core type Count
Performance cores 8
Efficient cores 16
Total cores 24
Threads 24
AI acceleration
Integrated:
• Intel NPU
• AVX-512 support
• VNNI instructions
• Hardware AI acceleration support
Use cases:
• Local AI inference
• Background Copilot AI tasks
• AI-assisted workflows
3. GPU (Graphics)
Discrete GPU: NVIDIA GeForce RTX 5090 Laptop GPU
VRAM: 24 GB GDDR7 VRAM
GPU Architecture
Attribute Specification
Architecture NVIDIA Blackwell (RTX 50-series)
Memory type GDDR7
VRAM size 24 GB
CUDA cores Estimated ~18,00020,000
Ray tracing cores 4th or 5th Gen RT cores
Tensor cores 5th or 6th Gen
PCIe interface PCIe Gen 5
DirectX support DirectX 12 Ultimate
Vulkan support Yes
OpenCL support Yes
CUDA support Yes
GPU Compute Capability
Feature Support
CUDA compute Yes
Tensor acceleration Yes
DLSS DLSS 4
Ray tracing Hardware accelerated
AI inference Excellent
Stable diffusion Excellent
Local LLM inference Excellent
AI / ML Capability Estimate
Model Expected Performance
Llama 3 8B Real-time
Llama 3 70B quantized Usable
Stable Diffusion XL Very fast
Whisper large Very fast
TensorRT inference Excellent
4. RAM (Memory)
Installed memory: 64 GB RAM
Memory speed: 5600 MHz
Memory Details
Attribute Specification
Capacity 64 GB
Type DDR5
Speed 5600 MHz
Channels Dual channel
ECC No
Upgradeability Yes (depends on configuration)
Memory bandwidth estimate
~90120 GB/sec
5. Storage
Installed storage: 4 TB SSD (2 TB + 2 TB)
Storage configuration
Attribute Specification
Total capacity 4 TB
Drive type NVMe SSD
Interface PCIe Gen 4 or Gen 5
Configuration Dual SSD
RAID support Possible
Upgradeable Yes
Storage performance estimate
Metric Expected
Sequential read 7,00014,000 MB/sec
Sequential write 6,00012,000 MB/sec
Random IOPS >1 million
6. Display
Display size: 18 inches
Display modes: Dual mode UHD+ 240 Hz / FHD+ 440 Hz
Display detailed specifications
Attribute Specification
Size 18 inches
Mode 1 resolution UHD+ (3840×2400)
Mode 2 resolution FHD+ (1920×1200)
Refresh rate (UHD+) 240 Hz
Refresh rate (FHD+) 440 Hz
Aspect ratio 16:10
Panel type IPS or Mini-LED
Adaptive sync Yes
Response time <3 ms (estimated)
HDR support Likely HDR 6001000
Color gamut 100% DCI-P3
Dual-mode display explanation
Switchable between:
Mode Use case
UHD+ 240 Hz Visual quality, editing
FHD+ 440 Hz Competitive gaming
7. Operating System
OS: Windows 11 Home
Supports:
• DirectX 12 Ultimate
• WSL2
• CUDA
• AI frameworks
8. Cooling System
Advanced vapor chamber cooling system.
Expected features:
• Vapor chamber cooling
• Dual fan cooling
• Liquid metal thermal interface
• Advanced heat pipe network
Supports sustained:
• CPU ~120W+
• GPU ~175W+
9. Connectivity & Ports (Expected for Blade 18)
Typical Blade 18 includes:
USB
• 3× USB-A 3.2 Gen 2
• 2× USB-C (Thunderbolt 4 / USB4)
Video
• HDMI 2.1
• Thunderbolt video output
Network
• 2.5 Gb Ethernet
Audio
• 3.5 mm combo jack
Storage expansion
• Dual NVMe slots
10. Wireless Connectivity
Expected:
Technology Support
Wi-Fi Wi-Fi 7
Bluetooth Bluetooth 5.4
11. Power System
Estimated:
Attribute Specification
Power adapter 330W400W
Battery ~9099 Wh
Charging Fast charging supported
12. Keyboard
• Per-key RGB lighting
• Razer Chroma support
• Anti-ghosting
• Full keyboard with numpad
13. Build and Physical
Expected typical Blade 18 chassis:
Attribute Specification
Material CNC aluminum
Color Black
Thickness ~2124 mm
Weight ~3.1 kg
14. AI / ML Workstation Capability Assessment
This laptop is excellent for:
Supported workloads
Workload Capability
CUDA development Excellent
Stable diffusion Excellent
LLM inference Excellent
PyTorch training Very good
TensorFlow Excellent
Unreal Engine Excellent
Blender Excellent
15. Gaming Performance Tier
Expected performance:
Resolution Performance
1080p Extreme
1440p Extreme
4K Excellent
Ray tracing:
• Excellent
DLSS 4:
• Supported
16. Storage Expansion Capability
Supports:
• Dual NVMe SSD
• RAID configuration
• Upgradable storage
17. Price
Configured model price:
$5,199.99 USD
18. Performance Tier Classification
Category Rating
Gaming Extreme
AI / ML Extreme
Software development Extreme
Video editing Extreme
Desktop replacement Extreme
19. Ideal Use Cases for You Specifically (Saravana)
Based on your AI engineering work:
Perfect for:
• Running multiple local LLMs
• CUDA inference pipelines
• Running LangGraph agents locally
• Running Stable Diffusion
• Running Whisper locally
• Running multiple Docker containers
• Unreal Engine / 3D
20. Summary
This configuration is:
One of the most powerful laptops available in the world currently
Equivalent to:
• High-end workstation
• High-end desktop GPU
• Enterprise-grade development machine
If you want, I can also create:
• Full performance comparison vs desktop RTX 4090
• Or benchmark estimates for LLM / Stable Diffusion / coding workloads
• Or recommended optimal configuration for your home lab and multi-model setup

View File

@ -0,0 +1,372 @@
# Windows Setup Guide — Local LLM Stack on Razer Blade 18
> **Hardware:** Razer Blade 18 · Intel Core Ultra 9 275HX · RTX 5090 24 GB GDDR7 · 64 GB DDR5 · 4 TB NVMe
> **OS:** Windows 11 Home
> **Goal:** Mirror the macOS `__LOCAL_LLMs` stack — Ollama, Whisper, TTS (Orpheus + Qwen3), Mission Control dashboard
> **See also:** [razer-blade-18-spec.md](razer-blade-18-spec.md) for full hardware specs
---
## Prerequisites
### 1. Windows Package Manager
Install **winget** (ships with Windows 11) and optionally **Scoop** for CLI tools:
```powershell
# Verify winget
winget --version
# Install Scoop (optional, useful for dev tools)
Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
Invoke-RestMethod -Uri https://get.scoop.sh | Invoke-Expression
```
### 2. NVIDIA CUDA Toolkit
The RTX 5090 needs the latest CUDA drivers for GPU-accelerated inference.
```powershell
# Install NVIDIA drivers (latest Game Ready or Studio)
winget install --id Nvidia.GeForceExperience
# Install CUDA Toolkit (required for PyTorch CUDA)
winget install --id Nvidia.CUDA
# Or download from: https://developer.nvidia.com/cuda-downloads
# Verify
nvidia-smi
```
Expected output should show:
- **RTX 5090** with **24 GB** VRAM
- CUDA version 13.x+
### 3. Node.js (for Mission Control Dashboard)
```powershell
winget install --id OpenJS.NodeJS.LTS
# Verify
node --version # should be 20.x+
npm --version
```
### 4. Python 3.12
```powershell
winget install --id Python.Python.3.12
# Verify
python --version
pip --version
```
### 5. Git
```powershell
winget install --id Git.Git
```
### 6. ffmpeg
```powershell
winget install --id Gyan.FFmpeg
# Or: scoop install ffmpeg
```
---
## 1. Ollama — LLM Server
### Install
```powershell
winget install --id Ollama.Ollama
```
Ollama for Windows runs as a background service and automatically uses CUDA (RTX 5090).
### Verify
```powershell
ollama --version
curl http://localhost:11434/api/tags
```
### Download Models
```powershell
# Coding
ollama pull qwen2.5-coder:32b # 19 GB — primary coding model
ollama pull qwen2.5-coder:7b # 4.7 GB — fast coding
# Reasoning
ollama pull deepseek-r1:32b # 19 GB — chain-of-thought
# General
ollama pull llama3.1:8b # 4.9 GB — fast general tasks
# TTS
ollama pull sematre/orpheus:en # 4 GB — text-to-speech (8 voices)
# Verify
ollama list
```
> **Note:** With 24 GB VRAM, Ollama will offload 32B models almost entirely to GPU.
> On macOS (48 GB unified), the 32B models run in shared CPU/GPU memory.
> On this machine, **GPU inference will be significantly faster** for models that fit in 24 GB VRAM.
### VRAM Budget (RTX 5090 — 24 GB)
| Model | VRAM Usage | Fits in GPU? |
| ---------------------------- | ---------- | ------------ |
| llama3.1:8b | ~5 GB | ✅ Fully |
| qwen2.5-coder:7b | ~5 GB | ✅ Fully |
| sematre/orpheus:en | ~4 GB | ✅ Fully |
| qwen2.5-coder:32b | ~19 GB | ✅ Fully |
| deepseek-r1:32b | ~19 GB | ✅ Fully |
| Two 7B models simultaneously | ~10 GB | ✅ Both fit |
---
## 2. Whisper.cpp — Speech-to-Text
### Option A: Pre-built Binary (Recommended)
Download the latest release from GitHub:
```powershell
# Create whisper directory
mkdir "$env:USERPROFILE\whisper-cpp"
cd "$env:USERPROFILE\whisper-cpp"
# Download latest release (CUDA build)
# Check: https://github.com/ggerganov/whisper.cpp/releases
# Look for: whisper-cublas-bin-x64.zip or whisper-cuda-bin-x64.zip
```
### Option B: Build from Source (CUDA)
```powershell
git clone https://github.com/ggerganov/whisper.cpp.git
cd whisper.cpp
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release
```
### Download Whisper Model
```powershell
mkdir "$env:USERPROFILE\whisper-models"
# Download ggml-large-v3-turbo (1.5 GB)
curl -L -o "$env:USERPROFILE\whisper-models\ggml-large-v3-turbo.bin" `
"https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo.bin"
```
> **No corporate proxy on this machine** — download directly from `huggingface.co`.
> The `hf-mirror.com` workaround is only needed on the corporate MacBook.
### Verify
```powershell
# Test transcription
whisper-cli -m "$env:USERPROFILE\whisper-models\ggml-large-v3-turbo.bin" -f test.wav
```
---
## 3. TTS — Orpheus + Qwen3-TTS
### 3a. Orpheus TTS (via Ollama)
Already handled in Step 1 (`ollama pull sematre/orpheus:en`).
### 3b. SNAC Decoder
```powershell
# Create models directory (match macOS layout)
$MODELS = "$PSScriptRoot\models" # or wherever you clone the repo
mkdir "$MODELS\snac_24khz" -Force
# Download SNAC decoder
curl -L -o "$MODELS\snac_24khz\config.json" `
"https://huggingface.co/hubertsiuzdak/snac_24khz/resolve/main/config.json"
curl -L -o "$MODELS\snac_24khz\pytorch_model.bin" `
"https://huggingface.co/hubertsiuzdak/snac_24khz/resolve/main/pytorch_model.bin"
```
### 3c. Python Venv + Dependencies
```powershell
cd __LOCAL_LLMs
# Create venv
python -m venv .venv-qwen-tts
# Activate (Windows uses Scripts, not bin)
.\.venv-qwen-tts\Scripts\Activate.ps1
# Install PyTorch with CUDA (NOT MPS — that's Apple only)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
# Install other deps
pip install snac numpy soundfile
# Verify CUDA
python -c "import torch; print(f'CUDA: {torch.cuda.is_available()}, Device: {torch.cuda.get_device_name(0)}')"
# Expected: CUDA: True, Device: NVIDIA GeForce RTX 5090 Laptop GPU
```
### 3d. Qwen3-TTS 0.6B
```powershell
$MODELS = ".\models"
# Tokenizer (~650 MB)
mkdir "$MODELS\Qwen3-TTS-Tokenizer-12Hz" -Force
foreach ($f in @("config.json", "configuration.json", "preprocessor_config.json")) {
curl -L -o "$MODELS\Qwen3-TTS-Tokenizer-12Hz\$f" `
"https://huggingface.co/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/$f"
}
curl -L -o "$MODELS\Qwen3-TTS-Tokenizer-12Hz\model.safetensors" `
"https://huggingface.co/Qwen/Qwen3-TTS-Tokenizer-12Hz/resolve/main/model.safetensors"
# Model weights (~1.8 GB)
mkdir "$MODELS\Qwen3-TTS-12Hz-0.6B-CustomVoice" -Force
foreach ($f in @("config.json", "generation_config.json")) {
curl -L -o "$MODELS\Qwen3-TTS-12Hz-0.6B-CustomVoice\$f" `
"https://huggingface.co/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/$f"
}
curl -L -o "$MODELS\Qwen3-TTS-12Hz-0.6B-CustomVoice\model.safetensors" `
"https://huggingface.co/Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice/resolve/main/model.safetensors"
```
### 3e. Test TTS
```powershell
# Activate venv
.\.venv-qwen-tts\Scripts\Activate.ps1
# Orpheus TTS test
python test_orpheus_tts.py
# Qwen3-TTS test
python test_qwen_tts.py
```
> **Key difference from macOS:** Qwen3-TTS will use **CUDA** instead of MPS.
> In `test_qwen_tts.py`, the device selection `torch.device("mps")` will fall through to CUDA automatically
> since `torch.backends.mps.is_available()` returns False on Windows.
> You may want to update the device logic to prefer CUDA:
>
> ```python
> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
> ```
---
## 4. Mission Control Dashboard
```powershell
cd __LOCAL_LLMs\dashboard
# Install dependencies
npm install
# Start dev server
npm run dev
# Open http://localhost:3000
```
The dashboard is pure Next.js — works identically on Windows. The API routes auto-detect:
- **Ollama** at `localhost:11434`
- **Whisper** models in `%USERPROFILE%\whisper-models\`
- **TTS** engines (Orpheus, Qwen3-TTS) and Python venv
### Start Script (PowerShell)
Use the bash script equivalent:
```powershell
# Quick start (manual)
ollama serve # if not already running as service
cd __LOCAL_LLMs\dashboard
npm run dev
```
> TODO: Create `start-dashboard.ps1` as a PowerShell equivalent of `start-dashboard.sh`
---
## 5. Key Differences: macOS vs Windows
| Area | macOS (M4 Pro 48 GB) | Windows (Razer Blade 18) |
| ------------------- | ----------------------------------- | ------------------------------------- |
| **GPU** | Apple Silicon (unified memory, MPS) | RTX 5090 (24 GB VRAM, CUDA) |
| **Ollama GPU** | Automatic (Metal) | Automatic (CUDA) |
| **VRAM** | Shared from 48 GB RAM | Dedicated 24 GB GDDR7 |
| **PyTorch device** | `mps` | `cuda` |
| **Whisper install** | `brew install whisper-cpp` | Build from source or download release |
| **Python venv** | `bin/activate` | `Scripts\Activate.ps1` |
| **Package manager** | Homebrew | winget / scoop |
| **Shell** | zsh / bash | PowerShell / cmd |
| **Scripts** | `.sh` (bash) | `.ps1` (PowerShell) |
| **Model download** | `hf-mirror.com` (corporate proxy) | `huggingface.co` (no proxy) |
| **Dashboard** | Identical | Identical |
| **Ollama models** | Identical | Identical |
### Performance Expectations
| Workload | macOS M4 Pro 48 GB | Razer RTX 5090 24 GB |
| --------------------------- | ---------------------------- | ------------------------- |
| qwen2.5-coder:32b inference | ~1525 tok/s (MPS/CPU blend) | ~4060 tok/s (full CUDA) |
| Whisper large-v3-turbo | ~24x realtime (CPU) | ~815x realtime (CUDA) |
| Orpheus TTS | ~realtime (CPU decode) | ~23x realtime (CUDA) |
| Qwen3-TTS | ~realtime (MPS) | ~24x realtime (CUDA) |
| 70B quantized models | Fits in 48 GB (slow) | Partially offloads to RAM |
---
## 6. File Layout (Same as macOS)
```
__LOCAL_LLMs/
├── dashboard/ ← Mission Control (port 3000) — works as-is
├── models/ ← TTS model weights (gitignored)
│ ├── snac_24khz/
│ ├── Qwen3-TTS-Tokenizer-12Hz/
│ └── Qwen3-TTS-12Hz-0.6B-CustomVoice/
├── .venv-qwen-tts/ ← Python venv (Scripts\ on Windows)
├── test_orpheus_tts.py ← works as-is (device fallback)
├── test_qwen_tts.py ← update device to prefer CUDA
├── windows_specific/
│ ├── razer-blade-18-spec.md ← hardware spec
│ └── setup-guide.md ← this file
└── docs/ ← macOS-focused docs (still useful as reference)
```
---
## 7. Quick Reference — Full Setup Checklist
```
[ ] Install NVIDIA drivers + CUDA Toolkit
[ ] Install Ollama (winget install Ollama.Ollama)
[ ] Pull models: qwen2.5-coder:32b, deepseek-r1:32b, llama3.1:8b, orpheus
[ ] Install Node.js 20+ (winget)
[ ] Install Python 3.12 (winget)
[ ] Install Git (winget)
[ ] Install ffmpeg (winget)
[ ] Clone repo
[ ] Download Whisper model to %USERPROFILE%\whisper-models\
[ ] Build or download whisper-cpp with CUDA
[ ] Create Python venv + install PyTorch CUDA + snac
[ ] Download SNAC decoder
[ ] Download Qwen3-TTS tokenizer + model
[ ] npm install in dashboard/
[ ] Run dashboard: npm run dev
[ ] Verify: http://localhost:3000 shows all green
```