- setup-tts.sh: make fully cross-platform (macOS + Linux/WSL2)
  - OS detection, apt fallback, CUDA PyTorch install, nvidia-smi check
  - cross-platform playback hints, HF_MIRROR env override
- api/system/route.ts: fix ffmpeg detection (use -version not --version)
- api/system/memory/route.ts: remove unused total variable in Linux path
- api/system/exec/route.ts: expand allowlist with Linux commands (head, tail, grep, which, ps, uname, free, lscpu, nvidia-smi, etc.)
- api/tts/route.ts: cross-platform venv path + CUDA/MPS label
- api/whisper/route.ts: Linux binary/model paths
- api/ollama/logs/route.ts: Linux log paths + WSL2 hint
- test_qwen_tts.py: platform-aware speech text + CUDA device detection
- test_orpheus_tts.py: platform-aware text, move import sys to top
- setup-guide.md: fix false auto-detect claim, add HF_MIRROR hint
89 lines
2.7 KiB
Python
89 lines
2.7 KiB
Python
"""
|
|
Test Qwen3-TTS 0.6B (CUDA, MPS, or CPU fallback).
|
|
|
|
Prerequisites:
|
|
bash setup-tts.sh (one-shot: installs everything)
|
|
-- OR manually --
|
|
bash download-tts-models.sh (downloads models via hf-mirror.com)
|
|
|
|
Usage:
|
|
.venv-qwen-tts/bin/python test_qwen_tts.py
|
|
"""
|
|
import os
import time

import torch
import soundfile as sf
from qwen_tts import Qwen3TTSModel

# Resolve the model directory relative to this script, so the lookup does not
# depend on the caller's current working directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_PATH = os.path.join(SCRIPT_DIR, "models", "Qwen3-TTS-12Hz-0.6B-CustomVoice")

# Check model exists locally; otherwise print a setup hint and exit with
# status 1 before any model loading is attempted.
if not os.path.isdir(MODEL_PATH):
    print(f"ERROR: Model not found at {MODEL_PATH}")
    print("Run: bash setup-tts.sh (or: bash download-tts-models.sh qwen)")
    raise SystemExit(1)

# Pick device: CUDA > MPS > CPU
# `device` is passed to the model loader as its device map and `dtype` as the
# weight dtype; both are also used in the status messages further down.
if torch.cuda.is_available():
    device = "cuda"
    dtype = torch.float16
    print(f"Using CUDA ({torch.cuda.get_device_name(0)})")
elif torch.backends.mps.is_available():
    device = "mps"
    dtype = torch.float32  # MPS doesn't support bfloat16
    print("Using MPS (Apple Metal GPU)")
else:
    device = "cpu"
    dtype = torch.float32
    print("Using CPU")

# Load the local checkpoint onto the selected device and report how long it
# took. perf_counter() is monotonic, so the elapsed time cannot be skewed by
# wall-clock adjustments during the load.
print(f"Loading Qwen3-TTS-12Hz-0.6B-CustomVoice on {device}...")
t0 = time.perf_counter()

model = Qwen3TTSModel.from_pretrained(
    MODEL_PATH,
    device_map=device,
    dtype=dtype,
)

print(f"Model loaded in {time.perf_counter() - t0:.1f}s")
print(f"Supported speakers: {model.get_supported_speakers()}")
print(f"Supported languages: {model.get_supported_languages()}")

# Test 1: English with a built-in speaker
# The spoken sentence names the platform: "Mac" when running on MPS,
# "machine" otherwise, followed by the upper-cased device name.
host_label = "Mac" if device == "mps" else "machine"
text = (
    "Hello! Welcome to the local LLM dashboard. I am Qwen three T T S, "
    f"running entirely on your {host_label} using {device.upper()}."
)
print(f"\nGenerating speech for: {text[:60]}...")

t1 = time.perf_counter()
# NOTE(review): "Chelsie" has to be one of the names printed by
# model.get_supported_speakers() -- confirm for this checkpoint.
wavs, sr = model.generate_custom_voice(
    text=text,
    language="English",
    speaker="Chelsie",
)
elapsed = time.perf_counter() - t1
print(f"Generated in {elapsed:.1f}s, sample rate={sr}, audio length={len(wavs[0])/sr:.1f}s")

# Written relative to the current working directory.
output_path = "test_output_english.wav"
sf.write(output_path, wavs[0], sr)
print(f"Saved to {output_path}")

# Test 2: English with emotion instruction
# Same speaker and language as the first test, plus an `instruct` prompt
# passed to the model.
text2 = "This is absolutely incredible! I can't believe how well this works on a local machine!"
print(f"\nGenerating with emotion: {text2[:60]}...")

t2 = time.perf_counter()
wavs2, sr2 = model.generate_custom_voice(
    text=text2,
    language="English",
    speaker="Chelsie",
    instruct="Speak with excitement and enthusiasm",
)
elapsed2 = time.perf_counter() - t2
print(f"Generated in {elapsed2:.1f}s, audio length={len(wavs2[0])/sr2:.1f}s")

# Single source of truth for the file name used by both the write and the
# confirmation message (written relative to the current working directory).
output_path2 = "test_output_excited.wav"
sf.write(output_path2, wavs2[0], sr2)
print(f"Saved to {output_path2}")

print("\nDone! Open the .wav files to listen.")