learning_ai_common_plat/__LOCAL_LLMs/test_qwen_tts.py
2026-02-21 14:13:07 -08:00

85 lines
2.5 KiB
Python

"""
Test Qwen3-TTS 0.6B on Apple Silicon (MPS or CPU fallback).

Prerequisites:
    bash setup-tts.sh               (one-shot: installs everything)
    -- OR manually --
    bash download-tts-models.sh     (downloads models via hf-mirror.com)

Usage:
    .venv-qwen-tts/bin/python test_qwen_tts.py
"""
import os
import time
import torch
import soundfile as sf
from qwen_tts import Qwen3TTSModel
# Resolve the checkpoint directory relative to this script's own location
# rather than the current working directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_PATH = os.path.join(SCRIPT_DIR, "models", "Qwen3-TTS-12Hz-0.6B-CustomVoice")

# Check model exists locally; stop with setup instructions when it does not.
if not os.path.isdir(MODEL_PATH):
    # Raising SystemExit with a string prints that string to stderr and exits
    # with status 1, so the diagnostic stays out of stdout while the exit
    # code is the same as an explicit SystemExit(1).
    raise SystemExit(
        f"ERROR: Model not found at {MODEL_PATH}\n"
        "Run: bash setup-tts.sh (or: bash download-tts-models.sh qwen)"
    )
# Pick device: MPS if available, else CPU.
use_mps = torch.backends.mps.is_available()
device = "mps" if use_mps else "cpu"

# float32 is used on both backends, so it is assigned once instead of in each
# branch. Original author's note: MPS doesn't support bfloat16.
dtype = torch.float32

print("Using MPS (Apple Metal GPU)" if use_mps else "Using CPU")
# Load the checkpoint from MODEL_PATH onto the selected device and report the
# wall-clock time the load took.
print(f"Loading Qwen3-TTS-12Hz-0.6B-CustomVoice on {device}...")
load_started = time.time()
model = Qwen3TTSModel.from_pretrained(MODEL_PATH, device_map=device, dtype=dtype)
load_seconds = time.time() - load_started
print(f"Model loaded in {load_seconds:.1f}s")

# Print whatever the loaded model reports as its available speakers/languages.
supported_speakers = model.get_supported_speakers()
print(f"Supported speakers: {supported_speakers}")
supported_languages = model.get_supported_languages()
print(f"Supported languages: {supported_languages}")
# Test 1: English with a built-in speaker (no style instruction).
text = (
    "Hello! Welcome to the local LLM dashboard. "
    "I am Qwen three T T S, running entirely on your Mac."
)
preview = text[:60]
print(f"\nGenerating speech for: {preview}...")

# Time only the synthesis call itself.
gen_started = time.time()
wavs, sr = model.generate_custom_voice(text=text, language="English", speaker="Chelsie")
elapsed = time.time() - gen_started

# Only the first returned waveform is measured and saved.
first_wav = wavs[0]
audio_seconds = len(first_wav) / sr
print(f"Generated in {elapsed:.1f}s, sample rate={sr}, audio length={audio_seconds:.1f}s")

# Relative path: the file is written to the current working directory.
output_path = "test_output_english.wav"
sf.write(output_path, first_wav, sr)
print(f"Saved to {output_path}")
# Test 2: English with emotion instruction (same speaker, plus `instruct`).
text2 = (
    "This is absolutely incredible! "
    "I can't believe how well this works on a local machine!"
)
preview2 = text2[:60]
print(f"\nGenerating with emotion: {preview2}...")

# Time only the synthesis call itself.
gen_started2 = time.time()
wavs2, sr2 = model.generate_custom_voice(
    text=text2,
    language="English",
    speaker="Chelsie",
    instruct="Speak with excitement and enthusiasm",
)
elapsed2 = time.time() - gen_started2

# Only the first returned waveform is measured and saved.
first_wav2 = wavs2[0]
audio_seconds2 = len(first_wav2) / sr2
print(f"Generated in {elapsed2:.1f}s, audio length={audio_seconds2:.1f}s")

# Relative path: the file is written to the current working directory.
sf.write("test_output_excited.wav", first_wav2, sr2)
print("Saved to test_output_excited.wav")

print("\nDone! Open the .wav files to listen.")