"""
Test Qwen3-TTS 0.6B on Apple Silicon (MPS or CPU fallback).

Prerequisites:
    bash setup-tts.sh               (one-shot: installs everything)
    -- OR manually --
    bash download-tts-models.sh     (downloads models via hf-mirror.com)

Usage:
    .venv-qwen-tts/bin/python test_qwen_tts.py
"""

import os
import sys
import time

import soundfile as sf
import torch
from qwen_tts import Qwen3TTSModel

# Resolve the model directory relative to this script's own location, so the
# lookup does not depend on the current working directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_PATH = os.path.join(SCRIPT_DIR, "models", "Qwen3-TTS-12Hz-0.6B-CustomVoice")

# Check that the model exists locally; fail fast with setup instructions
# instead of letting the loader fail later.
if not os.path.isdir(MODEL_PATH):
    # Diagnostics go to stderr so they are not mixed into captured stdout.
    print(f"ERROR: Model not found at {MODEL_PATH}", file=sys.stderr)
    print(
        "Run: bash setup-tts.sh (or: bash download-tts-models.sh qwen)",
        file=sys.stderr,
    )
    raise SystemExit(1)

# Pick device: MPS (Apple Metal GPU) if available, else CPU.
if torch.backends.mps.is_available():
    device = "mps"
    print("Using MPS (Apple Metal GPU)")
else:
    device = "cpu"
    print("Using CPU")

# float32 is used on both backends, so it is assigned once here.
# The original note states that MPS doesn't support bfloat16.
# NOTE(review): that may depend on the PyTorch/macOS version; float32 is the
# conservative choice either way.
dtype = torch.float32

# Load the model from the local checkpoint directory and report the load time
# plus the speakers/languages the checkpoint advertises.
print(f"Loading Qwen3-TTS-12Hz-0.6B-CustomVoice on {device}...")
# perf_counter is monotonic, so the elapsed time cannot be skewed by
# wall-clock adjustments the way time.time() can.
t0 = time.perf_counter()

model = Qwen3TTSModel.from_pretrained(
    MODEL_PATH,
    device_map=device,
    dtype=dtype,
)

print(f"Model loaded in {time.perf_counter() - t0:.1f}s")
print(f"Supported speakers: {model.get_supported_speakers()}")
print(f"Supported languages: {model.get_supported_languages()}")

# Test 1: English with a built-in speaker
text = "Hello! Welcome to the local LLM dashboard. I am Qwen three T T S, running entirely on your Mac."
print(f"\nGenerating speech for: {text[:60]}...")

# NOTE(review): confirm "Chelsie" appears in the "Supported speakers" list
# printed at load time for this checkpoint.
t1 = time.perf_counter()  # monotonic clock for elapsed-time measurement
wavs, sr = model.generate_custom_voice(
    text=text,
    language="English",
    speaker="Chelsie",
)
elapsed = time.perf_counter() - t1
# Assumes wavs[0] is a 1-D sequence of samples, so len / sr is the duration
# in seconds — TODO confirm against the qwen_tts return type.
print(f"Generated in {elapsed:.1f}s, sample rate={sr}, audio length={len(wavs[0])/sr:.1f}s")

# Relative path: the file is written to the current working directory.
output_path = "test_output_english.wav"
sf.write(output_path, wavs[0], sr)
print(f"Saved to {output_path}")

# Test 2: English with emotion instruction
text2 = "This is absolutely incredible! I can't believe how well this works on a local machine!"
print(f"\nGenerating with emotion: {text2[:60]}...")

# NOTE(review): confirm that this checkpoint honors the `instruct` argument.
t2 = time.perf_counter()  # monotonic clock for elapsed-time measurement
wavs2, sr2 = model.generate_custom_voice(
    text=text2,
    language="English",
    speaker="Chelsie",
    instruct="Speak with excitement and enthusiasm",
)
elapsed2 = time.perf_counter() - t2
print(f"Generated in {elapsed2:.1f}s, audio length={len(wavs2[0])/sr2:.1f}s")

# Relative path: the file is written to the current working directory.
sf.write("test_output_excited.wav", wavs2[0], sr2)
print("Saved to test_output_excited.wav")

print("\nDone! Open the .wav files to listen.")