# learning_ai_common_plat/__LOCAL_LLMs/test_qwen_tts.py

"""
Test Qwen3-TTS 0.6B (CUDA, MPS, or CPU fallback).

Prerequisites:
  bash setup-tts.sh              (one-shot: installs everything)
  -- OR manually --
  bash download-tts-models.sh    (downloads models via hf-mirror.com)

Usage:
  .venv-qwen-tts/bin/python test_qwen_tts.py
"""
import os
import time
import torch
import soundfile as sf
from qwen_tts import Qwen3TTSModel

# Resolve the checkpoint directory relative to this script's own location,
# so the model is found regardless of the current working directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_PATH = os.path.join(SCRIPT_DIR, "models", "Qwen3-TTS-12Hz-0.6B-CustomVoice")

# Fail fast with setup instructions when the checkpoint directory is absent;
# exit status 1 signals the failure to the calling shell.
if not os.path.isdir(MODEL_PATH):
    print(
        f"ERROR: Model not found at {MODEL_PATH}",
        "Run: bash setup-tts.sh   (or: bash download-tts-models.sh qwen)",
        sep="\n",
    )
    raise SystemExit(1)

# Choose the compute backend in priority order: CUDA, then MPS, then CPU.
# Only the CUDA path runs in half precision; MPS and CPU stay in float32.
if torch.cuda.is_available():
    device, dtype = "cuda", torch.float16
    backend_banner = f"Using CUDA ({torch.cuda.get_device_name(0)})"
elif torch.backends.mps.is_available():
    # Original author's note: MPS doesn't support bfloat16.
    device, dtype = "mps", torch.float32
    backend_banner = "Using MPS (Apple Metal GPU)"
else:
    device, dtype = "cpu", torch.float32
    backend_banner = "Using CPU"

print(backend_banner)

# Load the local checkpoint onto the selected device and report the load time.
print(f"Loading Qwen3-TTS-12Hz-0.6B-CustomVoice on {device}...")
# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
t0 = time.perf_counter()

model = Qwen3TTSModel.from_pretrained(
    MODEL_PATH,
    device_map=device,
    dtype=dtype,
)

print(f"Model loaded in {time.perf_counter() - t0:.1f}s")
# Print what the loaded model reports as available, so the speaker and
# language names used by the generation tests below can be cross-checked.
print(f"Supported speakers: {model.get_supported_speakers()}")
print(f"Supported languages: {model.get_supported_languages()}")

# Test 1: English with a built-in speaker
# NOTE(review): "Chelsie" is assumed to be one of the names reported by
# model.get_supported_speakers() -- this script never checks; confirm it.
host_label = "Mac" if device == "mps" else "machine"
text = (
    "Hello! Welcome to the local LLM dashboard. "
    "I am Qwen three T T S, running entirely on your "
    f"{host_label} using {device.upper()}."
)
# Only the first 60 characters are echoed to keep the log line short.
print(f"\nGenerating speech for: {text[:60]}...")

# perf_counter() is monotonic, so the elapsed figure is immune to wall-clock
# adjustments that could distort a time.time() difference.
t1 = time.perf_counter()
wavs, sr = model.generate_custom_voice(
    text=text,
    language="English",
    speaker="Chelsie",
)
elapsed = time.perf_counter() - t1
# Audio length is reported as sample count of the first waveform / sample rate.
print(f"Generated in {elapsed:.1f}s, sample rate={sr}, audio length={len(wavs[0])/sr:.1f}s")

# The output path is relative, so the file lands in the current working directory.
output_path = "test_output_english.wav"
sf.write(output_path, wavs[0], sr)
print(f"Saved to {output_path}")

# Test 2: English with emotion instruction
# NOTE(review): whether this checkpoint actually honours `instruct` is not
# visible from this script -- confirm by listening to the output.
text2 = "This is absolutely incredible! I can't believe how well this works on a local machine!"
# Only the first 60 characters are echoed to keep the log line short.
print(f"\nGenerating with emotion: {text2[:60]}...")

# perf_counter() is monotonic, so the elapsed figure is immune to wall-clock
# adjustments that could distort a time.time() difference.
t2 = time.perf_counter()
wavs2, sr2 = model.generate_custom_voice(
    text=text2,
    language="English",
    speaker="Chelsie",
    instruct="Speak with excitement and enthusiasm",
)
elapsed2 = time.perf_counter() - t2
# Audio length is reported as sample count of the first waveform / sample rate.
print(f"Generated in {elapsed2:.1f}s, audio length={len(wavs2[0])/sr2:.1f}s")

# Keep the filename in one place so the written file and the message cannot
# drift apart; the path is relative to the current working directory.
output_path2 = "test_output_excited.wav"
sf.write(output_path2, wavs2[0], sr2)
print(f"Saved to {output_path2}")

print("\nDone! Open the .wav files to listen.")