""" Test Qwen3-TTS 0.6B (CUDA, MPS, or CPU fallback). Prerequisites: bash setup-tts.sh (one-shot: installs everything) -- OR manually -- bash download-tts-models.sh (downloads models via hf-mirror.com) Usage: .venv-qwen-tts/bin/python test_qwen_tts.py """ import os import time import torch import soundfile as sf from qwen_tts import Qwen3TTSModel SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) MODEL_PATH = os.path.join(SCRIPT_DIR, "models", "Qwen3-TTS-12Hz-0.6B-CustomVoice") # Check model exists locally if not os.path.isdir(MODEL_PATH): print(f"ERROR: Model not found at {MODEL_PATH}") print("Run: bash setup-tts.sh (or: bash download-tts-models.sh qwen)") raise SystemExit(1) # Pick device: CUDA > MPS > CPU if torch.cuda.is_available(): device = "cuda" dtype = torch.float16 print(f"Using CUDA ({torch.cuda.get_device_name(0)})") elif torch.backends.mps.is_available(): device = "mps" dtype = torch.float32 # MPS doesn't support bfloat16 print(f"Using MPS (Apple Metal GPU)") else: device = "cpu" dtype = torch.float32 print(f"Using CPU") print(f"Loading Qwen3-TTS-12Hz-0.6B-CustomVoice on {device}...") t0 = time.time() model = Qwen3TTSModel.from_pretrained( MODEL_PATH, device_map=device, dtype=dtype, ) print(f"Model loaded in {time.time() - t0:.1f}s") print(f"Supported speakers: {model.get_supported_speakers()}") print(f"Supported languages: {model.get_supported_languages()}") # Test 1: English with a built-in speaker text = f"Hello! Welcome to the local LLM dashboard. I am Qwen three T T S, running entirely on your {'Mac' if device == 'mps' else 'machine'} using {device.upper()}." print(f"\nGenerating speech for: {text[:60]}...") t1 = time.time() wavs, sr = model.generate_custom_voice( text=text, language="English", speaker="Chelsie", ) elapsed = time.time() - t1 print(f"Generated in {elapsed:.1f}s, sample rate={sr}, audio length={len(wavs[0])/sr:.1f}s") output_path = "test_output_english.wav" sf.write(output_path, wavs[0], sr) print(f"Saved to {output_path}") # Test 2: English with emotion instruction text2 = "This is absolutely incredible! I can't believe how well this works on a local machine!" print(f"\nGenerating with emotion: {text2[:60]}...") t2 = time.time() wavs2, sr2 = model.generate_custom_voice( text=text2, language="English", speaker="Chelsie", instruct="Speak with excitement and enthusiasm", ) elapsed2 = time.time() - t2 print(f"Generated in {elapsed2:.1f}s, audio length={len(wavs2[0])/sr2:.1f}s") sf.write("test_output_excited.wav", wavs2[0], sr2) print("Saved to test_output_excited.wav") print("\nDone! Open the .wav files to listen.")