"""Speech processing utilities for YouTube video transcription."""

import os
import shutil
import tempfile
from typing import Optional

import whisper
import yt_dlp


class YouTubeTranscriber:
    """Handles YouTube video audio extraction and transcription."""

    def __init__(self, model_size: str = "base"):
        """
        Initialize the transcriber with a Whisper model.

        Args:
            model_size: Whisper model size ("tiny", "base", "small", "medium", "large")
        """
        self.model = whisper.load_model(model_size)

    def extract_audio_from_youtube(self, youtube_url: str) -> str:
        """
        Extract audio from a YouTube video and save it as a temporary WAV file.

        The download and conversion happen inside a scratch directory that is
        removed before this method returns; the resulting WAV is copied to a
        separate temporary file that outlives the scratch directory. The
        caller owns the returned file and is responsible for deleting it.

        Args:
            youtube_url: URL of the YouTube video

        Returns:
            Path to the extracted audio file

        Raises:
            ValueError: If no ``.wav`` file was produced by the extraction.
                Errors raised by yt-dlp itself are not caught here and
                propagate to the caller.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            # Point yt-dlp at the scratch directory with an absolute output
            # template instead of os.chdir(): changing the working directory
            # is process-wide and would affect other threads. '%' is the
            # template escape character, so a literal '%' in the directory
            # name must be doubled.
            template_dir = temp_dir.replace('%', '%%')

            # Configure yt-dlp options for audio extraction
            ydl_opts = {
                'format': 'bestaudio[ext=m4a]/bestaudio/best',
                # The file name is internal only, so use the short video id
                # rather than the title, which can exceed file name limits.
                'outtmpl': os.path.join(template_dir, '%(id)s.%(ext)s'),
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'wav',
                    'preferredquality': '192',
                }],
                'noplaylist': True,
                'extract_flat': False,
            }

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.extract_info(youtube_url, download=True)

            # Find the downloaded audio file (sorted for a deterministic pick)
            audio_files = sorted(
                name for name in os.listdir(temp_dir) if name.endswith('.wav')
            )
            if not audio_files:
                raise ValueError("No audio file was extracted from the YouTube video")
            audio_path = os.path.join(temp_dir, audio_files[0])

            # Create a persistent temp file that survives the scratch
            # directory's cleanup.
            temp_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
            try:
                with temp_file, open(audio_path, 'rb') as source:
                    # Stream in chunks; WAV files can be hundreds of MB.
                    shutil.copyfileobj(source, temp_file)
            except BaseException:
                # The handle is already closed by the ``with`` above; remove
                # the partial copy so a failed extraction leaves nothing behind.
                os.unlink(temp_file.name)
                raise
            return temp_file.name

    def transcribe_audio(self, audio_file_path: str) -> str:
        """
        Transcribe audio file to text using Whisper.

        Args:
            audio_file_path: Path to the audio file

        Returns:
            Transcribed text
        """
        result = self.model.transcribe(audio_file_path)
        text = result["text"]

        # Ensure the text is properly encoded as UTF-8 string
        if isinstance(text, bytes):
            text = text.decode('utf-8', errors='ignore')
        elif not isinstance(text, str):
            text = str(text)

        return text

    def transcribe_youtube_video(self, youtube_url: str) -> str:
        """
        Complete transcription pipeline from YouTube URL to text.

        Extracts the audio to a temporary file, transcribes it, and always
        removes the temporary file afterwards, whether or not transcription
        succeeded.

        Args:
            youtube_url: URL of the YouTube video

        Returns:
            Transcribed text
        """
        print(f"Extracting audio from: {youtube_url}")
        audio_file = self.extract_audio_from_youtube(youtube_url)

        try:
            print("Transcribing audio...")
            return self.transcribe_audio(audio_file)
        finally:
            # Clean up the temporary audio file. Attempt the removal directly
            # rather than checking existence first, which would be racy.
            try:
                os.unlink(audio_file)
            except FileNotFoundError:
                # Already gone; nothing left to clean up.
                pass