# bytelyst-devops-tools/supabase monitor/utils/speech_processing.py

"""
Speech processing utilities for YouTube video transcription.
"""
import whisper
import yt_dlp
import os
import tempfile
from typing import Optional

class YouTubeTranscriber:
    """Download the audio track of a YouTube video and transcribe it with Whisper.

    The Whisper model is loaded once in ``__init__`` and reused for every
    transcription performed through the instance.
    """

    def __init__(self, model_size: str = "base"):
        """
        Initialize the transcriber with a Whisper model.

        Args:
            model_size: Whisper model size ("tiny", "base", "small", "medium", "large")
        """
        self.model = whisper.load_model(model_size)

    def extract_audio_from_youtube(self, youtube_url: str) -> str:
        """
        Extract audio from a YouTube video and save it as a temporary WAV file.

        The download happens inside a scratch directory that is removed before
        this method returns; the resulting WAV is copied to a standalone
        temporary file that the caller owns and must delete.

        Args:
            youtube_url: URL of the YouTube video

        Returns:
            Path to the extracted audio file

        Raises:
            ValueError: If no ``.wav`` file was produced by the download.
        """
        # Local import so this block does not depend on a file-level import.
        import shutil

        with tempfile.TemporaryDirectory() as temp_dir:
            # Point yt-dlp at the scratch directory through an absolute output
            # template instead of os.chdir(): the working directory is
            # process-global, so changing it is unsafe with threads and breaks
            # relative paths used elsewhere while the download runs.
            # '%' is the template escape character, so double any literal one
            # that happens to appear in the directory path.
            output_template = os.path.join(
                temp_dir.replace('%', '%%'), '%(id)s.%(ext)s'
            )
            ydl_opts = {
                'format': 'bestaudio[ext=m4a]/bestaudio/best',
                # The video id is used rather than the title to avoid overly
                # long or awkward file names; the name is internal only.
                'outtmpl': output_template,
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'wav',
                    'preferredquality': '192',
                }],
                'noplaylist': True,
                'extract_flat': False,
            }

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.extract_info(youtube_url, download=True)

            # Find the converted audio file; sorted for a deterministic pick.
            audio_files = sorted(
                name for name in os.listdir(temp_dir) if name.endswith('.wav')
            )
            if not audio_files:
                raise ValueError("No audio file was extracted from the YouTube video")
            source_path = os.path.join(temp_dir, audio_files[0])

            # Create a persistent temp file that outlives the scratch directory.
            # The descriptor is closed right away so the copy below can open
            # the path itself.
            fd, persistent_path = tempfile.mkstemp(suffix='.wav')
            os.close(fd)
            try:
                # Streamed copy: avoids loading the whole WAV into memory.
                shutil.copyfile(source_path, persistent_path)
            except BaseException:
                # Do not leak the persistent file when the copy fails.
                try:
                    os.unlink(persistent_path)
                except OSError:
                    pass
                raise
            return persistent_path

    def transcribe_audio(self, audio_file_path: str) -> str:
        """
        Transcribe audio file to text using Whisper.

        Args:
            audio_file_path: Path to the audio file

        Returns:
            Transcribed text
        """
        result = self.model.transcribe(audio_file_path)
        text = result["text"]

        # Defensive normalization: always hand back a ``str``.
        if isinstance(text, bytes):
            text = text.decode('utf-8', errors='ignore')
        elif not isinstance(text, str):
            text = str(text)

        return text

    def transcribe_youtube_video(self, youtube_url: str) -> str:
        """
        Complete transcription pipeline from YouTube URL to text.

        The temporary audio file is removed whether or not transcription
        succeeds.

        Args:
            youtube_url: URL of the YouTube video

        Returns:
            Transcribed text
        """
        print(f"Extracting audio from: {youtube_url}")
        audio_file = self.extract_audio_from_youtube(youtube_url)

        try:
            print("Transcribing audio...")
            return self.transcribe_audio(audio_file)
        finally:
            # Clean up the temporary audio file. Unlink directly rather than
            # checking existence first, which would be a check-then-act race.
            try:
                os.unlink(audio_file)
            except FileNotFoundError:
                pass