bytelyst-devops-tools/supabase monitor/utils/speech_processing.py

115 lines
3.7 KiB
Python

"""
Speech processing utilities for YouTube video transcription.
"""
import os
import shutil
import tempfile
from typing import Optional

import whisper
import yt_dlp
class YouTubeTranscriber:
    """Handles YouTube video audio extraction and transcription."""

    def __init__(self, model_size: str = "base"):
        """
        Initialize the transcriber with a Whisper model.

        Args:
            model_size: Whisper model size ("tiny", "base", "small", "medium", "large")
        """
        self.model = whisper.load_model(model_size)

    def extract_audio_from_youtube(self, youtube_url: str) -> str:
        """
        Extract audio from YouTube video and save as temporary file.

        The download and WAV conversion happen inside a scratch directory that
        is removed before returning; the resulting WAV is copied to a separate
        temporary file that is NOT deleted automatically.

        Args:
            youtube_url: URL of the YouTube video

        Returns:
            Path to the extracted audio file. The caller is responsible for
            deleting it.

        Raises:
            ValueError: If no ``.wav`` file was produced by the download.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            # Configure yt-dlp options for audio extraction
            ydl_opts = {
                'format': 'bestaudio[ext=m4a]/bestaudio/best',
                # Direct output into the scratch directory instead of calling
                # os.chdir(): the working directory is process-global state,
                # so changing it is unsafe with threads and breaks relative
                # paths used elsewhere while the download runs.
                'paths': {'home': temp_dir},
                # Name by video id: titles can exceed filesystem name limits,
                # and this name is never exposed to the caller anyway.
                'outtmpl': '%(id)s.%(ext)s',
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'wav',
                    'preferredquality': '192',
                }],
                'noplaylist': True,
                'extract_flat': False,
            }

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.extract_info(youtube_url, download=True)

            # Find the downloaded audio file
            audio_files = [f for f in os.listdir(temp_dir) if f.endswith('.wav')]
            if not audio_files:
                raise ValueError("No audio file was extracted from the YouTube video")
            audio_path = os.path.join(temp_dir, audio_files[0])

            # Create a persistent temp file that outlives the scratch directory
            fd, persistent_path = tempfile.mkstemp(suffix='.wav')
            try:
                # Stream the copy so a long recording is never held in memory
                # as a single bytes object.
                with os.fdopen(fd, 'wb') as dest, open(audio_path, 'rb') as source:
                    shutil.copyfileobj(source, dest)
            except BaseException:
                # Do not leak a partial file when the copy fails; the error
                # itself is propagated unchanged.
                os.unlink(persistent_path)
                raise
            return persistent_path

    def transcribe_audio(self, audio_file_path: str) -> str:
        """
        Transcribe audio file to text using Whisper.

        Args:
            audio_file_path: Path to the audio file

        Returns:
            Transcribed text, taken from the ``"text"`` entry of the model's
            result and always returned as ``str``.
        """
        result = self.model.transcribe(audio_file_path)
        text = result["text"]

        # Ensure the text is a str regardless of what the model handed back
        if isinstance(text, bytes):
            return text.decode('utf-8', errors='ignore')
        if not isinstance(text, str):
            return str(text)
        return text

    def transcribe_youtube_video(self, youtube_url: str) -> str:
        """
        Complete transcription pipeline from YouTube URL to text.

        Args:
            youtube_url: URL of the YouTube video

        Returns:
            Transcribed text
        """
        print(f"Extracting audio from: {youtube_url}")
        audio_file = self.extract_audio_from_youtube(youtube_url)

        try:
            print("Transcribing audio...")
            return self.transcribe_audio(audio_file)
        finally:
            # Clean up the temporary audio file, even if transcription failed
            try:
                os.unlink(audio_file)
            except FileNotFoundError:
                # Already gone; nothing left to clean up.
                pass