115 lines
3.7 KiB
Python
115 lines
3.7 KiB
Python
"""
|
|
Speech processing utilities for YouTube video transcription.
|
|
"""
|
|
import whisper
|
|
import yt_dlp
|
|
import os
|
|
import tempfile
|
|
from typing import Optional
|
|
|
|
class YouTubeTranscriber:
    """Handles YouTube video audio extraction and transcription.

    The audio track is downloaded with yt-dlp, converted to WAV by yt-dlp's
    ``FFmpegExtractAudio`` post-processor, and transcribed with the Whisper
    model loaded at construction time.
    """

    # Size of the chunks used when copying the extracted audio, so that a
    # long recording is never held in memory in one piece.
    _COPY_CHUNK_SIZE = 1024 * 1024

    def __init__(self, model_size: str = "base"):
        """
        Initialize the transcriber with a Whisper model.

        Args:
            model_size: Whisper model size ("tiny", "base", "small", "medium", "large")
        """
        self.model = whisper.load_model(model_size)

    def extract_audio_from_youtube(self, youtube_url: str) -> str:
        """
        Extract audio from YouTube video and save as temporary file.

        The download happens inside a throwaway directory; the resulting WAV
        is then copied to a persistent temporary file that outlives this call.
        The caller owns the returned file and is responsible for deleting it.

        Args:
            youtube_url: URL of the YouTube video

        Returns:
            Path to the extracted audio file

        Raises:
            ValueError: If no WAV file was produced by the download.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            # Configure yt-dlp options for audio extraction
            ydl_opts = {
                'format': 'bestaudio[ext=m4a]/bestaudio/best',
                # Direct the download into temp_dir instead of chdir()-ing
                # there: the working directory is process-global state.
                'paths': {'home': temp_dir},
                # The video id gives a short, filesystem-safe name; titles may
                # contain arbitrary characters. The name is never exposed.
                'outtmpl': '%(id)s.%(ext)s',
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'wav',
                    'preferredquality': '192',
                }],
                'noplaylist': True,
                'extract_flat': False,
            }

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.extract_info(youtube_url, download=True)

            # Find the downloaded audio file (sorted for a deterministic pick)
            wav_names = sorted(
                name for name in os.listdir(temp_dir) if name.endswith('.wav')
            )
            if not wav_names:
                raise ValueError("No audio file was extracted from the YouTube video")
            source_path = os.path.join(temp_dir, wav_names[0])

            # Create a persistent temp file; temp_dir is removed on exit.
            persistent = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
            try:
                with persistent, open(source_path, 'rb') as source:
                    read_chunk = lambda: source.read(self._COPY_CHUNK_SIZE)
                    for chunk in iter(read_chunk, b''):
                        persistent.write(chunk)
            except BaseException:
                # Do not leak a half-written file when the copy fails; the
                # `with` above has already closed it.
                try:
                    os.unlink(persistent.name)
                except OSError:
                    pass
                raise
            return persistent.name

    def transcribe_audio(self, audio_file_path: str) -> str:
        """
        Transcribe audio file to text using Whisper.

        Args:
            audio_file_path: Path to the audio file

        Returns:
            Transcribed text
        """
        result = self.model.transcribe(audio_file_path)
        text = result["text"]

        # Defensive normalisation: always hand back a ``str`` even if the
        # model returns bytes or some other object for "text".
        if isinstance(text, bytes):
            return text.decode('utf-8', errors='ignore')
        if not isinstance(text, str):
            return str(text)
        return text

    def transcribe_youtube_video(self, youtube_url: str) -> str:
        """
        Complete transcription pipeline from YouTube URL to text.

        The temporary audio file is removed whether or not transcription
        succeeds.

        Args:
            youtube_url: URL of the YouTube video

        Returns:
            Transcribed text
        """
        print(f"Extracting audio from: {youtube_url}")
        audio_file = self.extract_audio_from_youtube(youtube_url)

        try:
            print("Transcribing audio...")
            return self.transcribe_audio(audio_file)
        finally:
            # Clean up the temporary audio file; it may already be gone.
            try:
                os.unlink(audio_file)
            except FileNotFoundError:
                pass
|
|
|
|
|