bytelyst-devops-tools/youtube/enhanced_yt_transcript.py

174 lines
6.3 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Enhanced YouTube Transcript Downloader
Downloads YouTube video transcripts and saves each segment to separate numbered files (cc1.txt, cc2.txt, etc.)
"""
import os
import sys
import argparse
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
from urllib.parse import urlparse, parse_qs
def extract_video_id(url_or_id):
    """Return the YouTube video ID contained in *url_or_id*, or None.

    Accepts a bare 11-character ID or a URL of one of these shapes (the
    scheme is optional):

    * ``youtube.com/watch?v=<id>``
    * ``youtube.com/shorts/<id>``, ``/embed/<id>``, ``/live/<id>``, ``/v/<id>``
    * ``youtu.be/<id>``

    Args:
        url_or_id: Video ID or URL as supplied by the user.

    Returns:
        The extracted ID string, or None when nothing usable is found.
    """
    if not url_or_id:
        return None
    candidate = url_or_id.strip()

    # Video IDs may contain "-" and "_", which str.isalnum() rejects (while
    # accepting non-ASCII letters), so check against an explicit alphabet.
    id_chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_"
    if len(candidate) == 11 and all(ch in id_chars for ch in candidate):
        return candidate

    # urlparse() leaves netloc empty unless a scheme is present, so add one
    # for inputs such as "youtu.be/<id>" pasted without "https://".
    if "://" not in candidate:
        candidate = "https://" + candidate.lstrip("/")
    try:
        parsed = urlparse(candidate)
    except ValueError:
        # Malformed URL (e.g. an unbalanced IPv6 bracket): treat as invalid.
        return None

    host = parsed.netloc.lower()
    segments = [part for part in parsed.path.split("/") if part]

    if "youtube.com" in host:
        from_query = parse_qs(parsed.query).get("v", [None])[0]
        if from_query:
            return from_query
        # Path-style URLs carry the ID as the segment after the keyword.
        if len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
            return segments[1]
        return None
    if "youtu.be" in host:
        # Use only the first path segment so a trailing "/" is not included.
        return segments[0] if segments else None
    return None
def _fetch_transcript(video_id, language_codes):
    """Fetch transcript segments, preferring *language_codes* in the given order.

    Falls back to the first non-empty transcript the video offers, whatever
    its language. Returns None when nothing could be fetched.
    """
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

    for lang in language_codes or ():
        try:
            segments = transcript_list.find_transcript([lang]).fetch()
        except NoTranscriptFound:
            continue
        if segments:
            print(f"✅ Found transcript in language: {lang}")
            return segments

    # Walk the listing itself rather than calling get_transcript(video_id):
    # that call only looks for English by default, so it cannot serve as an
    # "any language" fallback.
    for available in transcript_list:
        segments = available.fetch()
        if segments:
            print("✅ Found auto-generated or default transcript")
            return segments
    return None


def _segment_text(entry):
    """Return the caption text of one transcript segment."""
    # Segments are dicts with a "text" key in the API this file targets.
    # NOTE(review): later library releases appear to yield snippet objects
    # exposing ``.text`` instead - confirm against the installed version.
    if isinstance(entry, dict):
        return entry["text"]
    return entry.text


def _prepare_chunks_dir(output_dir):
    """Create *output_dir* and its ``chunks`` subdirectory; drop stale chunk files."""
    chunks_dir = os.path.join(output_dir, "chunks")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
        print(f"📁 Created directory: {output_dir}")
    if not os.path.exists(chunks_dir):
        os.makedirs(chunks_dir, exist_ok=True)
        print(f"📁 Created chunks directory: {chunks_dir}")
    # Remove chunks left by a previous run so a shorter transcript does not
    # leave orphaned higher-numbered files behind.
    for filename in os.listdir(chunks_dir):
        if filename.startswith("cc") and filename.endswith(".txt"):
            os.remove(os.path.join(chunks_dir, filename))
    return chunks_dir


def download_transcript(video_id, output_dir="captions", language_codes=None):
    """Download a video's transcript and write it to disk.

    Writes one file per segment (``<output_dir>/chunks/cc<N>.txt``), a
    combined ``<video_id>_complete_transcript.txt`` with one segment per line,
    and a ``summary.txt`` describing the run.

    Args:
        video_id: YouTube video ID.
        output_dir: Directory that receives the combined transcript, the
            summary and the ``chunks`` subdirectory.
        language_codes: Language codes to try in order; when none of them is
            available (or the argument is empty/None) the first transcript
            the video offers is used.

    Returns:
        True on success, False when no transcript is available or any error
        occurred (the reason is printed).
    """
    try:
        transcript = _fetch_transcript(video_id, language_codes)
        if not transcript:
            print("❌ No transcript found for this video")
            return False

        texts = [_segment_text(entry) for entry in transcript]
        chunks_dir = _prepare_chunks_dir(output_dir)

        # Write each segment to its own numbered file.
        print(f"📝 Writing {len(texts)} segments...")
        for index, text in enumerate(texts, 1):
            filename = f"cc{index}.txt"
            with open(os.path.join(chunks_dir, filename), "w", encoding="utf-8") as f:
                f.write(text)
            # Show progress for every 10th file and for longer segments.
            if index % 10 == 0 or len(text) > 50:
                preview = text[:50] + "..." if len(text) > 50 else text
                print(f" 📄 {filename}: {preview}")

        # Build the combined transcript from memory, one segment per line.
        # Concatenating the chunks with no separator would fuse the last word
        # of one segment with the first word of the next.
        complete_filename = f"{video_id}_complete_transcript.txt"
        complete_filepath = os.path.join(output_dir, complete_filename)
        with open(complete_filepath, "w", encoding="utf-8") as f:
            f.write("\n".join(texts))
            f.write("\n")

        print(f"\n🎉 Success!")
        print(f"📊 Total segments: {len(texts)}")
        print(f"📁 Individual files saved in: {os.path.abspath(chunks_dir)}/")
        print(f"📄 Complete transcript saved as: {complete_filename}")

        # Record what was produced so the output directory is self-describing.
        summary_path = os.path.join(output_dir, "summary.txt")
        with open(summary_path, "w", encoding="utf-8") as f:
            f.write(f"YouTube Video ID: {video_id}\n")
            f.write(f"Total segments: {len(texts)}\n")
            f.write(f"Files: chunks/cc1.txt to chunks/cc{len(texts)}.txt\n")
            f.write(f"Complete transcript: {complete_filename}\n")
            f.write(f"Generated: {os.path.basename(__file__)}\n")
        print(f"📋 Summary saved to: {summary_path}")
        return True
    except TranscriptsDisabled:
        print("❌ Transcripts are disabled for this video")
        return False
    except NoTranscriptFound:
        print("❌ No transcript found for this video")
        return False
    except Exception as e:
        # Top-level boundary for the CLI: report any other failure (network,
        # filesystem, library errors) and signal it through the return value.
        print(f"❌ Error: {str(e)}")
        return False
def main():
    """Command-line entry point: parse arguments, resolve the video ID, download.

    Exits with status 1 when the ``video`` argument cannot be resolved to a
    video ID or when the download reports failure.
    """
    examples = """
Examples:
%(prog)s dQw4w9WgXcQ
%(prog)s "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
%(prog)s dQw4w9WgXcQ --output my_captions --languages en es fr
"""
    cli = argparse.ArgumentParser(
        description="Download YouTube transcripts to numbered caption files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=examples,
    )
    cli.add_argument("video", help="YouTube video ID or URL")
    cli.add_argument(
        "--output", "-o",
        default="captions",
        help="Output directory for caption files (default: captions)",
    )
    cli.add_argument(
        "--languages", "-l",
        nargs="*",
        default=["en"],
        help="Preferred language codes (e.g., en es fr) - default: en",
    )
    options = cli.parse_args()

    # Resolve whatever the user typed (ID or URL) to a plain video ID.
    video_id = extract_video_id(options.video)
    if not video_id:
        usage_hint = (
            "❌ Invalid YouTube URL or video ID",
            "Example formats:",
            " Video ID: dQw4w9WgXcQ",
            " Full URL: https://www.youtube.com/watch?v=dQw4w9WgXcQ",
            " Short URL: https://youtu.be/dQw4w9WgXcQ",
        )
        for line in usage_hint:
            print(line)
        sys.exit(1)

    print(f"🎬 Processing video ID: {video_id}")

    # A failed download is reported through the process exit status.
    if not download_transcript(video_id, options.output, options.languages):
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()