174 lines
6.3 KiB
Python
Executable File
174 lines
6.3 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Enhanced YouTube Transcript Downloader
|
|
Downloads YouTube video transcripts and saves each segment to separate numbered files (cc1.txt, cc2.txt, etc.)
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import argparse
|
|
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
|
|
from urllib.parse import urlparse, parse_qs
|
|
|
|
# Characters that may appear in a YouTube video ID (URL-safe base64 alphabet).
_VIDEO_ID_CHARS = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-_"
)

# First path segment of youtube.com URLs that carry the ID as the next segment.
_ID_PATH_PREFIXES = ("embed", "shorts", "live", "v")


def extract_video_id(url_or_id):
    """Return the YouTube video ID contained in *url_or_id*, or ``None``.

    Accepted inputs:
      * a bare 11-character ID (letters, digits, ``-`` and ``_``);
      * ``youtube.com/watch?v=<id>`` style URLs (ID taken from the ``v``
        query parameter);
      * ``youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>`` and
        ``/v/<id>`` URLs;
      * ``youtu.be/<id>`` short URLs.

    URLs may omit the scheme (``youtube.com/watch?v=...``).  Anything else,
    including empty or malformed input, yields ``None``.
    """
    candidate = url_or_id.strip() if url_or_id else ""
    if not candidate:
        return None

    # A bare ID.  ``str.isalnum`` alone is not enough here: real IDs may
    # contain '-' and '_', which it rejects.
    if len(candidate) == 11 and all(ch in _VIDEO_ID_CHARS for ch in candidate):
        return candidate

    # urlparse only populates ``netloc`` when the string contains "//", so
    # give scheme-less input a network-location prefix before parsing.
    if "://" not in candidate:
        candidate = "//" + candidate
    try:
        parsed_url = urlparse(candidate)
    except ValueError:
        # Raised for malformed network locations (e.g. a stray '[').
        return None

    host = parsed_url.netloc.lower()
    segments = [part for part in parsed_url.path.split("/") if part]

    if "youtube.com" in host:
        video_id = parse_qs(parsed_url.query).get("v", [None])[0]
        if video_id:
            return video_id
        if len(segments) >= 2 and segments[0] in _ID_PATH_PREFIXES:
            return segments[1]
        return None

    if "youtu.be" in host:
        # Only the first path segment is the ID; ignore trailing slashes
        # or extra segments.
        return segments[0] if segments else None

    return None
|
|
|
|
def _is_chunk_file(filename):
    """Return True only for files named ``cc<digits>.txt``."""
    return (
        filename.startswith("cc")
        and filename.endswith(".txt")
        and filename[2:-4].isdigit()
    )


def _select_transcript(transcript_list, language_codes):
    """Fetch the best available transcript from *transcript_list*.

    Tries each preferred language in order, then English, then every
    transcript the list offers.  Returns the fetched segments, or ``None``
    when nothing non-empty could be fetched.
    """
    transcript = None
    for lang in language_codes or ():
        try:
            transcript = transcript_list.find_transcript([lang]).fetch()
        except NoTranscriptFound:
            continue
        print(f"✅ Found transcript in language: {lang}")
        break
    if transcript:
        return transcript

    # No preferred language matched: try English first, then fall back to
    # whatever the video actually has, so non-English videos still work.
    try:
        transcript = transcript_list.find_transcript(["en"]).fetch()
    except NoTranscriptFound:
        transcript = None
    if not transcript:
        for candidate in transcript_list:
            transcript = candidate.fetch()
            if transcript:
                break

    if not transcript:
        return None
    print("✅ Found auto-generated or default transcript")
    return transcript


def _prepare_output_dirs(output_dir, chunks_dir):
    """Create the output directories if missing and remove stale chunk files."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"📁 Created directory: {output_dir}")
    if not os.path.exists(chunks_dir):
        os.makedirs(chunks_dir)
        print(f"📁 Created chunks directory: {chunks_dir}")

    # Only numbered chunk files from a previous run are removed; other
    # files that merely start with "cc" are left alone.
    for filename in os.listdir(chunks_dir):
        if _is_chunk_file(filename):
            os.remove(os.path.join(chunks_dir, filename))


def download_transcript(video_id, output_dir="captions", language_codes=None):
    """Download a video's transcript and write it to disk.

    Files written under *output_dir*:
      * ``chunks/cc1.txt`` ... ``chunks/ccN.txt`` - one file per segment;
      * ``<video_id>_complete_transcript.txt`` - all segments, one per line;
      * ``summary.txt`` - a short description of what was generated.

    Args:
        video_id: YouTube video ID.
        output_dir: Directory to write into; created if it does not exist.
        language_codes: Optional iterable of preferred language codes, tried
            in order before falling back to English and then to any
            available transcript.

    Returns:
        True on success, False on any failure (a message is printed).
    """
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        transcript = _select_transcript(transcript_list, language_codes)
        if not transcript:
            print("❌ No transcript found for this video")
            return False

        texts = [entry['text'] for entry in transcript]
        total = len(texts)

        chunks_dir = os.path.join(output_dir, "chunks")
        _prepare_output_dirs(output_dir, chunks_dir)

        print(f"📝 Writing {total} segments...")
        for i, text in enumerate(texts, 1):
            filename = f"cc{i}.txt"
            with open(os.path.join(chunks_dir, filename), "w", encoding="utf-8") as f:
                f.write(text)

            # Show progress for every 10th file and for long segments.
            if i % 10 == 0 or len(text) > 50:
                preview = text[:50] + "..." if len(text) > 50 else text
                print(f" 📄 {filename}: {preview}")

        # Write the combined transcript straight from memory, one segment
        # per line, so adjacent segments do not run together.
        complete_filename = f"{video_id}_complete_transcript.txt"
        complete_filepath = os.path.join(output_dir, complete_filename)
        with open(complete_filepath, "w", encoding="utf-8") as f:
            f.writelines(text + "\n" for text in texts)

        print("\n🎉 Success!")
        print(f"📊 Total segments: {total}")
        print(f"📁 Individual files saved in: {os.path.abspath(chunks_dir)}/")
        print(f"📄 Complete transcript saved as: {complete_filename}")

        summary_path = os.path.join(output_dir, "summary.txt")
        with open(summary_path, "w", encoding="utf-8") as f:
            f.write(f"YouTube Video ID: {video_id}\n")
            f.write(f"Total segments: {total}\n")
            f.write(f"Files: chunks/cc1.txt to chunks/cc{total}.txt\n")
            f.write(f"Complete transcript: {complete_filename}\n")
            f.write(f"Generated: {os.path.basename(__file__)}\n")

        print(f"📋 Summary saved to: {summary_path}")
        return True

    except TranscriptsDisabled:
        print("❌ Transcripts are disabled for this video")
        return False
    except NoTranscriptFound:
        print("❌ No transcript found for this video")
        return False
    except Exception as e:
        # Command-line boundary: report any other failure (network, file
        # system, API changes) instead of showing a traceback.
        print(f"❌ Error: {str(e)}")
        return False
|
|
|
|
def main():
    """Command-line entry point.

    Parses the arguments, resolves the video ID from the given ID or URL,
    and downloads the transcript.  Exits with status 1 when the ID cannot
    be determined or the download fails.
    """
    usage_examples = """
Examples:
%(prog)s dQw4w9WgXcQ
%(prog)s "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
%(prog)s dQw4w9WgXcQ --output my_captions --languages en es fr
"""
    cli = argparse.ArgumentParser(
        description="Download YouTube transcripts to numbered caption files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    cli.add_argument("video", help="YouTube video ID or URL")
    cli.add_argument(
        "--output", "-o",
        default="captions",
        help="Output directory for caption files (default: captions)",
    )
    cli.add_argument(
        "--languages", "-l",
        nargs="*",
        default=["en"],
        help="Preferred language codes (e.g., en es fr) - default: en",
    )
    options = cli.parse_args()

    # Resolve the positional argument to a video ID, or explain what is accepted.
    video_id = extract_video_id(options.video)
    if not video_id:
        help_lines = (
            "❌ Invalid YouTube URL or video ID",
            "Example formats:",
            " Video ID: dQw4w9WgXcQ",
            " Full URL: https://www.youtube.com/watch?v=dQw4w9WgXcQ",
            " Short URL: https://youtu.be/dQw4w9WgXcQ",
        )
        for line in help_lines:
            print(line)
        sys.exit(1)

    print(f"🎬 Processing video ID: {video_id}")

    # A failed download is reported through the exit status.
    if not download_transcript(video_id, options.output, options.languages):
        sys.exit(1)
|
|
|
|
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()