#!/usr/bin/env python3
"""
Enhanced YouTube Transcript Downloader
Downloads YouTube video transcripts and saves each segment to separate
numbered files (cc1.txt, cc2.txt, etc.)
"""

import argparse
import os
import re
import sys
from urllib.parse import parse_qs, urlparse

from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound

# A bare video ID is 11 characters drawn from the URL-safe base64 alphabet,
# which includes "-" and "_" in addition to letters and digits.
_VIDEO_ID_RE = re.compile(r"[A-Za-z0-9_-]{11}")

# First path segments after which the video ID is the next segment,
# e.g. https://www.youtube.com/embed/<id> or /shorts/<id>.
_PATH_ID_PREFIXES = ("embed", "shorts", "live", "v")


def extract_video_id(url_or_id):
    """Extract the video ID from a YouTube URL, or return the ID if one was given.

    Args:
        url_or_id: A bare 11-character video ID, or a YouTube URL
            (``youtube.com/watch?v=...``, ``youtu.be/...``, ``/embed/...``,
            ``/shorts/...``, ``/live/...``, ``/v/...``), with or without a scheme.

    Returns:
        The video ID string, or ``None`` when no ID can be found.
    """
    if not url_or_id:
        return None

    candidate = url_or_id.strip()
    if _VIDEO_ID_RE.fullmatch(candidate):
        return candidate

    # urlparse only populates the host when a scheme (or leading "//") is
    # present, so "youtube.com/watch?v=..." needs one prepended.
    if "://" not in candidate:
        candidate = "https://" + candidate.lstrip("/")

    parsed = urlparse(candidate)
    host = (parsed.hostname or "").lower()
    segments = [part for part in parsed.path.split("/") if part]

    if "youtu.be" in host:
        # Short links carry the ID as the first path segment.
        return segments[0] if segments else None

    if "youtube.com" in host or "youtube-nocookie.com" in host:
        video_id = parse_qs(parsed.query).get("v", [None])[0]
        if video_id:
            return video_id
        if len(segments) >= 2 and segments[0] in _PATH_ID_PREFIXES:
            return segments[1]

    return None


def _segment_text(entry):
    """Return the caption text of a single transcript entry."""
    if isinstance(entry, dict):
        return entry["text"]
    # NOTE(review): newer youtube_transcript_api releases appear to return
    # snippet objects exposing `.text` instead of dicts - confirm against the
    # installed version.
    return entry.text


def _fetch_transcript(video_id, language_codes):
    """Fetch a transcript, preferring *language_codes* in the order given.

    Falls back to the first transcript the video offers, in any language,
    when none of the preferred languages is available. Returns ``None`` when
    the video lists no transcripts at all.
    """
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

    for lang in language_codes or ():
        try:
            transcript = transcript_list.find_transcript([lang]).fetch()
        except NoTranscriptFound:
            continue
        print(f"✅ Found transcript in language: {lang}")
        return transcript

    # Take whatever is available. Calling get_transcript(video_id) here would
    # only look for English, defeating the purpose of the fallback.
    for available in transcript_list:
        transcript = available.fetch()
        print("✅ Found auto-generated or default transcript")
        return transcript

    return None


def _ensure_output_dirs(output_dir):
    """Create *output_dir* and its ``chunks`` subdirectory; return the chunks path."""
    chunks_dir = os.path.join(output_dir, "chunks")

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"📁 Created directory: {output_dir}")

    if not os.path.exists(chunks_dir):
        os.makedirs(chunks_dir)
        print(f"📁 Created chunks directory: {chunks_dir}")

    return chunks_dir


def _write_chunks(chunks_dir, texts):
    """Replace the ``cc*.txt`` files in *chunks_dir* with one file per segment."""
    # Remove chunks left over from a previous run so a shorter transcript
    # does not leave stale higher-numbered files behind.
    for filename in os.listdir(chunks_dir):
        if filename.startswith("cc") and filename.endswith(".txt"):
            os.remove(os.path.join(chunks_dir, filename))

    print(f"📝 Writing {len(texts)} segments...")
    for i, text in enumerate(texts, 1):
        filename = f"cc{i}.txt"
        with open(os.path.join(chunks_dir, filename), "w", encoding="utf-8") as f:
            f.write(text)

        # Show progress for every 10th file or when the segment is long.
        if i % 10 == 0 or len(text) > 50:
            preview = text[:50] + "..." if len(text) > 50 else text
            print(f" 📄 {filename}: {preview}")


def _write_complete_transcript(output_dir, video_id, texts):
    """Write all segments to ``<video_id>_complete_transcript.txt``; return its name."""
    complete_filename = f"{video_id}_complete_transcript.txt"
    complete_filepath = os.path.join(output_dir, complete_filename)

    # One segment per line: concatenating without a separator fuses the last
    # word of each segment with the first word of the next.
    with open(complete_filepath, "w", encoding="utf-8") as f:
        f.writelines(f"{text}\n" for text in texts)

    return complete_filename


def _write_summary(output_dir, video_id, segment_count, complete_filename):
    """Write ``summary.txt`` describing the generated files; return its path."""
    summary_path = os.path.join(output_dir, "summary.txt")
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write(f"YouTube Video ID: {video_id}\n")
        f.write(f"Total segments: {segment_count}\n")
        f.write(f"Files: chunks/cc1.txt to chunks/cc{segment_count}.txt\n")
        f.write(f"Complete transcript: {complete_filename}\n")
        f.write(f"Generated: {os.path.basename(__file__)}\n")
    return summary_path


def download_transcript(video_id, output_dir="captions", language_codes=None):
    """Download a transcript and save it to numbered files.

    Writes ``chunks/cc<N>.txt`` (one per segment), a combined
    ``<video_id>_complete_transcript.txt`` and a ``summary.txt`` under
    *output_dir*. Existing ``cc*.txt`` chunk files are removed first.

    Args:
        video_id: YouTube video ID.
        output_dir: Directory that receives the output files; created if missing.
        language_codes: Preferred language codes in priority order. When none
            matches (or none is given) the first available transcript is used.

    Returns:
        ``True`` on success, ``False`` on any failure. Errors are reported on
        stdout rather than raised.
    """
    try:
        transcript = _fetch_transcript(video_id, language_codes)
        if transcript is None:
            print("❌ No transcript found for this video")
            return False

        texts = [_segment_text(entry) for entry in transcript]

        chunks_dir = _ensure_output_dirs(output_dir)
        _write_chunks(chunks_dir, texts)
        complete_filename = _write_complete_transcript(output_dir, video_id, texts)

        print("\n🎉 Success!")
        print(f"📊 Total segments: {len(texts)}")
        print(f"📁 Individual files saved in: {os.path.abspath(chunks_dir)}/")
        print(f"📄 Complete transcript saved as: {complete_filename}")

        summary_path = _write_summary(output_dir, video_id, len(texts), complete_filename)
        print(f"📋 Summary saved to: {summary_path}")

        return True

    except TranscriptsDisabled:
        print("❌ Transcripts are disabled for this video")
        return False
    except NoTranscriptFound:
        print("❌ No transcript found for this video")
        return False
    except Exception as e:
        # Top-level boundary for the CLI: report the problem and signal
        # failure through the return value instead of a traceback.
        print(f"❌ Error: {str(e)}")
        return False


def main():
    """Parse command-line arguments and download the requested transcript.

    Exits with status 1 when the video argument cannot be resolved to an ID
    or when the download fails.
    """
    parser = argparse.ArgumentParser(
        description="Download YouTube transcripts to numbered caption files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s dQw4w9WgXcQ
  %(prog)s "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
  %(prog)s dQw4w9WgXcQ --output my_captions --languages en es fr
        """
    )

    parser.add_argument(
        "video",
        help="YouTube video ID or URL"
    )

    parser.add_argument(
        "--output", "-o",
        default="captions",
        help="Output directory for caption files (default: captions)"
    )

    parser.add_argument(
        "--languages", "-l",
        nargs="*",
        default=["en"],
        help="Preferred language codes (e.g., en es fr) - default: en"
    )

    args = parser.parse_args()

    # Extract video ID
    video_id = extract_video_id(args.video)
    if not video_id:
        print("❌ Invalid YouTube URL or video ID")
        print("Example formats:")
        print(" Video ID: dQw4w9WgXcQ")
        print(" Full URL: https://www.youtube.com/watch?v=dQw4w9WgXcQ")
        print(" Short URL: https://youtu.be/dQw4w9WgXcQ")
        sys.exit(1)

    print(f"🎬 Processing video ID: {video_id}")

    # Download transcript
    success = download_transcript(video_id, args.output, args.languages)

    if not success:
        sys.exit(1)


if __name__ == "__main__":
    main()