# Source: bytelyst-devops-tools/youtube/transcribe_yt_video.py
# (Repository viewer metadata at time of capture: 58 lines, 2.0 KiB, Python)

from youtube_transcript_api import YouTubeTranscriptApi
import os
# Video ID from the provided URL.
video_id = "c7bbO_KSLPI"

# Create the API instance and list the transcripts offered for the video.
ytt_api = YouTubeTranscriptApi()
transcript_list = ytt_api.list(video_id)

# Try the Korean ('ko') transcript first; if locating or downloading it fails
# for any reason, fall back to the auto-generated Korean transcript.
try:
    transcript = transcript_list.find_transcript(['ko']).fetch()
    print("✅ Found Korean transcript")
except Exception as err:
    # Catch `Exception` rather than using a bare `except:` so that Ctrl-C
    # (KeyboardInterrupt) and SystemExit still stop the script instead of
    # silently triggering the fallback. Report the failure type so the reason
    # for the fallback is not lost.
    print(f"Korean transcript lookup failed ({type(err).__name__}); "
          "trying auto-generated transcript")
    transcript = transcript_list.find_generated_transcript(['ko']).fetch()
    print("✅ Found Korean auto-generated transcript")
# Output layout: `captions/` is the top-level output directory and
# `captions/chunks/` receives the per-segment files.
output_dir = "captions"
chunks_dir = os.path.join(output_dir, "chunks")

# makedirs() also creates the parent `output_dir`, and exist_ok=True makes the
# call a no-op when the directories are already there, so no separate
# (race-prone) os.path.exists() checks are needed.
os.makedirs(chunks_dir, exist_ok=True)

# Clear chunk files (cc*.txt) left behind by a previous run so that segments
# from an older, longer transcript cannot linger next to the new ones.
for filename in os.listdir(chunks_dir):
    if filename.startswith("cc") and filename.endswith(".txt"):
        os.remove(os.path.join(chunks_dir, filename))
# Dump every caption segment into its own sequentially numbered file
# (cc1.txt, cc2.txt, ...) inside the chunks folder.
for seq, snippet in enumerate(transcript, start=1):
    filename = f"cc{seq}.txt"
    filepath = os.path.join(chunks_dir, filename)
    segment_text = snippet.text
    with open(filepath, "w", encoding="utf-8") as chunk_out:
        chunk_out.write(segment_text)
    # Progress line showing at most the first 50 characters of the segment.
    print(f"Written: {filename} - {segment_text[:50]}...")
# Create the complete transcript file with the YouTube ID in its filename.
complete_filename = f"{video_id}_complete_transcript.txt"
complete_filepath = os.path.join(output_dir, complete_filename)

# Write the segments straight from the in-memory transcript rather than
# re-reading the chunk files from disk. Each segment gets its own line: a
# newline after every caption keeps the end of one segment from running into
# the start of the next in the combined file.
with open(complete_filepath, "w", encoding="utf-8") as f:
    f.writelines(f"{entry.text}\n" for entry in transcript)

# Summary. The complete transcript lives inside `output_dir`, so report its
# full path rather than just the bare filename.
print(f"\nTotal segments: {len(transcript)}")
print(f"Individual files saved in: {chunks_dir}/")
print(f"Complete transcript saved as: {complete_filepath}")