# Listing metadata: 58 lines, 2.0 KiB, Python
import os

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api import NoTranscriptFound

video_id = "c7bbO_KSLPI"  # Video ID from the provided URL

# Create API instance and list the transcripts available for this video
ytt_api = YouTubeTranscriptApi()
transcript_list = ytt_api.list(video_id)

# Prefer a manually created Korean transcript and fall back to the
# auto-generated one. Only the "no such transcript" case triggers the
# fallback; any other failure (network error, blocked request, Ctrl-C)
# propagates instead of being hidden by a bare `except:`. If neither kind
# exists, the NoTranscriptFound raised by the fallback lookup propagates too.
try:
    korean_transcript = transcript_list.find_manually_created_transcript(["ko"])
    found_message = "✅ Found Korean transcript"
except NoTranscriptFound:
    korean_transcript = transcript_list.find_generated_transcript(["ko"])
    found_message = "✅ Found Korean auto-generated transcript"

# Fetch outside the try block so a download error is never mistaken for a
# missing transcript.
transcript = korean_transcript.fetch()
print(found_message)

# Create the output directory and its chunks subdirectory. makedirs builds
# missing parents, so one call covers both levels, and exist_ok=True makes
# re-runs a no-op without a separate (race-prone) existence check.
output_dir = "captions"
chunks_dir = os.path.join(output_dir, "chunks")
os.makedirs(chunks_dir, exist_ok=True)

# Clear chunk files (cc*.txt) already present in the chunks directory so
# files from an earlier run are not mixed with the new output. Other files
# in the directory are left untouched.
for filename in os.listdir(chunks_dir):
    if filename.startswith("cc") and filename.endswith(".txt"):
        os.remove(os.path.join(chunks_dir, filename))

# Save every caption segment as its own file in the chunks folder, named
# cc1.txt, cc2.txt, ... in transcript order (numbering starts at 1).
for segment_number, snippet in enumerate(transcript, start=1):
    caption_text = snippet.text
    filename = f"cc{segment_number}.txt"
    filepath = os.path.join(chunks_dir, filename)

    with open(filepath, mode="w", encoding="utf-8") as chunk_out:
        chunk_out.write(caption_text)

    # Show only the first 50 characters of the segment as a progress preview.
    preview = caption_text[:50]
    print(f"Written: {filename} - {preview}...")

# Create complete transcript file with YouTube ID in filename
complete_filename = f"{video_id}_complete_transcript.txt"
complete_filepath = os.path.join(output_dir, complete_filename)

# Write the combined file straight from the in-memory transcript rather than
# re-reading the chunk files: this avoids a second pass over the disk and
# cannot silently drop a segment whose chunk file is missing. Each segment
# goes on its own line so adjacent captions do not run together.
with open(complete_filepath, "w", encoding="utf-8") as f:
    f.writelines(f"{entry.text}\n" for entry in transcript)

print(f"\nTotal segments: {len(transcript)}")
print(f"Individual files saved in: {chunks_dir}/")
print(f"Complete transcript saved as: {complete_filename}")