Skip to content

Commit

Permalink
fix subtitle parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
arsenstorm committed Jan 13, 2025
1 parent 184efe1 commit 86b73dd
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 12 deletions.
1 change: 0 additions & 1 deletion youtube-dl/pyproject.toml
Original file line number Diff line number Diff line change
@@ -9,6 +9,5 @@ dependencies = [
 "Flask==2.3.2",
 "flask-cors>=4.0.2",
 "python-dotenv==1.0.0",
-"webvtt-py>=0.5.1",
 "yt-dlp==2024.12.23",
 ]
21 changes: 11 additions & 10 deletions youtube-dl/src/main.py
Original file line number Diff line number Diff line change
@@ -6,9 +6,9 @@
 import boto3
 from botocore.client import Config
 from pathlib import Path
-from webvtt import WebVTT
 import math
 from concurrent.futures import ThreadPoolExecutor
+import html

 load_dotenv()

@@ -45,14 +45,14 @@ def get_id_from_url(url):

 def process_subtitle(subtitle_file):
 subtitles = []
-previous_text = None

-for caption in WebVTT().read(subtitle_file, encoding='utf-8'):
-cleaned_text = caption.text.encode('utf-8', errors='ignore').decode('utf-8')
-current_text = f"{caption.start}: {cleaned_text}"
-if current_text != previous_text:
-subtitles.append(current_text)
-previous_text = current_text
+with open(subtitle_file, 'r', encoding='utf-8') as f:
+for line in f.readlines():
+if "<p" in line:
+start = line.split("begin=")[1].split("\"")[1]
+text = line.split(">")[1].split("</p")[0]
+cleaned_text = html.unescape(text)
+subtitles.append(f"{start}: {cleaned_text}")

 return '\n'.join(subtitles)

@@ -154,7 +154,7 @@ def download():
 'writesubtitles': True,
 'writeautomaticsub': True,
 'subtitleslangs': ['en'],
-'subtitlesformat': 'vtt',
+'subtitlesformat': 'ttml',
 'nocheckcertificate': True,
 # 'quiet': True,
 # 'no_warnings': True,
@@ -198,13 +198,14 @@ def download():
 description = info.get('description', 'No Description')

 subtitles = "No subtitles available."
-subtitle_file = DOWNLOAD_DIR / f"{video_id}.en.vtt"
+subtitle_file = DOWNLOAD_DIR / f"{video_id}.en.ttml"

 if subtitle_file.is_file():
 subtitles = process_subtitle(subtitle_file)

 subtitle_file = save_text_file(
 subtitles, f"{video_id}_subtitles.txt")
+print(subtitles[:300])

 # Save metadata
 metadata_content = f"<title>{
Expand Down
24 changes: 23 additions & 1 deletion youtube-dl/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 86b73dd

Please sign in to comment.