ytdlp_subtitle_stranscript / fetchYoutubeSubtitle.py
lanbogao's picture
Return None if no text in xml, due to no subtitle and automatic_captions with gibberish except xml(xml format text part is empty) in this video https://youtu.be/LI0mzC6sl7w
4868530
raw
history blame
9.56 kB
import os
import json
import math
import time
import traceback
from typing import Optional
import xml.etree.ElementTree as ElementTree
from html import unescape
import yt_dlp
# Diagnostic switch: any non-empty DEBUG environment variable enables the
# print() calls guarded by `if debug:` in this module.
debug = os.getenv("DEBUG")
# yt-dlp subtitle types: json3, srv1, srv2, srv3, ttml, vtt, xml (YouTube URL without the "&fmt=" argument)
# "subtitles": {
# "live_chat": [
# {
# "url": "https://www.youtube.com/watch?v=ANtM2bHRz04&bpctr=9999999999&has_verified=1",
# "ext": "json",
# "video_id": "ANtM2bHRz04",
# "protocol": "youtube_live_chat_replay"
# }
# ]
# }
def getUrlFromSubtitleItem(item, lang="en", subType="vtt"):
    """Pick a download URL for one language out of a subtitle mapping.

    :param item:
        Mapping of language code -> list of format dicts. Each format dict
        is read through its "url" and "ext" keys.
    :param str lang:
        Key of ``item`` to inspect. It must exist in ``item`` (KeyError
        otherwise, as before).
    :param str subType:
        Wanted format. "xml" is special-cased for every language except
        "live_chat": the first entry that has a URL is taken and its
        "&fmt=<ext>" query argument is removed.
    :returns:
        The URL string, or None when no usable entry exists.
    """
    wants_plain_xml = lang != "live_chat" and subType == "xml"
    for subtitle in item[lang] or ():
        url = subtitle.get("url")
        if not url:
            # An entry without a URL cannot be downloaded; skip it instead of
            # crashing on None.replace(...) below.
            continue
        ext = subtitle.get("ext")
        if wants_plain_xml:
            if debug:
                print("subtitle source lang:{} url: {}".format(lang, url))
            # Without an "ext" there is no "&fmt=" argument to strip.
            return url.replace("&fmt=" + ext, "") if ext else url
        if ext == subType:
            if debug:
                print("subtitle lang:{} url: {}".format(lang, url))
            return url
    return None
def getRequestedSubtitlesUrl(info_dict, lang, subType):
    """Look for a subtitle URL among the tracks listed in "requested_subtitles".

    :param dict info_dict:
        Extraction result; only its "requested_subtitles" entry is read.
    :param str lang:
        Language prefix; every requested code starting with it is tried in
        mapping order.
    :param str subType:
        Wanted subtitle format, forwarded to getUrlFromSubtitleItem.
    :returns:
        The first URL found, or None.
    """
    requested = info_dict.get("requested_subtitles")
    if not requested:
        return None

    for code, entry in requested.items():
        if not code.startswith(lang):
            continue
        # getUrlFromSubtitleItem expects language -> list of format dicts.
        # A single dict is wrapped in a fresh one-key mapping; the original
        # mapping is left untouched so later languages can still be looked up
        # (rebinding it used to raise KeyError on the next matching code).
        candidates = {code: [entry]} if isinstance(entry, dict) else requested
        url = getUrlFromSubtitleItem(candidates, code, subType)
        if url:
            if debug:
                print("getRequestedSubtitlesUrl lang:{} url:{}".format(code, url))
            return url
    return None
def getSubtitleLangUrl(
    info_dict,
    lang="en",
    subType="vtt",
    subTitleKeys=("subtitles", "automatic_captions"),
):
    """Find a subtitle URL for ``lang``, preferring an exact code match.

    Two passes are made over the mappings named by ``subTitleKeys``: first
    for a language code equal to ``lang``, then for any code that starts
    with ``lang``.

    :param dict info_dict:
        Extraction result holding the subtitle mappings.
    :param str lang:
        Language code or prefix to look for.
    :param str subType:
        Wanted subtitle format, forwarded to getUrlFromSubtitleItem.
    :param subTitleKeys:
        Keys of ``info_dict`` to search, in priority order.
    :returns:
        The first URL found, or None.
    """
    # A missing key and an explicit None both mean "no tracks of this kind";
    # calling .keys() on None used to raise AttributeError here.
    sources = [info_dict.get(key) or {} for key in subTitleKeys]

    # Pass 1: exact language code.
    for tracks in sources:
        if lang in tracks:
            url = getUrlFromSubtitleItem(tracks, lang, subType)
            if url:
                if debug:
                    print("getSubtitleLangUrl lang:{}".format(lang))
                return url

    # Pass 2: any code sharing the prefix.
    for tracks in sources:
        for code in tracks:
            if code.startswith(lang):
                url = getUrlFromSubtitleItem(tracks, code, subType)
                if url:
                    if debug:
                        print("getSubtitleLangUrl lang:{} url:{}".format(code, url))
                    return url
    return None
def getSubtitleOtherUrl(
    info_dict,
    lang="en",
    subType="vtt",
    subTitleKeys=("subtitles", "automatic_captions"),
):
    """Fallback lookup: accept a track in some other language.

    For each mapping named by ``subTitleKeys`` one language is chosen:
    ``lang`` if present, else "en" if present, else the first code in the
    mapping. The first URL obtained for that choice is returned.

    :param dict info_dict:
        Extraction result holding the subtitle mappings.
    :param str lang:
        Preferred language code.
    :param str subType:
        Wanted subtitle format, forwarded to getUrlFromSubtitleItem.
    :param subTitleKeys:
        Keys of ``info_dict`` to search, in priority order.
    :returns:
        The first URL found, or None.
    """
    for key in subTitleKeys:
        # A missing or None mapping is treated as empty instead of raising
        # AttributeError on None.keys().
        tracks = info_dict.get(key) or {}
        if not tracks:
            continue

        if lang in tracks:
            code = lang
        elif "en" in tracks:
            code = "en"
        else:
            code = next(iter(tracks))
        if code is None:
            continue

        url = getUrlFromSubtitleItem(tracks, code, subType)
        if url:
            if debug:
                print("getSubtitleOtherUrl lang:{} url:{}".format(code, url))
            return url
    return None
async def fetchSubtitle(
    url: str,
    lang: Optional[str] = "en",
    subType: Optional[str] = "vtt",
    proxy: Optional[str] = None,
) -> dict:
    """Fetch one subtitle track for ``url``.

    Thin alias of fetchSubtitlebyType: all four arguments are forwarded
    unchanged and its result dict is returned as-is.
    """
    result = await fetchSubtitlebyType(url, lang, subType, proxy)
    return result
async def fetchSubtitlebyType(
    url: str,
    lang: Optional[str] = "en",
    subType: Optional[str] = "vtt",
    proxy: Optional[str] = None,
) -> dict:
    """Extract video metadata with yt-dlp and download one subtitle track.

    The subtitle URL is searched for in this order: the tracks yt-dlp marked
    as requested, then any track matching ``lang``, then any other track.

    NOTE: nothing in this body is awaited, so the yt-dlp calls run
    synchronously inside the caller's event loop.

    :param str url:
        Video URL handed to yt-dlp.
    :param lang:
        Language code or prefix; None is treated as "en".
    :param subType:
        Wanted subtitle format. For the "youtube" extractor "srt" is produced
        by downloading the xml track and converting it.
    :param proxy:
        Optional proxy URL; when given the socket timeout is raised to 20 s.
    :returns:
        ``{"title", "duration", "subtitle", "chapters"}`` on success,
        ``{"title", "duration", "error": "No subtitles"}`` when no track (or
        only an empty xml track) is available, or ``{"error": message}`` when
        an exception was raised.
    """
    # The signature allows None; fall back to the default instead of crashing
    # on lang.split() below.
    if lang is None:
        lang = "en"

    # lang-code or lang.* .* is regex: a full code such as "zh-Hans" or an
    # explicit pattern is passed through, a bare language gets ".*" appended.
    reqLang = lang if len(lang.split("-")) > 1 or lang.endswith(".*") else lang + ".*"

    ydl_opts = {
        "noplaylist": True,
        "writesubtitles": True,
        "writeautomaticsub": True,
        # "listsubtitles": True,
        # "subtitlesformat": subType, # mark due to default youtube no srt and xml format
        "subtitleslangs": [reqLang],
        "skip_download": True,
        "socket_timeout": 10,
        "extractor_retries": 0,
        # "debug_printtraffic": True,
        "extractor_args": {
            "youtube": {
                "player_skip": [
                    "configs",
                    "initial",
                ],  # skip "webpage" will cause l2P5PgL1LfI missing some langs,
                "player_client": ["ios"],
                "skip": ["hls", "dash"],  # don't skip "translated_subs"
            }
        },
    }
    if proxy:
        ydl_opts.update({"proxy": proxy, "socket_timeout": 20})

    title = "unknow"
    duration = ""
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=False)
            title = info_dict.get("title", "unknow")
            seconds = info_dict.get("duration")
            duration = str(seconds) if seconds else ""

            # The youtube track list has no srt format: fetch xml and convert.
            isSrt = False
            if info_dict.get("extractor") == "youtube" and subType == "srt":
                subType = "xml"
                isSrt = True

            if debug:
                print(
                    "subtitles.keys(): {} automatic_captions: {}".format(
                        (info_dict.get("subtitles") or {}).keys(),
                        (info_dict.get("automatic_captions") or {}).keys(),
                    )
                )

            subtitle_url = (
                getRequestedSubtitlesUrl(info_dict, lang, subType)
                or getSubtitleLangUrl(info_dict, lang, subType)
                or getSubtitleOtherUrl(info_dict, lang, subType)
            )

            if subtitle_url:
                with ydl.urlopen(subtitle_url) as response:
                    body = response.read().decode()
                subtitle = xml_caption_to_srt(body) if isSrt else body

                # xml_caption_to_srt returns None for a track without text.
                # Fall through to the "No subtitles" result instead of
                # failing on len(None) and reporting a misleading error.
                if subtitle is not None:
                    print(
                        "url:{}, title:{}, duration:{} len(subtitle): {}".format(
                            url, title, duration, len(subtitle)
                        )
                    )
                    return {
                        "title": title,
                        "duration": duration,
                        "subtitle": subtitle,
                        "chapters": info_dict.get("chapters", None),
                    }
    except Exception as e:
        # Top-level boundary: report the failure to the caller as data.
        print(e)
        traceback.print_exc()
        return {"error": str(e)}

    return {"title": title, "duration": duration, "error": "No subtitles"}
def float_to_srt_time_format(d: float) -> str:
    """Convert a duration in seconds into an SRT timestamp.

    The value is rounded to whole milliseconds first, so a fraction that
    rounds up carries into the seconds (0.9996 -> '00:00:01,000'), and the
    hour field keeps counting past 23 instead of wrapping around.
    Negative input is clamped to zero.

    :param float d:
        Duration in seconds.
    :rtype: str
    :returns:
        SubRip Subtitle (str) formatted time duration.

    float_to_srt_time_format(3.89) -> '00:00:03,890'
    """
    total_ms = max(0, round(d * 1000))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
def xml_caption_to_srt(xml_captions: str) -> Optional[str]:
    """Convert xml caption tracks to "SubRip Subtitle (srt)".

    Each child of the root element becomes one cue. Its text has newlines
    replaced by spaces, double spaces collapsed and HTML entities unescaped;
    timing comes from the "start" attribute and the optional "dur"
    attribute (0 when absent). Children whose text is empty or only
    whitespace are skipped, and cues are numbered consecutively from 1.

    :param str xml_captions:
        XML formatted caption tracks.
    :returns:
        The srt text, or None when no child carries any text.
    :raises xml.etree.ElementTree.ParseError:
        If ``xml_captions`` is not well-formed XML.
    :raises KeyError:
        If a child with text has no "start" attribute.
    """
    segments = []
    root = ElementTree.fromstring(xml_captions)
    for child in root:
        raw = child.text or ""
        caption = unescape(raw.replace("\n", " ").replace("  ", " "))
        if not caption.strip():
            continue

        start = float(child.attrib["start"])
        duration = float(child.attrib.get("dur", 0.0))
        end = start + duration

        segments.append(
            "{seq}\n{start} --> {end}\n{text}\n".format(
                # Count emitted cues, not source elements, so that skipped
                # empty elements leave no gaps in the numbering.
                seq=len(segments) + 1,
                start=float_to_srt_time_format(start),
                end=float_to_srt_time_format(end),
                text=caption,
            )
        )

    # return None if no text in xml
    if not segments:
        return None
    return "\n".join(segments).strip()
async def fetchSubtitleUrls(url: str, proxy: Optional[str] = None) -> dict:
    """List the subtitle tracks yt-dlp reports for ``url`` without downloading.

    NOTE: nothing in this body is awaited, so the yt-dlp call runs
    synchronously inside the caller's event loop.

    :param str url:
        Video URL handed to yt-dlp.
    :param proxy:
        Optional proxy URL; when given the socket timeout is raised to 20 s.
    :returns:
        ``{"title", "duration", "subtitles", "automatic_captions"}`` on
        success, or ``{"error": message}`` when an exception was raised.
    """
    ydl_opts = {
        "noplaylist": True,
        "writesubtitles": True,
        "writeautomaticsub": True,
        # "allsubtitles": False,
        # "listsubtitles": True,
        "skip_download": True,
        "socket_timeout": 10,
        "extractor_retries": 0,
        # "debug_printtraffic": True,
        "extractor_args": {
            "youtube": {
                "player_skip": ["configs", "initial"],
                "player_client": ["ios"],
                "skip": ["hls", "dash"],  # , "translated_subs"
            }
        },
    }
    if proxy:
        ydl_opts.update({"proxy": proxy, "socket_timeout": 20})

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=False)
            seconds = info_dict.get("duration")
            return {
                "title": info_dict.get("title", "unknow"),
                "duration": str(seconds) if seconds else "",
                "subtitles": info_dict.get("subtitles"),
                "automatic_captions": info_dict.get("automatic_captions"),
            }
    except Exception as e:
        # Top-level boundary: report the failure to the caller as data.
        return {"error": str(e)}