ytdlp_subtitle_dev

Sleeping

App Files Files Community

ytdlp_subtitle_dev / fetchYoutubeSubtitle.py

lanbogao

Add proxy & update fetchSubtitleUrls.

d8804e5 over 1 year ago

raw

history blame

5.4 kB

	import json
	import math
	import time
	from typing import Optional
	import xml.etree.ElementTree as ElementTree
	from html import unescape
	import yt_dlp

	# yt-dlp subtitle types: json3, srv1, srv2, srv3, ttml, vtt, xml (the YouTube track URL without its "&fmt=<ext>" argument)

	# "subtitles": {
	# "live_chat": [
	# {
	# "url": "https://www.youtube.com/watch?v=ANtM2bHRz04&bpctr=9999999999&has_verified=1",
	# "ext": "json",
	# "video_id": "ANtM2bHRz04",
	# "protocol": "youtube_live_chat_replay"
	# }
	# ]
	# }
	def getUrlFromSubtitles(item, lang='en', subType="vtt"):
	    """Pick one subtitle URL out of a language -> tracks mapping.

	    :param item: mapping of language code to a list of track dicts; each
	        track is read through its ``"url"`` and ``"ext"`` keys.
	    :param lang: preferred language. When it is not a key of ``item`` the
	        function falls back to ``'en'`` and then to the first key of ``item``.
	    :param subType: wanted track extension. The special value ``"xml"``
	        returns the first usable track URL with its ``"&fmt=<ext>"``
	        argument removed (not applied to the ``"live_chat"`` entry).
	    :returns: the selected URL, or ``None`` when nothing matches.
	    """
	    if not item:
	        return None

	    if lang in item:
	        chosen = lang
	    elif 'en' in item:
	        chosen = 'en'
	    else:
	        chosen = next(iter(item))
	    if chosen is None:
	        return None

	    for subtitle in item[chosen] or ():
	        track_url = subtitle.get("url")
	        if not track_url:
	            # A track without a URL can never be downloaded; skip it rather
	            # than crash on ``None.replace`` or hand ``None`` to the caller
	            # while a later track would have matched.
	            continue
	        track_ext = subtitle.get("ext")
	        if chosen != "live_chat" and subType == "xml":
	            # Without a known ext there is no "&fmt=" argument to strip.
	            if track_ext:
	                return track_url.replace("&fmt=" + track_ext, "")
	            return track_url
	        if track_ext == subType:
	            return track_url
	    return None

	async def fetchSubtitle(url: str, lang: Optional[str] = 'en', subType: Optional[str] = "vtt", proxy: Optional[str] = None) -> dict:
	    """Fetch subtitles for ``url``; a thin alias for ``fetchSubtitlebyType``.

	    Every argument is forwarded unchanged and the awaited result of
	    ``fetchSubtitlebyType`` is returned as-is.
	    """
	    result = await fetchSubtitlebyType(url=url, lang=lang, subType=subType, proxy=proxy)
	    return result

	async def fetchSubtitlebyType(url: str, lang: Optional[str] = 'en', subType: Optional[str] = "vtt", proxy: Optional[str] = None) -> dict:
	    """Extract metadata for ``url`` with yt-dlp and download one subtitle track.

	    :param url: page URL handed to ``YoutubeDL.extract_info``.
	    :param lang: preferred subtitle language (see ``getUrlFromSubtitles``
	        for the fallback order).
	    :param subType: wanted track format. For the ``youtube`` extractor a
	        request for ``"srt"`` is served by downloading the ``"xml"`` track
	        and converting it with ``xml_caption_to_srt``.
	    :param proxy: optional proxy passed to yt-dlp through the ``proxy`` option.
	    :returns: ``{"title", "duration", "subtitle", "chapters"}`` on success,
	        ``{"title", "duration", "error": "No subtitles"}`` when no track URL
	        was found, or ``{"error": <message>}`` when anything raised.
	    """
	    options = {
	        "noplaylist": True,
	        "writesubtitles": False,
	        "allsubtitles": True,
	        "subtitleslangs": [lang] if lang else [],
	        "skip_download": True,
	        "socket_timeout": 20,
	    }
	    if proxy:
	        options["proxy"] = proxy

	    title, duration = "unknow", ""
	    try:
	        with yt_dlp.YoutubeDL(options) as ydl:
	            info = ydl.extract_info(url, download=False)
	            title = info.get("title", "unknow")
	            length = info.get("duration")
	            duration = str(length) if length else ""

	            # YouTube "srt" requests are answered from the xml track and
	            # converted locally after download.
	            convert_to_srt = info.get("extractor") == "youtube" and subType == "srt"
	            if convert_to_srt:
	                subType = "xml"

	            # Manual subtitles take priority over automatic captions.
	            # ("requested_subtitles" is a dict, so it is not scanned here.)
	            for source in ("subtitles", "automatic_captions"):
	                tracks = info.get(source)
	                if not tracks:
	                    continue
	                subtitle_url = getUrlFromSubtitles(tracks, lang, subType)
	                if not subtitle_url:
	                    continue
	                with ydl.urlopen(subtitle_url) as response:
	                    body = response.read().decode()
	                    subtitle = xml_caption_to_srt(body) if convert_to_srt else body
	                    print("url{}, title:{}, duration:{} len(subtitle): {}".format(url, title, duration, len(subtitle)))
	                    return {"title": title, "duration": duration, "subtitle": subtitle, "chapters": info.get("chapters", None)}
	    except Exception as e:
	        # Boundary handler: every failure is reported to the caller as data.
	        return {"error": str(e)}
	    return {"title": title, "duration": duration, "error": "No subtitles"}

	def float_to_srt_time_format(d: float) -> str:
	    """Convert a duration in seconds into a SubRip (srt) timestamp.

	    The value is rounded to whole milliseconds first, so a fraction that
	    rounds up carries into the seconds field (``3.9996`` becomes
	    ``'00:00:04,000'``). Hours are not wrapped at 24, and negative input is
	    clamped to zero.

	    :param d: duration in seconds.
	    :returns: timestamp formatted as ``HH:MM:SS,mmm``, e.g.
	        ``float_to_srt_time_format(3.89) -> '00:00:03,890'``.
	    """
	    total_ms = max(0, round(d * 1000))
	    hours, remainder = divmod(total_ms, 3_600_000)
	    minutes, remainder = divmod(remainder, 60_000)
	    seconds, millis = divmod(remainder, 1000)
	    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"

	def xml_caption_to_srt( xml_captions: str) -> str:
	    """Convert xml caption tracks to "SubRip Subtitle (srt)".

	    Each direct child of the XML root becomes one numbered srt cue. Its
	    ``start`` attribute is required, ``dur`` defaults to ``0.0`` when
	    absent, and both are passed to ``float_to_srt_time_format``. Newlines
	    and runs of spaces in the cue text are collapsed to a single space
	    before HTML entities are unescaped.

	    :param str xml_captions:
	        XML formatted caption tracks.
	    :returns: the srt document, or ``""`` when the root has no children.
	    :raises KeyError: when a child element has no ``start`` attribute.
	    """
	    import re  # local import: this file does not import ``re`` at the top

	    segments = []
	    root = ElementTree.fromstring(xml_captions)
	    # srt sequence numbers are 1-based.
	    for sequence_number, child in enumerate(root, start=1):
	        text = child.text or ""
	        caption = unescape(re.sub(r"[ \n]+", " ", text))
	        try:
	            duration = float(child.attrib["dur"])
	        except KeyError:
	            duration = 0.0
	        start = float(child.attrib["start"])
	        end = start + duration
	        line = "{seq}\n{start} --> {end}\n{text}\n".format(
	            seq=sequence_number,
	            start=float_to_srt_time_format(start),
	            end=float_to_srt_time_format(end),
	            text=caption,
	        )
	        segments.append(line)
	    return "\n".join(segments).strip()

	async def fetchSubtitleUrls(url: str, proxy: Optional[str] = None) -> dict:
	    """Return the subtitle track listings yt-dlp reports for ``url``.

	    Nothing is downloaded; the ``subtitles`` and ``automatic_captions``
	    entries of the extracted info dict are passed through untouched.

	    :param url: page URL handed to ``YoutubeDL.extract_info``.
	    :param proxy: optional proxy passed to yt-dlp through the ``proxy`` option.
	    :returns: ``{"title", "duration", "subtitles", "automatic_captions"}``
	        on success, or ``{"error": <message>}`` when anything raised.
	    """
	    ydl_opts = {
	        "noplaylist": True,
	        "writesubtitles": False,
	        "allsubtitles": True,
	        "skip_download": True,
	    }
	    if proxy:
	        ydl_opts["proxy"] = proxy

	    try:
	        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
	            info_dict = ydl.extract_info(url, download=False)
	            seconds = info_dict.get("duration")
	            return {
	                "title": info_dict.get("title", "unknow"),
	                "duration": str(seconds) if seconds else "",
	                "subtitles": info_dict.get("subtitles"),
	                "automatic_captions": info_dict.get("automatic_captions"),
	            }
	    except Exception as e:
	        # Boundary handler: every failure is reported to the caller as data.
	        return {"error": str(e)}