Spaces:
Runtime error
Runtime error
Add get subtitle with type support.
Browse files- fetchYoutubeSubtitle.py +85 -16
- main.py +2 -2
fetchYoutubeSubtitle.py
CHANGED
@@ -1,16 +1,51 @@
|
|
1 |
import json
|
|
|
|
|
2 |
from typing import Optional
|
|
|
|
|
3 |
import yt_dlp
|
4 |
|
5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
langs = item.keys()
|
7 |
-
|
8 |
-
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
return subtitle.get("url")
|
11 |
return None
|
12 |
|
13 |
-
async def fetchSubtitle(url: str, lang: Optional[str] = 'en',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
ydl_opts = {
|
15 |
"writesubtitles": True,
|
16 |
"allsubtitles": True,
|
@@ -21,19 +56,53 @@ async def fetchSubtitle(url: str, lang: Optional[str] = 'en', vttType="vtt") ->
|
|
21 |
|
22 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
23 |
info_dict = ydl.extract_info(url, download=False)
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
subtitle_url = getVttUrlFromSubtitles(info_dict.get("automatic_captions"), lang, vttType)
|
32 |
-
if subtitle_url:
|
33 |
-
with ydl.urlopen(subtitle_url) as subtitle:
|
34 |
-
return subtitle.read().decode()
|
35 |
return None
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
async def fetchSubtitleUrls(url: str) -> json:
|
38 |
ydl_opts = {
|
39 |
"writesubtitles": True,
|
|
|
1 |
import json
|
2 |
+
import math
|
3 |
+
import time
|
4 |
from typing import Optional
|
5 |
+
import xml.etree.ElementTree as ElementTree
|
6 |
+
from html import unescape
|
7 |
import yt_dlp
|
8 |
|
9 |
+
# yt-dlp subtitle types: json3, srv1, srv2, srv3, ttml, vtt; "xml" here means the YouTube subtitle URL with its "fmt=<ext>" argument removed
|
10 |
+
|
11 |
+
# "subtitles": {
|
12 |
+
# "live_chat": [
|
13 |
+
# {
|
14 |
+
# "url": "https://www.youtube.com/watch?v=ANtM2bHRz04&bpctr=9999999999&has_verified=1",
|
15 |
+
# "ext": "json",
|
16 |
+
# "video_id": "ANtM2bHRz04",
|
17 |
+
# "protocol": "youtube_live_chat_replay"
|
18 |
+
# }
|
19 |
+
# ]
|
20 |
+
# }
|
21 |
+
def getUrlFromSubtitles(item, lang='en', subType="vtt"):
    """Pick a subtitle URL out of a language -> entries mapping.

    :param item:
        Mapping of language code to a list of subtitle entries, where each
        entry is a dict that is read for its "url" and "ext" keys. ``None``
        or an empty mapping yields ``None``.
    :param str lang:
        Preferred language. Falls back to 'en' and then to the first
        language in the mapping when the preferred one is absent.
    :param str subType:
        Wanted subtitle extension. The special value "xml" (for any
        language other than "live_chat") returns the first usable entry's
        URL with its "fmt=<ext>" argument stripped.
    :returns:
        The matching URL, or ``None`` when nothing matches.
    """
    # Guard against a missing/empty mapping instead of failing on .keys().
    if not item:
        return None

    if lang in item:
        chosen = lang
    elif 'en' in item:
        chosen = 'en'
    else:
        chosen = next(iter(item))
    print("getUrlFromSubtitles l: %s, item: %s" % (chosen, item))

    for subtitle in item.get(chosen) or []:
        print("getUrlFromSubtitles subtitle: %s" % subtitle)
        url = subtitle.get("url")
        ext = subtitle.get("ext")
        if chosen != "live_chat" and subType == "xml":
            # An entry without a URL cannot be used; try the next one
            # rather than calling .replace() on None.
            if not url:
                continue
            # Only strip the format argument when the extension is known.
            return url.replace("fmt=" + ext, "") if ext else url
        if ext == subType:
            return url
    return None
|
36 |
|
37 |
+
async def fetchSubtitle(url: str, lang: Optional[str] = 'en', subType: Optional[str] = "vtt") -> Optional[str]:
    """Fetch a subtitle track for ``url`` in the requested format.

    For every ``subType`` other than "srt" the request is passed straight to
    ``fetchSubtitlebyType``. For "srt" a native "srt" track is tried first;
    if none comes back, the "xml" track is fetched and converted with
    ``xml_caption_to_srt``.

    :param str url: Video URL handed on to ``fetchSubtitlebyType``.
    :param lang: Preferred subtitle language.
    :param subType: Wanted subtitle format.
    :returns:
        The subtitle text, or ``None`` when no track could be fetched.
    """
    if subType != "srt":
        return await fetchSubtitlebyType(url, lang, subType, True)

    subtitle = await fetchSubtitlebyType(url, lang, subType, True)
    if subtitle:
        return subtitle

    subtitle = await fetchSubtitlebyType(url, lang, "xml", True)
    print(subtitle)
    # Without an XML track there is nothing to convert; return None
    # instead of letting the XML parser fail on a None/empty payload.
    if not subtitle:
        return None
    return xml_caption_to_srt(subtitle)
|
47 |
+
|
48 |
+
async def fetchSubtitlebyType(url: str, lang: Optional[str] = 'en', subType="vtt", decode: bool = False) -> Optional[str]:
|
49 |
ydl_opts = {
|
50 |
"writesubtitles": True,
|
51 |
"allsubtitles": True,
|
|
|
56 |
|
57 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
58 |
info_dict = ydl.extract_info(url, download=False)
|
59 |
+
for subtitle_item in ["subtitles", "automatic_captions"]: # "requested_subtitles" item is dict
|
60 |
+
if info_dict.get(subtitle_item) :
|
61 |
+
subtitle_url = getUrlFromSubtitles(info_dict.get(subtitle_item), lang, subType)
|
62 |
+
if subtitle_url:
|
63 |
+
with ydl.urlopen(subtitle_url) as subtitle:
|
64 |
+
return subtitle.read().decode() if decode else subtitle.read()
|
65 |
+
|
|
|
|
|
|
|
|
|
66 |
return None
|
67 |
|
68 |
+
def float_to_srt_time_format(d: float) -> str:
    """Convert a duration in seconds into a SubRip (srt) timestamp.

    The value is rounded to whole milliseconds before it is split into
    fields, so a fraction that rounds up to a full second carries into the
    seconds field (``3.9996`` -> ``'00:00:04,000'``) instead of corrupting
    the millisecond field. Hours are not wrapped at 24, and negative input
    is clamped to zero.

    :param float d: Duration in seconds.
    :rtype: str
    :returns:
        SubRip Subtitle (str) formatted time duration.
        float_to_srt_time_format(3.89) -> '00:00:03,890'
    """
    total_ms = max(0, round(d * 1000))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
|
79 |
+
|
80 |
+
def xml_caption_to_srt(xml_captions: str) -> str:
    """Render XML caption tracks as "SubRip Subtitle (srt)" text.

    Each child element of the XML root becomes one numbered srt cue. The
    cue starts at the element's "start" attribute and lasts for its "dur"
    attribute (0.0 when "dur" is absent).

    :param str xml_captions:
        XML formatted caption tracks.
    :rtype: str
    :returns:
        The cues joined by blank lines, with surrounding whitespace
        stripped.
    """
    cue_template = "{seq}\n{start} --> {end}\n{text}\n"
    cues = []
    for number, node in enumerate(ElementTree.fromstring(xml_captions), start=1):
        raw_text = node.text or ""
        # NOTE(review): the second replace swaps a single space for a single
        # space as written (a no-op); possibly meant to collapse double
        # spaces. Confirm against the original file.
        flattened = raw_text.replace("\n", " ").replace(" ", " ")
        caption = unescape(flattened)

        # "dur" is optional and is read before the mandatory "start".
        attrs = node.attrib
        length = float(attrs["dur"]) if "dur" in attrs else 0.0
        begin = float(attrs["start"])
        finish = begin + length

        cues.append(
            cue_template.format(
                seq=number,  # srt sequence numbers are 1-based
                start=float_to_srt_time_format(begin),
                end=float_to_srt_time_format(finish),
                text=caption,
            )
        )
    return "\n".join(cues).strip()
|
105 |
+
|
106 |
async def fetchSubtitleUrls(url: str) -> json:
|
107 |
ydl_opts = {
|
108 |
"writesubtitles": True,
|
main.py
CHANGED
@@ -15,8 +15,8 @@ def read_json():
|
|
15 |
|
16 |
|
17 |
@app.get("/subtitle/")
|
18 |
-
async def get_subtitle(url: str):
|
19 |
-
subtitle = await fetchSubtitle(url)
|
20 |
return JSONResponse(content=subtitle)
|
21 |
|
22 |
|
|
|
15 |
|
16 |
|
17 |
@app.get("/subtitle/")
async def get_subtitle(url: str, subtype: str="srt"):
    # Handler for GET /subtitle/: fetch the subtitle for `url` in the
    # requested `subtype` (defaults to "srt") and return it as the JSON
    # response body.
    return JSONResponse(content=await fetchSubtitle(url, subType=subtype))
|
21 |
|
22 |
|