TuberTranscript / app.py
Lookimi's picture
Update app.py
4bcf60e
raw
history blame
2.29 kB
#importing the necessary modules
import html
import os
import re
import time
import urllib.request

import gradio as gr
#Creating a Gradio App Menu

#base URL that every relative link scraped from a page is appended to
_YOUTUBE_BASE = 'https://www.youtube.com'
#captures the video ID of every /watch?v=... link in a page
_VIDEO_ID_RE = re.compile(r'\/watch\?v=([A-Za-z0-9_.-]+)')
#characters accepted inside a scraped query string; unlike the earlier
#[A-Za-z0-9_.-] class this includes '=' and '&', so the match no longer
#stops at the first "key=value" pair
_QUERY_CHARS = r'[A-Za-z0-9_.=&%;,+-]*'
_EDITOR_RE = re.compile(r'\/timedtext_editor\?' + _QUERY_CHARS)
_DOWNLOAD_RE = re.compile(r'\/api\/timedtext\?' + _QUERY_CHARS)
#urllib.error.URLError/HTTPError derive from OSError; urlopen raises
#ValueError for a malformed URL
_FETCH_ERRORS = (OSError, ValueError)
_INVALID_URL_MESSAGE = "Please enter a valid http(s) YouTube channel URL."


def _fetch_text(url):
    """Return the body of *url* decoded as UTF-8, closing the response afterwards."""
    with urllib.request.urlopen(url) as response:
        #undecodable bytes are replaced so one bad byte cannot abort the run
        return response.read().decode("utf-8", errors="replace")


def _first_link(pattern, page_source):
    """Return the first match of *pattern* in *page_source* as an absolute URL, or None."""
    match = pattern.search(page_source)
    if match is None:
        return None
    #links taken from HTML attributes carry '&amp;' instead of '&'
    return _YOUTUBE_BASE + html.unescape(match.group(0))


def _download_transcript(video_id):
    """Save the transcript of *video_id* as Transcripts/<video_id>.xml.

    Follows the same three hops as before: video page -> timedtext_editor
    link -> api/timedtext link -> download.
    NOTE(review): this relies on YouTube pages embedding those links in
    their source; confirm against the current page markup.

    Returns True when a file was downloaded, False when either link is
    missing from the page it was expected on. Network and file errors
    propagate to the caller.
    """
    video_source = _fetch_text(_YOUTUBE_BASE + '/watch?v=' + video_id)
    editor_url = _first_link(_EDITOR_RE, video_source)
    if editor_url is None:
        return False
    download_url = _first_link(_DOWNLOAD_RE, _fetch_text(editor_url))
    if download_url is None:
        return False
    urllib.request.urlretrieve(download_url, "Transcripts/" + video_id + ".xml")
    return True


def transcript_extract(channel_url=""):
    """Download the transcripts of the videos linked from a channel page.

    Args:
        channel_url: http(s) URL of the channel page, as typed into the
            Gradio textbox. Defaults to "" so a call without arguments
            stays valid and simply reports that a URL is required.

    Returns:
        A newline-separated status report (one line per video), or a
        single error line when the URL is rejected or the channel page
        cannot be fetched. Each line is also printed to stdout.
    """
    report_lines = []

    def report(message):
        #keep the console output and also collect it for the Gradio textbox
        print(message)
        report_lines.append(message)

    if not isinstance(channel_url, str):
        return _INVALID_URL_MESSAGE
    channel_url = channel_url.strip()
    #the URL comes from an untrusted web form: refuse file://, ftp://, etc.
    if not channel_url.lower().startswith(("http://", "https://")):
        return _INVALID_URL_MESSAGE

    #accessing the webpage and reading the source code
    try:
        channel_source = _fetch_text(channel_url)
    except _FETCH_ERRORS as err:
        return "Could not open channel page: " + str(err)

    #creating a directory to save the transcripts
    os.makedirs('Transcripts', exist_ok=True)

    #finding the videos; dict.fromkeys drops repeated links but keeps page order
    video_ids = list(dict.fromkeys(_VIDEO_ID_RE.findall(channel_source)))
    if not video_ids:
        return "No videos found on the channel page."

    #looping through each video to download its transcript
    for video_id in video_ids:
        try:
            downloaded = _download_transcript(video_id)
        except _FETCH_ERRORS as err:
            #one failing video must not abort the remaining downloads
            report("Failed to download transcript for video " + video_id + ": " + str(err))
            continue
        if downloaded:
            report("Downloading transcript for video " + video_id + "...")
            #pause between downloads, as the original loop did
            time.sleep(3)
        else:
            report("Transcript not available for video " + video_id)

    return "\n".join(report_lines)


#launch the gradio
if __name__ == "__main__":
    gr.Interface(
        fn=transcript_extract,
        inputs=gr.inputs.Textbox(label="Channel URL"),
        outputs="textbox",
    ).launch()