Spaces:
Runtime error
Runtime error
yiyixuxu
committed on
Commit
•
96f1e87
1
Parent(s):
9855e99
add article
Browse files
app.py
CHANGED
@@ -17,9 +17,9 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
17 |
model, preprocess = clip.load("ViT-B/32")
|
18 |
|
19 |
|
20 |
-
def select_video_format(url, format_note='240p', ext='mp4', max_size =
|
21 |
defaults = ['480p', '360p','240p','144p']
|
22 |
-
ydl_opts =
|
23 |
ydl = youtube_dl.YoutubeDL(ydl_opts)
|
24 |
info_dict = ydl.extract_info(url, download=False)
|
25 |
formats = info_dict.get('formats', None)
|
@@ -28,21 +28,17 @@ def select_video_format(url, format_note='240p', ext='mp4', max_size = 50000000)
|
|
28 |
and f['vcodec'].split('.')[0] != 'av01'
|
29 |
and f['filesize'] is not None and f['filesize'] <= max_size]
|
30 |
available_format_notes = set([f['format_note'] for f in formats])
|
31 |
-
try:
|
32 |
-
if format_note not in available_format_notes:
|
33 |
-
format_note = [d for d in defaults if d in available_format_notes][0]
|
34 |
-
formats = [f for f in formats if f['format_note'] == format_note]
|
35 |
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
|
|
43 |
return(format, format_id, fps)
|
44 |
-
|
45 |
-
# to-do: delete saved videos
|
46 |
def download_video(url):
|
47 |
# create "videos" folder for saved videos
|
48 |
path_videos = Path('videos')
|
@@ -58,23 +54,24 @@ def download_video(url):
|
|
58 |
path_video.unlink()
|
59 |
print(f'removed video {path_video}')
|
60 |
# select format to download for given video
|
61 |
-
# by default select
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
'format':format_id,
|
66 |
'outtmpl': "videos/%(id)s.%(ext)s"}
|
67 |
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
|
|
78 |
return(fps, save_location)
|
79 |
|
80 |
def process_video_parallel(video, skip_frames, dest_path, num_processes, process_number):
|
@@ -90,7 +87,6 @@ def process_video_parallel(video, skip_frames, dest_path, num_processes, process
|
|
90 |
if count % skip_frames ==0:
|
91 |
filename =f"{dest_path}/{count}.jpg"
|
92 |
cv2.imwrite(filename, frame)
|
93 |
-
#print(f"saved {filename}")
|
94 |
count += 1
|
95 |
cap.release()
|
96 |
|
@@ -136,13 +132,14 @@ def captioned_strip(images, caption=None, times=None, rows=1):
|
|
136 |
draw.text((60, 3), caption, (255, 255, 255), font=font)
|
137 |
for i,ts in enumerate(times):
|
138 |
draw.text((
|
139 |
-
(i
|
140 |
-
i
|
141 |
, ts,
|
142 |
(255, 255, 255), font=font_small)
|
143 |
return img
|
144 |
|
145 |
def run_inference(url, sampling_interval, search_query, bs=526):
|
|
|
146 |
skip_frames, path_frames= vid2frames(url,sampling_interval)
|
147 |
if path_frames is not None:
|
148 |
filenames = sorted(path_frames.glob('*.jpg'),key=lambda p: int(p.stem))
|
@@ -173,9 +170,10 @@ def run_inference(url, sampling_interval, search_query, bs=526):
|
|
173 |
|
174 |
similarity = (100.0 * image_features @ text_features.T)
|
175 |
values, indices = similarity.topk(4, dim=0)
|
176 |
-
|
|
|
177 |
best_frames = [Image.open(filenames[ind]).convert("RGB") for ind in indices]
|
178 |
-
times = [f'{datetime.timedelta(seconds = ind[0].item() * sampling_interval)}' for ind in indices]
|
179 |
image_output = captioned_strip(best_frames,search_query, times,2)
|
180 |
title = search_query
|
181 |
print('task complete')
|
@@ -184,7 +182,7 @@ def run_inference(url, sampling_interval, search_query, bs=526):
|
|
184 |
image_output = None
|
185 |
return(title, image_output)
|
186 |
|
187 |
-
inputs = [gr.inputs.Textbox(label="Give us the link to your youtube video! (
|
188 |
gr.Number(5,label='sampling interval (seconds)'),
|
189 |
gr.inputs.Textbox(label="What do you want to search?")]
|
190 |
outputs = [
|
@@ -192,7 +190,7 @@ outputs = [
|
|
192 |
gr.outputs.Image(label=""),
|
193 |
]
|
194 |
|
195 |
-
|
196 |
|
197 |
gr.Interface(
|
198 |
run_inference,
|
@@ -200,10 +198,11 @@ gr.Interface(
|
|
200 |
outputs=outputs,
|
201 |
title="It Happened One Frame",
|
202 |
description='A CLIP-based app that search YouTube video frame based on text',
|
|
|
203 |
examples=[
|
204 |
['https://youtu.be/v1rkzUIL8oc', 1, "James Cagney dancing down the stairs"],
|
205 |
['https://youtu.be/k4R5wZs8cxI', 1, "James Cagney smashes a grapefruit into Mae Clarke's face"],
|
206 |
['https://youtu.be/0diCvgWv_ng', 1, "little Deborah practicing her ballet while wearing a tutu in empty restaurant"]
|
207 |
]
|
208 |
-
).launch(debug=True,enable_queue=True)
|
209 |
|
|
|
17 |
model, preprocess = clip.load("ViT-B/32")
|
18 |
|
19 |
|
20 |
+
def select_video_format(url, ydl_opts={}, format_note='240p', ext='mp4', max_size = 500000000):
|
21 |
defaults = ['480p', '360p','240p','144p']
|
22 |
+
ydl_opts = ydl_opts
|
23 |
ydl = youtube_dl.YoutubeDL(ydl_opts)
|
24 |
info_dict = ydl.extract_info(url, download=False)
|
25 |
formats = info_dict.get('formats', None)
|
|
|
28 |
and f['vcodec'].split('.')[0] != 'av01'
|
29 |
and f['filesize'] is not None and f['filesize'] <= max_size]
|
30 |
available_format_notes = set([f['format_note'] for f in formats])
|
|
|
|
|
|
|
|
|
31 |
|
32 |
+
if format_note not in available_format_notes:
|
33 |
+
format_note = [d for d in defaults if d in available_format_notes][0]
|
34 |
+
formats = [f for f in formats if f['format_note'] == format_note]
|
35 |
+
|
36 |
+
format = formats[0]
|
37 |
+
format_id = format.get('format_id', None)
|
38 |
+
fps = format.get('fps', None)
|
39 |
+
print(f'format selected: {format}')
|
40 |
return(format, format_id, fps)
|
41 |
+
|
|
|
42 |
def download_video(url):
|
43 |
# create "videos" folder for saved videos
|
44 |
path_videos = Path('videos')
|
|
|
54 |
path_video.unlink()
|
55 |
print(f'removed video {path_video}')
|
56 |
# select format to download for given video
|
57 |
+
# by default select 240p and .mp4
|
58 |
+
try:
|
59 |
+
format, format_id, fps = select_video_format(url)
|
60 |
+
ydl_opts = {
|
61 |
'format':format_id,
|
62 |
'outtmpl': "videos/%(id)s.%(ext)s"}
|
63 |
|
64 |
+
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
|
65 |
+
try:
|
66 |
+
ydl.cache.remove()
|
67 |
+
meta = ydl.extract_info(url)
|
68 |
+
save_location = 'videos/' + meta['id'] + '.' + meta['ext']
|
69 |
+
except youtube_dl.DownloadError as error:
|
70 |
+
print(f'error with download_video function: {error}')
|
71 |
+
save_location = None
|
72 |
+
except IndexError as err:
|
73 |
+
print(f"can't find suitable video formats. we are not able to process video larger than 95 Mib at the moment")
|
74 |
+
fps, save_location = None, None
|
75 |
return(fps, save_location)
|
76 |
|
77 |
def process_video_parallel(video, skip_frames, dest_path, num_processes, process_number):
|
|
|
87 |
if count % skip_frames ==0:
|
88 |
filename =f"{dest_path}/{count}.jpg"
|
89 |
cv2.imwrite(filename, frame)
|
|
|
90 |
count += 1
|
91 |
cap.release()
|
92 |
|
|
|
132 |
draw.text((60, 3), caption, (255, 255, 255), font=font)
|
133 |
for i,ts in enumerate(times):
|
134 |
draw.text((
|
135 |
+
(i // rows) * w + 40 , #column position
|
136 |
+
i % rows * h + 33) # row position
|
137 |
, ts,
|
138 |
(255, 255, 255), font=font_small)
|
139 |
return img
|
140 |
|
141 |
def run_inference(url, sampling_interval, search_query, bs=526):
|
142 |
+
print(f"search for : {search_query}")
|
143 |
skip_frames, path_frames= vid2frames(url,sampling_interval)
|
144 |
if path_frames is not None:
|
145 |
filenames = sorted(path_frames.glob('*.jpg'),key=lambda p: int(p.stem))
|
|
|
170 |
|
171 |
similarity = (100.0 * image_features @ text_features.T)
|
172 |
values, indices = similarity.topk(4, dim=0)
|
173 |
+
print(f"indices for best matches{indices}")
|
174 |
+
print(f"filenames for best matches {[filenames[i]for i in indices]}")
|
175 |
best_frames = [Image.open(filenames[ind]).convert("RGB") for ind in indices]
|
176 |
+
times = [f'{datetime.timedelta(seconds = round(ind[0].item() * sampling_interval,2))}' for ind in indices]
|
177 |
image_output = captioned_strip(best_frames,search_query, times,2)
|
178 |
title = search_query
|
179 |
print('task complete')
|
|
|
182 |
image_output = None
|
183 |
return(title, image_output)
|
184 |
|
185 |
+
inputs = [gr.inputs.Textbox(label="Give us the link to your youtube video! (maximum size 50 MB)"),
|
186 |
gr.Number(5,label='sampling interval (seconds)'),
|
187 |
gr.inputs.Textbox(label="What do you want to search?")]
|
188 |
outputs = [
|
|
|
190 |
gr.outputs.Image(label=""),
|
191 |
]
|
192 |
|
193 |
+
article = "Check out [this blogpost](https://yiyixuxu.github.io/2022/06/12/It-Happened-One-Frame.html) about this app."
|
194 |
|
195 |
gr.Interface(
|
196 |
run_inference,
|
|
|
198 |
outputs=outputs,
|
199 |
title="It Happened One Frame",
|
200 |
description='A CLIP-based app that search YouTube video frame based on text',
|
201 |
+
article = article,
|
202 |
examples=[
|
203 |
['https://youtu.be/v1rkzUIL8oc', 1, "James Cagney dancing down the stairs"],
|
204 |
['https://youtu.be/k4R5wZs8cxI', 1, "James Cagney smashes a grapefruit into Mae Clarke's face"],
|
205 |
['https://youtu.be/0diCvgWv_ng', 1, "little Deborah practicing her ballet while wearing a tutu in empty restaurant"]
|
206 |
]
|
207 |
+
).launch(debug=True,enable_queue=True,share=True)
|
208 |
|