Upload folder using huggingface_hub

- Infer.py +1 -1
- VideoLoader.py +12 -22
- __pycache__/Infer.cpython-38.pyc +0 -0
- __pycache__/VideoLoader.cpython-38.pyc +0 -0
- app.py +28 -75
- model/__pycache__/deberta_moe.cpython-38.pyc +0 -0
- model/deberta_moe.py +1 -1
- new.py +45 -0
- packages.txt +1 -0
- requirements.txt +2 -1
Infer.py
CHANGED
@@ -98,7 +98,7 @@ class Infer:
             a2tok.to(self.model.device), freeze_last=self.args.freeze_last
         )
 
-    def generate(self, text, video_path):
+    def generate(self, text, candidates, video_path):
         video, video_len = self.video_loader(video_path)
         video = self._get_clip_feature(video).unsqueeze(0).float()
         video_mask = get_mask(video_len, 10)
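
With the new signature, generate scores a fixed list of answer candidates instead of free-form text. A minimal call sketch, assuming Infer is constructed with a device string as in app.py below; the question, options, and path are illustrative, not part of the diff:

    infer = Infer("cpu")  # as instantiated in app.py
    answer = infer.generate(
        "What did the lady in black on the left do?",   # question text
        ["slice the pizza", "cut the meat", "point"],   # candidate answers
        "videos/3249402410.mp4",                        # local video path
    )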
VideoLoader.py
CHANGED
@@ -1,8 +1,8 @@
-
+import cv2
 import torch as th
 import os
 import numpy as np
-import ffmpeg
+from decord import VideoReader, cpu
 
 
 class Normalize(object):
@@ -45,15 +45,9 @@ class VideoLoader:
         self.features_dim = 768
 
     def _get_video_dim(self, video_path):
-        probe = ffmpeg.probe(video_path)
-        video_stream = next(
-            (stream for stream in probe["streams"] if stream["codec_type"] == "video"),
-            None,
-        )
-        width = int(video_stream["width"])
-        height = int(video_stream["height"])
-        num, denum = video_stream["avg_frame_rate"].split("/")
-        frame_rate = int(num) / int(denum)
+        vr = VideoReader(video_path, ctx=cpu(0))
+        height, width, _ = vr[0].shape
+        frame_rate = vr.get_avg_fps()
         return height, width, frame_rate
 
     def _get_output_dim(self, h, w):
@@ -83,20 +77,17 @@ class VideoLoader:
                 "input": video_path
             }
         height, width = self._get_output_dim(h, w)
-
+        # resize ##
+        vr = VideoReader(video_path, ctx=cpu(0))
+        video = vr.get_batch(range(0, len(vr), int(fr))).asnumpy()
+        video = np.array([cv2.resize(frame, (width, height)) for frame in video])
         try:
-            cmd = (
-                ffmpeg.input(video_path)
-                .filter("fps", fps=self.framerate)
-                .filter("scale", width, height)
-            )
+
             if self.centercrop:
                 x = int((width - self.size) / 2.0)
                 y = int((height - self.size) / 2.0)
-                cmd = cmd.crop(x, y, self.size, self.size)
-            out, _ = cmd.output("pipe:", format="rawvideo", pix_fmt="rgb24").run(
-                capture_stdout=True, quiet=True
-            )
+                video = video[:, y:y+self.size, x:x+self.size, :]
+
         except:
             print("ffmpeg error at: {}".format(video_path))
             return {
@@ -105,7 +96,6 @@ class VideoLoader:
             }
         if self.centercrop and isinstance(self.size, int):
             height, width = self.size, self.size
-        video = np.frombuffer(out, np.uint8).reshape([-1, height, width, 3])
         video = th.from_numpy(video.astype("float32"))
         video = video.permute(0, 3, 1, 2) # t,c,h,w
         else:
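
The rewrite replaces the ffmpeg subprocess pipeline (probe, fps/scale filters, rawvideo pipe) with in-process decoding via decord plus cv2 resizing, which is also why the np.frombuffer reshape disappears from the last hunk. A self-contained sketch of the new read/resize/center-crop flow, with an assumed sample path and a fixed 224 target standing in for self.size and _get_output_dim:

    import cv2
    import numpy as np
    from decord import VideoReader, cpu

    video_path = "videos/3249402410.mp4"  # assumed sample path
    size = 224                            # stands in for self.size

    vr = VideoReader(video_path, ctx=cpu(0))
    height, width, _ = vr[0].shape        # dimensions from the first decoded frame
    fps = vr.get_avg_fps()

    # roughly one frame per second, as in the diff's range(0, len(vr), int(fr))
    video = vr.get_batch(range(0, len(vr), int(fps))).asnumpy()
    video = np.array([cv2.resize(f, (size, size)) for f in video])

    # center crop; a no-op here because the resize target already equals size
    x = (video.shape[2] - size) // 2
    y = (video.shape[1] - size) // 2
    video = video[:, y:y + size, x:x + size, :]

One quirk the commit keeps: the except branch still prints "ffmpeg error at: ...", even though ffmpeg is no longer involved in this path.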
__pycache__/Infer.cpython-38.pyc
CHANGED
Binary files a/__pycache__/Infer.cpython-38.pyc and b/__pycache__/Infer.cpython-38.pyc differ

__pycache__/VideoLoader.cpython-38.pyc
CHANGED
Binary files a/__pycache__/VideoLoader.cpython-38.pyc and b/__pycache__/VideoLoader.cpython-38.pyc differ
app.py
CHANGED
@@ -33,70 +33,14 @@ def save_video_to_local(video_path):
     return filename
 
 
-def generate(video, textbox_in, ...):
-    flag = 1
-    if not textbox_in:
-        if len(state_.messages) > 0:
-            textbox_in = state_.messages[-1][1]
-            state_.messages.pop(-1)
-            flag = 0
-        else:
-            return "Please enter instruction"
+def generate(video, textbox_in, candbox_in):
     video = video if video else "none"
-    # assert not (os.path.exists(image1) and os.path.exists(video))
-
-    first_run = False if len(state.messages) > 0 else True
-
-    text_en_in = textbox_in.replace("picture", "image")
-
-    # images_tensor = [[], []]
-    image_processor = handler.image_processor
-    if os.path.exists(image1) and not os.path.exists(video):
-        tensor = image_processor.preprocess(image1, return_tensors='pt')['pixel_values'][0]
-        # print(tensor.shape)
-        tensor = tensor.to(handler.model.device, dtype=dtype)
-        images_tensor[0] = images_tensor[0] + [tensor]
-        images_tensor[1] = images_tensor[1] + ['image']
-        print(torch.cuda.memory_allocated())
-        print(torch.cuda.max_memory_allocated())
-    video_processor = handler.video_processor
-    if not os.path.exists(image1) and os.path.exists(video):
-        tensor = video_processor(video, return_tensors='pt')['pixel_values'][0]
-        # print(tensor.shape)
-        tensor = tensor.to(handler.model.device, dtype=dtype)
-        images_tensor[0] = images_tensor[0] + [tensor]
-        images_tensor[1] = images_tensor[1] + ['video']
-        print(torch.cuda.memory_allocated())
-        print(torch.cuda.max_memory_allocated())
-    if os.path.exists(image1) and os.path.exists(video):
-        tensor = video_processor(video, return_tensors='pt')['pixel_values'][0]
-        # print(tensor.shape)
-        tensor = tensor.to(handler.model.device, dtype=dtype)
-        images_tensor[0] = images_tensor[0] + [tensor]
-        images_tensor[1] = images_tensor[1] + ['video']
-
-
-        tensor = image_processor.preprocess(image1, return_tensors='pt')['pixel_values'][0]
-        # print(tensor.shape)
-        tensor = tensor.to(handler.model.device, dtype=dtype)
-        images_tensor[0] = images_tensor[0] + [tensor]
-        images_tensor[1] = images_tensor[1] + ['image']
-        print(torch.cuda.memory_allocated())
-        print(torch.cuda.max_memory_allocated())
-
-
-    text_en_out, state_ = handler.generate(images_tensor, text_en_in, first_run=first_run, state=state_)
-    state_.messages[-1] = (state_.roles[1], text_en_out)
-
-    text_en_out = text_en_out.split('#')[0]
-    textbox_out = text_en_out
 
-
-
-
-
-
-    return (state, state_, state.to_gradio_chatbot(), False, gr.update(value=None, interactive=True), images_tensor, gr.update(value=image1 if os.path.exists(image1) else None, interactive=True), gr.update(value=video if os.path.exists(video) else None, interactive=True))
+    text_en_out = handler.generate(textbox_in, eval(candbox_in), video)
+
+    textbox_out = text_en_out
+    #torch.cuda.empty_cache()
+    return textbox_out
 
 
 device = "cpu"
@@ -105,12 +49,17 @@ handler = Infer(device)
if not os.path.exists("temp"):
     os.makedirs("temp")
 
-print(torch.cuda.memory_allocated())
-print(torch.cuda.max_memory_allocated())
+#print(torch.cuda.memory_allocated())
+#print(torch.cuda.max_memory_allocated())
 
-textbox = gr.Textbox(
-    show_label=False, placeholder=..., container=False
+question_box = gr.Textbox(
+    show_label=False, placeholder="Enter question", container=False
 )
+
+candidates_box = gr.Textbox(
+    show_label=False, placeholder="Enter a list of options", container=False
+)
+
 with gr.Blocks(title='T-MoENet', theme=gr.themes.Default(), css=block_css) as demo:
     gr.Markdown(title_markdown)
     state = gr.State()
@@ -127,31 +76,35 @@ with gr.Blocks(title='T-MoENet', theme=gr.themes.Default(), css=block_css) as de
            examples=[
                 [
                     cur_dir + "/videos/3249402410.mp4",
-                    "...",
+                    "What did the lady in black on the left do after she finished spreading the sauce on her pizza?",
+                    "['slice the pizza', 'cut the meat', 'point', 'put cheese', 'put on plate']"
                 ],
                 [
                     cur_dir + "/videos/4882821564.mp4",
-                    "...",
+                    "Why did the boy clap his hands when he ran to the christmas tree?",
+                    "['adjust the tree', 'get away the dust', 'dancing', 'pressed a button to activate', 'presents']"
                 ],
                 [
                     cur_dir + "/videos/6233408665.mp4",
-                    "...",
+                    "What did the people on the sofa do after the lady in pink finished singing?",
+                    "['sitting', 'give it to the girl', 'take music sheet', 'clap', 'walk in circles']"
                 ],
             ],
-            inputs=[video, ...],
+            inputs=[video, question_box, candidates_box],
        )
 
-        with gr.Column(scale=...):
+        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="T-MoENet", bubble_full_width=True)
            with gr.Row():
-                with gr.Column(scale=...):
-                    textbox.render()
+                with gr.Column(scale=4):
+                    question_box.render()
+                with gr.Column(scale=4):
+                    candidates_box.render()
                with gr.Column(scale=1, min_width=50):
                    submit_btn = gr.Button(
                        value="Send", variant="primary", interactive=True
                    )
 
-    submit_btn.click(generate, [video, ...],
-                     [state, state_, chatbot, first_run, textbox, video])
+    submit_btn.click(generate, [video, question_box, candidates_box], [chatbot])
 
 demo.launch(share=True)
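
The callback now takes the question plus a string holding a Python list of options, and parses the latter with eval(candbox_in). ast.literal_eval accepts the same "['a', 'b']" strings without executing arbitrary code; a small sketch of that safer parse (the function name is hypothetical, not in the commit):

    import ast

    def parse_candidates(candbox_in: str) -> list:
        # accepts the example strings from app.py, e.g.
        # "['slice the pizza', 'cut the meat', 'point']"
        options = ast.literal_eval(candbox_in)
        if not isinstance(options, list):
            raise ValueError("expected a list of candidate answers")
        return options

    print(parse_candidates("['slice the pizza', 'cut the meat', 'point']"))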
model/__pycache__/deberta_moe.cpython-38.pyc
CHANGED
Binary files a/model/__pycache__/deberta_moe.cpython-38.pyc and b/model/__pycache__/deberta_moe.cpython-38.pyc differ
model/deberta_moe.py
CHANGED
@@ -684,7 +684,7 @@ def make_log_bucket_position(relative_pos, bucket_size, max_position):
         np.ceil(np.log(abs_pos / mid) / np.log((max_position - 1) / mid) * (mid - 1))
         + mid
     )
-    bucket_pos = np.where(abs_pos <= mid, relative_pos, log_pos * sign).astype(np.int)
+    bucket_pos = np.where(abs_pos <= mid, relative_pos, log_pos * sign).astype(np.int64)
     return bucket_pos
 
 
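
This one-line fix swaps the np.int alias for np.int64: np.int was deprecated in NumPy 1.20 and removed in 1.24, so the old line raises AttributeError on current NumPy releases while fixed-width dtypes keep working. A quick demonstration:

    import numpy as np

    pos = np.array([3.0, -120.0, 250.0])
    print(pos.astype(np.int64))  # [   3 -120  250] on any NumPy release
    # pos.astype(np.int) raises AttributeError on NumPy >= 1.24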
new.py
ADDED
@@ -0,0 +1,45 @@
+import torch
+import numpy as np
+import ffmpeg
+
+size = 224
+
+def get_video_dim(video_path):
+    probe = ffmpeg.probe(video_path)
+    video_stream = next(
+        (stream for stream in probe["streams"] if stream["codec_type"] == "video"),
+        None,
+    )
+    width = int(video_stream["width"])
+    height = int(video_stream["height"])
+    num, denum = video_stream["avg_frame_rate"].split("/")
+    frame_rate = int(num) / int(denum)
+    return height, width, frame_rate
+
+def get_output_dim(self, h, w):
+    if isinstance(self.size, tuple) and len(self.size) == 2:
+        return self.size
+    elif h >= w:
+        return int(h * self.size / w), self.size
+    else:
+        return self.size, int(w * self.size / h)
+
+h, w, fr = get_video_dim(video_path)
+height, width = get_output_dim(h, w)
+
+cmd = (
+    ffmpeg.input(video_path)
+    .filter("fps", fps=1)
+    .filter("scale", width, height)
+)
+
+x = int((width - size) / 2.0)
+y = int((height - size) / 2.0)
+cmd = cmd.crop(x, y, size, size)
+out, _ = cmd.output("pipe:", format="rawvideo", pix_fmt="rgb24").run(
+    capture_stdout=True, quiet=True
+)
+
+height, width = 224, 224
+video = np.frombuffer(out, np.uint8).reshape([-1, height, width, 3])
+video = torch.from_numpy(video.astype("float32"))
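
new.py looks like a scratch copy of the ffmpeg path just deleted from VideoLoader.py: it calls get_video_dim(video_path) before video_path is ever assigned, and defines get_output_dim(self, h, w) with a stray self at module level, so the script does not run as committed. A runnable probe-only excerpt, under an assumed path:

    import ffmpeg

    video_path = "videos/3249402410.mp4"  # hypothetical; new.py never defines it

    probe = ffmpeg.probe(video_path)
    stream = next(
        (s for s in probe["streams"] if s["codec_type"] == "video"), None
    )
    num, denum = stream["avg_frame_rate"].split("/")
    print(int(stream["width"]), int(stream["height"]), int(num) / int(denum))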
packages.txt
ADDED
@@ -0,0 +1 @@
+ffmpeg
requirements.txt
CHANGED
@@ -15,7 +15,7 @@ email_validator==2.2.0
 exceptiongroup==1.2.2
 fastapi==0.111.0
 fastapi-cli==0.0.4
-ffmpeg==
+ffmpeg-python==0.2.0
 ffmpy==0.3.2
 filelock==3.15.4
 fonttools==4.53.1
@@ -104,3 +104,4 @@ wcwidth==0.2.13
 websockets==11.0.3
 Werkzeug==3.0.3
 zipp==3.19.2
+decord
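
On Hugging Face Spaces, packages.txt lists apt packages, so the new file supplies the ffmpeg system binary, while requirements.txt swaps the ffmpeg PyPI pin for ffmpeg-python==0.2.0 (the binding the code imports as ffmpeg) and appends an unpinned decord. A quick check that the new Python-side stack resolves:

    import ffmpeg                   # provided by ffmpeg-python
    from decord import VideoReader  # unpinned, added at the end of requirements.txt
    print("video deps OK")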