Upload folder using huggingface_hub

- Infer.py +1 -1
- VideoLoader.py +12 -22
- __pycache__/Infer.cpython-38.pyc +0 -0
- __pycache__/VideoLoader.cpython-38.pyc +0 -0
- app.py +28 -75
- model/__pycache__/deberta_moe.cpython-38.pyc +0 -0
- model/deberta_moe.py +1 -1
- new.py +45 -0
- packages.txt +1 -0
- requirements.txt +2 -1
Infer.py
CHANGED
@@ -98,7 +98,7 @@ class Infer:
             a2tok.to(self.model.device), freeze_last=self.args.freeze_last
         )
 
-    def generate(self, text, video_path):
+    def generate(self, text, candidates, video_path):
         video, video_len = self.video_loader(video_path)
         video = self._get_clip_feature(video).unsqueeze(0).float()
         video_mask = get_mask(video_len, 10)
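
With the new signature, generate scores a fixed list of answer candidates instead of free-form text. A minimal call sketch, assuming Infer is constructed with a device string as in app.py below; the question, options, and path are illustrative, not part of the diff:

    infer = Infer("cpu")  # as instantiated in app.py
    answer = infer.generate(
        "What did the lady in black on the left do?",   # question text
        ["slice the pizza", "cut the meat", "point"],   # candidate answers
        "videos/3249402410.mp4",                        # local video path
    )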
VideoLoader.py
CHANGED
@@ -1,8 +1,8 @@
-
+import cv2
 import torch as th
 import os
 import numpy as np
-import ffmpeg
+from decord import VideoReader, cpu
 
 
 class Normalize(object):
@@ -45,15 +45,9 @@ class VideoLoader:
         self.features_dim = 768
 
     def _get_video_dim(self, video_path):
-        probe = ffmpeg.probe(video_path)
-        video_stream = next(
-            (stream for stream in probe["streams"] if stream["codec_type"] == "video"),
-            None,
-        )
-        width = int(video_stream["width"])
-        height = int(video_stream["height"])
-        num, denum = video_stream["avg_frame_rate"].split("/")
-        frame_rate = int(num) / int(denum)
+        vr = VideoReader(video_path, ctx=cpu(0))
+        height, width, _ = vr[0].shape
+        frame_rate = vr.get_avg_fps()
         return height, width, frame_rate
 
     def _get_output_dim(self, h, w):
@@ -83,20 +77,17 @@ class VideoLoader:
                 "input": video_path
             }
         height, width = self._get_output_dim(h, w)
-
+        # resize ##
+        vr = VideoReader(video_path, ctx=cpu(0))
+        video = vr.get_batch(range(0, len(vr), int(fr))).asnumpy()
+        video = np.array([cv2.resize(frame, (width, height)) for frame in video])
         try:
-            cmd = (
-                ffmpeg.input(video_path)
-                .filter("fps", fps=self.framerate)
-                .filter("scale", width, height)
-            )
+
             if self.centercrop:
                 x = int((width - self.size) / 2.0)
                 y = int((height - self.size) / 2.0)
-                cmd = cmd.crop(x, y, self.size, self.size)
-            out, _ = cmd.output("pipe:", format="rawvideo", pix_fmt="rgb24").run(
-                capture_stdout=True, quiet=True
-            )
+                video = video[:, y:y+self.size, x:x+self.size, :]
+
         except:
             print("ffmpeg error at: {}".format(video_path))
             return {
@@ -105,7 +96,6 @@ class VideoLoader:
             }
         if self.centercrop and isinstance(self.size, int):
             height, width = self.size, self.size
-        video = np.frombuffer(out, np.uint8).reshape([-1, height, width, 3])
         video = th.from_numpy(video.astype("float32"))
         video = video.permute(0, 3, 1, 2) # t,c,h,w
         else:
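
The rewrite replaces the ffmpeg subprocess pipeline (probe, fps/scale filters, rawvideo pipe) with in-process decoding via decord plus cv2 resizing, which is also why the np.frombuffer reshape disappears from the last hunk. A self-contained sketch of the new read/resize/center-crop flow, with an assumed sample path and a fixed 224 target standing in for self.size and _get_output_dim:

    import cv2
    import numpy as np
    from decord import VideoReader, cpu

    video_path = "videos/3249402410.mp4"  # assumed sample path
    size = 224                            # stands in for self.size

    vr = VideoReader(video_path, ctx=cpu(0))
    height, width, _ = vr[0].shape        # dimensions from the first decoded frame
    fps = vr.get_avg_fps()

    # roughly one frame per second, as in the diff's range(0, len(vr), int(fr))
    video = vr.get_batch(range(0, len(vr), int(fps))).asnumpy()
    video = np.array([cv2.resize(f, (size, size)) for f in video])

    # center crop; a no-op here because the resize target already equals size
    x = (video.shape[2] - size) // 2
    y = (video.shape[1] - size) // 2
    video = video[:, y:y + size, x:x + size, :]

One quirk the commit keeps: the except branch still prints "ffmpeg error at: ...", even though ffmpeg is no longer involved in this path.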
__pycache__/Infer.cpython-38.pyc
CHANGED
Binary files a/__pycache__/Infer.cpython-38.pyc and b/__pycache__/Infer.cpython-38.pyc differ

__pycache__/VideoLoader.cpython-38.pyc
CHANGED
Binary files a/__pycache__/VideoLoader.cpython-38.pyc and b/__pycache__/VideoLoader.cpython-38.pyc differ
app.py
CHANGED
@@ -33,70 +33,14 @@ def save_video_to_local(video_path):
     return filename
 
 
-def generate(video, textbox_in, ...):
-    flag = 1
-    if not textbox_in:
-        if len(state_.messages) > 0:
-            textbox_in = state_.messages[-1][1]
-            state_.messages.pop(-1)
-            flag = 0
-        else:
-            return "Please enter instruction"
+def generate(video, textbox_in, candbox_in):
     video = video if video else "none"
-    # assert not (os.path.exists(image1) and os.path.exists(video))
-
-    first_run = False if len(state.messages) > 0 else True
-
-    text_en_in = textbox_in.replace("picture", "image")
-
-    # images_tensor = [[], []]
-    image_processor = handler.image_processor
-    if os.path.exists(image1) and not os.path.exists(video):
-        tensor = image_processor.preprocess(image1, return_tensors='pt')['pixel_values'][0]
-        # print(tensor.shape)
-        tensor = tensor.to(handler.model.device, dtype=dtype)
-        images_tensor[0] = images_tensor[0] + [tensor]
-        images_tensor[1] = images_tensor[1] + ['image']
-        print(torch.cuda.memory_allocated())
-        print(torch.cuda.max_memory_allocated())
-    video_processor = handler.video_processor
-    if not os.path.exists(image1) and os.path.exists(video):
-        tensor = video_processor(video, return_tensors='pt')['pixel_values'][0]
-        # print(tensor.shape)
-        tensor = tensor.to(handler.model.device, dtype=dtype)
-        images_tensor[0] = images_tensor[0] + [tensor]
-        images_tensor[1] = images_tensor[1] + ['video']
-        print(torch.cuda.memory_allocated())
-        print(torch.cuda.max_memory_allocated())
-    if os.path.exists(image1) and os.path.exists(video):
-        tensor = video_processor(video, return_tensors='pt')['pixel_values'][0]
-        # print(tensor.shape)
-        tensor = tensor.to(handler.model.device, dtype=dtype)
-        images_tensor[0] = images_tensor[0] + [tensor]
-        images_tensor[1] = images_tensor[1] + ['video']
-
-
-        tensor = image_processor.preprocess(image1, return_tensors='pt')['pixel_values'][0]
-        # print(tensor.shape)
-        tensor = tensor.to(handler.model.device, dtype=dtype)
-        images_tensor[0] = images_tensor[0] + [tensor]
-        images_tensor[1] = images_tensor[1] + ['image']
-        print(torch.cuda.memory_allocated())
-        print(torch.cuda.max_memory_allocated())
-
-
-    text_en_out, state_ = handler.generate(images_tensor, text_en_in, first_run=first_run, state=state_)
-    state_.messages[-1] = (state_.roles[1], text_en_out)
-
-    text_en_out = text_en_out.split('#')[0]
-    textbox_out = text_en_out
 
-
-
-
-
-
-    return (state, state_, state.to_gradio_chatbot(), False, gr.update(value=None, interactive=True), images_tensor, gr.update(value=image1 if os.path.exists(image1) else None, interactive=True), gr.update(value=video if os.path.exists(video) else None, interactive=True))
+    text_en_out = handler.generate(textbox_in, eval(candbox_in), video)
+
+    textbox_out = text_en_out
+    #torch.cuda.empty_cache()
+    return textbox_out
 
 
 device = "cpu"
@@ -105,12 +49,17 @@ handler = Infer(device)
if not os.path.exists("temp"):
     os.makedirs("temp")
 
-print(torch.cuda.memory_allocated())
-print(torch.cuda.max_memory_allocated())
+#print(torch.cuda.memory_allocated())
+#print(torch.cuda.max_memory_allocated())
 
-textbox = gr.Textbox(
-    show_label=False, placeholder=..., container=False
+question_box = gr.Textbox(
+    show_label=False, placeholder="Enter question", container=False
 )
+
+candidates_box = gr.Textbox(
+    show_label=False, placeholder="Enter a list of options", container=False
+)
+
 with gr.Blocks(title='T-MoENet', theme=gr.themes.Default(), css=block_css) as demo:
     gr.Markdown(title_markdown)
     state = gr.State()
@@ -127,31 +76,35 @@ with gr.Blocks(title='T-MoENet', theme=gr.themes.Default(), css=block_css) as de
            examples=[
                 [
                     cur_dir + "/videos/3249402410.mp4",
-                    "...",
+                    "What did the lady in black on the left do after she finished spreading the sauce on her pizza?",
+                    "['slice the pizza', 'cut the meat', 'point', 'put cheese', 'put on plate']"
                 ],
                 [
                     cur_dir + "/videos/4882821564.mp4",
-                    "...",
+                    "Why did the boy clap his hands when he ran to the christmas tree?",
+                    "['adjust the tree', 'get away the dust', 'dancing', 'pressed a button to activate', 'presents']"
                 ],
                 [
                     cur_dir + "/videos/6233408665.mp4",
-                    "...",
+                    "What did the people on the sofa do after the lady in pink finished singing?",
+                    "['sitting', 'give it to the girl', 'take music sheet', 'clap', 'walk in circles']"
                 ],
             ],
-            inputs=[video, ...],
+            inputs=[video, question_box, candidates_box],
        )
 
-        with gr.Column(scale=...):
+        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="T-MoENet", bubble_full_width=True)
            with gr.Row():
-                with gr.Column(scale=...):
-                    textbox.render()
+                with gr.Column(scale=4):
+                    question_box.render()
+                with gr.Column(scale=4):
+                    candidates_box.render()
                with gr.Column(scale=1, min_width=50):
                    submit_btn = gr.Button(
                        value="Send", variant="primary", interactive=True
                    )
 
-    submit_btn.click(generate, [video, ...],
-                     [state, state_, chatbot, first_run, textbox, video])
+    submit_btn.click(generate, [video, question_box, candidates_box], [chatbot])
 
 demo.launch(share=True)
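
The callback now takes the question plus a string holding a Python list of options, and parses the latter with eval(candbox_in). ast.literal_eval accepts the same "['a', 'b']" strings without executing arbitrary code; a small sketch of that safer parse (the function name is hypothetical, not in the commit):

    import ast

    def parse_candidates(candbox_in: str) -> list:
        # accepts the example strings from app.py, e.g.
        # "['slice the pizza', 'cut the meat', 'point']"
        options = ast.literal_eval(candbox_in)
        if not isinstance(options, list):
            raise ValueError("expected a list of candidate answers")
        return options

    print(parse_candidates("['slice the pizza', 'cut the meat', 'point']"))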
model/__pycache__/deberta_moe.cpython-38.pyc
CHANGED
Binary files a/model/__pycache__/deberta_moe.cpython-38.pyc and b/model/__pycache__/deberta_moe.cpython-38.pyc differ
model/deberta_moe.py
CHANGED
@@ -684,7 +684,7 @@ def make_log_bucket_position(relative_pos, bucket_size, max_position):
         np.ceil(np.log(abs_pos / mid) / np.log((max_position - 1) / mid) * (mid - 1))
         + mid
     )
-    bucket_pos = np.where(abs_pos <= mid, relative_pos, log_pos * sign).astype(np.int)
+    bucket_pos = np.where(abs_pos <= mid, relative_pos, log_pos * sign).astype(np.int64)
     return bucket_pos
 
 
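
This one-line fix swaps the np.int alias for np.int64: np.int was deprecated in NumPy 1.20 and removed in 1.24, so the old line raises AttributeError on current NumPy releases while fixed-width dtypes keep working. A quick demonstration:

    import numpy as np

    pos = np.array([3.0, -120.0, 250.0])
    print(pos.astype(np.int64))  # [   3 -120  250] on any NumPy release
    # pos.astype(np.int) raises AttributeError on NumPy >= 1.24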
new.py
ADDED
@@ -0,0 +1,45 @@
+import torch
+import numpy as np
+import ffmpeg
+
+size = 224
+
+def get_video_dim(video_path):
+    probe = ffmpeg.probe(video_path)
+    video_stream = next(
+        (stream for stream in probe["streams"] if stream["codec_type"] == "video"),
+        None,
+    )
+    width = int(video_stream["width"])
+    height = int(video_stream["height"])
+    num, denum = video_stream["avg_frame_rate"].split("/")
+    frame_rate = int(num) / int(denum)
+    return height, width, frame_rate
+
+def get_output_dim(self, h, w):
+    if isinstance(self.size, tuple) and len(self.size) == 2:
+        return self.size
+    elif h >= w:
+        return int(h * self.size / w), self.size
+    else:
+        return self.size, int(w * self.size / h)
+
+h, w, fr = get_video_dim(video_path)
+height, width = get_output_dim(h, w)
+
+cmd = (
+    ffmpeg.input(video_path)
+    .filter("fps", fps=1)
+    .filter("scale", width, height)
+)
+
+x = int((width - size) / 2.0)
+y = int((height - size) / 2.0)
+cmd = cmd.crop(x, y, size, size)
+out, _ = cmd.output("pipe:", format="rawvideo", pix_fmt="rgb24").run(
+    capture_stdout=True, quiet=True
+)
+
+height, width = 224, 224
+video = np.frombuffer(out, np.uint8).reshape([-1, height, width, 3])
+video = torch.from_numpy(video.astype("float32"))
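
new.py looks like a scratch copy of the ffmpeg path just deleted from VideoLoader.py: it calls get_video_dim(video_path) before video_path is ever assigned, and defines get_output_dim(self, h, w) with a stray self at module level, so the script does not run as committed. A runnable probe-only excerpt, under an assumed path:

    import ffmpeg

    video_path = "videos/3249402410.mp4"  # hypothetical; new.py never defines it

    probe = ffmpeg.probe(video_path)
    stream = next(
        (s for s in probe["streams"] if s["codec_type"] == "video"), None
    )
    num, denum = stream["avg_frame_rate"].split("/")
    print(int(stream["width"]), int(stream["height"]), int(num) / int(denum))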
packages.txt
ADDED
@@ -0,0 +1 @@
+ffmpeg
requirements.txt
CHANGED
@@ -15,7 +15,7 @@ email_validator==2.2.0
 exceptiongroup==1.2.2
 fastapi==0.111.0
 fastapi-cli==0.0.4
-ffmpeg==
+ffmpeg-python==0.2.0
 ffmpy==0.3.2
 filelock==3.15.4
 fonttools==4.53.1
@@ -104,3 +104,4 @@ wcwidth==0.2.13
 websockets==11.0.3
 Werkzeug==3.0.3
 zipp==3.19.2
+decord
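
On Hugging Face Spaces, packages.txt lists apt packages, so the new file supplies the ffmpeg system binary, while requirements.txt swaps the ffmpeg PyPI pin for ffmpeg-python==0.2.0 (the binding the code imports as ffmpeg) and appends an unpinned decord. A quick check that the new Python-side stack resolves:

    import ffmpeg                   # provided by ffmpeg-python
    from decord import VideoReader  # unpinned, added at the end of requirements.txt
    print("video deps OK")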