yixin1121 committed
Commit
3a81605
1 Parent(s): 85af14c

Upload folder using huggingface_hub

Infer.py CHANGED
@@ -98,7 +98,7 @@ class Infer:
             a2tok.to(self.model.device), freeze_last=self.args.freeze_last
         )
 
-    def generate(self, text, video_path, candidates = None):
+    def generate(self, text, candidates, video_path):
         video, video_len = self.video_loader(video_path)
         video = self._get_clip_feature(video).unsqueeze(0).float()
         video_mask = get_mask(video_len, 10)
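With this change, the candidate answers become a required positional argument placed before the video path. A minimal call sketch, mirroring how app.py now invokes the method; the `infer` instance and the example values are illustrative assumptions, not part of the commit:

    # assumes an already-constructed Infer instance named `infer`
    question = "Why did the boy clap his hands when he ran to the christmas tree?"
    candidates = ["adjust the tree", "get away the dust", "dancing",
                  "pressed a button to activate", "presents"]
    answer = infer.generate(question, candidates, "videos/4882821564.mp4")
    print(answer)  # the predicted answer string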
VideoLoader.py CHANGED
@@ -1,8 +1,8 @@
-
+import cv2
 import torch as th
 import os
 import numpy as np
-import ffmpeg
+from decord import VideoReader, cpu
 
 
 class Normalize(object):
@@ -45,15 +45,9 @@ class VideoLoader:
         self.features_dim = 768
 
     def _get_video_dim(self, video_path):
-        probe = ffmpeg.probe(video_path)
-        video_stream = next(
-            (stream for stream in probe["streams"] if stream["codec_type"] == "video"),
-            None,
-        )
-        width = int(video_stream["width"])
-        height = int(video_stream["height"])
-        num, denum = video_stream["avg_frame_rate"].split("/")
-        frame_rate = int(num) / int(denum)
+        vr = VideoReader(video_path, ctx=cpu(0))
+        height, width, _ = vr[0].shape
+        frame_rate = vr.get_avg_fps()
         return height, width, frame_rate
 
     def _get_output_dim(self, h, w):
@@ -83,20 +77,17 @@ class VideoLoader:
                     "input": video_path
                 }
             height, width = self._get_output_dim(h, w)
-
+            # resize ##
+            vr = VideoReader(video_path, ctx=cpu(0))
+            video = vr.get_batch(range(0, len(vr), int(fr))).asnumpy()
+            video = np.array([cv2.resize(frame, (width, height)) for frame in video])
             try:
-                cmd = (
-                    ffmpeg.input(video_path)
-                    .filter("fps", fps=self.framerate)
-                    .filter("scale", width, height)
-                )
+
                 if self.centercrop:
                     x = int((width - self.size) / 2.0)
                     y = int((height - self.size) / 2.0)
-                    cmd = cmd.crop(x, y, self.size, self.size)
-                out, _ = cmd.output("pipe:", format="rawvideo", pix_fmt="rgb24").run(
-                    capture_stdout=True, quiet=True
-                )
+                    video = video[:, y:y+self.size, x:x+self.size, :]
+
             except:
                 print("ffmpeg error at: {}".format(video_path))
                 return {
@@ -105,7 +96,6 @@ class VideoLoader:
                 }
             if self.centercrop and isinstance(self.size, int):
                 height, width = self.size, self.size
-            video = np.frombuffer(out, np.uint8).reshape([-1, height, width, 3])
             video = th.from_numpy(video.astype("float32"))
             video = video.permute(0, 3, 1, 2) # t,c,h,w
         else:
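Net effect: the ffmpeg-python decode path is replaced by decord frame reading plus cv2 resizing and an in-memory center crop. A self-contained sketch of that pipeline, assuming `size` and `centercrop` mirror the VideoLoader attributes (the function name and defaults are illustrative, not the class's exact code):

    import cv2
    import numpy as np
    import torch as th
    from decord import VideoReader, cpu

    def load_frames(video_path, size=224, centercrop=True):
        vr = VideoReader(video_path, ctx=cpu(0))
        step = max(int(round(vr.get_avg_fps())), 1)  # roughly one frame per second
        frames = vr.get_batch(list(range(0, len(vr), step))).asnumpy()  # (t, h, w, 3) uint8
        h, w, _ = frames[0].shape
        # scale the short side to `size`, as _get_output_dim does
        height, width = (int(h * size / w), size) if h >= w else (size, int(w * size / h))
        frames = np.array([cv2.resize(f, (width, height)) for f in frames])
        if centercrop:
            x = int((width - size) / 2.0)
            y = int((height - size) / 2.0)
            frames = frames[:, y:y + size, x:x + size, :]
        return th.from_numpy(frames.astype("float32")).permute(0, 3, 1, 2)  # t,c,h,w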
__pycache__/Infer.cpython-38.pyc CHANGED
Binary files a/__pycache__/Infer.cpython-38.pyc and b/__pycache__/Infer.cpython-38.pyc differ
 
__pycache__/VideoLoader.cpython-38.pyc CHANGED
Binary files a/__pycache__/VideoLoader.cpython-38.pyc and b/__pycache__/VideoLoader.cpython-38.pyc differ
 
app.py CHANGED
@@ -33,70 +33,14 @@ def save_video_to_local(video_path):
     return filename
 
 
-def generate(video, textbox_in, first_run, state, state_):
-    flag = 1
-    if not textbox_in:
-        if len(state_.messages) > 0:
-            textbox_in = state_.messages[-1][1]
-            state_.messages.pop(-1)
-            flag = 0
-        else:
-            return "Please enter instruction"
+def generate(video, textbox_in, candbox_in):
     video = video if video else "none"
-    # assert not (os.path.exists(image1) and os.path.exists(video))
-
-    first_run = False if len(state.messages) > 0 else True
-
-    text_en_in = textbox_in.replace("picture", "image")
-
-    # images_tensor = [[], []]
-    image_processor = handler.image_processor
-    if os.path.exists(image1) and not os.path.exists(video):
-        tensor = image_processor.preprocess(image1, return_tensors='pt')['pixel_values'][0]
-        # print(tensor.shape)
-        tensor = tensor.to(handler.model.device, dtype=dtype)
-        images_tensor[0] = images_tensor[0] + [tensor]
-        images_tensor[1] = images_tensor[1] + ['image']
-        print(torch.cuda.memory_allocated())
-        print(torch.cuda.max_memory_allocated())
-    video_processor = handler.video_processor
-    if not os.path.exists(image1) and os.path.exists(video):
-        tensor = video_processor(video, return_tensors='pt')['pixel_values'][0]
-        # print(tensor.shape)
-        tensor = tensor.to(handler.model.device, dtype=dtype)
-        images_tensor[0] = images_tensor[0] + [tensor]
-        images_tensor[1] = images_tensor[1] + ['video']
-        print(torch.cuda.memory_allocated())
-        print(torch.cuda.max_memory_allocated())
-    if os.path.exists(image1) and os.path.exists(video):
-        tensor = video_processor(video, return_tensors='pt')['pixel_values'][0]
-        # print(tensor.shape)
-        tensor = tensor.to(handler.model.device, dtype=dtype)
-        images_tensor[0] = images_tensor[0] + [tensor]
-        images_tensor[1] = images_tensor[1] + ['video']
-
-
-        tensor = image_processor.preprocess(image1, return_tensors='pt')['pixel_values'][0]
-        # print(tensor.shape)
-        tensor = tensor.to(handler.model.device, dtype=dtype)
-        images_tensor[0] = images_tensor[0] + [tensor]
-        images_tensor[1] = images_tensor[1] + ['image']
-        print(torch.cuda.memory_allocated())
-        print(torch.cuda.max_memory_allocated())
-
-
-    text_en_out, state_ = handler.generate(images_tensor, text_en_in, first_run=first_run, state=state_)
-    state_.messages[-1] = (state_.roles[1], text_en_out)
-
-    text_en_out = text_en_out.split('#')[0]
-    textbox_out = text_en_out
 
-    show_images = ""
-    if flag:
-        state.append_message(state.roles[0], textbox_in + "\n" + show_images)
-        state.append_message(state.roles[1], textbox_out)
-    torch.cuda.empty_cache()
-    return (state, state_, state.to_gradio_chatbot(), False, gr.update(value=None, interactive=True), images_tensor, gr.update(value=image1 if os.path.exists(image1) else None, interactive=True), gr.update(value=video if os.path.exists(video) else None, interactive=True))
+    text_en_out = handler.generate(textbox_in, eval(candbox_in), video)
+
+    textbox_out = text_en_out
+    #torch.cuda.empty_cache()
+    return textbox_out
 
 
 device = "cpu"
@@ -105,12 +49,17 @@ handler = Infer(device)
 if not os.path.exists("temp"):
     os.makedirs("temp")
 
-print(torch.cuda.memory_allocated())
-print(torch.cuda.max_memory_allocated())
+#print(torch.cuda.memory_allocated())
+#print(torch.cuda.max_memory_allocated())
 
-textbox = gr.Textbox(
-    show_label=False, placeholder="Enter text and press ENTER", container=False
+question_box = gr.Textbox(
+    show_label=False, placeholder="Enter question", container=False
 )
+
+candidates_box = gr.Textbox(
+    show_label=False, placeholder="Enter a list of options", container=False
+)
+
 with gr.Blocks(title='T-MoENet', theme=gr.themes.Default(), css=block_css) as demo:
     gr.Markdown(title_markdown)
     state = gr.State()
@@ -127,31 +76,35 @@ with gr.Blocks(title='T-MoENet', theme=gr.themes.Default(), css=block_css) as demo:
                 examples=[
                     [
                         cur_dir + "/videos/3249402410.mp4",
-                        "what did the lady in black on the left do after she finished spreading the sauce on her pizza?",
+                        "What did the lady in black on the left do after she finished spreading the sauce on her pizza?",
+                        "['slice the pizza', 'cut the meat', 'point', 'put cheese', 'put on plate']"
                     ],
                     [
                         cur_dir + "/videos/4882821564.mp4",
-                        "why did the boy clap his hands when he ran to the christmas tree?",
+                        "Why did the boy clap his hands when he ran to the christmas tree?",
+                        "['adjust the tree', 'get away the dust', 'dancing', 'pressed a button to activate', 'presents']"
                     ],
                     [
                         cur_dir + "/videos/6233408665.mp4",
-                        "what did the people on the sofa do after the lady in pink finished singing?",
+                        "What did the people on the sofa do after the lady in pink finished singing?",
+                        "['sitting', 'give it to the girl', 'take music sheet', 'clap', 'walk in circles']"
                     ],
                 ],
-                inputs=[video, textbox],
+                inputs=[video, question_box, candidates_box],
             )
 
-        with gr.Column(scale=7):
+        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="T-MoENet", bubble_full_width=True)
            with gr.Row():
-                with gr.Column(scale=2):
-                    textbox.render()
+                with gr.Column(scale=4):
+                    question_box.render()
+                with gr.Column(scale=4):
+                    candidates_box.render()
                with gr.Column(scale=1, min_width=50):
                    submit_btn = gr.Button(
                        value="Send", variant="primary", interactive=True
                    )
 
-    submit_btn.click(generate, [video, textbox, first_run, state, state_],
-                     [state, state_, chatbot, first_run, textbox, video])
+    submit_btn.click(generate, [video, question_box, candidates_box], [chatbot])
 
 demo.launch(share=True)
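With the conversation-state plumbing removed, the callback is a plain function of (video, question, candidates string), and the candidates string is parsed with eval() inside generate, so it must be a Python-list literal. A usage sketch built from the first bundled example; `handler` is the Infer instance constructed in app.py, and the relative path is an assumption about the working directory:

    video_path = "videos/3249402410.mp4"
    question = "What did the lady in black on the left do after she finished spreading the sauce on her pizza?"
    options = "['slice the pizza', 'cut the meat', 'point', 'put cheese', 'put on plate']"
    print(generate(video_path, question, options))  # prints the predicted answer string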
model/__pycache__/deberta_moe.cpython-38.pyc CHANGED
Binary files a/model/__pycache__/deberta_moe.cpython-38.pyc and b/model/__pycache__/deberta_moe.cpython-38.pyc differ
 
model/deberta_moe.py CHANGED
@@ -684,7 +684,7 @@ def make_log_bucket_position(relative_pos, bucket_size, max_position):
         np.ceil(np.log(abs_pos / mid) / np.log((max_position - 1) / mid) * (mid - 1))
         + mid
     )
-    bucket_pos = np.where(abs_pos <= mid, relative_pos, log_pos * sign).astype(np.int)
+    bucket_pos = np.where(abs_pos <= mid, relative_pos, log_pos * sign).astype(np.int64)
     return bucket_pos
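For context, `np.int` was a deprecated alias for Python's built-in `int` and was removed in NumPy 1.24, so the cast is now pinned to an explicit dtype. A one-line sanity check of the equivalent cast, with illustrative values only:

    import numpy as np
    log_pos_times_sign = np.array([12.7, -3.2])
    print(log_pos_times_sign.astype(np.int64))  # [12 -3]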
new.py ADDED
@@ -0,0 +1,45 @@
+import torch
+import numpy as np
+import ffmpeg
+
+size = 224
+
+def get_video_dim(video_path):
+    probe = ffmpeg.probe(video_path)
+    video_stream = next(
+        (stream for stream in probe["streams"] if stream["codec_type"] == "video"),
+        None,
+    )
+    width = int(video_stream["width"])
+    height = int(video_stream["height"])
+    num, denum = video_stream["avg_frame_rate"].split("/")
+    frame_rate = int(num) / int(denum)
+    return height, width, frame_rate
+
+def get_output_dim(self, h, w):
+    if isinstance(self.size, tuple) and len(self.size) == 2:
+        return self.size
+    elif h >= w:
+        return int(h * self.size / w), self.size
+    else:
+        return self.size, int(w * self.size / h)
+
+h, w, fr = get_video_dim(video_path)
+height, width = get_output_dim(h, w)
+
+cmd = (
+    ffmpeg.input(video_path)
+    .filter("fps", fps=1)
+    .filter("scale", width, height)
+)
+
+x = int((width - size) / 2.0)
+y = int((height - size) / 2.0)
+cmd = cmd.crop(x, y, size, size)
+out, _ = cmd.output("pipe:", format="rawvideo", pix_fmt="rgb24").run(
+    capture_stdout=True, quiet=True
+)
+
+height, width = 224, 224
+video = np.frombuffer(out, np.uint8).reshape([-1, height, width, 3])
+video = torch.from_numpy(video.astype("float32"))
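As committed, new.py is a scratch script: `get_output_dim(self, h, w)` still carries a `self` parameter and reads `self.size`, and `video_path` is never defined, so it does not run as-is. A self-contained sketch of the same ffmpeg-python probe/scale/crop pipeline, with the helper signature fixed; the name `load_video_ffmpeg` is chosen for illustration and is not part of the commit:

    import ffmpeg
    import numpy as np
    import torch

    SIZE = 224

    def get_output_dim(h, w, size=SIZE):
        # scale the short side to `size`, preserving aspect ratio
        if h >= w:
            return int(h * size / w), size
        return size, int(w * size / h)

    def load_video_ffmpeg(video_path, size=SIZE, fps=1):
        probe = ffmpeg.probe(video_path)
        stream = next(s for s in probe["streams"] if s["codec_type"] == "video")
        height, width = get_output_dim(int(stream["height"]), int(stream["width"]), size)
        x = int((width - size) / 2.0)
        y = int((height - size) / 2.0)
        out, _ = (
            ffmpeg.input(video_path)
            .filter("fps", fps=fps)
            .filter("scale", width, height)
            .crop(x, y, size, size)
            .output("pipe:", format="rawvideo", pix_fmt="rgb24")
            .run(capture_stdout=True, quiet=True)
        )
        frames = np.frombuffer(out, np.uint8).reshape([-1, size, size, 3])
        return torch.from_numpy(frames.astype("float32"))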
packages.txt ADDED
@@ -0,0 +1 @@
+ffmpeg
requirements.txt CHANGED
@@ -15,7 +15,7 @@ email_validator==2.2.0
 exceptiongroup==1.2.2
 fastapi==0.111.0
 fastapi-cli==0.0.4
-ffmpeg==1.4
+ffmpeg-python==0.2.0
 ffmpy==0.3.2
 filelock==3.15.4
 fonttools==4.53.1
@@ -104,3 +104,4 @@ wcwidth==0.2.13
 websockets==11.0.3
 Werkzeug==3.0.3
 zipp==3.19.2
+decord