PeiqingYang committed on
Commit 2f89268 • 1 Parent(s): 5c532b1

add highlight guidance, set memory limit

web-demos/hugging_face/app.py CHANGED
@@ -71,56 +71,65 @@ def get_frames_from_video(video_input, video_state):
     video_path = video_input
     frames = []
     user_name = time.time()
-    operation_log = [("",""),("Video uploaded! Try to click the image shown in step2 to add masks.","Normal")]
+    status_ok = True
+    operation_log = [("[Must Do]", "Click image"), (": Video uploaded! Try to click the image shown in step2 to add masks.\n", None)]
     try:
         cap = cv2.VideoCapture(video_path)
         fps = cap.get(cv2.CAP_PROP_FPS)
-        while cap.isOpened():
+        length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+        if length >= 500:
+            operation_log = [("You uploaded a video with more than 500 frames. Stop the video extraction. Kindly lower the video frame rate to a value below 500. We highly recommend deploying the demo locally for long video processing.", "Error")]
             ret, frame = cap.read()
             if ret == True:
-                current_memory_usage = psutil.virtual_memory().percent
+                original_h, original_w = frame.shape[:2]
+                scale_factor = min(1, 1280/max(original_h, original_w))
+                target_h, target_w = int(original_h*scale_factor), int(original_w*scale_factor)
                 frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
-                # if current_memory_usage > 90:
-                #     operation_log = [("Memory usage is too high (>90%). Stop the video extraction. Please reduce the video resolution or frame rate.", "Error")]
-                #     print("Memory usage is too high (>90%). Please reduce the video resolution or frame rate.")
-                #     break
-            else:
-                break
-
-        # TODO: hard code to avoid out of memory
-        t, h, w = len(frames), frames[0].shape[0], frames[0].shape[1]
-        print(f'Inp video shape: t_{t}, s_{h}x_{w}')
-        if len(frames) > 150 and max(frames[0].shape) > 1024:
-            raise ValueError('Due to GPU memory constraints, the current version of this demo supports videos \
-                with a maximum length of 150 and a maximum resolution of 1024. \
-                We will continue to optimize it after the CVPR 2024 deadline. \
-                Please stay tuned!')
-
+            status_ok = False
+        else:
+            while cap.isOpened():
+                ret, frame = cap.read()
+                if ret == True:
+                    # resize input image
+                    original_h, original_w = frame.shape[:2]
+                    scale_factor = min(1, 1280/max(original_h, original_w))
+                    target_h, target_w = int(original_h*scale_factor), int(original_w*scale_factor)
+                    frame = cv2.resize(frame, (target_w, target_h))
+                    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+                else:
+                    break
+        t = len(frames)
+        print(f'Inp video shape: t_{t}, s_{original_h}x{original_w} to s_{target_h}x{target_w}')
     except (OSError, TypeError, ValueError, KeyError, SyntaxError) as e:
+        status_ok = False
         print("read_frame_source:{} error. {}\n".format(video_path, str(e)))
-    image_size = (frames[0].shape[0],frames[0].shape[1])
+
     # initialize video_state
+    if frames[0].shape[0] > 720 or frames[0].shape[1] > 720:
+        operation_log = [(f"Video uploaded! Try to click the image shown in step2 to add masks. (You uploaded a video with a size of {original_w}x{original_h}, and the length of its longest edge exceeds 720 pixels. We may resize the input video during processing.)", "Normal")]
+
     video_state = {
         "user_name": user_name,
         "video_name": os.path.split(video_path)[-1],
         "origin_images": frames,
         "painted_images": frames.copy(),
-        "masks": [np.zeros((frames[0].shape[0],frames[0].shape[1]), np.uint8)]*len(frames),
+        "masks": [np.zeros((target_h, target_w), np.uint8)]*len(frames),
        "logits": [None]*len(frames),
         "select_frame_number": 0,
         "fps": fps
     }
-    video_info = "Video Name: {},\nFPS: {},\nTotal Frames: {},\nImage Size:{}".format(video_state["video_name"], round(video_state["fps"], 0), len(frames), image_size)
+    video_info = "Video Name: {},\nFPS: {},\nTotal Frames: {},\nImage Size:{}".format(video_state["video_name"], round(video_state["fps"], 0), length, (original_w, original_h))
     model.samcontroler.sam_controler.reset_image()
     model.samcontroler.sam_controler.set_image(video_state["origin_images"][0])
-    return video_state, video_info, video_state["origin_images"][0], gr.update(visible=True, maximum=len(frames), value=1), gr.update(visible=True, maximum=len(frames), value=len(frames)), \
-                        gr.update(visible=True), gr.update(visible=True), \
-                        gr.update(visible=True), gr.update(visible=True),\
-                        gr.update(visible=True), gr.update(visible=True), \
-                        gr.update(visible=True), gr.update(visible=True), \
-                        gr.update(visible=True), gr.update(visible=True), \
-                        gr.update(visible=True), gr.update(visible=True, choices=[], value=[]), \
-                        gr.update(visible=True, value=operation_log), gr.update(visible=True, value=operation_log)
+    return video_state, video_info, video_state["origin_images"][0], gr.update(visible=status_ok, maximum=len(frames), value=1), gr.update(visible=status_ok, maximum=len(frames), value=len(frames)), \
+                        gr.update(visible=status_ok), gr.update(visible=status_ok), \
+                        gr.update(visible=status_ok), gr.update(visible=status_ok),\
+                        gr.update(visible=status_ok), gr.update(visible=status_ok), \
+                        gr.update(visible=status_ok), gr.update(visible=status_ok), \
+                        gr.update(visible=status_ok), gr.update(visible=status_ok), \
+                        gr.update(visible=status_ok), gr.update(visible=status_ok, choices=[], value=[]), \
+                        gr.update(visible=True, value=operation_log), gr.update(visible=status_ok, value=operation_log)
 
 # get the select frame from gradio slider
 def select_template(image_selection_slider, video_state, interactive_state, mask_dropdown):
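
The hunk above swaps the old psutil-based memory check for two up-front guards: videos with 500 or more frames are rejected (via CAP_PROP_FRAME_COUNT), and each decoded frame is downscaled so its longest edge is at most 1280 px. A minimal standalone sketch of the same guard, assuming only OpenCV is installed; the helper name extract_frames, its signature, and the raised ValueError are illustrative rather than part of the demo's API:

    import cv2

    def extract_frames(video_path, max_frames=500, max_edge=1280):
        """Read a video as RGB frames, refusing long clips and shrinking large ones."""
        cap = cv2.VideoCapture(video_path)
        length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if length >= max_frames:
            cap.release()
            raise ValueError(f"{length} frames exceeds the {max_frames}-frame limit; "
                             "deploy the demo locally for long videos.")
        frames = []
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            h, w = frame.shape[:2]
            scale = min(1.0, max_edge / max(h, w))  # never upscale
            if scale < 1.0:
                frame = cv2.resize(frame, (int(w * scale), int(h * scale)))
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        cap.release()
        return frames

Unlike this sketch, the committed code still reads one frame in the over-length case so the UI can report the original resolution, and it records status_ok to hide the downstream controls rather than raising.
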
@@ -175,7 +184,10 @@ def sam_refine(video_state, point_prompt, click_state, interactive_state, evt:gr
     video_state["logits"][video_state["select_frame_number"]] = logit
     video_state["painted_images"][video_state["select_frame_number"]] = painted_image
 
-    operation_log = [("",""), ("You can try to add positive or negative points by clicking, click Clear clicks button to refresh the image, click Add mask button when you are satisfied with the segment, or click Remove mask button to remove all added masks.","Normal")]
+    operation_log = [("[Must Do]", "Add mask"), (": add the current displayed mask for video segmentation.\n", None),
+                     ("[Optional]", "Remove mask"), (": remove all added masks.\n", None),
+                     ("[Optional]", "Clear clicks"), (": clear current displayed mask.\n", None),
+                     ("[Optional]", "Click image"), (": Try to click the image shown in step2 if you want to generate more masks.\n", None)]
     return painted_image, video_state, interactive_state, operation_log, operation_log
 
 def add_multi_mask(video_state, interactive_state, mask_dropdown):
@@ -326,7 +338,7 @@ def generate_video_from_frames(frames, output_path, fps=30):
     return output_path
 
 def restart():
-    operation_log = [("",""), ("Try to upload your video and click the Get video info button to get started!", "Normal")]
+    operation_log = [("",""), ("Try to upload your video and click the Get video info button to get started! (Kindly ensure that the uploaded video consists of fewer than 500 frames in total)", "Normal")]
     return {
         "user_name": "",
         "video_name": "",
@@ -423,6 +435,7 @@ span.svelte-s1r2yt {font-size: 17px !important; font-weight: bold !important; co
 button {border-radius: 8px !important;}
 .add_button {background-color: #4CAF50 !important;}
 .remove_button {background-color: #f44336 !important;}
+.clear_button {background-color: gray !important;}
 .mask_button_group {gap: 10px !important;}
 .video {height: 300px !important;}
 .image {height: 300px !important;}
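
The new .clear_button rule pairs with the elem_classes="clear_button" wiring added further down in the Gradio layout. A minimal sketch of that pattern, assuming a recent Gradio release; the three-button demo below is illustrative, not the app's actual layout:

    import gradio as gr

    css = """
    .add_button {background-color: #4CAF50 !important;}
    .remove_button {background-color: #f44336 !important;}
    .clear_button {background-color: gray !important;}
    """

    with gr.Blocks(css=css) as demo:
        add_mask_button = gr.Button(value="Add mask", elem_classes="add_button")
        remove_mask_button = gr.Button(value="Remove mask", elem_classes="remove_button")
        clear_button_click = gr.Button(value="Clear clicks", elem_classes="clear_button")

    demo.launch()
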
@@ -512,7 +525,8 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=css) as iface:
                 video_input = gr.Video(elem_classes="video")
                 extract_frames_button = gr.Button(value="Get video info", interactive=True, variant="primary")
             with gr.Column(scale=2):
-                run_status = gr.HighlightedText(value=[("",""), ("Try to upload your video and click the Get svideo info button to get started!", "Normal")])
+                run_status = gr.HighlightedText(value=[("",""), ("Try to upload your video and click the Get video info button to get started! (Kindly ensure that the uploaded video consists of fewer than 500 frames in total)", "Normal")],
+                                                color_map={"Normal": "green", "Error": "red", "Clear clicks": "gray", "Add mask": "green", "Remove mask": "red"})
                 video_info = gr.Textbox(label="Video Info")
 
 
@@ -524,12 +538,10 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=css) as iface:
                 image_selection_slider = gr.Slider(minimum=1, maximum=100, step=1, value=1, label="Track start frame", visible=False)
                 track_pause_number_slider = gr.Slider(minimum=1, maximum=100, step=1, value=1, label="Track end frame", visible=False)
             with gr.Column(scale=2, elem_classes="jc_center"):
-                run_status2 = gr.HighlightedText(value=[("",""), ("Try to upload your video and click the Get svideo info button to get started!", "Normal")], visible=False)
-                with gr.Row():
-                    with gr.Column(scale=2, elem_classes="mask_button_group"):
-                        clear_button_click = gr.Button(value="Clear clicks", interactive=True, visible=False)
-                        remove_mask_button = gr.Button(value="Remove mask", interactive=True, visible=False, elem_classes="remove_button")
-                        Add_mask_button = gr.Button(value="Add mask", interactive=True, visible=False, elem_classes="add_button")
+                run_status2 = gr.HighlightedText(value=[("",""), ("Try to upload your video and click the Get video info button to get started! (Kindly ensure that the uploaded video consists of fewer than 500 frames in total)", "Normal")],
+                                                 color_map={"Normal": "green", "Error": "red", "Clear clicks": "gray", "Add mask": "green", "Remove mask": "red"},
+                                                 visible=False)
+                with gr.Column():
                 point_prompt = gr.Radio(
                     choices=["Positive", "Negative"],
                     value="Positive",
@@ -537,7 +549,11 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=css) as iface:
                     interactive=True,
                     visible=False,
                     min_width=100,
-                    scale=1)
+                    scale=1,)
+                with gr.Row(scale=2, elem_classes="mask_button_group"):
+                    Add_mask_button = gr.Button(value="Add mask", interactive=True, visible=False, elem_classes="add_button")
+                    remove_mask_button = gr.Button(value="Remove mask", interactive=True, visible=False, elem_classes="remove_button")
+                    clear_button_click = gr.Button(value="Clear clicks", interactive=True, visible=False, elem_classes="clear_button")
                 mask_dropdown = gr.Dropdown(multiselect=True, value=[], label="Mask selection", info=".", visible=False)
 
                 # output video
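
The "highlight guidance" in the commit title is implemented by feeding operation_log, a list of (text, category) tuples, into gr.HighlightedText components whose color_map keys match those categories. A minimal sketch of just that piece, assuming a recent Gradio release; the component values here are illustrative:

    import gradio as gr

    # (text, category) pairs; a category of None renders as plain text
    operation_log = [
        ("[Must Do]", "Add mask"), (": add the current displayed mask for video segmentation.\n", None),
        ("[Optional]", "Clear clicks"), (": clear current displayed mask.\n", None),
    ]

    with gr.Blocks() as demo:
        run_status = gr.HighlightedText(
            value=operation_log,
            color_map={"Normal": "green", "Error": "red", "Clear clicks": "gray",
                       "Add mask": "green", "Remove mask": "red"},
        )

    demo.launch()
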
web-demos/hugging_face/inpainter/base_inpainter.py CHANGED
@@ -205,7 +205,7 @@ class ProInpainter:
         # The ouput size should be divided by 2 so that it can encoded by libx264
         size = (int(ratio*size[0])//2*2, int(ratio*size[1])//2*2)
 
-        # TODO: hard code to reduce memory
+        # set propainter size limit to 720 to reduce memory usage
         if max(size[0], size[1]) > 720:
             scale = 720.0 / max(size[0], size[1])
             # The ouput size should be divided by 2 so that it can encoded by libx264
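
The reworded comment describes behaviour that was already in place: ProPainter's working resolution is capped at 720 px on the longest edge, and both dimensions are rounded down to even numbers so libx264 can encode the result. A small sketch of that size rule, with illustrative names:

    def limit_size(width, height, max_edge=720):
        """Clamp the longest edge to max_edge and keep both sides even for H.264."""
        scale = min(1.0, max_edge / max(width, height))
        return int(width * scale) // 2 * 2, int(height * scale) // 2 * 2

    print(limit_size(1920, 1080))  # -> (720, 404)
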
web-demos/hugging_face/track_anything.py CHANGED
@@ -17,7 +17,7 @@ class TrackingAnything():
         mask, logit, painted_image = self.samcontroler.first_frame_click(image, points, labels, multimask)
         return mask, logit, painted_image
 
-    def generator(self, images: list, template_mask:np.ndarray):
+    def generator(self, images: list, template_mask:np.ndarray, size_limit=1024):
         masks = []
         logits = []
         painted_images = []
@@ -31,7 +31,7 @@ class TrackingAnything():
             mask, logit, painted_image = self.cutie.track(images[i])
             masks.append(mask)
             logits.append(logit)
-            painted_images.append(painted_image)
+            painted_images.append(painted_image)
         return masks, logits, painted_images
 
 