atwang committed
Commit 6d737eb
Parent: 5ceacf4

local app demo is working

README.md CHANGED
@@ -11,3 +11,43 @@ license: mit
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+# Installation
+
+To set up the environment, run the following (recommended inside a virtual environment):
+```
+# install base requirements
+pip install -r pre-requirements.txt
+pip install -r requirements.txt
+
+cd mask2former/modeling/pixel_decoder/ops
+python setup.py build install
+
+# Option A: running locally only
+pip install open3d==0.17.0
+
+# Option B: running over an ssh connection / headless environment
+# in a separate folder
+git clone https://github.com/isl-org/Open3D.git
+cd Open3D/
+mkdir build && cd build
+cmake -DENABLE_HEADLESS_RENDERING=ON -DBUILD_GUI=OFF -DBUILD_WEBRTC=OFF -DUSE_SYSTEM_GLEW=OFF -DUSE_SYSTEM_GLFW=OFF ..
+make -j$(nproc)
+make install-pip-package
+# to test the custom build
+cd ../examples/python/visualization/
+python headless_rendering.py
+```
+
+Splitting the dependencies between pre-requirements.txt and requirements.txt resolves the issue that certain
+packages must be installed before others. By default, most additional packages should be added to requirements.txt.
+
+## Usage
+To start up the application locally, run
+```
+gradio app.py
+```
+
+You can view the app on the specified port (usually 7860). To run over an ssh connection, set up port forwarding by
+passing `-L 7860:localhost:7860` when you open your ssh connection. Note that you will need to build Open3D with
+headless rendering support for this to work.
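
As a quick sanity check of the Option B build, a minimal sketch along the lines of the offscreen `Visualizer` pattern used in this repo's visualization.py can be run after installation; the sphere geometry and output filename here are illustrative stand-ins, not part of the repo:

```
# hypothetical smoke test for a headless Open3D build
import open3d as o3d

mesh = o3d.geometry.TriangleMesh.create_sphere()  # placeholder geometry
mesh.compute_vertex_normals()

vis = o3d.visualization.Visualizer()
vis.create_window(visible=False)  # offscreen window; assumes headless rendering support
vis.add_geometry(mesh)
vis.capture_screen_image("smoke_test.png", do_render=True)  # illustrative output path
vis.destroy_window()
```
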
app.py CHANGED
@@ -14,7 +14,7 @@ from inference import main, setup_cfg
 
 # internal settings
 NUM_PROCESSES = 1
-CROP = True
+CROP = False
 SCORE_THRESHOLD = 0.8
 MAX_PARTS = 5
 ARGS = SimpleNamespace(
@@ -24,6 +24,7 @@ ARGS = SimpleNamespace(
     output=".output",
     cpu=True,
 )
+NUM_SAMPLES = 10
 
 outputs = []
 
@@ -52,16 +53,6 @@ def predict(rgb_image: str, depth_image: str, intrinsics: np.ndarray, num_samples
                 images[file].append(os.path.join(sub_path, image_file))
         return images
 
-    def get_generator(images):
-        def gen():
-            while True:
-                for im in images:
-                    time.sleep(0.025)
-                    yield im
-                time.sleep(3)
-
-        return gen
-
     # clear old predictions
     for path in os.listdir(ARGS.output):
         full_path = os.path.join(ARGS.output, path)
@@ -89,15 +80,32 @@ def predict(rgb_image: str, depth_image: str, intrinsics: np.ndarray, num_samples
     # process output
     # TODO: may want to select these in decreasing order of score
    image_files = find_images(ARGS.output)
-    output = []
+    outputs = []
     for count, part in enumerate(image_files):
         if count < MAX_PARTS:
-            # output.append(gr.update(value=get_generator([Image.open(im) for im in image_files[part]]), visible=True))
-            output.append(get_generator([Image.open(im) for im in image_files[part]]))
-    # while len(output) < MAX_PARTS:
-    #     output.append(gr.update(visible=False))
+            outputs.append([Image.open(im) for im in image_files[part]])
+
+    return [
+        *[gr.update(value=out[0], visible=True) for out in outputs],
+        *[gr.update(visible=False) for _ in range(MAX_PARTS - len(outputs))],
+    ]
+
+
+def get_trigger(idx: int, fps: int = 40, oscillate: bool = True):
+    def iter_images(*args, **kwargs):
+        if idx < len(outputs):
+            for im in outputs[idx]:
+                time.sleep(1.0 / fps)
+                yield im
+            if oscillate:
+                for im in reversed(outputs[idx]):
+                    time.sleep(1.0 / fps)
+                    yield im
 
-    yield from output[0]()
+        else:
+            raise ValueError("Could not find any images to load into this module.")
+
+    return iter_images
 
 
 with gr.Blocks() as demo:
@@ -145,7 +153,7 @@ with gr.Blocks() as demo:
             interactive=True,
         )
         num_samples = gr.Number(
-            value=10,
+            value=NUM_SAMPLES,
            label="Number of samples",
            show_label=True,
            interactive=True,
@@ -154,16 +162,28 @@ with gr.Blocks() as demo:
            maximum=20,
        )
 
+    examples = gr.Examples(
+        examples=[
+            ["examples/59-4860.png", "examples/59-4860_d.png"],
+            ["examples/174-8460.png", "examples/174-8460_d.png"],
+            ["examples/187-0.png", "examples/187-0_d.png"],
+            ["examples/187-23040.png", "examples/187-23040_d.png"],
+        ],
+        inputs=[rgb_image, depth_image],
+        api_name=False,
+        examples_per_page=2,
+    )
+
     submit_btn = gr.Button("Run model")
 
     # TODO: do we want to set a maximum limit on how many parts we render? We could also show the number of components
     # identified.
-    # images = [gr.Image(type="pil", label=f"Part {idx + 1}", visible=False) for idx in range(MAX_PARTS)]
-    image = gr.Image(type="pil", visible=True)
+    images = [gr.Image(type="pil", label=f"Part {idx + 1}", visible=False) for idx in range(MAX_PARTS)]
+    for idx, image_comp in enumerate(images):
+        image_comp.select(get_trigger(idx), inputs=[], outputs=image_comp, api_name=False)
 
-    # TODO: maybe need to use a queue here so we don't overload the instance
     submit_btn.click(
-        fn=predict, inputs=[rgb_image, depth_image, intrinsics, num_samples], outputs=image, api_name="run_model"
+        fn=predict, inputs=[rgb_image, depth_image, intrinsics, num_samples], outputs=images, api_name=False
     )
 
 demo.queue(api_open=False)
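
Two Gradio idioms carry this change: `predict` now returns a list of `gr.update(...)` values to reveal or hide the per-part image components, and each image's `select` event streams frames from a generator. A self-contained sketch of both patterns, assuming `gradio==3.44.3` as pinned in requirements.txt (the solid-color frames are illustrative stand-ins):

```
# minimal sketch of the update/streaming pattern, assuming gradio==3.44.3
import time

import gradio as gr
from PIL import Image

# illustrative stand-ins for the rendered per-part frames
frames = [Image.new("RGB", (64, 64), (25 * i, 0, 0)) for i in range(10)]

def animate():
    # each yielded value replaces the image content, producing an animation
    for im in frames + frames[::-1]:  # oscillate forward, then backward
        time.sleep(1.0 / 40)
        yield im

with gr.Blocks() as demo:
    image = gr.Image(type="pil", label="Part 1", visible=False)
    show_btn = gr.Button("Show")
    # a gr.update(...) return value changes component properties in place
    show_btn.click(lambda: gr.update(value=frames[0], visible=True), outputs=image)
    image.select(animate, inputs=[], outputs=image)  # click the image to animate it

demo.queue()  # generator outputs are only streamed when the queue is enabled
demo.launch()
```
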
dev-requirements.txt DELETED
@@ -1,3 +0,0 @@
-black==23.9.1
-gradio==3.44.3
-huggingface-hub==0.17.2
examples/174-8460.png ADDED
examples/174-8460_d.png ADDED
examples/187-0.png ADDED
examples/187-0_d.png ADDED
examples/187-23040.png ADDED
examples/187-23040_d.png ADDED
inference.py CHANGED
@@ -19,7 +19,6 @@ import argparse
 import logging
 import os
 import time
-from copy import deepcopy
 from typing import Any
 
 import imageio
@@ -34,13 +33,19 @@ from detectron2.projects.deeplab import add_deeplab_config
 from detectron2.structures import instances
 from detectron2.utils import comm
 from detectron2.utils.logger import setup_logger
-from PIL import Image, ImageChops
 
 from mask2former import (
     add_maskformer2_config,
     add_motionnet_config,
 )
 from utilities import prediction_to_json
+from visualization import (
+    draw_line,
+    generate_rotation_visualization,
+    generate_translation_visualization,
+    batch_trim,
+    create_gif,
+)
 
 # import based on torch version. Required for model loading. Code is taken from fvcore.common.checkpoint, in order to
 # replicate model loading without the overhead of setting up an OPDTrainer
@@ -63,9 +68,7 @@ TYPE_CLASSIFICATION = {
     1: "translation",
 }
 
-POINT_COLOR = [1, 0, 0]  # red for demonstration
 ARROW_COLOR = [0, 1, 0]  # green
-IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")
 
 
 def get_parser() -> argparse.ArgumentParser:
@@ -336,348 +339,6 @@ def predict(model: nn.Module, inp: list[dict[str, Any]]) -> list[dict[str, instances.Instances]]:
     return out
 
 
-def generate_rotation_visualization(
-    pcd: o3d.geometry.PointCloud,
-    axis_arrow: o3d.geometry.TriangleMesh,
-    mask: np.ndarray,
-    axis_vector: np.ndarray,
-    origin: np.ndarray,
-    range_min: float,
-    range_max: float,
-    num_samples: int,
-    output_dir: str,
-) -> None:
-    """
-    Generate visualization files for a rotation motion of a part.
-
-    :param pcd: point cloud object representing 2D image input (RGBD) as a point cloud
-    :param axis_arrow: mesh object representing axis arrow of rotation to be rendered in visualization
-    :param mask: mask np.array of dimensions (height, width) representing the part to be rotated in the image
-    :param axis_vector: np.array of dimensions (3, ) representing the vector of the axis of rotation
-    :param origin: np.array of dimensions (3, ) representing the origin point of the axis of rotation
-    :param range_min: float representing the minimum range of motion in radians
-    :param range_max: float representing the maximum range of motion in radians
-    :param num_samples: number of sample states to visualize in between range_min and range_max of motion
-    :param output_dir: string path to directory in which to save visualization output
-    """
-    angle_in_radians = np.linspace(range_min, range_max, num_samples)
-    angles_in_degrees = angle_in_radians * 180 / np.pi
-
-    for idx, angle_in_degrees in enumerate(angles_in_degrees):
-        # Make a copy of your original point cloud and arrow for each rotation
-        rotated_pcd = deepcopy(pcd)
-        rotated_arrow = deepcopy(axis_arrow)
-
-        angle_rad = np.radians(angle_in_degrees)
-        rotated_pcd = rotate_part(rotated_pcd, mask, axis_vector, origin, angle_rad)
-
-        # Create a Visualizer object for each rotation
-        vis = o3d.visualization.Visualizer()
-        vis.create_window()
-
-        # Add the rotated geometries
-        vis.add_geometry(rotated_pcd)
-        vis.add_geometry(rotated_arrow)
-
-        # Apply the additional rotation around x-axis if desired
-        angle_x = np.pi * 5.5 / 5  # 198 degrees
-        rotation_matrix = o3d.geometry.get_rotation_matrix_from_axis_angle(np.asarray([1, 0, 0]) * angle_x)
-        rotated_pcd.rotate(rotation_matrix, center=rotated_pcd.get_center())
-        rotated_arrow.rotate(rotation_matrix, center=rotated_pcd.get_center())
-
-        # Capture and save the image
-        output_filename = f"{output_dir}/{idx}.png"
-        vis.capture_screen_image(output_filename, do_render=True)
-        vis.destroy_window()
-
-
-def generate_translation_visualization(
-    pcd: o3d.geometry.PointCloud,
-    axis_arrow: o3d.geometry.TriangleMesh,
-    mask: np.ndarray,
-    end: np.ndarray,
-    range_min: float,
-    range_max: float,
-    num_samples: int,
-    output_dir: str,
-) -> None:
-    """
-    Generate visualization files for a translation motion of a part.
-
-    :param pcd: point cloud object representing 2D image input (RGBD) as a point cloud
-    :param axis_arrow: mesh object representing axis arrow of translation to be rendered in visualization
-    :param mask: mask np.array of dimensions (height, width) representing the part to be translated in the image
-    :param axis_vector: np.array of dimensions (3, ) representing the vector of the axis of translation
-    :param origin: np.array of dimensions (3, ) representing the origin point of the axis of translation
-    :param range_min: float representing the minimum range of motion
-    :param range_max: float representing the maximum range of motion
-    :param num_samples: number of sample states to visualize in between range_min and range_max of motion
-    :param output_dir: string path to directory in which to save visualization output
-    """
-    translate_distances = np.linspace(range_min, range_max, num_samples)
-    for idx, translate_distance in enumerate(translate_distances):
-        translated_pcd = deepcopy(pcd)
-        translated_arrow = deepcopy(axis_arrow)
-
-        translated_pcd = translate_part(translated_pcd, mask, end, translate_distance.item())
-
-        # Create a Visualizer object for each rotation
-        vis = o3d.visualization.Visualizer()
-        vis.create_window()
-
-        # Add the translated geometries
-        vis.add_geometry(translated_pcd)
-        vis.add_geometry(translated_arrow)
-
-        # Apply the additional rotation around x-axis if desired
-        # TODO: not sure why we need this rotation for the translation, and when it would be desired
-        angle_x = np.pi * 5.5 / 5  # 198 degrees
-        R = o3d.geometry.get_rotation_matrix_from_axis_angle(np.asarray([1, 0, 0]) * angle_x)
-        translated_pcd.rotate(R, center=translated_pcd.get_center())
-        translated_arrow.rotate(R, center=translated_pcd.get_center())
-
-        # Capture and save the image
-        output_filename = f"{output_dir}/{idx}.png"
-        vis.capture_screen_image(output_filename, do_render=True)
-        vis.destroy_window()
-
-
-def get_rotation_matrix_from_vectors(vec1: np.ndarray, vec2: np.ndarray) -> np.ndarray:
-    """
-    Find the rotation matrix that aligns vec1 to vec2
-
-    :param vec1: A 3d "source" vector
-    :param vec2: A 3d "destination" vector
-    :return: A transform matrix (3x3) which when applied to vec1, aligns it with vec2.
-    """
-    a, b = (vec1 / np.linalg.norm(vec1)).reshape(3), (vec2 / np.linalg.norm(vec2)).reshape(3)
-    v = np.cross(a, b)
-    c = np.dot(a, b)
-    s = np.linalg.norm(v)
-    kmat = np.array([[0, -v[2], v[1]], [v[2], 0, -v[0]], [-v[1], v[0], 0]])
-    rotation_matrix = np.eye(3) + kmat + kmat.dot(kmat) * ((1 - c) / (s**2))
-    return rotation_matrix
-
-
-def draw_line(start_point: np.ndarray, end_point: np.ndarray) -> o3d.geometry.TriangleMesh:
-    """
-    Generate 3D mesh representing axis from start_point to end_point.
-
-    :param start_point: np.ndarray of dimensions (3, ) representing the start point of the axis
-    :param end_point: np.ndarray of dimensions (3, ) representing the end point of the axis
-    :return: mesh object representing axis from start to end
-    """
-    # Compute direction vector and normalize it
-    direction_vector = end_point - start_point
-    normalized_vector = direction_vector / np.linalg.norm(direction_vector)
-
-    # Compute the rotation matrix to align the Z-axis with the desired direction
-    target_vector = np.array([0, 0, 1])
-    rot_mat = get_rotation_matrix_from_vectors(target_vector, normalized_vector)
-
-    # Create the cylinder (shaft of the arrow)
-    cylinder_length = 0.9  # 90% of the total arrow length, you can adjust as needed
-    cylinder_radius = 0.01  # Adjust the thickness of the arrow shaft
-    cylinder = o3d.geometry.TriangleMesh.create_cylinder(radius=cylinder_radius, height=cylinder_length)
-
-    # Move base of cylinder to origin, rotate, then translate to start_point
-    cylinder.translate([0, 0, 0])
-    cylinder.rotate(rot_mat, center=[0, 0, 0])
-    cylinder.translate(start_point)
-
-    # Create the cone (head of the arrow)
-    cone_height = 0.1  # 10% of the total arrow length, adjust as needed
-    cone_radius = 0.03  # Adjust the size of the arrowhead
-    cone = o3d.geometry.TriangleMesh.create_cone(radius=cone_radius, height=cone_height)
-
-    # Move base of cone to origin, rotate, then translate to end of cylinder
-    cone.translate([-0, 0, 0])
-    cone.rotate(rot_mat, center=[0, 0, 0])
-    cone.translate(start_point + normalized_vector * 0.4)
-
-    arrow = cylinder + cone
-    return arrow
-
-
-def rotate_part(
-    pcd: o3d.geometry.PointCloud, mask: np.ndarray, axis_vector: np.ndarray, origin: np.ndarray, angle_rad: float
-) -> o3d.geometry.PointCloud:
-    """
-    Generate rotated point cloud of mask based on provided angle around axis.
-
-    :param pcd: point cloud object representing points of image
-    :param mask: mask np.array of dimensions (height, width) representing the part to be rotated in the image
-    :param axis_vector: np.array of dimensions (3, ) representing the vector of the axis of rotation
-    :param origin: np.array of dimensions (3, ) representing the origin point of the axis of rotation
-    :param angle_rad: angle in radians to rotate mask part
-    :return: point cloud object after rotation of masked part
-    """
-    # Get the coordinates of the point cloud as a numpy array
-    points_np = np.asarray(pcd.points)
-
-    # Convert point cloud colors to numpy array for easier manipulation
-    colors_np = np.asarray(pcd.colors)
-
-    # Create skew-symmetric matrix from end
-    K = np.array(
-        [
-            [0, -axis_vector[2], axis_vector[1]],
-            [axis_vector[2], 0, -axis_vector[0]],
-            [-axis_vector[1], axis_vector[0], 0],
-        ]
-    )
-
-    # Compute rotation matrix using Rodrigues' formula
-    R = np.eye(3) + np.sin(angle_rad) * K + (1 - np.cos(angle_rad)) * np.dot(K, K)
-
-    # Iterate over the mask and rotate the points corresponding to the object pixels
-    for i in range(mask.shape[0]):
-        for j in range(mask.shape[1]):
-            if mask[i, j] > 0:  # This condition checks if the pixel belongs to the object
-                point_index = i * mask.shape[1] + j
-
-                # Translate the point such that the rotation origin is at the world origin
-                translated_point = points_np[point_index] - origin
-
-                # Rotate the translated point
-                rotated_point = np.dot(R, translated_point)
-
-                # Translate the point back
-                points_np[point_index] = rotated_point + origin
-
-                colors_np[point_index] = POINT_COLOR
-
-    # Update the point cloud's coordinates
-    pcd.points = o3d.utility.Vector3dVector(points_np)
-
-    # Update point cloud colors
-    pcd.colors = o3d.utility.Vector3dVector(colors_np)
-
-    return pcd
-
-
-def translate_part(pcd, mask, axis_vector, distance):
-    """
-    Generate translated point cloud of mask based on provided angle around axis.
-
-    :param pcd: point cloud object representing points of image
-    :param mask: mask np.array of dimensions (height, width) representing the part to be translated in the image
-    :param axis_vector: np.array of dimensions (3, ) representing the vector of the axis of translation
-    :param distance: distance within coordinate system to translate mask part
-    :return: point cloud object after translation of masked part
-    """
-    normalized_vector = axis_vector / np.linalg.norm(axis_vector)
-    translation_vector = normalized_vector * distance
-
-    # Convert point cloud colors to numpy array for easier manipulation
-    colors_np = np.asarray(pcd.colors)
-
-    # Get the coordinates of the point cloud as a numpy array
-    points_np = np.asarray(pcd.points)
-
-    # Iterate over the mask and assign the color to the points corresponding to the object pixels
-    for i in range(mask.shape[0]):
-        for j in range(mask.shape[1]):
-            if mask[i, j] > 0:  # This condition checks if the pixel belongs to the object
-                point_index = i * mask.shape[1] + j
-                colors_np[point_index] = POINT_COLOR
-                points_np[point_index] += translation_vector
-
-    # Update point cloud colors
-    pcd.colors = o3d.utility.Vector3dVector(colors_np)
-
-    # Update the point cloud's coordinates
-    pcd.points = o3d.utility.Vector3dVector(points_np)
-
-    return pcd
-
-
-def batch_trim(images_path: str, save_path: str, identical: bool = False) -> None:
-    """
-    Trim white spaces from all images in the given path and save new images to folder.
-
-    :param images_path: local path to folder containing all images. Images must have the extension ".png", ".jpg", or
-        ".jpeg".
-    :param save_path: local path to folder in which to save trimmed images
-    :param identical: if True, will apply same crop to all images, else each image will have its whitespace trimmed
-        independently. Note that in the latter case, each image may have a slightly different size.
-    """
-
-    def get_trim(im):
-        """Trim whitespace from an image and return the cropped image."""
-        bg = Image.new(im.mode, im.size, im.getpixel((0, 0)))
-        diff = ImageChops.difference(im, bg)
-        diff = ImageChops.add(diff, diff, 2.0, -100)
-        bbox = diff.getbbox()
-        return bbox
-
-    if identical:  #
-        images = []
-        optimal_box = None
-
-        # load all images
-        for image_file in sorted(os.listdir(images_path)):
-            if image_file.endswith(IMAGE_EXTENSIONS):
-                image_path = os.path.join(images_path, image_file)
-                images.append(Image.open(image_path))
-
-        # find optimal box size
-        for im in images:
-            bbox = get_trim(im)
-            if bbox is None:
-                bbox = (0, 0, im.size[0], im.size[1])  # bound entire image
-
-            if optimal_box is None:
-                optimal_box = bbox
-            else:
-                optimal_box = (
-                    min(optimal_box[0], bbox[0]),
-                    min(optimal_box[1], bbox[1]),
-                    max(optimal_box[2], bbox[2]),
-                    max(optimal_box[3], bbox[3]),
-                )
-
-        # apply cropping, if optimal box was found
-        for idx, im in enumerate(images):
-            im.crop(optimal_box)
-            im.save(os.path.join(save_path, f"{idx}.png"))
-            im.close()
-
-    else:  # trim each image separately
-        for image_file in os.listdir(images_path):
-            if image_file.endswith(IMAGE_EXTENSIONS):
-                image_path = os.path.join(images_path, image_file)
-                with Image.open(image_path) as im:
-                    bbox = get_trim(im)
-                    trimmed = im.crop(bbox) if bbox else im
-                    trimmed.save(os.path.join(save_path, image_file))
-
-
-def create_gif(image_folder_path: str, num_samples: int, gif_filename: str = "output.gif") -> None:
-    """
-    Create gif out of folder of images and save to file.
-
-    :param image_folder_path: path to folder containing images (non-recursive). Assumes images are named as {i}.png for
-        each of i from 0 to num_samples.
-    :param num_samples: number of sampled images to compile into gif.
-    :param gif_filename: filename for gif, defaults to "output.gif"
-    """
-    # Generate a list of image filenames (assuming the images are saved as 0.png, 1.png, etc.)
-    image_files = [f"{image_folder_path}/{i}.png" for i in range(num_samples)]
-
-    # Read the images using imageio
-    images = [imageio.imread(image_file) for image_file in image_files]
-    assert all(
-        images[0].shape == im.shape for im in images
-    ), f"Found some images with a different shape: {[im.shape for im in images]}"
-
-    # Save images as a gif
-    gif_output_path = f"{image_folder_path}/{gif_filename}"
-    imageio.mimsave(gif_output_path, images, duration=0.1)
-
-    return
-
-
 def main(
     cfg: CfgNode,
     rgb_image: str,
requirements.txt CHANGED
@@ -1,6 +1,5 @@
 h5py==3.9.0
 imageio==2.31.3
-open3d==0.17.0
 opencv-python==4.8.0.76
 pandas==2.1.0
 pycocotools==2.0.7
@@ -8,5 +7,8 @@ scikit-image==0.21.0
 scikit-learn==1.3.0
 scipy==1.11.2
 timm==0.9.7
+black==23.9.1
+gradio==3.44.3
+huggingface-hub==0.17.2
 detectron2 @ git+https://github.com/facebookresearch/detectron2.git@fc9c33b1f6e5d4c37bbb46dde19af41afc1ddb2a
 -e mask2former/modeling/pixel_decoder/ops/
visualization.py ADDED
@@ -0,0 +1,353 @@
+import os
+from copy import deepcopy
+
+import imageio
+import open3d as o3d
+import numpy as np
+from PIL import Image, ImageChops
+
+POINT_COLOR = [1, 0, 0]  # red for demonstration
+ARROW_COLOR = [0, 1, 0]  # green
+IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")
+
+
+def generate_rotation_visualization(
+    pcd: o3d.geometry.PointCloud,
+    axis_arrow: o3d.geometry.TriangleMesh,
+    mask: np.ndarray,
+    axis_vector: np.ndarray,
+    origin: np.ndarray,
+    range_min: float,
+    range_max: float,
+    num_samples: int,
+    output_dir: str,
+) -> None:
+    """
+    Generate visualization files for a rotation motion of a part.
+
+    :param pcd: point cloud object representing 2D image input (RGBD) as a point cloud
+    :param axis_arrow: mesh object representing axis arrow of rotation to be rendered in visualization
+    :param mask: mask np.array of dimensions (height, width) representing the part to be rotated in the image
+    :param axis_vector: np.array of dimensions (3, ) representing the vector of the axis of rotation
+    :param origin: np.array of dimensions (3, ) representing the origin point of the axis of rotation
+    :param range_min: float representing the minimum range of motion in radians
+    :param range_max: float representing the maximum range of motion in radians
+    :param num_samples: number of sample states to visualize in between range_min and range_max of motion
+    :param output_dir: string path to directory in which to save visualization output
+    """
+    angle_in_radians = np.linspace(range_min, range_max, num_samples)
+    angles_in_degrees = angle_in_radians * 180 / np.pi
+
+    for idx, angle_in_degrees in enumerate(angles_in_degrees):
+        # Make a copy of your original point cloud and arrow for each rotation
+        rotated_pcd = deepcopy(pcd)
+        rotated_arrow = deepcopy(axis_arrow)
+
+        angle_rad = np.radians(angle_in_degrees)
+        rotated_pcd = rotate_part(rotated_pcd, mask, axis_vector, origin, angle_rad)
+
+        # Create a Visualizer object for each rotation
+        vis = o3d.visualization.Visualizer()
+        vis.create_window(visible=False)
+
+        # Add the rotated geometries
+        vis.add_geometry(rotated_pcd)
+        vis.add_geometry(rotated_arrow)
+
+        # Apply the additional rotation around x-axis if desired
+        angle_x = np.pi * 5.5 / 5  # 198 degrees
+        rotation_matrix = o3d.geometry.get_rotation_matrix_from_axis_angle(np.asarray([1, 0, 0]) * angle_x)
+        rotated_pcd.rotate(rotation_matrix, center=rotated_pcd.get_center())
+        rotated_arrow.rotate(rotation_matrix, center=rotated_pcd.get_center())
+
+        # Capture and save the image
+        output_filename = f"{output_dir}/{idx}.png"
+        vis.capture_screen_image(output_filename, do_render=True)
+        vis.destroy_window()
+
+
+def generate_translation_visualization(
+    pcd: o3d.geometry.PointCloud,
+    axis_arrow: o3d.geometry.TriangleMesh,
+    mask: np.ndarray,
+    end: np.ndarray,
+    range_min: float,
+    range_max: float,
+    num_samples: int,
+    output_dir: str,
+) -> None:
+    """
+    Generate visualization files for a translation motion of a part.
+
+    :param pcd: point cloud object representing 2D image input (RGBD) as a point cloud
+    :param axis_arrow: mesh object representing axis arrow of translation to be rendered in visualization
+    :param mask: mask np.array of dimensions (height, width) representing the part to be translated in the image
+    :param end: np.array of dimensions (3, ) representing the direction vector of the axis along which the part
+        is translated
+    :param range_min: float representing the minimum range of motion
+    :param range_max: float representing the maximum range of motion
+    :param num_samples: number of sample states to visualize in between range_min and range_max of motion
+    :param output_dir: string path to directory in which to save visualization output
+    """
+    translate_distances = np.linspace(range_min, range_max, num_samples)
+    for idx, translate_distance in enumerate(translate_distances):
+        translated_pcd = deepcopy(pcd)
+        translated_arrow = deepcopy(axis_arrow)
+
+        translated_pcd = translate_part(translated_pcd, mask, end, translate_distance.item())
+
+        # Create a Visualizer object for each translation
+        vis = o3d.visualization.Visualizer()
+        vis.create_window(visible=False)
+
+        # Add the translated geometries
+        vis.add_geometry(translated_pcd)
+        vis.add_geometry(translated_arrow)
+
+        # Apply the additional rotation around x-axis if desired
+        # TODO: not sure why we need this rotation for the translation, and when it would be desired
+        angle_x = np.pi * 5.5 / 5  # 198 degrees
+        R = o3d.geometry.get_rotation_matrix_from_axis_angle(np.asarray([1, 0, 0]) * angle_x)
+        translated_pcd.rotate(R, center=translated_pcd.get_center())
+        translated_arrow.rotate(R, center=translated_pcd.get_center())
+
+        # Capture and save the image
+        output_filename = f"{output_dir}/{idx}.png"
+        vis.capture_screen_image(output_filename, do_render=True)
+        vis.destroy_window()
+
+
+def get_rotation_matrix_from_vectors(vec1: np.ndarray, vec2: np.ndarray) -> np.ndarray:
+    """
+    Find the rotation matrix that aligns vec1 to vec2
+
+    :param vec1: A 3d "source" vector
+    :param vec2: A 3d "destination" vector
+    :return: A transform matrix (3x3) which when applied to vec1, aligns it with vec2.
+    """
+    a, b = (vec1 / np.linalg.norm(vec1)).reshape(3), (vec2 / np.linalg.norm(vec2)).reshape(3)
+    v = np.cross(a, b)
+    c = np.dot(a, b)
+    s = np.linalg.norm(v)
+    kmat = np.array([[0, -v[2], v[1]], [v[2], 0, -v[0]], [-v[1], v[0], 0]])
+    rotation_matrix = np.eye(3) + kmat + kmat.dot(kmat) * ((1 - c) / (s**2))
+    return rotation_matrix
+
+
+def draw_line(start_point: np.ndarray, end_point: np.ndarray) -> o3d.geometry.TriangleMesh:
+    """
+    Generate 3D mesh representing axis from start_point to end_point.
+
+    :param start_point: np.ndarray of dimensions (3, ) representing the start point of the axis
+    :param end_point: np.ndarray of dimensions (3, ) representing the end point of the axis
+    :return: mesh object representing axis from start to end
+    """
+    # Compute direction vector and normalize it
+    direction_vector = end_point - start_point
+    normalized_vector = direction_vector / np.linalg.norm(direction_vector)
+
+    # Compute the rotation matrix to align the Z-axis with the desired direction
+    target_vector = np.array([0, 0, 1])
+    rot_mat = get_rotation_matrix_from_vectors(target_vector, normalized_vector)
+
+    # Create the cylinder (shaft of the arrow)
+    cylinder_length = 0.9  # 90% of the total arrow length, you can adjust as needed
+    cylinder_radius = 0.01  # Adjust the thickness of the arrow shaft
+    cylinder = o3d.geometry.TriangleMesh.create_cylinder(radius=cylinder_radius, height=cylinder_length)
+
+    # Move base of cylinder to origin, rotate, then translate to start_point
+    cylinder.translate([0, 0, 0])
+    cylinder.rotate(rot_mat, center=[0, 0, 0])
+    cylinder.translate(start_point)
+
+    # Create the cone (head of the arrow)
+    cone_height = 0.1  # 10% of the total arrow length, adjust as needed
+    cone_radius = 0.03  # Adjust the size of the arrowhead
+    cone = o3d.geometry.TriangleMesh.create_cone(radius=cone_radius, height=cone_height)
+
+    # Move base of cone to origin, rotate, then translate to end of cylinder
+    cone.translate([0, 0, 0])
+    cone.rotate(rot_mat, center=[0, 0, 0])
+    cone.translate(start_point + normalized_vector * 0.4)
+
+    arrow = cylinder + cone
+    return arrow
+
+
+def rotate_part(
+    pcd: o3d.geometry.PointCloud, mask: np.ndarray, axis_vector: np.ndarray, origin: np.ndarray, angle_rad: float
+) -> o3d.geometry.PointCloud:
+    """
+    Generate rotated point cloud of mask based on provided angle around axis.
+
+    :param pcd: point cloud object representing points of image
+    :param mask: mask np.array of dimensions (height, width) representing the part to be rotated in the image
+    :param axis_vector: np.array of dimensions (3, ) representing the vector of the axis of rotation
+    :param origin: np.array of dimensions (3, ) representing the origin point of the axis of rotation
+    :param angle_rad: angle in radians to rotate mask part
+    :return: point cloud object after rotation of masked part
+    """
+    # Get the coordinates of the point cloud as a numpy array
+    points_np = np.asarray(pcd.points)
+
+    # Convert point cloud colors to numpy array for easier manipulation
+    colors_np = np.asarray(pcd.colors)
+
+    # Create skew-symmetric matrix from the axis vector
+    K = np.array(
+        [
+            [0, -axis_vector[2], axis_vector[1]],
+            [axis_vector[2], 0, -axis_vector[0]],
+            [-axis_vector[1], axis_vector[0], 0],
+        ]
+    )
+
+    # Compute rotation matrix using Rodrigues' formula
+    R = np.eye(3) + np.sin(angle_rad) * K + (1 - np.cos(angle_rad)) * np.dot(K, K)
+
+    # Iterate over the mask and rotate the points corresponding to the object pixels
+    for i in range(mask.shape[0]):
+        for j in range(mask.shape[1]):
+            if mask[i, j] > 0:  # This condition checks if the pixel belongs to the object
+                point_index = i * mask.shape[1] + j
+
+                # Translate the point such that the rotation origin is at the world origin
+                translated_point = points_np[point_index] - origin
+
+                # Rotate the translated point
+                rotated_point = np.dot(R, translated_point)
+
+                # Translate the point back
+                points_np[point_index] = rotated_point + origin
+
+                colors_np[point_index] = POINT_COLOR
+
+    # Update the point cloud's coordinates
+    pcd.points = o3d.utility.Vector3dVector(points_np)
+
+    # Update point cloud colors
+    pcd.colors = o3d.utility.Vector3dVector(colors_np)
+
+    return pcd
+
+
+def translate_part(pcd, mask, axis_vector, distance):
+    """
+    Generate translated point cloud of mask based on provided distance along axis.
+
+    :param pcd: point cloud object representing points of image
+    :param mask: mask np.array of dimensions (height, width) representing the part to be translated in the image
+    :param axis_vector: np.array of dimensions (3, ) representing the vector of the axis of translation
+    :param distance: distance within coordinate system to translate mask part
+    :return: point cloud object after translation of masked part
+    """
+    normalized_vector = axis_vector / np.linalg.norm(axis_vector)
+    translation_vector = normalized_vector * distance
+
+    # Convert point cloud colors to numpy array for easier manipulation
+    colors_np = np.asarray(pcd.colors)
+
+    # Get the coordinates of the point cloud as a numpy array
+    points_np = np.asarray(pcd.points)
+
+    # Iterate over the mask and assign the color to the points corresponding to the object pixels
+    for i in range(mask.shape[0]):
+        for j in range(mask.shape[1]):
+            if mask[i, j] > 0:  # This condition checks if the pixel belongs to the object
+                point_index = i * mask.shape[1] + j
+                colors_np[point_index] = POINT_COLOR
+                points_np[point_index] += translation_vector
+
+    # Update point cloud colors
+    pcd.colors = o3d.utility.Vector3dVector(colors_np)
+
+    # Update the point cloud's coordinates
+    pcd.points = o3d.utility.Vector3dVector(points_np)
+
+    return pcd
+
+
+def batch_trim(images_path: str, save_path: str, identical: bool = False) -> None:
+    """
+    Trim white spaces from all images in the given path and save new images to folder.
+
+    :param images_path: local path to folder containing all images. Images must have the extension ".png", ".jpg", or
+        ".jpeg".
+    :param save_path: local path to folder in which to save trimmed images
+    :param identical: if True, will apply same crop to all images, else each image will have its whitespace trimmed
+        independently. Note that in the latter case, each image may have a slightly different size.
+    """
+
+    def get_trim(im):
+        """Compute the bounding box of the non-whitespace content of an image."""
+        bg = Image.new(im.mode, im.size, im.getpixel((0, 0)))
+        diff = ImageChops.difference(im, bg)
+        diff = ImageChops.add(diff, diff, 2.0, -100)
+        bbox = diff.getbbox()
+        return bbox
+
+    if identical:  # apply the same crop to every image
+        images = []
+        optimal_box = None
+
+        # load all images
+        for image_file in sorted(os.listdir(images_path)):
+            if image_file.endswith(IMAGE_EXTENSIONS):
+                image_path = os.path.join(images_path, image_file)
+                images.append(Image.open(image_path))
+
+        # find optimal box size
+        for im in images:
+            bbox = get_trim(im)
+            if bbox is None:
+                bbox = (0, 0, im.size[0], im.size[1])  # bound entire image
+
+            if optimal_box is None:
+                optimal_box = bbox
+            else:
+                optimal_box = (
+                    min(optimal_box[0], bbox[0]),
+                    min(optimal_box[1], bbox[1]),
+                    max(optimal_box[2], bbox[2]),
+                    max(optimal_box[3], bbox[3]),
+                )
+
+        # apply cropping, if optimal box was found
+        for idx, im in enumerate(images):
+            im = im.crop(optimal_box)  # crop returns a new image; reassign so the trimmed version is saved
+            im.save(os.path.join(save_path, f"{idx}.png"))
+            im.close()
+
+    else:  # trim each image separately
+        for image_file in os.listdir(images_path):
+            if image_file.endswith(IMAGE_EXTENSIONS):
+                image_path = os.path.join(images_path, image_file)
+                with Image.open(image_path) as im:
+                    bbox = get_trim(im)
+                    trimmed = im.crop(bbox) if bbox else im
+                    trimmed.save(os.path.join(save_path, image_file))
+
+
+def create_gif(image_folder_path: str, num_samples: int, gif_filename: str = "output.gif") -> None:
+    """
+    Create gif out of folder of images and save to file.
+
+    :param image_folder_path: path to folder containing images (non-recursive). Assumes images are named as {i}.png for
+        each i from 0 to num_samples - 1.
+    :param num_samples: number of sampled images to compile into gif.
+    :param gif_filename: filename for gif, defaults to "output.gif"
+    """
+    # Generate a list of image filenames (assuming the images are saved as 0.png, 1.png, etc.)
+    image_files = [f"{image_folder_path}/{i}.png" for i in range(num_samples)]
+
+    # Read the images using imageio
+    images = [imageio.imread(image_file) for image_file in image_files]
+    assert all(
+        images[0].shape == im.shape for im in images
+    ), f"Found some images with a different shape: {[im.shape for im in images]}"
+
+    # Save images as a gif
+    gif_output_path = f"{image_folder_path}/{gif_filename}"
+    imageio.mimsave(gif_output_path, images, duration=0.1)
+
+    return
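
The heart of `rotate_part` is Rodrigues' rotation formula, R = I + sin(theta) * K + (1 - cos(theta)) * K^2, where K is the skew-symmetric cross-product matrix of the rotation axis. A standalone numerical check of that construction (the axis and test vector here are arbitrary choices, not taken from the repo):

```
# check of the Rodrigues construction used in rotate_part: rotating the
# x unit vector 90 degrees about the z axis should yield the y unit vector
import numpy as np

axis = np.array([0.0, 0.0, 1.0])  # unit-length rotation axis (arbitrary)
angle = np.pi / 2

# skew-symmetric cross-product matrix K, built exactly as in rotate_part
K = np.array(
    [
        [0, -axis[2], axis[1]],
        [axis[2], 0, -axis[0]],
        [-axis[1], axis[0], 0],
    ]
)
R = np.eye(3) + np.sin(angle) * K + (1 - np.cos(angle)) * np.dot(K, K)

print(R @ np.array([1.0, 0.0, 0.0]))  # ~[0.0, 1.0, 0.0]
```

Note that the companion helper `get_rotation_matrix_from_vectors` divides by `s**2`, so it assumes its two input vectors are not parallel.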