Spaces: yansong1616
Commit 56cd6b7 • 1 Parent(s): 633d2c0
Upload 90 files
This view is limited to 50 files because it contains too many changes. See raw diff.
- dust3r/__init__.py +2 -0
- dust3r/__pycache__/__init__.cpython-310.pyc +0 -0
- dust3r/__pycache__/__init__.cpython-38.pyc +0 -0
- dust3r/__pycache__/__init__.cpython-39.pyc +0 -0
- dust3r/__pycache__/image_pairs.cpython-310.pyc +0 -0
- dust3r/__pycache__/image_pairs.cpython-38.pyc +0 -0
- dust3r/__pycache__/inference.cpython-310.pyc +0 -0
- dust3r/__pycache__/inference.cpython-38.pyc +0 -0
- dust3r/__pycache__/inference.cpython-39.pyc +0 -0
- dust3r/__pycache__/model.cpython-310.pyc +0 -0
- dust3r/__pycache__/model.cpython-38.pyc +0 -0
- dust3r/__pycache__/model.cpython-39.pyc +0 -0
- dust3r/__pycache__/optim_factory.cpython-310.pyc +0 -0
- dust3r/__pycache__/optim_factory.cpython-38.pyc +0 -0
- dust3r/__pycache__/patch_embed.cpython-310.pyc +0 -0
- dust3r/__pycache__/patch_embed.cpython-38.pyc +0 -0
- dust3r/__pycache__/post_process.cpython-310.pyc +0 -0
- dust3r/__pycache__/render_to_3d.cpython-310.pyc +0 -0
- dust3r/__pycache__/viz.cpython-310.pyc +0 -0
- dust3r/__pycache__/viz.cpython-38.pyc +0 -0
- dust3r/cloud_opt/__init__.py +29 -0
- dust3r/cloud_opt/__pycache__/__init__.cpython-310.pyc +0 -0
- dust3r/cloud_opt/__pycache__/__init__.cpython-38.pyc +0 -0
- dust3r/cloud_opt/__pycache__/base_opt.cpython-310.pyc +0 -0
- dust3r/cloud_opt/__pycache__/base_opt.cpython-38.pyc +0 -0
- dust3r/cloud_opt/__pycache__/commons.cpython-310.pyc +0 -0
- dust3r/cloud_opt/__pycache__/commons.cpython-38.pyc +0 -0
- dust3r/cloud_opt/__pycache__/init_im_poses.cpython-310.pyc +0 -0
- dust3r/cloud_opt/__pycache__/init_im_poses.cpython-38.pyc +0 -0
- dust3r/cloud_opt/__pycache__/optimizer.cpython-310.pyc +0 -0
- dust3r/cloud_opt/__pycache__/optimizer.cpython-38.pyc +0 -0
- dust3r/cloud_opt/__pycache__/pair_viewer.cpython-310.pyc +0 -0
- dust3r/cloud_opt/base_opt.py +380 -0
- dust3r/cloud_opt/commons.py +91 -0
- dust3r/cloud_opt/init_im_poses.py +316 -0
- dust3r/cloud_opt/optimizer.py +249 -0
- dust3r/cloud_opt/pair_viewer.py +125 -0
- dust3r/datasets/__init__.py +42 -0
- dust3r/datasets/base/__init__.py +2 -0
- dust3r/datasets/base/base_stereo_view_dataset.py +220 -0
- dust3r/datasets/base/batched_sampler.py +74 -0
- dust3r/datasets/base/easy_dataset.py +157 -0
- dust3r/datasets/co3d.py +146 -0
- dust3r/datasets/utils/__init__.py +2 -0
- dust3r/datasets/utils/cropping.py +119 -0
- dust3r/datasets/utils/transforms.py +11 -0
- dust3r/heads/__init__.py +19 -0
- dust3r/heads/__pycache__/__init__.cpython-310.pyc +0 -0
- dust3r/heads/__pycache__/__init__.cpython-38.pyc +0 -0
- dust3r/heads/__pycache__/__init__.cpython-39.pyc +0 -0
dust3r/__init__.py
ADDED
@@ -0,0 +1,2 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
dust3r/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (145 Bytes)

dust3r/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (143 Bytes)

dust3r/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (143 Bytes)

dust3r/__pycache__/image_pairs.cpython-310.pyc
ADDED
Binary file (3.19 kB)

dust3r/__pycache__/image_pairs.cpython-38.pyc
ADDED
Binary file (3.25 kB)

dust3r/__pycache__/inference.cpython-310.pyc
ADDED
Binary file (5.2 kB)

dust3r/__pycache__/inference.cpython-38.pyc
ADDED
Binary file (5.21 kB)

dust3r/__pycache__/inference.cpython-39.pyc
ADDED
Binary file (5.2 kB)

dust3r/__pycache__/model.cpython-310.pyc
ADDED
Binary file (5.99 kB)

dust3r/__pycache__/model.cpython-38.pyc
ADDED
Binary file (5.96 kB)

dust3r/__pycache__/model.cpython-39.pyc
ADDED
Binary file (5.97 kB)

dust3r/__pycache__/optim_factory.cpython-310.pyc
ADDED
Binary file (371 Bytes)

dust3r/__pycache__/optim_factory.cpython-38.pyc
ADDED
Binary file (367 Bytes)

dust3r/__pycache__/patch_embed.cpython-310.pyc
ADDED
Binary file (2.74 kB)

dust3r/__pycache__/patch_embed.cpython-38.pyc
ADDED
Binary file (2.76 kB)

dust3r/__pycache__/post_process.cpython-310.pyc
ADDED
Binary file (1.65 kB)

dust3r/__pycache__/render_to_3d.cpython-310.pyc
ADDED
Binary file (2.91 kB)

dust3r/__pycache__/viz.cpython-310.pyc
ADDED
Binary file (10.6 kB)

dust3r/__pycache__/viz.cpython-38.pyc
ADDED
Binary file (10.6 kB)
dust3r/cloud_opt/__init__.py
ADDED
@@ -0,0 +1,29 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# global alignment optimization wrapper function
+# --------------------------------------------------------
+from enum import Enum
+
+from .optimizer import PointCloudOptimizer
+from .pair_viewer import PairViewer
+
+
+class GlobalAlignerMode(Enum):
+    PointCloudOptimizer = "PointCloudOptimizer"
+    PairViewer = "PairViewer"
+
+
+def global_aligner(dust3r_output, device, mode=GlobalAlignerMode.PointCloudOptimizer, **optim_kw):
+    # extract all inputs
+    view1, view2, pred1, pred2 = [dust3r_output[k] for k in 'view1 view2 pred1 pred2'.split()]
+    # build the optimizer
+    if mode == GlobalAlignerMode.PointCloudOptimizer:
+        net = PointCloudOptimizer(view1, view2, pred1, pred2, **optim_kw).to(device)
+    elif mode == GlobalAlignerMode.PairViewer:
+        net = PairViewer(view1, view2, pred1, pred2, **optim_kw).to(device)
+    else:
+        raise NotImplementedError(f'Unknown mode {mode}')
+
+    return net
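
For context, a minimal usage sketch of the `global_aligner` wrapper above, modeled on DUSt3R's documented pipeline. `load_images`, `make_pairs`, and `inference` come from other files in this upload; the model loading and image paths are placeholder assumptions:

```python
from dust3r.image_pairs import make_pairs
from dust3r.inference import inference
from dust3r.utils.image import load_images
from dust3r.cloud_opt import global_aligner, GlobalAlignerMode

# `model` is a pretrained DUSt3R network (loading elided); paths are placeholders
images = load_images(['img1.jpg', 'img2.jpg', 'img3.jpg'], size=512)
pairs = make_pairs(images, scene_graph='complete', symmetrize=True)
output = inference(pairs, model, device='cuda', batch_size=1)

scene = global_aligner(output, device='cuda', mode=GlobalAlignerMode.PointCloudOptimizer)
scene.compute_global_alignment(init='mst', niter=300, schedule='cosine', lr=0.01)
pts3d = scene.get_pts3d()     # per-image world-frame pointmaps
poses = scene.get_im_poses()  # cam-to-world camera poses
```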
dust3r/cloud_opt/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (1.04 kB)

dust3r/cloud_opt/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (1.03 kB)

dust3r/cloud_opt/__pycache__/base_opt.cpython-310.pyc
ADDED
Binary file (15.6 kB)

dust3r/cloud_opt/__pycache__/base_opt.cpython-38.pyc
ADDED
Binary file (15.8 kB)

dust3r/cloud_opt/__pycache__/commons.cpython-310.pyc
ADDED
Binary file (3.36 kB)

dust3r/cloud_opt/__pycache__/commons.cpython-38.pyc
ADDED
Binary file (3.41 kB)

dust3r/cloud_opt/__pycache__/init_im_poses.cpython-310.pyc
ADDED
Binary file (8.42 kB)

dust3r/cloud_opt/__pycache__/init_im_poses.cpython-38.pyc
ADDED
Binary file (8.45 kB)

dust3r/cloud_opt/__pycache__/optimizer.cpython-310.pyc
ADDED
Binary file (11.2 kB)

dust3r/cloud_opt/__pycache__/optimizer.cpython-38.pyc
ADDED
Binary file (11.4 kB)

dust3r/cloud_opt/__pycache__/pair_viewer.cpython-310.pyc
ADDED
Binary file (4.89 kB)
dust3r/cloud_opt/base_opt.py
ADDED
@@ -0,0 +1,380 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Base class for the global alignment procedure
+# --------------------------------------------------------
+from copy import deepcopy
+
+import numpy as np
+import torch
+import torch.nn as nn
+import roma
+import tqdm
+
+from dust3r.utils.geometry import inv, geotrf
+from dust3r.utils.device import to_numpy
+from dust3r.utils.image import rgb
+from dust3r.viz import SceneViz, segment_sky, auto_cam_size
+from dust3r.optim_factory import adjust_learning_rate_by_lr
+
+from dust3r.cloud_opt.commons import (edge_str, ALL_DISTS, NoGradParamDict, get_imshapes, signed_expm1, signed_log1p,
+                                      cosine_schedule, linear_schedule, get_conf_trf)
+import dust3r.cloud_opt.init_im_poses as init_fun
+
+
+class BasePCOptimizer (nn.Module):
+    """ Optimize a global scene, given a list of pairwise observations.
+    Graph node: images
+    Graph edges: observations = (pred1, pred2)
+    """
+
+    def __init__(self, *args, **kwargs):
+        if len(args) == 1 and len(kwargs) == 0:
+            other = deepcopy(args[0])
+            attrs = '''edges is_symmetrized dist n_imgs pred_i pred_j imshapes
+                    min_conf_thr conf_thr conf_i conf_j im_conf
+                    base_scale norm_pw_scale POSE_DIM pw_poses
+                    pw_adaptors has_im_poses rand_pose imgs'''.split()
+            self.__dict__.update({k: other[k] for k in attrs})
+        else:
+            self._init_from_views(*args, **kwargs)
+
+    def _init_from_views(self, view1, view2, pred1, pred2,
+                         dist='l1',
+                         conf='log',
+                         min_conf_thr=3,
+                         base_scale=0.5,
+                         allow_pw_adaptors=False,
+                         pw_break=20,
+                         rand_pose=torch.randn,
+                         iterationsCount=None,
+                         ):
+        super().__init__()
+        if not isinstance(view1['idx'], list):
+            view1['idx'] = view1['idx'].tolist()
+        if not isinstance(view2['idx'], list):
+            view2['idx'] = view2['idx'].tolist()
+        self.edges = [(int(i), int(j)) for i, j in zip(view1['idx'], view2['idx'])]
+        self.is_symmetrized = set(self.edges) == {(j, i) for i, j in self.edges}
+        self.dist = ALL_DISTS[dist]
+
+        self.n_imgs = self._check_edges()
+
+        # input data
+        pred1_pts = pred1['pts3d']
+        pred2_pts = pred2['pts3d_in_other_view']
+        self.pred_i = NoGradParamDict({ij: pred1_pts[n] for n, ij in enumerate(self.str_edges)})
+        self.pred_j = NoGradParamDict({ij: pred2_pts[n] for n, ij in enumerate(self.str_edges)})
+        self.imshapes = get_imshapes(self.edges, pred1_pts, pred2_pts)
+
+        # work in log-scale with conf
+        pred1_conf = pred1['conf']
+        pred2_conf = pred2['conf']
+        self.min_conf_thr = min_conf_thr
+        self.conf_trf = get_conf_trf(conf)
+
+        self.conf_i = NoGradParamDict({ij: pred1_conf[n] for n, ij in enumerate(self.str_edges)})
+        self.conf_j = NoGradParamDict({ij: pred2_conf[n] for n, ij in enumerate(self.str_edges)})
+        self.im_conf = self._compute_img_conf(pred1_conf, pred2_conf)
+
+        # pairwise pose parameters
+        self.base_scale = base_scale
+        self.norm_pw_scale = True
+        self.pw_break = pw_break
+        self.POSE_DIM = 7
+        self.pw_poses = nn.Parameter(rand_pose((self.n_edges, 1+self.POSE_DIM)))  # pairwise poses
+        self.pw_adaptors = nn.Parameter(torch.zeros((self.n_edges, 2)))  # slight xy/z adaptation
+        self.pw_adaptors.requires_grad_(allow_pw_adaptors)
+        self.has_im_poses = False
+        self.rand_pose = rand_pose
+
+        # possibly store images for show_pointcloud
+        self.imgs = None
+        if 'img' in view1 and 'img' in view2:
+            imgs = [torch.zeros((3,)+hw) for hw in self.imshapes]
+            for v in range(len(self.edges)):
+                idx = view1['idx'][v]
+                imgs[idx] = view1['img'][v]
+                idx = view2['idx'][v]
+                imgs[idx] = view2['img'][v]
+            self.imgs = rgb(imgs)
+
+    @property
+    def n_edges(self):
+        return len(self.edges)
+
+    @property
+    def str_edges(self):
+        return [edge_str(i, j) for i, j in self.edges]
+
+    @property
+    def imsizes(self):
+        return [(w, h) for h, w in self.imshapes]
+
+    @property
+    def device(self):
+        return next(iter(self.parameters())).device
+
+    def state_dict(self, trainable=True):
+        all_params = super().state_dict()
+        return {k: v for k, v in all_params.items() if k.startswith(('_', 'pred_i.', 'pred_j.', 'conf_i.', 'conf_j.')) != trainable}
+
+    def load_state_dict(self, data):
+        return super().load_state_dict(self.state_dict(trainable=False) | data)
+
+    def _check_edges(self):
+        indices = sorted({i for edge in self.edges for i in edge})
+        assert indices == list(range(len(indices))), 'bad pair indices: missing values '
+        return len(indices)
+
+    @torch.no_grad()
+    def _compute_img_conf(self, pred1_conf, pred2_conf):
+        im_conf = nn.ParameterList([torch.zeros(hw, device=self.device) for hw in self.imshapes])
+        for e, (i, j) in enumerate(self.edges):
+            im_conf[i] = torch.maximum(im_conf[i], pred1_conf[e])
+            im_conf[j] = torch.maximum(im_conf[j], pred2_conf[e])
+        return im_conf
+
+    def get_adaptors(self):  # the per-edge scale sigma_e in Eq. (5)
+        adapt = self.pw_adaptors
+        adapt = torch.cat((adapt[:, 0:1], adapt), dim=-1)  # (scale_xy, scale_xy, scale_z)
+        if self.norm_pw_scale:  # normalize so that the product == 1
+            adapt = adapt - adapt.mean(dim=1, keepdim=True)
+        return (adapt / self.pw_break).exp()  # TODO gys: what exactly is sigma_e in Eq. (5)?
+
+    def _get_poses(self, poses):  # works on self.im_poses or self.pw_poses
+        # normalize rotation
+        Q = poses[:, :4]
+        T = signed_expm1(poses[:, 4:7])
+        RT = roma.RigidUnitQuat(Q, T).normalize().to_homogeneous()
+        return RT
+
+    def _set_pose(self, poses, idx, R, T=None, scale=None, force=False):
+        # all poses == cam-to-world
+        pose = poses[idx]
+        if not (pose.requires_grad or force):
+            return pose
+
+        if R.shape == (4, 4):
+            assert T is None
+            T = R[:3, 3]
+            R = R[:3, :3]
+
+        if R is not None:
+            pose.data[0:4] = roma.rotmat_to_unitquat(R)
+        if T is not None:
+            pose.data[4:7] = signed_log1p(T / (scale or 1))  # translation is function of scale
+
+        if scale is not None:
+            assert poses.shape[-1] in (8, 13)
+            pose.data[-1] = np.log(float(scale))
+        return pose
+
+    def get_pw_norm_scale_factor(self):
+        if self.norm_pw_scale:
+            # normalize scales so that things cannot go south
+            # we want that exp(scale) ~= self.base_scale
+            return (np.log(self.base_scale) - self.pw_poses[:, -1].mean()).exp()
+        else:
+            return 1  # don't norm scale for known poses
+
+    def get_pw_scale(self):
+        scale = self.pw_poses[:, -1].exp()  # (n_edges,)
+        scale = scale * self.get_pw_norm_scale_factor()
+        return scale
+
+    def get_pw_poses(self):  # cam to world
+        RT = self._get_poses(self.pw_poses)
+        scaled_RT = RT.clone()
+        scaled_RT[:, :3] *= self.get_pw_scale().view(-1, 1, 1)  # scale the rotation AND translation
+        return scaled_RT
+
+    def get_masks(self):
+        return [(conf > self.min_conf_thr) for conf in self.im_conf]
+
+    def depth_to_pts3d(self):
+        raise NotImplementedError()
+
+    def get_pts3d(self, raw=False):
+        res = self.depth_to_pts3d()
+        if not raw:
+            res = [dm[:h*w].view(h, w, 3) for dm, (h, w) in zip(res, self.imshapes)]
+        return res
+
+    def _set_focal(self, idx, focal, force=False):
+        raise NotImplementedError()
+
+    def get_focals(self):
+        raise NotImplementedError()
+
+    def get_known_focal_mask(self):
+        raise NotImplementedError()
+
+    def get_principal_points(self):
+        raise NotImplementedError()
+
+    def get_conf(self, mode=None):
+        trf = self.conf_trf if mode is None else get_conf_trf(mode)
+        return [trf(c) for c in self.im_conf]
+
+    def get_im_poses(self):
+        raise NotImplementedError()
+
+    def _set_depthmap(self, idx, depth, force=False):
+        raise NotImplementedError()
+
+    def get_depthmaps(self, raw=False):
+        raise NotImplementedError()
+
+    @torch.no_grad()
+    def clean_pointcloud(self, tol=0.001, max_bad_conf=0):
+        """ Method:
+        1) express all 3d points in each camera coordinate frame
+        2) if they're in front of a depthmap --> then lower their confidence
+        """
+        assert 0 <= tol < 1
+        cams = inv(self.get_im_poses())
+        K = self.get_intrinsics()
+        depthmaps = self.get_depthmaps()
+        res = deepcopy(self)
+
+        for i, pts3d in enumerate(self.depth_to_pts3d()):
+            for j in range(self.n_imgs):
+                if i == j:
+                    continue
+
+                # project 3dpts in other view
+                Hi, Wi = self.imshapes[i]
+                Hj, Wj = self.imshapes[j]
+                proj = geotrf(cams[j], pts3d[:Hi*Wi]).reshape(Hi, Wi, 3)
+                proj_depth = proj[:, :, 2]
+                u, v = geotrf(K[j], proj, norm=1, ncol=2).round().long().unbind(-1)
+
+                # check which points are actually in the visible cone
+                msk_i = (proj_depth > 0) & (0 <= u) & (u < Wj) & (0 <= v) & (v < Hj)
+                msk_j = v[msk_i], u[msk_i]
+
+                # find bad points = those in front but less confident
+                bad_points = (proj_depth[msk_i] < (1-tol) * depthmaps[j][msk_j]
+                              ) & (res.im_conf[i][msk_i] < res.im_conf[j][msk_j])
+
+                bad_msk_i = msk_i.clone()
+                bad_msk_i[msk_i] = bad_points
+                res.im_conf[i][bad_msk_i] = res.im_conf[i][bad_msk_i].clip_(max=max_bad_conf)
+
+        return res
+
+    def forward(self, ret_details=False):
+        pw_poses = self.get_pw_poses()  # cam-to-world
+        pw_adapt = self.get_adaptors()
+        proj_pts3d = self.get_pts3d()
+        # pre-compute pixel weights
+        weight_i = {i_j: self.conf_trf(c) for i_j, c in self.conf_i.items()}
+        weight_j = {i_j: self.conf_trf(c) for i_j, c in self.conf_j.items()}
+
+        loss = 0
+        if ret_details:
+            details = -torch.ones((self.n_imgs, self.n_imgs))
+
+        for e, (i, j) in enumerate(self.edges):
+            i_j = edge_str(i, j)
+            # distance in image i and j
+            aligned_pred_i = geotrf(pw_poses[e], pw_adapt[e] * self.pred_i[i_j])
+            aligned_pred_j = geotrf(pw_poses[e], pw_adapt[e] * self.pred_j[i_j])
+            li = self.dist(proj_pts3d[i], aligned_pred_i, weight=weight_i[i_j]).mean()
+            lj = self.dist(proj_pts3d[j], aligned_pred_j, weight=weight_j[i_j]).mean()
+            loss = loss + li + lj
+
+            if ret_details:
+                details[i, j] = li + lj
+        loss /= self.n_edges  # average over all pairs
+
+        if ret_details:
+            return loss, details
+        return loss
+
+    def compute_global_alignment(self, init=None, niter_PnP=10, **kw):
+        if init is None:
+            pass
+        elif init == 'msp' or init == 'mst':
+            # Sec. 3.3 Downstream Applications: initialize the intrinsics, the extrinsics
+            # and the world-frame points that Eq. (5) of Sec. 3.4 Global Alignment estimates
+            init_fun.init_minimum_spanning_tree(self, niter_PnP=niter_PnP)
+        elif init == 'known_poses':
+            init_fun.init_from_known_poses(self, min_conf_thr=self.min_conf_thr, niter_PnP=niter_PnP)
+        else:
+            raise ValueError(f'bad value for {init=}')
+
+        global_alignment_loop(self, **kw)  # Sec. 3.4 Global Alignment: gradient descent on Eq. (5)
+
+    @torch.no_grad()
+    def mask_sky(self):
+        res = deepcopy(self)
+        for i in range(self.n_imgs):
+            sky = segment_sky(self.imgs[i])
+            res.im_conf[i][sky] = 0
+        return res
+
+    def show(self, show_pw_cams=False, show_pw_pts3d=False, cam_size=None, **kw):
+        viz = SceneViz()
+        if self.imgs is None:
+            colors = np.random.randint(0, 256, size=(self.n_imgs, 3))
+            colors = list(map(tuple, colors.tolist()))
+            for n in range(self.n_imgs):
+                viz.add_pointcloud(self.get_pts3d()[n], colors[n], self.get_masks()[n])
+        else:
+            viz.add_pointcloud(self.get_pts3d(), self.imgs, self.get_masks())
+            colors = np.random.randint(256, size=(self.n_imgs, 3))
+
+        # camera poses
+        im_poses = to_numpy(self.get_im_poses())
+        if cam_size is None:
+            cam_size = auto_cam_size(im_poses)
+        viz.add_cameras(im_poses, self.get_focals(), colors=colors,
+                        images=self.imgs, imsizes=self.imsizes, cam_size=cam_size)
+        if show_pw_cams:
+            pw_poses = self.get_pw_poses()
+            viz.add_cameras(pw_poses, color=(192, 0, 192), cam_size=cam_size)
+
+            if show_pw_pts3d:
+                pts = [geotrf(pw_poses[e], self.pred_i[edge_str(i, j)]) for e, (i, j) in enumerate(self.edges)]
+                viz.add_pointcloud(pts, (128, 0, 128))
+
+        viz.show(**kw)
+        return viz
+
+
+def global_alignment_loop(net, lr=0.01, niter=300, schedule='cosine', lr_min=1e-6, verbose=False):
+    params = [p for p in net.parameters() if p.requires_grad]
+    if not params:
+        return net
+
+    if verbose:
+        print([name for name, value in net.named_parameters() if value.requires_grad])
+
+    lr_base = lr
+    optimizer = torch.optim.Adam(params, lr=lr, betas=(0.9, 0.9))
+
+    with tqdm.tqdm(total=niter) as bar:
+        while bar.n < bar.total:
+            t = bar.n / bar.total
+
+            if schedule == 'cosine':
+                lr = cosine_schedule(t, lr_base, lr_min)
+            elif schedule == 'linear':
+                lr = linear_schedule(t, lr_base, lr_min)
+            else:
+                raise ValueError(f'bad lr {schedule=}')
+            adjust_learning_rate_by_lr(optimizer, lr)
+
+            optimizer.zero_grad()
+            loss = net()  # 'Global optimization' in the paper
+            loss.backward()
+            optimizer.step()
+            loss = float(loss)
+            bar.set_postfix_str(f'{lr=:g} loss={loss:g}')
+            if bar.n % 30 == 0:
+                print(' ')
+            bar.update()
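
A small self-contained round-trip check of the pose parameterization used above (`_set_pose` stores a unit quaternion plus a signed-log1p translation; `_get_poses` decodes it with roma). This is a sketch, not part of the commit; `signed_log1p`/`signed_expm1` are inlined from commons.py below:

```python
import torch
import roma

def signed_log1p(x):
    return torch.sign(x) * torch.log1p(torch.abs(x))

def signed_expm1(x):
    return torch.sign(x) * torch.expm1(torch.abs(x))

R = roma.random_rotmat()   # ground-truth rotation
T = torch.randn(3)         # ground-truth translation

# encode as in _set_pose (scale=1): quaternion in slots 0:4, signed-log translation in 4:7
pose = torch.empty(8)
pose[0:4] = roma.rotmat_to_unitquat(R)
pose[4:7] = signed_log1p(T)
pose[7] = 0.0              # log-scale slot, unused at scale=1

# decode as in _get_poses
RT = roma.RigidUnitQuat(pose[None, :4], signed_expm1(pose[None, 4:7])).normalize().to_homogeneous()[0]

assert torch.allclose(RT[:3, :3], R, atol=1e-5)
assert torch.allclose(RT[:3, 3], T, atol=1e-5)
```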
dust3r/cloud_opt/commons.py
ADDED
@@ -0,0 +1,91 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# utility functions for global alignment
+# --------------------------------------------------------
+import torch
+import torch.nn as nn
+import numpy as np
+
+
+def edge_str(i, j):
+    return f'{i}_{j}'
+
+
+def i_j_ij(ij):
+    return edge_str(*ij), ij
+
+
+def edge_conf(conf_i, conf_j, edge):
+    # dust3r outputs one confidence map per image of the edge; average each map
+    # over its pixels and multiply the two means to score the edge
+    return float(conf_i[edge].mean() * conf_j[edge].mean())
+
+
+def compute_edge_scores(edges, conf_i, conf_j):
+    # score every edge by the product of the mean per-pixel confidences of its two images
+    return {(i, j): edge_conf(conf_i, conf_j, e) for e, (i, j) in edges}
+
+
+def NoGradParamDict(x):
+    assert isinstance(x, dict)
+    return nn.ParameterDict(x).requires_grad_(False)
+
+
+def get_imshapes(edges, pred_i, pred_j):
+    n_imgs = max(max(e) for e in edges) + 1
+    imshapes = [None] * n_imgs
+    for e, (i, j) in enumerate(edges):
+        shape_i = tuple(pred_i[e].shape[0:2])
+        shape_j = tuple(pred_j[e].shape[0:2])
+        if imshapes[i]:
+            assert imshapes[i] == shape_i, f'incorrect shape for image {i}'
+        if imshapes[j]:
+            assert imshapes[j] == shape_j, f'incorrect shape for image {j}'
+        imshapes[i] = shape_i
+        imshapes[j] = shape_j
+    return imshapes
+
+
+def get_conf_trf(mode):
+    if mode == 'log':
+        def conf_trf(x): return x.log()
+    elif mode == 'sqrt':
+        def conf_trf(x): return x.sqrt()
+    elif mode == 'm1':
+        def conf_trf(x): return x-1
+    elif mode in ('id', 'none'):
+        def conf_trf(x): return x
+    else:
+        raise ValueError(f'bad mode for {mode=}')
+    return conf_trf
+
+
+def l2_dist(a, b, weight):
+    return ((a - b).square().sum(dim=-1) * weight)
+
+
+def l1_dist(a, b, weight):
+    return ((a - b).norm(dim=-1) * weight)  # torch.norm() computes a vector norm (L2 by default)
+
+
+ALL_DISTS = dict(l1=l1_dist, l2=l2_dist)
+
+
+def signed_log1p(x):
+    sign = torch.sign(x)
+    return sign * torch.log1p(torch.abs(x))
+
+
+def signed_expm1(x):
+    sign = torch.sign(x)
+    return sign * torch.expm1(torch.abs(x))
+
+
+def cosine_schedule(t, lr_start, lr_end):
+    assert 0 <= t <= 1
+    return lr_end + (lr_start - lr_end) * (1+np.cos(t * np.pi))/2
+
+
+def linear_schedule(t, lr_start, lr_end):
+    assert 0 <= t <= 1
+    return lr_start + (lr_end - lr_start) * t
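
To see how the two learning-rate schedules above behave across the optimization (t runs from 0 to 1), a quick standalone check with the defaults of `global_alignment_loop` (lr=0.01 down to lr_min=1e-6); the functions are copied verbatim from this file:

```python
import numpy as np

def cosine_schedule(t, lr_start, lr_end):
    return lr_end + (lr_start - lr_end) * (1 + np.cos(t * np.pi)) / 2

def linear_schedule(t, lr_start, lr_end):
    return lr_start + (lr_end - lr_start) * t

for t in (0.0, 0.25, 0.5, 0.75, 1.0):
    print(f't={t:.2f}  cosine={cosine_schedule(t, 0.01, 1e-6):.2e}'
          f'  linear={linear_schedule(t, 0.01, 1e-6):.2e}')
```

The cosine variant holds the rate near lr_start early on and decays it smoothly to lr_min by the end of the run.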
dust3r/cloud_opt/init_im_poses.py
ADDED
@@ -0,0 +1,316 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Initialization functions for global alignment
+# --------------------------------------------------------
+from functools import cache
+
+import numpy as np
+import scipy.sparse as sp
+import torch
+import cv2
+import roma
+from tqdm import tqdm
+
+from dust3r.utils.geometry import geotrf, inv, get_med_dist_between_poses
+from dust3r.post_process import estimate_focal_knowing_depth
+from dust3r.viz import to_numpy
+
+from dust3r.cloud_opt.commons import edge_str, i_j_ij, compute_edge_scores
+
+
+@torch.no_grad()
+def init_from_known_poses(self, niter_PnP=10, min_conf_thr=3):
+    device = self.device
+
+    # indices of known poses
+    nkp, known_poses_msk, known_poses = get_known_poses(self)
+    assert nkp == self.n_imgs, 'not all poses are known'
+
+    # get all focals
+    nkf, _, im_focals = get_known_focals(self)
+    assert nkf == self.n_imgs
+    im_pp = self.get_principal_points()
+
+    best_depthmaps = {}
+    # init all pairwise poses
+    for e, (i, j) in enumerate(tqdm(self.edges)):
+        i_j = edge_str(i, j)
+
+        # find relative pose for this pair
+        P1 = torch.eye(4, device=device)
+        msk = self.conf_i[i_j] > min(min_conf_thr, self.conf_i[i_j].min() - 0.1)
+        _, P2 = fast_pnp(self.pred_j[i_j], float(im_focals[i].mean()),
+                         pp=im_pp[i], msk=msk, device=device, niter_PnP=niter_PnP)
+
+        # align the two predicted camera with the two gt cameras
+        s, R, T = align_multiple_poses(torch.stack((P1, P2)), known_poses[[i, j]])
+        # normally we have known_poses[i] ~= sRT_to_4x4(s,R,T,device) @ P1
+        # and geotrf(sRT_to_4x4(1,R,T,device), s*P2[:3,3])
+        self._set_pose(self.pw_poses, e, R, T, scale=s)
+
+        # remember if this is a good depthmap
+        score = float(self.conf_i[i_j].mean())
+        if score > best_depthmaps.get(i, (0,))[0]:
+            best_depthmaps[i] = score, i_j, s
+
+    # init all image poses
+    for n in range(self.n_imgs):
+        assert known_poses_msk[n]
+        _, i_j, scale = best_depthmaps[n]
+        depth = self.pred_i[i_j][:, :, 2]
+        self._set_depthmap(n, depth * scale)
+
+
+@torch.no_grad()
+def init_minimum_spanning_tree(self, **kw):
+    """ Init all camera poses (image-wise and pairwise poses) given
+        an initial set of pairwise estimations.
+    """
+    device = self.device
+    pts3d, _, im_focals, im_poses = minimum_spanning_tree(self.imshapes, self.edges,
+                                                          self.pred_i, self.pred_j, self.conf_i, self.conf_j, self.im_conf, self.min_conf_thr,
+                                                          device, has_im_poses=self.has_im_poses, **kw)
+
+    return init_from_pts3d(self, pts3d, im_focals, im_poses)  # initialization
+
+
+def init_from_pts3d(self, pts3d, im_focals, im_poses):
+    # init poses
+    nkp, known_poses_msk, known_poses = get_known_poses(self)
+    if nkp == 1:
+        raise NotImplementedError("Would be simpler to just align everything afterwards on the single known pose")
+    elif nkp > 1:
+        # global rigid SE3 alignment
+        s, R, T = align_multiple_poses(im_poses[known_poses_msk], known_poses[known_poses_msk])
+        trf = sRT_to_4x4(s, R, T, device=known_poses.device)
+
+        # rotate everything
+        im_poses = trf @ im_poses
+        im_poses[:, :3, :3] /= s  # undo scaling on the rotation part
+        for img_pts3d in pts3d:
+            img_pts3d[:] = geotrf(trf, img_pts3d)
+
+    # pw_poses: for every edge, compute P_e, the transform from the edge's camera frame
+    # (i.e. the frame of the first image fed to dust3r) into the 'world' frame
+    for e, (i, j) in enumerate(self.edges):
+        i_j = edge_str(i, j)
+        # compute transform that goes from cam to world
+        # pred_i: the 3D pointmap dust3r predicts for the first image of the edge
+        s, R, T = rigid_points_registration(self.pred_i[i_j], pts3d[i], conf=self.conf_i[i_j])  # per-edge cam-to-world transform
+        self._set_pose(self.pw_poses, e, R, T, scale=s)  # pw_poses *****************
+
+    # TODO gys: what is s_factor?  take into account the scale normalization
+    s_factor = self.get_pw_norm_scale_factor()
+    im_poses[:, :3, 3] *= s_factor  # apply downscaling factor
+    for img_pts3d in pts3d:
+        img_pts3d *= s_factor
+
+    # init all image poses
+    if self.has_im_poses:
+        for i in range(self.n_imgs):
+            cam2world = im_poses[i]
+            depth = geotrf(inv(cam2world), pts3d[i])[..., 2]  # bring the world-frame points pts3d[i] back into the camera frame
+            self._set_depthmap(i, depth)
+            self._set_pose(self.im_poses, i, cam2world)  # im_poses ********************
+            if im_focals[i] is not None:
+                self._set_focal(i, im_focals[i])
+
+    print(' init loss =', float(self()))
+
+
+def minimum_spanning_tree(imshapes, edges, pred_i, pred_j, conf_i, conf_j, im_conf, min_conf_thr,
+                          device, has_im_poses=True, niter_PnP=10):
+    n_imgs = len(imshapes)
+    sparse_graph = -dict_to_sparse_graph(compute_edge_scores(map(i_j_ij, edges), conf_i, conf_j))  # matrix of pairwise edge confidences
+    msp = sp.csgraph.minimum_spanning_tree(sparse_graph).tocoo()  # sparse_graph is negated, so the MST actually keeps the highest confidences
+    # the MST picks, for each image, edges with as high a confidence as possible (every image pair has an edge)
+    # temp variable to store 3d points
+    pts3d = [None] * len(imshapes)  # one slot per input image
+
+    todo = sorted(zip(-msp.data, msp.row, msp.col))  # the n_imgs-1 tree edges with the highest mean confidence; a spanning tree covers all input images  # sorted edges
+    im_poses = [None] * n_imgs
+    im_focals = [None] * n_imgs
+
+    # init with strongest edge
+    score, i, j = todo.pop()  # score is the confidence computed by compute_edge_scores
+    print(f' init edge ({i}*,{j}*) {score=}')
+    i_j = edge_str(i, j)
+    pts3d[i] = pred_i[i_j].clone()  # pointmaps of the two images of the strongest edge (dust3r outputs two pointmaps per image pair)
+    pts3d[j] = pred_j[i_j].clone()
+    done = {i, j}
+    if has_im_poses:  # ===== the camera frame of the first image of the strongest edge becomes the world frame =====
+        im_poses[i] = torch.eye(4, device=device)  # identity pose: this camera frame IS the world frame
+        im_focals[i] = estimate_focal(pred_i[i_j])  # Sec. 3.3: estimate the intrinsics (focal)
+
+    # set initial pointcloud based on pairwise graph
+    msp_edges = [(i, j)]
+    while todo:
+        # each time, predict the next one
+        score, i, j = todo.pop()  # pop removes the last (highest-score) element
+
+        if im_focals[i] is None:  # focal of image i not estimated yet
+            im_focals[i] = estimate_focal(pred_i[i_j])
+
+        if i in done:
+            print(f' init edge ({i},{j}*) {score=}')
+            assert j not in done
+            # align pred[i] with pts3d[i], and then set j accordingly
+            i_j = edge_str(i, j)
+            s, R, T = rigid_points_registration(pred_i[i_j], pts3d[i], conf=conf_i[i_j])  # Sec. 3.3 pose estimation (s is the scale sigma), via the roma package
+            trf = sRT_to_4x4(s, R, T, device)  # pack into a 4x4 homogeneous matrix, last row [0,0,0,1]
+            pts3d[j] = geotrf(trf, pred_j[i_j])  # pred_j[i_j]: dust3r's pointmap of image j expressed in camera frame i
+            done.add(j)
+            msp_edges.append((i, j))
+
+            if has_im_poses and im_poses[i] is None:
+                im_poses[i] = sRT_to_4x4(1, R, T, device)
+
+        elif j in done:
+            print(f' init edge ({i}*,{j}) {score=}')
+            assert i not in done
+            i_j = edge_str(i, j)
+            s, R, T = rigid_points_registration(pred_j[i_j], pts3d[j], conf=conf_j[i_j])  # transform mapping pred_j[i_j] onto pts3d[j]
+            trf = sRT_to_4x4(s, R, T, device)
+            pts3d[i] = geotrf(trf, pred_i[i_j])  # apply it to bring the camera-frame points into the world frame
+            done.add(i)
+            msp_edges.append((i, j))
+
+            if has_im_poses and im_poses[i] is None:
+                im_poses[i] = sRT_to_4x4(1, R, T, device)
+        else:
+            # let's try again later
+            todo.insert(0, (score, i, j))
+
+    if has_im_poses:
+        # complete all missing informations
+        pair_scores = list(sparse_graph.values())  # already negative scores: less is best
+        edges_from_best_to_worse = np.array(list(sparse_graph.keys()))[np.argsort(pair_scores)]
+        for i, j in edges_from_best_to_worse.tolist():
+            if im_focals[i] is None:
+                im_focals[i] = estimate_focal(pred_i[edge_str(i, j)])
+
+        for i in range(n_imgs):
+            if im_poses[i] is None:
+                msk = im_conf[i] > min_conf_thr
+                res = fast_pnp(pts3d[i], im_focals[i], msk=msk, device=device, niter_PnP=niter_PnP)  # estimate the pose with PnP
+                if res:
+                    im_focals[i], im_poses[i] = res
+            if im_poses[i] is None:
+                im_poses[i] = torch.eye(4, device=device)
+        im_poses = torch.stack(im_poses)
+    else:
+        im_poses = im_focals = None
+
+    return pts3d, msp_edges, im_focals, im_poses  # pts3d: each image's camera-frame points lifted into the world frame by im_poses
+
+
+def dict_to_sparse_graph(dic):
+    n_imgs = max(max(e) for e in dic) + 1  # number of images
+    res = sp.dok_array((n_imgs, n_imgs))
+    for edge, value in dic.items():
+        res[edge] = value
+    return res  # per-edge confidences stored in an n_imgs x n_imgs sparse matrix
+
+
+def rigid_points_registration(pts1, pts2, conf):
+    R, T, s = roma.rigid_points_registration(  # roma utility function
+        pts1.reshape(-1, 3), pts2.reshape(-1, 3), weights=conf.ravel(), compute_scaling=True)
+    return s, R, T  # return un-scaled (R, T)
+
+
+def sRT_to_4x4(scale, R, T, device):
+    trf = torch.eye(4, device=device)  # identity
+    trf[:3, :3] = R * scale
+    trf[:3, 3] = T.ravel()  # doesn't need scaling
+    return trf  # 4x4 homogeneous transform
+
+
+def estimate_focal(pts3d_i, pp=None):
+    if pp is None:
+        H, W, THREE = pts3d_i.shape
+        assert THREE == 3
+        pp = torch.tensor((W/2, H/2), device=pts3d_i.device)
+    focal = estimate_focal_knowing_depth(pts3d_i.unsqueeze(0), pp.unsqueeze(
+        0), focal_mode='weiszfeld', min_focal=0.5, max_focal=3.5).ravel()
+    return float(focal)
+
+
+@cache
+def pixel_grid(H, W):
+    return np.mgrid[:W, :H].T.astype(np.float32)
+
+
+def fast_pnp(pts3d, focal, msk, device, pp=None, niter_PnP=10):
+    # extract camera poses and focals with RANSAC-PnP
+    if msk.sum() < 4:
+        return None  # we need at least 4 points for PnP
+    pts3d, msk = map(to_numpy, (pts3d, msk))
+
+    H, W, THREE = pts3d.shape
+    assert THREE == 3
+    pixels = pixel_grid(H, W)
+
+    if focal is None:
+        S = max(W, H)
+        tentative_focals = np.geomspace(S/2, S*3, 21)
+    else:
+        tentative_focals = [focal]
+
+    if pp is None:
+        pp = (W/2, H/2)
+    else:
+        pp = to_numpy(pp)
+
+    best = 0,
+    for focal in tentative_focals:
+        K = np.float32([(focal, 0, pp[0]), (0, focal, pp[1]), (0, 0, 1)])
+
+        success, R, T, inliers = cv2.solvePnPRansac(pts3d[msk], pixels[msk], K, None,
+                                                    iterationsCount=niter_PnP, reprojectionError=5, flags=cv2.SOLVEPNP_SQPNP)
+        if not success:
+            continue
+
+        score = len(inliers)
+        if success and score > best[0]:
+            best = score, R, T, focal
+
+    if not best[0]:
+        return None
+
+    _, R, T, best_focal = best
+    R = cv2.Rodrigues(R)[0]  # world to cam
+    R, T = map(torch.from_numpy, (R, T))
+    return best_focal, inv(sRT_to_4x4(1, R, T, device))  # cam to world
+
+
+def get_known_poses(self):
+    if self.has_im_poses:
+        known_poses_msk = torch.tensor([not (p.requires_grad) for p in self.im_poses])
+        known_poses = self.get_im_poses()
+        return known_poses_msk.sum(), known_poses_msk, known_poses
+    else:
+        return 0, None, None
+
+
+def get_known_focals(self):
+    if self.has_im_poses:
+        known_focal_msk = self.get_known_focal_mask()
+        known_focals = self.get_focals()
+        return known_focal_msk.sum(), known_focal_msk, known_focals
+    else:
+        return 0, None, None
+
+
+def align_multiple_poses(src_poses, target_poses):
+    N = len(src_poses)
+    assert src_poses.shape == target_poses.shape == (N, 4, 4)
+
+    def center_and_z(poses):
+        eps = get_med_dist_between_poses(poses) / 100
+        return torch.cat((poses[:, :3, 3], poses[:, :3, 3] + eps*poses[:, :3, 2]))
+    R, T, s = roma.rigid_points_registration(center_and_z(src_poses), center_and_z(target_poses), compute_scaling=True)
+    return s, R, T
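
To make the spanning-tree edge selection above concrete, here is a toy sketch of the same scipy mechanics used by `dict_to_sparse_graph` and `minimum_spanning_tree`: scores are negated because scipy keeps the smallest weights, so the retained tree is the one with the highest confidences. The scores below are made up:

```python
import scipy.sparse as sp

# made-up edge confidences for 4 images (higher = better)
scores = {(0, 1): 5.0, (1, 2): 2.0, (0, 2): 4.0, (2, 3): 3.0, (1, 3): 1.0}

n = max(max(e) for e in scores) + 1
g = sp.dok_array((n, n))
for (i, j), s in scores.items():
    g[i, j] = -s  # negate: minimum_spanning_tree keeps the *smallest* weights

mst = sp.csgraph.minimum_spanning_tree(g.tocsr()).tocoo()
todo = sorted(zip(-mst.data, mst.row, mst.col))  # as in the code above
while todo:
    score, i, j = todo.pop()  # best remaining tree edge first
    print(f'edge ({i},{j}) {score=}')
# -> edges (0,1), (0,2), (2,3): a spanning tree covering all 4 images
```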
dust3r/cloud_opt/optimizer.py
ADDED
@@ -0,0 +1,249 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Main class for the implementation of the global alignment
+# --------------------------------------------------------
+import numpy as np
+import torch
+import torch.nn as nn
+
+from dust3r.cloud_opt.base_opt import BasePCOptimizer
+from dust3r.utils.geometry import xy_grid, geotrf
+from dust3r.utils.device import to_cpu, to_numpy
+
+
+class PointCloudOptimizer(BasePCOptimizer):
+    """ Optimize a global scene, given a list of pairwise observations.
+    Graph node: images
+    Graph edges: observations = (pred1, pred2)
+    """
+
+    def __init__(self, *args, optimize_pp=False, focal_break=20, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.has_im_poses = True  # by definition of this class
+        self.focal_break = focal_break
+
+        # adding thing to optimize
+        self.im_depthmaps = nn.ParameterList(torch.randn(H, W)/10-3 for H, W in self.imshapes)  # log(depth)
+        self.im_poses = nn.ParameterList(self.rand_pose(self.POSE_DIM) for _ in range(self.n_imgs))  # camera poses
+        self.im_focals = nn.ParameterList(torch.FloatTensor(
+            [self.focal_break*np.log(max(H, W))]) for H, W in self.imshapes)  # camera intrinsics
+        self.im_pp = nn.ParameterList(torch.zeros((2,)) for _ in range(self.n_imgs))  # camera intrinsics
+        self.im_pp.requires_grad_(optimize_pp)
+
+        self.imshape = self.imshapes[0]
+        im_areas = [h*w for h, w in self.imshapes]
+        self.max_area = max(im_areas)
+
+        # adding thing to optimize
+        self.im_depthmaps = ParameterStack(self.im_depthmaps, is_param=True, fill=self.max_area)
+        self.im_poses = ParameterStack(self.im_poses, is_param=True)
+        self.im_focals = ParameterStack(self.im_focals, is_param=True)
+        self.im_pp = ParameterStack(self.im_pp, is_param=True)
+        self.register_buffer('_pp', torch.tensor([(w/2, h/2) for h, w in self.imshapes]))
+        self.register_buffer('_grid', ParameterStack(
+            [xy_grid(W, H, device=self.device) for H, W in self.imshapes], fill=self.max_area))
+
+        # pre-compute pixel weights
+        self.register_buffer('_weight_i', ParameterStack(
+            [self.conf_trf(self.conf_i[i_j]) for i_j in self.str_edges], fill=self.max_area))
+        self.register_buffer('_weight_j', ParameterStack(
+            [self.conf_trf(self.conf_j[i_j]) for i_j in self.str_edges], fill=self.max_area))
+
+        # precompute
+        self.register_buffer('_stacked_pred_i', ParameterStack(self.pred_i, self.str_edges, fill=self.max_area))
+        self.register_buffer('_stacked_pred_j', ParameterStack(self.pred_j, self.str_edges, fill=self.max_area))
+        self.register_buffer('_ei', torch.tensor([i for i, j in self.edges]))
+        self.register_buffer('_ej', torch.tensor([j for i, j in self.edges]))
+        self.total_area_i = sum([im_areas[i] for i, j in self.edges])
+        self.total_area_j = sum([im_areas[j] for i, j in self.edges])
+
+    def _check_all_imgs_are_selected(self, msk):
+        assert np.all(self._get_msk_indices(msk) == np.arange(self.n_imgs)), 'incomplete mask!'
+
+    def preset_pose(self, known_poses, pose_msk=None):  # cam-to-world
+        self._check_all_imgs_are_selected(pose_msk)
+
+        if isinstance(known_poses, torch.Tensor) and known_poses.ndim == 2:
+            known_poses = [known_poses]
+        for idx, pose in zip(self._get_msk_indices(pose_msk), known_poses):
+            print(f' (setting pose #{idx} = {pose[:3,3]})')
+            self._no_grad(self._set_pose(self.im_poses, idx, torch.tensor(pose)))
+
+        # normalize scale if there's less than 1 known pose
+        n_known_poses = sum((p.requires_grad is False) for p in self.im_poses)
+        self.norm_pw_scale = (n_known_poses <= 1)
+
+        self.im_poses.requires_grad_(False)
+        self.norm_pw_scale = False
+
+    def preset_focal(self, known_focals, msk=None):
+        self._check_all_imgs_are_selected(msk)
+
+        for idx, focal in zip(self._get_msk_indices(msk), known_focals):
+            print(f' (setting focal #{idx} = {focal})')
+            self._no_grad(self._set_focal(idx, focal))
+
+        self.im_focals.requires_grad_(False)
+
+    def preset_principal_point(self, known_pp, msk=None):
+        self._check_all_imgs_are_selected(msk)
+
+        for idx, pp in zip(self._get_msk_indices(msk), known_pp):
+            print(f' (setting principal point #{idx} = {pp})')
+            self._no_grad(self._set_principal_point(idx, pp))
+
+        self.im_pp.requires_grad_(False)
+
+    def _get_msk_indices(self, msk):
+        if msk is None:
+            return range(self.n_imgs)
+        elif isinstance(msk, int):
+            return [msk]
+        elif isinstance(msk, (tuple, list)):
+            return self._get_msk_indices(np.array(msk))
+        elif msk.dtype in (bool, torch.bool, np.bool_):
+            assert len(msk) == self.n_imgs
+            return np.cumsum([0] + msk.tolist())
+        elif np.issubdtype(msk.dtype, np.integer):
+            return msk
+        else:
+            raise ValueError(f'bad {msk=}')
+
+    def _no_grad(self, tensor):
+        assert tensor.requires_grad, 'it must be True at this point, otherwise no modification occurs'
+
+    def _set_focal(self, idx, focal, force=False):
+        param = self.im_focals[idx]
+        if param.requires_grad or force:  # can only init a parameter not already initialized
+            param.data[:] = self.focal_break * np.log(focal)
+        return param
+
+    def get_focals(self):  # 'Recovering intrinsics' in the paper: recover the focal length
+        log_focals = torch.stack(list(self.im_focals), dim=0)
+        return (log_focals / self.focal_break).exp()
+
+    def get_known_focal_mask(self):
+        return torch.tensor([not (p.requires_grad) for p in self.im_focals])
+
+    def _set_principal_point(self, idx, pp, force=False):
+        param = self.im_pp[idx]
+        H, W = self.imshapes[idx]
+        if param.requires_grad or force:  # can only init a parameter not already initialized
+            param.data[:] = to_cpu(to_numpy(pp) - (W/2, H/2)) / 10
+        return param
+
+    def get_principal_points(self):
+        return self._pp + 10 * self.im_pp  # offset between the image- and pixel-coordinate centers
+
+    def get_intrinsics(self):
+        K = torch.zeros((self.n_imgs, 3, 3), device=self.device)
+        focals = self.get_focals().flatten()
+        K[:, 0, 0] = K[:, 1, 1] = focals
+        K[:, :2, 2] = self.get_principal_points()
+        K[:, 2, 2] = 1
+        return K
+
+    def get_im_poses(self):  # cam-to-world (inverse of the extrinsics)
+        cam2world = self._get_poses(self.im_poses)
+        return cam2world
+
+    def _set_depthmap(self, idx, depth, force=False):
+        depth = _ravel_hw(depth, self.max_area)
+
+        param = self.im_depthmaps[idx]
+        if param.requires_grad or force:  # can only init a parameter not already initialized
+            param.data[:] = depth.log().nan_to_num(neginf=0)
+        return param
+
+    def get_depthmaps(self, raw=False):  # the depth D introduced just above Eq. (1) in the paper
+        res = self.im_depthmaps.exp()
+        if not raw:
+            res = [dm[:h*w].view(h, w) for dm, (h, w) in zip(res, self.imshapes)]
+        return res
+
+    def depth_to_pts3d(self):  # recover world-frame points from the depth D, per the formula above Eq. (1)
+        # Get depths and projection params if not provided
+        focals = self.get_focals()  # 'Recovering intrinsics': the focal lengths
+        pp = self.get_principal_points()  # offset between image and pixel coordinates, i.e. half the image width/height
+        im_poses = self.get_im_poses()  # camera poses
+        depth = self.get_depthmaps(raw=True)  # the depth D above Eq. (1)
+
+        # get pointmaps in camera frame; self._grid holds the pixel grids of all input images
+        rel_ptmaps = _fast_depthmap_to_pts3d(depth, self._grid, focals, pp=pp)  # lift pixels into camera-frame 3D points
+        # project to world frame
+        return geotrf(im_poses, rel_ptmaps)  # then camera frame -> world frame
+
+    def get_pts3d(self, raw=False):  # world-frame 3D points, computed from the depth D as above Eq. (1)
+        res = self.depth_to_pts3d()
+        if not raw:
+            res = [dm[:h*w].view(h, w, 3) for dm, (h, w) in zip(res, self.imshapes)]
+        return res
+
+    # forward returns exactly the loss of Eq. (5)
+    def forward(self):  # 'Global optimization' in the paper
+        pw_poses = self.get_pw_poses()  # P_e in Eq. (5): cam-to-world (inverse extrinsics), requires_grad=True
+        pw_adapt = self.get_adaptors().unsqueeze(1)  # the scale factor sigma in Eq. (5), requires_grad=False
+        proj_pts3d = self.get_pts3d(raw=True)  # the world-frame points being optimized in Eq. (5), requires_grad=True
+
+        # rotate pairwise predictions into the world frame with the pose part of Eq. (5)
+        aligned_pred_i = geotrf(pw_poses, pw_adapt * self._stacked_pred_i)  # _stacked_pred_i/j: dust3r's predicted pointmaps, requires_grad=False
+        aligned_pred_j = geotrf(pw_poses, pw_adapt * self._stacked_pred_j)
+
+        # compute the loss: compare each aligned pairwise prediction against the world-frame points proj_pts3d
+        li = self.dist(proj_pts3d[self._ei], aligned_pred_i, weight=self._weight_i).sum() / self.total_area_i
+        lj = self.dist(proj_pts3d[self._ej], aligned_pred_j, weight=self._weight_j).sum() / self.total_area_j
+
+        return li + lj
+
+
+def _fast_depthmap_to_pts3d(depth, pixel_grid, focal, pp):
+    pp = pp.unsqueeze(1)
+    focal = focal.unsqueeze(1)
+    assert focal.shape == (len(depth), 1, 1)
+    assert pp.shape == (len(depth), 1, 2)
+    assert pixel_grid.shape == depth.shape + (2,)
+    depth = depth.unsqueeze(-1)
+    return torch.cat((depth * (pixel_grid - pp) / focal, depth), dim=-1)  # formula above Eq. (1): back-project pixels into camera-frame 3D points from intrinsics and depth D
+
+
+def ParameterStack(params, keys=None, is_param=None, fill=0):
+    if keys is not None:
+        params = [params[k] for k in keys]
+
+    if fill > 0:
+        params = [_ravel_hw(p, fill) for p in params]
+
+    requires_grad = params[0].requires_grad
+    assert all(p.requires_grad == requires_grad for p in params)
+
+    params = torch.stack(list(params)).float().detach()
+    if is_param or requires_grad:
+        params = nn.Parameter(params)
+        params.requires_grad_(requires_grad)
+    return params
+
+
+def _ravel_hw(tensor, fill=0):
+    # ravel H,W
+    tensor = tensor.view((tensor.shape[0] * tensor.shape[1],) + tensor.shape[2:])
+
+    if len(tensor) < fill:
+        tensor = torch.cat((tensor, tensor.new_zeros((fill - len(tensor),)+tensor.shape[1:])))
+    return tensor
+
+
+def acceptable_focal_range(H, W, minf=0.5, maxf=3.5):
+    focal_base = max(H, W) / (2 * np.tan(np.deg2rad(60) / 2))  # size / 1.1547005383792515
+    return minf*focal_base, maxf*focal_base
+
+
+def apply_mask(img, msk):
+    img = img.copy()
+    img[msk] = 0
+    return img
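
A tiny numeric check of the pinhole back-projection in `_fast_depthmap_to_pts3d` above: a pixel at the principal point maps to (0, 0, depth), and a pixel one focal length to the right of it maps to x = depth. The function body is copied from this file (asserts dropped for brevity):

```python
import torch

def fast_depthmap_to_pts3d(depth, pixel_grid, focal, pp):
    # copied from _fast_depthmap_to_pts3d above
    pp = pp.unsqueeze(1)
    focal = focal.unsqueeze(1)
    depth = depth.unsqueeze(-1)
    return torch.cat((depth * (pixel_grid - pp) / focal, depth), dim=-1)

focal = torch.tensor([[100.0]])            # (n_imgs=1, 1)
pp = torch.tensor([[32.0, 24.0]])          # (1, 2) principal point
pixels = torch.tensor([[[32.0, 24.0],      # at the principal point
                        [132.0, 24.0]]])   # one focal length to the right
depth = torch.tensor([[2.0, 2.0]])         # (1, n_pixels=2)

print(fast_depthmap_to_pts3d(depth, pixels, focal, pp))
# tensor([[[0., 0., 2.],
#          [2., 0., 2.]]])
```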
dust3r/cloud_opt/pair_viewer.py
ADDED
@@ -0,0 +1,125 @@
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# Dummy optimizer for visualizing pairs
# --------------------------------------------------------
import numpy as np
import torch
import torch.nn as nn
import cv2

from dust3r.cloud_opt.base_opt import BasePCOptimizer
from dust3r.utils.geometry import inv, geotrf, depthmap_to_absolute_camera_coordinates
from dust3r.cloud_opt.commons import edge_str
from dust3r.post_process import estimate_focal_knowing_depth


class PairViewer (BasePCOptimizer):
    """
    This is a dummy optimizer.
    Use it only when the goal is to visualize the results for a single pair of images (with is_symmetrized).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.is_symmetrized and self.n_edges == 2
        self.has_im_poses = True

        # compute all parameters directly from raw input
        self.focals = []
        self.pp = []
        rel_poses = []
        confs = []
        for i in range(self.n_imgs):
            conf = float(self.conf_i[edge_str(i, 1-i)].mean() * self.conf_j[edge_str(i, 1-i)].mean())
            print(f' - {conf=:.3} for edge {i}-{1-i}')
            confs.append(conf)

            H, W = self.imshapes[i]
            pts3d = self.pred_i[edge_str(i, 1-i)]
            pp = torch.tensor((W/2, H/2))
            focal = float(estimate_focal_knowing_depth(pts3d[None], pp, focal_mode='weiszfeld'))
            self.focals.append(focal)
            self.pp.append(pp)

            # estimate the pose of pts1 in image 2
            pixels = np.mgrid[:W, :H].T.astype(np.float32)
            pts3d = self.pred_j[edge_str(1-i, i)].numpy()
            assert pts3d.shape[:2] == (H, W)
            msk = self.get_masks()[i].numpy()
            K = np.float32([(focal, 0, pp[0]), (0, focal, pp[1]), (0, 0, 1)])

            try:
                res = cv2.solvePnPRansac(pts3d[msk], pixels[msk], K, None,
                                         iterationsCount=100, reprojectionError=5, flags=cv2.SOLVEPNP_SQPNP)
                success, R, T, inliers = res
                assert success

                R = cv2.Rodrigues(R)[0]  # world to cam
                pose = inv(np.r_[np.c_[R, T], [(0, 0, 0, 1)]])  # cam to world
            except Exception:
                # PnP can fail (e.g. too few valid points); fall back to the identity pose
                pose = np.eye(4)
            rel_poses.append(torch.from_numpy(pose.astype(np.float32)))

        # let's use the pair with the most confidence
        if confs[0] > confs[1]:
            # ptcloud is expressed in camera1
            self.im_poses = [torch.eye(4), rel_poses[1]]  # I, cam2-to-cam1
            self.depth = [self.pred_i['0_1'][..., 2], geotrf(inv(rel_poses[1]), self.pred_j['0_1'])[..., 2]]
        else:
            # ptcloud is expressed in camera2
            self.im_poses = [rel_poses[0], torch.eye(4)]  # cam1-to-cam2, I
            self.depth = [geotrf(inv(rel_poses[0]), self.pred_j['1_0'])[..., 2], self.pred_i['1_0'][..., 2]]

        self.im_poses = nn.Parameter(torch.stack(self.im_poses, dim=0), requires_grad=False)
        self.focals = nn.Parameter(torch.tensor(self.focals), requires_grad=False)
        self.pp = nn.Parameter(torch.stack(self.pp, dim=0), requires_grad=False)
        self.depth = nn.ParameterList(self.depth)
        for p in self.parameters():
            p.requires_grad = False

    def _set_depthmap(self, idx, depth, force=False):
        print('_set_depthmap is ignored in PairViewer')
        return

    def get_depthmaps(self, raw=False):
        depth = [d.to(self.device) for d in self.depth]
        return depth

    def _set_focal(self, idx, focal, force=False):
        self.focals[idx] = focal

    def get_focals(self):
        return self.focals

    def get_known_focal_mask(self):
        return torch.tensor([not (p.requires_grad) for p in self.focals])

    def get_principal_points(self):
        return self.pp

    def get_intrinsics(self):
        focals = self.get_focals()
        pps = self.get_principal_points()
        K = torch.zeros((len(focals), 3, 3), device=self.device)
        for i in range(len(focals)):
            K[i, 0, 0] = K[i, 1, 1] = focals[i]
            K[i, :2, 2] = pps[i]
            K[i, 2, 2] = 1
        return K

    def get_im_poses(self):
        return self.im_poses

    def depth_to_pts3d(self):
        pts3d = []
        for d, intrinsics, im_pose in zip(self.depth, self.get_intrinsics(), self.get_im_poses()):
            pts, _ = depthmap_to_absolute_camera_coordinates(d.cpu().numpy(),
                                                             intrinsics.cpu().numpy(),
                                                             im_pose.cpu().numpy())
            pts3d.append(torch.from_numpy(pts).to(device=self.device))
        return pts3d

    def forward(self):
        return float('nan')
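
A hedged usage sketch (not part of the commit): PairViewer is normally reached through global_aligner with the PairViewer mode. Checkpoint path and image filenames below are placeholders, and load_model is assumed to be the loader from the original DUSt3R demo.

from dust3r.model import load_model
from dust3r.inference import inference
from dust3r.utils.image import load_images
from dust3r.image_pairs import make_pairs
from dust3r.cloud_opt import global_aligner, GlobalAlignerMode

model = load_model('checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth', 'cuda')  # placeholder path
images = load_images(['img1.jpg', 'img2.jpg'], size=512)  # placeholder filenames
pairs = make_pairs(images, scene_graph='complete', symmetrize=True)
output = inference(pairs, model, 'cuda', batch_size=1)

scene = global_aligner(output, device='cuda', mode=GlobalAlignerMode.PairViewer)
poses = scene.get_im_poses()   # (2, 4, 4) cam-to-world matrices; one is the identity
focals = scene.get_focals()    # per-image focal estimates from the Weiszfeld solver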
dust3r/datasets/__init__.py
ADDED
@@ -0,0 +1,42 @@
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
from .utils.transforms import *
from .base.batched_sampler import BatchedRandomSampler  # noqa: F401
from .co3d import Co3d  # noqa: F401


def get_data_loader(dataset, batch_size, num_workers=8, shuffle=True, drop_last=True, pin_mem=True):
    import torch
    from croco.utils.misc import get_world_size, get_rank

    # pytorch dataset
    if isinstance(dataset, str):
        dataset = eval(dataset)

    world_size = get_world_size()
    rank = get_rank()

    try:
        sampler = dataset.make_sampler(batch_size, shuffle=shuffle, world_size=world_size,
                                       rank=rank, drop_last=drop_last)
    except (AttributeError, NotImplementedError):
        # not avail for this dataset
        if torch.distributed.is_initialized():
            sampler = torch.utils.data.DistributedSampler(
                dataset, num_replicas=world_size, rank=rank, shuffle=shuffle, drop_last=drop_last
            )
        elif shuffle:
            sampler = torch.utils.data.RandomSampler(dataset)
        else:
            sampler = torch.utils.data.SequentialSampler(dataset)

    data_loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=pin_mem,
        drop_last=drop_last,
    )

    return data_loader
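
A short sketch of the string form (not in the file above): the dataset argument can be a string that is eval'ed in this module's scope, which is how training configs typically specify datasets. The ROOT path is a placeholder and requires the preprocessed data on disk.

from dust3r.datasets import get_data_loader

loader = get_data_loader("Co3d(split='train', ROOT='data/co3d_processed', resolution=224)",
                         batch_size=4, num_workers=0)
view1, view2 = next(iter(loader))  # two batched view dicts ('img', 'depthmap', 'pts3d', ...)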
dust3r/datasets/base/__init__.py
ADDED
@@ -0,0 +1,2 @@
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
dust3r/datasets/base/base_stereo_view_dataset.py
ADDED
@@ -0,0 +1,220 @@
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# base class for implementing datasets
# --------------------------------------------------------
import PIL
import numpy as np
import torch

from dust3r.datasets.base.easy_dataset import EasyDataset
from dust3r.datasets.utils.transforms import ImgNorm
from dust3r.utils.geometry import depthmap_to_absolute_camera_coordinates
import dust3r.datasets.utils.cropping as cropping


class BaseStereoViewDataset (EasyDataset):
    """ Define all basic options.

    Usage:
        class MyDataset (BaseStereoViewDataset):
            def _get_views(self, idx, rng):
                # overload here
                views = []
                views.append(dict(img=, ...))
                return views
    """

    def __init__(self, *,  # only keyword arguments
                 split=None,
                 resolution=None,  # square_size or (width, height) or list of [(width,height), ...]
                 transform=ImgNorm,
                 aug_crop=False,
                 seed=None):
        self.num_views = 2
        self.split = split
        self._set_resolutions(resolution)

        if isinstance(transform, str):
            # resolve string specs (e.g. 'ColorJitter') before storing the transform
            transform = eval(transform)
        self.transform = transform

        self.aug_crop = aug_crop
        self.seed = seed

    def __len__(self):
        return len(self.scenes)

    def get_stats(self):
        return f"{len(self)} pairs"

    def __repr__(self):
        resolutions_str = '['+';'.join(f'{w}x{h}' for w, h in self._resolutions)+']'
        return f"""{type(self).__name__}({self.get_stats()},
            {self.split=},
            {self.seed=},
            resolutions={resolutions_str},
            {self.transform=})""".replace('self.', '').replace('\n', '').replace('   ', '')

    def _get_views(self, idx, resolution, rng):
        raise NotImplementedError()

    def __getitem__(self, idx):
        if isinstance(idx, tuple):
            # the idx is specifying the aspect-ratio
            idx, ar_idx = idx
        else:
            assert len(self._resolutions) == 1
            ar_idx = 0

        # set-up the rng
        if self.seed:  # reseed for each __getitem__
            self._rng = np.random.default_rng(seed=self.seed + idx)
        elif not hasattr(self, '_rng'):
            seed = torch.initial_seed()  # this is different for each dataloader process
            self._rng = np.random.default_rng(seed=seed)

        # over-loaded code
        resolution = self._resolutions[ar_idx]  # DO NOT CHANGE THIS (compatible with BatchedRandomSampler)
        views = self._get_views(idx, resolution, self._rng)
        assert len(views) == self.num_views

        # check data-types
        for v, view in enumerate(views):
            assert 'pts3d' not in view, f"pts3d should not be there, they will be computed afterwards based on intrinsics+depthmap for view {view_name(view)}"
            view['idx'] = (idx, ar_idx, v)

            # encode the image
            width, height = view['img'].size
            view['true_shape'] = np.int32((height, width))
            view['img'] = self.transform(view['img'])

            assert 'camera_intrinsics' in view
            if 'camera_pose' not in view:
                view['camera_pose'] = np.full((4, 4), np.nan, dtype=np.float32)
            else:
                assert np.isfinite(view['camera_pose']).all(), f'NaN in camera pose for view {view_name(view)}'
            assert 'pts3d' not in view
            assert 'valid_mask' not in view
            assert np.isfinite(view['depthmap']).all(), f'NaN in depthmap for view {view_name(view)}'
            pts3d, valid_mask = depthmap_to_absolute_camera_coordinates(**view)

            view['pts3d'] = pts3d
            view['valid_mask'] = valid_mask & np.isfinite(pts3d).all(axis=-1)

            # check all datatypes
            for key, val in view.items():
                res, err_msg = is_good_type(key, val)
                assert res, f"{err_msg} with {key}={val} for view {view_name(view)}"
            K = view['camera_intrinsics']

        # last thing done!
        for view in views:
            # transpose to make sure all views are the same size
            transpose_to_landscape(view)
            # this makes it possible to check whether the RNG is in the same state each time
            view['rng'] = int.from_bytes(self._rng.bytes(4), 'big')
        return views

    def _set_resolutions(self, resolutions):
        assert resolutions is not None, 'undefined resolution'

        if not isinstance(resolutions, list):
            resolutions = [resolutions]

        self._resolutions = []
        for resolution in resolutions:
            if isinstance(resolution, int):
                width = height = resolution
            else:
                width, height = resolution
            assert isinstance(width, int), f'Bad type for {width=} {type(width)=}, should be int'
            assert isinstance(height, int), f'Bad type for {height=} {type(height)=}, should be int'
            assert width >= height
            self._resolutions.append((width, height))

    def _crop_resize_if_necessary(self, image, depthmap, intrinsics, resolution, rng=None, info=None):
        """ This function:
            - first downsizes the image with LANCZOS interpolation,
              which suffers less from aliasing than bilinear interpolation,
            - then crops it to the target resolution, centered on the principal point.
        """
        if not isinstance(image, PIL.Image.Image):
            image = PIL.Image.fromarray(image)

        # downscale with lanczos interpolation so that image.size == resolution
        # cropping centered on the principal point
        W, H = image.size
        cx, cy = intrinsics[:2, 2].round().astype(int)
        min_margin_x = min(cx, W-cx)
        min_margin_y = min(cy, H-cy)
        assert min_margin_x > W/5, f'Bad principal point in view={info}'
        assert min_margin_y > H/5, f'Bad principal point in view={info}'
        # the new window will be a rectangle of size (2*min_margin_x, 2*min_margin_y) centered on (cx,cy)
        l, t = cx - min_margin_x, cy - min_margin_y
        r, b = cx + min_margin_x, cy + min_margin_y
        crop_bbox = (l, t, r, b)
        image, depthmap, intrinsics = cropping.crop_image_depthmap(image, depthmap, intrinsics, crop_bbox)

        # transpose the resolution if necessary
        W, H = image.size  # new size
        assert resolution[0] >= resolution[1]
        if H > 1.1*W:
            # image is portrait mode
            resolution = resolution[::-1]
        elif 0.9 < H/W < 1.1 and resolution[0] != resolution[1]:
            # image is square, so we chose (portrait, landscape) randomly
            if rng.integers(2):
                resolution = resolution[::-1]

        # high-quality Lanczos down-scaling
        target_resolution = np.array(resolution)
        if self.aug_crop > 1:
            target_resolution += rng.integers(0, self.aug_crop)
        image, depthmap, intrinsics = cropping.rescale_image_depthmap(image, depthmap, intrinsics, target_resolution)

        # actual cropping (if necessary) with bilinear interpolation
        intrinsics2 = cropping.camera_matrix_of_crop(intrinsics, image.size, resolution, offset_factor=0.5)
        crop_bbox = cropping.bbox_from_intrinsics_in_out(intrinsics, intrinsics2, resolution)
        image, depthmap, intrinsics2 = cropping.crop_image_depthmap(image, depthmap, intrinsics, crop_bbox)

        return image, depthmap, intrinsics2


def is_good_type(key, v):
    """ returns (is_good, err_msg)
    """
    if isinstance(v, (str, int, tuple)):
        return True, None
    if v.dtype not in (np.float32, torch.float32, bool, np.int32, np.int64, np.uint8):
        return False, f"bad {v.dtype=}"
    return True, None


def view_name(view, batch_index=None):
    def sel(x): return x[batch_index] if batch_index not in (None, slice(None)) else x
    db = sel(view['dataset'])
    label = sel(view['label'])
    instance = sel(view['instance'])
    return f"{db}/{label}/{instance}"


def transpose_to_landscape(view):
    height, width = view['true_shape']

    if width < height:
        # rectify portrait to landscape
        assert view['img'].shape == (3, height, width)
        view['img'] = view['img'].swapaxes(1, 2)

        assert view['valid_mask'].shape == (height, width)
        view['valid_mask'] = view['valid_mask'].swapaxes(0, 1)

        assert view['depthmap'].shape == (height, width)
        view['depthmap'] = view['depthmap'].swapaxes(0, 1)

        assert view['pts3d'].shape == (height, width, 3)
        view['pts3d'] = view['pts3d'].swapaxes(0, 1)

        # transpose x and y pixels
        view['camera_intrinsics'] = view['camera_intrinsics'][[1, 0, 2]]
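
To make the overloading contract concrete, here is a toy subclass sketch (hypothetical, not part of the commit; images and depths are synthetic) that exercises the full __getitem__ pipeline, including _crop_resize_if_necessary:

import numpy as np
import PIL.Image


class RandomPairs (BaseStereoViewDataset):
    """ Toy dataset serving two synthetic views per index (illustration only). """

    def __init__(self, n=100, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.scenes = list(range(n))  # the base __len__ reads self.scenes

    def _get_views(self, idx, resolution, rng):
        views = []
        for _ in range(self.num_views):
            W = H = 256
            img = PIL.Image.fromarray(rng.integers(0, 255, (H, W, 3), dtype=np.uint8))
            depthmap = np.ones((H, W), dtype=np.float32)
            intrinsics = np.float32([(300, 0, W/2), (0, 300, H/2), (0, 0, 1)])
            img, depthmap, intrinsics = self._crop_resize_if_necessary(
                img, depthmap, intrinsics, resolution, rng=rng, info='toy')
            views.append(dict(img=img, depthmap=depthmap,
                              camera_intrinsics=intrinsics,
                              camera_pose=np.eye(4, dtype=np.float32),
                              dataset='Toy', label='toy', instance=str(idx)))
        return views


# views come back fully processed: normalized image tensor, pts3d, valid_mask, ...
dataset = RandomPairs(split='train', resolution=224)
view1, view2 = dataset[0]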
dust3r/datasets/base/batched_sampler.py
ADDED
@@ -0,0 +1,74 @@
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# Random sampling under a constraint
# --------------------------------------------------------
import numpy as np
import torch


class BatchedRandomSampler:
    """ Random sampling under a constraint: each sample in the batch has the same feature,
    which is chosen randomly from a known pool of 'features' for each batch.

    For instance, the 'feature' could be the image aspect-ratio.

    The index returned is a tuple (sample_idx, feat_idx).
    This sampler ensures that each series of `batch_size` indices has the same `feat_idx`.
    """

    def __init__(self, dataset, batch_size, pool_size, world_size=1, rank=0, drop_last=True):
        self.batch_size = batch_size
        self.pool_size = pool_size

        self.len_dataset = N = len(dataset)
        self.total_size = round_by(N, batch_size*world_size) if drop_last else N
        assert world_size == 1 or drop_last, 'must drop the last batch in distributed mode'

        # distributed sampler
        self.world_size = world_size
        self.rank = rank
        self.epoch = None

    def __len__(self):
        return self.total_size // self.world_size

    def set_epoch(self, epoch):
        self.epoch = epoch

    def __iter__(self):
        # prepare RNG
        if self.epoch is None:
            assert self.world_size == 1 and self.rank == 0, 'use set_epoch() if distributed mode is used'
            seed = int(torch.empty((), dtype=torch.int64).random_().item())
        else:
            seed = self.epoch + 777
        rng = np.random.default_rng(seed=seed)

        # random indices (will restart from 0 if not drop_last)
        sample_idxs = np.arange(self.total_size)
        rng.shuffle(sample_idxs)

        # random feat_idxs (same across each batch)
        n_batches = (self.total_size+self.batch_size-1) // self.batch_size
        feat_idxs = rng.integers(self.pool_size, size=n_batches)
        feat_idxs = np.broadcast_to(feat_idxs[:, None], (n_batches, self.batch_size))
        feat_idxs = feat_idxs.ravel()[:self.total_size]

        # put them together
        idxs = np.c_[sample_idxs, feat_idxs]  # shape = (total_size, 2)

        # Distributed sampler: we select a subset of batches
        # make sure the slice for each node is aligned with batch_size
        size_per_proc = self.batch_size * ((self.total_size + self.world_size *
                                            self.batch_size-1) // (self.world_size * self.batch_size))
        idxs = idxs[self.rank*size_per_proc: (self.rank+1)*size_per_proc]

        yield from (tuple(idx) for idx in idxs)


def round_by(total, multiple, up=False):
    if up:
        total = total + multiple-1
    return (total//multiple) * multiple
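
A small sketch (toy sizes, not part of the commit) verifying the sampler's invariant: within each consecutive batch of indices, feat_idx is constant.

# toy dataset stand-in: only its length matters to the sampler
class _Ten:
    def __len__(self):
        return 10


sampler = BatchedRandomSampler(_Ten(), batch_size=5, pool_size=3)
idxs = list(sampler)                  # [(sample_idx, feat_idx), ...]
for b in range(0, len(idxs), 5):
    feats = {feat for _, feat in idxs[b:b + 5]}
    assert len(feats) == 1            # same feat_idx within each batch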
dust3r/datasets/base/easy_dataset.py
ADDED
@@ -0,0 +1,157 @@
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# A dataset base class that you can easily resize and combine.
# --------------------------------------------------------
import numpy as np
from dust3r.datasets.base.batched_sampler import BatchedRandomSampler


class EasyDataset:
    """ a dataset that you can easily resize and combine.
    Examples:
    ---------
        2 * dataset ==> duplicate each element 2x

        10 @ dataset ==> set the size to 10 (random sampling, duplicates if necessary)

        dataset1 + dataset2 ==> concatenate datasets
    """

    def __add__(self, other):
        return CatDataset([self, other])

    def __rmul__(self, factor):
        return MulDataset(factor, self)

    def __rmatmul__(self, factor):
        return ResizedDataset(factor, self)

    def set_epoch(self, epoch):
        pass  # nothing to do by default

    def make_sampler(self, batch_size, shuffle=True, world_size=1, rank=0, drop_last=True):
        if not (shuffle):
            raise NotImplementedError()  # cannot deal yet
        num_of_aspect_ratios = len(self._resolutions)
        return BatchedRandomSampler(self, batch_size, num_of_aspect_ratios, world_size=world_size, rank=rank, drop_last=drop_last)


class MulDataset (EasyDataset):
    """ Artificially augmenting the size of a dataset.
    """
    multiplicator: int

    def __init__(self, multiplicator, dataset):
        assert isinstance(multiplicator, int) and multiplicator > 0
        self.multiplicator = multiplicator
        self.dataset = dataset

    def __len__(self):
        return self.multiplicator * len(self.dataset)

    def __repr__(self):
        return f'{self.multiplicator}*{repr(self.dataset)}'

    def __getitem__(self, idx):
        if isinstance(idx, tuple):
            idx, other = idx
            return self.dataset[idx // self.multiplicator, other]
        else:
            return self.dataset[idx // self.multiplicator]

    @property
    def _resolutions(self):
        return self.dataset._resolutions


class ResizedDataset (EasyDataset):
    """ Artificially changing the size of a dataset.
    """
    new_size: int

    def __init__(self, new_size, dataset):
        assert isinstance(new_size, int) and new_size > 0
        self.new_size = new_size
        self.dataset = dataset

    def __len__(self):
        return self.new_size

    def __repr__(self):
        size_str = str(self.new_size)
        for i in range((len(size_str)-1) // 3):
            sep = -4*i-3
            size_str = size_str[:sep] + '_' + size_str[sep:]
        return f'{size_str} @ {repr(self.dataset)}'

    def set_epoch(self, epoch):
        # this random shuffle only depends on the epoch
        rng = np.random.default_rng(seed=epoch+777)

        # shuffle all indices
        perm = rng.permutation(len(self.dataset))

        # rotary extension until target size is met
        shuffled_idxs = np.concatenate([perm] * (1 + (len(self)-1) // len(self.dataset)))
        self._idxs_mapping = shuffled_idxs[:self.new_size]

        assert len(self._idxs_mapping) == self.new_size

    def __getitem__(self, idx):
        assert hasattr(self, '_idxs_mapping'), 'You need to call dataset.set_epoch() to use ResizedDataset.__getitem__()'
        if isinstance(idx, tuple):
            idx, other = idx
            return self.dataset[self._idxs_mapping[idx], other]
        else:
            return self.dataset[self._idxs_mapping[idx]]

    @property
    def _resolutions(self):
        return self.dataset._resolutions


class CatDataset (EasyDataset):
    """ Concatenation of several datasets
    """

    def __init__(self, datasets):
        for dataset in datasets:
            assert isinstance(dataset, EasyDataset)
        self.datasets = datasets
        self._cum_sizes = np.cumsum([len(dataset) for dataset in datasets])

    def __len__(self):
        return self._cum_sizes[-1]

    def __repr__(self):
        # remove uselessly long transform
        return ' + '.join(repr(dataset).replace(',transform=Compose( ToTensor() Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)))', '') for dataset in self.datasets)

    def set_epoch(self, epoch):
        for dataset in self.datasets:
            dataset.set_epoch(epoch)

    def __getitem__(self, idx):
        other = None
        if isinstance(idx, tuple):
            idx, other = idx

        if not (0 <= idx < len(self)):
            raise IndexError()

        db_idx = np.searchsorted(self._cum_sizes, idx, 'right')
        dataset = self.datasets[db_idx]
        new_idx = idx - (self._cum_sizes[db_idx - 1] if db_idx > 0 else 0)

        if other is not None:
            new_idx = (new_idx, other)
        return dataset[new_idx]

    @property
    def _resolutions(self):
        resolutions = self.datasets[0]._resolutions
        for dataset in self.datasets[1:]:
            assert tuple(dataset._resolutions) == tuple(resolutions)
        return resolutions
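
A sketch of the dataset arithmetic from the EasyDataset docstring, using the toy RandomPairs dataset sketched earlier (a hypothetical class, not in the commit):

base = RandomPairs(n=100, split='train', resolution=224)

doubled = 2 * base           # MulDataset: each element repeated twice
assert len(doubled) == 200

resized = 300 @ base         # ResizedDataset: random re-sampling to 300 items
resized.set_epoch(0)         # required before indexing
assert len(resized) == 300

combo = doubled + resized    # CatDataset: concatenation
assert len(combo) == 500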
dust3r/datasets/co3d.py
ADDED
@@ -0,0 +1,146 @@
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# Dataloader for preprocessed Co3d_v2
# dataset at https://github.com/facebookresearch/co3d - Creative Commons Attribution-NonCommercial 4.0 International
# See datasets_preprocess/preprocess_co3d.py
# --------------------------------------------------------
import os.path as osp
import json
import itertools
from collections import deque

import cv2
import numpy as np

from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset
from dust3r.utils.image import imread_cv2


class Co3d(BaseStereoViewDataset):
    def __init__(self, mask_bg=True, *args, ROOT, **kwargs):
        self.ROOT = ROOT
        super().__init__(*args, **kwargs)
        assert mask_bg in (True, False, 'rand')
        self.mask_bg = mask_bg

        # load all scenes
        with open(osp.join(self.ROOT, f'selected_seqs_{self.split}.json'), 'r') as f:
            self.scenes = json.load(f)
            self.scenes = {k: v for k, v in self.scenes.items() if len(v) > 0}
            self.scenes = {(k, k2): v2 for k, v in self.scenes.items()
                           for k2, v2 in v.items()}
        self.scene_list = list(self.scenes.keys())

        # for each scene, we have 100 images ==> 360 degrees (so 25 frames ~= 90 degrees)
        # we prepare all combinations such that i-j = +/- [5, 10, .., 90] degrees
        self.combinations = [(i, j)
                             for i, j in itertools.combinations(range(100), 2)
                             if 0 < abs(i-j) <= 30 and abs(i-j) % 5 == 0]

        self.invalidate = {scene: {} for scene in self.scene_list}

    def __len__(self):
        return len(self.scene_list) * len(self.combinations)

    def _get_views(self, idx, resolution, rng):
        # choose a scene
        obj, instance = self.scene_list[idx // len(self.combinations)]
        image_pool = self.scenes[obj, instance]
        im1_idx, im2_idx = self.combinations[idx % len(self.combinations)]

        # add a bit of randomness
        last = len(image_pool)-1

        if resolution not in self.invalidate[obj, instance]:  # flag invalid images
            self.invalidate[obj, instance][resolution] = [False for _ in range(len(image_pool))]

        # decide now if we mask the bg
        mask_bg = (self.mask_bg is True) or (self.mask_bg == 'rand' and rng.choice(2))

        views = []
        imgs_idxs = [max(0, min(im_idx + rng.integers(-4, 5), last)) for im_idx in [im2_idx, im1_idx]]
        imgs_idxs = deque(imgs_idxs)
        while len(imgs_idxs) > 0:  # a few images have zero valid depth, so we may need to retry
            im_idx = imgs_idxs.pop()

            if self.invalidate[obj, instance][resolution][im_idx]:
                # search for a valid image
                random_direction = 2 * rng.choice(2) - 1
                for offset in range(1, len(image_pool)):
                    tentative_im_idx = (im_idx + (random_direction * offset)) % len(image_pool)
                    if not self.invalidate[obj, instance][resolution][tentative_im_idx]:
                        im_idx = tentative_im_idx
                        break

            view_idx = image_pool[im_idx]

            impath = osp.join(self.ROOT, obj, instance, 'images', f'frame{view_idx:06n}.jpg')

            # load camera params
            input_metadata = np.load(impath.replace('jpg', 'npz'))
            camera_pose = input_metadata['camera_pose'].astype(np.float32)
            intrinsics = input_metadata['camera_intrinsics'].astype(np.float32)

            # load image and depth
            rgb_image = imread_cv2(impath)
            depthmap = imread_cv2(impath.replace('images', 'depths') + '.geometric.png', cv2.IMREAD_UNCHANGED)
            depthmap = (depthmap.astype(np.float32) / 65535) * np.nan_to_num(input_metadata['maximum_depth'])

            if mask_bg:
                # load object mask
                maskpath = osp.join(self.ROOT, obj, instance, 'masks', f'frame{view_idx:06n}.png')
                maskmap = imread_cv2(maskpath, cv2.IMREAD_UNCHANGED).astype(np.float32)
                maskmap = (maskmap / 255.0) > 0.1

                # update the depthmap with mask
                depthmap *= maskmap

            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=impath)

            num_valid = (depthmap > 0.0).sum()
            if num_valid == 0:
                # problem, invalidate image and retry
                self.invalidate[obj, instance][resolution][im_idx] = True
                imgs_idxs.append(im_idx)
                continue

            views.append(dict(
                img=rgb_image,
                depthmap=depthmap,
                camera_pose=camera_pose,
                camera_intrinsics=intrinsics,
                dataset='Co3d_v2',
                label=osp.join(obj, instance),
                instance=osp.split(impath)[1],
            ))
        return views


if __name__ == "__main__":
    from dust3r.datasets.base.base_stereo_view_dataset import view_name
    from dust3r.viz import SceneViz, auto_cam_size
    from dust3r.utils.image import rgb

    dataset = Co3d(split='train', ROOT="data/co3d_subset_processed", resolution=224, aug_crop=16)

    for idx in np.random.permutation(len(dataset)):
        views = dataset[idx]
        assert len(views) == 2
        print(view_name(views[0]), view_name(views[1]))
        viz = SceneViz()
        poses = [views[view_idx]['camera_pose'] for view_idx in [0, 1]]
        cam_size = max(auto_cam_size(poses), 0.001)
        for view_idx in [0, 1]:
            pts3d = views[view_idx]['pts3d']
            valid_mask = views[view_idx]['valid_mask']
            colors = rgb(views[view_idx]['img'])
            viz.add_pointcloud(pts3d, colors, valid_mask)
            viz.add_camera(pose_c2w=views[view_idx]['camera_pose'],
                           focal=views[view_idx]['camera_intrinsics'][0, 0],
                           color=(view_idx*255, (1 - view_idx)*255, 0),  # one color per camera
                           image=colors,
                           cam_size=cam_size)
        viz.show()
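
For reference, a quick check (illustrative, not part of the file) of the pair-combination count built in Co3d.__init__: frame gaps of 5..30 in steps of 5 over 100 frames.

import itertools

combos = [(i, j) for i, j in itertools.combinations(range(100), 2)
          if 0 < abs(i - j) <= 30 and abs(i - j) % 5 == 0]
# for a gap g there are 100 - g pairs, so:
assert len(combos) == sum(100 - g for g in range(5, 31, 5))  # 475 pairs per scene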
dust3r/datasets/utils/__init__.py
ADDED
@@ -0,0 +1,2 @@
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
dust3r/datasets/utils/cropping.py
ADDED
@@ -0,0 +1,119 @@
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# cropping utilities
# --------------------------------------------------------
import PIL.Image
import os
os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
import cv2  # noqa
import numpy as np  # noqa
from dust3r.utils.geometry import colmap_to_opencv_intrinsics, opencv_to_colmap_intrinsics  # noqa
try:
    lanczos = PIL.Image.Resampling.LANCZOS
except AttributeError:
    lanczos = PIL.Image.LANCZOS


class ImageList:
    """ Convenience class to apply the same operation to a whole set of images.
    """

    def __init__(self, images):
        if not isinstance(images, (tuple, list, set)):
            images = [images]
        self.images = []
        for image in images:
            if not isinstance(image, PIL.Image.Image):
                image = PIL.Image.fromarray(image)
            self.images.append(image)

    def __len__(self):
        return len(self.images)

    def to_pil(self):
        return tuple(self.images) if len(self.images) > 1 else self.images[0]

    @property
    def size(self):
        sizes = [im.size for im in self.images]
        assert all(sizes[0] == s for s in sizes)
        return sizes[0]

    def resize(self, *args, **kwargs):
        return ImageList(self._dispatch('resize', *args, **kwargs))

    def crop(self, *args, **kwargs):
        return ImageList(self._dispatch('crop', *args, **kwargs))

    def _dispatch(self, func, *args, **kwargs):
        return [getattr(im, func)(*args, **kwargs) for im in self.images]


def rescale_image_depthmap(image, depthmap, camera_intrinsics, output_resolution):
    """ Jointly rescale an (image, depthmap) pair
        so that (out_width, out_height) >= output_res
    """
    image = ImageList(image)
    input_resolution = np.array(image.size)  # (W,H)
    output_resolution = np.array(output_resolution)
    if depthmap is not None:
        # can also use this with masks instead of depthmaps
        assert tuple(depthmap.shape[:2]) == image.size[::-1]
    assert output_resolution.shape == (2,)
    # define output resolution
    scale_final = max(output_resolution / image.size) + 1e-8
    output_resolution = np.floor(input_resolution * scale_final).astype(int)

    # first rescale the image so that it contains the crop
    image = image.resize(output_resolution, resample=lanczos)
    if depthmap is not None:
        depthmap = cv2.resize(depthmap, output_resolution, fx=scale_final,
                              fy=scale_final, interpolation=cv2.INTER_NEAREST)

    # no offset here; simple rescaling
    camera_intrinsics = camera_matrix_of_crop(
        camera_intrinsics, input_resolution, output_resolution, scaling=scale_final)

    return image.to_pil(), depthmap, camera_intrinsics


def camera_matrix_of_crop(input_camera_matrix, input_resolution, output_resolution, scaling=1, offset_factor=0.5, offset=None):
    # Margins to offset the origin
    margins = np.asarray(input_resolution) * scaling - output_resolution
    assert np.all(margins >= 0.0)
    if offset is None:
        offset = offset_factor * margins

    # Generate new camera parameters
    output_camera_matrix_colmap = opencv_to_colmap_intrinsics(input_camera_matrix)
    output_camera_matrix_colmap[:2, :] *= scaling
    output_camera_matrix_colmap[:2, 2] -= offset
    output_camera_matrix = colmap_to_opencv_intrinsics(output_camera_matrix_colmap)

    return output_camera_matrix


def crop_image_depthmap(image, depthmap, camera_intrinsics, crop_bbox):
    """
    Return a crop of the input view.
    """
    image = ImageList(image)
    l, t, r, b = crop_bbox

    image = image.crop((l, t, r, b))
    depthmap = depthmap[t:b, l:r]

    camera_intrinsics = camera_intrinsics.copy()
    camera_intrinsics[0, 2] -= l
    camera_intrinsics[1, 2] -= t

    return image.to_pil(), depthmap, camera_intrinsics


def bbox_from_intrinsics_in_out(input_camera_matrix, output_camera_matrix, output_resolution):
    out_width, out_height = output_resolution
    l, t = np.int32(np.round(input_camera_matrix[:2, 2] - output_camera_matrix[:2, 2]))
    crop_bbox = (l, t, l+out_width, t+out_height)
    return crop_bbox
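
A hedged sketch (not in the file above; sizes are illustrative) of the two-step resize-then-crop pipeline these helpers implement: a Lanczos rescale so the image just covers the target, then a centered crop with the intrinsics updated to match.

import numpy as np
import PIL.Image

img = PIL.Image.new('RGB', (640, 480))
depth = np.ones((480, 640), dtype=np.float32)
K = np.float32([(500, 0, 320), (0, 500, 240), (0, 0, 1)])

target = (224, 224)
# step 1: Lanczos rescale so that min(W, H) reaches the target resolution
img, depth, K = rescale_image_depthmap(img, depth, K, np.array(target))
# step 2: centered crop, with the principal point shifted accordingly
K2 = camera_matrix_of_crop(K, img.size, target, offset_factor=0.5)
bbox = bbox_from_intrinsics_in_out(K, K2, target)
img, depth, K2 = crop_image_depthmap(img, depth, K, bbox)
assert img.size == target and depth.shape == (224, 224)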
dust3r/datasets/utils/transforms.py
ADDED
@@ -0,0 +1,11 @@
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# DUST3R default transforms
# --------------------------------------------------------
import torchvision.transforms as tvf
from dust3r.utils.image import ImgNorm

# define the standard image transforms
ColorJitter = tvf.Compose([tvf.ColorJitter(0.5, 0.5, 0.5, 0.1), ImgNorm])
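
A tiny usage sketch (illustrative): both transforms map a PIL image to a normalized tensor; ColorJitter additionally perturbs brightness, contrast, saturation and hue before normalizing.

import PIL.Image
from dust3r.datasets.utils.transforms import ColorJitter, ImgNorm

x = ImgNorm(PIL.Image.new('RGB', (224, 224)))       # tensor in [-1, 1], shape (3, 224, 224)
y = ColorJitter(PIL.Image.new('RGB', (224, 224)))   # jittered, then normalized the same way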
dust3r/heads/__init__.py
ADDED
@@ -0,0 +1,19 @@
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# head factory
# --------------------------------------------------------
from .linear_head import LinearPts3d
from .dpt_head import create_dpt_head


def head_factory(head_type, output_mode, net, has_conf=False):
    """ build a prediction head for the decoder
    """
    if head_type == 'linear' and output_mode == 'pts3d':
        return LinearPts3d(net, has_conf)
    elif head_type == 'dpt' and output_mode == 'pts3d':
        return create_dpt_head(net, has_conf=has_conf)
    else:
        raise NotImplementedError(f"unexpected {head_type=} and {output_mode=}")
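
A minimal sketch of the dispatch (not part of the file): only the ('linear'|'dpt', 'pts3d') combinations are implemented; in real use, net is the already-built network whose dimensions the heads read.

from dust3r.heads import head_factory

try:
    head_factory(head_type='linear', output_mode='depth', net=None)  # unsupported combination
except NotImplementedError as err:
    print(err)  # unexpected head_type='linear' and output_mode='depth'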
dust3r/heads/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (618 Bytes). View file
dust3r/heads/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (621 Bytes). View file
dust3r/heads/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (621 Bytes). View file