Spaces:

zhaoyian01
/

GraCo

Running

+import numpy as np
+from copy import deepcopy
+import cv2
class Clicker(object):
    """Keeps an ordered list of clicks and can simulate the next corrective click.

    When a ground-truth mask is supplied, the clicker can compare it with a
    predicted mask and place the next click at the pixel of the largest
    erroneous region that lies farthest from that region's boundary.

    Args:
        gt_mask: Optional 2-D label array. Pixels equal to 1 are treated as
            object, pixels equal to ``ignore_label`` are excluded from error
            computation.
        init_clicks: Optional iterable of click objects added at construction.
        ignore_label: Label value in ``gt_mask`` that marks ignored pixels.
        click_indx_offset: Value added to the running click count when a
            click's ``indx`` is assigned.
    """

    def __init__(self, gt_mask=None, init_clicks=None, ignore_label=-1, click_indx_offset=0):
        self.click_indx_offset = click_indx_offset
        if gt_mask is not None:
            self.gt_mask = gt_mask == 1
            self.not_ignore_mask = gt_mask != ignore_label
        else:
            self.gt_mask = None
            # Always define the attribute so that code inspecting the clicker
            # never hits an AttributeError when no ground truth was given.
            self.not_ignore_mask = None

        self.reset_clicks()

        if init_clicks is not None:
            for click in init_clicks:
                self.add_click(click)

    def make_next_click(self, pred_mask):
        """Simulate one corrective click for ``pred_mask`` and record it.

        Requires a ground-truth mask; raises ``AssertionError`` otherwise.
        """
        assert self.gt_mask is not None, 'make_next_click requires a ground-truth mask'
        click = self._get_next_click(pred_mask)
        self.add_click(click)

    def get_clicks(self, clicks_limit=None):
        """Return the recorded clicks, truncated to the first ``clicks_limit``."""
        return self.clicks_list[:clicks_limit]

    def _get_next_click(self, pred_mask, padding=True):
        """Build the click that targets the largest current error region.

        False-negative and false-positive masks are computed against the
        ground truth (ignored pixels excluded). The Euclidean distance
        transform of each mask is taken, already-clicked pixels are zeroed
        out, and the click is placed at the first pixel attaining the larger
        of the two maxima. The click is positive when the false-negative
        maximum is strictly larger.
        """
        fn_mask = np.logical_and(np.logical_and(self.gt_mask, np.logical_not(pred_mask)), self.not_ignore_mask)
        fp_mask = np.logical_and(np.logical_and(np.logical_not(self.gt_mask), pred_mask), self.not_ignore_mask)

        if padding:
            # A one-pixel zero border makes the image edge count as region
            # boundary for the distance transform.
            fn_mask = np.pad(fn_mask, ((1, 1), (1, 1)), 'constant')
            fp_mask = np.pad(fp_mask, ((1, 1), (1, 1)), 'constant')

        # Mask size 0 is cv2.DIST_MASK_PRECISE.
        fn_mask_dt = cv2.distanceTransform(fn_mask.astype(np.uint8), cv2.DIST_L2, 0)
        fp_mask_dt = cv2.distanceTransform(fp_mask.astype(np.uint8), cv2.DIST_L2, 0)

        if padding:
            fn_mask_dt = fn_mask_dt[1:-1, 1:-1]
            fp_mask_dt = fp_mask_dt[1:-1, 1:-1]

        # Never propose a pixel that has already been clicked.
        fn_mask_dt = fn_mask_dt * self.not_clicked_map
        fp_mask_dt = fp_mask_dt * self.not_clicked_map

        fn_max_dist = np.max(fn_mask_dt)
        fp_max_dist = np.max(fp_mask_dt)

        is_positive = fn_max_dist > fp_max_dist
        if is_positive:
            coords_y, coords_x = np.where(fn_mask_dt == fn_max_dist)  # coords is [y, x]
        else:
            coords_y, coords_x = np.where(fp_mask_dt == fp_max_dist)  # coords is [y, x]

        return Click(is_positive=is_positive, coords=(coords_y[0], coords_x[0]))

    def add_click(self, click):
        """Append ``click``, assigning its ``indx`` from the running count.

        Note that ``click.indx`` is overwritten on the passed-in object.
        """
        coords = click.coords

        click.indx = self.click_indx_offset + self.num_pos_clicks + self.num_neg_clicks
        if click.is_positive:
            self.num_pos_clicks += 1
        else:
            self.num_neg_clicks += 1

        self.clicks_list.append(click)
        if self.gt_mask is not None:
            self.not_clicked_map[coords[0], coords[1]] = False

    def _remove_last_click(self):
        """Undo the most recent ``add_click``."""
        click = self.clicks_list.pop()
        coords = click.coords

        if click.is_positive:
            self.num_pos_clicks -= 1
        else:
            self.num_neg_clicks -= 1

        if self.gt_mask is not None:
            self.not_clicked_map[coords[0], coords[1]] = True

    def reset_clicks(self):
        """Forget all clicks and mark every pixel as not yet clicked."""
        if self.gt_mask is not None:
            self.not_clicked_map = np.ones_like(self.gt_mask, dtype=bool)
        else:
            # Keep the attribute defined even when there is no ground truth.
            self.not_clicked_map = None

        self.num_pos_clicks = 0
        self.num_neg_clicks = 0

        self.clicks_list = []

    def get_state(self):
        """Return a deep copy of the click list suitable for ``set_state``."""
        return deepcopy(self.clicks_list)

    def set_state(self, state):
        """Replace the current clicks with the clicks in ``state``."""
        self.reset_clicks()
        for click in state:
            self.add_click(click)

    def __len__(self):
        return len(self.clicks_list)
class Click:
    """One user click: its polarity, its coordinates and its ordinal index."""

    def __init__(self, is_positive, coords, indx=None):
        self.is_positive = is_positive
        self.coords = coords
        self.indx = indx

    @property
    def coords_and_indx(self):
        """The coordinates followed by ``indx``, flattened into one tuple."""
        return tuple(self.coords) + (self.indx,)

    def copy(self, **kwargs):
        """Return a deep copy with the given attributes overridden."""
        duplicate = deepcopy(self)
        for attr_name in kwargs:
            setattr(duplicate, attr_name, kwargs[attr_name])
        return duplicate

isegm/inference/evaluation.py ADDED Viewed

	@@ -0,0 +1,197 @@

+from time import time
+import numpy as np
+import torch
+import cv2
+from isegm.inference import utils
+from isegm.inference.clicker import Click, Clicker
+try:
+    get_ipython()
+    from tqdm import tqdm_notebook as tqdm
+except NameError:
+    from tqdm import tqdm
def evaluate_dataset(dataset, predictor, sam_type=None, oracle=False, gra_oracle=False, **kwargs):
    """Evaluate ``predictor`` on every object of every sample in ``dataset``.

    For each object either ``evaluate_sample_oracle`` (when ``gra_oracle`` is
    true) or ``evaluate_sample`` is run; extra keyword arguments are forwarded
    to it unchanged. With ``gra_oracle`` the function also counts how often
    each granularity index was selected and prints that histogram.

    Returns:
        ``(all_ious, elapsed_time)`` where ``all_ious`` holds one per-click
        IoU sequence per evaluated object and ``elapsed_time`` is the wall
        time of the whole loop in seconds.
    """
    gra_histogram = {}
    ious_per_object = []
    started_at = time()

    for index in tqdm(range(len(dataset)), leave=False):
        sample = dataset.get_sample(index)
        for object_id in sample.objects_ids:
            if not gra_oracle:
                _clicks, sample_ious, _probs = evaluate_sample(
                    sample.image, sample.gt_mask(object_id), predictor,
                    sample_id=index, sam_type=sam_type, oracle=oracle, **kwargs)
            else:
                sample_ious, gra_idx = evaluate_sample_oracle(
                    sample.image, sample.gt_mask(object_id), predictor,
                    sample_id=index, sam_type=sam_type, oracle=oracle, **kwargs)
                gra_histogram[gra_idx] = gra_histogram.get(gra_idx, 0) + 1
            ious_per_object.append(sample_ious)

    elapsed_time = time() - started_at

    if len(gra_histogram) > 0:
        print(gra_histogram)

    return ious_per_object, elapsed_time
def evaluate_sample(image, gt_mask, predictor, max_iou_thr,
                    pred_thr=0.49, min_clicks=1, max_clicks=20,
                    sample_id=None, sam_type=False, oracle=False, callback=None):
    """Simulate interactive clicks on one object and record IoU after each.

    Clicks are generated by a ``Clicker`` from the ground truth until the IoU
    reaches ``max_iou_thr`` (and at least ``min_clicks`` were placed) or
    ``max_clicks`` is exhausted.

    Args:
        image: Input image handed to ``predictor.set_input_image``.
        gt_mask: Ground-truth mask for the object.
        predictor: Predictor object. With ``sam_type == 'SAM'`` its
            ``predict`` method and ``model.mask_threshold`` are used,
            otherwise ``get_prediction``.
        max_iou_thr: IoU at which clicking stops.
        pred_thr: Threshold applied to the prediction in the non-SAM path.
        min_clicks: Minimum number of clicks before early stopping is allowed.
        max_clicks: Maximum number of clicks.
        sample_id: Passed through to ``callback``.
        sam_type: ``'SAM'`` selects the SAM prediction path.
        oracle: SAM path only - request multiple masks and keep the one with
            the highest IoU against ``gt_mask``.
        callback: Optional callable invoked after every click with
            ``(image, gt_mask, pred_probs, sample_id, click_indx, clicks_list)``.

    Returns:
        ``(clicks_list, ious, pred_probs)``: the clicks placed, a float32
        array of IoUs (one per click) and the raw output of the last
        prediction. ``pred_probs`` is ``None`` when ``max_clicks <= 0``.
    """
    clicker = Clicker(gt_mask=gt_mask)
    pred_mask = np.zeros_like(gt_mask)
    ious_list = []
    # Pre-bind so the return below is well-defined even if the loop never runs.
    pred_probs = None
    use_sam = sam_type == 'SAM'

    with torch.no_grad():
        predictor.set_input_image(image)

        for click_indx in range(max_clicks):
            clicker.make_next_click(pred_mask)

            if use_sam:
                point_coords, point_labels = get_sam_input(clicker)
                if oracle:
                    pred_probs, _, _ = predictor.predict(point_coords, point_labels,
                                                         multimask_output=True, return_logits=True)
                    pred_masks = []
                    ious = []
                    for idx in range(pred_probs.shape[0]):
                        pred_masks.append(pred_probs[idx] > predictor.model.mask_threshold)
                        ious.append(utils.get_iou(gt_mask, pred_masks[-1]))
                    # Oracle: keep the candidate mask that best matches the ground truth.
                    tgt_idx = np.argmax(np.array(ious))
                    iou = ious[tgt_idx]
                    pred_mask = pred_masks[tgt_idx]
                else:
                    pred_probs, _, _ = predictor.predict(point_coords, point_labels,
                                                         multimask_output=False, return_logits=True)
                    pred_probs = pred_probs[0]
                    pred_mask = pred_probs > predictor.model.mask_threshold
                    iou = utils.get_iou(gt_mask, pred_mask)
            else:
                pred_probs = predictor.get_prediction(clicker)
                pred_mask = pred_probs > pred_thr
                iou = utils.get_iou(gt_mask, pred_mask)

            if callback is not None:
                callback(image, gt_mask, pred_probs, sample_id, click_indx, clicker.clicks_list)

            ious_list.append(iou)
            if iou >= max_iou_thr and click_indx + 1 >= min_clicks:
                break

    return clicker.clicks_list, np.array(ious_list, dtype=np.float32), pred_probs
def evaluate_sample_oracle(image, gt_mask, predictor, max_iou_thr,
                    pred_thr=0.49, min_clicks=1, max_clicks=20,
                    sample_id=None, sam_type=False, oracle=False, callback=None):
    """Evaluate one object at granularities 0.1 ... 1.0 and keep the best run.

    For each of the ten granularity values the click simulation is restarted
    from scratch. A run stops when its IoU reaches ``max_iou_thr`` (with at
    least ``min_clicks`` clicks), when ``max_clicks`` is exhausted, or as
    soon as it has used more clicks than the best successful run so far.

    Among the runs that stopped at the smallest click index, the one with
    the highest IoU after its first click is selected.
    NOTE(review): the tie-break uses the first-click IoU (``ious[0]``), not
    the final one - confirm this is intended.

    Args:
        image, gt_mask, predictor, max_iou_thr, pred_thr, min_clicks,
        max_clicks, sample_id, callback: Same meaning as for a single-run
            click evaluation; ``pred_thr`` is used only in the non-SAM path.
        sam_type: ``'SAM_GraCo'`` selects the ``predictor.predict`` path;
            any other value uses ``predictor.get_prediction``.
        oracle: SAM path only - request multiple masks per click and keep the
            one with the highest IoU.

    Returns:
        ``(ious, tgt_idx)``: the float32 IoU array of the selected run and
        the zero-based index (0..9) of the granularity that produced it.

    Raises:
        ValueError: If ``max_clicks`` is smaller than 1.
    """
    if max_clicks < 1:
        # Without a single click there is no run to compare.
        raise ValueError('max_clicks must be at least 1, got {!r}'.format(max_clicks))

    clicker = Clicker(gt_mask=gt_mask)
    ious_lists = []
    click_indxs = []
    use_sam = sam_type == 'SAM_GraCo'

    with torch.no_grad():
        predictor.set_input_image(image)
        # Fewest clicks any granularity needed so far; infinity means that no
        # run has reached the IoU threshold yet, whatever max_clicks is.
        min_num = float('inf')

        for gra in range(1, 11):
            cur_gra = round(gra * 0.1, 1)
            ious_list = []
            clicker.reset_clicks()
            pred_mask = np.zeros_like(gt_mask)

            if not use_sam:
                # Each granularity starts from an empty previous prediction.
                predictor.prev_prediction = torch.zeros_like(predictor.original_image[:, :1, :, :])

            for click_indx in range(max_clicks):
                clicker.make_next_click(pred_mask)

                if use_sam:
                    point_coords, point_labels = get_sam_input(clicker)
                    if oracle:
                        pred_probs, _, _ = predictor.predict(point_coords, point_labels, gra=cur_gra,
                                                             multimask_output=True, return_logits=True)
                        pred_masks = []
                        mask_ious = []
                        for idx in range(pred_probs.shape[0]):
                            pred_masks.append(pred_probs[idx] > predictor.model.mask_threshold)
                            mask_ious.append(utils.get_iou(gt_mask, pred_masks[-1]))
                        best_mask = np.argmax(np.array(mask_ious))
                        iou = mask_ious[best_mask]
                        pred_mask = pred_masks[best_mask]
                    else:
                        pred_probs, _, _ = predictor.predict(point_coords, point_labels, gra=cur_gra,
                                                             multimask_output=False, return_logits=True)
                        pred_probs = pred_probs[0]
                        pred_mask = pred_probs > predictor.model.mask_threshold
                        iou = utils.get_iou(gt_mask, pred_mask)
                else:
                    pred_probs = predictor.get_prediction(clicker, gra=cur_gra)
                    pred_mask = pred_probs > pred_thr
                    iou = utils.get_iou(gt_mask, pred_mask)

                if callback is not None:
                    callback(image, gt_mask, pred_probs, sample_id, click_indx, clicker.clicks_list)

                ious_list.append(iou)
                if iou >= max_iou_thr and click_indx + 1 >= min_clicks:
                    min_num = min(min_num, click_indx + 1)
                    break
                # This run already used more clicks than the best one: give up.
                if click_indx + 1 > min_num:
                    break

            ious_lists.append(np.array(ious_list, dtype=np.float32))
            click_indxs.append(click_indx)

        click_indxs = np.array(click_indxs)
        tgt_idxs = np.squeeze(np.argwhere(click_indxs == np.min(click_indxs)), axis=1)
        selected_ious = [ious_lists[i] for i in tgt_idxs]
        max_index = np.argmax([run_ious[0] for run_ious in selected_ious])
        ious = selected_ious[max_index]
        tgt_idx = tgt_idxs[max_index]

    return ious, tgt_idx
def get_sam_input(clicker, reverse=True):
    """Convert the clicker's clicks into point-prompt arrays.

    The clicks are first laid out by ``get_points_nd`` as positive slots
    followed by the same number of negative slots; padding slots (first
    coordinate ``-1``) are dropped.

    Args:
        clicker: Object exposing ``get_clicks()``.
        reverse: When true, each stored ``(first, second)`` coordinate pair is
            emitted as ``[second, first]`` (the order used for SAM); when
            false the stored order is kept.

    Returns:
        ``(point_coords, point_labels)`` as NumPy arrays with one entry per
        real click; labels are 1 for positive and 0 for negative clicks.
    """
    clicks_list = clicker.get_clicks()
    points = get_points_nd([clicks_list])[0]
    # The first half of the slots is reserved for positive clicks.
    num_pos_slots = len(points) // 2

    point_coords = []
    point_labels = []
    for slot, point in enumerate(points):
        if point[0] == -1:
            continue  # padding entry, not a real click
        point_labels.append(1 if slot < num_pos_slots else 0)
        if reverse:
            point_coords.append([point[1], point[0]])  # for SAM
        else:
            # Previously nothing was appended here, leaving coordinates and
            # labels with different lengths.
            point_coords.append([point[0], point[1]])

    return np.array(point_coords), np.array(point_labels)
def get_points_nd(clicks_lists):
    """Lay out clicks as fixed-width rows of positive then negative points.

    Every row contains ``2 * k`` triples taken from ``click.coords_and_indx``:
    first ``k`` slots for positive clicks, then ``k`` slots for negative
    clicks, where ``k`` is the largest per-list positive or negative click
    count (at least 1). Unused slots are filled with ``(-1, -1, -1)``.

    Args:
        clicks_lists: Sequence of click lists, one per batch element.

    Returns:
        A list with one row per input click list; an empty list when
        ``clicks_lists`` is empty.
    """
    num_pos_clicks = [sum(x.is_positive for x in clicks_list) for clicks_list in clicks_lists]
    num_neg_clicks = [len(clicks_list) - num_pos
                      for clicks_list, num_pos in zip(clicks_lists, num_pos_clicks)]
    # ``default`` keeps an empty batch from raising; at least one slot per
    # polarity is always emitted.
    num_max_points = max(1, max(num_pos_clicks + num_neg_clicks, default=0))

    padding = (-1, -1, -1)
    total_clicks = []
    for clicks_list in clicks_lists:
        pos_clicks = [click.coords_and_indx for click in clicks_list if click.is_positive]
        neg_clicks = [click.coords_and_indx for click in clicks_list if not click.is_positive]
        pos_clicks = pos_clicks + (num_max_points - len(pos_clicks)) * [padding]
        neg_clicks = neg_clicks + (num_max_points - len(neg_clicks)) * [padding]
        total_clicks.append(pos_clicks + neg_clicks)

    return total_clicks

isegm/inference/predictors/__init__.py ADDED Viewed

	@@ -0,0 +1,99 @@

+from .base import BasePredictor
+from .brs import InputBRSPredictor, FeatureBRSPredictor, HRNetFeatureBRSPredictor
+from .brs_functors import InputOptimizer, ScaleBiasOptimizer
+from isegm.inference.transforms import ZoomIn
+from isegm.model.is_hrnet_model import HRNetModel
def get_predictor(net, brs_mode, device,
                  gra=None, sam_type=None,
                  prob_thresh=0.49,
                  with_flip=True,
                  zoom_in_params=dict(),
                  predictor_params=None,
                  brs_opt_func_params=None,
                  lbfgs_params=None):
    """Build the predictor that matches ``brs_mode``.

    Args:
        net: The network, or a list/tuple for multi-stage models (only
            allowed with ``brs_mode == 'NoBRS'``).
        brs_mode: ``'NoBRS'``, ``'f-BRS-A'``, ``'f-BRS-B'``, ``'f-BRS-C'``,
            ``'RGB-BRS'`` or ``'DistMap-BRS'``.
        device: Device forwarded to the predictor.
        gra, sam_type: Forwarded to ``BasePredictor`` in ``'NoBRS'`` mode
            only; the BRS branches do not receive them.
        prob_thresh: Probability threshold for the BRS optimizers.
        with_flip: Forwarded to the predictor and the BRS optimizer.
        zoom_in_params: Keyword arguments for ``ZoomIn``; ``None`` disables
            zoom-in. The default dict is only read, never mutated.
        predictor_params: Overrides for the predictor keyword arguments.
        brs_opt_func_params: Extra keyword arguments for the BRS optimizer.
        lbfgs_params: Overrides for the L-BFGS settings.

    Returns:
        The constructed predictor.

    Raises:
        NotImplementedError: If ``brs_mode`` is not one of the supported
            modes, including unknown ``f-BRS*`` variants.
        AssertionError: If a multi-stage model is combined with a BRS mode.
    """
    lbfgs_params_ = {
        'm': 20,
        'factr': 0,
        'pgtol': 1e-8,
        'maxfun': 20,
    }

    predictor_params_ = {
        'optimize_after_n_clicks': 1
    }

    if zoom_in_params is not None:
        zoom_in = ZoomIn(**zoom_in_params)
    else:
        zoom_in = None

    if lbfgs_params is not None:
        lbfgs_params_.update(lbfgs_params)
    # The iteration budget is always derived from the evaluation budget.
    lbfgs_params_['maxiter'] = 2 * lbfgs_params_['maxfun']

    if brs_opt_func_params is None:
        brs_opt_func_params = dict()

    if isinstance(net, (list, tuple)):
        assert brs_mode == 'NoBRS', "Multi-stage models support only NoBRS mode."

    if brs_mode == 'NoBRS':
        if predictor_params is not None:
            predictor_params_.update(predictor_params)
        predictor = BasePredictor(net, device, gra=gra, sam_type=sam_type, zoom_in=zoom_in,
                                  with_flip=with_flip, **predictor_params_)
    elif brs_mode.startswith('f-BRS'):
        predictor_params_.update({
            'net_clicks_limit': 8,
        })
        if predictor_params is not None:
            predictor_params_.update(predictor_params)

        insertion_modes = {
            'f-BRS-A': 'after_c4',
            'f-BRS-B': 'after_aspp',
            'f-BRS-C': 'after_deeplab'
        }
        if brs_mode not in insertion_modes:
            # Report unknown variants the same way as any other unknown mode
            # instead of leaking a bare KeyError.
            raise NotImplementedError('Unsupported f-BRS variant: {!r}'.format(brs_mode))
        insertion_mode = insertion_modes[brs_mode]

        opt_functor = ScaleBiasOptimizer(prob_thresh=prob_thresh,
                                         with_flip=with_flip,
                                         optimizer_params=lbfgs_params_,
                                         **brs_opt_func_params)

        if isinstance(net, HRNetModel):
            FeaturePredictor = HRNetFeatureBRSPredictor
            # The HRNet predictor uses its own insertion-mode names.
            insertion_mode = {'after_c4': 'A', 'after_aspp': 'A', 'after_deeplab': 'C'}[insertion_mode]
        else:
            FeaturePredictor = FeatureBRSPredictor

        predictor = FeaturePredictor(net, device,
                                     opt_functor=opt_functor,
                                     with_flip=with_flip,
                                     insertion_mode=insertion_mode,
                                     zoom_in=zoom_in,
                                     **predictor_params_)
    elif brs_mode == 'RGB-BRS' or brs_mode == 'DistMap-BRS':
        use_dmaps = brs_mode == 'DistMap-BRS'

        predictor_params_.update({
            'net_clicks_limit': 5,
        })
        if predictor_params is not None:
            predictor_params_.update(predictor_params)

        opt_functor = InputOptimizer(prob_thresh=prob_thresh,
                                     with_flip=with_flip,
                                     optimizer_params=lbfgs_params_,
                                     **brs_opt_func_params)

        predictor = InputBRSPredictor(net, device,
                                      optimize_target='dmaps' if use_dmaps else 'rgb',
                                      opt_functor=opt_functor,
                                      with_flip=with_flip,
                                      zoom_in=zoom_in,
                                      **predictor_params_)
    else:
        raise NotImplementedError('Unsupported BRS mode: {!r}'.format(brs_mode))

    return predictor

isegm/inference/predictors/base.py ADDED Viewed

	@@ -0,0 +1,191 @@

+import torch
+import torch.nn.functional as F
+import numpy as np
+from torchvision import transforms
+from isegm.inference.transforms import AddHorizontalFlip, SigmoidForPred, LimitLongestSide
class BasePredictor(object):
    """Runs a click-based segmentation network on one image.

    The predictor stores the current image and the previous prediction,
    applies a chain of transforms before the network, inverts them afterwards
    and returns the prediction as a NumPy array.

    Args:
        model: The network, or a two-element list/tuple
            ``(net, click_models)`` where ``click_models`` is a sequence of
            networks indexed by the number of clicks.
        device: Device on which tensors are created.
        gra: Default granularity passed to the network; values that are
            ``None`` or not positive disable it.
        sam_type: When not ``None`` the previous mask is always concatenated
            to the input; ``'SAM'`` additionally selects the SAM input format.
        net_clicks_limit: Maximum number of clicks fed to the network
            (``None`` for no limit).
        with_flip: Append the horizontal-flip transform.
        zoom_in: Optional zoom-in transform placed first in the chain.
        max_size: When given, append ``LimitLongestSide(max_size)``.
        **kwargs: Accepted and ignored.
    """

    def __init__(self, model, device, gra=None, sam_type=None,
                 net_clicks_limit=None,
                 with_flip=False,
                 zoom_in=None,
                 max_size=None,
                 **kwargs):
        self.with_flip = with_flip
        self.net_clicks_limit = net_clicks_limit
        self.original_image = None
        self.device = device
        self.gra = gra if gra is not None and gra > 0 else None
        self.sam_type = sam_type
        self.zoom_in = zoom_in
        self.prev_prediction = None
        self.model_indx = 0
        self.click_models = None
        self.net_state_dict = None

        # Accept lists as well as tuples for multi-stage models.
        if isinstance(model, (list, tuple)):
            self.net, self.click_models = model
        else:
            self.net = model

        self.to_tensor = transforms.ToTensor()

        self.transforms = [zoom_in] if zoom_in is not None else []
        if max_size is not None:
            self.transforms.append(LimitLongestSide(max_size=max_size))
        self.transforms.append(SigmoidForPred())
        if with_flip:
            self.transforms.append(AddHorizontalFlip())

    def set_input_image(self, image):
        """Store ``image`` as a batched tensor and reset per-image state."""
        if not isinstance(image, torch.Tensor):
            image_nd = self.to_tensor(image)
        else:
            image_nd = image

        for transform in self.transforms:
            transform.reset()

        self.original_image = image_nd.to(self.device)
        if len(self.original_image.shape) == 3:
            self.original_image = self.original_image.unsqueeze(0)
        # Start from an all-zero single-channel previous prediction.
        self.prev_prediction = torch.zeros_like(self.original_image[:, :1, :, :])

    def get_prediction(self, clicker, prev_mask=None, gra=None):
        """Predict for the clicker's current clicks.

        Args:
            clicker: Object exposing ``get_clicks()`` and ``click_indx_offset``.
            prev_mask: Previous mask to feed to the network; defaults to the
                stored previous prediction.
            gra: Granularity for this call; falls back to ``self.gra``.

        Returns:
            The prediction for the first batch element and first channel as
            a NumPy array.
        """
        clicks_list = clicker.get_clicks()

        if self.click_models is not None:
            model_indx = min(clicker.click_indx_offset + len(clicks_list), len(self.click_models)) - 1
            if model_indx != self.model_indx:
                self.model_indx = model_indx
                self.net = self.click_models[model_indx]

        input_image = self.original_image
        if prev_mask is None:
            prev_mask = self.prev_prediction
        if getattr(self.net, 'with_prev_mask', False) or self.sam_type is not None:
            input_image = torch.cat((input_image, prev_mask), dim=1)

        image_nd, clicks_lists, is_image_changed = self.apply_transforms(
            input_image, [clicks_list]
        )

        pred_logits = self._get_prediction(image_nd, clicks_lists, is_image_changed, gra=gra)
        prediction = F.interpolate(pred_logits, mode='bilinear', align_corners=True,
                                   size=image_nd.size()[2:])

        for t in reversed(self.transforms):
            prediction = t.inv_transform(prediction)

        if self.zoom_in is not None and self.zoom_in.check_possible_recalculation():
            # Redo the prediction with the same arguments; dropping them here
            # would silently replace the caller's granularity and mask.
            return self.get_prediction(clicker, prev_mask=prev_mask, gra=gra)

        self.prev_prediction = prediction
        return prediction.cpu().numpy()[0, 0]

    def _get_prediction(self, image_nd, clicks_lists, is_image_changed, gra=None):
        """Run the network on transformed inputs and return its raw output."""
        points_nd = self.get_points_nd(clicks_lists)
        if gra is None:
            gra = self.gra

        if self.sam_type == 'SAM':
            batched_input = self.get_sam_batched_input(image_nd, points_nd)
            batched_output = self.net(batched_input, multimask_output=False, return_logits=True)
            return torch.cat([batch['masks'] for batch in batched_output], dim=0)

        if gra is not None:
            return self.net(image_nd, points_nd, torch.Tensor([gra]).to(self.device))['instances']
        return self.net(image_nd, points_nd)['instances']

    def _batch_infer(self, batch_image_tensor, batch_clickers, prev_mask=None):
        """Predict for a batch of images, one clicker per image.

        Returns the first channel of the prediction for every batch element
        as a NumPy array.
        """
        if prev_mask is None:
            prev_mask = self.prev_prediction

        # Default to the plain image so networks without a previous-mask
        # input do not hit an unbound variable.
        input_image = batch_image_tensor
        if getattr(self.net, 'with_prev_mask', False):
            input_image = torch.cat((batch_image_tensor, prev_mask), dim=1)

        clicks_lists = [clicker.get_clicks() for clicker in batch_clickers]
        image_nd, clicks_lists, is_image_changed = self.apply_transforms(
            input_image, clicks_lists
        )
        points_nd = self.get_points_nd(clicks_lists)
        pred_logits = self.net(image_nd, points_nd)['instances']
        prediction = F.interpolate(pred_logits, mode='bilinear', align_corners=True,
                                   size=image_nd.size()[2:])

        for t in reversed(self.transforms):
            prediction = t.inv_transform(prediction)

        self.prev_prediction = prediction
        return prediction.cpu().numpy()[:, 0]

    def _get_transform_states(self):
        return [x.get_state() for x in self.transforms]

    def _set_transform_states(self, states):
        assert len(states) == len(self.transforms)
        for state, transform in zip(states, self.transforms):
            transform.set_state(state)

    def apply_transforms(self, image_nd, clicks_lists):
        """Apply every transform in order.

        Returns ``(image_nd, clicks_lists, is_image_changed)`` where the flag
        is true if any transform reported ``image_changed``.
        """
        is_image_changed = False
        for t in self.transforms:
            image_nd, clicks_lists = t.transform(image_nd, clicks_lists)
            is_image_changed |= t.image_changed

        return image_nd, clicks_lists, is_image_changed

    def get_points_nd(self, clicks_lists):
        """Pack clicks into a tensor of positive slots followed by negative slots.

        Each row holds ``2 * k`` triples from ``click.coords_and_indx``; ``k``
        is the largest per-list positive or negative count, capped by
        ``net_clicks_limit`` and at least 1. Unused slots are ``(-1, -1, -1)``.
        """
        total_clicks = []
        num_pos_clicks = [sum(x.is_positive for x in clicks_list) for clicks_list in clicks_lists]
        num_neg_clicks = [len(clicks_list) - num_pos for clicks_list, num_pos in zip(clicks_lists, num_pos_clicks)]
        # ``default`` keeps an empty batch from raising.
        num_max_points = max(num_pos_clicks + num_neg_clicks, default=0)
        if self.net_clicks_limit is not None:
            num_max_points = min(self.net_clicks_limit, num_max_points)
        num_max_points = max(1, num_max_points)

        for clicks_list in clicks_lists:
            clicks_list = clicks_list[:self.net_clicks_limit]
            pos_clicks = [click.coords_and_indx for click in clicks_list if click.is_positive]
            pos_clicks = pos_clicks + (num_max_points - len(pos_clicks)) * [(-1, -1, -1)]

            neg_clicks = [click.coords_and_indx for click in clicks_list if not click.is_positive]
            neg_clicks = neg_clicks + (num_max_points - len(neg_clicks)) * [(-1, -1, -1)]
            total_clicks.append(pos_clicks + neg_clicks)

        return torch.tensor(total_clicks, device=self.device)

    def get_sam_batched_input(self, image_nd, points_nd):
        """Build one SAM-style input dict per batch element.

        The first three channels of each image are used as the image and the
        fourth channel as the mask input. Padding points (first coordinate
        ``-1``) are dropped and the two coordinates of each point are swapped.
        """
        batched_output = []
        for batch_idx in range(image_nd.shape[0]):
            image = image_nd[batch_idx]
            points_np = points_nd[batch_idx].cpu().numpy()
            # The first half of the slots is reserved for positive clicks.
            num_pos_slots = points_np.shape[0] // 2
            point_coords = []
            point_labels = []
            for slot, point_np in enumerate(points_np):
                if point_np[0] == -1:
                    continue
                point_labels.append(1 if slot < num_pos_slots else 0)
                point_coords.append([point_np[1], point_np[0]])
            res = {
                'image': image[:3, :, :],
                'point_coords': torch.as_tensor(np.array(point_coords), dtype=torch.float, device=self.device)[None, :],
                'point_labels': torch.as_tensor(np.array(point_labels), dtype=torch.float, device=self.device)[None, :],
                'original_size': tuple(image.shape[1:]),
                'mask_inputs': image[3, :, :][None, None, :]
            }
            batched_output.append(res)
        return batched_output

    def get_states(self):
        """Snapshot the transform states and the previous prediction."""
        prev_prediction = self.prev_prediction
        return {
            'transform_states': self._get_transform_states(),
            # No image may have been set yet, in which case there is nothing to clone.
            'prev_prediction': None if prev_prediction is None else prev_prediction.clone()
        }

    def set_states(self, states):
        """Restore a snapshot produced by ``get_states``."""
        self._set_transform_states(states['transform_states'])
        self.prev_prediction = states['prev_prediction']

isegm/inference/predictors/brs.py ADDED Viewed

	@@ -0,0 +1,307 @@

+import torch
+import torch.nn.functional as F
+import numpy as np
+from scipy.optimize import fmin_l_bfgs_b
+from .base import BasePredictor
class BRSBasePredictor(BasePredictor):
    """Common base for predictors that refine predictions by optimization.

    Args:
        model: Network forwarded to the parent constructor.
        device: Device forwarded to the parent constructor.
        opt_functor: Objective object used by the optimizer.
        optimize_after_n_clicks: Stored threshold on the number of clicks
            above which subclasses run the optimization.
        **kwargs: Forwarded to the parent constructor.
    """

    def __init__(self, model, device, opt_functor, optimize_after_n_clicks=1, **kwargs):
        super().__init__(model, device, **kwargs)
        self.optimize_after_n_clicks = optimize_after_n_clicks
        self.opt_functor = opt_functor

        self.opt_data = None
        self.input_data = None

    def set_input_image(self, image):
        """Set a new image and discard cached optimization data."""
        super().set_input_image(image)
        self.opt_data = None
        self.input_data = None

    def _get_clicks_maps_nd(self, clicks_lists, image_shape, radius=1):
        """Rasterize clicks into positive and negative float32 maps.

        Every click marks a ``(2 * radius + 1)`` square centred on its rounded
        ``(y, x)`` coordinates, clipped to the image.

        Args:
            clicks_lists: One list of clicks per batch element.
            image_shape: Tuple ``(height, width)`` of the maps.
            radius: Half-size of the marked square.

        Returns:
            ``(pos_clicks_map, neg_clicks_map)`` tensors of shape
            ``(len(clicks_lists), 1) + image_shape`` on ``self.device``.
        """
        pos_clicks_map = np.zeros((len(clicks_lists), 1) + image_shape, dtype=np.float32)
        neg_clicks_map = np.zeros((len(clicks_lists), 1) + image_shape, dtype=np.float32)

        for list_indx, clicks_list in enumerate(clicks_lists):
            for click in clicks_list:
                y, x = click.coords
                y, x = int(round(y)), int(round(x))
                # Clamp at zero: a negative slice bound would be interpreted
                # relative to the end of the axis and mark the wrong pixels
                # (or none) for clicks near the top/left border.
                y1, x1 = max(y - radius, 0), max(x - radius, 0)
                y2, x2 = max(y + radius + 1, 0), max(x + radius + 1, 0)

                if click.is_positive:
                    pos_clicks_map[list_indx, 0, y1:y2, x1:x2] = True
                else:
                    neg_clicks_map[list_indx, 0, y1:y2, x1:x2] = True

        with torch.no_grad():
            pos_clicks_map = torch.from_numpy(pos_clicks_map).to(self.device)
            neg_clicks_map = torch.from_numpy(neg_clicks_map).to(self.device)

        return pos_clicks_map, neg_clicks_map

    def get_states(self):
        """Snapshot the transform states and the optimization variables."""
        return {'transform_states': self._get_transform_states(), 'opt_data': self.opt_data}

    def set_states(self, states):
        """Restore a snapshot produced by ``get_states``."""
        self._set_transform_states(states['transform_states'])
        self.opt_data = states['opt_data']
class FeatureBRSPredictor(BRSBasePredictor):
    """Feature-level refinement: optimizes a per-channel scale and bias.

    The scale/bias pair is applied to intermediate features at the point
    chosen by ``insertion_mode`` (``'after_c4'``, ``'after_aspp'`` or
    ``'after_deeplab'``) and optimized with L-BFGS-B.

    Raises:
        NotImplementedError: If ``insertion_mode`` is not one of the three
            supported values.
    """

    def __init__(self, model, device, opt_functor, insertion_mode='after_deeplab', **kwargs):
        super().__init__(model, device, opt_functor=opt_functor, **kwargs)
        self.insertion_mode = insertion_mode
        self._c1_features = None

        if self.insertion_mode == 'after_deeplab':
            self.num_channels = model.feature_extractor.ch
        elif self.insertion_mode == 'after_c4':
            self.num_channels = model.feature_extractor.aspp_in_channels
        elif self.insertion_mode == 'after_aspp':
            self.num_channels = model.feature_extractor.ch + 32
        else:
            raise NotImplementedError

    def _get_prediction(self, image_nd, clicks_lists, is_image_changed, gra=None):
        """Return prediction logits, optimizing scale/bias when enough clicks exist.

        ``gra`` is accepted because the caller always passes it as a keyword;
        this predictor does not use it.
        """
        points_nd = self.get_points_nd(clicks_lists)
        pos_mask, neg_mask = self._get_clicks_maps_nd(clicks_lists, image_nd.shape[2:])

        num_clicks = len(clicks_lists[0])
        # With flipping, the batch holds each image twice.
        bs = image_nd.shape[0] // 2 if self.with_flip else image_nd.shape[0]

        if self.opt_data is None or self.opt_data.shape[0] // (2 * self.num_channels) != bs:
            self.opt_data = np.zeros((bs * 2 * self.num_channels), dtype=np.float32)

        # While the network still receives every click (or there is no limit
        # at all), the cached features must be refreshed.
        within_limit = self.net_clicks_limit is None or num_clicks <= self.net_clicks_limit
        if within_limit or is_image_changed or self.input_data is None:
            self.input_data = self._get_head_input(image_nd, points_nd)

        def get_prediction_logits(scale, bias):
            scale = scale.view(bs, -1, 1, 1)
            bias = bias.view(bs, -1, 1, 1)
            if self.with_flip:
                scale = scale.repeat(2, 1, 1, 1)
                bias = bias.repeat(2, 1, 1, 1)

            scaled_backbone_features = self.input_data * scale
            scaled_backbone_features = scaled_backbone_features + bias
            if self.insertion_mode == 'after_c4':
                x = self.net.feature_extractor.aspp(scaled_backbone_features)
                x = F.interpolate(x, mode='bilinear', size=self._c1_features.size()[2:],
                                  align_corners=True)
                x = torch.cat((x, self._c1_features), dim=1)
                scaled_backbone_features = self.net.feature_extractor.head(x)
            elif self.insertion_mode == 'after_aspp':
                scaled_backbone_features = self.net.feature_extractor.head(scaled_backbone_features)

            pred_logits = self.net.head(scaled_backbone_features)
            pred_logits = F.interpolate(pred_logits, size=image_nd.size()[2:], mode='bilinear',
                                        align_corners=True)
            return pred_logits

        self.opt_functor.init_click(get_prediction_logits, pos_mask, neg_mask, self.device)
        if num_clicks > self.optimize_after_n_clicks:
            opt_result = fmin_l_bfgs_b(func=self.opt_functor, x0=self.opt_data,
                                       **self.opt_functor.optimizer_params)
            self.opt_data = opt_result[0]

        with torch.no_grad():
            if self.opt_functor.best_prediction is not None:
                opt_pred_logits = self.opt_functor.best_prediction
            else:
                opt_data_nd = torch.from_numpy(self.opt_data).to(self.device)
                opt_vars, _ = self.opt_functor.unpack_opt_params(opt_data_nd)
                opt_pred_logits = get_prediction_logits(*opt_vars)

        return opt_pred_logits

    def _get_head_input(self, image_nd, points):
        """Compute the features to which the scale and bias are applied."""
        with torch.no_grad():
            image_nd, prev_mask = self.net.prepare_input(image_nd)
            coord_features = self.net.get_coord_features(image_nd, prev_mask, points)

            if self.net.rgb_conv is not None:
                x = self.net.rgb_conv(torch.cat((image_nd, coord_features), dim=1))
                additional_features = None
            elif hasattr(self.net, 'maps_transform'):
                x = image_nd
                additional_features = self.net.maps_transform(coord_features)

            if self.insertion_mode == 'after_c4' or self.insertion_mode == 'after_aspp':
                c1, _, c3, c4 = self.net.feature_extractor.backbone(x, additional_features)
                c1 = self.net.feature_extractor.skip_project(c1)

                if self.insertion_mode == 'after_aspp':
                    x = self.net.feature_extractor.aspp(c4)
                    x = F.interpolate(x, size=c1.size()[2:], mode='bilinear', align_corners=True)
                    x = torch.cat((x, c1), dim=1)
                    backbone_features = x
                else:
                    backbone_features = c4
                    # Kept for the head, which concatenates it after ASPP.
                    self._c1_features = c1
            else:
                backbone_features = self.net.feature_extractor(x, additional_features)[0]

        return backbone_features
+class HRNetFeatureBRSPredictor(BRSBasePredictor):
+    """Feature-level BRS predictor for a network whose ``feature_extractor`` exposes HRNet/OCR heads.
+    A per-channel scale and bias applied to cached intermediate features are
+    optimised with L-BFGS-B. ``insertion_mode`` selects which features are cached:
+    ``'A'`` uses the output of ``compute_hrnet_feats``; ``'C'`` uses the output of
+    ``ocr_distri_head``. Any other value raises ``NotImplementedError``.
+    """
+    def __init__(self, model, device, opt_functor, insertion_mode='A', **kwargs):
+        super().__init__(model, device, opt_functor=opt_functor, **kwargs)
+        self.insertion_mode = insertion_mode
+        self._c1_features = None
+        # Channel count of the features being modulated; it fixes the length of
+        # the flat parameter vector built in _get_prediction.
+        if self.insertion_mode == 'A':
+            self.num_channels = sum(k * model.feature_extractor.width for k in [1, 2, 4, 8])
+        elif self.insertion_mode == 'C':
+            self.num_channels = 2 * model.feature_extractor.ocr_width
+        else:
+            raise NotImplementedError
+    def _get_prediction(self, image_nd, clicks_lists, is_image_changed):
+        """Return prediction logits at the input resolution, refined by the optimiser when enabled."""
+        points_nd = self.get_points_nd(clicks_lists)
+        pos_mask, neg_mask = self._get_clicks_maps_nd(clicks_lists, image_nd.shape[2:])
+        num_clicks = len(clicks_lists[0])
+        # With flip augmentation the second half of the batch holds the mirrored
+        # copies, so only half of it carries independent parameters.
+        bs = image_nd.shape[0] // 2 if self.with_flip else image_nd.shape[0]
+        # (Re)allocate the flat parameter vector when missing or when the batch size changed.
+        if self.opt_data is None or self.opt_data.shape[0] // (2 * self.num_channels) != bs:
+            self.opt_data = np.zeros((bs * 2 * self.num_channels), dtype=np.float32)
+        # Recompute the cached features while the click count is within
+        # net_clicks_limit, when the image changed, or when nothing is cached yet.
+        if num_clicks <= self.net_clicks_limit or is_image_changed or self.input_data is None:
+            self.input_data = self._get_head_input(image_nd, points_nd)
+        def get_prediction_logits(scale, bias):
+            # Broadcast one scale/bias value per channel over the spatial dims.
+            scale = scale.view(bs, -1, 1, 1)
+            bias = bias.view(bs, -1, 1, 1)
+            if self.with_flip:
+                scale = scale.repeat(2, 1, 1, 1)
+                bias = bias.repeat(2, 1, 1, 1)
+            scaled_backbone_features = self.input_data * scale
+            scaled_backbone_features = scaled_backbone_features + bias
+            if self.insertion_mode == 'A':
+                # Mode 'A' modulates the HRNet features, so the OCR heads (when
+                # ocr_width > 0) still have to run on the modulated features.
+                if self.net.feature_extractor.ocr_width > 0:
+                    out_aux = self.net.feature_extractor.aux_head(scaled_backbone_features)
+                    feats = self.net.feature_extractor.conv3x3_ocr(scaled_backbone_features)
+                    context = self.net.feature_extractor.ocr_gather_head(feats, out_aux)
+                    feats = self.net.feature_extractor.ocr_distri_head(feats, context)
+                else:
+                    feats = scaled_backbone_features
+                pred_logits = self.net.feature_extractor.cls_head(feats)
+            elif self.insertion_mode == 'C':
+                # Mode 'C' caches the ocr_distri_head output, so only cls_head remains.
+                pred_logits = self.net.feature_extractor.cls_head(scaled_backbone_features)
+            else:
+                raise NotImplementedError
+            pred_logits = F.interpolate(pred_logits, size=image_nd.size()[2:], mode='bilinear',
+                                        align_corners=True)
+            return pred_logits
+        self.opt_functor.init_click(get_prediction_logits, pos_mask, neg_mask, self.device)
+        # Optimisation starts only once more than optimize_after_n_clicks clicks exist.
+        if num_clicks > self.optimize_after_n_clicks:
+            opt_result = fmin_l_bfgs_b(func=self.opt_functor, x0=self.opt_data,
+                                       **self.opt_functor.optimizer_params)
+            self.opt_data = opt_result[0]
+        with torch.no_grad():
+            # Prefer the prediction recorded by the functor during optimisation;
+            # otherwise evaluate the network with the current parameters.
+            if self.opt_functor.best_prediction is not None:
+                opt_pred_logits = self.opt_functor.best_prediction
+            else:
+                opt_data_nd = torch.from_numpy(self.opt_data).to(self.device)
+                opt_vars, _ = self.opt_functor.unpack_opt_params(opt_data_nd)
+                opt_pred_logits = get_prediction_logits(*opt_vars)
+        return opt_pred_logits
+    def _get_head_input(self, image_nd, points):
+        """Compute, without gradients, the features that get_prediction_logits modulates."""
+        with torch.no_grad():
+            image_nd, prev_mask = self.net.prepare_input(image_nd)
+            coord_features = self.net.get_coord_features(image_nd, prev_mask, points)
+            # NOTE(review): if rgb_conv is None and the net has no 'maps_transform'
+            # attribute, x and additional_features stay unbound and the next call fails.
+            if self.net.rgb_conv is not None:
+                x = self.net.rgb_conv(torch.cat((image_nd, coord_features), dim=1))
+                additional_features = None
+            elif hasattr(self.net, 'maps_transform'):
+                x = image_nd
+                additional_features = self.net.maps_transform(coord_features)
+            feats = self.net.feature_extractor.compute_hrnet_feats(x, additional_features)
+            if self.insertion_mode == 'A':
+                backbone_features = feats
+            elif self.insertion_mode == 'C':
+                out_aux = self.net.feature_extractor.aux_head(feats)
+                feats = self.net.feature_extractor.conv3x3_ocr(feats)
+                context = self.net.feature_extractor.ocr_gather_head(feats, out_aux)
+                backbone_features = self.net.feature_extractor.ocr_distri_head(feats, context)
+            else:
+                raise NotImplementedError
+        return backbone_features
+class InputBRSPredictor(BRSBasePredictor):
+    """BRS predictor that optimises an additive bias on the network input.
+    ``optimize_target`` selects what the bias is added to: ``'rgb'`` (the
+    prepared image), ``'dmaps'`` (the coordinate feature maps) or ``'all'``
+    (the output of ``rgb_conv``, only on the ``rgb_conv`` path).
+    """
+    def __init__(self, model, device, opt_functor, optimize_target='rgb', **kwargs):
+        super().__init__(model, device, opt_functor=opt_functor, **kwargs)
+        self.optimize_target = optimize_target
+    def _get_prediction(self, image_nd, clicks_lists, is_image_changed):
+        """Return prediction logits at the input resolution, refined by the optimiser when enabled."""
+        points_nd = self.get_points_nd(clicks_lists)
+        pos_mask, neg_mask = self._get_clicks_maps_nd(clicks_lists, image_nd.shape[2:])
+        num_clicks = len(clicks_lists[0])
+        # Start from a zero bias for every new image (or on first use).
+        if self.opt_data is None or is_image_changed:
+            if self.optimize_target == 'dmaps':
+                # When the net uses a previous mask, one coordinate channel is
+                # excluded from optimisation (see the [:, 1:] slice below).
+                opt_channels = self.net.coord_feature_ch - 1 if self.net.with_prev_mask else self.net.coord_feature_ch
+            else:
+                opt_channels = 3
+            # With flip augmentation only the first half of the batch is independent.
+            bs = image_nd.shape[0] // 2 if self.with_flip else image_nd.shape[0]
+            self.opt_data = torch.zeros((bs, opt_channels, image_nd.shape[2], image_nd.shape[3]),
+                                        device=self.device, dtype=torch.float32)
+        def get_prediction_logits(opt_bias):
+            input_image, prev_mask = self.net.prepare_input(image_nd)
+            dmaps = self.net.get_coord_features(input_image, prev_mask, points_nd)
+            if self.optimize_target == 'rgb':
+                input_image = input_image + opt_bias
+            elif self.optimize_target == 'dmaps':
+                if self.net.with_prev_mask:
+                    # Channel 0 is left untouched; presumably it holds the previous mask - TODO confirm.
+                    dmaps[:, 1:, :, :] = dmaps[:, 1:, :, :] + opt_bias
+                else:
+                    dmaps = dmaps + opt_bias
+            if self.net.rgb_conv is not None:
+                x = self.net.rgb_conv(torch.cat((input_image, dmaps), dim=1))
+                if self.optimize_target == 'all':
+                    x = x + opt_bias
+                coord_features = None
+            elif hasattr(self.net, 'maps_transform'):
+                x = input_image
+                coord_features = self.net.maps_transform(dmaps)
+            pred_logits = self.net.backbone_forward(x, coord_features=coord_features)['instances']
+            pred_logits = F.interpolate(pred_logits, size=image_nd.size()[2:], mode='bilinear', align_corners=True)
+            return pred_logits
+        self.opt_functor.init_click(get_prediction_logits, pos_mask, neg_mask, self.device,
+                                    shape=self.opt_data.shape)
+        # Optimisation starts only once more than optimize_after_n_clicks clicks exist.
+        if num_clicks > self.optimize_after_n_clicks:
+            # fmin_l_bfgs_b works on a flat NumPy vector; the result is reshaped
+            # back to the bias tensor's shape and moved to the device.
+            opt_result = fmin_l_bfgs_b(func=self.opt_functor, x0=self.opt_data.cpu().numpy().ravel(),
+                                       **self.opt_functor.optimizer_params)
+            self.opt_data = torch.from_numpy(opt_result[0]).view(self.opt_data.shape).to(self.device)
+        with torch.no_grad():
+            # Prefer the prediction recorded by the functor during optimisation;
+            # otherwise evaluate the network with the current bias.
+            if self.opt_functor.best_prediction is not None:
+                opt_pred_logits = self.opt_functor.best_prediction
+            else:
+                opt_vars, _ = self.opt_functor.unpack_opt_params(self.opt_data)
+                opt_pred_logits = get_prediction_logits(*opt_vars)
+        return opt_pred_logits

isegm/inference/predictors/brs_functors.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import torch
+import numpy as np
+from isegm.model.metrics import _compute_iou
+from .brs_losses import BRSMaskLoss
+class BaseOptimizer:
+    def __init__(self, optimizer_params,
+                 prob_thresh=0.49,
+                 reg_weight=1e-3,
+                 min_iou_diff=0.01,
+                 brs_loss=BRSMaskLoss(),
+                 with_flip=False,
+                 flip_average=False,
+                 **kwargs):
+        self.brs_loss = brs_loss
+        self.optimizer_params = optimizer_params
+        self.prob_thresh = prob_thresh
+        self.reg_weight = reg_weight
+        self.min_iou_diff = min_iou_diff
+        self.with_flip = with_flip
+        self.flip_average = flip_average
+        self.best_prediction = None
+        self._get_prediction_logits = None
+        self._opt_shape = None
+        self._best_loss = None
+        self._click_masks = None
+        self._last_mask = None
+        self.device = None
+    def init_click(self, get_prediction_logits, pos_mask, neg_mask, device, shape=None):
+        self.best_prediction = None
+        self._get_prediction_logits = get_prediction_logits
+        self._click_masks = (pos_mask, neg_mask)
+        self._opt_shape = shape
+        self._last_mask = None
+        self.device = device
+    def __call__(self, x):
+        opt_params = torch.from_numpy(x).float().to(self.device)
+        opt_params.requires_grad_(True)
+        with torch.enable_grad():
+            opt_vars, reg_loss = self.unpack_opt_params(opt_params)
+            result_before_sigmoid = self._get_prediction_logits(*opt_vars)
+            result = torch.sigmoid(result_before_sigmoid)
+            pos_mask, neg_mask = self._click_masks
+            if self.with_flip and self.flip_average:
+                result, result_flipped = torch.chunk(result, 2, dim=0)
+                result = 0.5 * (result + torch.flip(result_flipped, dims=[3]))
+                pos_mask, neg_mask = pos_mask[:result.shape[0]], neg_mask[:result.shape[0]]
+            loss, f_max_pos, f_max_neg = self.brs_loss(result, pos_mask, neg_mask)
+            loss = loss + reg_loss
+        f_val = loss.detach().cpu().numpy()
+        if self.best_prediction is None or f_val < self._best_loss:
+            self.best_prediction = result_before_sigmoid.detach()
+            self._best_loss = f_val
+        if f_max_pos < (1 - self.prob_thresh) and f_max_neg < self.prob_thresh:
+            return [f_val, np.zeros_like(x)]
+        current_mask = result > self.prob_thresh
+        if self._last_mask is not None and self.min_iou_diff > 0:
+            diff_iou = _compute_iou(current_mask, self._last_mask)
+            if len(diff_iou) > 0 and diff_iou.mean() > 1 - self.min_iou_diff:
+                return [f_val, np.zeros_like(x)]
+        self._last_mask = current_mask
+        loss.backward()
+        f_grad = opt_params.grad.cpu().numpy().ravel().astype(np.float)
+        return [f_val, f_grad]
+    def unpack_opt_params(self, opt_params):
+        raise NotImplementedError
+class InputOptimizer(BaseOptimizer):
+    def unpack_opt_params(self, opt_params):
+        opt_params = opt_params.view(self._opt_shape)
+        if self.with_flip:
+            opt_params_flipped = torch.flip(opt_params, dims=[3])
+            opt_params = torch.cat([opt_params, opt_params_flipped], dim=0)
+        reg_loss = self.reg_weight * torch.sum(opt_params**2)
+        return (opt_params,), reg_loss
+class ScaleBiasOptimizer(BaseOptimizer):
+    def __init__(self, *args, scale_act=None, reg_bias_weight=10.0, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.scale_act = scale_act
+        self.reg_bias_weight = reg_bias_weight
+    def unpack_opt_params(self, opt_params):
+        scale, bias = torch.chunk(opt_params, 2, dim=0)
+        reg_loss = self.reg_weight * (torch.sum(scale**2) + self.reg_bias_weight * torch.sum(bias**2))
+        if self.scale_act == 'tanh':
+            scale = torch.tanh(scale)
+        elif self.scale_act == 'sin':
+            scale = torch.sin(scale)
+        return (1 + scale, bias), reg_loss

isegm/inference/predictors/brs_losses.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import torch
+from isegm.model.losses import SigmoidBinaryCrossEntropyLoss
+class BRSMaskLoss(torch.nn.Module):
+    def __init__(self, eps=1e-5):
+        super().__init__()
+        self._eps = eps
+    def forward(self, result, pos_mask, neg_mask):
+        pos_diff = (1 - result) * pos_mask
+        pos_target = torch.sum(pos_diff ** 2)
+        pos_target = pos_target / (torch.sum(pos_mask) + self._eps)
+        neg_diff = result * neg_mask
+        neg_target = torch.sum(neg_diff ** 2)
+        neg_target = neg_target / (torch.sum(neg_mask) + self._eps)
+        loss = pos_target + neg_target
+        with torch.no_grad():
+            f_max_pos = torch.max(torch.abs(pos_diff)).item()
+            f_max_neg = torch.max(torch.abs(neg_diff)).item()
+        return loss, f_max_pos, f_max_neg
+class OracleMaskLoss(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.gt_mask = None
+        self.loss = SigmoidBinaryCrossEntropyLoss(from_sigmoid=True)
+        self.predictor = None
+        self.history = []
+    def set_gt_mask(self, gt_mask):
+        self.gt_mask = gt_mask
+        self.history = []
+    def forward(self, result, pos_mask, neg_mask):
+        gt_mask = self.gt_mask.to(result.device)
+        if self.predictor.object_roi is not None:
+            r1, r2, c1, c2 = self.predictor.object_roi[:4]
+            gt_mask = gt_mask[:, :, r1:r2 + 1, c1:c2 + 1]
+            gt_mask = torch.nn.functional.interpolate(gt_mask, result.size()[2:],  mode='bilinear', align_corners=True)
+        if result.shape[0] == 2:
+            gt_mask_flipped = torch.flip(gt_mask, dims=[3])
+            gt_mask = torch.cat([gt_mask, gt_mask_flipped], dim=0)
+        loss = self.loss(result, gt_mask)
+        self.history.append(loss.detach().cpu().numpy()[0])
+        if len(self.history) > 5 and abs(self.history[-5] - self.history[-1]) < 1e-5:
+            return 0, 0, 0
+        return loss, 1.0, 1.0

isegm/inference/transforms/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .base import SigmoidForPred
+from .flip import AddHorizontalFlip
+from .zoom_in import ZoomIn
+from .limit_longest_side import LimitLongestSide
+from .crops import Crops

isegm/inference/transforms/base.py ADDED Viewed

	@@ -0,0 +1,38 @@

+import torch
+class BaseTransform(object):
+    def __init__(self):
+        self.image_changed = False
+    def transform(self, image_nd, clicks_lists):
+        raise NotImplementedError
+    def inv_transform(self, prob_map):
+        raise NotImplementedError
+    def reset(self):
+        raise NotImplementedError
+    def get_state(self):
+        raise NotImplementedError
+    def set_state(self, state):
+        raise NotImplementedError
+class SigmoidForPred(BaseTransform):
+    def transform(self, image_nd, clicks_lists):
+        return image_nd, clicks_lists
+    def inv_transform(self, prob_map):
+        return torch.sigmoid(prob_map)
+    def reset(self):
+        pass
+    def get_state(self):
+        return None
+    def set_state(self, state):
+        pass

isegm/inference/transforms/crops.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import math
+import torch
+import numpy as np
+from typing import List
+from isegm.inference.clicker import Click
+from .base import BaseTransform
+class Crops(BaseTransform):
+    def __init__(self, crop_size=(320, 480), min_overlap=0.2):
+        super().__init__()
+        self.crop_height, self.crop_width = crop_size
+        self.min_overlap = min_overlap
+        self.x_offsets = None
+        self.y_offsets = None
+        self._counts = None
+    def transform(self, image_nd, clicks_lists: List[List[Click]]):
+        assert image_nd.shape[0] == 1 and len(clicks_lists) == 1
+        image_height, image_width = image_nd.shape[2:4]
+        self._counts = None
+        if image_height < self.crop_height or image_width < self.crop_width:
+            return image_nd, clicks_lists
+        self.x_offsets = get_offsets(image_width, self.crop_width, self.min_overlap)
+        self.y_offsets = get_offsets(image_height, self.crop_height, self.min_overlap)
+        self._counts = np.zeros((image_height, image_width))
+        image_crops = []
+        for dy in self.y_offsets:
+            for dx in self.x_offsets:
+                self._counts[dy:dy + self.crop_height, dx:dx + self.crop_width] += 1
+                image_crop = image_nd[:, :, dy:dy + self.crop_height, dx:dx + self.crop_width]
+                image_crops.append(image_crop)
+        image_crops = torch.cat(image_crops, dim=0)
+        self._counts = torch.tensor(self._counts, device=image_nd.device, dtype=torch.float32)
+        clicks_list = clicks_lists[0]
+        clicks_lists = []
+        for dy in self.y_offsets:
+            for dx in self.x_offsets:
+                crop_clicks = [x.copy(coords=(x.coords[0] - dy, x.coords[1] - dx)) for x in clicks_list]
+                clicks_lists.append(crop_clicks)
+        return image_crops, clicks_lists
+    def inv_transform(self, prob_map):
+        if self._counts is None:
+            return prob_map
+        new_prob_map = torch.zeros((1, 1, *self._counts.shape),
+                                   dtype=prob_map.dtype, device=prob_map.device)
+        crop_indx = 0
+        for dy in self.y_offsets:
+            for dx in self.x_offsets:
+                new_prob_map[0, 0, dy:dy + self.crop_height, dx:dx + self.crop_width] += prob_map[crop_indx, 0]
+                crop_indx += 1
+        new_prob_map = torch.div(new_prob_map, self._counts)
+        return new_prob_map
+    def get_state(self):
+        return self.x_offsets, self.y_offsets, self._counts
+    def set_state(self, state):
+        self.x_offsets, self.y_offsets, self._counts = state
+    def reset(self):
+        self.x_offsets = None
+        self.y_offsets = None
+        self._counts = None
+def get_offsets(length, crop_size, min_overlap_ratio=0.2):
+    if length == crop_size:
+        return [0]
+    N = (length / crop_size - min_overlap_ratio) / (1 - min_overlap_ratio)
+    N = math.ceil(N)
+    overlap_ratio = (N - length / crop_size) / (N - 1)
+    overlap_width = int(crop_size * overlap_ratio)
+    offsets = [0]
+    for i in range(1, N):
+        new_offset = offsets[-1] + crop_size - overlap_width
+        if new_offset + crop_size > length:
+            new_offset = length - crop_size
+        offsets.append(new_offset)
+    return offsets

isegm/inference/transforms/flip.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import torch
+from typing import List
+from isegm.inference.clicker import Click
+from .base import BaseTransform
+class AddHorizontalFlip(BaseTransform):
+    def transform(self, image_nd, clicks_lists: List[List[Click]]):
+        assert len(image_nd.shape) == 4
+        image_nd = torch.cat([image_nd, torch.flip(image_nd, dims=[3])], dim=0)
+        image_width = image_nd.shape[3]
+        clicks_lists_flipped = []
+        for clicks_list in clicks_lists:
+            clicks_list_flipped = [click.copy(coords=(click.coords[0], image_width - click.coords[1] - 1))
+                                   for click in clicks_list]
+            clicks_lists_flipped.append(clicks_list_flipped)
+        clicks_lists = clicks_lists + clicks_lists_flipped
+        return image_nd, clicks_lists
+    def inv_transform(self, prob_map):
+        assert len(prob_map.shape) == 4 and prob_map.shape[0] % 2 == 0
+        num_maps = prob_map.shape[0] // 2
+        prob_map, prob_map_flipped = prob_map[:num_maps], prob_map[num_maps:]
+        return 0.5 * (prob_map + torch.flip(prob_map_flipped, dims=[3]))
+    def get_state(self):
+        return None
+    def set_state(self, state):
+        pass
+    def reset(self):
+        pass

isegm/inference/transforms/limit_longest_side.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from .zoom_in import ZoomIn, get_roi_image_nd
+class LimitLongestSide(ZoomIn):
+    def __init__(self, max_size=800):
+        super().__init__(target_size=max_size, skip_clicks=0)
+    def transform(self, image_nd, clicks_lists):
+        assert image_nd.shape[0] == 1 and len(clicks_lists) == 1
+        image_max_size = max(image_nd.shape[2:4])
+        self.image_changed = False
+        if image_max_size <= self.target_size:
+            return image_nd, clicks_lists
+        self._input_image = image_nd
+        self._object_roi = (0, image_nd.shape[2] - 1, 0, image_nd.shape[3] - 1)
+        self._roi_image = get_roi_image_nd(image_nd, self._object_roi, self.target_size)
+        self.image_changed = True
+        tclicks_lists = [self._transform_clicks(clicks_lists[0])]
+        return self._roi_image, tclicks_lists

isegm/inference/transforms/zoom_in.py ADDED Viewed

	@@ -0,0 +1,190 @@

+import torch
+import numpy as np
+from typing import List
+from isegm.inference.clicker import Click
+from isegm.utils.misc import get_bbox_iou, get_bbox_from_mask, expand_bbox, clamp_bbox
+from .base import BaseTransform
+class ZoomIn(BaseTransform):
+    """Crop the image to a region of interest around the current prediction and resize it.
+    The ROI is derived from the previous probability map (stored by
+    ``inv_transform``) and the positive clicks. State is kept between calls,
+    so ``transform`` and ``inv_transform`` must be used in alternation.
+    """
+    def __init__(self,
+                 target_size=400,
+                 skip_clicks=1,
+                 expansion_ratio=1.4,
+                 min_crop_size=200,
+                 recompute_thresh_iou=0.5,
+                 prob_thresh=0.50):
+        super().__init__()
+        self.target_size = target_size
+        self.min_crop_size = min_crop_size
+        self.skip_clicks = skip_clicks
+        self.expansion_ratio = expansion_ratio
+        self.recompute_thresh_iou = recompute_thresh_iou
+        self.prob_thresh = prob_thresh
+        self._input_image_shape = None
+        # Last full-resolution probability map, as a NumPy array.
+        self._prev_probs = None
+        # Current ROI as (rmin, rmax, cmin, cmax), both ends inclusive.
+        self._object_roi = None
+        self._roi_image = None
+    def transform(self, image_nd, clicks_lists: List[List[Click]]):
+        """Apply ``_transform`` to every batch item and stack the results on dim 0."""
+        transformed_image = []
+        transformed_clicks_lists = []
+        # NOTE(review): all batch items share this instance's ROI state.
+        for bindx in range(len(clicks_lists)):
+            new_image_nd, new_clicks_lists = self._transform(image_nd[bindx].unsqueeze(0), [clicks_lists[bindx]])
+            transformed_image.append(new_image_nd)
+            transformed_clicks_lists.append(new_clicks_lists[0])
+        return torch.cat(transformed_image, dim=0), transformed_clicks_lists
+    def _transform(self, image_nd, clicks_lists: List[List[Click]]):
+        """Zoom a single image; returns the inputs unchanged when no zoom applies."""
+        assert image_nd.shape[0] == 1 and len(clicks_lists) == 1
+        self.image_changed = False
+        clicks_list = clicks_lists[0]
+        # No zoom until more than skip_clicks clicks were made.
+        if len(clicks_list) <= self.skip_clicks:
+            return image_nd, clicks_lists
+        self._input_image_shape = image_nd.shape
+        current_object_roi = None
+        if self._prev_probs is not None:
+            current_pred_mask = (self._prev_probs > self.prob_thresh)[0, 0]
+            if current_pred_mask.sum() > 0:
+                current_object_roi = get_object_roi(current_pred_mask, clicks_list,
+                                                    self.expansion_ratio, self.min_crop_size)
+        if current_object_roi is None:
+            # Without a usable prediction: skip zooming, or (only for a negative
+            # skip_clicks) fall back to the whole image as the ROI.
+            if self.skip_clicks >= 0:
+                return image_nd, clicks_lists
+            else:
+                current_object_roi = 0, image_nd.shape[2] - 1, 0, image_nd.shape[3] - 1
+        # Replace the stored ROI when there is none, when a positive click falls
+        # outside it, or when it overlaps the new ROI with IoU below the threshold.
+        update_object_roi = False
+        if self._object_roi is None:
+            update_object_roi = True
+        elif not check_object_roi(self._object_roi, clicks_list):
+            update_object_roi = True
+        elif get_bbox_iou(current_object_roi, self._object_roi) < self.recompute_thresh_iou:
+            update_object_roi = True
+        if update_object_roi:
+            self._object_roi = current_object_roi
+            self.image_changed = True
+        self._roi_image = get_roi_image_nd(image_nd, self._object_roi, self.target_size)
+        tclicks_lists = [self._transform_clicks(clicks_list)]
+        return self._roi_image.to(image_nd.device), tclicks_lists
+    def inv_transform(self, prob_map):
+        """Apply ``_inv_transform`` to every batch item and stack the results on dim 0."""
+        new_prob_maps = []
+        for bindx in range(prob_map.shape[0]):
+            new_prob_map = self._inv_transform(prob_map[bindx].unsqueeze(0))
+            new_prob_maps.append(new_prob_map)
+        return torch.cat(new_prob_maps, dim=0)
+    def _inv_transform(self, prob_map):
+        """Place a ROI-sized prediction back into a full-size map and remember it."""
+        if self._object_roi is None:
+            # No zoom was applied: the map is already full-size.
+            self._prev_probs = prob_map.cpu().numpy()
+            return prob_map
+        assert prob_map.shape[0] == 1
+        rmin, rmax, cmin, cmax = self._object_roi
+        prob_map = torch.nn.functional.interpolate(prob_map, size=(rmax - rmin + 1, cmax - cmin + 1),
+                                                   mode='bilinear', align_corners=True)
+        if self._prev_probs is not None:
+            # Everything outside the ROI is set to zero.
+            new_prob_map = torch.zeros(*self._prev_probs.shape, device=prob_map.device, dtype=prob_map.dtype)
+            new_prob_map[:, :, rmin:rmax + 1, cmin:cmax + 1] = prob_map
+        else:
+            new_prob_map = prob_map
+        self._prev_probs = new_prob_map.cpu().numpy()
+        return new_prob_map
+    def check_possible_recalculation(self):
+        """Return True when a prediction exists, no ROI is set, ``skip_clicks`` is
+        not positive, and the ROI of the predicted object has IoU below 0.5 with
+        the full image."""
+        if self._prev_probs is None or self._object_roi is not None or self.skip_clicks > 0:
+            return False
+        pred_mask = (self._prev_probs > self.prob_thresh)[0, 0]
+        if pred_mask.sum() > 0:
+            possible_object_roi = get_object_roi(pred_mask, [],
+                                                 self.expansion_ratio, self.min_crop_size)
+            image_roi = (0, self._input_image_shape[2] - 1, 0, self._input_image_shape[3] - 1)
+            if get_bbox_iou(possible_object_roi, image_roi) < 0.50:
+                return True
+        return False
+    def get_state(self):
+        """Return the transform state as a tuple; the ROI image is moved to the CPU."""
+        roi_image = self._roi_image.cpu() if self._roi_image is not None else None
+        return self._input_image_shape, self._object_roi, self._prev_probs, roi_image, self.image_changed
+    def set_state(self, state):
+        """Restore a tuple produced by ``get_state``."""
+        self._input_image_shape, self._object_roi, self._prev_probs, self._roi_image, self.image_changed = state
+    def reset(self):
+        """Clear the ROI, the cached images and the previous prediction."""
+        self._input_image_shape = None
+        self._object_roi = None
+        self._prev_probs = None
+        self._roi_image = None
+        self.image_changed = False
+    def _transform_clicks(self, clicks_list):
+        """Map click coordinates from the full image into the resized ROI image."""
+        if self._object_roi is None:
+            return clicks_list
+        rmin, rmax, cmin, cmax = self._object_roi
+        crop_height, crop_width = self._roi_image.shape[2:]
+        transformed_clicks = []
+        for click in clicks_list:
+            # Shift by the ROI origin, then scale by resized size / ROI size.
+            new_r = crop_height * (click.coords[0] - rmin) / (rmax - rmin + 1)
+            new_c = crop_width * (click.coords[1] - cmin) / (cmax - cmin + 1)
+            transformed_clicks.append(click.copy(coords=(new_r, new_c)))
+        return transformed_clicks
+def get_object_roi(pred_mask, clicks_list, expansion_ratio, min_crop_size):
+    pred_mask = pred_mask.copy()
+    for click in clicks_list:
+        if click.is_positive:
+            pred_mask[int(click.coords[0]), int(click.coords[1])] = 1
+    bbox = get_bbox_from_mask(pred_mask)
+    bbox = expand_bbox(bbox, expansion_ratio, min_crop_size)
+    h, w = pred_mask.shape[0], pred_mask.shape[1]
+    bbox = clamp_bbox(bbox, 0, h - 1, 0, w - 1)
+    return bbox
+def get_roi_image_nd(image_nd, object_roi, target_size):
+    rmin, rmax, cmin, cmax = object_roi
+    height = rmax - rmin + 1
+    width = cmax - cmin + 1
+    if isinstance(target_size, tuple):
+        new_height, new_width = target_size
+    else:
+        scale = target_size / max(height, width)
+        new_height = int(round(height * scale))
+        new_width = int(round(width * scale))
+    with torch.no_grad():
+        roi_image_nd = image_nd[:, :, rmin:rmax + 1, cmin:cmax + 1]
+        roi_image_nd = torch.nn.functional.interpolate(roi_image_nd, size=(new_height, new_width),
+                                                       mode='bilinear', align_corners=True)
+    return roi_image_nd
+def check_object_roi(object_roi, clicks_list):
+    for click in clicks_list:
+        if click.is_positive:
+            if click.coords[0] < object_roi[0] or click.coords[0] >= object_roi[1]:
+                return False
+            if click.coords[1] < object_roi[2] or click.coords[1] >= object_roi[3]:
+                return False
+    return True

isegm/inference/utils.py ADDED Viewed

	@@ -0,0 +1,149 @@

+from datetime import timedelta
+from pathlib import Path
+import torch
+import numpy as np
+from isegm.utils.serialization import load_model
+def get_time_metrics(all_ious, elapsed_time):
+    n_images = len(all_ious)
+    n_clicks = sum(map(len, all_ious))
+    mean_spc = elapsed_time / n_clicks
+    mean_spi = elapsed_time / n_images
+    return mean_spc, mean_spi
+def load_is_model(checkpoint, device, eval_ritm, lora_checkpoint=None, **kwargs):
+    if isinstance(checkpoint, (str, Path)):
+        state_dict = torch.load(checkpoint, map_location='cpu')
+    else:
+        state_dict = checkpoint
+    if isinstance(state_dict, list):
+        model = load_single_is_model(state_dict[0], device, eval_ritm, **kwargs)
+        models = [load_single_is_model(x, device, eval_ritm, **kwargs) for x in state_dict]
+        return model, models
+    else:
+        return load_single_is_model(state_dict, device, eval_ritm, lora_checkpoint=lora_checkpoint, **kwargs)
+def load_single_is_model(state_dict, device, eval_ritm, lora_checkpoint=None, **kwargs):
+    if 'config' in state_dict.keys():
+        _config = state_dict['config']
+    if lora_checkpoint is not None:
+        lora_state_dict = torch.load(lora_checkpoint, map_location='cpu')
+        _config = lora_state_dict['config']
+    model = load_model(_config, eval_ritm, **kwargs)
+    print("Load predictor weights...")
+    if 'state_dict' in state_dict.keys():
+        msg = model.load_state_dict(state_dict['state_dict'], strict=False)
+    else:
+        try:
+            msg = model.load_state_dict(state_dict, strict=False)
+        except:
+            current_state_dict = model.state_dict()
+            new_state_dict = {}
+            for k, v in state_dict.items():
+                if k in current_state_dict and v.shape == current_state_dict[k].shape:
+                    new_state_dict[k] = v
+            msg = model.load_state_dict(new_state_dict, strict=False)
+    print(msg)
+    if lora_checkpoint is not None:
+        print("Load predictor LoRA weights...")
+        msg = model.load_state_dict(lora_state_dict['state_dict'], strict=False)
+        print(msg[1])
+    for param in model.parameters():
+        param.requires_grad = False
+    model.to(device)
+    model.eval()
+    return model
+def get_iou(gt_mask, pred_mask, ignore_label=-1):
+    ignore_gt_mask_inv = gt_mask != ignore_label
+    obj_gt_mask = gt_mask == 1
+    intersection = np.logical_and(np.logical_and(pred_mask, obj_gt_mask), ignore_gt_mask_inv).sum()
+    union = np.logical_and(np.logical_or(pred_mask, obj_gt_mask), ignore_gt_mask_inv).sum()
+    return intersection / union
+def compute_noc_metric(all_ious, iou_thrs, max_clicks=20):
+    def _get_noc(iou_arr, iou_thr):
+        vals = iou_arr >= iou_thr
+        return np.argmax(vals) + 1 if np.any(vals) else max_clicks
+    noc_list = []
+    noc_list_std = []
+    over_max_list = []
+    for iou_thr in iou_thrs:
+        scores_arr = np.array([_get_noc(iou_arr, iou_thr)
+                               for iou_arr in all_ious], dtype=np.int_)
+        score = scores_arr.mean()
+        score_std = scores_arr.std()
+        over_max = (scores_arr == max_clicks).sum()
+        noc_list.append(score)
+        noc_list_std.append(score_std)
+        over_max_list.append(over_max)
+    return noc_list, noc_list_std, over_max_list
def find_checkpoint(weights_folder, checkpoint_name):
    """Resolve a checkpoint specification to a path string.

    ``checkpoint_name`` may be prefixed with ``"<model>:"``; in that case
    exactly one directory matching ``<model>*`` must exist directly under
    ``weights_folder`` and becomes the search root for prefix lookups.

    * A name ending in ``.pth`` is returned as-is when that path exists,
      otherwise it is joined onto ``weights_folder``.
    * Any other name is treated as a prefix: exactly one file matching
      ``<name>*.pth`` must exist (searched recursively) under the search root.

    Raises:
        AssertionError: when a directory or prefix lookup does not yield
            exactly one match.
    """
    weights_folder = Path(weights_folder)
    model_folder = weights_folder

    if ':' in checkpoint_name:
        model_name, checkpoint_name = checkpoint_name.split(':')
        candidates = [entry for entry in weights_folder.glob(f'{model_name}*') if entry.is_dir()]
        assert len(candidates) == 1
        model_folder = candidates[0]

    if not checkpoint_name.endswith('.pth'):
        matches = list(model_folder.rglob(f'{checkpoint_name}*.pth'))
        assert len(matches) == 1
        return str(matches[0])

    if Path(checkpoint_name).exists():
        return str(checkpoint_name)
    return str(weights_folder / checkpoint_name)
def get_results_table(noc_list, over_max_list, brs_type, dataset_name, mean_spc, elapsed_time, iou_first,
                      n_clicks=20, model_name=None):
    """Format evaluation results as a fixed-width text table.

    Returns:
        A ``(header, table_row)`` pair of strings. ``header`` holds an optional
        "Eval results for model" line followed by the column titles framed by
        dashed rules; ``table_row`` holds the values. Columns for the second
        and third thresholds show ``?`` when ``noc_list`` has fewer entries.
    """
    over_85_title = '>=' + str(n_clicks) + '@85%'
    over_90_title = '>=' + str(n_clicks) + '@90%'
    title_cells = [
        f'{"BRS Type":^13}', f'{"Dataset":^11}',
        f'{"NoC@80%":^9}', f'{"NoC@85%":^9}', f'{"NoC@90%":^9}',
        f'{"IoU@1":^9}',
        f'{over_85_title:^9}', f'{over_90_title:^9}',
        f'{"SPC,s":^7}', f'{"Time":^9}',
    ]
    table_header = '|' + '|'.join(title_cells) + '|'
    rule = '-' * len(table_header)

    header_lines = []
    if model_name is not None:
        header_lines.append(f'Eval results for model: {model_name}')
    header_lines.extend([rule, table_header, rule])
    header = '\n'.join(header_lines)

    eval_time = str(timedelta(seconds=int(elapsed_time)))
    placeholder = f'{"?":^9}'
    num_thresholds = len(noc_list)
    has_second = num_thresholds > 1
    has_third = num_thresholds > 2

    value_cells = [
        f'{brs_type:^13}',
        f'{dataset_name:^11}',
        f'{noc_list[0]:^9.2f}',
        f'{noc_list[1]:^9.2f}' if has_second else placeholder,
        f'{noc_list[2]:^9.2f}' if has_third else placeholder,
        f'{iou_first:^9.2f}',
        f'{over_max_list[1]:^9}' if has_second else placeholder,
        f'{over_max_list[2]:^9}' if has_third else placeholder,
        f'{mean_spc:^7.3f}',
        f'{eval_time:^9}',
    ]
    table_row = '|' + '|'.join(value_cells) + '|'
    return header, table_row

isegm/model/__init__.py ADDED Viewed

File without changes

isegm/model/build_sam.py ADDED Viewed

	@@ -0,0 +1,145 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+from functools import partial
+from .sam_modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer, SAMISWrapper
def build_sam_vit_h(checkpoint=None, enable_lora=False, enable_gra=False, mode='eval', image_size=1024):
    """Build a SAM model with the ViT-H encoder settings (embed dim 1280, depth 32, 16 heads).

    All keyword arguments are forwarded unchanged to ``_build_sam``.
    """
    vit_h_encoder = dict(
        encoder_embed_dim=1280,
        encoder_depth=32,
        encoder_num_heads=16,
        encoder_global_attn_indexes=[7, 15, 23, 31],
    )
    return _build_sam(
        checkpoint=checkpoint,
        enable_lora=enable_lora,
        enable_gra=enable_gra,
        mode=mode,
        image_size=image_size,
        **vit_h_encoder
    )


# ``build_sam`` is an alias of the ViT-H builder.
build_sam = build_sam_vit_h
def build_sam_vit_l(checkpoint=None, enable_lora=False, enable_gra=False, mode='eval', image_size=1024):
    """Build a SAM model with the ViT-L encoder settings (embed dim 1024, depth 24, 16 heads).

    All keyword arguments are forwarded unchanged to ``_build_sam``.
    """
    vit_l_encoder = dict(
        encoder_embed_dim=1024,
        encoder_depth=24,
        encoder_num_heads=16,
        encoder_global_attn_indexes=[5, 11, 17, 23],
    )
    return _build_sam(
        checkpoint=checkpoint,
        enable_lora=enable_lora,
        enable_gra=enable_gra,
        mode=mode,
        image_size=image_size,
        **vit_l_encoder
    )
def build_sam_vit_b(checkpoint=None, enable_lora=False, enable_gra=False, mode='eval', image_size=1024):
    """Build a SAM model with the ViT-B encoder settings (embed dim 768, depth 12, 12 heads).

    All keyword arguments are forwarded unchanged to ``_build_sam``.
    """
    vit_b_encoder = dict(
        encoder_embed_dim=768,
        encoder_depth=12,
        encoder_num_heads=12,
        encoder_global_attn_indexes=[2, 5, 8, 11],
    )
    return _build_sam(
        checkpoint=checkpoint,
        enable_lora=enable_lora,
        enable_gra=enable_gra,
        mode=mode,
        image_size=image_size,
        **vit_b_encoder
    )
# Maps a model-type name to its builder; "default" resolves to the ViT-H builder.
sam_model_registry = dict(
    default=build_sam_vit_h,
    vit_h=build_sam_vit_h,
    vit_l=build_sam_vit_l,
    vit_b=build_sam_vit_b,
)
def _build_sam(
    encoder_embed_dim,
    encoder_depth,
    encoder_num_heads,
    encoder_global_attn_indexes,
    checkpoint=None,
    enable_lora=False,
    enable_gra=False,
    mode='eval',
    image_size=1024,
):
    """Construct a SAM model and optionally load matching weights from a checkpoint.

    Args:
        encoder_embed_dim: embedding dimension of the ViT image encoder.
        encoder_depth: number of encoder blocks.
        encoder_num_heads: number of attention heads in the encoder.
        encoder_global_attn_indexes: block indexes passed to the encoder as
            its global-attention indexes.
        checkpoint: optional path to a checkpoint file holding a state dict.
        enable_lora: forwarded to ``SAMISWrapper`` (only used when ``mode == 'train'``).
        enable_gra: forwarded to ``SAMISWrapper`` (only used when ``mode == 'train'``).
        mode: ``'train'`` builds a ``SAMISWrapper``; any other value builds a
            plain ``Sam`` and switches it to eval mode.
        image_size: input image side length; the embedding grid is
            ``image_size // 16``.

    Returns:
        The constructed model. When ``checkpoint`` is given, only entries whose
        key exists in the model and whose tensor shape matches are loaded.
    """
    prompt_embed_dim = 256
    vit_patch_size = 16
    image_embedding_size = image_size // vit_patch_size
    if mode == 'train':
        sam = SAMISWrapper(
            encoder_embed_dim=encoder_embed_dim,
            encoder_depth=encoder_depth,
            encoder_num_heads=encoder_num_heads,
            encoder_global_attn_indexes=encoder_global_attn_indexes,
            enable_lora=enable_lora,
            enable_gra=enable_gra,
            with_prev_mask=True,
            image_size=image_size,
            pixel_mean=[123.675, 116.28, 103.53],
            pixel_std=[58.395, 57.12, 57.375],
        )
    else:
        sam = Sam(
            image_encoder=ImageEncoderViT(
                depth=encoder_depth,
                embed_dim=encoder_embed_dim,
                img_size=image_size,
                mlp_ratio=4,
                norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
                num_heads=encoder_num_heads,
                patch_size=vit_patch_size,
                qkv_bias=True,
                use_rel_pos=True,
                global_attn_indexes=encoder_global_attn_indexes,
                window_size=14,
                out_chans=prompt_embed_dim,
            ),
            prompt_encoder=PromptEncoder(
                embed_dim=prompt_embed_dim,
                image_embedding_size=(image_embedding_size, image_embedding_size),
                input_image_size=(image_size, image_size),
                mask_in_chans=16,
            ),
            mask_decoder=MaskDecoder(
                num_multimask_outputs=3,
                transformer=TwoWayTransformer(
                    depth=2,
                    embedding_dim=prompt_embed_dim,
                    mlp_dim=2048,
                    num_heads=8,
                ),
                transformer_dim=prompt_embed_dim,
                iou_head_depth=3,
                iou_head_hidden_dim=256,
            ),
            pixel_mean=[123.675, 116.28, 103.53],
            pixel_std=[58.395, 57.12, 57.375],
        )
        sam.eval()
    if checkpoint is not None:
        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # that come from a trusted source.
        with open(checkpoint, "rb") as f:
            # Map storages to CPU so a checkpoint saved from CUDA tensors also
            # loads on a CPU-only host; load_state_dict copies the values into
            # the model's own parameters afterwards.
            pretrained_dict = torch.load(f, map_location="cpu")
        model_dict = sam.state_dict()
        # Keep only entries the model knows about and whose shapes agree, so
        # mismatched tensors are skipped instead of raising inside load_state_dict.
        new_pretrained_dict = {
            k: v for k, v in pretrained_dict.items()
            if k in model_dict and v.shape == model_dict[k].shape
        }
        msg = sam.load_state_dict(new_pretrained_dict, strict=False)
        print("SAM load Info: ", msg)
    return sam

isegm/model/initializer.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import torch
+import torch.nn as nn
+import numpy as np
class Initializer(object):
    """Callable that initializes the parameters of a single module.

    Normalization layers get their weight/bias set through ``_init_gamma`` /
    ``_init_beta``; every other module with a ``weight`` / ``bias`` attribute
    goes through ``_init_weight`` / ``_init_bias``. With ``local_init`` set,
    each visited module is flagged so that a second call leaves it untouched.
    """

    _NORM_TYPES = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d,
                   nn.InstanceNorm1d, nn.InstanceNorm2d, nn.InstanceNorm3d,
                   nn.GroupNorm, nn.SyncBatchNorm)

    def __init__(self, local_init=True, gamma=None):
        self.local_init = local_init
        self.gamma = gamma

    def __call__(self, m):
        # Skip modules that an earlier call already handled.
        if getattr(m, '__initialized', False):
            return

        is_norm_layer = isinstance(m, self._NORM_TYPES) or 'BatchNorm' in m.__class__.__name__
        if is_norm_layer:
            if m.weight is not None:
                self._init_gamma(m.weight.data)
            if m.bias is not None:
                self._init_beta(m.bias.data)
        else:
            weight = getattr(m, 'weight', None)
            if weight is not None:
                self._init_weight(weight.data)
            bias = getattr(m, 'bias', None)
            if bias is not None:
                self._init_bias(bias.data)

        if self.local_init:
            # Bypass nn.Module.__setattr__ and store the flag as a plain attribute.
            object.__setattr__(m, '__initialized', True)

    def _init_weight(self, data):
        """Fill a weight tensor uniformly in [-0.07, 0.07]."""
        nn.init.uniform_(data, -0.07, 0.07)

    def _init_bias(self, data):
        """Zero a bias tensor."""
        nn.init.constant_(data, 0)

    def _init_gamma(self, data):
        """Set a norm scale to 1, or draw it from N(1, gamma) when gamma is given."""
        if self.gamma is not None:
            nn.init.normal_(data, 1.0, self.gamma)
        else:
            nn.init.constant_(data, 1.0)

    def _init_beta(self, data):
        """Zero a norm shift."""
        nn.init.constant_(data, 0)
class Bilinear(Initializer):
    """Initializer that writes a bilinear upsampling kernel into a weight tensor."""

    def __init__(self, scale, groups, in_channels, **kwargs):
        super().__init__(**kwargs)
        self.scale = scale
        self.groups = groups
        self.in_channels = in_channels

    def _init_weight(self, data):
        """Overwrite ``data`` in place with zeros plus one bilinear kernel per channel."""
        kernel = self.get_bilinear_kernel(self.scale)
        weight = torch.zeros_like(data)
        single_group = self.groups == 1
        for first_idx in range(self.in_channels):
            # With one group the kernel sits on the diagonal (i, i); otherwise
            # the second index is always 0.
            second_idx = first_idx if single_group else 0
            weight[first_idx, second_idx] = kernel
        data[:] = weight

    @staticmethod
    def get_bilinear_kernel(scale):
        """Return a 2-D float32 bilinear upsampling kernel for the given scale."""
        kernel_size = 2 * scale - scale % 2
        half_width = (kernel_size + 1) // 2
        center = half_width - 0.5 * (1 + kernel_size % 2)

        grid = np.ogrid[:kernel_size, :kernel_size]
        row_profile = 1 - np.abs(grid[0] - center) / half_width
        col_profile = 1 - np.abs(grid[1] - center) / half_width
        kernel = row_profile * col_profile
        return torch.tensor(kernel, dtype=torch.float32)
class XavierGluon(Initializer):
    """Xavier-style weight initializer with selectable fan factor and distribution.

    The spread is ``sqrt(magnitude / factor)``, where ``factor`` is the fan-in,
    the fan-out or their average depending on ``factor_type``.
    """

    def __init__(self, rnd_type='uniform', factor_type='avg', magnitude=3, **kwargs):
        super().__init__(**kwargs)
        self.rnd_type = rnd_type
        self.factor_type = factor_type
        self.magnitude = float(magnitude)

    def _init_weight(self, arr):
        """Fill ``arr`` in place.

        Raises:
            ValueError: for an unsupported ``factor_type`` or ``rnd_type``.
        """
        fan_in, fan_out = nn.init._calculate_fan_in_and_fan_out(arr)

        kind = self.factor_type
        if kind == 'in':
            factor = fan_in
        elif kind == 'out':
            factor = fan_out
        elif kind == 'avg':
            factor = (fan_in + fan_out) / 2.0
        else:
            raise ValueError('Incorrect factor type')

        spread = np.sqrt(self.magnitude / factor)

        if self.rnd_type == 'gaussian':
            nn.init.normal_(arr, 0, spread)
        elif self.rnd_type == 'uniform':
            nn.init.uniform_(arr, -spread, spread)
        else:
            raise ValueError('Unknown random type')

isegm/model/is_deeplab_model.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import torch.nn as nn
+from isegm.utils.serialization import serialize
+from .is_model import ISModel
+from .modeling.deeplab_v3 import DeepLabV3Plus
+from .modeling.basic_blocks import SepConvHead
+from isegm.model.modifiers import LRMult
class DeeplabModel(ISModel):
    """Interactive-segmentation model built from a DeepLabV3+ extractor and a SepConv head."""

    @serialize
    def __init__(self, backbone='resnet50', deeplab_ch=256, aspp_dropout=0.5,
                 backbone_norm_layer=None, backbone_lr_mult=0.1, norm_layer=nn.BatchNorm2d, **kwargs):
        super().__init__(norm_layer=norm_layer, **kwargs)

        extractor = DeepLabV3Plus(
            backbone=backbone,
            ch=deeplab_ch,
            project_dropout=aspp_dropout,
            norm_layer=norm_layer,
            backbone_norm_layer=backbone_norm_layer,
        )
        self.feature_extractor = extractor
        # Apply the learning-rate multiplier to the backbone sub-module only.
        extractor.backbone.apply(LRMult(backbone_lr_mult))

        self.head = SepConvHead(
            1,
            in_channels=deeplab_ch,
            mid_channels=deeplab_ch // 2,
            num_layers=2,
            norm_layer=norm_layer,
        )

    def backbone_forward(self, image, coord_features=None):
        """Run the extractor and feed its first output through the head."""
        features = self.feature_extractor(image, coord_features)
        instances = self.head(features[0])
        return {'instances': instances}

isegm/model/is_hrformer_model.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import torch
+import torch.nn as nn
+from collections import OrderedDict
+from isegm.utils.serialization import serialize
+from .is_model import ISModel
+from isegm.model.modifiers import LRMult
+from .modeling.hrformer import HRT_B_OCR_V3
class HRFormerModel(ISModel):
    """Interactive-segmentation model backed by ``HRT_B_OCR_V3``."""

    @serialize
    def __init__(
        self,
        num_classes=1,
        in_ch=6,
        backbone_lr_mult=0.1,
        **kwargs
        ):
        super().__init__(**kwargs)
        self.feature_extractor = HRT_B_OCR_V3(num_classes, in_ch)
        self.feature_extractor.apply(LRMult(backbone_lr_mult))

    def backbone_forward(self, image, coord_features=None):
        """Run the extractor on ``image`` only; ``coord_features`` is not used here.

        Returns a dict with the extractor's first output as ``'instances'`` and
        its second output as ``'instances_aux'``.
        """
        backbone_features = self.feature_extractor(image)
        return {'instances': backbone_features[0], 'instances_aux': backbone_features[1]}

    def init_weight(self, pretrained=None):
        """Load pretrained backbone weights from the file at ``pretrained``.

        The checkpoint's ``'model'`` entry is read, every key is prefixed with
        ``'backbone.'``, and the first conv weight is duplicated along the
        input-channel axis before a non-strict load. Does nothing when
        ``pretrained`` is ``None``.
        """
        if pretrained is None:
            return
        # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
        # map_location='cpu' lets a checkpoint saved from CUDA tensors load on a
        # CPU-only host; load_state_dict copies the values into the model's own
        # parameters afterwards.
        state_dict = torch.load(pretrained, map_location='cpu')['model']
        state_dict_rename = OrderedDict(('backbone.' + k, v) for k, v in state_dict.items())
        # Concatenate the first-conv weight with itself along dim 1 to double
        # its input channels (presumably to match the 6-channel default input
        # -- TODO confirm against HRT_B_OCR_V3).
        ori_proj_weight = state_dict_rename['backbone.conv1.weight']
        state_dict_rename['backbone.conv1.weight'] = torch.cat([ori_proj_weight, ori_proj_weight], dim=1)
        self.feature_extractor.load_state_dict(state_dict_rename, strict=False)
        print('Successfully loaded pretrained model.')

isegm/model/is_hrnet_model.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import torch.nn as nn
+from isegm.utils.serialization import serialize
+from .is_model import ISModel
+from .modeling.hrnet_ocr import HighResolutionNet
+from isegm.model.modifiers import LRMult
class HRNetModel(ISModel):
    """Interactive-segmentation model backed by ``HighResolutionNet`` (HRNet + OCR)."""

    @serialize
    def __init__(self, width=48, ocr_width=256, small=False, backbone_lr_mult=0.1,
                 norm_layer=nn.BatchNorm2d, **kwargs):
        super().__init__(**kwargs)

        extractor = HighResolutionNet(width=width, ocr_width=ocr_width, small=small,
                                      num_classes=1, norm_layer=norm_layer)
        self.feature_extractor = extractor
        extractor.apply(LRMult(backbone_lr_mult))

        if ocr_width > 0:
            # Reset the OCR sub-modules to a multiplier of 1.0 after the
            # blanket backbone multiplier applied above.
            for ocr_module in (extractor.ocr_distri_head,
                               extractor.ocr_gather_head,
                               extractor.conv3x3_ocr):
                ocr_module.apply(LRMult(1.0))

    def backbone_forward(self, image, coord_features=None):
        """Return the extractor's two outputs as ``'instances'`` and ``'instances_aux'``."""
        main_out, aux_out = self.feature_extractor(image, coord_features)[:2]
        return {'instances': main_out, 'instances_aux': aux_out}

isegm/model/is_model.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import torch
+import torch.nn as nn
+import numpy as np
+from isegm.model.ops import DistMaps, BatchImageNormalize, ScaleLayer
class ISModel(nn.Module):
    """Base class for the interactive-segmentation models.

    It normalizes the image, encodes clicks (and optionally the previous mask)
    into coordinate features, delegates to ``backbone_forward`` implemented by
    subclasses, and resizes the outputs back to the image resolution.
    """

    def __init__(self, with_aux_output=False, norm_radius=5, use_disks=False, cpu_dist_maps=False,
                 use_rgb_conv=False, use_leaky_relu=False,  # the two arguments only used for RITM
                 with_prev_mask=False, norm_mean_std=([.485, .456, .406], [.229, .224, .225])):
        super().__init__()
        self.with_aux_output = with_aux_output
        self.with_prev_mask = with_prev_mask
        self.normalization = BatchImageNormalize(norm_mean_std[0], norm_mean_std[1])

        # Two click channels, plus one more when the previous mask is fed in.
        self.coord_feature_ch = 3 if self.with_prev_mask else 2

        if not use_rgb_conv:
            self.maps_transform = nn.Identity()
        else:
            # Only RITM models need to transform the coordinate features, though they
            # don't use an exact 'rgb_conv'; 'use_rgb_conv' is kept for compatibility.
            # The simpleclick models use a patch embedding layer instead.
            activation = nn.LeakyReLU(negative_slope=0.2) if use_leaky_relu else nn.ReLU(inplace=True)
            self.maps_transform = nn.Sequential(
                nn.Conv2d(in_channels=self.coord_feature_ch, out_channels=16, kernel_size=1),
                activation,
                nn.Conv2d(in_channels=16, out_channels=64, kernel_size=3, stride=2, padding=1),
                ScaleLayer(init_value=0.05, lr_mult=1),
            )

        self.dist_maps = DistMaps(norm_radius=norm_radius, spatial_scale=1.0,
                                  cpu_mode=cpu_dist_maps, use_disks=use_disks)

    def forward(self, image, points, text=None, gra=None):
        """Run the model; ``text`` / ``gra`` reach the backbone only when not ``None``."""
        image, prev_mask = self.prepare_input(image)
        coord_features = self.maps_transform(self.get_coord_features(image, prev_mask, points))

        optional_inputs = {}
        if text is not None:
            optional_inputs['text'] = text
        if gra is not None:
            optional_inputs['gra'] = gra
        outputs = self.backbone_forward(image, coord_features, **optional_inputs)

        out_size = image.size()[2:]
        outputs['instances'] = nn.functional.interpolate(outputs['instances'], size=out_size,
                                                         mode='bilinear', align_corners=True)
        if self.with_aux_output:
            outputs['instances_aux'] = nn.functional.interpolate(outputs['instances_aux'], size=out_size,
                                                                 mode='bilinear', align_corners=True)
        return outputs

    def prepare_input(self, image):
        """Split off the previous-mask channels (if enabled) and normalize the first three channels."""
        prev_mask = None
        if self.with_prev_mask:
            image, prev_mask = image[:, :3, :, :], image[:, 3:, :, :]
        return self.normalization(image), prev_mask

    def backbone_forward(self, image, coord_features=None):
        """Subclasses implement this and return the dict of output tensors."""
        raise NotImplementedError

    def get_coord_features(self, image, prev_mask, points):
        """Build click maps and prepend the previous mask along the channel axis when present."""
        click_maps = self.dist_maps(image, points)
        if prev_mask is None:
            return click_maps
        return torch.cat((prev_mask, click_maps), dim=1)
def split_points_by_order(tpoints: torch.Tensor, groups):
    """Distribute click points into groups according to their third component.

    ``tpoints`` is indexed as ``[batch, point, 3]``; the first half of the
    point axis is treated as positive clicks and the second half as negative
    ones. A point whose third value is negative is skipped. Each entry of
    ``groups`` is a group capacity (non-positive means "half the point axis").
    A group id beyond the last group, or a negative click with group id 0, is
    redirected to the last group.

    Returns:
        One tensor per group with shape ``(batch, 2 * capacity, 3)``, padded
        with ``-1``; positives fill the first ``capacity`` rows and negatives
        the rows after them. Dtype and device follow ``tpoints``.
    """
    points = tpoints.cpu().numpy()
    bs = points.shape[0]
    num_points = points.shape[1] // 2
    num_groups = len(groups)
    capacities = [size if size > 0 else num_points for size in groups]

    buckets = [np.full((bs, 2 * cap, 3), -1, dtype=np.float32) for cap in capacities]

    # next_slot[b, g, 0] / [b, g, 1]: next free row for positive / negative
    # points; negatives start right after the positive section.
    next_slot = np.zeros((bs, num_groups, 2), dtype=np.int_)
    for gidx, cap in enumerate(capacities):
        next_slot[:, gidx, 1] = cap

    last_group = num_groups - 1
    for bindx in range(bs):
        for pindx in range(2 * num_points):
            point = points[bindx, pindx, :]
            order = int(point[2])
            if order < 0:
                continue

            negative = int(pindx >= num_points)
            # disable negative first click
            redirect = order >= num_groups or (order == 0 and negative)
            target = last_group if redirect else order

            row = next_slot[bindx, target, negative]
            next_slot[bindx, target, negative] += 1
            buckets[target][bindx, row, :] = point

    return [torch.tensor(bucket, dtype=tpoints.dtype, device=tpoints.device)
            for bucket in buckets]

isegm/model/is_plainvit_model.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import math
+import torch.nn as nn
+from isegm.utils.serialization import serialize
+from .is_model import ISModel
+from .modeling.models_vit import VisionTransformer, PatchEmbed
+from .modeling.swin_transformer import SwinTransfomerSegHead
class SimpleFPN(nn.Module):
    """Turn one feature map into four maps at 4x, 2x, 1x and 0.5x its spatial size.

    ``out_dims`` gives the channel count of each of the four outputs, in that order.
    """

    def __init__(self, in_dim=768, out_dims=[128, 256, 512, 1024]):
        super().__init__()

        # Two stride-2 transposed convs: spatial size x4.
        self.down_4_chan = max(out_dims[0] * 2, in_dim // 2)
        wide, narrow = self.down_4_chan, self.down_4_chan // 2
        layers_4 = [
            nn.ConvTranspose2d(in_dim, wide, 2, stride=2),
            nn.GroupNorm(1, wide),
            nn.GELU(),
            nn.ConvTranspose2d(wide, narrow, 2, stride=2),
            nn.GroupNorm(1, narrow),
            nn.Conv2d(narrow, out_dims[0], 1),
            nn.GroupNorm(1, out_dims[0]),
            nn.GELU(),
        ]
        self.down_4 = nn.Sequential(*layers_4)

        # One stride-2 transposed conv: spatial size x2.
        self.down_8_chan = max(out_dims[1], in_dim // 2)
        layers_8 = [
            nn.ConvTranspose2d(in_dim, self.down_8_chan, 2, stride=2),
            nn.GroupNorm(1, self.down_8_chan),
            nn.Conv2d(self.down_8_chan, out_dims[1], 1),
            nn.GroupNorm(1, out_dims[1]),
            nn.GELU(),
        ]
        self.down_8 = nn.Sequential(*layers_8)

        # 1x1 projection: spatial size unchanged.
        layers_16 = [
            nn.Conv2d(in_dim, out_dims[2], 1),
            nn.GroupNorm(1, out_dims[2]),
            nn.GELU(),
        ]
        self.down_16 = nn.Sequential(*layers_16)

        # One stride-2 conv: spatial size halved.
        self.down_32_chan = max(out_dims[3], in_dim * 2)
        layers_32 = [
            nn.Conv2d(in_dim, self.down_32_chan, 2, stride=2),
            nn.GroupNorm(1, self.down_32_chan),
            nn.Conv2d(self.down_32_chan, out_dims[3], 1),
            nn.GroupNorm(1, out_dims[3]),
            nn.GELU(),
        ]
        self.down_32 = nn.Sequential(*layers_32)

        self.init_weights()

    def init_weights(self):
        """No-op; layers keep the initialization given by their constructors."""
        pass

    def forward(self, x):
        """Return ``[x_down_4, x_down_8, x_down_16, x_down_32]`` computed from ``x``."""
        branches = (self.down_4, self.down_8, self.down_16, self.down_32)
        return [branch(x) for branch in branches]
class PlainVitModel(ISModel):
    """Interactive-segmentation model: plain ViT backbone, SimpleFPN neck, Swin segmentation head."""

    @serialize
    def __init__(
        self,
        backbone_params={},
        neck_params={},
        head_params={},
        random_split=False,
        **kwargs
        ):
        super().__init__(**kwargs)
        self.random_split = random_split

        # Separate patch embedding for the click maps (plus previous mask when enabled).
        coord_channels = 3 if self.with_prev_mask else 2
        self.patch_embed_coords = PatchEmbed(
            img_size=backbone_params['img_size'],
            patch_size=backbone_params['patch_size'],
            in_chans=coord_channels,
            embed_dim=backbone_params['embed_dim'],
        )

        self.backbone = VisionTransformer(**backbone_params)
        self.neck = SimpleFPN(**neck_params)
        self.head = SwinTransfomerSegHead(**head_params)

    def backbone_forward(self, image, coord_features=None, gra=None):
        """Embed the coordinate features, run the backbone, then the neck and the head."""
        coord_tokens = self.patch_embed_coords(coord_features)
        tokens = self.backbone.forward_backbone(image, coord_tokens, gra=gra, shuffle=self.random_split)

        # Extract 4 stage backbone feature map: 1/4, 1/8, 1/16, 1/32
        B, N, C = tokens.shape
        grid_size = self.backbone.patch_embed.grid_size
        # (B, N, C) tokens -> (B, C, H, W) feature map.
        feature_map = tokens.transpose(-1, -2).view(B, C, grid_size[0], grid_size[1])

        pyramid = self.neck(feature_map)
        return {'instances': self.head(pyramid), 'instances_aux': None}

isegm/model/is_plainvit_model_lora.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import math
+import torch.nn as nn
+from isegm.utils.serialization import serialize
+from .is_model import ISModel
+from .modeling.models_vit_lora import VisionTransformer_lora, PatchEmbed
+from .modeling.swin_transformer import SwinTransfomerSegHead
class SimpleFPN(nn.Module):
    """Turn one feature map into four maps at 4x, 2x, 1x and 0.5x its spatial size.

    ``out_dims`` gives the channel count of each of the four outputs, in that order.
    """

    def __init__(self, in_dim=768, out_dims=[128, 256, 512, 1024]):
        super().__init__()

        # Two stride-2 transposed convs: spatial size x4.
        self.down_4_chan = max(out_dims[0] * 2, in_dim // 2)
        wide, narrow = self.down_4_chan, self.down_4_chan // 2
        layers_4 = [
            nn.ConvTranspose2d(in_dim, wide, 2, stride=2),
            nn.GroupNorm(1, wide),
            nn.GELU(),
            nn.ConvTranspose2d(wide, narrow, 2, stride=2),
            nn.GroupNorm(1, narrow),
            nn.Conv2d(narrow, out_dims[0], 1),
            nn.GroupNorm(1, out_dims[0]),
            nn.GELU(),
        ]
        self.down_4 = nn.Sequential(*layers_4)

        # One stride-2 transposed conv: spatial size x2.
        self.down_8_chan = max(out_dims[1], in_dim // 2)
        layers_8 = [
            nn.ConvTranspose2d(in_dim, self.down_8_chan, 2, stride=2),
            nn.GroupNorm(1, self.down_8_chan),
            nn.Conv2d(self.down_8_chan, out_dims[1], 1),
            nn.GroupNorm(1, out_dims[1]),
            nn.GELU(),
        ]
        self.down_8 = nn.Sequential(*layers_8)

        # 1x1 projection: spatial size unchanged.
        layers_16 = [
            nn.Conv2d(in_dim, out_dims[2], 1),
            nn.GroupNorm(1, out_dims[2]),
            nn.GELU(),
        ]
        self.down_16 = nn.Sequential(*layers_16)

        # One stride-2 conv: spatial size halved.
        self.down_32_chan = max(out_dims[3], in_dim * 2)
        layers_32 = [
            nn.Conv2d(in_dim, self.down_32_chan, 2, stride=2),
            nn.GroupNorm(1, self.down_32_chan),
            nn.Conv2d(self.down_32_chan, out_dims[3], 1),
            nn.GroupNorm(1, out_dims[3]),
            nn.GELU(),
        ]
        self.down_32 = nn.Sequential(*layers_32)

        self.init_weights()

    def init_weights(self):
        """No-op; layers keep the initialization given by their constructors."""
        pass

    def forward(self, x):
        """Return ``[x_down_4, x_down_8, x_down_16, x_down_32]`` computed from ``x``."""
        branches = (self.down_4, self.down_8, self.down_16, self.down_32)
        return [branch(x) for branch in branches]
class PlainVitModel_lora(ISModel):
    """Interactive-segmentation model: ``VisionTransformer_lora`` backbone, SimpleFPN neck, Swin head."""

    @serialize
    def __init__(
        self,
        backbone_params={},
        neck_params={},
        head_params={},
        random_split=False,
        **kwargs
        ):
        super().__init__(**kwargs)
        self.random_split = random_split

        # Separate patch embedding for the click maps (plus previous mask when enabled).
        coord_channels = 3 if self.with_prev_mask else 2
        self.patch_embed_coords = PatchEmbed(
            img_size=backbone_params['img_size'],
            patch_size=backbone_params['patch_size'],
            in_chans=coord_channels,
            embed_dim=backbone_params['embed_dim'],
        )

        self.backbone = VisionTransformer_lora(**backbone_params)
        self.neck = SimpleFPN(**neck_params)
        self.head = SwinTransfomerSegHead(**head_params)

    def backbone_forward(self, image, coord_features=None, gra=None):
        """Embed the coordinate features, run the backbone, then the neck and the head."""
        coord_tokens = self.patch_embed_coords(coord_features)
        tokens = self.backbone.forward_backbone(image, coord_tokens, gra=gra, shuffle=self.random_split)

        # Extract 4 stage backbone feature map: 1/4, 1/8, 1/16, 1/32
        B, N, C = tokens.shape
        grid_size = self.backbone.patch_embed.grid_size
        # (B, N, C) tokens -> (B, C, H, W) feature map.
        feature_map = tokens.transpose(-1, -2).view(B, C, grid_size[0], grid_size[1])

        pyramid = self.neck(feature_map)
        return {'instances': self.head(pyramid), 'instances_aux': None}

isegm/model/is_segformer_model.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import torch.nn as nn
+from isegm.utils.serialization import serialize
+from .is_model import ISModel
+from isegm.model.modifiers import LRMult
+from .modeling.segformer import MixVisionTransformer, SegformerHead
class SegformerModel(ISModel):
    """Interactive-segmentation model: ``MixVisionTransformer`` backbone with a ``SegformerHead``."""

    @serialize
    def __init__(
        self,
        backbone_params=None,
        decode_head_params=None,
        backbone_lr_mult=0.1,
        **kwargs
        ):
        super().__init__(**kwargs)

        extractor = MixVisionTransformer(**backbone_params)
        self.feature_extractor = extractor
        extractor.apply(LRMult(backbone_lr_mult))

        self.head = SegformerHead(**decode_head_params)

    def backbone_forward(self, image, coord_features=None):
        """Run backbone and head; this model has no auxiliary output (``None``)."""
        features = self.feature_extractor(image, coord_features)
        instances = self.head(features)
        return {'instances': instances, 'instances_aux': None}

isegm/model/is_swinformer_model.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from isegm.utils.serialization import serialize
+from .is_model import ISModel
+from .modeling.swin_transformer import SwinTransformer, SwinTransfomerSegHead
class SwinformerModel(ISModel):
    """Interactive-segmentation model: ``SwinTransformer`` backbone with a Swin segmentation head."""

    @serialize
    def __init__(
        self,
        backbone_params={},
        head_params={},
        **kwargs
        ):
        super().__init__(**kwargs)
        self.backbone = SwinTransformer(**backbone_params)
        self.head = SwinTransfomerSegHead(**head_params)

    def backbone_forward(self, image, coord_features=None):
        """Run backbone and head; this model has no auxiliary output (``None``)."""
        features = self.backbone(image, coord_features)
        instances = self.head(features)
        return {'instances': instances, 'instances_aux': None}

isegm/model/is_text_graco_model.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import torch.nn as nn
+from isegm.utils.serialization import serialize
+from .is_model import ISModel
+from .is_plainvit_model import SimpleFPN
+from .modeling.models_vit import VisionTransformer, PatchEmbed
+from .modeling.twoway_transformer import TwoWayTransformer, PositionEmbeddingRandom
+from .modeling.swin_transformer import SwinTransfomerSegHead
+from .modeling.clip_text_encoding import ClipTextEncoder
class TextGraCoModel(ISModel):
    """Interactive-segmentation model that fuses ViT image tokens with CLIP text features.

    The image tokens and text features pass through a two-way transformer;
    the resulting image tokens are reshaped into a feature map and sent
    through the SimpleFPN neck and the Swin segmentation head.
    """

    @serialize
    def __init__(
        self,
        image_encoder_params={},
        text_encoder_params={},
        cross_encoder_params={},
        neck_params={},
        head_params={},
        random_split=False,
        **kwargs
        ):
        super().__init__(**kwargs)
        self.random_split = random_split

        # Separate patch embedding for the click maps (plus previous mask when enabled).
        coord_channels = 3 if self.with_prev_mask else 2
        self.patch_embed_coords = PatchEmbed(
            img_size=image_encoder_params['img_size'],
            patch_size=image_encoder_params['patch_size'],
            in_chans=coord_channels,
            embed_dim=image_encoder_params['embed_dim'],
        )

        self.image_encoder = VisionTransformer(**image_encoder_params)
        self.text_encoder = ClipTextEncoder(**text_encoder_params)
        self.cross_encoder = TwoWayTransformer(**cross_encoder_params)
        self.pe_layer = PositionEmbeddingRandom(cross_encoder_params["embedding_dim"] // 2)

        # Side length of the token grid; a non-positive patch size is treated as 1.
        patch_size = image_encoder_params['patch_size'][0]
        divisor = patch_size if patch_size > 0 else 1
        self.image_embedding_size = image_encoder_params['img_size'][0] // divisor

        self.neck = SimpleFPN(**neck_params)
        self.head = SwinTransfomerSegHead(**head_params)

    def backbone_forward(self, image, coord_features=None, text=None, gra=None):
        """Encode image and text, cross-attend them, and decode the segmentation output."""
        coord_tokens = self.patch_embed_coords(coord_features)
        image_tokens = self.image_encoder.forward_backbone(image, coord_tokens, gra=gra,
                                                           shuffle=self.random_split)
        text_features = self.text_encoder(text)

        grid_side = self.image_embedding_size
        positional = self.pe_layer((grid_side, grid_side)).unsqueeze(0)
        text_features, image_tokens = self.cross_encoder(image_tokens, positional, text_features)

        # Extract 4 stage image_encoder feature map: 1/4, 1/8, 1/16, 1/32
        B, N, C = image_tokens.shape
        grid_size = self.image_encoder.patch_embed.grid_size
        # (B, N, C) tokens -> (B, C, H, W) feature map.
        feature_map = image_tokens.transpose(-1, -2).view(B, C, grid_size[0], grid_size[1])

        pyramid = self.neck(feature_map)
        return {'instances': self.head(pyramid), 'instances_aux': None}

isegm/model/losses.py ADDED Viewed

	@@ -0,0 +1,195 @@

+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from isegm.utils import misc
class NormalizedFocalLossSigmoid(nn.Module):
    """Focal loss for sigmoid outputs with a per-sample normalized modulating factor.

    The focal factor ``(1 - pt) ** gamma`` is rescaled per sample so that its
    sum over the last two axes matches the number of non-ignored pixels.
    Pixels equal to ``ignore_label`` contribute nothing to the loss. Two
    running statistics (``_k_sum``, ``_m_max``) are tracked for logging.

    Note: ``from_sigmoid=True`` means ``pred`` is used as given, without
    applying a sigmoid (it is stored internally as ``_from_logits``).
    """

    def __init__(self, axis=-1, alpha=0.25, gamma=2, max_mult=-1, eps=1e-12,
                 from_sigmoid=False, detach_delimeter=True,
                 batch_axis=0, weight=None, size_average=True,
                 ignore_label=-1):
        super().__init__()
        self._axis = axis
        self._alpha = alpha
        self._gamma = gamma
        self._ignore_label = ignore_label
        self._weight = 1.0 if weight is None else weight
        self._batch_axis = batch_axis
        self._from_logits = from_sigmoid
        self._eps = eps
        self._size_average = size_average
        self._detach_delimeter = detach_delimeter
        self._max_mult = max_mult
        # Running statistics reported by log_states.
        self._k_sum = 0
        self._m_max = 0

    def forward(self, pred, label):
        """Return the loss reduced over every axis except the batch axis."""
        positive = label > 0.5
        valid = label != self._ignore_label
        if not self._from_logits:
            pred = torch.sigmoid(pred)

        alpha = torch.where(positive, self._alpha * valid, (1 - self._alpha) * valid)
        # Ignored pixels get pt = 1, i.e. a zero focal factor.
        pt = torch.where(valid, 1.0 - torch.abs(label - pred), torch.ones_like(pred))
        beta = (1 - pt) ** self._gamma

        # Normalize beta so its sum over the last two axes equals the valid-pixel count.
        last_two = (-2, -1)
        valid_count = torch.sum(valid, dim=last_two, keepdim=True)
        beta_total = torch.sum(beta, dim=last_two, keepdim=True)
        mult = valid_count / (beta_total + self._eps)
        if self._detach_delimeter:
            mult = mult.detach()
        beta = beta * mult
        if self._max_mult > 0:
            beta = torch.clamp_max(beta, self._max_mult)

        self._update_running_stats(label, mult, beta)

        # Clamp the log argument to at most 1.
        one = torch.ones(1, dtype=torch.float).to(pt.device)
        log_pt = torch.log(torch.min(pt + self._eps, one))
        loss = -alpha * beta * log_pt
        loss = self._weight * (loss * valid)

        loss_total = torch.sum(loss, dim=misc.get_dims_with_exclusion(loss.dim(), self._batch_axis))
        if not self._size_average:
            return loss_total
        bsum = torch.sum(valid, dim=misc.get_dims_with_exclusion(valid.dim(), self._batch_axis))
        return loss_total / (bsum + self._eps)

    def _update_running_stats(self, label, mult, beta):
        """Update the moving averages using only samples that contain no ignored pixels."""
        with torch.no_grad():
            non_batch_label = tuple(range(1, label.dim()))
            non_batch_mult = tuple(range(1, mult.dim()))
            ignore_area = torch.sum(label == self._ignore_label, dim=non_batch_label).cpu().numpy()
            sample_mult = torch.mean(mult, dim=non_batch_mult).cpu().numpy()

            fully_labeled = ignore_area == 0
            if not np.any(fully_labeled):
                return

            self._k_sum = 0.9 * self._k_sum + 0.1 * sample_mult[fully_labeled].mean()
            beta_pmax, _ = torch.flatten(beta, start_dim=1).max(dim=1)
            beta_pmax = beta_pmax.mean().item()
            self._m_max = 0.8 * self._m_max + 0.2 * beta_pmax

    def log_states(self, sw, name, global_step):
        """Write both running statistics to the summary writer ``sw``."""
        for suffix, stat in (('_k', self._k_sum), ('_m', self._m_max)):
            sw.add_scalar(tag=name + suffix, value=stat, global_step=global_step)
class FocalLoss(nn.Module):
    """Binary focal loss with an ignore label.

    Parameters
    ----------
    axis, num_class : stored but not read by ``forward``.
    alpha : weight of positive pixels; negatives get ``1 - alpha``.
    gamma : exponent of the focal term ``(1 - pt) ** gamma``.
    from_logits : when True ``pred`` is used as-is, otherwise
        ``torch.sigmoid`` is applied first.
    batch_axis : axis excluded from the final reduction.
    weight : scalar multiplier of the loss (1.0 when None).
    eps : small constant guarding the logarithm and the division.
    size_average : divide the summed loss by the number of valid pixels.
    scale : final multiplier applied to the returned loss.
    ignore_label : label value excluded from the loss.
    """

    def __init__(self, axis=-1, alpha=0.25, gamma=2,
                 from_logits=False, batch_axis=0,
                 weight=None, num_class=None,
                 eps=1e-9, size_average=True, scale=1.0,
                 ignore_label=-1):
        super().__init__()
        self._axis = axis
        self._alpha = alpha
        self._gamma = gamma
        self._ignore_label = ignore_label
        self._weight = 1.0 if weight is None else weight
        self._batch_axis = batch_axis
        self._scale = scale
        self._num_class = num_class
        self._from_logits = from_logits
        self._eps = eps
        self._size_average = size_average

    def forward(self, pred, label, sample_weight=None):
        """Return ``scale`` times the focal loss of ``pred`` against ``label``.

        NOTE(review): the ``sample_weight`` argument is never consulted; the
        weighting mask is always derived from ``label != ignore_label``.
        """
        positive = label > 0.5
        valid = label != self._ignore_label
        if not self._from_logits:
            pred = torch.sigmoid(pred)

        alpha = torch.where(positive, self._alpha * valid, (1 - self._alpha) * valid)
        # Ignored pixels get pt == 1 so their focal term vanishes.
        pt = torch.where(valid, 1.0 - torch.abs(label - pred), torch.ones_like(pred))
        beta = (1 - pt) ** self._gamma

        unit = torch.ones(1, dtype=torch.float).to(pt.device)
        loss = -alpha * beta * torch.log(torch.min(pt + self._eps, unit))
        loss = self._weight * (loss * valid)

        reduced = torch.sum(loss, dim=misc.get_dims_with_exclusion(loss.dim(), self._batch_axis))
        if self._size_average:
            valid_total = torch.sum(
                valid, dim=misc.get_dims_with_exclusion(label.dim(), self._batch_axis)
            )
            reduced = reduced / (valid_total + self._eps)
        return self._scale * reduced
class SoftIoU(nn.Module):
    """Soft intersection-over-union loss, ``1 - sum(p * y) / sum(max(p, y))``.

    Pixels whose label equals ``ignore_label`` are masked out of both sums.
    ``from_sigmoid=True`` means ``pred`` is used as-is; otherwise
    ``torch.sigmoid`` is applied first.
    """

    def __init__(self, from_sigmoid=False, ignore_label=-1):
        super().__init__()
        self._from_sigmoid = from_sigmoid
        self._ignore_label = ignore_label

    def forward(self, pred, label):
        """Return one loss value per batch element (sums run over dims 1, 2, 3)."""
        target = label.view(pred.size())
        valid = target != self._ignore_label
        probs = pred if self._from_sigmoid else torch.sigmoid(pred)

        reduce_dims = (1, 2, 3)
        soft_intersection = torch.sum(probs * target * valid, dim=reduce_dims)
        soft_union = torch.sum(torch.max(probs, target) * valid, dim=reduce_dims)
        return 1.0 - soft_intersection / (soft_union + 1e-8)
class SigmoidBinaryCrossEntropyLoss(nn.Module):
    """Binary cross-entropy with an ignore label.

    With ``from_sigmoid=False`` the loss is computed directly from logits via
    ``relu(x) - x * y + softplus(-|x|)``; with ``from_sigmoid=True`` ``pred``
    is treated as a probability and the log form is used with a 1e-12 guard.
    ``weight`` (1.0 when None) scales the loss.
    """

    def __init__(self, from_sigmoid=False, weight=None, batch_axis=0, ignore_label=-1):
        super().__init__()
        self._from_sigmoid = from_sigmoid
        self._ignore_label = ignore_label
        self._weight = 1.0 if weight is None else weight
        self._batch_axis = batch_axis

    def forward(self, pred, label):
        """Return the mean loss over the dims given by
        ``misc.get_dims_with_exclusion`` (presumably all but ``batch_axis``).

        Ignored pixels contribute zero but still count in the mean's denominator.
        """
        label = label.view(pred.size())
        valid = label != self._ignore_label
        # Replace the ignore value with 0 so it cannot leak into the formula.
        target = torch.where(valid, label, torch.zeros_like(label))

        if self._from_sigmoid:
            eps = 1e-12
            loss = -(torch.log(pred + eps) * target
                     + torch.log(1. - pred + eps) * (1. - target))
        else:
            loss = torch.relu(pred) - pred * target + F.softplus(-torch.abs(pred))

        loss = self._weight * (loss * valid)
        mean_dims = misc.get_dims_with_exclusion(loss.dim(), self._batch_axis)
        return torch.mean(loss, dim=mean_dims)
class BinaryDiceLoss(nn.Module):
    """Dice loss for binary segmentation."""

    def forward(self, pred, label):
        """Return the batch-mean Dice loss.

        Parameters
        ----------
        pred : tensor with the class scores on dim 1.
        label : tensor that flattens to the same per-sample size as the
            channel-reduced prediction.

        Returns
        -------
        Scalar tensor ``mean(1 - (2 * |P.T| + eps) / (|P|^2 + |T|^2 + eps))``.
        """
        batch_size = pred.size(0)

        # Reduce over the channel axis: keep the winning score and multiply it
        # by the winning channel index, so pixels where channel 0 wins become 0.
        # Done out of place so the tensor returned by ``max`` is not mutated.
        scores, winners = pred.max(1)
        input_pred = (scores * winners.float()).float()
        target_label = label.float()

        # Flatten everything but the batch axis.
        input_pred = input_pred.view(batch_size, -1)
        target_label = target_label.view(batch_size, -1)

        intersect = torch.sum(input_pred * target_label, 1)
        input_area = torch.sum(input_pred * input_pred, 1)
        target_area = torch.sum(target_label * target_label, 1)
        # Named ``area_total`` rather than ``sum`` to avoid shadowing the builtin.
        area_total = input_area + target_area

        # eps in numerator and denominator makes an empty prediction on an
        # empty target score a loss of 0 instead of dividing by zero.
        epsilon = 1e-6
        batch_loss = 1.0 - (2.0 * intersect + epsilon) / (area_total + epsilon)
        return batch_loss.mean()

isegm/model/metrics.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import torch
+import numpy as np
+from isegm.utils import misc
class TrainMetric(object):
    """Base class for training metrics.

    Stores the names of the prediction and ground-truth outputs the metric
    consumes. ``update``, ``get_epoch_value`` and ``reset_epoch_stats`` must
    be overridden; ``log_states`` is an optional hook that does nothing here.
    """

    def __init__(self, pred_outputs, gt_outputs):
        self.pred_outputs, self.gt_outputs = pred_outputs, gt_outputs

    def update(self, *args, **kwargs):
        """Accumulate statistics for one batch (abstract)."""
        raise NotImplementedError

    def get_epoch_value(self):
        """Return the metric value for the current epoch (abstract)."""
        raise NotImplementedError

    def reset_epoch_stats(self):
        """Clear the per-epoch accumulators (abstract)."""
        raise NotImplementedError

    def log_states(self, sw, tag_prefix, global_step):
        """Optional logging hook; the base implementation is a no-op."""
        return None

    @property
    def name(self):
        """Name of the concrete metric class."""
        return self.__class__.__name__
class AdaptiveIoU(TrainMetric):
    """IoU metric whose binarisation threshold follows the best local choice.

    On every update the batch IoU is evaluated at the current threshold and
    at ``threshold -/+ thresh_step``; the threshold is then moved towards the
    best of the three with an exponential moving average (``thresh_beta``).
    A second moving average (``iou_beta``) tracks the best IoU itself.
    """

    def __init__(self, init_thresh=0.4, thresh_step=0.025, thresh_beta=0.99, iou_beta=0.9,
                 ignore_label=-1, from_logits=True,
                 pred_output='instances', gt_output='instances'):
        super().__init__(pred_outputs=(pred_output,), gt_outputs=(gt_output,))
        self._ignore_label = ignore_label
        self._from_logits = from_logits
        self._iou_thresh = init_thresh
        self._thresh_step = thresh_step
        self._thresh_beta = thresh_beta
        self._iou_beta = iou_beta
        self._ema_iou = 0.0
        self._epoch_iou_sum = 0.0
        self._epoch_batch_count = 0

    def update(self, pred, gt):
        """Fold one batch into the running statistics.

        Batches whose ground truth contains no positive pixel are skipped.
        """
        gt_mask = gt > 0.5
        if self._from_logits:
            pred = torch.sigmoid(pred)

        gt_mask_area = torch.sum(gt_mask, dim=(1, 2)).detach().cpu().numpy()
        if np.all(gt_mask_area == 0):
            return

        ignore_mask = gt == self._ignore_label

        def batch_iou(threshold):
            return _compute_iou(pred > threshold, gt_mask, ignore_mask).mean()

        current = self._iou_thresh
        best_thresh = current
        max_iou = batch_iou(current)
        # Probe one step below and one step above the current threshold.
        for candidate in (current - self._thresh_step, current + self._thresh_step):
            candidate_iou = batch_iou(candidate)
            if candidate_iou > max_iou:
                max_iou, best_thresh = candidate_iou, candidate

        self._iou_thresh = self._thresh_beta * self._iou_thresh + (1 - self._thresh_beta) * best_thresh
        self._ema_iou = self._iou_beta * self._ema_iou + (1 - self._iou_beta) * max_iou
        self._epoch_iou_sum += max_iou
        self._epoch_batch_count += 1

    def get_epoch_value(self):
        """Mean of the best per-batch IoU over the epoch, or 0.0 if no batch counted."""
        if self._epoch_batch_count <= 0:
            return 0.0
        return self._epoch_iou_sum / self._epoch_batch_count

    def reset_epoch_stats(self):
        """Zero the per-epoch accumulators (the moving averages are kept)."""
        self._epoch_iou_sum = 0.0
        self._epoch_batch_count = 0

    def log_states(self, sw, tag_prefix, global_step):
        """Write the moving-average IoU and the current threshold to ``sw``."""
        for suffix, value in (('_ema_iou', self._ema_iou), ('_iou_thresh', self._iou_thresh)):
            sw.add_scalar(tag=tag_prefix + suffix, value=value, global_step=global_step)

    @property
    def iou_thresh(self):
        """Current binarisation threshold."""
        return self._iou_thresh
def _compute_iou(pred_mask, gt_mask, ignore_mask=None, keep_ignore=False):
    """Compute per-sample IoU between boolean masks as a NumPy array.

    Parameters
    ----------
    pred_mask, gt_mask : boolean tensors of equal shape.
    ignore_mask : optional boolean tensor; predictions are cleared where it is True.
    keep_ignore : when False, samples with an empty union are dropped from the
        result; when True they are kept and reported as -1.
    """
    if ignore_mask is not None:
        pred_mask = torch.where(ignore_mask, torch.zeros_like(pred_mask), pred_mask)

    reduction_dims = misc.get_dims_with_exclusion(gt_mask.dim(), 0)

    def covered_fraction(mask):
        # Means are used instead of sums; the ratio below is unaffected.
        return torch.mean(mask.float(), dim=reduction_dims).detach().cpu().numpy()

    union = covered_fraction(pred_mask | gt_mask)
    intersection = covered_fraction(pred_mask & gt_mask)

    has_union = union > 0
    iou = intersection[has_union] / union[has_union]
    if keep_ignore:
        result = np.full_like(intersection, -1)
        result[has_union] = iou
        return result
    return iou

isegm/model/modeling/__init__.py ADDED Viewed

File without changes

isegm/model/modeling/basic_blocks.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import torch.nn as nn
+from isegm.model import ops
class ConvHead(nn.Module):
    """Stack of ``num_layers`` conv -> ReLU -> norm stages plus a 1x1 output conv.

    Every stage keeps ``in_channels`` channels; only the final 1x1 convolution
    maps to ``out_channels``. Passing ``norm_layer=None`` substitutes
    ``nn.Identity`` for the normalisation.
    """

    def __init__(self, out_channels, in_channels=32, num_layers=1,
                 kernel_size=3, padding=1,
                 norm_layer=nn.BatchNorm2d):
        super().__init__()
        stages = []
        for _ in range(num_layers):
            stages.append(nn.Conv2d(in_channels, in_channels, kernel_size, padding=padding))
            stages.append(nn.ReLU())
            if norm_layer is None:
                stages.append(nn.Identity())
            else:
                stages.append(norm_layer(in_channels))
        stages.append(nn.Conv2d(in_channels, out_channels, 1, padding=0))
        self.convhead = nn.Sequential(*stages)

    def forward(self, *inputs):
        """Apply the head to the first positional input; the rest are ignored."""
        features = inputs[0]
        return self.convhead(features)
class SepConvHead(nn.Module):
    """Head built from separable convolutions followed by a 1x1 output conv.

    The first separable stage maps ``in_channels`` to ``mid_channels``; later
    stages keep ``mid_channels``. When ``dropout_ratio > 0`` a dropout layer
    is inserted right after stage number ``dropout_indx``.
    """

    def __init__(self, num_outputs, in_channels, mid_channels, num_layers=1,
                 kernel_size=3, padding=1, dropout_ratio=0.0, dropout_indx=0,
                 norm_layer=nn.BatchNorm2d):
        super().__init__()
        stages = []
        for stage_idx in range(num_layers):
            stage_in = in_channels if stage_idx == 0 else mid_channels
            stages.append(
                SeparableConv2d(in_channels=stage_in,
                                out_channels=mid_channels,
                                dw_kernel=kernel_size, dw_padding=padding,
                                norm_layer=norm_layer, activation='relu')
            )
            wants_dropout = dropout_ratio > 0 and dropout_indx == stage_idx
            if wants_dropout:
                stages.append(nn.Dropout(dropout_ratio))

        projection = nn.Conv2d(in_channels=mid_channels, out_channels=num_outputs,
                               kernel_size=1, padding=0)
        stages.append(projection)
        self.layers = nn.Sequential(*stages)

    def forward(self, *inputs):
        """Apply the head to the first positional input; the rest are ignored."""
        return self.layers(inputs[0])
class SeparableConv2d(nn.Module):
    """Depthwise convolution followed by a 1x1 pointwise convolution.

    The pair is followed by ``norm_layer(out_channels)`` (``nn.Identity`` when
    ``norm_layer`` is None) and by the activation class returned from
    ``ops.select_activation_function(activation)``.
    """

    def __init__(self, in_channels, out_channels, dw_kernel, dw_padding, dw_stride=1,
                 activation=None, use_bias=False, norm_layer=None):
        super().__init__()
        activation_cls = ops.select_activation_function(activation)

        # groups == in_channels makes this convolution depthwise.
        depthwise = nn.Conv2d(in_channels, in_channels, kernel_size=dw_kernel, stride=dw_stride,
                              padding=dw_padding, bias=use_bias, groups=in_channels)
        pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=use_bias)
        if norm_layer is None:
            norm = nn.Identity()
        else:
            norm = norm_layer(out_channels)

        self.body = nn.Sequential(depthwise, pointwise, norm, activation_cls())

    def forward(self, x):
        """Run ``x`` through depthwise conv, pointwise conv, norm and activation."""
        return self.body(x)

isegm/model/modeling/clip/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .clip import *

isegm/model/modeling/clip/clip.py ADDED Viewed

	@@ -0,0 +1,245 @@

+import hashlib
+import os
+import urllib
+import warnings
+from typing import Any, Union, List
+from pkg_resources import packaging
+import torch
+from PIL import Image
+from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
+from tqdm import tqdm
+from .model import build_model
+from .simple_tokenizer import SimpleTokenizer as _Tokenizer
# Pick the bicubic resampling constant: use torchvision's InterpolationMode
# when it can be imported, otherwise fall back to the PIL constant.
try:
    from torchvision.transforms import InterpolationMode
    BICUBIC = InterpolationMode.BICUBIC
except ImportError:
    BICUBIC = Image.BICUBIC
# Only a warning is issued on older PyTorch versions; import still proceeds.
if packaging.version.parse(torch.__version__) < packaging.version.parse("1.7.1"):
    warnings.warn("PyTorch version 1.7.1 or higher is recommended")
# Public API of this module.
__all__ = ["available_models", "load", "tokenize"]
# Shared tokenizer instance used by `tokenize`.
_tokenizer = _Tokenizer()
# Model name -> checkpoint URL. `_download` reads the second-to-last path
# component of each URL as the expected SHA256 of the file.
_MODELS = {
    "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
    "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
    "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
    "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",
    "RN50x64": "https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt",
    "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
    "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
    "ViT-L/14": "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt",
    "ViT-L/14@336px": "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt",
}
+def _download(url: str, root: str):
+    os.makedirs(root, exist_ok=True)
+    filename = os.path.basename(url)
+    expected_sha256 = url.split("/")[-2]
+    download_target = os.path.join(root, filename)
+    if os.path.exists(download_target) and not os.path.isfile(download_target):
+        raise RuntimeError(f"{download_target} exists and is not a regular file")
+    if os.path.isfile(download_target):
+        if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
+            return download_target
+        else:
+            warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
+    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
+        with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
+            while True:
+                buffer = source.read(8192)
+                if not buffer:
+                    break
+                output.write(buffer)
+                loop.update(len(buffer))
+    if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
+        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not not match")
+    return download_target
+def _convert_image_to_rgb(image):
+    return image.convert("RGB")
def _transform(n_px):
    """Build the preprocessing pipeline for inputs of side ``n_px``.

    Steps, in order: bicubic resize to ``n_px``, centre crop to ``n_px``,
    RGB conversion, tensor conversion, per-channel normalisation.
    """
    channel_mean = (0.48145466, 0.4578275, 0.40821073)
    channel_std = (0.26862954, 0.26130258, 0.27577711)
    steps = [
        Resize(n_px, interpolation=BICUBIC),
        CenterCrop(n_px),
        _convert_image_to_rgb,
        ToTensor(),
        Normalize(channel_mean, channel_std),
    ]
    return Compose(steps)
def available_models() -> List[str]:
    """Return the model names known to this module, in registry order."""
    return [model_name for model_name in _MODELS]
def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit: bool = False, download_root: str = None):
    """Load a CLIP model

    The checkpoint is first tried as a TorchScript archive; if that raises
    RuntimeError it is loaded as a plain state dict instead. With ``jit=False``
    (or after that fallback) a regular module is rebuilt via ``build_model``.
    With ``jit=True`` the scripted graphs are patched in place so that device
    constants (and, on CPU, dtype constants) match the requested device.

    Note that the default for ``device`` is evaluated once, when this function
    is defined.

    Parameters
    ----------
    name : str
        A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict
    device : Union[str, torch.device]
        The device to put the loaded model
    jit : bool
        Whether to load the optimized JIT model or more hackable non-JIT model (default).
    download_root: str
        path to download the model files; by default, it uses "~/.cache/clip"
    Returns
    -------
    model : torch.nn.Module
        The CLIP model
    preprocess : Callable[[PIL.Image], torch.Tensor]
        A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input

    Raises
    ------
    RuntimeError
        If ``name`` is neither a known model name nor an existing file.
    """
    # Resolve `name` to a local checkpoint path, downloading known models on demand.
    if name in _MODELS:
        model_path = _download(_MODELS[name], download_root or os.path.expanduser("~/.cache/clip"))
    elif os.path.isfile(name):
        model_path = name
    else:
        raise RuntimeError(f"Model {name} not found; available models = {available_models()}")
    with open(model_path, 'rb') as opened_file:
        try:
            # loading JIT archive
            # (kept on CPU unless the scripted model itself is what will be returned)
            model = torch.jit.load(opened_file, map_location=device if jit else "cpu").eval()
            state_dict = None
        except RuntimeError:
            # loading saved state dict
            if jit:
                warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
                jit = False
            state_dict = torch.load(opened_file, map_location="cpu")
    if not jit:
        # Rebuild a regular nn.Module from either the state dict or the scripted model's weights.
        model = build_model(state_dict or model.state_dict()).to(device)
        if str(device) == "cpu":
            model.float()
        return model, _transform(model.visual.input_resolution)
    # patch the device names
    # Trace a tiny function to obtain a graph constant carrying the target device.
    device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
    device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]
    def _node_get(node: torch._C.Node, key: str):
        """Gets attributes of a node which is polymorphic over return type.
        From https://github.com/pytorch/pytorch/pull/82628
        """
        sel = node.kindOf(key)
        return getattr(node, sel)(key)
    def patch_device(module):
        # Collect the graphs to rewrite; accessing `.graph` may raise RuntimeError.
        try:
            graphs = [module.graph] if hasattr(module, "graph") else []
        except RuntimeError:
            graphs = []
        if hasattr(module, "forward1"):
            graphs.append(module.forward1.graph)
        for graph in graphs:
            for node in graph.findAllNodes("prim::Constant"):
                # Replace every constant whose value starts with "cuda" by the target device constant.
                if "value" in node.attributeNames() and str(_node_get(node, "value")).startswith("cuda"):
                    node.copyAttributes(device_node)
    model.apply(patch_device)
    patch_device(model.encode_image)
    patch_device(model.encode_text)
    # patch dtype to float32 on CPU
    if str(device) == "cpu":
        # Trace a `.float()` call to obtain the graph constant for the float32 dtype argument.
        float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
        float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
        float_node = float_input.node()
        def patch_float(module):
            try:
                graphs = [module.graph] if hasattr(module, "graph") else []
            except RuntimeError:
                graphs = []
            if hasattr(module, "forward1"):
                graphs.append(module.forward1.graph)
            for graph in graphs:
                for node in graph.findAllNodes("aten::to"):
                    inputs = list(node.inputs())
                    for i in [1, 2]:  # dtype can be the second or third argument to aten::to()
                        # presumably 5 is the dtype code for float16 — TODO confirm
                        if _node_get(inputs[i].node(), "value") == 5:
                            inputs[i].node().copyAttributes(float_node)
        model.apply(patch_float)
        patch_float(model.encode_image)
        patch_float(model.encode_text)
        model.float()
    return model, _transform(model.input_resolution.item())
def tokenize(texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False) -> Union[torch.IntTensor, torch.LongTensor]:
    """
    Tokenize one string or a list of strings into a zero-padded tensor.

    Each text is wrapped in start-of-text / end-of-text tokens and written
    left-aligned into a row of length ``context_length``.

    Parameters
    ----------
    texts : Union[str, List[str]]
        An input string or a list of input strings to tokenize
    context_length : int
        Row length of the result (77 by default)
    truncate: bool
        When True, over-long encodings are cut to ``context_length`` and their
        last token is replaced by the end-of-text token; when False they raise

    Returns
    -------
    A two-dimensional tensor of shape [number of input strings, context_length].
    The dtype is ``torch.long`` for torch < 1.8.0 and ``torch.int`` otherwise.

    Raises
    ------
    RuntimeError
        If an encoding exceeds ``context_length`` and ``truncate`` is False.
    """
    if isinstance(texts, str):
        texts = [texts]

    sot_token = _tokenizer.encoder["<|startoftext|>"]
    eot_token = _tokenizer.encoder["<|endoftext|>"]
    encoded = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]

    legacy_torch = packaging.version.parse(torch.__version__) < packaging.version.parse("1.8.0")
    if legacy_torch:
        result = torch.zeros(len(encoded), context_length, dtype=torch.long)
    else:
        result = torch.zeros(len(encoded), context_length, dtype=torch.int)

    for row, tokens in enumerate(encoded):
        if len(tokens) > context_length:
            if not truncate:
                raise RuntimeError(f"Input {texts[row]} is too long for context length {context_length}")
            tokens = tokens[:context_length]
            tokens[-1] = eot_token
        result[row, :len(tokens)] = torch.tensor(tokens)
    return result

isegm/model/modeling/clip/model.py ADDED Viewed

	@@ -0,0 +1,436 @@

+from collections import OrderedDict
+from typing import Tuple, Union
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1 conv, 3x3 conv, optional avgpool, 1x1 conv.

    All convolutions use stride 1. When ``stride > 1`` the spatial reduction
    is done by an average pool after the second convolution, and by another
    average pool at the start of the shortcut branch. The block outputs
    ``planes * expansion`` channels.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1):
        super().__init__()
        out_planes = planes * self.expansion

        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu1 = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu2 = nn.ReLU(inplace=True)

        # Spatial downsampling of the main branch, if any, happens here.
        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()

        self.conv3 = nn.Conv2d(planes, out_planes, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)
        self.relu3 = nn.ReLU(inplace=True)

        self.downsample = None
        self.stride = stride

        needs_projection = stride > 1 or inplanes != planes * Bottleneck.expansion
        if needs_projection:
            # Shortcut branch: avgpool first, then a stride-1 1x1 conv and batch norm.
            shortcut = OrderedDict()
            shortcut["-1"] = nn.AvgPool2d(stride)
            shortcut["0"] = nn.Conv2d(inplanes, out_planes, 1, stride=1, bias=False)
            shortcut["1"] = nn.BatchNorm2d(out_planes)
            self.downsample = nn.Sequential(shortcut)

    def forward(self, x: torch.Tensor):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        out = self.conv1(x)
        out = self.relu1(self.bn1(out))
        out = self.conv2(out)
        out = self.relu2(self.bn2(out))
        out = self.avgpool(out)
        out = self.bn3(self.conv3(out))

        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu3(out)
class AttentionPool2d(nn.Module):
    """Pool a feature map to one vector per sample with multi-head attention.

    The mean over all spatial positions is prepended as an extra token and
    used as the single query; the keys and values are all tokens. The output
    projection maps to ``output_dim`` (``embed_dim`` when it is None).
    """

    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
        super().__init__()
        num_tokens = spacial_dim ** 2 + 1  # spatial positions plus the mean token
        self.positional_embedding = nn.Parameter(torch.randn(num_tokens, embed_dim) / embed_dim ** 0.5)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
        self.num_heads = num_heads

    def forward(self, x):
        """Map an NCHW tensor to an (N, output_dim) tensor."""
        tokens = x.flatten(start_dim=2).permute(2, 0, 1)  # NCHW -> (HW)NC
        mean_token = tokens.mean(dim=0, keepdim=True)
        tokens = torch.cat([mean_token, tokens], dim=0)  # (HW+1)NC
        tokens = tokens + self.positional_embedding[:, None, :].to(tokens.dtype)  # (HW+1)NC

        stacked_bias = torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias])
        pooled, _ = F.multi_head_attention_forward(
            query=tokens[:1], key=tokens, value=tokens,
            embed_dim_to_check=tokens.shape[-1],
            num_heads=self.num_heads,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
            in_proj_weight=None,
            in_proj_bias=stacked_bias,
            bias_k=None,
            bias_v=None,
            add_zero_attn=False,
            dropout_p=0,
            out_proj_weight=self.c_proj.weight,
            out_proj_bias=self.c_proj.bias,
            use_separate_proj_weight=True,
            training=self.training,
            need_weights=False
        )
        return pooled.squeeze(0)
class ModifiedResNet(nn.Module):
    """
    ResNet variant with these differences from torchvision's:
    - a 3-convolution stem followed by an average pool (not a max pool);
    - strided stages downsample via an avgpool placed before stride-1 convolutions;
    - the final pooling is QKV attention (``AttentionPool2d``) instead of an average pool.
    """

    def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
        super().__init__()
        self.output_dim = output_dim
        self.input_resolution = input_resolution
        stem_width = width // 2

        # Stem: three 3x3 convolutions, the first with stride 2, then a 2x2 avgpool.
        self.conv1 = nn.Conv2d(3, stem_width, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(stem_width)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(stem_width, stem_width, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(stem_width)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = nn.Conv2d(stem_width, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(width)
        self.relu3 = nn.ReLU(inplace=True)
        self.avgpool = nn.AvgPool2d(2)

        # Residual stages; `_inplanes` is updated by `_make_layer` as stages are built.
        self._inplanes = width
        self.layer1 = self._make_layer(width, layers[0])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

        feature_dim = width * 32  # channel count produced by layer4
        self.attnpool = AttentionPool2d(input_resolution // 32, feature_dim, heads, output_dim)

    def _make_layer(self, planes, blocks, stride=1):
        """Build one stage: a (possibly strided) block followed by ``blocks - 1`` stride-1 blocks."""
        stage = [Bottleneck(self._inplanes, planes, stride)]
        self._inplanes = planes * Bottleneck.expansion
        stage.extend(Bottleneck(self._inplanes, planes) for _ in range(1, blocks))
        return nn.Sequential(*stage)

    def forward(self, x):
        """Run stem, the four residual stages and the attention pool."""
        # Match the input dtype to the weights' dtype.
        x = x.type(self.conv1.weight.dtype)

        stem_layers = (
            (self.conv1, self.bn1, self.relu1),
            (self.conv2, self.bn2, self.relu2),
            (self.conv3, self.bn3, self.relu3),
        )
        for conv, norm, act in stem_layers:
            x = act(norm(conv(x)))
        x = self.avgpool(x)

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        return self.attnpool(x)
class LayerNorm(nn.LayerNorm):
    """LayerNorm that computes in float32 and returns the input's dtype (fp16-safe)."""

    def forward(self, x: torch.Tensor):
        incoming_dtype = x.dtype
        normalised = super().forward(x.type(torch.float32))
        return normalised.type(incoming_dtype)
class QuickGELU(nn.Module):
    """Sigmoid-gated activation ``x * sigmoid(1.702 * x)``."""

    def forward(self, x: torch.Tensor):
        gate = torch.sigmoid(1.702 * x)
        return x * gate
class ResidualAttentionBlock(nn.Module):
    """Pre-norm transformer block: self-attention then a 4x-wide MLP, each with a residual.

    ``attn_mask`` is an optional mask passed to the attention on every call;
    it is kept as a plain attribute and re-cast to the input's dtype/device
    each time ``attention`` runs.
    """

    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
        super().__init__()
        hidden = d_model * 4
        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ln_1 = LayerNorm(d_model)
        self.mlp = nn.Sequential(OrderedDict([
            ("c_fc", nn.Linear(d_model, hidden)),
            ("gelu", QuickGELU()),
            ("c_proj", nn.Linear(hidden, d_model))
        ]))
        self.ln_2 = LayerNorm(d_model)
        self.attn_mask = attn_mask

    def attention(self, x: torch.Tensor):
        """Self-attend over ``x`` using the stored mask, returning only the output."""
        mask = self.attn_mask
        if mask is not None:
            mask = mask.to(dtype=x.dtype, device=x.device)
        self.attn_mask = mask
        attended = self.attn(x, x, x, need_weights=False, attn_mask=mask)
        return attended[0]

    def forward(self, x: torch.Tensor):
        x = x + self.attention(self.ln_1(x))
        return x + self.mlp(self.ln_2(x))
class Transformer(nn.Module):
    """Sequential stack of ``layers`` residual attention blocks of width ``width``.

    The same ``attn_mask`` object is handed to every block.
    """

    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
        super().__init__()
        self.width = width
        self.layers = layers
        blocks = []
        for _ in range(layers):
            blocks.append(ResidualAttentionBlock(width, heads, attn_mask))
        self.resblocks = nn.Sequential(*blocks)

    def forward(self, x: torch.Tensor):
        """Pass ``x`` through every block in order."""
        return self.resblocks(x)
class VisionTransformer(nn.Module):
    """Patch-based vision transformer returning one embedding per image.

    The image is split into non-overlapping patches by a strided convolution,
    a class token is prepended, positional embeddings are added, and the
    transformer output at the class-token position is layer-normalised and
    projected to ``output_dim``.
    """

    def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
        super().__init__()
        self.input_resolution = input_resolution
        self.output_dim = output_dim
        # kernel_size == stride, so each output position covers exactly one patch.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)

        scale = width ** -0.5
        num_tokens = (input_resolution // patch_size) ** 2 + 1  # patches plus the class token
        self.class_embedding = nn.Parameter(scale * torch.randn(width))
        self.positional_embedding = nn.Parameter(scale * torch.randn(num_tokens, width))
        self.ln_pre = LayerNorm(width)
        self.transformer = Transformer(width, layers, heads)
        self.ln_post = LayerNorm(width)
        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

    def forward(self, x: torch.Tensor):
        x = self.conv1(x)  # shape = [*, width, grid, grid]
        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]

        # Broadcast the class embedding to one token per batch element.
        class_tokens = self.class_embedding.to(x.dtype) + torch.zeros(
            x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device
        )
        x = torch.cat([class_tokens, x], dim=1)  # shape = [*, grid ** 2 + 1, width]
        x = x + self.positional_embedding.to(x.dtype)
        x = self.ln_pre(x)

        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD

        # Only the class-token position is kept.
        x = self.ln_post(x[:, 0, :])
        if self.proj is None:
            return x
        return x @ self.proj
+class CLIP(nn.Module):
+    def __init__(self,
+                 embed_dim: int,
+                 # vision
+                 image_resolution: int,
+                 vision_layers: Union[Tuple[int, int, int, int], int],
+                 vision_width: int,
+                 vision_patch_size: int,
+                 # text
+                 context_length: int,
+                 vocab_size: int,
+                 transformer_width: int,
+                 transformer_heads: int,
+                 transformer_layers: int
+                 ):
+        super().__init__()
+        self.context_length = context_length
+        if isinstance(vision_layers, (tuple, list)):
+            vision_heads = vision_width * 32 // 64
+            self.visual = ModifiedResNet(
+                layers=vision_layers,
+                output_dim=embed_dim,
+                heads=vision_heads,
+                input_resolution=image_resolution,
+                width=vision_width
+            )
+        else:
+            vision_heads = vision_width // 64
+            self.visual = VisionTransformer(
+                input_resolution=image_resolution,
+                patch_size=vision_patch_size,
+                width=vision_width,
+                layers=vision_layers,
+                heads=vision_heads,
+                output_dim=embed_dim
+            )
+        self.transformer = Transformer(
+            width=transformer_width,
+            layers=transformer_layers,
+            heads=transformer_heads,
+            attn_mask=self.build_attention_mask()
+        )
+        self.vocab_size = vocab_size
+        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
+        self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
+        self.ln_final = LayerNorm(transformer_width)
+        self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.initialize_parameters()
+    def initialize_parameters(self):
+        nn.init.normal_(self.token_embedding.weight, std=0.02)
+        nn.init.normal_(self.positional_embedding, std=0.01)
+        if isinstance(self.visual, ModifiedResNet):
+            if self.visual.attnpool is not None:
+                std = self.visual.attnpool.c_proj.in_features ** -0.5
+                nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
+            for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
+                for name, param in resnet_block.named_parameters():
+                    if name.endswith("bn3.weight"):
+                        nn.init.zeros_(param)
+        proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
+        attn_std = self.transformer.width ** -0.5
+        fc_std = (2 * self.transformer.width) ** -0.5
+        for block in self.transformer.resblocks:
+            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
+            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
+            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
+            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
+        if self.text_projection is not None:
+            nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
+    def build_attention_mask(self):
+        # lazily create causal attention mask, with full attention between the vision tokens
+        # pytorch uses additive attention mask; fill with -inf
+        mask = torch.empty(self.context_length, self.context_length)
+        mask.fill_(float("-inf"))
+        mask.triu_(1)  # zero out the lower diagonal
+        return mask
+    @property
+    def dtype(self):
+        return self.visual.conv1.weight.dtype
+    def encode_image(self, image):
+        return self.visual(image.type(self.dtype))
+    def encode_text(self, text):
+        x = self.token_embedding(text).type(self.dtype)  # [batch_size, n_ctx, d_model]
+        x = x + self.positional_embedding.type(self.dtype)
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.ln_final(x).type(self.dtype)
+        # x.shape = [batch_size, n_ctx, transformer.width]
+        # take features from the eot embedding (eot_token is the highest number in each sequence)
+        # x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
+        return x
+    def forward(self, image, text):
+        image_features = self.encode_image(image)
+        text_features = self.encode_text(text)
+        # normalized features
+        image_features = image_features / image_features.norm(dim=1, keepdim=True)
+        text_features = text_features / text_features.norm(dim=1, keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ text_features.t()
+        logits_per_text = logits_per_image.t()
+        # shape = [global_batch_size, global_batch_size]
+        return logits_per_image, logits_per_text
+def convert_weights(model: nn.Module):
+    """Convert applicable model parameters to fp16"""
+    def _convert_weights_to_fp16(l):
+        if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
+            l.weight.data = l.weight.data.half()
+            if l.bias is not None:
+                l.bias.data = l.bias.data.half()
+        if isinstance(l, nn.MultiheadAttention):
+            for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
+                tensor = getattr(l, attr)
+                if tensor is not None:
+                    tensor.data = tensor.data.half()
+        for name in ["text_projection", "proj"]:
+            if hasattr(l, name):
+                attr = getattr(l, name)
+                if attr is not None:
+                    attr.data = attr.data.half()
+    model.apply(_convert_weights_to_fp16)
+def build_model(state_dict: dict):
+    vit = "visual.proj" in state_dict
+    if vit:
+        vision_width = state_dict["visual.conv1.weight"].shape[0]
+        vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
+        vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
+        grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
+        image_resolution = vision_patch_size * grid_size
+    else:
+        counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
+        vision_layers = tuple(counts)
+        vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
+        output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
+        vision_patch_size = None
+        assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
+        image_resolution = output_width * 32
+    embed_dim = state_dict["text_projection"].shape[1]
+    context_length = state_dict["positional_embedding"].shape[0]
+    vocab_size = state_dict["token_embedding.weight"].shape[0]
+    transformer_width = state_dict["ln_final.weight"].shape[0]
+    transformer_heads = transformer_width // 64
+    transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks")))
+    model = CLIP(
+        embed_dim,
+        image_resolution, vision_layers, vision_width, vision_patch_size,
+        context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
+    )
+    for key in ["input_resolution", "context_length", "vocab_size"]:
+        if key in state_dict:
+            del state_dict[key]
+    convert_weights(model)
+    model.load_state_dict(state_dict)
+    return model.eval()

isegm/model/modeling/clip/simple_tokenizer.py ADDED Viewed

	@@ -0,0 +1,132 @@

+import gzip
+import html
+import os
+from functools import lru_cache
+import ftfy
+import regex as re
+@lru_cache()
+def default_bpe():
+    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
+@lru_cache()
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a signficant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+def get_pairs(word):
+    """Return set of symbol pairs in a word.
+    Word is represented as tuple of symbols (symbols being variable-length strings).
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+def basic_clean(text):
+    text = ftfy.fix_text(text)
+    text = html.unescape(html.unescape(text))
+    return text.strip()
+def whitespace_clean(text):
+    text = re.sub(r'\s+', ' ', text)
+    text = text.strip()
+    return text
+class SimpleTokenizer(object):
+    def __init__(self, bpe_path: str = default_bpe()):
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
+        merges = merges[1:49152-256-2+1]
+        merges = [tuple(merge.split()) for merge in merges]
+        vocab = list(bytes_to_unicode().values())
+        vocab = vocab + [v+'</w>' for v in vocab]
+        for merge in merges:
+            vocab.append(''.join(merge))
+        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
+        self.encoder = dict(zip(vocab, range(len(vocab))))
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
+        self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token[:-1]) + ( token[-1] + '</w>',)
+        pairs = get_pairs(word)
+        if not pairs:
+            return token+'</w>'
+        while True:
+            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except:
+                    new_word.extend(word[i:])
+                    break
+                if word[i] == first and i < len(word)-1 and word[i+1] == second:
+                    new_word.append(first+second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        self.cache[token] = word
+        return word
+    def encode(self, text):
+        bpe_tokens = []
+        text = whitespace_clean(basic_clean(text)).lower()
+        for token in re.findall(self.pat, text):
+            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
+            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
+        return bpe_tokens
+    def decode(self, tokens):
+        text = ''.join([self.decoder[token] for token in tokens])
+        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
+        return text

isegm/model/modeling/clip_text_encoding.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import torch
+from torch import nn
+from .clip import clip
+class ClipTextEncoder(nn.Module):
+    def __init__(self, clip_enocder_name="ViT-B/32", embedding_dim=512, out_dim=768):
+        super().__init__()
+        assert clip_enocder_name in ['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.model, self.preprocess = clip.load(clip_enocder_name, device=self.device)
+        # freeze model
+        for _, param in self.model.named_parameters():
+            param.requires_grad = False
+        self.out_proj = nn.Linear(embedding_dim, out_dim)
+        nn.init.zeros_(self.out_proj.bias)
+    @torch.no_grad()
+    def forward(self, prompt):
+        '''
+        prompt: text tokens
+        '''
+        text_features = self.model.encode_text(prompt).type(torch.float32)
+        # norm
+        # text_features /= text_features.norm(dim=-1, keepdim=True)  # [bs, 1024]
+        # proj
+        text_features = self.out_proj(text_features)
+        return text_features

isegm/model/modeling/deeplab_v3.py ADDED Viewed

	@@ -0,0 +1,176 @@

+from contextlib import ExitStack
+import torch
+from torch import nn
+import torch.nn.functional as F
+from .basic_blocks import SeparableConv2d
+from .resnet import ResNetBackbone
+from isegm.model import ops
+class DeepLabV3Plus(nn.Module):
+    def __init__(self, backbone='resnet50', norm_layer=nn.BatchNorm2d,
+                 backbone_norm_layer=None,
+                 ch=256,
+                 project_dropout=0.5,
+                 inference_mode=False,
+                 **kwargs):
+        super(DeepLabV3Plus, self).__init__()
+        if backbone_norm_layer is None:
+            backbone_norm_layer = norm_layer
+        self.backbone_name = backbone
+        self.norm_layer = norm_layer
+        self.backbone_norm_layer = backbone_norm_layer
+        self.inference_mode = False
+        self.ch = ch
+        self.aspp_in_channels = 2048
+        self.skip_project_in_channels = 256  # layer 1 out_channels
+        self._kwargs = kwargs
+        if backbone == 'resnet34':
+            self.aspp_in_channels = 512
+            self.skip_project_in_channels = 64
+        self.backbone = ResNetBackbone(backbone=self.backbone_name, pretrained_base=False,
+                                       norm_layer=self.backbone_norm_layer, **kwargs)
+        self.head = _DeepLabHead(in_channels=ch + 32, mid_channels=ch, out_channels=ch,
+                                 norm_layer=self.norm_layer)
+        self.skip_project = _SkipProject(self.skip_project_in_channels, 32, norm_layer=self.norm_layer)
+        self.aspp = _ASPP(in_channels=self.aspp_in_channels,
+                          atrous_rates=[12, 24, 36],
+                          out_channels=ch,
+                          project_dropout=project_dropout,
+                          norm_layer=self.norm_layer)
+        if inference_mode:
+            self.set_prediction_mode()
+    def load_pretrained_weights(self):
+        pretrained = ResNetBackbone(backbone=self.backbone_name, pretrained_base=True,
+                                    norm_layer=self.backbone_norm_layer, **self._kwargs)
+        backbone_state_dict = self.backbone.state_dict()
+        pretrained_state_dict = pretrained.state_dict()
+        backbone_state_dict.update(pretrained_state_dict)
+        self.backbone.load_state_dict(backbone_state_dict)
+        if self.inference_mode:
+            for param in self.backbone.parameters():
+                param.requires_grad = False
+    def set_prediction_mode(self):
+        self.inference_mode = True
+        self.eval()
+    def forward(self, x, additional_features=None):
+        with ExitStack() as stack:
+            if self.inference_mode:
+                stack.enter_context(torch.no_grad())
+            c1, _, c3, c4 = self.backbone(x, additional_features)
+            c1 = self.skip_project(c1)
+            x = self.aspp(c4)
+            x = F.interpolate(x, c1.size()[2:], mode='bilinear', align_corners=True)
+            x = torch.cat((x, c1), dim=1)
+            x = self.head(x)
+        return x,
+class _SkipProject(nn.Module):
+    def __init__(self, in_channels, out_channels, norm_layer=nn.BatchNorm2d):
+        super(_SkipProject, self).__init__()
+        _activation = ops.select_activation_function("relu")
+        self.skip_project = nn.Sequential(
+            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
+            norm_layer(out_channels),
+            _activation()
+        )
+    def forward(self, x):
+        return self.skip_project(x)
+class _DeepLabHead(nn.Module):
+    def __init__(self, out_channels, in_channels, mid_channels=256, norm_layer=nn.BatchNorm2d):
+        super(_DeepLabHead, self).__init__()
+        self.block = nn.Sequential(
+            SeparableConv2d(in_channels=in_channels, out_channels=mid_channels, dw_kernel=3,
+                            dw_padding=1, activation='relu', norm_layer=norm_layer),
+            SeparableConv2d(in_channels=mid_channels, out_channels=mid_channels, dw_kernel=3,
+                            dw_padding=1, activation='relu', norm_layer=norm_layer),
+            nn.Conv2d(in_channels=mid_channels, out_channels=out_channels, kernel_size=1)
+        )
+    def forward(self, x):
+        return self.block(x)
+class _ASPP(nn.Module):
+    def __init__(self, in_channels, atrous_rates, out_channels=256,
+                 project_dropout=0.5, norm_layer=nn.BatchNorm2d):
+        super(_ASPP, self).__init__()
+        b0 = nn.Sequential(
+            nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=False),
+            norm_layer(out_channels),
+            nn.ReLU()
+        )
+        rate1, rate2, rate3 = tuple(atrous_rates)
+        b1 = _ASPPConv(in_channels, out_channels, rate1, norm_layer)
+        b2 = _ASPPConv(in_channels, out_channels, rate2, norm_layer)
+        b3 = _ASPPConv(in_channels, out_channels, rate3, norm_layer)
+        b4 = _AsppPooling(in_channels, out_channels, norm_layer=norm_layer)
+        self.concurent = nn.ModuleList([b0, b1, b2, b3, b4])
+        project = [
+            nn.Conv2d(in_channels=5*out_channels, out_channels=out_channels,
+                      kernel_size=1, bias=False),
+            norm_layer(out_channels),
+            nn.ReLU()
+        ]
+        if project_dropout > 0:
+            project.append(nn.Dropout(project_dropout))
+        self.project = nn.Sequential(*project)
+    def forward(self, x):
+        x = torch.cat([block(x) for block in self.concurent], dim=1)
+        return self.project(x)
+class _AsppPooling(nn.Module):
+    def __init__(self, in_channels, out_channels, norm_layer):
+        super(_AsppPooling, self).__init__()
+        self.gap = nn.Sequential(
+            nn.AdaptiveAvgPool2d((1, 1)),
+            nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
+                      kernel_size=1, bias=False),
+            norm_layer(out_channels),
+            nn.ReLU()
+        )
+    def forward(self, x):
+        pool = self.gap(x)
+        return F.interpolate(pool, x.size()[2:], mode='bilinear', align_corners=True)
+def _ASPPConv(in_channels, out_channels, atrous_rate, norm_layer):
+    block = nn.Sequential(
+        nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
+                  kernel_size=3, padding=atrous_rate,
+                  dilation=atrous_rate, bias=False),
+        norm_layer(out_channels),
+        nn.ReLU()
+    )
+    return block

isegm/model/modeling/hrformer.py ADDED Viewed

	@@ -0,0 +1,487 @@

+##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+## Created by: RainbowSecret
+## Microsoft Research
+## [email protected], [email protected]
+## Copyright (c) 2021
+##
+## This source code is licensed under the MIT-style license found in the
+## LICENSE file in the root directory of this source tree
+##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+import os
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# from .hrformer_helper.backbone_selector import BackboneSelector
+from .hrformer_helper.hrt.module_helper import ModuleHelper
+from .hrformer_helper.hrt.modules.spatial_ocr_block import SpatialGather_Module, SpatialOCR_Module
+from .hrformer_helper.hrt.logger import Logger as Log
+from .hrformer_helper.hrt.hrt_backbone import HRTBackbone, HRTBackbone_v2
+class BackboneSelector(object):
+    def __init__(self, configer):
+        self.configer = configer
+    def get_backbone(self, **params):
+        backbone = self.configer.get("network", "backbone")
+        model = None
+        # if (
+        #     "resnet" in backbone or "resnext" in backbone or "resnest" in backbone
+        # ) and "senet" not in backbone:
+        #     model = ResNetBackbone(self.configer)(**params)
+        if "hrt" in backbone:
+            model = HRTBackbone(self.configer)(**params)
+            pass
+        # elif "hrnet" in backbone:
+        #     model = HRNetBackbone(self.configer)(**params)
+        # elif "swin" in backbone:
+        #     model = SwinTransformerBackbone(self.configer)(**params)
+        else:
+            Log.error("Backbone {} is invalid.".format(backbone))
+            exit(1)
+        return model
+class HRT_B_OCR_V3(nn.Module):
+    def __init__(self, num_classes, in_ch=3, backbone='hrt_base', bn_type="torchbn", pretrained=None):
+        super(HRT_B_OCR_V3, self).__init__()
+        self.num_classes = num_classes
+        self.bn_type = bn_type
+        self.backbone = HRTBackbone_v2(backbone, pretrained, in_ch)()
+        in_channels = 1170
+        hidden_dim = 512
+        group_channel = math.gcd(in_channels, hidden_dim)
+        self.conv3x3 = nn.Sequential(
+            nn.Conv2d(
+                in_channels,
+                hidden_dim,
+                kernel_size=7,
+                stride=1,
+                padding=3,
+                groups=group_channel,
+            ),
+            ModuleHelper.BNReLU(
+                hidden_dim, bn_type=self.bn_type
+            ),
+        )
+        self.ocr_gather_head = SpatialGather_Module(self.num_classes)
+        self.ocr_distri_head = SpatialOCR_Module(
+            in_channels=hidden_dim,
+            key_channels=hidden_dim // 2,
+            out_channels=hidden_dim,
+            scale=1,
+            dropout=0.05,
+            bn_type=self.bn_type,
+        )
+        self.cls_head = nn.Conv2d(
+            hidden_dim, self.num_classes, kernel_size=1, stride=1, padding=0, bias=True
+        )
+        self.aux_head = nn.Sequential(
+            nn.Conv2d(
+                in_channels,
+                hidden_dim,
+                kernel_size=7,
+                stride=1,
+                padding=3,
+                groups=group_channel,
+            ),
+            ModuleHelper.BNReLU(
+                hidden_dim, bn_type=self.bn_type
+            ),
+            nn.Conv2d(
+                hidden_dim,
+                self.num_classes,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                bias=True,
+            ),
+        )
+    def forward(self, x_):
+        x = self.backbone(x_)
+        _, _, h, w = x[0].size()
+        feat1 = x[0]
+        feat2 = F.interpolate(x[1], size=(h, w), mode="bilinear", align_corners=True)
+        feat3 = F.interpolate(x[2], size=(h, w), mode="bilinear", align_corners=True)
+        feat4 = F.interpolate(x[3], size=(h, w), mode="bilinear", align_corners=True)
+        feats = torch.cat([feat1, feat2, feat3, feat4], 1)
+        out_aux = self.aux_head(feats)
+        feats = self.conv3x3(feats)
+        context = self.ocr_gather_head(feats, out_aux)
+        feats = self.ocr_distri_head(feats, context)
+        out = self.cls_head(feats)
+        out_aux = F.interpolate(
+            out_aux, size=(x_.size(2), x_.size(3)), mode="bilinear", align_corners=True
+        )
+        out = F.interpolate(
+            out, size=(x_.size(2), x_.size(3)), mode="bilinear", align_corners=True
+        )
+        return out_aux, out
+class HRT_S_OCR_V2(nn.Module):
+    def __init__(self, num_classes, backbone='hrt_small', bn_type="torchbn", pretrained=None):
+        super(HRT_S_OCR_V2, self).__init__()
+        self.num_classes = num_classes
+        self.bn_type = bn_type
+        self.backbone = HRTBackbone_v2(backbone, pretrained)()
+        in_channels = 480
+        self.conv3x3 = nn.Sequential(
+            nn.Conv2d(in_channels, 512, kernel_size=3, stride=1, padding=1),
+            ModuleHelper.BNReLU(512, bn_type=self.bn_type),
+        )
+        self.ocr_gather_head = SpatialGather_Module(self.num_classes)
+        self.ocr_distri_head = SpatialOCR_Module(
+            in_channels=512,
+            key_channels=256,
+            out_channels=512,
+            scale=1,
+            dropout=0.05,
+            bn_type=self.bn_type,
+        )
+        self.cls_head = nn.Conv2d(
+            512, self.num_classes, kernel_size=1, stride=1, padding=0, bias=True
+        )
+        self.aux_head = nn.Sequential(
+            nn.Conv2d(in_channels, 512, kernel_size=3, stride=1, padding=1),
+            ModuleHelper.BNReLU(512, bn_type=self.bn_type),
+            nn.Conv2d(
+                512, self.num_classes, kernel_size=1, stride=1, padding=0, bias=True
+            ),
+        )
+    def forward(self, x_):
+        x = self.backbone(x_)
+        _, _, h, w = x[0].size()
+        feat1 = x[0]
+        feat2 = F.interpolate(x[1], size=(h, w), mode="bilinear", align_corners=True)
+        feat3 = F.interpolate(x[2], size=(h, w), mode="bilinear", align_corners=True)
+        feat4 = F.interpolate(x[3], size=(h, w), mode="bilinear", align_corners=True)
+        feats = torch.cat([feat1, feat2, feat3, feat4], 1)
+        out_aux = self.aux_head(feats)
+        feats = self.conv3x3(feats)
+        context = self.ocr_gather_head(feats, out_aux)
+        feats = self.ocr_distri_head(feats, context)
+        out = self.cls_head(feats)
+        out_aux = F.interpolate(
+            out_aux, size=(x_.size(2), x_.size(3)), mode="bilinear", align_corners=True
+        )
+        out = F.interpolate(
+            out, size=(x_.size(2), x_.size(3)), mode="bilinear", align_corners=True
+        )
+        return out_aux, out
+class HRT_SMALL_OCR_V2(nn.Module):
+    def __init__(self, configer):
+        super(HRT_SMALL_OCR_V2, self).__init__()
+        self.configer = configer
+        self.num_classes = self.configer.get("data", "num_classes")
+        self.backbone = BackboneSelector(configer).get_backbone()
+        in_channels = 480
+        self.conv3x3 = nn.Sequential(
+            nn.Conv2d(in_channels, 512, kernel_size=3, stride=1, padding=1),
+            ModuleHelper.BNReLU(512, bn_type=self.configer.get("network", "bn_type")),
+        )
+        self.ocr_gather_head = SpatialGather_Module(self.num_classes)
+        self.ocr_distri_head = SpatialOCR_Module(
+            in_channels=512,
+            key_channels=256,
+            out_channels=512,
+            scale=1,
+            dropout=0.05,
+            bn_type=self.configer.get("network", "bn_type"),
+        )
+        self.cls_head = nn.Conv2d(
+            512, self.num_classes, kernel_size=1, stride=1, padding=0, bias=True
+        )
+        self.aux_head = nn.Sequential(
+            nn.Conv2d(in_channels, 512, kernel_size=3, stride=1, padding=1),
+            ModuleHelper.BNReLU(512, bn_type=self.configer.get("network", "bn_type")),
+            nn.Conv2d(
+                512, self.num_classes, kernel_size=1, stride=1, padding=0, bias=True
+            ),
+        )
+    def forward(self, x_):
+        x = self.backbone(x_)
+        _, _, h, w = x[0].size()
+        feat1 = x[0]
+        feat2 = F.interpolate(x[1], size=(h, w), mode="bilinear", align_corners=True)
+        feat3 = F.interpolate(x[2], size=(h, w), mode="bilinear", align_corners=True)
+        feat4 = F.interpolate(x[3], size=(h, w), mode="bilinear", align_corners=True)
+        feats = torch.cat([feat1, feat2, feat3, feat4], 1)
+        out_aux = self.aux_head(feats)
+        feats = self.conv3x3(feats)
+        context = self.ocr_gather_head(feats, out_aux)
+        feats = self.ocr_distri_head(feats, context)
+        out = self.cls_head(feats)
+        out_aux = F.interpolate(
+            out_aux, size=(x_.size(2), x_.size(3)), mode="bilinear", align_corners=True
+        )
+        out = F.interpolate(
+            out, size=(x_.size(2), x_.size(3)), mode="bilinear", align_corners=True
+        )
+        return out_aux, out
+class HRT_BASE_OCR_V2(nn.Module):
+    def __init__(self, configer):
+        super(HRT_BASE_OCR_V2, self).__init__()
+        self.configer = configer
+        self.num_classes = self.configer.get("data", "num_classes")
+        self.backbone = BackboneSelector(configer).get_backbone()
+        in_channels = 1170
+        self.conv3x3 = nn.Sequential(
+            nn.Conv2d(in_channels, 512, kernel_size=3, stride=1, padding=1),
+            ModuleHelper.BNReLU(512, bn_type=self.configer.get("network", "bn_type")),
+        )
+        self.ocr_gather_head = SpatialGather_Module(self.num_classes)
+        self.ocr_distri_head = SpatialOCR_Module(
+            in_channels=512,
+            key_channels=256,
+            out_channels=512,
+            scale=1,
+            dropout=0.05,
+            bn_type=self.configer.get("network", "bn_type"),
+        )
+        self.cls_head = nn.Conv2d(
+            512, self.num_classes, kernel_size=1, stride=1, padding=0, bias=True
+        )
+        self.aux_head = nn.Sequential(
+            nn.Conv2d(in_channels, 512, kernel_size=3, stride=1, padding=1),
+            ModuleHelper.BNReLU(512, bn_type=self.configer.get("network", "bn_type")),
+            nn.Conv2d(
+                512, self.num_classes, kernel_size=1, stride=1, padding=0, bias=True
+            ),
+        )
+    def forward(self, x_):
+        x = self.backbone(x_)
+        _, _, h, w = x[0].size()
+        feat1 = x[0]
+        feat2 = F.interpolate(x[1], size=(h, w), mode="bilinear", align_corners=True)
+        feat3 = F.interpolate(x[2], size=(h, w), mode="bilinear", align_corners=True)
+        feat4 = F.interpolate(x[3], size=(h, w), mode="bilinear", align_corners=True)
+        feats = torch.cat([feat1, feat2, feat3, feat4], 1)
+        out_aux = self.aux_head(feats)
+        feats = self.conv3x3(feats)
+        context = self.ocr_gather_head(feats, out_aux)
+        feats = self.ocr_distri_head(feats, context)
+        out = self.cls_head(feats)
+        out_aux = F.interpolate(
+            out_aux, size=(x_.size(2), x_.size(3)), mode="bilinear", align_corners=True
+        )
+        out = F.interpolate(
+            out, size=(x_.size(2), x_.size(3)), mode="bilinear", align_corners=True
+        )
+        return out_aux, out
+class HRT_SMALL_OCR_V3(nn.Module):
+    def __init__(self, configer):
+        super(HRT_SMALL_OCR_V3, self).__init__()
+        self.configer = configer
+        self.num_classes = self.configer.get("data", "num_classes")
+        self.backbone = BackboneSelector(configer).get_backbone()
+        in_channels = 480
+        hidden_dim = 512
+        group_channel = math.gcd(in_channels, hidden_dim)
+        self.conv3x3 = nn.Sequential(
+            nn.Conv2d(
+                in_channels,
+                hidden_dim,
+                kernel_size=7,
+                stride=1,
+                padding=3,
+                groups=group_channel,
+            ),
+            ModuleHelper.BNReLU(
+                hidden_dim, bn_type=self.configer.get("network", "bn_type")
+            ),
+        )
+        self.ocr_gather_head = SpatialGather_Module(self.num_classes)
+        self.ocr_distri_head = SpatialOCR_Module(
+            in_channels=hidden_dim,
+            key_channels=hidden_dim // 2,
+            out_channels=hidden_dim,
+            scale=1,
+            dropout=0.05,
+            bn_type=self.configer.get("network", "bn_type"),
+        )
+        self.cls_head = nn.Conv2d(
+            hidden_dim, self.num_classes, kernel_size=1, stride=1, padding=0, bias=True
+        )
+        self.aux_head = nn.Sequential(
+            nn.Conv2d(
+                in_channels,
+                hidden_dim,
+                kernel_size=7,
+                stride=1,
+                padding=3,
+                groups=group_channel,
+            ),
+            ModuleHelper.BNReLU(
+                hidden_dim, bn_type=self.configer.get("network", "bn_type")
+            ),
+            nn.Conv2d(
+                hidden_dim,
+                self.num_classes,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                bias=True,
+            ),
+        )
+    def forward(self, x_):
+        x = self.backbone(x_)
+        _, _, h, w = x[0].size()
+        feat1 = x[0]
+        feat2 = F.interpolate(x[1], size=(h, w), mode="bilinear", align_corners=True)
+        feat3 = F.interpolate(x[2], size=(h, w), mode="bilinear", align_corners=True)
+        feat4 = F.interpolate(x[3], size=(h, w), mode="bilinear", align_corners=True)
+        feats = torch.cat([feat1, feat2, feat3, feat4], 1)
+        out_aux = self.aux_head(feats)
+        feats = self.conv3x3(feats)
+        context = self.ocr_gather_head(feats, out_aux)
+        feats = self.ocr_distri_head(feats, context)
+        out = self.cls_head(feats)
+        out_aux = F.interpolate(
+            out_aux, size=(x_.size(2), x_.size(3)), mode="bilinear", align_corners=True
+        )
+        out = F.interpolate(
+            out, size=(x_.size(2), x_.size(3)), mode="bilinear", align_corners=True
+        )
+        return out_aux, out
+class HRT_BASE_OCR_V3(nn.Module):
+    def __init__(self, configer):
+        super(HRT_BASE_OCR_V3, self).__init__()
+        self.configer = configer
+        self.num_classes = self.configer.get("data", "num_classes")
+        self.backbone = BackboneSelector(configer).get_backbone()
+        in_channels = 1170
+        hidden_dim = 512
+        group_channel = math.gcd(in_channels, hidden_dim)
+        self.conv3x3 = nn.Sequential(
+            nn.Conv2d(
+                in_channels,
+                hidden_dim,
+                kernel_size=7,
+                stride=1,
+                padding=3,
+                groups=group_channel,
+            ),
+            ModuleHelper.BNReLU(
+                hidden_dim, bn_type=self.configer.get("network", "bn_type")
+            ),
+        )
+        self.ocr_gather_head = SpatialGather_Module(self.num_classes)
+        self.ocr_distri_head = SpatialOCR_Module(
+            in_channels=hidden_dim,
+            key_channels=hidden_dim // 2,
+            out_channels=hidden_dim,
+            scale=1,
+            dropout=0.05,
+            bn_type=self.configer.get("network", "bn_type"),
+        )
+        self.cls_head = nn.Conv2d(
+            hidden_dim, self.num_classes, kernel_size=1, stride=1, padding=0, bias=True
+        )
+        self.aux_head = nn.Sequential(
+            nn.Conv2d(
+                in_channels,
+                hidden_dim,
+                kernel_size=7,
+                stride=1,
+                padding=3,
+                groups=group_channel,
+            ),
+            ModuleHelper.BNReLU(
+                hidden_dim, bn_type=self.configer.get("network", "bn_type")
+            ),
+            nn.Conv2d(
+                hidden_dim,
+                self.num_classes,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                bias=True,
+            ),
+        )
+    def forward(self, x_):
+        x = self.backbone(x_)
+        _, _, h, w = x[0].size()
+        feat1 = x[0]
+        feat2 = F.interpolate(x[1], size=(h, w), mode="bilinear", align_corners=True)
+        feat3 = F.interpolate(x[2], size=(h, w), mode="bilinear", align_corners=True)
+        feat4 = F.interpolate(x[3], size=(h, w), mode="bilinear", align_corners=True)
+        feats = torch.cat([feat1, feat2, feat3, feat4], 1)
+        out_aux = self.aux_head(feats)
+        feats = self.conv3x3(feats)
+        context = self.ocr_gather_head(feats, out_aux)
+        feats = self.ocr_distri_head(feats, context)
+        out = self.cls_head(feats)
+        out_aux = F.interpolate(
+            out_aux, size=(x_.size(2), x_.size(3)), mode="bilinear", align_corners=True
+        )
+        out = F.interpolate(
+            out, size=(x_.size(2), x_.size(3)), mode="bilinear", align_corners=True
+        )
+        return out_aux, out

isegm/model/modeling/hrformer_helper/__init__.py ADDED Viewed

File without changes

isegm/model/modeling/hrformer_helper/backbone_selector.py ADDED Viewed

	@@ -0,0 +1,54 @@

+##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+## Created by: Donny You, RainbowSecret
+## Microsoft Research
+## [email protected]
+## Copyright (c) 2019
+##
+## This source code is licensed under the MIT-style license found in the
+## LICENSE file in the root directory of this source tree
+##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+# from lib.models.backbones.resnet.resnet_backbone import ResNetBackbone
+# from lib.models.backbones.hrnet.hrnet_backbone import HRNetBackbone
+from .hrt.hrt_backbone import HRTBackbone
+# from lib.models.backbones.swin.swin_backbone import SwinTransformerBackbone
+from .hrt.logger import Logger as Log
+class BackboneSelector(object):
+    def __init__(self, configer):
+        self.configer = configer
+    def get_backbone(self, **params):
+        backbone = self.configer.get("network", "backbone")
+        model = None
+        # if (
+        #     "resnet" in backbone or "resnext" in backbone or "resnest" in backbone
+        # ) and "senet" not in backbone:
+        #     model = ResNetBackbone(self.configer)(**params)
+        if "hrt" in backbone:
+            # model = HRTBackbone(self.configer)(**params)
+            pass
+        # elif "hrnet" in backbone:
+        #     model = HRNetBackbone(self.configer)(**params)
+        # elif "swin" in backbone:
+        #     model = SwinTransformerBackbone(self.configer)(**params)
+        else:
+            Log.error("Backbone {} is invalid.".format(backbone))
+            exit(1)
+        return model
+class Test():
+    def __init__():
+        pass

isegm/model/modeling/hrformer_helper/hrt/__init__.py ADDED Viewed

File without changes

isegm/model/modeling/hrformer_helper/hrt/hrt_backbone.py ADDED Viewed

	@@ -0,0 +1,661 @@

+import os
+import pdb
+import argparse
+import torch
+import logging
+import torch.nn as nn
+import torch.nn.functional as F
+from .modules.bottleneck_block import Bottleneck, BottleneckDWP
+from .modules.transformer_block import GeneralTransformerBlock
+from .module_helper import ModuleHelper
+from .logger import Logger as Log
+# Maps the "BLOCK" name found in a stage config to the block class to build.
+blocks_dict = {
+    "BOTTLENECK": Bottleneck,
+    "TRANSFORMER_BLOCK": GeneralTransformerBlock,
+}
+# Momentum passed to the SyncBatchNorm layers created in this module.
+BN_MOMENTUM = 0.1
+class HighResolutionTransformerModule(nn.Module):
+    def __init__(
+        self,
+        num_branches,
+        blocks,
+        num_blocks,
+        num_inchannels,
+        num_channels,
+        num_heads,
+        num_window_sizes,
+        num_mlp_ratios,
+        multi_scale_output=True,
+        drop_path=0.0,
+    ):
+        """Based on Local-Attention & FFN-DW-BN
+        num_heads: the number of head witin each MHSA
+        num_window_sizes: the window size for the local self-attention
+        num_halo_sizes: the halo size around the local window
+            - reference: ``Scaling Local Self-Attention for Parameter Efficient Visual Backbones''
+        num_sr_ratios: the spatial reduction ratios of PVT/SRA scheme.
+            - reference: ``Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions''
+        """
+        super(HighResolutionTransformerModule, self).__init__()
+        self._check_branches(
+            num_branches, blocks, num_blocks, num_inchannels, num_channels
+        )
+        self.num_inchannels = num_inchannels
+        self.num_branches = num_branches
+        self.multi_scale_output = multi_scale_output
+        self.branches = self._make_branches(
+            num_branches,
+            blocks,
+            num_blocks,
+            num_channels,
+            num_heads,
+            num_window_sizes,
+            num_mlp_ratios,
+            drop_path,
+        )
+        self.fuse_layers = self._make_fuse_layers()
+        self.relu = nn.ReLU(inplace=True)
+        self.num_heads = num_heads
+        self.num_window_sizes = num_window_sizes
+        self.num_mlp_ratios = num_mlp_ratios
+    def _check_branches(
+        self, num_branches, blocks, num_blocks, num_inchannels, num_channels
+    ):
+        if num_branches != len(num_blocks):
+            error_msg = "NUM_BRANCHES({}) <> NUM_BLOCKS({})".format(
+                num_branches, len(num_blocks)
+            )
+            Log.error(error_msg)
+            raise ValueError(error_msg)
+        if num_branches != len(num_channels):
+            error_msg = "NUM_BRANCHES({}) <> NUM_CHANNELS({})".format(
+                num_branches, len(num_channels)
+            )
+            Log.error(error_msg)
+            raise ValueError(error_msg)
+        if num_branches != len(num_inchannels):
+            error_msg = "NUM_BRANCHES({}) <> NUM_INCHANNELS({})".format(
+                num_branches, len(num_inchannels)
+            )
+            Log.error(error_msg)
+            raise ValueError(error_msg)
+    def _make_one_branch(
+        self,
+        branch_index,
+        block,
+        num_blocks,
+        num_channels,
+        num_heads,
+        num_window_sizes,
+        num_mlp_ratios,
+        drop_paths,
+        stride=1,
+    ):
+        downsample = None
+        if (
+            stride != 1
+            or self.num_inchannels[branch_index]
+            != num_channels[branch_index] * block.expansion
+        ):
+            downsample = nn.Sequential(
+                nn.Conv2d(
+                    self.num_inchannels[branch_index],
+                    num_channels[branch_index] * block.expansion,
+                    kernel_size=1,
+                    stride=stride,
+                    bias=False,
+                ),
+                nn.SyncBatchNorm(
+                    num_channels[branch_index] * block.expansion, momentum=BN_MOMENTUM
+                ),
+            )
+        layers = []
+        layers.append(
+            block(
+                self.num_inchannels[branch_index],
+                num_channels[branch_index],
+                num_heads=num_heads[branch_index],
+                window_size=num_window_sizes[branch_index],
+                mlp_ratio=num_mlp_ratios[branch_index],
+                drop_path=drop_paths[0],
+            )
+        )
+        self.num_inchannels[branch_index] = num_channels[branch_index] * block.expansion
+        for i in range(1, num_blocks[branch_index]):
+            layers.append(
+                block(
+                    self.num_inchannels[branch_index],
+                    num_channels[branch_index],
+                    num_heads=num_heads[branch_index],
+                    window_size=num_window_sizes[branch_index],
+                    mlp_ratio=num_mlp_ratios[branch_index],
+                    drop_path=drop_paths[i],
+                )
+            )
+        return nn.Sequential(*layers)
+    def _make_branches(
+        self,
+        num_branches,
+        block,
+        num_blocks,
+        num_channels,
+        num_heads,
+        num_window_sizes,
+        num_mlp_ratios,
+        drop_paths,
+    ):
+        branches = []
+        for i in range(num_branches):
+            branches.append(
+                self._make_one_branch(
+                    i,
+                    block,
+                    num_blocks,
+                    num_channels,
+                    num_heads,
+                    num_window_sizes,
+                    num_mlp_ratios,
+                    drop_paths=[_ * (2 ** i) for _ in drop_paths]
+                    if os.environ.get("multi_res_drop_path", False)
+                    else drop_paths,
+                )
+            )
+        return nn.ModuleList(branches)
+    def _make_fuse_layers(self):
+        if self.num_branches == 1:
+            return None
+        num_branches = self.num_branches
+        num_inchannels = self.num_inchannels
+        fuse_layers = []
+        for i in range(num_branches if self.multi_scale_output else 1):
+            fuse_layer = []
+            for j in range(num_branches):
+                if j > i:
+                    fuse_layer.append(
+                        nn.Sequential(
+                            nn.Conv2d(
+                                num_inchannels[j],
+                                num_inchannels[i],
+                                kernel_size=1,
+                                stride=1,
+                                bias=False,
+                            ),
+                            nn.SyncBatchNorm(num_inchannels[i], momentum=BN_MOMENTUM),
+                            nn.Upsample(scale_factor=2 ** (j - i), mode="nearest"),
+                        )
+                    )
+                elif j == i:
+                    fuse_layer.append(None)
+                else:
+                    conv3x3s = []
+                    for k in range(i - j):
+                        if k == i - j - 1:
+                            num_outchannels_conv3x3 = num_inchannels[i]
+                            conv3x3s.append(
+                                nn.Sequential(
+                                    nn.Conv2d(
+                                        num_inchannels[j],
+                                        num_inchannels[j],
+                                        kernel_size=3,
+                                        stride=2,
+                                        padding=1,
+                                        groups=num_inchannels[j],
+                                        bias=False,
+                                    ),
+                                    nn.SyncBatchNorm(
+                                        num_inchannels[j], momentum=BN_MOMENTUM
+                                    ),
+                                    nn.Conv2d(
+                                        num_inchannels[j],
+                                        num_outchannels_conv3x3,
+                                        kernel_size=1,
+                                        stride=1,
+                                        bias=False,
+                                    ),
+                                    nn.SyncBatchNorm(
+                                        num_outchannels_conv3x3, momentum=BN_MOMENTUM
+                                    ),
+                                )
+                            )
+                        else:
+                            num_outchannels_conv3x3 = num_inchannels[j]
+                            conv3x3s.append(
+                                nn.Sequential(
+                                    nn.Conv2d(
+                                        num_inchannels[j],
+                                        num_inchannels[j],
+                                        kernel_size=3,
+                                        stride=2,
+                                        padding=1,
+                                        groups=num_inchannels[j],
+                                        bias=False,
+                                    ),
+                                    nn.SyncBatchNorm(
+                                        num_inchannels[j], momentum=BN_MOMENTUM
+                                    ),
+                                    nn.Conv2d(
+                                        num_inchannels[j],
+                                        num_outchannels_conv3x3,
+                                        kernel_size=1,
+                                        stride=1,
+                                        bias=False,
+                                    ),
+                                    nn.SyncBatchNorm(
+                                        num_outchannels_conv3x3, momentum=BN_MOMENTUM
+                                    ),
+                                    nn.ReLU(False),
+                                )
+                            )
+                    fuse_layer.append(nn.Sequential(*conv3x3s))
+            fuse_layers.append(nn.ModuleList(fuse_layer))
+        return nn.ModuleList(fuse_layers)
+    def get_num_inchannels(self):
+        """Return the stored per-branch channel list (the same object, not a copy)."""
+        return self.num_inchannels
+    def forward(self, x):
+        if self.num_branches == 1:
+            return [self.branches[0](x[0])]
+        for i in range(self.num_branches):
+            x[i] = self.branches[i](x[i])
+        x_fuse = []
+        for i in range(len(self.fuse_layers)):
+            y = x[0] if i == 0 else self.fuse_layers[i][0](x[0])
+            for j in range(1, self.num_branches):
+                if i == j:
+                    y = y + x[j]
+                elif j > i:
+                    width_output = x[i].shape[-1]
+                    height_output = x[i].shape[-2]
+                    y = y + F.interpolate(
+                        self.fuse_layers[i][j](x[j]),
+                        size=[height_output, width_output],
+                        mode="bilinear",
+                        align_corners=True,
+                    )
+                else:
+                    y = y + self.fuse_layers[i][j](x[j])
+            x_fuse.append(self.relu(y))
+        return x_fuse
+class HighResolutionTransformer(nn.Module):
+    def __init__(self, cfg, in_ch=3, **kwargs):
+        """Build the stem, stage 1, and the three multi-branch transformer stages.
+        cfg: mapping with "DROP_PATH_RATE" and "STAGE1".."STAGE4" entries; each
+            stage entry provides the keys read below ("NUM_BLOCKS",
+            "NUM_CHANNELS", "BLOCK", ...).
+        in_ch: number of input channels of the first convolution.
+        kwargs: accepted but not used.
+        The ``drop_path_rate`` environment variable, when set, overrides
+        cfg["DROP_PATH_RATE"]; ``keep_imagenet_head``, when set to a non-empty
+        value, additionally builds the head modules from ``_make_head``.
+        """
+        super(HighResolutionTransformer, self).__init__()
+        # Stem: two stride-2 3x3 convolutions (overall stride 4), 64 channels.
+        self.conv1 = nn.Conv2d(in_ch, 64, kernel_size=3, stride=2, padding=1, bias=False)
+        self.bn1 = nn.SyncBatchNorm(64, momentum=BN_MOMENTUM)
+        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, bias=False)
+        self.bn2 = nn.SyncBatchNorm(64, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        # stochastic depth
+        # Depth of a stage = blocks per module (taken from branch 0) * modules.
+        depth_s2 = cfg["STAGE2"]["NUM_BLOCKS"][0] * cfg["STAGE2"]["NUM_MODULES"]
+        depth_s3 = cfg["STAGE3"]["NUM_BLOCKS"][0] * cfg["STAGE3"]["NUM_MODULES"]
+        depth_s4 = cfg["STAGE4"]["NUM_BLOCKS"][0] * cfg["STAGE4"]["NUM_MODULES"]
+        depths = [depth_s2, depth_s3, depth_s4]
+        drop_path_rate = cfg["DROP_PATH_RATE"]
+        if os.environ.get("drop_path_rate") is not None:
+            drop_path_rate = float(os.environ.get("drop_path_rate"))
+        # Rates grow linearly from 0 to drop_path_rate over all blocks of stages 2-4.
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
+        # Stage 1: a single-branch stack built with _make_layer.
+        self.stage1_cfg = cfg["STAGE1"]
+        num_channels = self.stage1_cfg["NUM_CHANNELS"][0]
+        block = blocks_dict[self.stage1_cfg["BLOCK"]]
+        num_blocks = self.stage1_cfg["NUM_BLOCKS"][0]
+        self.layer1 = self._make_layer(block, 64, num_channels, num_blocks)
+        stage1_out_channel = block.expansion * num_channels
+        # Stage 2: transition from the single stage-1 output to the configured branches.
+        self.stage2_cfg = cfg["STAGE2"]
+        num_channels = self.stage2_cfg["NUM_CHANNELS"]
+        block = blocks_dict[self.stage2_cfg["BLOCK"]]
+        num_channels = [
+            num_channels[i] * block.expansion for i in range(len(num_channels))
+        ]
+        self.transition1 = self._make_transition_layer(
+            [stage1_out_channel], num_channels
+        )
+        self.stage2, pre_stage_channels = self._make_stage(
+            self.stage2_cfg, num_channels, drop_path=dpr[0:depth_s2]
+        )
+        # Stage 3: uses the next depth_s3 entries of the drop-path schedule.
+        self.stage3_cfg = cfg["STAGE3"]
+        num_channels = self.stage3_cfg["NUM_CHANNELS"]
+        block = blocks_dict[self.stage3_cfg["BLOCK"]]
+        num_channels = [
+            num_channels[i] * block.expansion for i in range(len(num_channels))
+        ]
+        self.transition2 = self._make_transition_layer(pre_stage_channels, num_channels)
+        self.stage3, pre_stage_channels = self._make_stage(
+            self.stage3_cfg, num_channels, drop_path=dpr[depth_s2 : depth_s2 + depth_s3]
+        )
+        # Stage 4: uses the remaining entries of the drop-path schedule.
+        self.stage4_cfg = cfg["STAGE4"]
+        num_channels = self.stage4_cfg["NUM_CHANNELS"]
+        block = blocks_dict[self.stage4_cfg["BLOCK"]]
+        num_channels = [
+            num_channels[i] * block.expansion for i in range(len(num_channels))
+        ]
+        self.transition3 = self._make_transition_layer(pre_stage_channels, num_channels)
+        self.stage4, pre_stage_channels = self._make_stage(
+            self.stage4_cfg,
+            num_channels,
+            multi_scale_output=True,
+            drop_path=dpr[depth_s2 + depth_s3 :],
+        )
+        # Optional head; forward() checks the same environment variable.
+        if os.environ.get("keep_imagenet_head"):
+            (
+                self.incre_modules,
+                self.downsamp_modules,
+                self.final_layer,
+            ) = self._make_head(pre_stage_channels)
+    def _make_head(self, pre_stage_channels):
+        head_block = BottleneckDWP
+        head_channels = [32, 64, 128, 256]
+        # Increasing the #channels on each resolution
+        # from C, 2C, 4C, 8C to 128, 256, 512, 1024
+        incre_modules = []
+        for i, channels in enumerate(pre_stage_channels):
+            incre_module = self._make_layer(
+                head_block, channels, head_channels[i], 1, stride=1
+            )
+            incre_modules.append(incre_module)
+        incre_modules = nn.ModuleList(incre_modules)
+        # downsampling modules
+        downsamp_modules = []
+        for i in range(len(pre_stage_channels) - 1):
+            in_channels = head_channels[i] * head_block.expansion
+            out_channels = head_channels[i + 1] * head_block.expansion
+            downsamp_module = nn.Sequential(
+                nn.Conv2d(
+                    in_channels,
+                    in_channels,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
+                    groups=in_channels,
+                ),
+                nn.SyncBatchNorm(in_channels, momentum=BN_MOMENTUM),
+                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1),
+                nn.SyncBatchNorm(out_channels, momentum=BN_MOMENTUM),
+                nn.ReLU(inplace=True),
+            )
+            downsamp_modules.append(downsamp_module)
+        downsamp_modules = nn.ModuleList(downsamp_modules)
+        final_layer = nn.Sequential(
+            nn.Conv2d(
+                in_channels=head_channels[3] * head_block.expansion,
+                out_channels=2048,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            ),
+            nn.SyncBatchNorm(2048, momentum=BN_MOMENTUM),
+            nn.ReLU(inplace=True),
+        )
+        return incre_modules, downsamp_modules, final_layer
+    def _make_transition_layer(self, num_channels_pre_layer, num_channels_cur_layer):
+        num_branches_cur = len(num_channels_cur_layer)
+        num_branches_pre = len(num_channels_pre_layer)
+        transition_layers = []
+        for i in range(num_branches_cur):
+            if i < num_branches_pre:
+                if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
+                    transition_layers.append(
+                        nn.Sequential(
+                            nn.Conv2d(
+                                num_channels_pre_layer[i],
+                                num_channels_cur_layer[i],
+                                3,
+                                1,
+                                1,
+                                bias=False,
+                            ),
+                            nn.SyncBatchNorm(
+                                num_channels_cur_layer[i], momentum=BN_MOMENTUM
+                            ),
+                            nn.ReLU(inplace=True),
+                        )
+                    )
+                else:
+                    transition_layers.append(None)
+            else:
+                conv3x3s = []
+                for j in range(i + 1 - num_branches_pre):
+                    inchannels = num_channels_pre_layer[-1]
+                    outchannels = (
+                        num_channels_cur_layer[i]
+                        if j == i - num_branches_pre
+                        else inchannels
+                    )
+                    conv3x3s.append(
+                        nn.Sequential(
+                            nn.Conv2d(inchannels, outchannels, 3, 2, 1, bias=False),
+                            nn.SyncBatchNorm(outchannels, momentum=BN_MOMENTUM),
+                            nn.ReLU(inplace=True),
+                        )
+                    )
+                transition_layers.append(nn.Sequential(*conv3x3s))
+        return nn.ModuleList(transition_layers)
+    def _make_layer(
+        self,
+        block,
+        inplanes,
+        planes,
+        blocks,
+        num_heads=1,
+        stride=1,
+        window_size=7,
+        mlp_ratio=4.0,
+    ):
+        downsample = None
+        if stride != 1 or inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(
+                    inplanes,
+                    planes * block.expansion,
+                    kernel_size=1,
+                    stride=stride,
+                    bias=False,
+                ),
+                nn.SyncBatchNorm(planes * block.expansion, momentum=BN_MOMENTUM),
+            )
+        layers = []
+        if isinstance(block, GeneralTransformerBlock):
+            layers.append(
+                block(
+                    inplanes,
+                    planes,
+                    num_heads,
+                    window_size,
+                    mlp_ratio,
+                )
+            )
+        else:
+            layers.append(block(inplanes, planes, stride, downsample))
+        inplanes = planes * block.expansion
+        for i in range(1, blocks):
+            layers.append(block(inplanes, planes))
+        return nn.Sequential(*layers)
+    def _make_stage(
+        self, layer_config, num_inchannels, multi_scale_output=True, drop_path=0.0
+    ):
+        num_modules = layer_config["NUM_MODULES"]
+        num_branches = layer_config["NUM_BRANCHES"]
+        num_blocks = layer_config["NUM_BLOCKS"]
+        num_channels = layer_config["NUM_CHANNELS"]
+        block = blocks_dict[layer_config["BLOCK"]]
+        num_heads = layer_config["NUM_HEADS"]
+        num_window_sizes = layer_config["NUM_WINDOW_SIZES"]
+        num_mlp_ratios = layer_config["NUM_MLP_RATIOS"]
+        modules = []
+        for i in range(num_modules):
+            # multi_scale_output is only used last module
+            if not multi_scale_output and i == num_modules - 1:
+                reset_multi_scale_output = False
+            else:
+                reset_multi_scale_output = True
+            modules.append(
+                HighResolutionTransformerModule(
+                    num_branches,
+                    block,
+                    num_blocks,
+                    num_inchannels,
+                    num_channels,
+                    num_heads,
+                    num_window_sizes,
+                    num_mlp_ratios,
+                    reset_multi_scale_output,
+                    drop_path=drop_path[num_blocks[0] * i : num_blocks[0] * (i + 1)],
+                )
+            )
+            num_inchannels = modules[-1].get_num_inchannels()
+        return nn.Sequential(*modules), num_inchannels
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = self.relu(x)
+        x = self.layer1(x)
+        x_list = []
+        for i in range(self.stage2_cfg["NUM_BRANCHES"]):
+            if self.transition1[i] is not None:
+                x_list.append(self.transition1[i](x))
+            else:
+                x_list.append(x)
+        y_list = self.stage2(x_list)
+        x_list = []
+        for i in range(self.stage3_cfg["NUM_BRANCHES"]):
+            if self.transition2[i] is not None:
+                x_list.append(self.transition2[i](y_list[-1]))
+            else:
+                x_list.append(y_list[i])
+        y_list = self.stage3(x_list)
+        x_list = []
+        for i in range(self.stage4_cfg["NUM_BRANCHES"]):
+            if self.transition3[i] is not None:
+                x_list.append(self.transition3[i](y_list[-1]))
+            else:
+                x_list.append(y_list[i])
+        y_list = self.stage4(x_list)
+        if os.environ.get("keep_imagenet_head"):
+            x_list = []
+            y = self.incre_modules[0](y_list[0])
+            x_list.append(y)
+            for i in range(len(self.downsamp_modules)):
+                y = self.incre_modules[i + 1](y_list[i + 1]) + self.downsamp_modules[i](
+                    y
+                )
+                x_list.append(y)
+            y = self.final_layer(y)
+            del x_list[-1]
+            x_list.append(y)
+            return x_list
+        else:
+            return y_list
+class HRTBackbone(object):
+    def __init__(self, configer):
+        self.configer = configer
+    def __call__(self):
+        arch = self.configer.get("network", "backbone")
+        from .hrt_config import MODEL_CONFIGS
+        if arch in [
+            "hrt_small",
+            "hrt_base",
+            "hrt_base_win13",
+            "hrt_base_win15",
+        ]:
+            arch_net = HighResolutionTransformer(MODEL_CONFIGS[arch])
+            arch_net = ModuleHelper.load_model(
+                arch_net,
+                pretrained=self.configer.get("network", "pretrained"),
+                all_match=False,
+                network="hrt_window" if "win" in arch else "hrt",
+            )
+        else:
+            raise Exception("Architecture undefined!")
+        return arch_net
+class HRTBackbone_v2(object):
+    def __init__(self, backbone='hrt_small', pretrained=None, in_ch=3):
+        self.backbone = backbone
+        self.pretrained = pretrained
+        self.in_ch = in_ch
+    def __call__(self):
+        from .hrt_config import MODEL_CONFIGS
+        if self.backbone in [
+            "hrt_small",
+            "hrt_base",
+            "hrt_base_win13",
+            "hrt_base_win15",
+        ]:
+            arch_net = HighResolutionTransformer(MODEL_CONFIGS[self.backbone], in_ch=self.in_ch)
+            arch_net = ModuleHelper.load_model(
+                arch_net,
+                pretrained=self.pretrained,
+                all_match=False,
+                network="hrt_window" if "win" in self.backbone else "hrt",
+            )
+        else:
+            raise Exception("ARCHITECTURE UNDEFINED!")
+        return arch_net

isegm/model/modeling/hrformer_helper/hrt/hrt_config.py ADDED Viewed

	@@ -0,0 +1,123 @@

+# ------------------------------------------------------------------------------
+# Copyright (c) Microsoft
+# Licensed under the MIT License.
+# Rainbowsecret ([email protected])
+# ------------------------------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from yacs.config import CfgNode as CN
+# Configuration for the "hrt_small" variant.
+# Stage 1 is a single branch of BOTTLENECK blocks; stages 2-4 each add one
+# branch and use TRANSFORMER_BLOCK. Every per-branch list below has one entry
+# per branch of its stage (NUM_BRANCHES).
+HRT_SMALL = CN()
+HRT_SMALL.DROP_PATH_RATE = 0.2
+# Stage 1: one branch, 64 channels.
+HRT_SMALL.STAGE1 = CN()
+HRT_SMALL.STAGE1.NUM_MODULES = 1
+HRT_SMALL.STAGE1.NUM_BRANCHES = 1
+HRT_SMALL.STAGE1.NUM_BLOCKS = [2]
+HRT_SMALL.STAGE1.NUM_CHANNELS = [64]
+HRT_SMALL.STAGE1.NUM_HEADS = [2]
+HRT_SMALL.STAGE1.NUM_MLP_RATIOS = [4]
+HRT_SMALL.STAGE1.NUM_RESOLUTIONS = [[56, 56]]
+HRT_SMALL.STAGE1.BLOCK = "BOTTLENECK"
+# Stage 2: two branches; channels and heads double from one branch to the next.
+HRT_SMALL.STAGE2 = CN()
+HRT_SMALL.STAGE2.NUM_MODULES = 1
+HRT_SMALL.STAGE2.NUM_BRANCHES = 2
+HRT_SMALL.STAGE2.NUM_BLOCKS = [2, 2]
+HRT_SMALL.STAGE2.NUM_CHANNELS = [32, 64]
+HRT_SMALL.STAGE2.NUM_HEADS = [1, 2]
+HRT_SMALL.STAGE2.NUM_MLP_RATIOS = [4, 4]
+HRT_SMALL.STAGE2.NUM_RESOLUTIONS = [[56, 56], [28, 28]]
+HRT_SMALL.STAGE2.NUM_WINDOW_SIZES = [7, 7]
+HRT_SMALL.STAGE2.BLOCK = "TRANSFORMER_BLOCK"
+# Stage 3: three branches, four modules.
+HRT_SMALL.STAGE3 = CN()
+HRT_SMALL.STAGE3.NUM_MODULES = 4
+HRT_SMALL.STAGE3.NUM_BRANCHES = 3
+HRT_SMALL.STAGE3.NUM_BLOCKS = [2, 2, 2]
+HRT_SMALL.STAGE3.NUM_CHANNELS = [32, 64, 128]
+HRT_SMALL.STAGE3.NUM_HEADS = [1, 2, 4]
+HRT_SMALL.STAGE3.NUM_MLP_RATIOS = [4, 4, 4]
+HRT_SMALL.STAGE3.NUM_RESOLUTIONS = [[56, 56], [28, 28], [14, 14]]
+HRT_SMALL.STAGE3.NUM_WINDOW_SIZES = [7, 7, 7]
+HRT_SMALL.STAGE3.BLOCK = "TRANSFORMER_BLOCK"
+# Stage 4: four branches, two modules.
+HRT_SMALL.STAGE4 = CN()
+HRT_SMALL.STAGE4.NUM_MODULES = 2
+HRT_SMALL.STAGE4.NUM_BRANCHES = 4
+HRT_SMALL.STAGE4.NUM_BLOCKS = [2, 2, 2, 2]
+HRT_SMALL.STAGE4.NUM_CHANNELS = [32, 64, 128, 256]
+HRT_SMALL.STAGE4.NUM_HEADS = [1, 2, 4, 8]
+HRT_SMALL.STAGE4.NUM_MLP_RATIOS = [4, 4, 4, 4]
+HRT_SMALL.STAGE4.NUM_RESOLUTIONS = [[56, 56], [28, 28], [14, 14], [7, 7]]
+HRT_SMALL.STAGE4.NUM_WINDOW_SIZES = [7, 7, 7, 7]
+HRT_SMALL.STAGE4.BLOCK = "TRANSFORMER_BLOCK"
+# Configuration for the "hrt_base" variant.
+# Same stage layout as HRT_SMALL (module, branch and block counts are equal);
+# the transformer stages use wider branches (78 base channels instead of 32)
+# and twice as many attention heads per branch.
+HRT_BASE = CN()
+HRT_BASE.DROP_PATH_RATE = 0.2
+# Stage 1: one branch of BOTTLENECK blocks, 64 channels.
+HRT_BASE.STAGE1 = CN()
+HRT_BASE.STAGE1.NUM_MODULES = 1
+HRT_BASE.STAGE1.NUM_BRANCHES = 1
+HRT_BASE.STAGE1.NUM_BLOCKS = [2]
+HRT_BASE.STAGE1.NUM_CHANNELS = [64]
+HRT_BASE.STAGE1.NUM_HEADS = [2]
+HRT_BASE.STAGE1.NUM_MLP_RATIOS = [4]
+HRT_BASE.STAGE1.NUM_RESOLUTIONS = [[56, 56]]
+HRT_BASE.STAGE1.BLOCK = "BOTTLENECK"
+# Stage 2: two branches; channels and heads double from one branch to the next.
+HRT_BASE.STAGE2 = CN()
+HRT_BASE.STAGE2.NUM_MODULES = 1
+HRT_BASE.STAGE2.NUM_BRANCHES = 2
+HRT_BASE.STAGE2.NUM_BLOCKS = [2, 2]
+HRT_BASE.STAGE2.NUM_CHANNELS = [78, 156]
+HRT_BASE.STAGE2.NUM_HEADS = [2, 4]
+HRT_BASE.STAGE2.NUM_MLP_RATIOS = [4, 4]
+HRT_BASE.STAGE2.NUM_RESOLUTIONS = [[56, 56], [28, 28]]
+HRT_BASE.STAGE2.NUM_WINDOW_SIZES = [7, 7]
+HRT_BASE.STAGE2.BLOCK = "TRANSFORMER_BLOCK"
+# Stage 3: three branches, four modules.
+HRT_BASE.STAGE3 = CN()
+HRT_BASE.STAGE3.NUM_MODULES = 4
+HRT_BASE.STAGE3.NUM_BRANCHES = 3
+HRT_BASE.STAGE3.NUM_BLOCKS = [2, 2, 2]
+HRT_BASE.STAGE3.NUM_CHANNELS = [78, 156, 312]
+HRT_BASE.STAGE3.NUM_HEADS = [2, 4, 8]
+HRT_BASE.STAGE3.NUM_MLP_RATIOS = [4, 4, 4]
+HRT_BASE.STAGE3.NUM_RESOLUTIONS = [[56, 56], [28, 28], [14, 14]]
+HRT_BASE.STAGE3.NUM_WINDOW_SIZES = [7, 7, 7]
+HRT_BASE.STAGE3.BLOCK = "TRANSFORMER_BLOCK"
+# Stage 4: four branches, two modules.
+HRT_BASE.STAGE4 = CN()
+HRT_BASE.STAGE4.NUM_MODULES = 2
+HRT_BASE.STAGE4.NUM_BRANCHES = 4
+HRT_BASE.STAGE4.NUM_BLOCKS = [2, 2, 2, 2]
+HRT_BASE.STAGE4.NUM_CHANNELS = [78, 156, 312, 624]
+HRT_BASE.STAGE4.NUM_HEADS = [2, 4, 8, 16]
+HRT_BASE.STAGE4.NUM_MLP_RATIOS = [4, 4, 4, 4]
+HRT_BASE.STAGE4.NUM_RESOLUTIONS = [[56, 56], [28, 28], [14, 14], [7, 7]]
+HRT_BASE.STAGE4.NUM_WINDOW_SIZES = [7, 7, 7, 7]
+HRT_BASE.STAGE4.BLOCK = "TRANSFORMER_BLOCK"
+# Window-size variants of HRT_BASE: each starts from a clone of the base
+# config and overrides only NUM_WINDOW_SIZES in the transformer stages (2-4).
+HRT_BASE_WIN_13 = HRT_BASE.clone()
+HRT_BASE_WIN_13.STAGE2.NUM_WINDOW_SIZES = [13, 13]
+HRT_BASE_WIN_13.STAGE3.NUM_WINDOW_SIZES = [13, 13, 13]
+HRT_BASE_WIN_13.STAGE4.NUM_WINDOW_SIZES = [13, 13, 13, 13]
+HRT_BASE_WIN_15 = HRT_BASE.clone()
+HRT_BASE_WIN_15.STAGE2.NUM_WINDOW_SIZES = [15, 15]
+HRT_BASE_WIN_15.STAGE3.NUM_WINDOW_SIZES = [15, 15, 15]
+HRT_BASE_WIN_15.STAGE4.NUM_WINDOW_SIZES = [15, 15, 15, 15]
+# Registry mapping backbone names to their config nodes.
+MODEL_CONFIGS = {
+    "hrt_small": HRT_SMALL,
+    "hrt_base": HRT_BASE,
+    "hrt_base_win13": HRT_BASE_WIN_13,
+    "hrt_base_win15": HRT_BASE_WIN_15,
+}

isegm/model/modeling/hrformer_helper/hrt/logger.py ADDED Viewed

	@@ -0,0 +1,205 @@

+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+# Author: Donny You([email protected])
+# Logging tool implemented with the python Package logging.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import argparse
+import logging
+import os
+import sys
+DEFAULT_LOGFILE_LEVEL = 'debug'
+DEFAULT_STDOUT_LEVEL = 'info'
+DEFAULT_LOG_FILE = './default.log'
+DEFAULT_LOG_FORMAT = '%(asctime)s %(levelname)-7s %(message)s'
+LOG_LEVEL_DICT = {
+    'debug': logging.DEBUG,
+    'info': logging.INFO,
+    'warning': logging.WARNING,
+    'error': logging.ERROR,
+    'critical': logging.CRITICAL
+}
+class Logger(object):
+    """
+    Args:
+      Log level: CRITICAL>ERROR>WARNING>INFO>DEBUG.
+      Log file: The file that stores the logging info.
+      rewrite: Clear the log file.
+      log format: The format of log messages.
+      stdout level: The log level to print on the screen.
+    """
+    logfile_level = None
+    log_file = None
+    log_format = None
+    rewrite = None
+    stdout_level = None
+    logger = None
+    _caches = {}
+    @staticmethod
+    def init(logfile_level=DEFAULT_LOGFILE_LEVEL,
+             log_file=DEFAULT_LOG_FILE,
+             log_format=DEFAULT_LOG_FORMAT,
+             rewrite=False,
+             stdout_level=None):
+        Logger.logfile_level = logfile_level
+        Logger.log_file = log_file
+        Logger.log_format = log_format
+        Logger.rewrite = rewrite
+        Logger.stdout_level = stdout_level
+        Logger.logger = logging.getLogger()
+        Logger.logger.handlers = []
+        fmt = logging.Formatter(Logger.log_format)
+        if Logger.logfile_level is not None:
+            filemode = 'w'
+            if not Logger.rewrite:
+                filemode = 'a'
+            dir_name = os.path.dirname(os.path.abspath(Logger.log_file))
+            if not os.path.exists(dir_name):
+                os.makedirs(dir_name)
+            if Logger.logfile_level not in LOG_LEVEL_DICT:
+                print('Invalid logging level: {}'.format(Logger.logfile_level))
+                Logger.logfile_level = DEFAULT_LOGFILE_LEVEL
+            Logger.logger.setLevel(LOG_LEVEL_DICT[Logger.logfile_level])
+            fh = logging.FileHandler(Logger.log_file, mode=filemode)
+            fh.setFormatter(fmt)
+            fh.setLevel(LOG_LEVEL_DICT[Logger.logfile_level])
+            Logger.logger.addHandler(fh)
+        if stdout_level is not None:
+            if Logger.logfile_level is None:
+                Logger.logger.setLevel(LOG_LEVEL_DICT[Logger.stdout_level])
+            console = logging.StreamHandler()
+            if Logger.stdout_level not in LOG_LEVEL_DICT:
+                print('Invalid logging level: {}'.format(Logger.stdout_level))
+                return
+            console.setLevel(LOG_LEVEL_DICT[Logger.stdout_level])
+            console.setFormatter(fmt)
+            Logger.logger.addHandler(console)
+    @staticmethod
+    def set_log_file(file_path):
+        Logger.log_file = file_path
+        Logger.init(log_file=file_path)
+    @staticmethod
+    def set_logfile_level(log_level):
+        if log_level not in LOG_LEVEL_DICT:
+            print('Invalid logging level: {}'.format(log_level))
+            return
+        Logger.init(logfile_level=log_level)
+    @staticmethod
+    def clear_log_file():
+        Logger.rewrite = True
+        Logger.init(rewrite=True)
+    @staticmethod
+    def check_logger():
+        if Logger.logger is None:
+            Logger.init(logfile_level=None, stdout_level=DEFAULT_STDOUT_LEVEL)
+    @staticmethod
+    def set_stdout_level(log_level):
+        if log_level not in LOG_LEVEL_DICT:
+            print('Invalid logging level: {}'.format(log_level))
+            return
+        Logger.init(stdout_level=log_level)
+    @staticmethod
+    def debug(message):
+        Logger.check_logger()
+        filename = os.path.basename(sys._getframe().f_back.f_code.co_filename)
+        lineno = sys._getframe().f_back.f_lineno
+        prefix = '[{}, {}]'.format(filename,lineno)
+        Logger.logger.debug('{} {}'.format(prefix, message))
+    @staticmethod
+    def info(message):
+        Logger.check_logger()
+        filename = os.path.basename(sys._getframe().f_back.f_code.co_filename)
+        lineno = sys._getframe().f_back.f_lineno
+        prefix = '[{}, {}]'.format(filename,lineno)
+        Logger.logger.info('{} {}'.format(prefix, message))
+    @staticmethod
+    def info_once(message):
+        Logger.check_logger()
+        filename = os.path.basename(sys._getframe().f_back.f_code.co_filename)
+        lineno = sys._getframe().f_back.f_lineno
+        prefix = '[{}, {}]'.format(filename, lineno)
+        if Logger._caches.get((prefix, message)) is not None:
+            return
+        Logger.logger.info('{} {}'.format(prefix, message))
+        Logger._caches[(prefix, message)] = True
+    @staticmethod
+    def warn(message):
+        Logger.check_logger()
+        filename = os.path.basename(sys._getframe().f_back.f_code.co_filename)
+        lineno = sys._getframe().f_back.f_lineno
+        prefix = '[{}, {}]'.format(filename,lineno)
+        Logger.logger.warn('{} {}'.format(prefix, message))
+    @staticmethod
+    def error(message):
+        Logger.check_logger()
+        filename = os.path.basename(sys._getframe().f_back.f_code.co_filename)
+        lineno = sys._getframe().f_back.f_lineno
+        prefix = '[{}, {}]'.format(filename,lineno)
+        Logger.logger.error('{} {}'.format(prefix, message))
+    @staticmethod
+    def critical(message):
+        Logger.check_logger()
+        filename = os.path.basename(sys._getframe().f_back.f_code.co_filename)
+        lineno = sys._getframe().f_back.f_lineno
+        prefix = '[{}, {}]'.format(filename,lineno)
+        Logger.logger.critical('{} {}'.format(prefix, message))
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--logfile_level', default="debug", type=str,
+                        dest='logfile_level', help='To set the log level to files.')
+    parser.add_argument('--stdout_level', default=None, type=str,
+                        dest='stdout_level', help='To set the level to print to screen.')
+    parser.add_argument('--log_file', default="./default.log", type=str,
+                        dest='log_file', help='The path of log files.')
+    parser.add_argument('--log_format', default="%(asctime)s %(levelname)-7s %(message)s",
+                        type=str, dest='log_format', help='The format of log messages.')
+    parser.add_argument('--rewrite', default=False, type=bool,
+                        dest='rewrite', help='Clear the log files existed.')
+    args = parser.parse_args()
+    Logger.init(logfile_level=args.logfile_level, stdout_level=args.stdout_level,
+                log_file=args.log_file, log_format=args.log_format, rewrite=args.rewrite)
+    Logger.info("info test.")
+    Logger.debug("debug test.")
+    Logger.warn("warn test.")
+    Logger.error("error test.")
+    Logger.debug("debug test.")

isegm/model/modeling/hrformer_helper/hrt/module_helper.py ADDED Viewed

	@@ -0,0 +1,310 @@

+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+# Author: Donny You ([email protected])
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import functools
+import os
+import pdb
+import math
+import torch
+import torch.nn as nn
+try:
+    from urllib import urlretrieve
+except ImportError:
+    from urllib.request import urlretrieve
+from .logger import Logger as Log
+class ModuleHelper(object):
+    @staticmethod
+    def BNReLU(num_features, bn_type=None, **kwargs):
+        if bn_type == "torchbn":
+            return nn.Sequential(nn.BatchNorm2d(num_features, **kwargs), nn.ReLU())
+        elif bn_type == "torchsyncbn":
+            return nn.Sequential(nn.SyncBatchNorm(num_features, **kwargs), nn.ReLU())
+        elif bn_type == "syncbn":
+            from lib.extensions.syncbn.module import BatchNorm2d
+            return nn.Sequential(BatchNorm2d(num_features, **kwargs), nn.ReLU())
+        elif bn_type == "sn":
+            from lib.extensions.switchablenorms.switchable_norm import SwitchNorm2d
+            return nn.Sequential(SwitchNorm2d(num_features, **kwargs), nn.ReLU())
+        elif bn_type == "gn":
+            return nn.Sequential(
+                nn.GroupNorm(num_groups=8, num_channels=num_features, **kwargs),
+                nn.ReLU(),
+            )
+        elif bn_type == "fn":
+            Log.error("Not support Filter-Response-Normalization: {}.".format(bn_type))
+            exit(1)
+        elif bn_type == "inplace_abn":
+            torch_ver = torch.__version__[:3]
+            # Log.info('Pytorch Version: {}'.format(torch_ver))
+            if torch_ver == "0.4":
+                from lib.extensions.inplace_abn.bn import InPlaceABNSync
+                return InPlaceABNSync(num_features, **kwargs)
+            elif torch_ver in ("1.0", "1.1"):
+                from lib.extensions.inplace_abn_1.bn import InPlaceABNSync
+                return InPlaceABNSync(num_features, **kwargs)
+            elif torch_ver == "1.2":
+                from inplace_abn import InPlaceABNSync
+                return InPlaceABNSync(num_features, **kwargs)
+        else:
+            Log.error("Not support BN type: {}.".format(bn_type))
+            exit(1)
+    @staticmethod
+    def BatchNorm2d(bn_type="torch", ret_cls=False):
+        if bn_type == "torchbn":
+            return nn.BatchNorm2d
+        elif bn_type == "torchsyncbn":
+            return nn.SyncBatchNorm
+        elif bn_type == "syncbn":
+            from lib.extensions.syncbn.module import BatchNorm2d
+            return BatchNorm2d
+        elif bn_type == "sn":
+            from lib.extensions.switchablenorms.switchable_norm import SwitchNorm2d
+            return SwitchNorm2d
+        elif bn_type == "gn":
+            return functools.partial(nn.GroupNorm, num_groups=32)
+        elif bn_type == "inplace_abn":
+            torch_ver = torch.__version__[:3]
+            if torch_ver == "0.4":
+                from lib.extensions.inplace_abn.bn import InPlaceABNSync
+                if ret_cls:
+                    return InPlaceABNSync
+                return functools.partial(InPlaceABNSync, activation="none")
+            elif torch_ver in ("1.0", "1.1"):
+                from lib.extensions.inplace_abn_1.bn import InPlaceABNSync
+                if ret_cls:
+                    return InPlaceABNSync
+                return functools.partial(InPlaceABNSync, activation="none")
+            elif torch_ver == "1.2":
+                from inplace_abn import InPlaceABNSync
+                if ret_cls:
+                    return InPlaceABNSync
+                return functools.partial(InPlaceABNSync, activation="identity")
+        else:
+            Log.error("Not support BN type: {}.".format(bn_type))
+            exit(1)
+    @staticmethod
+    def load_model(model, pretrained=None, all_match=True, network="resnet101"):
+        if pretrained is None:
+            return model
+        if all_match:
+            Log.info("Loading pretrained model:{}".format(pretrained))
+            pretrained_dict = torch.load(pretrained)
+            model_dict = model.state_dict()
+            load_dict = dict()
+            for k, v in pretrained_dict.items():
+                if "resinit.{}".format(k) in model_dict:
+                    load_dict["resinit.{}".format(k)] = v
+                else:
+                    load_dict[k] = v
+            model.load_state_dict(load_dict)
+        else:
+            Log.info("Loading pretrained model:{}".format(pretrained))
+            pretrained_dict = torch.load(pretrained)
+            # settings for "wide_resnet38"  or network == "resnet152"
+            if network == "wide_resnet":
+                pretrained_dict = pretrained_dict["state_dict"]
+            model_dict = model.state_dict()
+            if network == "hrnet_plus":
+                # pretrained_dict['conv1_full_res.weight'] = pretrained_dict['conv1.weight']
+                # pretrained_dict['conv2_full_res.weight'] = pretrained_dict['conv2.weight']
+                load_dict = {
+                    k: v for k, v in pretrained_dict.items() if k in model_dict.keys()
+                }
+            elif network == "hrt_window":
+                pretrained_dict = pretrained_dict["model"]
+                for name, m in model.named_parameters():
+                    if "relative_position_bias_table" in name and "embed" not in name:
+                        target_size = int(math.sqrt(m.shape[0]))
+                        head_num = m.shape[-1]
+                        ckpt_size = int(math.sqrt(pretrained_dict[name].shape[0]))
+                        if target_size != ckpt_size:
+                            Log.info(
+                                f"Interpolate from size {pretrained_dict[name ].shape} to {m.shape}."
+                            )
+                            reshape_ckpt = (
+                                pretrained_dict[name]
+                                .permute(1, 0)
+                                .reshape(1, head_num, ckpt_size, ckpt_size)
+                            )
+                            inter_ckpt = (
+                                torch.nn.functional.interpolate(
+                                    reshape_ckpt,
+                                    size=(target_size, target_size),
+                                    mode="bilinear",
+                                )
+                                .reshape(head_num, -1)
+                                .permute(1, 0)
+                            )
+                            scale = 1
+                            inter_ckpt *= scale
+                            pretrained_dict[name] = inter_ckpt
+                for name, m in list(pretrained_dict.items()):
+                    if "relative_position_index" in name:
+                        Log.info(f"Remove {name}.")
+                        pretrained_dict.pop(name)
+                load_dict = {
+                    k: v for k, v in pretrained_dict.items() if k in model_dict.keys()
+                }
+                Log.info(
+                    "Missing keys: {}".format(list(set(model_dict) - set(load_dict)))
+                )
+            elif network == "hrt":
+                pretrained_dict = pretrained_dict["model"]
+                load_dict = {
+                    k: v for k, v in pretrained_dict.items() if k in model_dict.keys()
+                }
+                Log.info(
+                    "Missing keys: {}".format(list(set(model_dict) - set(load_dict)))
+                )
+            elif network == "swin":
+                pretrained_dict = pretrained_dict["model"]
+                # TODO fix the mis-match between the dict keys and the checkpoint keys.
+                pretrained_dict = {
+                    k.replace(".attn.", ".attn.attn."): v
+                    for k, v in pretrained_dict.items()
+                }
+                load_dict = {
+                    k: v for k, v in pretrained_dict.items() if k in model_dict.keys()
+                }
+                Log.info(
+                    "Missing keys: {}".format(list(set(model_dict) - set(load_dict)))
+                )
+            elif network == "hrnet" or network == "xception" or network == "resnest":
+                load_dict = {
+                    k: v for k, v in pretrained_dict.items() if k in model_dict.keys()
+                }
+                Log.info(
+                    "Missing keys: {}".format(list(set(model_dict) - set(load_dict)))
+                )
+            elif network == "dcnet" or network == "resnext":
+                load_dict = dict()
+                for k, v in pretrained_dict.items():
+                    if "resinit.{}".format(k) in model_dict:
+                        load_dict["resinit.{}".format(k)] = v
+                    else:
+                        if k in model_dict:
+                            load_dict[k] = v
+                        else:
+                            pass
+            elif network == "wide_resnet":
+                load_dict = {
+                    ".".join(k.split(".")[1:]): v
+                    for k, v in pretrained_dict.items()
+                    if ".".join(k.split(".")[1:]) in model_dict
+                }
+            else:
+                load_dict = {
+                    ".".join(k.split(".")[1:]): v
+                    for k, v in pretrained_dict.items()
+                    if ".".join(k.split(".")[1:]) in model_dict
+                }
+            # used to debug
+            if int(os.environ.get("debug_load_model", 0)):
+                Log.info("Matched Keys List:")
+                for key in load_dict.keys():
+                    Log.info("{}".format(key))
+            model_dict.update(load_dict)
+            model.load_state_dict(model_dict)
+        return model
+    @staticmethod
+    def load_url(url, map_location=None):
+        model_dir = os.path.join("~", ".PyTorchCV", "models")
+        if not os.path.exists(model_dir):
+            os.makedirs(model_dir)
+        filename = url.split("/")[-1]
+        cached_file = os.path.join(model_dir, filename)
+        if not os.path.exists(cached_file):
+            Log.info('Downloading: "{}" to {}\n'.format(url, cached_file))
+            urlretrieve(url, cached_file)
+        Log.info("Loading pretrained model:{}".format(cached_file))
+        return torch.load(cached_file, map_location=map_location)
+    @staticmethod
+    def constant_init(module, val, bias=0):
+        nn.init.constant_(module.weight, val)
+        if hasattr(module, "bias") and module.bias is not None:
+            nn.init.constant_(module.bias, bias)
+    @staticmethod
+    def xavier_init(module, gain=1, bias=0, distribution="normal"):
+        assert distribution in ["uniform", "normal"]
+        if distribution == "uniform":
+            nn.init.xavier_uniform_(module.weight, gain=gain)
+        else:
+            nn.init.xavier_normal_(module.weight, gain=gain)
+        if hasattr(module, "bias") and module.bias is not None:
+            nn.init.constant_(module.bias, bias)
+    @staticmethod
+    def normal_init(module, mean=0, std=1, bias=0):
+        nn.init.normal_(module.weight, mean, std)
+        if hasattr(module, "bias") and module.bias is not None:
+            nn.init.constant_(module.bias, bias)
+    @staticmethod
+    def uniform_init(module, a=0, b=1, bias=0):
+        nn.init.uniform_(module.weight, a, b)
+        if hasattr(module, "bias") and module.bias is not None:
+            nn.init.constant_(module.bias, bias)
+    @staticmethod
+    def kaiming_init(
+        module, mode="fan_in", nonlinearity="leaky_relu", bias=0, distribution="normal"
+    ):
+        assert distribution in ["uniform", "normal"]
+        if distribution == "uniform":
+            nn.init.kaiming_uniform_(
+                module.weight, mode=mode, nonlinearity=nonlinearity
+            )
+        else:
+            nn.init.kaiming_normal_(module.weight, mode=mode, nonlinearity=nonlinearity)
+        if hasattr(module, "bias") and module.bias is not None:
+            nn.init.constant_(module.bias, bias)

isegm/model/modeling/hrformer_helper/hrt/modules/__init__.py ADDED Viewed

File without changes

isegm/model/modeling/hrformer_helper/hrt/modules/bottleneck_block.py ADDED Viewed

	@@ -0,0 +1,128 @@

+import os
+import pdb
+import logging
+import torch.nn as nn
+import torch.nn.functional as F
+# from torchvision.models.utils import load_state_dict_from_url
+# from timm.models.registry import register_model
+from functools import partial
+BN_MOMENTUM = 0.1
+class Bottleneck(nn.Module):
+    expansion = 4
+    def __init__(
+        self,
+        inplanes,
+        planes,
+        stride=1,
+        downsample=None,
+        mhsa_flag=False,
+        num_heads=1,
+        num_halo_block=1,
+        num_mlp_ratio=4,
+        num_sr_ratio=1,
+        num_resolution=None,
+        with_rpe=False,
+        with_ffn=True,
+    ):
+        super(Bottleneck, self).__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = nn.SyncBatchNorm(planes)
+        self.conv2 = nn.Conv2d(
+            planes, planes, kernel_size=3, stride=stride, padding=1, bias=False
+        )
+        self.bn2 = nn.SyncBatchNorm(planes)
+        self.conv3 = nn.Conv2d(
+            planes, planes * self.expansion, kernel_size=1, bias=False
+        )
+        self.bn3 = nn.SyncBatchNorm(planes * self.expansion)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+    def forward(self, x):
+        residual = x
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+        out = self.conv3(out)
+        out = self.bn3(out)
+        if self.downsample is not None:
+            residual = self.downsample(x)
+        out += residual
+        out = self.relu(out)
+        return out
+class BottleneckDWP(nn.Module):
+    expansion = 4
+    def __init__(
+        self,
+        inplanes,
+        planes,
+        stride=1,
+        downsample=None,
+        mhsa_flag=False,
+        num_heads=1,
+        num_halo_block=1,
+        num_mlp_ratio=4,
+        num_sr_ratio=1,
+        num_resolution=None,
+        with_rpe=False,
+        with_ffn=True,
+    ):
+        super(BottleneckDWP, self).__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = nn.SyncBatchNorm(planes, momentum=BN_MOMENTUM)
+        self.conv2 = nn.Conv2d(
+            planes,
+            planes,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            bias=False,
+            groups=planes,
+        )
+        self.bn2 = nn.SyncBatchNorm(planes, momentum=BN_MOMENTUM)
+        self.conv3 = nn.Conv2d(
+            planes, planes * self.expansion, kernel_size=1, bias=False
+        )
+        self.bn3 = nn.SyncBatchNorm(planes * self.expansion, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+    def forward(self, x):
+        residual = x
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+        out = self.conv3(out)
+        out = self.bn3(out)
+        if self.downsample is not None:
+            residual = self.downsample(x)
+        out += residual
+        out = self.relu(out)
+        return out

isegm/model/modeling/hrformer_helper/hrt/modules/ffn_block.py ADDED Viewed

	@@ -0,0 +1,287 @@

+import pdb
+import torch
+import torch.nn as nn
+class Mlp(nn.Module):
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer=nn.GELU,
+        drop=0.0,
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x, H, W):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+class MlpLight(nn.Module):
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer=nn.GELU,
+        drop=0.0,
+    ):
+        super().__init__()
+        self.fc1 = nn.Linear(in_features, in_features)
+        self.act = act_layer()
+        self.drop = nn.Dropout(drop)
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        return x
+class MlpDW(nn.Module):
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer=nn.GELU,
+        dw_act_layer=nn.GELU,
+        drop=0.0,
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1)
+        self.act1 = act_layer()
+        self.dw3x3 = nn.Conv2d(
+            hidden_features,
+            hidden_features,
+            kernel_size=3,
+            stride=1,
+            groups=hidden_features,
+            padding=1,
+        )
+        self.act2 = dw_act_layer()
+        self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x, H, W):
+        B, N, C = x.shape
+        if N == (H * W + 1):
+            cls_tokens = x[:, 0, :]
+            x_ = x[:, 1:, :].permute(0, 2, 1).reshape(B, C, H, W)
+        else:
+            x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
+        x_ = self.fc1(x_)
+        x_ = self.act1(x_)
+        x_ = self.dw3x3(x_)
+        x_ = self.act2(x_)
+        x_ = self.drop(x_)
+        x_ = self.fc2(x_)
+        x_ = self.drop(x_)
+        x_ = x_.reshape(B, C, -1).permute(0, 2, 1)
+        if N == (H * W + 1):
+            x = torch.cat((cls_tokens.unsqueeze(1), x_), dim=1)
+        else:
+            x = x_
+        return x
class MlpDWBN(nn.Module):
    """MLP built from 1x1 convolutions with a depth-wise 3x3 convolution in between.

    The layers run in the order
    ``fc1 -> norm1 -> act1 -> dw3x3 -> norm2 -> act2 -> fc2 -> norm3 -> act3``.
    Every normalisation layer is an ``nn.SyncBatchNorm``.

    Args:
        in_features: Number of input channels.
        hidden_features: Channel count of the hidden layers; defaults to
            ``in_features``.
        out_features: Number of output channels; defaults to ``in_features``.
        act_layer: Activation class instantiated after ``norm1`` and ``norm3``.
        dw_act_layer: Activation class instantiated after ``norm2``.
        drop: Accepted for interface compatibility only; this module does not
            create or apply a dropout layer.
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        dw_act_layer=nn.GELU,
        drop=0.0,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        # Attribute names and registration order are kept stable so existing
        # checkpoints (state_dict keys) keep loading.
        self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1)
        self.act1 = act_layer()
        self.norm1 = nn.SyncBatchNorm(hidden_features)
        self.dw3x3 = nn.Conv2d(
            hidden_features,
            hidden_features,
            kernel_size=3,
            stride=1,
            groups=hidden_features,
            padding=1,
        )
        self.act2 = dw_act_layer()
        self.norm2 = nn.SyncBatchNorm(hidden_features)
        self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1)
        self.act3 = act_layer()
        self.norm3 = nn.SyncBatchNorm(out_features)

    def _run_layers(self, x):
        """Apply the conv/norm/activation stack to a ``(B, C, H, W)`` tensor."""
        x = self.act1(self.norm1(self.fc1(x)))
        x = self.act2(self.norm2(self.dw3x3(x)))
        x = self.act3(self.norm3(self.fc2(x)))
        return x

    def forward(self, x, H, W):
        """Run the MLP on a token sequence or on a feature map.

        Args:
            x: Either a 3-D tensor ``(B, N, C)`` with ``N == H * W`` or
                ``N == H * W + 1`` (the first token is then treated as a class
                token and bypasses the layers), or a 4-D tensor
                ``(B, C, H, W)``.
            H: Height of the token grid (used for 3-D input only).
            W: Width of the token grid (used for 3-D input only).

        Returns:
            A tensor with the same layout as the input: ``(B, N, C_out)`` for
            3-D input, ``(B, C_out, H, W)`` for 4-D input.

        Raises:
            RuntimeError: If ``x`` is neither 3-D nor 4-D.
        """
        if x.dim() == 4:
            return self._run_layers(x)
        if x.dim() != 3:
            raise RuntimeError("Unsupported input shape: {}".format(x.shape))

        B, N, C = x.shape
        has_cls_token = N == (H * W + 1)
        if has_cls_token:
            cls_tokens = x[:, 0, :]
            tokens = x[:, 1:, :]
        else:
            tokens = x

        feat = tokens.permute(0, 2, 1).reshape(B, C, H, W)
        feat = self._run_layers(feat)
        # Flatten using the channel count fc2 actually produced; using the
        # input C here would scramble tokens whenever out_features != C.
        feat = feat.reshape(B, feat.shape[1], -1).permute(0, 2, 1)

        if has_cls_token:
            # The class token is passed through untouched, so this only works
            # when the output width equals the input width.
            return torch.cat((cls_tokens.unsqueeze(1), feat), dim=1)
        return feat
class MlpConvBN(nn.Module):
    """Two-layer token MLP made of point-wise ``Conv1d`` + ``BatchNorm1d`` pairs.

    Args:
        in_features: Size of the last dimension of the input.
        hidden_features: Width of the hidden layer; defaults to ``in_features``.
        out_features: Size of the last dimension of the output; defaults to
            ``in_features``.
        act_layer: Activation class instantiated between the two layers.
        drop: Dropout probability, applied after the activation and again on
            the final output.
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        drop=0.0,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = self._pointwise_bn(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = self._pointwise_bn(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    @staticmethod
    def _pointwise_bn(channels_in, channels_out):
        """Build a kernel-size-1 ``Conv1d`` followed by ``BatchNorm1d``."""
        conv = nn.Conv1d(
            in_channels=channels_in,
            out_channels=channels_out,
            kernel_size=1,
            stride=1,
            padding=0,
        )
        return nn.Sequential(conv, nn.BatchNorm1d(channels_out))

    def forward(self, x):
        """Map ``(B, N, C_in)`` to ``(B, N, C_out)``.

        The input is transposed to channels-first for the ``Conv1d`` layers and
        transposed back before the final dropout.
        """
        channels_first = x.transpose(1, 2)
        hidden = self.drop(self.act(self.fc1(channels_first)))
        projected = self.fc2(hidden).transpose(1, 2)
        return self.drop(projected)
class MlpWODWBN(nn.Module):
    """MLP built from two 1x1 convolutions, without a depth-wise convolution.

    The layers run in the order
    ``fc1 -> norm1 -> act1 -> fc2 -> norm3 -> act3 -> drop``.
    Both normalisation layers are ``nn.SyncBatchNorm``.

    Args:
        in_features: Number of input channels.
        hidden_features: Channel count of the hidden layer; defaults to
            ``in_features``.
        out_features: Number of output channels; defaults to ``in_features``.
        act_layer: Activation class instantiated after ``norm1`` and ``norm3``.
        dw_act_layer: Accepted for interface compatibility; unused because this
            variant has no depth-wise stage.
        drop: Dropout probability applied to the output of the stack.
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        dw_act_layer=nn.GELU,
        drop=0.0,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        # Attribute names and registration order are kept stable so existing
        # checkpoints (state_dict keys) keep loading.
        self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1)
        self.act1 = act_layer()
        self.norm1 = nn.SyncBatchNorm(hidden_features)
        self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1)
        self.act3 = act_layer()
        self.norm3 = nn.SyncBatchNorm(out_features)
        self.drop = nn.Dropout(drop)

    def _run_layers(self, x):
        """Apply the conv/norm/activation stack to a ``(B, C, H, W)`` tensor."""
        x = self.act1(self.norm1(self.fc1(x)))
        x = self.act3(self.norm3(self.fc2(x)))
        return self.drop(x)

    def forward(self, x, H, W):
        """Run the MLP on a token sequence or on a feature map.

        Args:
            x: Either a 3-D tensor ``(B, N, C)`` with ``N == H * W`` or
                ``N == H * W + 1`` (the first token is then treated as a class
                token and bypasses the layers), or a 4-D tensor
                ``(B, C, H, W)``.
            H: Height of the token grid (used for 3-D input only).
            W: Width of the token grid (used for 3-D input only).

        Returns:
            A tensor with the same layout as the input: ``(B, N, C_out)`` for
            3-D input, ``(B, C_out, H, W)`` for 4-D input.

        Raises:
            RuntimeError: If ``x`` is neither 3-D nor 4-D.
        """
        if x.dim() == 4:
            # The 4-D path uses the same stack as the 3-D path. It previously
            # referenced dw3x3/norm2/act2, which this class never defines, and
            # therefore raised AttributeError.
            return self._run_layers(x)
        if x.dim() != 3:
            raise RuntimeError("Unsupported input shape: {}".format(x.shape))

        B, N, C = x.shape
        has_cls_token = N == (H * W + 1)
        if has_cls_token:
            cls_tokens = x[:, 0, :]
            tokens = x[:, 1:, :]
        else:
            tokens = x

        feat = tokens.permute(0, 2, 1).reshape(B, C, H, W)
        feat = self._run_layers(feat)
        # Flatten using the channel count fc2 actually produced; using the
        # input C here would scramble tokens whenever out_features != C.
        feat = feat.reshape(B, feat.shape[1], -1).permute(0, 2, 1)

        if has_cls_token:
            # The class token is passed through untouched, so this only works
            # when the output width equals the input width.
            return torch.cat((cls_tokens.unsqueeze(1), feat), dim=1)
        return feat

isegm/model/modeling/hrformer_helper/hrt/modules/multihead_attention.py ADDED Viewed

	@@ -0,0 +1,342 @@

+import copy
+import warnings
+import torch
+import torch.nn.functional as F
+from torch import nn, Tensor
+from torch.nn.modules.module import Module
+from torch._jit_internal import Optional, Tuple
+from torch.nn.functional import linear, pad, softmax, dropout
+from torch.overrides import has_torch_function, handle_torch_function
class MultiheadAttention(Module):
    """Multi-head scaled dot-product attention with separate q/k/v ``Linear`` layers.

    Unlike ``torch.nn.MultiheadAttention`` this module never uses a packed
    ``in_proj_weight``: the projections are always ``self.q_proj``,
    ``self.k_proj`` and ``self.v_proj``. It additionally accepts a
    ``residual_attn`` term that is added to the attention logits before the
    softmax.

    Args:
        embed_dim: Feature size of the query (and of the projected q/k/v).
        num_heads: Number of attention heads; must divide ``embed_dim``.
        dropout: Dropout probability applied to the attention weights.
        bias: Whether the q/k/v projections have a bias term.
        add_bias_kv: Accepted for interface compatibility; not used here
            (``bias_k`` / ``bias_v`` are always ``None``).
        add_zero_attn: If true, append an all-zero key/value position.
        kdim: Feature size of ``key``; defaults to ``embed_dim``.
        vdim: Feature size of ``value``; defaults to ``embed_dim``.
    """

    bias_k: Optional[torch.Tensor]
    bias_v: Optional[torch.Tensor]

    def __init__(
        self,
        embed_dim,
        num_heads,
        dropout=0.0,
        bias=True,
        add_bias_kv=False,
        add_zero_attn=False,
        kdim=None,
        vdim=None,
    ):
        super(MultiheadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert (
            self.head_dim * num_heads == self.embed_dim
        ), "embed_dim must be divisible by num_heads"
        # Layer names, shapes and registration order are left untouched so
        # that existing checkpoints keep loading.
        self.k_proj = nn.Linear(self.kdim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(self.vdim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        # Placeholders kept for signature parity with torch's functional
        # multi-head attention; they are passed around but never read.
        self.in_proj_bias = None
        self.in_proj_weight = None
        self.bias_k = self.bias_v = None
        self.q_proj_weight = None
        self.k_proj_weight = None
        self.v_proj_weight = None
        self.add_zero_attn = add_zero_attn

    def __setstate__(self, state):
        # Support loading old MultiheadAttention checkpoints generated by v1.1.0
        if "_qkv_same_embed_dim" not in state:
            state["_qkv_same_embed_dim"] = True
        super(MultiheadAttention, self).__setstate__(state)

    def forward(
        self,
        query,
        key,
        value,
        key_padding_mask=None,
        need_weights=False,
        attn_mask=None,
        residual_attn=None,
    ):
        """Attend from ``query`` to ``key`` / ``value``.

        Args:
            query: Tensor of shape ``(tgt_len, bsz, embed_dim)``.
            key: Tensor of shape ``(src_len, bsz, kdim)``, or ``None`` to reuse
                ``query``.
            value: Tensor of shape ``(src_len, bsz, vdim)``, or ``None`` to
                reuse ``query``.
            key_padding_mask: Optional ``(bsz, src_len)`` mask; true entries
                are excluded from attention.
            need_weights: If true, also return head-averaged attention weights.
            attn_mask: Optional 2-D ``(tgt_len, src_len)`` or 3-D
                ``(bsz * num_heads, tgt_len, src_len)`` mask. Boolean masks
                mark blocked positions; float masks are added to the logits.
            residual_attn: Optional tensor added to the logits; it must
                broadcast against ``(bsz, num_heads, tgt_len, src_len)`` after
                a leading dimension is inserted.

        Returns:
            The attention output alone when ``need_weights`` is false,
            otherwise ``(output, weights)`` with ``weights`` of shape
            ``(bsz, tgt_len, src_len)``.
        """
        extra_kwargs = {}
        if not self._qkv_same_embed_dim:
            extra_kwargs = dict(
                use_separate_proj_weight=True,
                q_proj_weight=self.q_proj_weight,
                k_proj_weight=self.k_proj_weight,
                v_proj_weight=self.v_proj_weight,
            )
        return self.multi_head_attention_forward(
            query,
            key,
            value,
            self.embed_dim,
            self.num_heads,
            self.in_proj_weight,
            self.in_proj_bias,
            self.bias_k,
            self.bias_v,
            self.add_zero_attn,
            self.dropout,
            self.out_proj.weight,
            self.out_proj.bias,
            training=self.training,
            key_padding_mask=key_padding_mask,
            need_weights=need_weights,
            attn_mask=attn_mask,
            out_dim=self.vdim,
            residual_attn=residual_attn,
            **extra_kwargs,
        )

    def multi_head_attention_forward(
        self,
        query: Tensor,
        key: Tensor,
        value: Tensor,
        embed_dim_to_check: int,
        num_heads: int,
        in_proj_weight: Optional[Tensor],
        in_proj_bias: Optional[Tensor],
        bias_k: Optional[Tensor],
        bias_v: Optional[Tensor],
        add_zero_attn: bool,
        dropout_p: float,
        out_proj_weight: Tensor,
        out_proj_bias: Tensor,
        training: bool = True,
        key_padding_mask: Optional[Tensor] = None,
        need_weights: bool = False,
        attn_mask: Optional[Tensor] = None,
        use_separate_proj_weight: bool = False,
        q_proj_weight: Optional[Tensor] = None,
        k_proj_weight: Optional[Tensor] = None,
        v_proj_weight: Optional[Tensor] = None,
        static_k: Optional[Tensor] = None,
        static_v: Optional[Tensor] = None,
        out_dim: Optional[int] = None,
        residual_attn: Optional[Tensor] = None,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """Functional core of :meth:`forward`.

        The q/k/v projections always come from ``self.q_proj``, ``self.k_proj``
        and ``self.v_proj``; ``in_proj_*``, ``bias_k``, ``bias_v``,
        ``use_separate_proj_weight``, ``*_proj_weight`` and ``static_*`` are
        accepted for signature compatibility and are not read by the
        computation.

        Args:
            embed_dim_to_check: Expected size of the last dimension of
                ``query``.
            out_dim: Total feature size of the projected values (split across
                heads). Defaults to the query embedding size when ``None``.

        Returns:
            ``attn_output`` alone when ``need_weights`` is false (despite the
            tuple annotation, which is kept for compatibility), otherwise
            ``(attn_output, head_averaged_weights)``.

        Raises:
            RuntimeError: If ``attn_mask`` has an unsupported rank or size.
        """
        if not torch.jit.is_scripting():
            tens_ops = (
                query,
                key,
                value,
                in_proj_weight,
                in_proj_bias,
                bias_k,
                bias_v,
                out_proj_weight,
                out_proj_bias,
            )
            if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(
                tens_ops
            ):
                # Dispatch through the bound method: there is no module-level
                # function named ``multi_head_attention_forward`` in this file.
                return handle_torch_function(
                    self.multi_head_attention_forward,
                    tens_ops,
                    query,
                    key,
                    value,
                    embed_dim_to_check,
                    num_heads,
                    in_proj_weight,
                    in_proj_bias,
                    bias_k,
                    bias_v,
                    add_zero_attn,
                    dropout_p,
                    out_proj_weight,
                    out_proj_bias,
                    training=training,
                    key_padding_mask=key_padding_mask,
                    need_weights=need_weights,
                    attn_mask=attn_mask,
                    use_separate_proj_weight=use_separate_proj_weight,
                    q_proj_weight=q_proj_weight,
                    k_proj_weight=k_proj_weight,
                    v_proj_weight=v_proj_weight,
                    static_k=static_k,
                    static_v=static_v,
                    out_dim=out_dim,
                    residual_attn=residual_attn,
                )

        tgt_len, bsz, embed_dim = query.size()
        key = query if key is None else key
        value = query if value is None else value
        assert embed_dim == embed_dim_to_check
        # allow MHA to have different sizes for the feature dimension
        assert key.size(0) == value.size(0) and key.size(1) == value.size(1)

        if out_dim is None:
            # Direct callers may omit out_dim; v_proj emits embed_dim features.
            out_dim = embed_dim
        head_dim = embed_dim // num_heads
        # NOTE(review): v_proj outputs ``embed_dim`` features, so the value
        # reshape below only lines up when out_dim == embed_dim - confirm
        # whether vdim != embed_dim is meant to be supported.
        v_head_dim = out_dim // num_heads
        assert (
            head_dim * num_heads == embed_dim
        ), "embed_dim must be divisible by num_heads"
        scaling = float(head_dim) ** -0.5

        q = self.q_proj(query) * scaling
        k = self.k_proj(key)
        v = self.v_proj(value)

        if attn_mask is not None:
            assert (
                attn_mask.dtype == torch.float32
                or attn_mask.dtype == torch.float64
                or attn_mask.dtype == torch.float16
                or attn_mask.dtype == torch.uint8
                or attn_mask.dtype == torch.bool
            ), "Only float, byte, and bool types are supported for attn_mask, not {}".format(
                attn_mask.dtype
            )
            if attn_mask.dtype == torch.uint8:
                warnings.warn(
                    "Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead."
                )
                attn_mask = attn_mask.to(torch.bool)
            if attn_mask.dim() == 2:
                attn_mask = attn_mask.unsqueeze(0)
                if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
                    raise RuntimeError("The size of the 2D attn_mask is not correct.")
            elif attn_mask.dim() == 3:
                if list(attn_mask.size()) != [
                    bsz * num_heads,
                    query.size(0),
                    key.size(0),
                ]:
                    raise RuntimeError("The size of the 3D attn_mask is not correct.")
            else:
                raise RuntimeError(
                    "attn_mask's dimension {} is not supported".format(attn_mask.dim())
                )

        # convert ByteTensor key_padding_mask to bool
        if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
            warnings.warn(
                "Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead."
            )
            key_padding_mask = key_padding_mask.to(torch.bool)

        # Fold the heads into the batch dimension: (bsz * num_heads, len, dim).
        q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
        k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
        v = v.contiguous().view(-1, bsz * num_heads, v_head_dim).transpose(0, 1)
        src_len = k.size(1)

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        if add_zero_attn:
            # Append one all-zero key/value position and widen the masks to match.
            src_len += 1
            k = torch.cat(
                [
                    k,
                    torch.zeros(
                        (k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device
                    ),
                ],
                dim=1,
            )
            v = torch.cat(
                [
                    v,
                    torch.zeros(
                        (v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device
                    ),
                ],
                dim=1,
            )
            if attn_mask is not None:
                attn_mask = pad(attn_mask, (0, 1))
            if key_padding_mask is not None:
                key_padding_mask = pad(key_padding_mask, (0, 1))

        attn_output_weights = torch.bmm(q, k.transpose(1, 2))
        assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]

        # Attention weight for the invalid region is -inf
        if attn_mask is not None:
            if attn_mask.dtype == torch.bool:
                attn_output_weights.masked_fill_(attn_mask, float("-inf"))
            else:
                attn_output_weights += attn_mask

        if key_padding_mask is not None:
            attn_output_weights = attn_output_weights.view(
                bsz, num_heads, tgt_len, src_len
            )
            attn_output_weights = attn_output_weights.masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2),
                float("-inf"),
            )
            attn_output_weights = attn_output_weights.view(
                bsz * num_heads, tgt_len, src_len
            )

        if residual_attn is not None:
            attn_output_weights = attn_output_weights.view(
                bsz, num_heads, tgt_len, src_len
            )
            attn_output_weights += residual_attn.unsqueeze(0)
            attn_output_weights = attn_output_weights.view(
                bsz * num_heads, tgt_len, src_len
            )

        # Reweight the attention map before softmax().
        # attn_output_weights: (b*n_head, n, hw)
        attn_output_weights = softmax(attn_output_weights, dim=-1)
        attn_output_weights = dropout(
            attn_output_weights, p=dropout_p, training=training
        )

        attn_output = torch.bmm(attn_output_weights, v)
        assert list(attn_output.size()) == [bsz * num_heads, tgt_len, v_head_dim]
        attn_output = (
            attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, out_dim)
        )
        attn_output = linear(attn_output, out_proj_weight, out_proj_bias)

        if need_weights:
            # average attention weights over heads
            attn_output_weights = attn_output_weights.view(
                bsz, num_heads, tgt_len, src_len
            )
            return attn_output, attn_output_weights.sum(dim=1) / num_heads
        return attn_output