ljsabc committed on
Commit
395d300
1 Parent(s): 77edf8c

Initial commit.

animeinsseg/__init__.py ADDED
@@ -0,0 +1,708 @@
1
+ import mmcv, torch
2
+ from tqdm import tqdm
3
+ from einops import rearrange
4
+ import os
5
+ import os.path as osp
6
+ import cv2
7
+ import gc
8
+ import math
9
+
10
+ from .anime_instances import AnimeInstances
11
+ import numpy as np
12
+ from typing import List, Tuple, Union, Optional, Callable
13
+ from mmengine import Config
14
+ from mmengine.model.utils import revert_sync_batchnorm
15
+ from mmdet.utils import register_all_modules, get_test_pipeline_cfg
16
+ from mmdet.apis import init_detector
17
+ from mmdet.registry import MODELS
18
+ from mmdet.structures import DetDataSample, SampleList
19
+ from mmdet.structures.bbox.transforms import scale_boxes, get_box_wh
20
+ from mmdet.models.dense_heads.rtmdet_ins_head import RTMDetInsHead
21
+ from pycocotools.coco import COCO
22
+ from mmcv.transforms import Compose
23
+ from mmdet.models.detectors.single_stage import SingleStageDetector
24
+
25
+ from utils.logger import LOGGER
26
+ from utils.io_utils import square_pad_resize, find_all_imgs, imglist2grid, mask2rle, dict2json, scaledown_maxsize, resize_pad
27
+ from utils.constants import DEFAULT_DEVICE, CATEGORIES
28
+ from utils.booru_tagger import Tagger
29
+
30
+ from .models.animeseg_refine import AnimeSegmentation, load_refinenet, get_mask
31
+ from .models.rtmdet_inshead_custom import RTMDetInsSepBNHeadCustom
32
+
33
+ from torchvision.ops.boxes import box_iou
34
+ import torch.nn.functional as F
35
+
36
+
37
+ def prepare_refine_batch(segmentations: np.ndarray, img: np.ndarray, max_batch_size: int = 4, device: str = 'cpu', input_size: int = 720):
38
+
39
+ img, (pt, pb, pl, pr) = resize_pad(img, input_size, pad_value=(0, 0, 0))
40
+
41
+ img = img.transpose((2, 0, 1)).astype(np.float32) / 255.
42
+
43
+ batch = []
44
+ num_seg = len(segmentations)
45
+
46
+ for ii, seg in enumerate(segmentations):
47
+ seg, _ = resize_pad(seg, input_size, 0)
48
+ seg = seg[None, ...]
49
+ batch.append(np.concatenate((img, seg)))
50
+
51
+ if ii == num_seg - 1:
52
+ yield torch.from_numpy(np.array(batch)).to(device), (pt, pb, pl, pr)
53
+ elif len(batch) >= max_batch_size:
54
+ yield torch.from_numpy(np.array(batch)).to(device), (pt, pb, pl, pr)
55
+ batch = []
56
+
57
+
58
+ VALID_REFINEMETHODS = {'animeseg', 'refinenet_isnet', 'none'}
59
+
60
+ register_all_modules()
61
+
62
+
63
+ def single_image_preprocess(img: Union[str, np.ndarray], pipeline: Compose):
64
+ if isinstance(img, str):
65
+ img = mmcv.imread(img)
66
+ elif not isinstance(img, np.ndarray):
67
+ raise NotImplementedError
68
+
69
+ # img = square_pad_resize(img, 1024)[0]
70
+
71
+ data_ = dict(img=img, img_id=0)
72
+ data_ = pipeline(data_)
73
+ data_['inputs'] = [data_['inputs']]
74
+ data_['data_samples'] = [data_['data_samples']]
75
+
76
+ return data_, img
77
+
78
+ def animeseg_refine(det_pred: DetDataSample, img: np.ndarray, net: AnimeSegmentation, to_rgb=True, input_size: int = 1024):
79
+
80
+ num_pred = len(det_pred.pred_instances)
81
+ if num_pred < 1:
82
+ return
83
+
84
+ with torch.no_grad():
85
+ if to_rgb:
86
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
87
+ seg_thr = 0.5
88
+ mask = get_mask(net, img, s=input_size)[..., 0]
89
+ mask = (mask > seg_thr)
90
+
91
+ ins_masks = det_pred.pred_instances.masks
92
+
93
+ to_tensor = False
+ if isinstance(ins_masks, torch.Tensor):
94
+ tensor_device = ins_masks.device
95
+ tensor_dtype = ins_masks.dtype
96
+ to_tensor = True
97
+ ins_masks = ins_masks.cpu().numpy()
98
+
99
+ area_original = np.sum(ins_masks, axis=(1, 2))
100
+ masks_refined = np.bitwise_and(ins_masks, mask[None, ...])
101
+ area_refined = np.sum(masks_refined, axis=(1, 2))
102
+
103
+ for ii in range(num_pred):
104
+ if area_refined[ii] / area_original[ii] > 0.3:
105
+ ins_masks[ii] = masks_refined[ii]
106
+ ins_masks = np.ascontiguousarray(ins_masks)
107
+
108
+ # for ii, insm in enumerate(ins_masks):
109
+ # cv2.imwrite(f'{ii}.png', insm.astype(np.uint8) * 255)
110
+
111
+ if to_tensor:
112
+ ins_masks = torch.from_numpy(ins_masks).to(dtype=tensor_dtype).to(device=tensor_device)
113
+
114
+ det_pred.pred_instances.masks = ins_masks
115
+ # rst = np.concatenate((mask * img + 1 - mask, mask * 255), axis=2).astype(np.uint8)
116
+ # cv2.imwrite('rst.png', rst)
117
+
118
+
119
+ # def refinenet_forward(det_pred: DetDataSample, img: np.ndarray, net: AnimeSegmentation, to_rgb=True, input_size: int = 1024):
120
+
121
+ # num_pred = len(det_pred.pred_instances)
122
+ # if num_pred < 1:
123
+ # return
124
+
125
+ # with torch.no_grad():
126
+ # if to_rgb:
127
+ # img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
128
+ # seg_thr = 0.5
129
+
130
+ # h0, w0 = h, w = img.shape[0], img.shape[1]
131
+ # if h > w:
132
+ # h, w = input_size, int(input_size * w / h)
133
+ # else:
134
+ # h, w = int(input_size * h / w), input_size
135
+ # ph, pw = input_size - h, input_size - w
136
+ # tmpImg = np.zeros([s, s, 3], dtype=np.float32)
137
+ # tmpImg[ph // 2:ph // 2 + h, pw // 2:pw // 2 + w] = cv2.resize(input_img, (w, h)) / 255
138
+ # tmpImg = tmpImg.transpose((2, 0, 1))
139
+ # tmpImg = torch.from_numpy(tmpImg).unsqueeze(0).type(torch.FloatTensor).to(model.device)
140
+ # with torch.no_grad():
141
+ # if use_amp:
142
+ # with amp.autocast():
143
+ # pred = model(tmpImg)
144
+ # pred = pred.to(dtype=torch.float32)
145
+ # else:
146
+ # pred = model(tmpImg)
147
+ # pred = pred[0, :, ph // 2:ph // 2 + h, pw // 2:pw // 2 + w]
148
+ # pred = cv2.resize(pred.cpu().numpy().transpose((1, 2, 0)), (w0, h0))[:, :, np.newaxis]
149
+ # return pred
150
+
151
+ # mask = (mask > seg_thr)
152
+
153
+ # ins_masks = det_pred.pred_instances.masks
154
+
155
+ # if isinstance(ins_masks, torch.Tensor):
156
+ # tensor_device = ins_masks.device
157
+ # tensor_dtype = ins_masks.dtype
158
+ # to_tensor = True
159
+ # ins_masks = ins_masks.cpu().numpy()
160
+
161
+ # area_original = np.sum(ins_masks, axis=(1, 2))
162
+ # masks_refined = np.bitwise_and(ins_masks, mask[None, ...])
163
+ # area_refined = np.sum(masks_refined, axis=(1, 2))
164
+
165
+ # for ii in range(num_pred):
166
+ # if area_refined[ii] / area_original[ii] > 0.3:
167
+ # ins_masks[ii] = masks_refined[ii]
168
+ # ins_masks = np.ascontiguousarray(ins_masks)
169
+
170
+ # # for ii, insm in enumerate(ins_masks):
171
+ # # cv2.imwrite(f'{ii}.png', insm.astype(np.uint8) * 255)
172
+
173
+ # if to_tensor:
174
+ # ins_masks = torch.from_numpy(ins_masks).to(dtype=tensor_dtype).to(device=tensor_device)
175
+
176
+ # det_pred.pred_instances.masks = ins_masks
177
+
178
+
179
+ def read_imglst_from_txt(filep) -> List[str]:
180
+ with open(filep, 'r', encoding='utf8') as f:
181
+ lines = f.read().splitlines()
182
+ return lines
183
+
184
+
185
+ class AnimeInsSeg:
186
+
187
+ def __init__(self, ckpt: str, default_det_size: int = 640, device: str = None,
188
+ refine_kwargs: dict = {'refine_method': 'refinenet_isnet'},
189
+ tagger_path: str = 'models/wd-v1-4-swinv2-tagger-v2/model.onnx', mask_thr=0.3) -> None:
190
+ self.ckpt = ckpt
191
+ self.default_det_size = default_det_size
192
+ self.device = DEFAULT_DEVICE if device is None else device
193
+
194
+ # init detector in mmdet's way
195
+
196
+ ckpt = torch.load(ckpt, map_location='cpu')
197
+ cfg = Config.fromstring(ckpt['meta']['cfg'].replace('file_client_args', 'backend_args'), file_format='.py')
198
+ cfg.visualizer = []
199
+ cfg.vis_backends = {}
200
+ cfg.default_hooks.pop('visualization')
201
+
202
+
203
+ # self.model: SingleStageDetector = init_detector(cfg, checkpoint=None, device='cpu')
204
+ model = MODELS.build(cfg.model)
205
+ model = revert_sync_batchnorm(model)
206
+
207
+ self.model = model.to(self.device).eval()
208
+ self.model.load_state_dict(ckpt['state_dict'], strict=False)
209
+ self.model = self.model.to(self.device).eval()
210
+ self.cfg = cfg.copy()
211
+
212
+ test_pipeline = get_test_pipeline_cfg(self.cfg.copy())
213
+ test_pipeline[0].type = 'mmdet.LoadImageFromNDArray'
214
+ test_pipeline = Compose(test_pipeline)
215
+ self.default_data_pipeline = test_pipeline
216
+
217
+ self.refinenet = None
218
+ self.refinenet_animeseg: AnimeSegmentation = None
219
+ self.postprocess_refine: Callable = None
220
+
221
+ if refine_kwargs is not None:
222
+ self.set_refine_method(**refine_kwargs)
223
+
224
+ self.tagger = None
225
+ self.tagger_path = tagger_path
226
+
227
+ self.mask_thr = mask_thr
228
+
229
+ def init_tagger(self, tagger_path: str = None):
230
+ tagger_path = self.tagger_path if tagger_path is None else tagger_path
231
+ self.tagger = Tagger(tagger_path)
232
+
233
+ def infer_tags(self, instances: AnimeInstances, img: np.ndarray, infer_grey: bool = False):
234
+ if self.tagger is None:
235
+ self.init_tagger()
236
+
237
+ if infer_grey:
238
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)[..., None][..., [0, 0, 0]]
239
+
240
+ num_ins = len(instances)
241
+ for ii in range(num_ins):
242
+ bbox = instances.bboxes[ii]
243
+ mask = instances.masks[ii]
244
+ if isinstance(bbox, torch.Tensor):
245
+ bbox = bbox.cpu().numpy()
246
+ mask = mask.cpu().numpy()
247
+ bbox = bbox.astype(np.int32)
248
+
249
+ crop = img[bbox[1]: bbox[3] + bbox[1], bbox[0]: bbox[2] + bbox[0]].copy()
250
+ mask = mask[bbox[1]: bbox[3] + bbox[1], bbox[0]: bbox[2] + bbox[0]]
251
+ crop[mask == 0] = 255
252
+ tags, character_tags = self.tagger.label_cv2_bgr(crop)
253
+ exclude_tags = ['simple_background', 'white_background']
254
+ valid_tags = []
255
+ for tag in tags:
256
+ if tag in exclude_tags:
257
+ continue
258
+ valid_tags.append(tag)
259
+ instances.tags[ii] = ' '.join(valid_tags)
260
+ instances.character_tags[ii] = character_tags
261
+
262
+ @torch.no_grad()
263
+ def infer_embeddings(self, imgs, det_size = None):
264
+
265
+ def hijack_bbox_mask_post_process(
266
+ self,
267
+ results,
268
+ mask_feat,
269
+ cfg,
270
+ rescale: bool = False,
271
+ with_nms: bool = True,
272
+ img_meta: Optional[dict] = None):
273
+
274
+ stride = self.prior_generator.strides[0][0]
275
+ if rescale:
276
+ assert img_meta.get('scale_factor') is not None
277
+ scale_factor = [1 / s for s in img_meta['scale_factor']]
278
+ results.bboxes = scale_boxes(results.bboxes, scale_factor)
279
+
280
+ if hasattr(results, 'score_factors'):
281
+ # TODO: Add sqrt operation in order to be consistent with
282
+ # the paper.
283
+ score_factors = results.pop('score_factors')
284
+ results.scores = results.scores * score_factors
285
+
286
+ # filter small size bboxes
287
+ if cfg.get('min_bbox_size', -1) >= 0:
288
+ w, h = get_box_wh(results.bboxes)
289
+ valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size)
290
+ if not valid_mask.all():
291
+ results = results[valid_mask]
292
+
293
+ # results.mask_feat = mask_feat
294
+ return results, mask_feat
295
+
296
+ def hijack_detector_predict(self: SingleStageDetector,
297
+ batch_inputs: torch.Tensor,
298
+ batch_data_samples: SampleList,
299
+ rescale: bool = True) -> SampleList:
300
+ x = self.extract_feat(batch_inputs)
301
+
302
+ bbox_head: RTMDetInsSepBNHeadCustom = self.bbox_head
303
+ old_postprocess = RTMDetInsSepBNHeadCustom._bbox_mask_post_process
304
+ RTMDetInsSepBNHeadCustom._bbox_mask_post_process = hijack_bbox_mask_post_process
305
+ # results_list = bbox_head.predict(
306
+ # x, batch_data_samples, rescale=rescale)
307
+
308
+ batch_img_metas = [
309
+ data_samples.metainfo for data_samples in batch_data_samples
310
+ ]
311
+
312
+ outs = bbox_head(x)
313
+
314
+ results_list = bbox_head.predict_by_feat(
315
+ *outs, batch_img_metas=batch_img_metas, rescale=rescale)
316
+
317
+ # batch_data_samples = self.add_pred_to_datasample(
318
+ # batch_data_samples, results_list)
319
+
320
+ RTMDetInsSepBNHeadCustom._bbox_mask_post_process = old_postprocess
321
+ return results_list
322
+
323
+ old_predict = SingleStageDetector.predict
324
+ SingleStageDetector.predict = hijack_detector_predict
325
+ test_pipeline, imgs, _ = self.prepare_data_pipeline(imgs, det_size)
326
+
327
+ # only the first image is used here, so the list is not wrapped in tqdm (a tqdm object is not indexable)
+ model = self.model
+ img = imgs[0]
331
+ data_, img = test_pipeline(img)
332
+ data = model.data_preprocessor(data_, False)
333
+ instance_data, mask_feat = model(**data, mode='predict')[0]
334
+ SingleStageDetector.predict = old_predict
335
+
336
+ # print((instance_data.scores > 0.9).sum())
337
+ return img, instance_data, mask_feat
338
+
339
+ def segment_with_bboxes(self, img, bboxes: torch.Tensor, instance_data, mask_feat: torch.Tensor):
340
+ # instance_data.bboxes: x1, y1, x2, y2
341
+ maxidx = torch.argmax(instance_data.scores)
342
+ bbox = instance_data.bboxes[maxidx].cpu().numpy()
343
+ p1, p2 = (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3]))
344
+ tgt_bboxes = instance_data.bboxes
345
+
346
+ im_h, im_w = img.shape[:2]
347
+ long_side = max(im_h, im_w)
348
+ bbox_head: RTMDetInsSepBNHeadCustom = self.model.bbox_head
349
+ priors, kernels = instance_data.priors, instance_data.kernels
350
+ stride = bbox_head.prior_generator.strides[0][0]
351
+
352
+ ins_bboxes, ins_segs, scores = [], [], []
353
+ for bbox in bboxes:
354
+ bbox = torch.from_numpy(np.array([bbox])).to(tgt_bboxes.dtype).to(tgt_bboxes.device)
355
+ ioulst = box_iou(bbox, tgt_bboxes).squeeze()
356
+ matched_idx = torch.argmax(ioulst)
357
+
358
+ mask_logits = bbox_head._mask_predict_by_feat_single(
359
+ mask_feat, kernels[matched_idx][None, ...], priors[matched_idx][None, ...])
360
+
361
+ mask_logits = F.interpolate(
362
+ mask_logits.unsqueeze(0), scale_factor=stride, mode='bilinear')
363
+
364
+ mask_logits = F.interpolate(
365
+ mask_logits,
366
+ size=[long_side, long_side],
367
+ mode='bilinear',
368
+ align_corners=False)[..., :im_h, :im_w]
369
+ mask = mask_logits.sigmoid().squeeze()
370
+ mask = mask > 0.5
371
+ mask = mask.cpu().numpy()
372
+ ins_segs.append(mask)
373
+
374
+ matched_iou_score = ioulst[matched_idx]
375
+ matched_score = instance_data.scores[matched_idx]
376
+ scores.append(matched_score.cpu().item())
377
+ matched_bbox = tgt_bboxes[matched_idx]
378
+
379
+ ins_bboxes.append(matched_bbox.cpu().numpy())
380
+ # p1, p2 = (int(matched_bbox[0]), int(matched_bbox[1])), (int(matched_bbox[2]), int(matched_bbox[3]))
381
+
382
+ if len(ins_bboxes) > 0:
383
+ ins_bboxes = np.array(ins_bboxes).astype(np.int32)
384
+ ins_bboxes[:, 2:] -= ins_bboxes[:, :2]
385
+ ins_segs = np.array(ins_segs)
386
+ instances = AnimeInstances(ins_segs, ins_bboxes, scores)
387
+
388
+ self._postprocess_refine(instances, img)
389
+ drawed = instances.draw_instances(img)
390
+ # cv2.imshow('drawed', drawed)
391
+ # cv2.waitKey(0)
392
+
393
+ return instances
394
+
395
+ def set_detect_size(self, det_size: Union[int, Tuple]):
396
+ if isinstance(det_size, int):
397
+ det_size = (det_size, det_size)
398
+ self.default_data_pipeline.transforms[1].scale = det_size
399
+ self.default_data_pipeline.transforms[2].size = det_size
400
+
401
+ @torch.no_grad()
402
+ def infer(self, imgs: Union[List, str, np.ndarray],
403
+ pred_score_thr: float = 0.3,
404
+ refine_kwargs: dict = None,
405
+ output_type: str="tensor",
406
+ det_size: int = None,
407
+ save_dir: str = '',
408
+ save_visualization: bool = False,
409
+ save_annotation: str = '',
410
+ infer_tags: bool = False,
411
+ obj_id_start: int = -1,
412
+ img_id_start: int = -1,
413
+ verbose: bool = False,
414
+ infer_grey: bool = False,
415
+ save_mask_only: bool = False,
416
+ val_dir=None,
417
+ max_instances: int = 100,
418
+ **kwargs) -> Union[List[AnimeInstances], AnimeInstances, None]:
419
+
420
+ """
421
+ Args:
422
+ imgs (str, ndarray, Sequence[str/ndarray]):
423
+ Either image files or loaded images.
424
+
425
+ Returns:
426
+ :obj:`AnimeInstances` or list[:obj:`AnimeInstances`]:
427
+ If save_annotation or save_visualization is set, returns None.
428
+ """
429
+
430
+ if det_size is not None:
431
+ self.set_detect_size(det_size)
432
+ if refine_kwargs is not None:
433
+ self.set_refine_method(**refine_kwargs)
434
+
435
+ self.set_max_instance(max_instances)
436
+
437
+ if isinstance(imgs, str):
438
+ if imgs.endswith('.txt'):
439
+ imgs = read_imglst_from_txt(imgs)
440
+
441
+ if save_annotation or save_visualization:
442
+ return self._infer_save_annotations(imgs, pred_score_thr, det_size, save_dir, save_visualization, \
443
+ save_annotation, infer_tags, obj_id_start, img_id_start, val_dir=val_dir)
444
+ else:
445
+ return self._infer_simple(imgs, pred_score_thr, det_size, output_type, infer_tags, verbose=verbose, infer_grey=infer_grey)
446
+
447
+ def _det_forward(self, img, test_pipeline, pred_score_thr: float = 0.3) -> Tuple[AnimeInstances, np.ndarray]:
448
+ data_, img = test_pipeline(img)
449
+ with torch.no_grad():
450
+ results: DetDataSample = self.model.test_step(data_)[0]
451
+ pred_instances = results.pred_instances
452
+ pred_instances = pred_instances[pred_instances.scores > pred_score_thr]
453
+ if len(pred_instances) < 1:
454
+ return AnimeInstances(), img
455
+
456
+ del data_
457
+
458
+ bboxes = pred_instances.bboxes.to(torch.int32)
459
+ bboxes[:, 2:] -= bboxes[:, :2]
460
+ masks = pred_instances.masks
461
+ scores = pred_instances.scores
462
+ return AnimeInstances(masks, bboxes, scores), img
463
+
464
+ def _infer_simple(self, imgs: Union[List, str, np.ndarray],
465
+ pred_score_thr: float = 0.3,
466
+ det_size: int = None,
467
+ output_type: str = "tensor",
468
+ infer_tags: bool = False,
469
+ infer_grey: bool = False,
470
+ verbose: bool = False) -> Union[DetDataSample, List[DetDataSample]]:
471
+
472
+ if isinstance(imgs, List):
473
+ return_list = True
474
+ else:
475
+ return_list = False
476
+
477
+ assert output_type in {'tensor', 'numpy'}
478
+
479
+ test_pipeline, imgs, _ = self.prepare_data_pipeline(imgs, det_size)
480
+ predictions = []
481
+
482
+ if len(imgs) > 1:
483
+ imgs = tqdm(imgs)
484
+
485
+ for img in imgs:
486
+ instances, img = self._det_forward(img, test_pipeline, pred_score_thr)
487
+ # drawed = instances.draw_instances(img)
488
+ # cv2.imwrite('drawed.jpg', drawed)
489
+ self.postprocess_results(instances, img)
490
+ # drawed = instances.draw_instances(img)
491
+ # cv2.imwrite('drawed_post.jpg', drawed)
492
+
493
+ if infer_tags:
494
+ self.infer_tags(instances, img, infer_grey)
495
+
496
+ if output_type == 'numpy':
497
+ instances.to_numpy()
498
+
499
+ predictions.append(instances)
500
+
501
+ if return_list:
502
+ return predictions
503
+ else:
504
+ return predictions[0]
505
+
506
+ def _infer_save_annotations(self, imgs: Union[List, str, np.ndarray],
507
+ pred_score_thr: float = 0.3,
508
+ det_size: int = None,
509
+ save_dir: str = '',
510
+ save_visualization: bool = False,
511
+ save_annotation: str = '',
512
+ infer_tags: bool = False,
513
+ obj_id_start: int = 100000000000,
514
+ img_id_start: int = 100000000000,
515
+ save_mask_only: bool = False,
516
+ val_dir = None,
517
+ **kwargs) -> None:
518
+
519
+ coco_api = None
520
+ if isinstance(imgs, str) and imgs.endswith('.json'):
521
+ coco_api = COCO(imgs)
522
+
523
+ if val_dir is None:
524
+ val_dir = osp.join(osp.dirname(osp.dirname(imgs)), 'val')
525
+ imgs = coco_api.getImgIds()
526
+ imgp2ids = {}
527
+ imgps, coco_imgmetas = [], []
528
+ for imgid in imgs:
529
+ imeta = coco_api.loadImgs(imgid)[0]
530
+ imgname = imeta['file_name']
531
+ imgp = osp.join(val_dir, imgname)
532
+ imgp2ids[imgp] = imgid
533
+ imgps.append(imgp)
534
+ coco_imgmetas.append(imeta)
535
+ imgs = imgps
536
+
537
+ test_pipeline, imgs, target_dir = self.prepare_data_pipeline(imgs, det_size)
538
+ if save_dir == '':
539
+ save_dir = osp.join(target_dir, \
540
+ osp.basename(self.ckpt).replace('.ckpt', '').replace('.pth', '').replace('.pt', ''))
541
+
542
+ if not osp.exists(save_dir):
543
+ os.makedirs(save_dir)
544
+
545
+ det_annotations = []
546
+ image_meta = []
547
+ obj_id = obj_id_start + 1
548
+ image_id = img_id_start + 1
549
+
550
+ for ii, img in enumerate(tqdm(imgs)):
551
+ # prepare data
552
+ if isinstance(img, str):
553
+ img_name = osp.basename(img)
554
+ else:
555
+ img_name = f'{ii}'.zfill(12) + '.jpg'
556
+
557
+ if coco_api is not None:
558
+ image_id = imgp2ids[img]
559
+
560
+ try:
561
+ instances, img = self._det_forward(img, test_pipeline, pred_score_thr)
562
+ except Exception as e:
+ if not isinstance(e, torch.cuda.OutOfMemoryError):
+ raise e
+ gc.collect()
566
+ torch.cuda.empty_cache()
567
+ torch.cuda.ipc_collect()
568
+ try:
569
+ instances, img = self._det_forward(img, test_pipeline, pred_score_thr)
570
+ except:
571
+ LOGGER.warning(f'cuda out of memory: {img_name}')
572
+ if isinstance(img, str):
573
+ img = cv2.imread(img)
574
+ instances = None
575
+
576
+ if instances is not None:
577
+ self.postprocess_results(instances, img)
578
+
579
+ if infer_tags:
580
+ self.infer_tags(instances, img)
581
+
582
+ if save_visualization:
583
+ out_file = osp.join(save_dir, img_name)
584
+ self.save_visualization(out_file, img, instances)
585
+
586
+ if save_annotation:
587
+ im_h, im_w = img.shape[:2]
588
+ image_meta.append({
589
+ "id": image_id,"height": im_h,"width": im_w,
590
+ "file_name": img_name, "id": image_id
591
+ })
592
+ if instances is not None:
593
+ for ii in range(len(instances)):
594
+ segmentation = instances.masks[ii].squeeze().cpu().numpy().astype(np.uint8)
595
+ area = segmentation.sum()
596
+ segmentation *= 255
597
+ if save_mask_only:
598
+ cv2.imwrite(osp.join(save_dir, 'mask_' + str(ii).zfill(3) + '_' +img_name+'.png'), segmentation)
599
+ else:
600
+ score = instances.scores[ii]
601
+ if isinstance(score, torch.Tensor):
602
+ score = score.item()
603
+ score = float(score)
604
+ bbox = instances.bboxes[ii].cpu().numpy()
605
+ bbox = bbox.astype(np.float32).tolist()
606
+ segmentation = mask2rle(segmentation)
607
+ tag_string = instances.tags[ii]
608
+ tag_string_character = instances.character_tags[ii]
609
+ det_annotations.append({'id': obj_id, 'category_id': 0, 'iscrowd': 0, 'score': score,
610
+ 'segmentation': segmentation, 'image_id': image_id, 'area': area,
611
+ 'tag_string': tag_string, 'tag_string_character': tag_string_character, 'bbox': bbox
612
+ })
613
+ obj_id += 1
614
+ image_id += 1
615
+
616
+ if save_annotation != '' and not save_mask_only:
617
+ det_meta = {"info": {},"licenses": [], "images": image_meta,
618
+ "annotations": det_annotations, "categories": CATEGORIES}
619
+ detp = save_annotation
620
+ dict2json(det_meta, detp)
621
+ LOGGER.info(f'annotations saved to {detp}')
622
+
623
+ def set_refine_method(self, refine_method: str = 'none', refine_size: int = 720):
624
+ if refine_method == 'none':
625
+ self.postprocess_refine = None
626
+ elif refine_method == 'animeseg':
627
+ if self.refinenet_animeseg is None:
628
+ self.refinenet_animeseg = load_refinenet(refine_method)
629
+ self.postprocess_refine = lambda det_pred, img: \
630
+ animeseg_refine(det_pred, img, self.refinenet_animeseg, True, refine_size)
631
+ elif refine_method == 'refinenet_isnet':
632
+ if self.refinenet is None:
633
+ self.refinenet = load_refinenet(refine_method)
634
+ self.postprocess_refine = self._postprocess_refine
635
+ else:
636
+ raise NotImplementedError(f'Invalid refine method: {refine_method}')
637
+
638
+ def _postprocess_refine(self, instances: AnimeInstances, img: np.ndarray, refine_size: int = 720, max_refine_batch: int = 4, **kwargs):
639
+
640
+ if instances.is_empty:
641
+ return
642
+
643
+ segs = instances.masks
644
+ is_tensor = instances.is_tensor
645
+ if is_tensor:
646
+ segs = segs.cpu().numpy()
647
+ segs = segs.astype(np.float32)
648
+ im_h, im_w = img.shape[:2]
649
+
650
+ masks = []
651
+ with torch.no_grad():
652
+ for batch, (pt, pb, pl, pr) in prepare_refine_batch(segs, img, max_refine_batch, self.device, refine_size):
653
+ preds = self.refinenet(batch)[0][0].sigmoid()
654
+ if pb == 0:
655
+ pb = -im_h
656
+ if pr == 0:
657
+ pr = -im_w
658
+ preds = preds[..., pt: -pb, pl: -pr]
659
+ preds = torch.nn.functional.interpolate(preds, (im_h, im_w), mode='bilinear', align_corners=True)
660
+ masks.append(preds.cpu()[:, 0])
661
+
662
+ masks = (torch.concat(masks, dim=0) > self.mask_thr).to(self.device)
663
+ if not is_tensor:
664
+ masks = masks.cpu().numpy()
665
+ instances.masks = masks
666
+
667
+
668
+ def prepare_data_pipeline(self, imgs: Union[str, np.ndarray, List], det_size: int) -> Tuple[Compose, List, str]:
669
+
670
+ if det_size is None:
671
+ det_size = self.default_det_size
672
+
673
+ target_dir = './workspace/output'
674
+ # cast imgs to a list of np.ndarray or image_file_path if necessary
675
+ if isinstance(imgs, str):
676
+ if osp.isdir(imgs):
677
+ target_dir = imgs
678
+ imgs = find_all_imgs(imgs, abs_path=True)
679
+ elif osp.isfile(imgs):
680
+ target_dir = osp.dirname(imgs)
681
+ imgs = [imgs]
682
+ elif isinstance(imgs, np.ndarray) or isinstance(imgs, str):
683
+ imgs = [imgs]
684
+ elif isinstance(imgs, List):
685
+ if len(imgs) > 0:
686
+ if isinstance(imgs[0], np.ndarray) or isinstance(imgs[0], str):
687
+ pass
688
+ else:
689
+ raise NotImplementedError
690
+ else:
691
+ raise NotImplementedError
692
+
693
+ test_pipeline = lambda img: single_image_preprocess(img, pipeline=self.default_data_pipeline)
694
+ return test_pipeline, imgs, target_dir
695
+
696
+ def save_visualization(self, out_file: str, img: np.ndarray, instances: AnimeInstances):
697
+ drawed = instances.draw_instances(img)
698
+ mmcv.imwrite(drawed, out_file)
699
+
700
+ def postprocess_results(self, results: DetDataSample, img: np.ndarray) -> None:
701
+ if self.postprocess_refine is not None:
702
+ self.postprocess_refine(results, img)
703
+
704
+ def set_mask_threshold(self, mask_thr: float):
705
+ self.model.bbox_head.test_cfg['mask_thr_binary'] = mask_thr
706
+
707
+ def set_max_instance(self, num_ins):
708
+ self.model.bbox_head.test_cfg['max_per_img'] = num_ins
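
Below is a minimal usage sketch for the AnimeInsSeg class added above (illustrative only, not part of this commit): the checkpoint path and input image name are placeholders, and the refine method shown simply mirrors the constructor default.

import cv2
from animeinsseg import AnimeInsSeg

# placeholder checkpoint path; substitute an actual RTMDet-Ins checkpoint trained for this project
net = AnimeInsSeg('models/rtmdet_ins.ckpt', refine_kwargs={'refine_method': 'refinenet_isnet'})
img = cv2.imread('sample.jpg')  # any BGR image loaded with OpenCV
instances = net.infer(img, output_type='numpy', pred_score_thr=0.3)
drawed = instances.draw_instances(img)  # overlay boxes and masks for a quick visual check
cv2.imwrite('sample_seg.jpg', drawed)
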
animeinsseg/anime_instances.py ADDED
@@ -0,0 +1,301 @@
1
+
2
+ import numpy as np
3
+ from typing import List, Union, Tuple
4
+ import torch
5
+ from utils.constants import COLOR_PALETTE
6
+ from utils.constants import get_color
7
+ import cv2
8
+
9
+ def tags2multilines(tags: Union[str, List], lw, tf, max_width):
10
+ if isinstance(tags, str):
11
+ taglist = tags.split(' ')
12
+ else:
13
+ taglist = tags
14
+
15
+ sz = cv2.getTextSize(' ', 0, lw / 3, tf)
16
+ line_height = sz[0][1]
17
+ line_width = 0
18
+ if len(taglist) > 0:
19
+ lines = [taglist[0]]
20
+ if len(taglist) > 1:
21
+ for t in taglist[1:]:
22
+ textl = len(t) * line_height
23
+ if line_width + line_height + textl > max_width:
24
+ lines.append(t)
25
+ line_width = 0
26
+ else:
27
+ line_width = line_width + line_height + textl
28
+ lines[-1] = lines[-1] + ' ' + t
29
+ return lines, line_height
30
+
31
+ class AnimeInstances:
32
+
33
+ def __init__(self,
34
+ masks: Union[np.ndarray, torch.Tensor] = None,
+ bboxes: Union[np.ndarray, torch.Tensor] = None,
+ scores: Union[np.ndarray, torch.Tensor] = None,
37
+ tags: List[str] = None, character_tags: List[str] = None) -> None:
38
+ self.masks = masks
39
+ self.tags = tags
40
+ self.bboxes = bboxes
41
+
42
+
43
+ if scores is None:
44
+ scores = [1.] * len(self)
45
+ if self.is_numpy:
46
+ scores = np.array(scores)
47
+ elif self.is_tensor:
48
+ scores = torch.tensor(scores)
49
+
50
+ self.scores = scores
51
+
52
+ if tags is None:
53
+ self.tags = [''] * len(self)
54
+ self.character_tags = [''] * len(self)
55
+ else:
56
+ self.tags = tags
57
+ self.character_tags = character_tags
58
+
59
+ @property
60
+ def is_cuda(self):
61
+ if isinstance(self.masks, torch.Tensor) and self.masks.is_cuda:
62
+ return True
63
+ else:
64
+ return False
65
+
66
+ @property
67
+ def is_tensor(self):
68
+ if self.is_empty:
69
+ return False
70
+ else:
71
+ return isinstance(self.masks, torch.Tensor)
72
+
73
+ @property
74
+ def is_numpy(self):
75
+ if self.is_empty:
76
+ return True
77
+ else:
78
+ return isinstance(self.masks, np.ndarray)
79
+
80
+ @property
81
+ def is_empty(self):
82
+ return self.masks is None or len(self.masks) == 0
83
+
84
+ def remove_duplicated(self):
85
+
86
+ num_masks = len(self)
87
+ if num_masks < 2:
88
+ return
89
+
90
+ need_cvt = False
91
+ if self.is_numpy:
92
+ need_cvt = True
93
+ self.to_tensor()
94
+
95
+ mask_areas = torch.Tensor([mask.sum() for mask in self.masks])
96
+ sids = torch.argsort(mask_areas, descending=True)
97
+ sids = sids.cpu().numpy().tolist()
98
+ mask_areas = mask_areas[sids]
99
+ masks = self.masks[sids]
100
+ bboxes = self.bboxes[sids]
101
+ tags = [self.tags[sid] for sid in sids]
102
+ scores = self.scores[sids]
103
+
104
+ canvas = masks[0]
105
+
106
+ valid_ids: List = np.arange(num_masks).tolist()
107
+ for ii, mask in enumerate(masks[1:]):
108
+
109
+ mask_id = ii + 1
110
+ canvas_and = torch.bitwise_and(canvas, mask)
111
+
112
+ and_area = canvas_and.sum()
113
+ mask_area = mask_areas[mask_id]
114
+
115
+ if and_area / mask_area > 0.8:
116
+ valid_ids.remove(mask_id)
117
+ elif mask_id != num_masks - 1:
118
+ canvas = torch.bitwise_or(canvas, mask)
119
+
120
+ sids = valid_ids
121
+ self.masks = masks[sids]
122
+ self.bboxes = bboxes[sids]
123
+ self.tags = [tags[sid] for sid in sids]
124
+ self.scores = scores[sids]
125
+
126
+ if need_cvt:
127
+ self.to_numpy()
128
+
129
+ # sids =
130
+
131
+ def draw_instances(self,
132
+ img: np.ndarray,
133
+ draw_bbox: bool = True,
134
+ draw_ins_mask: bool = True,
135
+ draw_ins_contour: bool = True,
136
+ draw_tags: bool = False,
137
+ draw_indices: List = None,
138
+ mask_alpha: float = 0.4):
139
+
140
+ mask_alpha = 0.75
141
+
142
+
143
+ drawed = img.copy()
144
+
145
+ if self.is_empty:
146
+ return drawed
147
+
148
+ im_h, im_w = img.shape[:2]
149
+
150
+ mask_shape = self.masks[0].shape
151
+ if mask_shape[0] != im_h or mask_shape[1] != im_w:
152
+ drawed = cv2.resize(drawed, (mask_shape[1], mask_shape[0]), interpolation=cv2.INTER_AREA)
153
+ im_h, im_w = mask_shape[0], mask_shape[1]
154
+
155
+ if draw_indices is None:
156
+ draw_indices = list(range(len(self)))
157
+ ins_dict = {'mask': [], 'tags': [], 'score': [], 'bbox': [], 'character_tags': []}
158
+ colors = []
159
+ for idx in draw_indices:
160
+ ins = self.get_instance(idx, out_type='numpy')
161
+ for key, data in ins.items():
162
+ ins_dict[key].append(data)
163
+ colors.append(get_color(idx))
164
+
165
+ if draw_bbox:
166
+ lw = max(round(sum(drawed.shape) / 2 * 0.003), 2)
167
+ for color, bbox in zip(colors, ins_dict['bbox']):
168
+ p1, p2 = (int(bbox[0]), int(bbox[1])), (int(bbox[2] + bbox[0]), int(bbox[3] + bbox[1]))
169
+ cv2.rectangle(drawed, p1, p2, color, thickness=lw, lineType=cv2.LINE_AA)
170
+
171
+ if draw_ins_mask:
172
+ drawed = drawed.astype(np.float32)
173
+ for color, mask in zip(colors, ins_dict['mask']):
174
+ p = mask.astype(np.float32)
175
+ blend_mask = np.full((im_h, im_w, 3), color, dtype=np.float32)
176
+ alpha_msk = (mask_alpha * p)[..., None]
177
+ alpha_ori = 1 - alpha_msk
178
+ drawed = drawed * alpha_ori + alpha_msk * blend_mask
179
+ drawed = drawed.astype(np.uint8)
180
+
181
+ if draw_tags:
182
+ lw = max(round(sum(drawed.shape) / 2 * 0.002), 2)
183
+ tf = max(lw - 1, 1)
184
+ for color, tags, bbox in zip(colors, ins_dict['tags'], ins_dict['bbox']):
185
+ if not tags:
186
+ continue
187
+ lines, line_height = tags2multilines(tags, lw, tf, bbox[2])
188
+ for ii, l in enumerate(lines):
189
+ xy = (bbox[0], bbox[1] + line_height + int(line_height * 1.2 * ii))
190
+ cv2.putText(drawed, l, xy, 0, lw / 3, color, thickness=tf, lineType=cv2.LINE_AA)
191
+
192
+ # cv2.imshow('canvas', drawed)
193
+ # cv2.waitKey(0)
194
+ return drawed
195
+
196
+
197
+ def cuda(self):
198
+ if self.is_empty:
199
+ return self
200
+ self.to_tensor(device='cuda')
201
+ return self
202
+
203
+ def cpu(self):
204
+ if not self.is_tensor or not self.is_cuda:
205
+ return self
206
+ self.masks = self.masks.cpu()
207
+ self.scores = self.scores.cpu()
208
+ self.bboxes = self.bboxes.cpu()
209
+ return self
210
+
211
+ def to_tensor(self, device: str = 'cpu'):
212
+ if self.is_empty:
213
+ return self
214
+ elif self.is_tensor and self.masks.device == device:
215
+ return self
216
+ self.masks = torch.from_numpy(self.masks).to(device)
217
+ self.bboxes = torch.from_numpy(self.bboxes).to(device)
218
+ self.scores = torch.from_numpy(self.scores ).to(device)
219
+ return self
220
+
221
+ def to_numpy(self):
222
+ if self.is_numpy:
223
+ return self
224
+ if self.is_cuda:
225
+ self.masks = self.masks.cpu().numpy()
226
+ self.scores = self.scores.cpu().numpy()
227
+ self.bboxes = self.bboxes.cpu().numpy()
228
+ else:
229
+ self.masks = self.masks.numpy()
230
+ self.scores = self.scores.numpy()
231
+ self.bboxes = self.bboxes.numpy()
232
+ return self
233
+
234
+ def get_instance(self, ins_idx: int, out_type: str = None, device: str = None):
235
+ mask = self.masks[ins_idx]
236
+ tags = self.tags[ins_idx]
237
+ character_tags = self.character_tags[ins_idx]
238
+ bbox = self.bboxes[ins_idx]
239
+ score = self.scores[ins_idx]
240
+ if out_type is not None:
241
+ if out_type == 'numpy' and not self.is_numpy:
242
+ mask = mask.cpu().numpy()
243
+ bbox = bbox.cpu().numpy()
244
+ score = score.cpu().numpy()
245
+ if out_type == 'tensor' and not self.is_tensor:
246
+ mask = torch.from_numpy(mask)
247
+ bbox = torch.from_numpy(bbox)
248
+ score = torch.from_numpy(score)
249
+ if isinstance(mask, torch.Tensor) and device is not None and mask.device != device:
250
+ mask = mask.to(device)
251
+ bbox = bbox.to(device)
252
+ score = score.to(device)
253
+
254
+ return {
255
+ 'mask': mask,
256
+ 'tags': tags,
257
+ 'character_tags': character_tags,
258
+ 'bbox': bbox,
259
+ 'score': score
260
+ }
261
+
262
+ def __len__(self):
263
+ if self.is_empty:
264
+ return 0
265
+ else:
266
+ return len(self.masks)
267
+
268
+ def resize(self, h, w, mode = 'area'):
269
+ if self.is_empty:
270
+ return
271
+ if self.is_tensor:
272
+ masks = self.masks.to(torch.float).unsqueeze(1)
273
+ oh, ow = masks.shape[2], masks.shape[3]
274
+ hs, ws = h / oh, w / ow
275
+ bboxes = self.bboxes.float()
276
+ bboxes[:, ::2] *= hs
277
+ bboxes[:, 1::2] *= ws
278
+ self.bboxes = torch.round(bboxes).int()
279
+ masks = torch.nn.functional.interpolate(masks, (h, w), mode=mode)
280
+ self.masks = masks.squeeze(1) > 0.3
281
+
282
+ def compose_masks(self, output_type=None):
283
+ if self.is_empty:
284
+ return None
285
+ else:
286
+ mask = self.masks[0]
287
+ if len(self.masks) > 1:
288
+ for m in self.masks[1:]:
289
+ if self.is_numpy:
290
+ mask = np.logical_or(mask, m)
291
+ else:
292
+ mask = torch.logical_or(mask, m)
293
+ if output_type is not None:
294
+ if output_type == 'numpy' and not self.is_numpy:
295
+ mask = mask.cpu().numpy()
296
+ if output_type == 'tensor' and not self.is_tensor:
297
+ mask = torch.from_numpy(mask)
298
+ return mask
299
+
300
+
301
+
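
A small self-contained sketch of AnimeInstances (illustrative, not part of this commit), showing the numpy path with made-up masks and x/y/w/h boxes:

import numpy as np
from animeinsseg.anime_instances import AnimeInstances

masks = np.zeros((2, 480, 640), dtype=bool)   # two dummy instance masks
masks[0, 100:200, 100:200] = True
masks[1, 250:350, 300:450] = True
bboxes = np.array([[100, 100, 100, 100],
                   [300, 250, 150, 100]], dtype=np.int32)  # x, y, w, h, as consumed by draw_instances
instances = AnimeInstances(masks, bboxes)     # scores default to 1.0, tags to empty strings
print(len(instances), instances.is_numpy)     # -> 2 True
merged = instances.compose_masks(output_type='numpy')  # union of all instance masks
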
animeinsseg/data/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ # from .dataset import *
2
+ # from .syndataset import *
animeinsseg/data/dataset.py ADDED
@@ -0,0 +1,929 @@
1
+ import os.path as osp
2
+ import numpy as np
3
+ from typing import List, Optional, Sequence, Tuple, Union
4
+ import copy
5
+ from time import time
6
+ import mmcv
7
+ from mmcv.transforms import to_tensor
8
+ from mmdet.datasets.transforms import LoadAnnotations, RandomCrop, PackDetInputs, Mosaic, CachedMosaic, CachedMixUp, FilterAnnotations
9
+ from mmdet.structures.mask import BitmapMasks, PolygonMasks
10
+ from mmdet.datasets import CocoDataset
11
+ from mmdet.registry import DATASETS, TRANSFORMS
12
+ from numpy import random
13
+ from mmdet.structures.bbox import autocast_box_type, BaseBoxes
14
+ from mmengine.structures import InstanceData, PixelData
15
+ from mmdet.structures import DetDataSample
16
+ from utils.io_utils import bbox_overlap_xy
17
+ from utils.logger import LOGGER
18
+
19
+ @DATASETS.register_module()
20
+ class AnimeMangaMixedDataset(CocoDataset):
21
+
22
+ def __init__(self, animeins_root: str = None, animeins_annfile: str = None, manga109_annfile: str = None, manga109_root: str = None, *args, **kwargs) -> None:
23
+ self.animeins_annfile = animeins_annfile
24
+ self.animeins_root = animeins_root
25
+ self.manga109_annfile = manga109_annfile
26
+ self.manga109_root = manga109_root
27
+ self.cat_ids = []
28
+ self.cat_img_map = {}
29
+ super().__init__(*args, **kwargs)
30
+ LOGGER.info(f'total num data: {len(self.data_list)}')
31
+
32
+
33
+ def parse_data_info(self, raw_data_info: dict, data_prefix: str) -> Union[dict, List[dict]]:
34
+ """Parse raw annotation to target format.
35
+
36
+ Args:
37
+ raw_data_info (dict): Raw data information load from ``ann_file``
38
+
39
+ Returns:
40
+ Union[dict, List[dict]]: Parsed annotation.
41
+ """
42
+ img_info = raw_data_info['raw_img_info']
43
+ ann_info = raw_data_info['raw_ann_info']
44
+
45
+ data_info = {}
46
+
47
+ # TODO: need to change data_prefix['img'] to data_prefix['img_path']
48
+ img_path = osp.join(data_prefix, img_info['file_name'])
49
+ if self.data_prefix.get('seg', None):
50
+ seg_map_path = osp.join(
51
+ self.data_prefix['seg'],
52
+ img_info['file_name'].rsplit('.', 1)[0] + self.seg_map_suffix)
53
+ else:
54
+ seg_map_path = None
55
+ data_info['img_path'] = img_path
56
+ data_info['img_id'] = img_info['img_id']
57
+ data_info['seg_map_path'] = seg_map_path
58
+ data_info['height'] = img_info['height']
59
+ data_info['width'] = img_info['width']
60
+
61
+ instances = []
62
+ for i, ann in enumerate(ann_info):
63
+ instance = {}
64
+
65
+ if ann.get('ignore', False):
66
+ continue
67
+ x1, y1, w, h = ann['bbox']
68
+ inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
69
+ inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
70
+ if inter_w * inter_h == 0:
71
+ continue
72
+ if ann['area'] <= 0 or w < 1 or h < 1:
73
+ continue
74
+ if ann['category_id'] not in self.cat_ids:
75
+ continue
76
+ bbox = [x1, y1, x1 + w, y1 + h]
77
+
78
+ if ann.get('iscrowd', False):
79
+ instance['ignore_flag'] = 1
80
+ else:
81
+ instance['ignore_flag'] = 0
82
+ instance['bbox'] = bbox
83
+ instance['bbox_label'] = self.cat2label[ann['category_id']]
84
+
85
+ if ann.get('segmentation', None):
86
+ instance['mask'] = ann['segmentation']
87
+
88
+ instances.append(instance)
89
+ data_info['instances'] = instances
90
+ return data_info
91
+
92
+
93
+ def load_data_list(self) -> List[dict]:
94
+ data_lst = []
95
+ if self.manga109_root is not None:
96
+ data_lst += self._data_list(self.manga109_annfile, osp.join(self.manga109_root, 'images'))
97
+ # if len(data_lst) > 8000:
98
+ # data_lst = data_lst[:500]
99
+ LOGGER.info(f'num data from manga109: {len(data_lst)}')
100
+ if self.animeins_root is not None:
101
+ animeins_annfile = osp.join(self.animeins_root, self.animeins_annfile)
102
+ data_prefix = osp.join(self.animeins_root, self.data_prefix['img'])
103
+ anime_lst = self._data_list(animeins_annfile, data_prefix)
104
+ # if len(anime_lst) > 8000:
105
+ # anime_lst = anime_lst[:500]
106
+ data_lst += anime_lst
107
+ LOGGER.info(f'num data from animeins: {len(data_lst)}')
108
+ return data_lst
109
+
110
+ def _data_list(self, annfile: str, data_prefix: str) -> List[dict]:
111
+ """Load annotations from an annotation file named as ``ann_file``
112
+
113
+ Returns:
114
+ List[dict]: A list of annotation.
115
+ """ # noqa: E501
116
+ with self.file_client.get_local_path(annfile) as local_path:
117
+ self.coco = self.COCOAPI(local_path)
118
+ # The order of returned `cat_ids` will not
119
+ # change with the order of the `classes`
120
+ self.cat_ids = self.coco.get_cat_ids(
121
+ cat_names=self.metainfo['classes'])
122
+ self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
123
+ cat_img_map = copy.deepcopy(self.coco.cat_img_map)
124
+ for key, val in cat_img_map.items():
125
+ if key in self.cat_img_map:
126
+ self.cat_img_map[key] += val
127
+ else:
128
+ self.cat_img_map[key] = val
129
+
130
+ img_ids = self.coco.get_img_ids()
131
+ data_list = []
132
+ total_ann_ids = []
133
+ for img_id in img_ids:
134
+ raw_img_info = self.coco.load_imgs([img_id])[0]
135
+ raw_img_info['img_id'] = img_id
136
+
137
+ ann_ids = self.coco.get_ann_ids(img_ids=[img_id])
138
+ raw_ann_info = self.coco.load_anns(ann_ids)
139
+ total_ann_ids.extend(ann_ids)
140
+
141
+ parsed_data_info = self.parse_data_info({
142
+ 'raw_ann_info':
143
+ raw_ann_info,
144
+ 'raw_img_info':
145
+ raw_img_info
146
+ }, data_prefix)
147
+ data_list.append(parsed_data_info)
148
+ if self.ANN_ID_UNIQUE:
149
+ assert len(set(total_ann_ids)) == len(
150
+ total_ann_ids
151
+ ), f"Annotation ids in '{annfile}' are not unique!"
152
+
153
+ del self.coco
154
+
155
+ return data_list
156
+
157
+
158
+
159
+ @TRANSFORMS.register_module()
160
+ class LoadAnnotationsNoSegs(LoadAnnotations):
161
+
162
+ def _process_masks(self, results: dict) -> list:
163
+ """Process gt_masks and filter invalid polygons.
164
+
165
+ Args:
166
+ results (dict): Result dict from :obj:``mmengine.BaseDataset``.
167
+
168
+ Returns:
169
+ list: Processed gt_masks.
170
+ """
171
+ gt_masks = []
172
+ gt_ignore_flags = []
173
+ gt_ignore_mask_flags = []
174
+ for instance in results.get('instances', []):
175
+ gt_mask = instance['mask']
176
+ ignore_mask = False
177
+ # If the annotation of segmentation mask is invalid,
178
+ # ignore the whole instance.
179
+ if isinstance(gt_mask, list):
180
+ gt_mask = [
181
+ np.array(polygon) for polygon in gt_mask
182
+ if len(polygon) % 2 == 0 and len(polygon) >= 6
183
+ ]
184
+ if len(gt_mask) == 0:
185
+ # ignore this instance and set gt_mask to a fake mask
186
+ instance['ignore_flag'] = 1
187
+ gt_mask = [np.zeros(6)]
188
+ elif not self.poly2mask:
189
+ # `PolygonMasks` requires a ploygon of format List[np.array],
190
+ # other formats are invalid.
191
+ instance['ignore_flag'] = 1
192
+ gt_mask = [np.zeros(6)]
193
+ elif isinstance(gt_mask, dict) and \
194
+ not (gt_mask.get('counts') is not None and
195
+ gt_mask.get('size') is not None and
196
+ isinstance(gt_mask['counts'], (list, str))):
197
+ # if gt_mask is a dict, it should include `counts` and `size`,
198
+ # so that `BitmapMasks` can uncompressed RLE
199
+ # instance['ignore_flag'] = 1
200
+ ignore_mask = True
201
+ gt_mask = [np.zeros(6)]
202
+ gt_masks.append(gt_mask)
203
+ # re-process gt_ignore_flags
204
+ gt_ignore_flags.append(instance['ignore_flag'])
205
+ gt_ignore_mask_flags.append(ignore_mask)
206
+ results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool)
207
+ results['gt_ignore_mask_flags'] = np.array(gt_ignore_mask_flags, dtype=bool)
208
+ return gt_masks
209
+
210
+ def _load_masks(self, results: dict) -> None:
211
+ """Private function to load mask annotations.
212
+
213
+ Args:
214
+ results (dict): Result dict from :obj:``mmengine.BaseDataset``.
215
+ """
216
+ h, w = results['ori_shape']
217
+ gt_masks = self._process_masks(results)
218
+ if self.poly2mask:
219
+ p2masks = []
220
+ if len(gt_masks) > 0:
221
+ for ins, mask, ignore_mask in zip(results['instances'], gt_masks, results['gt_ignore_mask_flags']):
222
+ bbox = [int(c) for c in ins['bbox']]
223
+ if ignore_mask:
224
+ m = np.zeros((h, w), dtype=np.uint8)
225
+ m[bbox[1]:bbox[3], bbox[0]: bbox[2]] = 255
226
+ # m[bbox[1]:bbox[3], bbox[0]: bbox[2]]
227
+ p2masks.append(m)
228
+ else:
229
+ p2masks.append(self._poly2mask(mask, h, w))
230
+ # import cv2
231
+ # # cv2.imwrite('tmp_mask.png', p2masks[-1] * 255)
232
+ # cv2.imwrite('tmp_img.png', results['img'])
233
+ # cv2.imwrite('tmp_bbox.png', m * 225)
234
+ # print(p2masks[-1].shape, p2masks[-1].dtype)
235
+ gt_masks = BitmapMasks(p2masks, h, w)
236
+ else:
237
+ # fake polygon masks will be ignored in `PackDetInputs`
238
+ gt_masks = PolygonMasks([mask for mask in gt_masks], h, w)
239
+ results['gt_masks'] = gt_masks
240
+
241
+ def transform(self, results: dict) -> dict:
242
+ """Function to load multiple types annotations.
243
+
244
+ Args:
245
+ results (dict): Result dict from :obj:``mmengine.BaseDataset``.
246
+
247
+ Returns:
248
+ dict: The dict contains loaded bounding box, label and
249
+ semantic segmentation.
250
+ """
251
+
252
+ if self.with_bbox:
253
+ self._load_bboxes(results)
254
+ if self.with_label:
255
+ self._load_labels(results)
256
+ if self.with_mask:
257
+ self._load_masks(results)
258
+ if self.with_seg:
259
+ self._load_seg_map(results)
260
+
261
+ return results
262
+
263
+
264
+
265
+ @TRANSFORMS.register_module()
266
+ class PackDetIputsNoSeg(PackDetInputs):
267
+
268
+ mapping_table = {
269
+ 'gt_bboxes': 'bboxes',
270
+ 'gt_bboxes_labels': 'labels',
271
+ 'gt_ignore_mask_flags': 'ignore_mask',
272
+ 'gt_masks': 'masks'
273
+ }
274
+
275
+ def transform(self, results: dict) -> dict:
276
+ """Method to pack the input data.
277
+
278
+ Args:
279
+ results (dict): Result dict from the data pipeline.
280
+
281
+ Returns:
282
+ dict:
283
+
284
+ - 'inputs' (obj:`torch.Tensor`): The forward data of models.
285
+ - 'data_sample' (obj:`DetDataSample`): The annotation info of the
286
+ sample.
287
+ """
288
+ packed_results = dict()
289
+ if 'img' in results:
290
+ img = results['img']
291
+ if len(img.shape) < 3:
292
+ img = np.expand_dims(img, -1)
293
+ img = np.ascontiguousarray(img.transpose(2, 0, 1))
294
+ packed_results['inputs'] = to_tensor(img)
295
+
296
+ if 'gt_ignore_flags' in results:
297
+ valid_idx = np.where(results['gt_ignore_flags'] == 0)[0]
298
+ ignore_idx = np.where(results['gt_ignore_flags'] == 1)[0]
299
+
300
+ data_sample = DetDataSample()
301
+ instance_data = InstanceData()
302
+ ignore_instance_data = InstanceData()
303
+
304
+ for key in self.mapping_table.keys():
305
+ if key not in results:
306
+ continue
307
+ if key == 'gt_masks' or isinstance(results[key], BaseBoxes):
308
+ if 'gt_ignore_flags' in results:
309
+ instance_data[
310
+ self.mapping_table[key]] = results[key][valid_idx]
311
+ ignore_instance_data[
312
+ self.mapping_table[key]] = results[key][ignore_idx]
313
+ else:
314
+ instance_data[self.mapping_table[key]] = results[key]
315
+ else:
316
+ if 'gt_ignore_flags' in results:
317
+ instance_data[self.mapping_table[key]] = to_tensor(
318
+ results[key][valid_idx])
319
+ ignore_instance_data[self.mapping_table[key]] = to_tensor(
320
+ results[key][ignore_idx])
321
+ else:
322
+ instance_data[self.mapping_table[key]] = to_tensor(
323
+ results[key])
324
+ data_sample.gt_instances = instance_data
325
+ data_sample.ignored_instances = ignore_instance_data
326
+
327
+ if 'proposals' in results:
328
+ proposals = InstanceData(
329
+ bboxes=to_tensor(results['proposals']),
330
+ scores=to_tensor(results['proposals_scores']))
331
+ data_sample.proposals = proposals
332
+
333
+ if 'gt_seg_map' in results:
334
+ gt_sem_seg_data = dict(
335
+ sem_seg=to_tensor(results['gt_seg_map'][None, ...].copy()))
336
+ data_sample.gt_sem_seg = PixelData(**gt_sem_seg_data)
337
+
338
+ img_meta = {}
339
+ for key in self.meta_keys:
340
+ assert key in results, f'`{key}` is not found in `results`, ' \
341
+ f'the valid keys are {list(results)}.'
342
+ img_meta[key] = results[key]
343
+
344
+ data_sample.set_metainfo(img_meta)
345
+ packed_results['data_samples'] = data_sample
346
+
347
+ return packed_results
348
+
349
+
350
+
351
+ def translate_bitmapmask(bitmap_masks: BitmapMasks,
352
+ out_shape,
353
+ offset_x,
354
+ offset_y,):
355
+
356
+ if len(bitmap_masks.masks) == 0:
357
+ translated_masks = np.empty((0, *out_shape), dtype=np.uint8)
358
+ else:
359
+ masks = bitmap_masks.masks
360
+ out_h, out_w = out_shape
361
+ mask_h, mask_w = masks.shape[1:]
362
+
363
+ translated_masks = np.zeros((masks.shape[0], *out_shape),
364
+ dtype=masks.dtype)
365
+
366
+ ix, iy = bbox_overlap_xy([0, 0, out_w, out_h], [offset_x, offset_y, mask_w, mask_h])
367
+ if ix > 2 and iy > 2:
368
+ if offset_x > 0:
369
+ mx1 = 0
370
+ tx1 = offset_x
371
+ else:
372
+ mx1 = -offset_x
373
+ tx1 = 0
374
+ mx2 = min(out_w - offset_x, mask_w)
375
+ tx2 = tx1 + mx2 - mx1
376
+
377
+ if offset_y > 0:
378
+ my1 = 0
379
+ ty1 = offset_y
380
+ else:
381
+ my1 = -offset_y
382
+ ty1 = 0
383
+ my2 = min(out_h - offset_y, mask_h)
384
+ ty2 = ty1 + my2 - my1
385
+
386
+ translated_masks[:, ty1: ty2, tx1: tx2] = \
387
+ masks[:, my1: my2, mx1: mx2]
388
+
389
+ return BitmapMasks(translated_masks, *out_shape)
390
+
391
+
392
+ @TRANSFORMS.register_module()
393
+ class CachedMosaicNoSeg(CachedMosaic):
394
+
395
+ @autocast_box_type()
396
+ def transform(self, results: dict) -> dict:
397
+
398
+ """Mosaic transform function.
399
+
400
+ Args:
401
+ results (dict): Result dict.
402
+
403
+ Returns:
404
+ dict: Updated result dict.
405
+ """
406
+ # cache and pop images
407
+ self.results_cache.append(copy.deepcopy(results))
408
+ if len(self.results_cache) > self.max_cached_images:
409
+ if self.random_pop:
410
+ index = random.randint(0, len(self.results_cache) - 1)
411
+ else:
412
+ index = 0
413
+ self.results_cache.pop(index)
414
+
415
+ if len(self.results_cache) <= 4:
416
+ return results
417
+
418
+ if random.uniform(0, 1) > self.prob:
419
+ return results
420
+ indices = self.get_indexes(self.results_cache)
421
+ mix_results = [copy.deepcopy(self.results_cache[i]) for i in indices]
422
+
423
+ # TODO: refactor mosaic to reuse these code.
424
+ mosaic_bboxes = []
425
+ mosaic_bboxes_labels = []
426
+ mosaic_ignore_flags = []
427
+ mosaic_masks = []
428
+ mosaic_ignore_mask_flags = []
429
+ with_mask = True if 'gt_masks' in results else False
430
+
431
+ if len(results['img'].shape) == 3:
432
+ mosaic_img = np.full(
433
+ (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2), 3),
434
+ self.pad_val,
435
+ dtype=results['img'].dtype)
436
+ else:
437
+ mosaic_img = np.full(
438
+ (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2)),
439
+ self.pad_val,
440
+ dtype=results['img'].dtype)
441
+
442
+ # mosaic center x, y
443
+ center_x = int(
444
+ random.uniform(*self.center_ratio_range) * self.img_scale[0])
445
+ center_y = int(
446
+ random.uniform(*self.center_ratio_range) * self.img_scale[1])
447
+ center_position = (center_x, center_y)
448
+
449
+ loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right')
450
+
451
+ n_manga = 0
452
+ for i, loc in enumerate(loc_strs):
453
+ if loc == 'top_left':
454
+ results_patch = copy.deepcopy(results)
455
+ else:
456
+ results_patch = copy.deepcopy(mix_results[i - 1])
457
+
458
+ is_manga = results_patch['img_id'] > 900000000
459
+ if is_manga:
460
+ n_manga += 1
461
+ if n_manga > 3:
462
+ continue
463
+ im_h, im_w = results_patch['img'].shape[:2]
464
+ if im_w > im_h and random.random() < 0.75:
465
+ results_patch = hcrop(results_patch, (im_h, im_w // 2), True)
466
+
467
+ img_i = results_patch['img']
468
+ h_i, w_i = img_i.shape[:2]
469
+ # keep_ratio resize
470
+ scale_ratio_i = min(self.img_scale[1] / h_i,
471
+ self.img_scale[0] / w_i)
472
+ img_i = mmcv.imresize(
473
+ img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i)))
474
+
475
+ # compute the combine parameters
476
+ paste_coord, crop_coord = self._mosaic_combine(
477
+ loc, center_position, img_i.shape[:2][::-1])
478
+ x1_p, y1_p, x2_p, y2_p = paste_coord
479
+ x1_c, y1_c, x2_c, y2_c = crop_coord
480
+
481
+ # crop and paste image
482
+ mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c]
483
+
484
+ # adjust coordinate
485
+ gt_bboxes_i = results_patch['gt_bboxes']
486
+ gt_bboxes_labels_i = results_patch['gt_bboxes_labels']
487
+ gt_ignore_flags_i = results_patch['gt_ignore_flags']
488
+ gt_ignore_mask_i = results_patch['gt_ignore_mask_flags']
489
+
490
+ padw = x1_p - x1_c
491
+ padh = y1_p - y1_c
492
+ gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i])
493
+ gt_bboxes_i.translate_([padw, padh])
494
+ mosaic_bboxes.append(gt_bboxes_i)
495
+ mosaic_bboxes_labels.append(gt_bboxes_labels_i)
496
+ mosaic_ignore_flags.append(gt_ignore_flags_i)
497
+ mosaic_ignore_mask_flags.append(gt_ignore_mask_i)
498
+ if with_mask and results_patch.get('gt_masks', None) is not None:
499
+
500
+ gt_masks_i = results_patch['gt_masks']
501
+ gt_masks_i = gt_masks_i.rescale(float(scale_ratio_i))
502
+
503
+ gt_masks_i = translate_bitmapmask(gt_masks_i,
504
+ out_shape=(int(self.img_scale[0] * 2),
505
+ int(self.img_scale[1] * 2)),
506
+ offset_x=padw, offset_y=padh)
507
+
508
+ # gt_masks_i = gt_masks_i.translate(
509
+ # out_shape=(int(self.img_scale[0] * 2),
510
+ # int(self.img_scale[1] * 2)),
511
+ # offset=padw,
512
+ # direction='horizontal')
513
+ # gt_masks_i = gt_masks_i.translate(
514
+ # out_shape=(int(self.img_scale[0] * 2),
515
+ # int(self.img_scale[1] * 2)),
516
+ # offset=padh,
517
+ # direction='vertical')
518
+ mosaic_masks.append(gt_masks_i)
519
+
520
+ mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0)
521
+ mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0)
522
+ mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0)
523
+ mosaic_ignore_mask_flags = np.concatenate(mosaic_ignore_mask_flags, 0)
524
+
525
+ if self.bbox_clip_border:
526
+ mosaic_bboxes.clip_([2 * self.img_scale[1], 2 * self.img_scale[0]])
527
+ # remove outside bboxes
528
+ inside_inds = mosaic_bboxes.is_inside(
529
+ [2 * self.img_scale[1], 2 * self.img_scale[0]]).numpy()
530
+
531
+ mosaic_bboxes = mosaic_bboxes[inside_inds]
532
+ mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds]
533
+ mosaic_ignore_flags = mosaic_ignore_flags[inside_inds]
534
+ mosaic_ignore_mask_flags = mosaic_ignore_mask_flags[inside_inds]
535
+
536
+ results['img'] = mosaic_img
537
+ results['img_shape'] = mosaic_img.shape
538
+ results['gt_bboxes'] = mosaic_bboxes
539
+ results['gt_bboxes_labels'] = mosaic_bboxes_labels
540
+ results['gt_ignore_flags'] = mosaic_ignore_flags
541
+ results['gt_ignore_mask_flags'] = mosaic_ignore_mask_flags
542
+
543
+
544
+ if with_mask:
545
+ total_instances = len(inside_inds)
546
+ assert total_instances == np.array([m.masks.shape[0] for m in mosaic_masks]).sum()
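+ # With many instances, avoid BitmapMasks.cat plus boolean indexing: preallocate a uint8 array and copy only the masks kept by inside_inds.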
547
+ if total_instances > 10:
548
+ masks = np.empty((inside_inds.sum(), mosaic_masks[0].height, mosaic_masks[0].width), dtype=np.uint8)
549
+ msk_idx = 0
550
+ mmsk_idx = 0
551
+ for m in mosaic_masks:
552
+ for ii in range(m.masks.shape[0]):
553
+ if inside_inds[msk_idx]:
554
+ masks[mmsk_idx] = m.masks[ii]
555
+ mmsk_idx += 1
556
+ msk_idx += 1
557
+ results['gt_masks'] = BitmapMasks(masks, mosaic_masks[0].height, mosaic_masks[0].width)
558
+ else:
559
+ mosaic_masks = mosaic_masks[0].cat(mosaic_masks)
560
+ results['gt_masks'] = mosaic_masks[inside_inds]
561
+ # assert np.all(results['gt_masks'].masks == masks) and results['gt_masks'].masks.shape == masks.shape
562
+
563
+ # assert inside_inds.sum() == results['gt_masks'].masks.shape[0]
564
+ return results
565
+
566
+ @TRANSFORMS.register_module()
567
+ class FilterAnnotationsNoSeg(FilterAnnotations):
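+ # FilterAnnotations variant that also filters gt_ignore_mask_flags and never drops the whole sample, even when no annotation passes the filters.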
568
+
569
+ def __init__(self,
570
+ min_gt_bbox_wh: Tuple[int, int] = (1, 1),
571
+ min_gt_mask_area: int = 1,
572
+ by_box: bool = True,
573
+ by_mask: bool = False,
574
+ keep_empty: bool = True) -> None:
575
+ # TODO: add more filter options
576
+ assert by_box or by_mask
577
+ self.min_gt_bbox_wh = min_gt_bbox_wh
578
+ self.min_gt_mask_area = min_gt_mask_area
579
+ self.by_box = by_box
580
+ self.by_mask = by_mask
581
+ self.keep_empty = keep_empty
582
+
583
+ @autocast_box_type()
584
+ def transform(self, results: dict) -> Union[dict, None]:
585
+ """Transform function to filter annotations.
586
+
587
+ Args:
588
+ results (dict): Result dict.
589
+
590
+ Returns:
591
+ dict: Updated result dict.
592
+ """
593
+ assert 'gt_bboxes' in results
594
+ gt_bboxes = results['gt_bboxes']
595
+ if gt_bboxes.shape[0] == 0:
596
+ return results
597
+
598
+ tests = []
599
+ if self.by_box:
600
+ tests.append(
601
+ ((gt_bboxes.widths > self.min_gt_bbox_wh[0]) &
602
+ (gt_bboxes.heights > self.min_gt_bbox_wh[1])).numpy())
603
+
604
+ if self.by_mask:
605
+ assert 'gt_masks' in results
606
+ gt_masks = results['gt_masks']
607
+ tests.append(gt_masks.areas >= self.min_gt_mask_area)
608
+
609
+ keep = tests[0]
610
+ for t in tests[1:]:
611
+ keep = keep & t
612
+
613
+ # if not keep.any():
614
+ # if self.keep_empty:
615
+ # return None
616
+
617
+ assert len(results['gt_ignore_flags']) == len(results['gt_ignore_mask_flags'])
618
+ keys = ('gt_bboxes', 'gt_bboxes_labels', 'gt_masks', 'gt_ignore_flags', 'gt_ignore_mask_flags')
619
+ for key in keys:
620
+ if key in results:
621
+ try:
622
+ results[key] = results[key][keep]
623
+ except Exception as e:
624
+ raise e
625
+
626
+ return results
627
+
628
+
629
+ def hcrop(results: dict, crop_size: Tuple[int, int],
630
+ allow_negative_crop: bool) -> Union[dict, None]:
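+ # Crop the left or right half of a wide image (used above for landscape manga pages) and update bboxes, masks, labels and ignore flags accordingly.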
631
+
632
+ assert crop_size[0] > 0 and crop_size[1] > 0
633
+ img = results['img']
634
+ offset_h, offset_w = 0, random.choice([0, crop_size[1]])
635
+ crop_y1, crop_y2 = offset_h, offset_h + crop_size[0]
636
+ crop_x1, crop_x2 = offset_w, offset_w + crop_size[1]
637
+
638
+ # Record the homography matrix for the RandomCrop
639
+ homography_matrix = np.array(
640
+ [[1, 0, -offset_w], [0, 1, -offset_h], [0, 0, 1]],
641
+ dtype=np.float32)
642
+ if results.get('homography_matrix', None) is None:
643
+ results['homography_matrix'] = homography_matrix
644
+ else:
645
+ results['homography_matrix'] = homography_matrix @ results[
646
+ 'homography_matrix']
647
+
648
+ # crop the image
649
+ img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...]
650
+ img_shape = img.shape
651
+ results['img'] = img
652
+ results['img_shape'] = img_shape
653
+
654
+ # crop bboxes accordingly and clip to the image boundary
655
+ if results.get('gt_bboxes', None) is not None:
656
+ bboxes = results['gt_bboxes']
657
+ bboxes.translate_([-offset_w, -offset_h])
658
+ bboxes.clip_(img_shape[:2])
659
+ valid_inds = bboxes.is_inside(img_shape[:2]).numpy()
660
+ # If the crop does not contain any gt-bbox area and
661
+ # allow_negative_crop is False, skip this image.
662
+ if (not valid_inds.any() and not allow_negative_crop):
663
+ return None
664
+
665
+ results['gt_bboxes'] = bboxes[valid_inds]
666
+
667
+ if results.get('gt_ignore_flags', None) is not None:
668
+ results['gt_ignore_flags'] = \
669
+ results['gt_ignore_flags'][valid_inds]
670
+
671
+ if results.get('gt_ignore_mask_flags', None) is not None:
672
+ results['gt_ignore_mask_flags'] = \
673
+ results['gt_ignore_mask_flags'][valid_inds]
674
+
675
+ if results.get('gt_bboxes_labels', None) is not None:
676
+ results['gt_bboxes_labels'] = \
677
+ results['gt_bboxes_labels'][valid_inds]
678
+
679
+ if results.get('gt_masks', None) is not None:
680
+ results['gt_masks'] = results['gt_masks'][
681
+ valid_inds.nonzero()[0]].crop(
682
+ np.asarray([crop_x1, crop_y1, crop_x2, crop_y2]))
683
+ results['gt_bboxes'] = results['gt_masks'].get_bboxes(
684
+ type(results['gt_bboxes']))
685
+
686
+ # crop semantic seg
687
+ if results.get('gt_seg_map', None) is not None:
688
+ results['gt_seg_map'] = results['gt_seg_map'][crop_y1:crop_y2,
689
+ crop_x1:crop_x2]
690
+
691
+ return results
692
+
693
+
694
+ @TRANSFORMS.register_module()
695
+ class RandomCropNoSeg(RandomCrop):
696
+
697
+ def _crop_data(self, results: dict, crop_size: Tuple[int, int],
698
+ allow_negative_crop: bool) -> Union[dict, None]:
699
+
700
+ assert crop_size[0] > 0 and crop_size[1] > 0
701
+ img = results['img']
702
+ margin_h = max(img.shape[0] - crop_size[0], 0)
703
+ margin_w = max(img.shape[1] - crop_size[1], 0)
704
+ offset_h, offset_w = self._rand_offset((margin_h, margin_w))
705
+ crop_y1, crop_y2 = offset_h, offset_h + crop_size[0]
706
+ crop_x1, crop_x2 = offset_w, offset_w + crop_size[1]
707
+
708
+ # Record the homography matrix for the RandomCrop
709
+ homography_matrix = np.array(
710
+ [[1, 0, -offset_w], [0, 1, -offset_h], [0, 0, 1]],
711
+ dtype=np.float32)
712
+ if results.get('homography_matrix', None) is None:
713
+ results['homography_matrix'] = homography_matrix
714
+ else:
715
+ results['homography_matrix'] = homography_matrix @ results[
716
+ 'homography_matrix']
717
+
718
+ # crop the image
719
+ img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...]
720
+ img_shape = img.shape
721
+ results['img'] = img
722
+ results['img_shape'] = img_shape
723
+
724
+ # crop bboxes accordingly and clip to the image boundary
725
+ if results.get('gt_bboxes', None) is not None:
726
+ bboxes = results['gt_bboxes']
727
+ bboxes.translate_([-offset_w, -offset_h])
728
+ if self.bbox_clip_border:
729
+ bboxes.clip_(img_shape[:2])
730
+ valid_inds = bboxes.is_inside(img_shape[:2]).numpy()
731
+ # If the crop does not contain any gt-bbox area and
732
+ # allow_negative_crop is False, skip this image.
733
+ if (not valid_inds.any() and not allow_negative_crop):
734
+ return None
735
+
736
+ results['gt_bboxes'] = bboxes[valid_inds]
737
+
738
+ if results.get('gt_ignore_flags', None) is not None:
739
+ results['gt_ignore_flags'] = \
740
+ results['gt_ignore_flags'][valid_inds]
741
+
742
+ if results.get('gt_ignore_mask_flags', None) is not None:
743
+ results['gt_ignore_mask_flags'] = \
744
+ results['gt_ignore_mask_flags'][valid_inds]
745
+
746
+ if results.get('gt_bboxes_labels', None) is not None:
747
+ results['gt_bboxes_labels'] = \
748
+ results['gt_bboxes_labels'][valid_inds]
749
+
750
+ if results.get('gt_masks', None) is not None:
751
+ results['gt_masks'] = results['gt_masks'][
752
+ valid_inds.nonzero()[0]].crop(
753
+ np.asarray([crop_x1, crop_y1, crop_x2, crop_y2]))
754
+ if self.recompute_bbox:
755
+ results['gt_bboxes'] = results['gt_masks'].get_bboxes(
756
+ type(results['gt_bboxes']))
757
+
758
+ # crop semantic seg
759
+ if results.get('gt_seg_map', None) is not None:
760
+ results['gt_seg_map'] = results['gt_seg_map'][crop_y1:crop_y2,
761
+ crop_x1:crop_x2]
762
+
763
+ return results
764
+
765
+
766
+
767
+ @TRANSFORMS.register_module()
768
+ class CachedMixUpNoSeg(CachedMixUp):
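+ # CachedMixUp variant that also carries gt_ignore_mask_flags and translates bitmap masks with a single x/y offset via translate_bitmapmask.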
769
+
770
+ @autocast_box_type()
771
+ def transform(self, results: dict) -> dict:
772
+ """MixUp transform function.
773
+
774
+ Args:
775
+ results (dict): Result dict.
776
+
777
+ Returns:
778
+ dict: Updated result dict.
779
+ """
780
+ # cache and pop images
781
+ self.results_cache.append(copy.deepcopy(results))
782
+ if len(self.results_cache) > self.max_cached_images:
783
+ if self.random_pop:
784
+ index = random.randint(0, len(self.results_cache) - 1)
785
+ else:
786
+ index = 0
787
+ self.results_cache.pop(index)
788
+
789
+ if len(self.results_cache) <= 1:
790
+ return results
791
+
792
+ if random.uniform(0, 1) > self.prob:
793
+ return results
794
+
795
+ index = self.get_indexes(self.results_cache)
796
+ retrieve_results = copy.deepcopy(self.results_cache[index])
797
+
798
+ # TODO: refactor mixup to reuse these code.
799
+ if retrieve_results['gt_bboxes'].shape[0] == 0:
800
+ # empty bbox
801
+ return results
802
+
803
+ retrieve_img = retrieve_results['img']
804
+ with_mask = True if 'gt_masks' in results else False
805
+
806
+ jit_factor = random.uniform(*self.ratio_range)
807
+ is_flip = random.uniform(0, 1) > self.flip_ratio
808
+
809
+ if len(retrieve_img.shape) == 3:
810
+ out_img = np.ones(
811
+ (self.dynamic_scale[1], self.dynamic_scale[0], 3),
812
+ dtype=retrieve_img.dtype) * self.pad_val
813
+ else:
814
+ out_img = np.ones(
815
+ self.dynamic_scale[::-1],
816
+ dtype=retrieve_img.dtype) * self.pad_val
817
+
818
+ # 1. keep_ratio resize
819
+ scale_ratio = min(self.dynamic_scale[1] / retrieve_img.shape[0],
820
+ self.dynamic_scale[0] / retrieve_img.shape[1])
821
+ retrieve_img = mmcv.imresize(
822
+ retrieve_img, (int(retrieve_img.shape[1] * scale_ratio),
823
+ int(retrieve_img.shape[0] * scale_ratio)))
824
+
825
+ # 2. paste
826
+ out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img
827
+
828
+ # 3. scale jit
829
+ scale_ratio *= jit_factor
830
+ out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor),
831
+ int(out_img.shape[0] * jit_factor)))
832
+
833
+ # 4. flip
834
+ if is_flip:
835
+ out_img = out_img[:, ::-1, :]
836
+
837
+ # 5. random crop
838
+ ori_img = results['img']
839
+ origin_h, origin_w = out_img.shape[:2]
840
+ target_h, target_w = ori_img.shape[:2]
841
+ padded_img = np.ones((max(origin_h, target_h), max(
842
+ origin_w, target_w), 3)) * self.pad_val
843
+ padded_img = padded_img.astype(np.uint8)
844
+ padded_img[:origin_h, :origin_w] = out_img
845
+
846
+ x_offset, y_offset = 0, 0
847
+ if padded_img.shape[0] > target_h:
848
+ y_offset = random.randint(0, padded_img.shape[0] - target_h)
849
+ if padded_img.shape[1] > target_w:
850
+ x_offset = random.randint(0, padded_img.shape[1] - target_w)
851
+ padded_cropped_img = padded_img[y_offset:y_offset + target_h,
852
+ x_offset:x_offset + target_w]
853
+
854
+ # 6. adjust bbox
855
+ retrieve_gt_bboxes = retrieve_results['gt_bboxes']
856
+ retrieve_gt_bboxes.rescale_([scale_ratio, scale_ratio])
857
+ if with_mask:
858
+ retrieve_gt_masks = retrieve_results['gt_masks'].rescale(
859
+ scale_ratio)
860
+
861
+ if self.bbox_clip_border:
862
+ retrieve_gt_bboxes.clip_([origin_h, origin_w])
863
+
864
+ if is_flip:
865
+ retrieve_gt_bboxes.flip_([origin_h, origin_w],
866
+ direction='horizontal')
867
+ if with_mask:
868
+ retrieve_gt_masks = retrieve_gt_masks.flip()
869
+
870
+ # 7. filter
871
+ cp_retrieve_gt_bboxes = retrieve_gt_bboxes.clone()
872
+ cp_retrieve_gt_bboxes.translate_([-x_offset, -y_offset])
873
+ if with_mask:
874
+
875
+ retrieve_gt_masks = translate_bitmapmask(retrieve_gt_masks,
876
+ out_shape=(target_h, target_w),
877
+ offset_x=-x_offset, offset_y=-y_offset)
878
+
879
+ # retrieve_gt_masks = retrieve_gt_masks.translate(
880
+ # out_shape=(target_h, target_w),
881
+ # offset=-x_offset,
882
+ # direction='horizontal')
883
+ # retrieve_gt_masks = retrieve_gt_masks.translate(
884
+ # out_shape=(target_h, target_w),
885
+ # offset=-y_offset,
886
+ # direction='vertical')
887
+
888
+ if self.bbox_clip_border:
889
+ cp_retrieve_gt_bboxes.clip_([target_h, target_w])
890
+
891
+ # 8. mix up
892
+ ori_img = ori_img.astype(np.float32)
893
+ mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img.astype(np.float32)
894
+
895
+ retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels']
896
+ retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags']
897
+ retrieve_gt_ignore_mask_flags = retrieve_results['gt_ignore_mask_flags']
898
+
899
+ mixup_gt_bboxes = cp_retrieve_gt_bboxes.cat(
900
+ (results['gt_bboxes'], cp_retrieve_gt_bboxes), dim=0)
901
+ mixup_gt_bboxes_labels = np.concatenate(
902
+ (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0)
903
+ mixup_gt_ignore_flags = np.concatenate(
904
+ (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0)
905
+ mixup_gt_ignore_mask_flags = np.concatenate(
906
+ (results['gt_ignore_mask_flags'], retrieve_gt_ignore_mask_flags), axis=0)
907
+
908
+ if with_mask:
909
+ mixup_gt_masks = retrieve_gt_masks.cat(
910
+ [results['gt_masks'], retrieve_gt_masks])
911
+
912
+ # remove outside bbox
913
+ inside_inds = mixup_gt_bboxes.is_inside([target_h, target_w]).numpy()
914
+ mixup_gt_bboxes = mixup_gt_bboxes[inside_inds]
915
+ mixup_gt_bboxes_labels = mixup_gt_bboxes_labels[inside_inds]
916
+ mixup_gt_ignore_flags = mixup_gt_ignore_flags[inside_inds]
917
+ mixup_gt_ignore_mask_flags = mixup_gt_ignore_mask_flags[inside_inds]
918
+ if with_mask:
919
+ mixup_gt_masks = mixup_gt_masks[inside_inds]
920
+
921
+ results['img'] = mixup_img.astype(np.uint8)
922
+ results['img_shape'] = mixup_img.shape
923
+ results['gt_bboxes'] = mixup_gt_bboxes
924
+ results['gt_bboxes_labels'] = mixup_gt_bboxes_labels
925
+ results['gt_ignore_flags'] = mixup_gt_ignore_flags
926
+ results['gt_ignore_mask_flags'] = mixup_gt_ignore_mask_flags
927
+ if with_mask:
928
+ results['gt_masks'] = mixup_gt_masks
929
+ return results
animeinsseg/data/maskrefine_dataset.py ADDED
@@ -0,0 +1,235 @@
1
+ import albumentations as A
2
+
3
+ from torch.utils.data import Dataset, DataLoader
4
+ import pycocotools.mask as maskUtils
5
+ from pycocotools.coco import COCO
6
+ import random
7
+ import os.path as osp
8
+ import cv2
9
+ import numpy as np
10
+ from scipy.ndimage import distance_transform_bf, distance_transform_edt, distance_transform_cdt
11
+
12
+
13
+ def is_grey(img: np.ndarray):
14
+ if len(img.shape) == 3 and img.shape[2] == 3:
15
+ return False
16
+ else:
17
+ return True
18
+
19
+
20
+ def square_pad_resize(img: np.ndarray, tgt_size: int, pad_value = (0, 0, 0)):
21
+ h, w = img.shape[:2]
22
+ pad_h, pad_w = 0, 0
23
+
24
+ # make square image
25
+ if w < h:
26
+ pad_w = h - w
27
+ w += pad_w
28
+ elif h < w:
29
+ pad_h = w - h
30
+ h += pad_h
31
+
32
+ pad_size = tgt_size - h
33
+ if pad_size > 0:
34
+ pad_h += pad_size
35
+ pad_w += pad_size
36
+
37
+ if pad_h > 0 or pad_w > 0:
38
+ c = 1
39
+ if is_grey(img):
40
+ if isinstance(pad_value, tuple):
41
+ pad_value = pad_value[0]
42
+ else:
43
+ if isinstance(pad_value, int):
44
+ pad_value = (pad_value, pad_value, pad_value)
45
+
46
+ img = cv2.copyMakeBorder(img, 0, pad_h, 0, pad_w, cv2.BORDER_CONSTANT, value=pad_value)
47
+
48
+ resize_ratio = tgt_size / img.shape[0]
49
+ if resize_ratio < 1:
50
+ img = cv2.resize(img, (tgt_size, tgt_size), interpolation=cv2.INTER_AREA)
51
+ elif resize_ratio > 1:
52
+ img = cv2.resize(img, (tgt_size, tgt_size), interpolation=cv2.INTER_LINEAR)
53
+
54
+ return img, resize_ratio, pad_h, pad_w
55
+
56
+
57
+ class MaskRefineDataset(Dataset):
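+ # Each sample is a dict: 'img' is a CHW float image in [0, 1] (plus an extra instance-mask channel when load_instance_mask is True), 'mask' is the GT mask, and 'dist_weight' is an optional boundary-weighted loss map.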
58
+
59
+ def __init__(self,
60
+ refine_ann_path: str,
61
+ data_root: str,
62
+ load_instance_mask: bool = True,
63
+ aug_ins_prob: float = 0.,
64
+ ins_rect_prob: float = 0.,
65
+ output_size: int = 720,
66
+ augmentation: bool = False,
67
+ with_distance: bool = False):
68
+ self.load_instance_mask = load_instance_mask
69
+ self.ann_util = COCO(refine_ann_path)
70
+ self.img_ids = self.ann_util.getImgIds()
71
+ self.set_load_method(load_instance_mask)
72
+ self.data_root = data_root
73
+
74
+ self.ins_rect_prob = ins_rect_prob
75
+ self.aug_ins_prob = aug_ins_prob
76
+ self.augmentation = augmentation
77
+ if augmentation:
78
+ transform = [
79
+ A.OpticalDistortion(),
80
+ A.HorizontalFlip(),
81
+ A.CLAHE(),
82
+ A.Posterize(),
83
+ A.CropAndPad(percent=0.1, p=0.3, pad_mode=cv2.BORDER_CONSTANT, pad_cval=0, pad_cval_mask=0, keep_size=True),
84
+ A.RandomContrast(),
85
+ A.Rotate(30, p=0.3, mask_value=0, border_mode=cv2.BORDER_CONSTANT)
86
+ ]
87
+ self._aug_transform = A.Compose(transform)
88
+ else:
89
+ self._aug_transform = None
90
+
91
+ self.output_size = output_size
92
+ self.with_distance = with_distance
93
+
94
+ def set_output_size(self, size: int):
95
+ self.output_size = size
96
+
97
+ def set_load_method(self, load_instance_mask: bool):
98
+ if load_instance_mask:
99
+ self._load_mask = self._load_with_instance
100
+ else:
101
+ self._load_mask = self._load_without_instance
102
+
103
+ def __getitem__(self, idx: int):
104
+ img_id = self.img_ids[idx]
105
+ img_meta = self.ann_util.imgs[img_id]
106
+ img_path = osp.join(self.data_root, img_meta['file_name'])
107
+ img = cv2.imread(img_path)
108
+
109
+ annids = self.ann_util.getAnnIds([img_id])
110
+ if len(annids) > 0:
111
+ ann = random.choice(annids)
112
+ ann = self.ann_util.anns[ann]
113
+ assert ann['image_id'] == img_id
114
+ else:
115
+ ann = None
116
+
117
+ return self._load_mask(img, ann)
118
+
119
+ def transform(self, img: np.ndarray, mask: np.ndarray, ins_seg: np.ndarray = None) -> dict:
120
+ if ins_seg is not None:
121
+ use_seg = True
122
+ else:
123
+ use_seg = False
124
+
125
+ if self.augmentation:
126
+ masks = [mask]
127
+ if use_seg:
128
+ masks.append(ins_seg)
129
+ data = self._aug_transform(image=img, masks=masks)
130
+ img = data['image']
131
+ masks = data['masks']
132
+ mask = masks[0]
133
+ if use_seg:
134
+ ins_seg = masks[1]
135
+
136
+ img = square_pad_resize(img, self.output_size, random.randint(0, 255))[0]
137
+ mask = square_pad_resize(mask, self.output_size, 0)[0]
138
+ if ins_seg is not None:
139
+ ins_seg = square_pad_resize(ins_seg, self.output_size, 0)[0]
140
+
141
+ img = (img.astype(np.float32) / 255.).transpose((2, 0, 1))
142
+ mask = mask[None, ...]
143
+
144
+
145
+ if use_seg:
146
+ ins_seg = ins_seg[None, ...]
147
+ img = np.concatenate((img, ins_seg), axis=0)
148
+
149
+ data = {'img': img, 'mask': mask}
150
+ if self.with_distance:
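+ # Build a boundary-weighted loss map: invert the normalized distance transform inside the mask, add a 0.2 floor, rescale to mean ~1 and clip at 20.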
151
+ dist = distance_transform_edt(mask[0])
152
+ dist_max = dist.max()
153
+ if dist_max != 0:
154
+ dist = 1 - dist / dist_max
155
+ # diff_mat = cv2.bitwise_xor(mask[0], ins_seg[0])
156
+ # dist = dist + diff_mat + 0.2
157
+ dist = dist + 0.2
158
+ dist = dist.size / (dist.sum() + 1) * dist
159
+ dist = np.clip(dist, 0, 20)
160
+ else:
161
+ dist = np.ones_like(dist)
162
+ # print(dist.max(), dist.min())
163
+ data['dist_weight'] = dist[None, ...]
164
+ return data
165
+
166
+ def _load_with_instance(self, img: np.ndarray, ann: dict):
167
+ if ann is None:
168
+ mask = np.zeros(img.shape[:2], dtype=np.float32)
169
+ ins_seg = mask
170
+ else:
171
+ mask = maskUtils.decode(ann['segmentation']).astype(np.float32)
172
+ if self.augmentation and random.random() < self.ins_rect_prob:
173
+ ins_seg = np.zeros_like(mask)
174
+ bbox = [int(b) for b in ann['bbox']]
175
+ ins_seg[bbox[1]: bbox[1] + bbox[3], bbox[0]: bbox[0] + bbox[2]] = 1
176
+ elif len(ann['pred_segmentations']) > 0:
177
+ ins_seg = random.choice(ann['pred_segmentations'])
178
+ ins_seg = maskUtils.decode(ins_seg).astype(np.float32)
179
+ else:
180
+ ins_seg = mask
181
+ if self.augmentation and random.random() < self.aug_ins_prob:
182
+ ksize = random.choice([1, 3, 5, 7])
183
+ ksize = ksize * 2 + 1
184
+ kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, ksize=(ksize, ksize))
185
+ if random.random() < 0.5:
186
+ ins_seg = cv2.dilate(ins_seg, kernel)
187
+ else:
188
+ ins_seg = cv2.erode(ins_seg, kernel)
189
+
190
+ return self.transform(img, mask, ins_seg)
191
+
192
+ def _load_without_instance(self, img: np.ndarray, ann: dict):
193
+ if ann is None:
194
+ mask = np.zeros(img.shape[:2], dtype=np.float32)
195
+ else:
196
+ mask = maskUtils.decode(ann['segmentation']).astype(np.float32)
197
+ return self.transform(img, mask)
198
+
199
+ def __len__(self):
200
+ return len(self.img_ids)
201
+
202
+
203
+ if __name__ == '__main__':
204
+ ann_path = r'workspace/test_syndata/annotations/refine_train.json'
205
+ data_root = r'workspace/test_syndata/train'
206
+
209
+ aug_ins_prob = 0.5
210
+ load_instance_mask = True
211
+ ins_rect_prob = 0.25
212
+ output_size = 640
213
+ augmentation = True
214
+
215
+ random.seed(0)
216
+
217
+ md = MaskRefineDataset(ann_path, data_root, load_instance_mask, aug_ins_prob, ins_rect_prob, output_size, augmentation, with_distance=True)
218
+
219
+ dl = DataLoader(md, batch_size=1, shuffle=False, persistent_workers=True,
220
+ num_workers=1, pin_memory=True)
221
+ for data in dl:
222
+ img = data['img'].cpu().numpy()
223
+ img = (img[0, :3].transpose((1, 2, 0)) * 255).astype(np.uint8)
224
+ mask = (data['mask'].cpu().numpy()[0][0] * 255).astype(np.uint8)
225
+ if load_instance_mask:
226
+ ins = (data['img'].cpu().numpy()[0][3] * 255).astype(np.uint8)
227
+ cv2.imshow('ins', ins)
228
+ dist = data['dist_weight'].cpu().numpy()[0][0]
229
+ dist = (dist / dist.max() * 255).astype(np.uint8)
230
+ cv2.imshow('img', img)
231
+ cv2.imshow('mask', mask)
232
+ cv2.imshow('dist_weight', dist)
233
+ cv2.waitKey(0)
234
+
235
+ # cv2.imwrite('')
animeinsseg/data/metrics.py ADDED
@@ -0,0 +1,348 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import datetime
3
+ import itertools
4
+ import os.path as osp
5
+ import tempfile
6
+ from collections import OrderedDict
7
+ from typing import Dict, List, Optional, Sequence, Union
8
+
9
+ import numpy as np
10
+ import torch
11
+ from mmengine.evaluator import BaseMetric
12
+ from mmengine.fileio import FileClient, dump, load
13
+ from mmengine.logging import MMLogger
14
+ from terminaltables import AsciiTable
15
+
16
+ from mmdet.datasets.api_wrappers import COCO, COCOeval
17
+ from mmdet.registry import METRICS
18
+ from mmdet.structures.mask import encode_mask_results
19
+ # from ..functional import eval_recalls
20
+ from mmdet.evaluation.metrics import CocoMetric
21
+
22
+
23
+ @METRICS.register_module()
24
+ class AnimeMangaMetric(CocoMetric):
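+ # CocoMetric wrapper that splits results by img_id into Manga109 and AnimeIns subsets and evaluates each separately; the 'segm' metric is skipped for the Manga109 subset.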
25
+
26
+ def __init__(self,
27
+ manga109_annfile=None,
28
+ animeins_annfile=None,
29
+ ann_file: Optional[str] = None,
30
+ metric: Union[str, List[str]] = 'bbox',
31
+ classwise: bool = False,
32
+ proposal_nums: Sequence[int] = (100, 300, 1000),
33
+ iou_thrs: Optional[Union[float, Sequence[float]]] = None,
34
+ metric_items: Optional[Sequence[str]] = None,
35
+ format_only: bool = False,
36
+ outfile_prefix: Optional[str] = None,
37
+ file_client_args: dict = dict(backend='disk'),
38
+ collect_device: str = 'cpu',
39
+ prefix: Optional[str] = None,
40
+ sort_categories: bool = False) -> None:
41
+
42
+ super().__init__(ann_file, metric, classwise, proposal_nums, iou_thrs, metric_items, format_only, outfile_prefix, file_client_args, collect_device, prefix, sort_categories)
43
+
44
+ self.manga109_img_ids = set()
45
+ if manga109_annfile is not None:
46
+ with self.file_client.get_local_path(manga109_annfile) as local_path:
47
+ self._manga109_coco_api = COCO(local_path)
48
+ if sort_categories:
49
+ # 'categories' list in objects365_train.json and
50
+ # objects365_val.json is inconsistent, need sort
51
+ # list(or dict) before get cat_ids.
52
+ cats = self._manga109_coco_api.cats
53
+ sorted_cats = {i: cats[i] for i in sorted(cats)}
54
+ self._manga109_coco_api.cats = sorted_cats
55
+ categories = self._manga109_coco_api.dataset['categories']
56
+ sorted_categories = sorted(
57
+ categories, key=lambda i: i['id'])
58
+ self._manga109_coco_api.dataset['categories'] = sorted_categories
59
+ self.manga109_img_ids = set(self._manga109_coco_api.get_img_ids())
60
+ else:
61
+ self._manga109_coco_api = None
62
+
63
+ self.animeins_img_ids = set()
64
+ if animeins_annfile is not None:
65
+ with self.file_client.get_local_path(animeins_annfile) as local_path:
66
+ self._animeins_coco_api = COCO(local_path)
67
+ if sort_categories:
68
+ # 'categories' list in objects365_train.json and
69
+ # objects365_val.json is inconsistent, need sort
70
+ # list(or dict) before get cat_ids.
71
+ cats = self._animeins_coco_api.cats
72
+ sorted_cats = {i: cats[i] for i in sorted(cats)}
73
+ self._animeins_coco_api.cats = sorted_cats
74
+ categories = self._animeins_coco_api.dataset['categories']
75
+ sorted_categories = sorted(
76
+ categories, key=lambda i: i['id'])
77
+ self._animeins_coco_api.dataset['categories'] = sorted_categories
78
+ self.animeins_img_ids = set(self._animeins_coco_api.get_img_ids())
79
+ else:
80
+ self._animeins_coco_api = None
81
+
82
+ if self._animeins_coco_api is not None:
83
+ self._coco_api = self._animeins_coco_api
84
+ else:
85
+ self._coco_api = self._manga109_coco_api
86
+
87
+
88
+ def compute_metrics(self, results: list) -> Dict[str, float]:
89
+
90
+ # split gt and prediction list
91
+ gts, preds = zip(*results)
92
+
93
+ manga109_gts, animeins_gts = [], []
94
+ manga109_preds, animeins_preds = [], []
95
+ for gt, pred in zip(gts, preds):
96
+ if gt['img_id'] in self.manga109_img_ids:
97
+ manga109_gts.append(gt)
98
+ manga109_preds.append(pred)
99
+ else:
100
+ animeins_gts.append(gt)
101
+ animeins_preds.append(pred)
102
+
103
+ tmp_dir = None
104
+ if self.outfile_prefix is None:
105
+ tmp_dir = tempfile.TemporaryDirectory()
106
+ outfile_prefix = osp.join(tmp_dir.name, 'results')
107
+ else:
108
+ outfile_prefix = self.outfile_prefix
109
+
110
+ eval_results = OrderedDict()
111
+
112
+ if len(manga109_gts) > 0:
113
+ metrics = []
114
+ for m in self.metrics:
115
+ if m != 'segm':
116
+ metrics.append(m)
117
+
118
+ self.cat_ids = self._manga109_coco_api.get_cat_ids(cat_names=self.dataset_meta['classes'])
119
+ self.img_ids = self._manga109_coco_api.get_img_ids()
120
+ rst = self._compute_metrics(metrics, self._manga109_coco_api, manga109_preds, outfile_prefix, tmp_dir)
121
+ for key, item in rst.items():
122
+ eval_results['manga109_'+key] = item
123
+
124
+ if len(animeins_gts) > 0:
125
+ self.cat_ids = self._animeins_coco_api.get_cat_ids(cat_names=self.dataset_meta['classes'])
126
+ self.img_ids = self._animeins_coco_api.get_img_ids()
127
+ rst = self._compute_metrics(self.metrics, self._animeins_coco_api, animeins_preds, outfile_prefix, tmp_dir)
128
+ for key, item in rst.items():
129
+ eval_results['animeins_'+key] = item
130
+
131
+ return eval_results
132
+
133
+ def results2json(self, results: Sequence[dict],
134
+ outfile_prefix: str) -> dict:
135
+ """Dump the detection results to a COCO style json file.
136
+
137
+ There are 3 types of results: proposals, bbox predictions, mask
138
+ predictions, and they have different data types. This method will
139
+ automatically recognize the type, and dump them to json files.
140
+
141
+ Args:
142
+ results (Sequence[dict]): Testing results of the
143
+ dataset.
144
+ outfile_prefix (str): The filename prefix of the json files. If the
145
+ prefix is "somepath/xxx", the json files will be named
146
+ "somepath/xxx.bbox.json", "somepath/xxx.segm.json",
147
+ "somepath/xxx.proposal.json".
148
+
149
+ Returns:
150
+ dict: Possible keys are "bbox", "segm", "proposal", and
151
+ values are corresponding filenames.
152
+ """
153
+ bbox_json_results = []
154
+ segm_json_results = [] if 'masks' in results[0] else None
155
+ for idx, result in enumerate(results):
156
+ image_id = result.get('img_id', idx)
157
+ labels = result['labels']
158
+ bboxes = result['bboxes']
159
+ scores = result['scores']
160
+ # bbox results
161
+ for i, label in enumerate(labels):
162
+ data = dict()
163
+ data['image_id'] = image_id
164
+ data['bbox'] = self.xyxy2xywh(bboxes[i])
165
+ data['score'] = float(scores[i])
166
+ data['category_id'] = self.cat_ids[label]
167
+ bbox_json_results.append(data)
168
+
169
+ if segm_json_results is None:
170
+ continue
171
+
172
+ # segm results
173
+ masks = result['masks']
174
+ mask_scores = result.get('mask_scores', scores)
175
+ for i, label in enumerate(labels):
176
+ data = dict()
177
+ data['image_id'] = image_id
178
+ data['bbox'] = self.xyxy2xywh(bboxes[i])
179
+ data['score'] = float(mask_scores[i])
180
+ data['category_id'] = self.cat_ids[label]
181
+ if isinstance(masks[i]['counts'], bytes):
182
+ masks[i]['counts'] = masks[i]['counts'].decode()
183
+ data['segmentation'] = masks[i]
184
+ segm_json_results.append(data)
185
+
186
+ logger: MMLogger = MMLogger.get_current_instance()
187
+ logger.info('dumping predictions ... ')
188
+ result_files = dict()
189
+ result_files['bbox'] = f'{outfile_prefix}.bbox.json'
190
+ result_files['proposal'] = f'{outfile_prefix}.bbox.json'
191
+ dump(bbox_json_results, result_files['bbox'])
192
+
193
+ if segm_json_results is not None:
194
+ result_files['segm'] = f'{outfile_prefix}.segm.json'
195
+ dump(segm_json_results, result_files['segm'])
196
+
197
+ return result_files
198
+
199
+ def _compute_metrics(self, metrics, tgt_api, preds, outfile_prefix, tmp_dir):
200
+ logger: MMLogger = MMLogger.get_current_instance()
201
+
202
+ result_files = self.results2json(preds, outfile_prefix)
203
+
204
+ eval_results = OrderedDict()
205
+ if self.format_only:
206
+ logger.info('results are saved in '
207
+ f'{osp.dirname(outfile_prefix)}')
208
+ return eval_results
209
+
210
+ for metric in metrics:
211
+ logger.info(f'Evaluating {metric}...')
212
+
213
+ # TODO: May refactor fast_eval_recall to an independent metric?
214
+ # fast eval recall
215
+ if metric == 'proposal_fast':
216
+ ar = self.fast_eval_recall(
217
+ preds, self.proposal_nums, self.iou_thrs, logger=logger)
218
+ log_msg = []
219
+ for i, num in enumerate(self.proposal_nums):
220
+ eval_results[f'AR@{num}'] = ar[i]
221
+ log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}')
222
+ log_msg = ''.join(log_msg)
223
+ logger.info(log_msg)
224
+ continue
225
+
226
+ # evaluate proposal, bbox and segm
227
+ iou_type = 'bbox' if metric == 'proposal' else metric
228
+ if metric not in result_files:
229
+ raise KeyError(f'{metric} is not in results')
230
+ try:
231
+ predictions = load(result_files[metric])
232
+ if iou_type == 'segm':
233
+ # Refer to https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L331 # noqa
234
+ # When evaluating mask AP, if the results contain bbox,
235
+ # cocoapi will use the box area instead of the mask area
236
+ # for calculating the instance area. Though the overall AP
237
+ # is not affected, this leads to different
238
+ # small/medium/large mask AP results.
239
+ for x in predictions:
240
+ x.pop('bbox')
241
+ coco_dt = tgt_api.loadRes(predictions)
242
+
243
+ except IndexError:
244
+ logger.error(
245
+ 'The testing results of the whole dataset is empty.')
246
+ break
247
+
248
+ coco_eval = COCOeval(tgt_api, coco_dt, iou_type)
249
+
250
+ coco_eval.params.catIds = self.cat_ids
251
+ coco_eval.params.imgIds = self.img_ids
252
+ coco_eval.params.maxDets = list(self.proposal_nums)
253
+ coco_eval.params.iouThrs = self.iou_thrs
254
+
255
+ # mapping of cocoEval.stats
256
+ coco_metric_names = {
257
+ 'mAP': 0,
258
+ 'mAP_50': 1,
259
+ 'mAP_75': 2,
260
+ 'mAP_s': 3,
261
+ 'mAP_m': 4,
262
+ 'mAP_l': 5,
263
+ 'AR@100': 6,
264
+ 'AR@300': 7,
265
+ 'AR@1000': 8,
266
+ 'AR_s@1000': 9,
267
+ 'AR_m@1000': 10,
268
+ 'AR_l@1000': 11
269
+ }
270
+ metric_items = self.metric_items
271
+ if metric_items is not None:
272
+ for metric_item in metric_items:
273
+ if metric_item not in coco_metric_names:
274
+ raise KeyError(
275
+ f'metric item "{metric_item}" is not supported')
276
+
277
+ if metric == 'proposal':
278
+ coco_eval.params.useCats = 0
279
+ coco_eval.evaluate()
280
+ coco_eval.accumulate()
281
+ coco_eval.summarize()
282
+ if metric_items is None:
283
+ metric_items = [
284
+ 'AR@100', 'AR@300', 'AR@1000', 'AR_s@1000',
285
+ 'AR_m@1000', 'AR_l@1000'
286
+ ]
287
+
288
+ for item in metric_items:
289
+ val = float(
290
+ f'{coco_eval.stats[coco_metric_names[item]]:.3f}')
291
+ eval_results[item] = val
292
+ else:
293
+ coco_eval.evaluate()
294
+ coco_eval.accumulate()
295
+ coco_eval.summarize()
296
+ if self.classwise: # Compute per-category AP
297
+ # Compute per-category AP
298
+ # from https://github.com/facebookresearch/detectron2/
299
+ precisions = coco_eval.eval['precision']
300
+ # precision: (iou, recall, cls, area range, max dets)
301
+ assert len(self.cat_ids) == precisions.shape[2]
302
+
303
+ results_per_category = []
304
+ for idx, cat_id in enumerate(self.cat_ids):
305
+ # area range index 0: all area ranges
306
+ # max dets index -1: typically 100 per image
307
+ nm = tgt_api.loadCats(cat_id)[0]
308
+ precision = precisions[:, :, idx, 0, -1]
309
+ precision = precision[precision > -1]
310
+ if precision.size:
311
+ ap = np.mean(precision)
312
+ else:
313
+ ap = float('nan')
314
+ results_per_category.append(
315
+ (f'{nm["name"]}', f'{round(ap, 3)}'))
316
+ eval_results[f'{nm["name"]}_precision'] = round(ap, 3)
317
+
318
+ num_columns = min(6, len(results_per_category) * 2)
319
+ results_flatten = list(
320
+ itertools.chain(*results_per_category))
321
+ headers = ['category', 'AP'] * (num_columns // 2)
322
+ results_2d = itertools.zip_longest(*[
323
+ results_flatten[i::num_columns]
324
+ for i in range(num_columns)
325
+ ])
326
+ table_data = [headers]
327
+ table_data += [result for result in results_2d]
328
+ table = AsciiTable(table_data)
329
+ logger.info('\n' + table.table)
330
+
331
+ if metric_items is None:
332
+ metric_items = [
333
+ 'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'
334
+ ]
335
+
336
+ for metric_item in metric_items:
337
+ key = f'{metric}_{metric_item}'
338
+ val = coco_eval.stats[coco_metric_names[metric_item]]
339
+ eval_results[key] = float(f'{round(val, 3)}')
340
+
341
+ ap = coco_eval.stats[:6]
342
+ logger.info(f'{metric}_mAP_copypaste: {ap[0]:.3f} '
343
+ f'{ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} '
344
+ f'{ap[4]:.3f} {ap[5]:.3f}')
345
+
346
+ if tmp_dir is not None:
347
+ tmp_dir.cleanup()
348
+ return eval_results
animeinsseg/data/paste_methods.py ADDED
@@ -0,0 +1,327 @@
1
+ import numpy as np
2
+ from typing import List, Union, Tuple, Dict
3
+ import random
4
+ from PIL import Image
5
+ import cv2
6
+ import os.path as osp
7
+ from tqdm import tqdm
8
+ from panopticapi.utils import rgb2id, id2rgb
9
+ from time import time
10
+ import traceback
11
+
12
+ from utils.io_utils import bbox_overlap_area
13
+ from utils.logger import LOGGER
14
+ from utils.constants import COLOR_PALETTE
15
+
16
+
17
+
18
+ class PartitionTree:
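+ # Free-space partition tree: each pasted foreground splits the remaining empty area into left/top/right/bottom children, so later foregrounds can be placed with limited overlap.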
19
+
20
+ def __init__(self, bleft: int, btop: int, bright: int, bbottom: int, parent = None) -> None:
21
+ self.left: PartitionTree = None
22
+ self.right: PartitionTree = None
23
+ self.top: PartitionTree = None
24
+ self.bottom: PartitionTree = None
25
+
26
+ if bright < bleft:
27
+ bright = bleft
28
+ if bbottom < btop:
29
+ bbottom = btop
30
+
31
+ self.bleft = bleft
32
+ self.bright = bright
33
+ self.btop = btop
34
+ self.bbottom = bbottom
35
+ self.parent: PartitionTree = parent
36
+
37
+ def is_leaf(self):
38
+ return self.left is None
39
+
40
+ def new_partition(self, new_rect: List):
41
+ self.left = PartitionTree(self.bleft, self.btop, new_rect[0], self.bbottom, self)
42
+ self.top = PartitionTree(self.bleft, self.btop, self.bright, new_rect[1], self)
43
+ self.right = PartitionTree(new_rect[2], self.btop, self.bright, self.bbottom, self)
44
+ self.bottom = PartitionTree(self.bleft, new_rect[3], self.bright, self.bbottom, self)
45
+ if self.parent is not None:
46
+ self.root_update_rect(new_rect)
47
+
48
+ def root_update_rect(self, rect):
49
+ root = self.get_root()
50
+ root.update_child_rect(rect)
51
+
52
+ def update_child_rect(self, rect: List):
53
+ if self.is_leaf():
54
+ self.update_from_rect(rect)
55
+ else:
56
+ self.left.update_child_rect(rect)
57
+ self.right.update_child_rect(rect)
58
+ self.top.update_child_rect(rect)
59
+ self.bottom.update_child_rect(rect)
60
+
61
+ def get_root(self):
62
+ if self.parent is not None:
63
+ return self.parent.get_root()
64
+ else:
65
+ return self
66
+
67
+
68
+ def update_from_rect(self, rect: List):
69
+ if not self.is_leaf():
70
+ return
71
+ ix = min(self.bright, rect[2]) - max(self.bleft, rect[0])
72
+ iy = min(self.bbottom, rect[3]) - max(self.btop, rect[1])
73
+ if not (ix > 0 and iy > 0):
74
+ return
75
+
76
+ new_ltrb0 = np.array([self.bleft, self.btop, self.bright, self.bbottom])
77
+ new_ltrb1 = new_ltrb0.copy()
78
+
79
+ if rect[0] > self.bleft and rect[0] < self.bright:
80
+ new_ltrb0[2] = rect[0]
81
+ else:
82
+ new_ltrb0[0] = rect[2]
83
+
84
+ if rect[1] > self.btop and rect[1] < self.bbottom:
85
+ new_ltrb1[3] = rect[1]
86
+ else:
87
+ new_ltrb1[1] = rect[3]
88
+
89
+ if (new_ltrb0[2:] - new_ltrb0[:2]).prod() > (new_ltrb1[2:] - new_ltrb1[:2]).prod():
90
+ self.bleft, self.btop, self.bright, self.bbottom = new_ltrb0
91
+ else:
92
+ self.bleft, self.btop, self.bright, self.bbottom = new_ltrb1
93
+
94
+ @property
95
+ def width(self) -> int:
96
+ return self.bright - self.bleft
97
+
98
+ @property
99
+ def height(self) -> int:
100
+ return self.bbottom - self.btop
101
+
102
+ def prefer_partition(self, tgt_h: int, tgt_w: int):
103
+ if self.is_leaf():
104
+ return self, min(self.width / tgt_w, 1.2) * min(self.height / tgt_h, 1.2)
105
+ else:
106
+ lp, ls = self.left.prefer_partition(tgt_h, tgt_w)
107
+ rp, rs = self.right.prefer_partition(tgt_h, tgt_w)
108
+ tp, ts = self.top.prefer_partition(tgt_h, tgt_w)
109
+ bp, bs = self.bottom.prefer_partition(tgt_h, tgt_w)
110
+ preferp = [(p, s) for s, p in sorted(zip([ls, rs, ts, bs],[lp, rp, tp, bp]), key=lambda pair: pair[0], reverse=True)][0]
111
+ return preferp
112
+
113
+ def new_random_pos(self, fg_h: int, fg_w: int, im_h: int, im_w: int, random_sample: bool = False):
114
+ extx, exty = int(fg_w / 3), int(fg_h / 3)
115
+ extxb, extyb = int(fg_w / 10), int(fg_h / 10)
116
+ region_w, region_h = self.width + extx, self.height + exty
117
+ downscale_ratio = max(min(region_w / fg_w, region_h / fg_h), 0.8)
118
+ if downscale_ratio < 1:
119
+ fg_h = int(downscale_ratio * fg_h)
120
+ fg_w = int(downscale_ratio * fg_w)
121
+
122
+ max_x, max_y = self.bright + extx - fg_w, self.bbottom + exty - fg_h
123
+ max_x = min(im_w+extxb-fg_w, max_x)
124
+ max_y = min(im_h+extyb-fg_h, max_y)
125
+ min_x = max(min(self.bright + extx - fg_w, self.bleft - extx), -extx)
126
+ min_x = max(-extxb, min_x)
127
+ min_y = max(min(self.bbottom + exty - fg_h, self.btop - exty), -exty)
128
+ min_y = max(-extyb, min_y)
129
+ px, py = min_x, min_y
130
+ if min_x < max_x:
131
+ if random_sample:
132
+ px = random.randint(min_x, max_x)
133
+ else:
134
+ px = int((min_x + max_x) / 2)
135
+ if min_y < max_y:
136
+ if random_sample:
137
+ py = random.randint(min_y, max_y)
138
+ else:
139
+ py = int((min_y + max_y) / 2)
140
+ return px, py, downscale_ratio
141
+
142
+ def drawpartition(self, image: np.ndarray, color = None):
143
+ if color is None:
144
+ color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
145
+ if not self.is_leaf():
146
+ cv2.rectangle(image, (self.bleft, self.btop), (self.bright, self.bbottom), color, 2)
147
+ if not self.is_leaf():
148
+ c = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
149
+ self.left.drawpartition(image, c)
150
+ self.right.drawpartition(image, c)
151
+ self.top.drawpartition(image, c)
152
+ self.bottom.drawpartition(image, c)
153
+
154
+
155
+ def paste_one_fg(fg_pil: Image, bg: Image, segments: np.ndarray, px: int, py: int, seg_color: Tuple, cal_area=True):
156
+
157
+ fg_h, fg_w = fg_pil.height, fg_pil.width
158
+ im_h, im_w = bg.height, bg.width
159
+
160
+ bg.paste(fg_pil, (px, py), mask=fg_pil)
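+ # Clip the paste region to the background below; where it runs past a border, fgx2/fgy2 become negative and act as end-relative indices into the foreground array.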
161
+
162
+
163
+ bgx1, bgx2, bgy1, bgy2 = px, px+fg_w, py, py+fg_h
164
+ fgx1, fgx2, fgy1, fgy2 = 0, fg_w, 0, fg_h
165
+ if bgx1 < 0:
166
+ fgx1 = -bgx1
167
+ bgx1 = 0
168
+ if bgy1 < 0:
169
+ fgy1 = -bgy1
170
+ bgy1 = 0
171
+ if bgx2 > im_w:
172
+ fgx2 = im_w - bgx2
173
+ bgx2 = im_w
174
+ if bgy2 > im_h:
175
+ fgy2 = im_h - bgy2
176
+ bgy2 = im_h
177
+
178
+ fg_mask = np.array(fg_pil)[fgy1: fgy2, fgx1: fgx2, 3] > 30
179
+ segments[bgy1: bgy2, bgx1: bgx2][np.where(fg_mask)] = seg_color
180
+
181
+ if cal_area:
182
+ area = fg_mask.sum()
183
+ else:
184
+ area = 1
185
+ bbox = [bgx1, bgy1, bgx2-bgx1, bgy2-bgy1]
186
+ return area, bbox, [bgx1, bgy1, bgx2, bgy2]
187
+
188
+
189
+ def partition_paste(fg_list, bg: Image):
190
+ segments_info = []
191
+
192
+ fg_list.sort(key = lambda x: x['image'].shape[0] * x['image'].shape[1], reverse=True)
193
+ pnode: PartitionTree = None
194
+ im_h, im_w = bg.height, bg.width
195
+
196
+ ptree = PartitionTree(0, 0, bg.width, bg.height)
197
+
198
+ segments = np.zeros((im_h, im_w, 3), np.uint8)
199
+ for ii, fg_dict in enumerate(fg_list):
200
+ fg = fg_dict['image']
201
+ fg_h, fg_w = fg.shape[:2]
202
+ pnode, _ = ptree.prefer_partition(fg_h, fg_w)
203
+ px, py, downscale_ratio = pnode.new_random_pos(fg_h, fg_w, im_h, im_w, True)
204
+
205
+ fg_pil = Image.fromarray(fg)
206
+ if downscale_ratio < 1:
207
+ fg_pil = fg_pil.resize((int(fg_w * downscale_ratio), int(fg_h * downscale_ratio)), resample=Image.Resampling.LANCZOS)
208
+ # fg_h, fg_w = fg_pil.height, fg_pil.width
209
+
210
+ seg_color = COLOR_PALETTE[ii]
211
+ area, bbox, xyxy = paste_one_fg(fg_pil, bg, segments, px,py, seg_color, cal_area=False)
212
+ pnode.new_partition(xyxy)
213
+
214
+ segments_info.append({
215
+ 'id': rgb2id(seg_color),
216
+ 'bbox': bbox,
217
+ 'area': area
218
+ })
219
+
220
+ return segments_info, segments
221
+ # if downscale_ratio < 1:
222
+ # fg_pil = fg_pil.resize((int(fg_w * downscale_ratio), int(fg_h * downscale_ratio)), resample=Image.Resampling.LANCZOS)
223
+ # fg_h, fg_w = fg_pil.height, fg_pil.width
224
+
225
+
226
+ def gen_fg_regbboxes(fg_list: List[Dict], tgt_size: int, min_overlap=0.15, max_overlap=0.8):
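+ # Lay foregrounds out left-to-right with randomized vertical centers and horizontal overlaps; a random depth decides the paint order (deeper ones are pasted first and end up behind).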
227
+
228
+ def _sample_y(h):
229
+ y = (tgt_size - h) // 2
230
+ if y > 0:
231
+ yrange = min(y, h // 4)
232
+ y += random.randint(-yrange, yrange)
233
+ return y
234
+ else:
235
+ return 0
236
+
237
+ shape_list = []
238
+ depth_list = []
239
+
240
+
241
+ for fg_dict in fg_list:
242
+ shape_list.append(fg_dict['image'].shape[:2])
243
+
244
+ shape_list = np.array(shape_list)
245
+ depth_list = np.random.random(len(fg_list))
246
+ depth_list[shape_list[..., 1] > 0.6 * tgt_size] += 1
247
+
248
+ # num_fg = len(fg_list)
249
+ # grid_sample = random.random() < 0.4 or num_fg > 6
250
+ # grid_sample = grid_sample and num_fg < 9 and num_fg > 3
251
+ # grid_sample = False
252
+ # if grid_sample:
253
+ # grid_pos = np.arange(9)
254
+ # np.random.shuffle(grid_pos)
255
+ # grid_pos = grid_pos[: num_fg]
256
+ # grid_x = grid_pos % 3
257
+ # grid_y = grid_pos // 3
258
+
259
+ # else:
260
+ pos_list = [[0, _sample_y(shape_list[0][0])]]
261
+ pre_overlap = 0
262
+ for ii, ((h, w), d) in enumerate(zip(shape_list[1:], depth_list[1:])):
263
+ (preh, prew), predepth, (prex, prey) = shape_list[ii], depth_list[ii], pos_list[ii]
264
+
265
+ isfg = d < predepth
266
+ y = _sample_y(h)
267
+ x = prex+prew
268
+ if isfg:
269
+ min_x = max_x = x
270
+ if pre_overlap < max_overlap:
271
+ min_x -= (max_overlap - pre_overlap) * prew
272
+ min_x = int(min_x)
273
+ if pre_overlap < min_overlap:
274
+ max_x -= (min_overlap - pre_overlap) * prew
275
+ max_x = int(max_x)
276
+ x = random.randint(min_x, max_x)
277
+ pre_overlap = 0
278
+ else:
279
+ overlap = random.uniform(min_overlap, max_overlap)
280
+ x -= int(overlap * w)
281
+ area = h * w
282
+ overlap_area = bbox_overlap_area([x, y, w, h], [prex, prey, prew, preh])
283
+ pre_overlap = overlap_area / area
284
+
285
+ pos_list.append([x, y])
286
+
287
+ pos_list = np.array(pos_list)
288
+ last_x2 = pos_list[-1][0] + shape_list[-1][1]
289
+ valid_shiftx = tgt_size - last_x2
290
+ if valid_shiftx > 0:
291
+ shiftx = random.randint(0, valid_shiftx)
292
+ pos_list[:, 0] += shiftx
293
+ else:
294
+ pos_list[:, 0] += valid_shiftx // 2
295
+
296
+ for pos, fg_dict, depth in zip(pos_list, fg_list, depth_list):
297
+ fg_dict['pos'] = pos
298
+ fg_dict['depth'] = depth
299
+ fg_list.sort(key=lambda x: x['depth'], reverse=True)
300
+
301
+
302
+
303
+ def regular_paste(fg_list, bg: Image, regen_bboxes=False):
304
+ segments_info = []
305
+ im_h, im_w = bg.height, bg.width
306
+
307
+ if regen_bboxes:
308
+ random.shuffle(fg_list)
309
+ gen_fg_regbboxes(fg_list, im_h)
310
+
311
+ segments = np.zeros((im_h, im_w, 3), np.uint8)
312
+ for ii, fg_dict in enumerate(fg_list):
313
+ fg = fg_dict['image']
314
+
315
+ px, py = fg_dict.pop('pos')
316
+ fg_pil = Image.fromarray(fg)
317
+
318
+ seg_color = COLOR_PALETTE[ii]
319
+ area, bbox, xyxy = paste_one_fg(fg_pil, bg, segments, px,py, seg_color, cal_area=True)
320
+
321
+ segments_info.append({
322
+ 'id': rgb2id(seg_color),
323
+ 'bbox': bbox,
324
+ 'area': area
325
+ })
326
+
327
+ return segments_info, segments
animeinsseg/data/sampler.py ADDED
@@ -0,0 +1,226 @@
1
+ import numpy as np
2
+ from random import choice as rchoice
3
+ from random import randint
4
+ import random
5
+ import cv2, traceback, imageio
6
+ import os.path as osp
7
+
8
+ from typing import Optional, List, Union, Tuple, Dict
9
+ from utils.io_utils import imread_nogrey_rgb, json2dict
10
+ from .transforms import rotate_image
11
+ from utils.logger import LOGGER
12
+
13
+
14
+ class NameSampler:
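+ # Draws a name with roughly the probability given in name_prob_dict by precomputing a pool of ids; leftover probability mass maps to the placeholder name '_'.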
15
+
16
+ def __init__(self, name_prob_dict, sample_num=2048) -> None:
17
+ self.name_prob_dict = name_prob_dict
18
+ self._id2name = list(name_prob_dict.keys())
19
+ self.sample_ids = []
20
+
21
+ total_prob = 0.
22
+ for ii, (_, prob) in enumerate(name_prob_dict.items()):
23
+ tgt_num = int(prob * sample_num)
24
+ total_prob += prob
25
+ if tgt_num > 0:
26
+ self.sample_ids += [ii] * tgt_num
27
+
28
+ nsamples = len(self.sample_ids)
29
+ assert total_prob <= 1
30
+ if total_prob < 1 and nsamples < sample_num:
31
+ self.sample_ids += [len(self._id2name)] * (sample_num - nsamples)
32
+ self._id2name.append('_')
33
+
34
+ def sample(self) -> str:
35
+ return self._id2name[rchoice(self.sample_ids)]
36
+
37
+
38
+ class PossionSampler:
39
+ def __init__(self, lam=3, min_val=1, max_val=8) -> None:
40
+ self._distr = np.random.poisson(lam, 1024)
41
+ invalid = np.where(np.logical_or(self._distr<min_val, self._distr > max_val))
42
+ self._distr[invalid] = np.random.randint(min_val, max_val, len(invalid[0]))
43
+
44
+ def sample(self) -> int:
45
+ return rchoice(self._distr)
46
+
47
+
48
+ class NormalSampler:
49
+ def __init__(self, loc=0.33, std=0.2, min_scale=0.15, max_scale=0.85, scalar=1, to_int = True):
50
+ s = np.random.normal(loc, std, 4096)
51
+ valid = np.where(np.logical_and(s>min_scale, s<max_scale))
52
+ self._distr = s[valid] * scalar
53
+ if to_int:
54
+ self._distr = self._distr.astype(np.int32)
55
+
56
+ def sample(self) -> int:
57
+ return rchoice(self._distr)
58
+
59
+
60
+ class PersonBBoxSampler:
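+ # Reuses person bounding-box layouts (loaded from e.g. the COCO-person samples json) as paste targets and scales anime foregrounds to fit them, mimicking natural person arrangements.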
61
+
62
+ def __init__(self, sample_path: Union[str, List]='data/cocoperson_bbox_samples.json', fg_info_list: List = None, fg_transform=None, is_train=True) -> None:
63
+ if isinstance(sample_path, str):
64
+ sample_path = [sample_path]
65
+ self.bbox_list = []
66
+ for sp in sample_path:
67
+ bboxlist = json2dict(sp)
68
+ for bboxes in bboxlist:
69
+ if isinstance(bboxes, dict):
70
+ bboxes = bboxes['bboxes']
71
+ bboxes = np.array(bboxes)
72
+ bboxes[:, [0, 1]] -= bboxes[:, [0, 1]].min(axis=0)
73
+ self.bbox_list.append(bboxes)
74
+
75
+ self.fg_info_list = fg_info_list
76
+ self.fg_transform = fg_transform
77
+ self.is_train = is_train
78
+
79
+ def sample(self, tgt_size: int, scale_range=(1, 1), size_thres=(0.02, 0.85)) -> List[np.ndarray]:
80
+ bboxes_normalized = rchoice(self.bbox_list)
81
+ if scale_range[0] != 1 or scale_range[1] != 1:
82
+ bbox_scale = random.uniform(scale_range[0], scale_range[1])
83
+ else:
84
+ bbox_scale = 1
85
+ bboxes = (bboxes_normalized * tgt_size * bbox_scale).astype(np.int32)
86
+
87
+ xyxy_array = np.copy(bboxes)
88
+ xyxy_array[:, [2, 3]] += xyxy_array[:, [0, 1]]
89
+ x_max, y_max = xyxy_array[:, 2].max(), xyxy_array[:, 3].max()
90
+
91
+ x_shift = tgt_size - x_max
92
+ x_shift = randint(0, x_shift) if x_shift > 0 else 0
93
+ y_shift = tgt_size - y_max
94
+ y_shift = randint(0, y_shift) if y_shift > 0 else 0
95
+
96
+ bboxes[:, [0, 1]] += [x_shift, y_shift]
97
+ valid_bboxes = []
98
+ max_size = size_thres[1] * tgt_size
99
+ min_size = size_thres[0] * tgt_size
100
+ for bbox in bboxes:
101
+ w = min(bbox[2], tgt_size - bbox[0])
102
+ h = min(bbox[3], tgt_size - bbox[1])
103
+ if max(h, w) < max_size and min(h, w) > min_size:
104
+ valid_bboxes.append(bbox)
105
+ return valid_bboxes
106
+
107
+ def sample_matchfg(self, tgt_size: int):
108
+ while True:
109
+ bboxes = self.sample(tgt_size, (1.1, 1.8))
110
+ if len(bboxes) > 0:
111
+ break
112
+ MIN_FG_SIZE = 20
113
+ num_fg = len(bboxes)
114
+ rotate = 20 if self.is_train else 15
115
+ fgs = random_load_nfg(num_fg, self.fg_info_list, random_rotate_prob=0.33, random_rotate=rotate)
116
+ assert len(fgs) == num_fg
117
+
118
+ bboxes.sort(key=lambda x: x[2] / x[3])
119
+ fgs.sort(key=lambda x: x['asp_ratio'])
120
+
121
+ for fg, bbox in zip(fgs, bboxes):
122
+ x, y, w, h = bbox
123
+ img = fg['image']
124
+ im_h, im_w = img.shape[:2]
125
+ if im_h < h and im_w < w:
126
+ scale = min(h / im_h, w / im_w)
127
+ new_h, new_w = int(scale * im_h), int(scale * im_w)
128
+ img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
129
+ else:
130
+ scale_h, scale_w = min(1, h / im_h), min(1, w / im_w)
131
+ scale = (scale_h + scale_w) / 2
132
+ if scale < 1:
133
+ new_h, new_w = max(int(scale * im_h), MIN_FG_SIZE), max(int(scale * im_w), MIN_FG_SIZE)
134
+ img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
135
+
136
+ if self.fg_transform is not None:
137
+ img = self.fg_transform(image=img)['image']
138
+
139
+ im_h, im_w = img.shape[:2]
140
+ fg['image'] = img
141
+ px = int(x + w / 2 - im_w / 2)
142
+ py = int(y + h / 2 - im_h / 2)
143
+ fg['pos'] = (px, py)
144
+
145
+ random.shuffle(fgs)
146
+
147
+ slist, llist = [], []
148
+ large_size = int(tgt_size * 0.55)
149
+ for fg in fgs:
150
+ if max(fg['image'].shape[:2]) > large_size:
151
+ llist.append(fg)
152
+ else:
153
+ slist.append(fg)
154
+ return llist + slist
155
+
156
+
157
+ def random_load_nfg(num_fg: int, fg_info_list: List[Union[Dict, str]], random_rotate=0, random_rotate_prob=0.):
158
+ fgs = []
159
+ while len(fgs) < num_fg:
160
+ fg, fginfo = random_load_valid_fg(fg_info_list)
161
+ if random.random() < random_rotate_prob:
162
+ rotate_deg = randint(-random_rotate, random_rotate)
163
+ fg = rotate_image(fg, rotate_deg, alpha_crop=True)
164
+
165
+ asp_ratio = fg.shape[1] / fg.shape[0]
166
+ fgs.append({'image': fg, 'asp_ratio': asp_ratio, 'fginfo': fginfo})
167
+ while len(fgs) < num_fg and random.random() < 0.12:
168
+ fgs.append({'image': fg, 'asp_ratio': asp_ratio, 'fginfo': fginfo})
169
+
170
+ return fgs
171
+
172
+
173
+ def random_load_valid_fg(fg_info_list: List[Union[Dict, str]]) -> Tuple[np.ndarray, Dict]:
174
+ while True:
175
+ item = fginfo = rchoice(fg_info_list)
176
+
177
+ file_path = fginfo['file_path']
178
+ if 'root_dir' in fginfo and fginfo['root_dir']:
179
+ file_path = osp.join(fginfo['root_dir'], file_path)
180
+
181
+ try:
182
+ fg = imageio.imread(file_path)
183
+ except:
184
+ LOGGER.error(traceback.format_exc())
185
+ LOGGER.error(f'invalid fg: {file_path}')
186
+ fg_info_list.remove(item)
187
+ continue
188
+
189
+ c = 1
190
+ if len(fg.shape) == 3:
191
+ c = fg.shape[-1]
192
+ if c != 4:
193
+ LOGGER.warning(f'fg {file_path} does not have an alpha channel')
194
+ fg_info_list.remove(item)
195
+ else:
196
+ if 'xyxy' in fginfo:
197
+ x1, y1, x2, y2 = fginfo['xyxy']
198
+ else:
199
+ oh, ow = fg.shape[:2]
200
+ ksize = 5
201
+ mask = cv2.blur(fg[..., 3], (ksize,ksize))
202
+ _, mask = cv2.threshold(mask, 20, 255, cv2.THRESH_BINARY)
203
+
204
+ x1, y1, w, h = cv2.boundingRect(cv2.findNonZero(mask))
205
+ x2, y2 = x1 + w, y1 + h
206
+ if oh - h > 15 or ow - w > 15:
207
+ crop = True
208
+ else:
209
+ x1 = y1 = 0
210
+ x2, y2 = ow, oh
211
+
212
+ fginfo['xyxy'] = [x1, y1, x2, y2]
213
+ fg = fg[y1: y2, x1: x2]
214
+ return fg, fginfo
215
+
216
+
217
+ def random_load_valid_bg(bg_list: List[str]) -> np.ndarray:
218
+ while True:
219
+ try:
220
+ bgp = rchoice(bg_list)
221
+ return imread_nogrey_rgb(bgp)
222
+ except:
223
+ LOGGER.error(traceback.format_exc())
224
+ LOGGER.error(f'invalid bg: {bgp}')
225
+ bg_list.remove(bgp)
226
+ continue
animeinsseg/data/syndataset.py ADDED
@@ -0,0 +1,213 @@
1
+ import numpy as np
2
+ from typing import List, Union, Tuple, Dict
3
+ import random
4
+ from PIL import Image
5
+ import cv2
6
+ import imageio, os
7
+ import os.path as osp
8
+ from tqdm import tqdm
9
+ from panopticapi.utils import rgb2id
10
+ import traceback
11
+
12
+ from utils.io_utils import mask2rle, dict2json, fgbg_hist_matching
13
+ from utils.logger import LOGGER
14
+ from utils.constants import CATEGORIES, IMAGE_ID_ZFILL
15
+ from .transforms import get_fg_transforms, get_bg_transforms, quantize_image, resize2height, rotate_image
16
+ from .sampler import random_load_valid_bg, random_load_valid_fg, NameSampler, NormalSampler, PossionSampler, PersonBBoxSampler
17
+ from .paste_methods import regular_paste, partition_paste
18
+
19
+
20
+ def syn_animecoco_dataset(
21
+ bg_list: List, fg_info_list: List[Dict], dataset_save_dir: str, policy: str='train',
22
+ tgt_size=640, syn_num_multiplier=2.5, regular_paste_prob=0.4, person_paste_prob=0.4,
23
+ max_syn_num=-1, image_id_start=0, obj_id_start=0, hist_match_prob=0.2, quantize_prob=0.25):
24
+
25
+ LOGGER.info(f'syn data policy: {policy}')
26
+ LOGGER.info(f'background: {len(bg_list)} foreground: {len(fg_info_list)}')
27
+
28
+ numfg_sampler = PossionSampler(min_val=1, max_val=9, lam=2.5)
29
+ numfg_regpaste_sampler = PossionSampler(min_val=2, max_val=9, lam=3.5)
30
+ regpaste_size_sampler = NormalSampler(scalar=tgt_size, to_int=True, max_scale=0.75)
31
+ color_correction_sampler = NameSampler({'hist_match': hist_match_prob, 'quantize': quantize_prob}, )
32
+ paste_method_sampler = NameSampler({'regular': regular_paste_prob, 'personbbox': person_paste_prob,
33
+ 'partition': 1-regular_paste_prob-person_paste_prob})
34
+
35
+ fg_transform = get_fg_transforms(tgt_size, transform_variant=policy)
36
+ fg_distort_transform = get_fg_transforms(tgt_size, transform_variant='distort_only')
37
+ bg_transform = get_bg_transforms('train', tgt_size)
38
+
39
+ image_id = image_id_start + 1
40
+ obj_id = obj_id_start + 1
41
+
42
+ det_annotations, image_meta = [], []
43
+
44
+ syn_num = int(syn_num_multiplier * len(fg_info_list))
45
+ if max_syn_num > 0:
46
+ syn_num = max_syn_num
47
+
48
+ ann_save_dir = osp.join(dataset_save_dir, 'annotations')
49
+ image_save_dir = osp.join(dataset_save_dir, policy)
50
+
51
+ if not osp.exists(image_save_dir):
52
+ os.makedirs(image_save_dir)
53
+ if not osp.exists(ann_save_dir):
54
+ os.makedirs(ann_save_dir)
55
+
56
+ is_train = policy == 'train'
57
+ if is_train:
58
+ jpg_save_quality = [75, 85, 95]
59
+ else:
60
+ jpg_save_quality = [95]
61
+
62
+ if isinstance(fg_info_list[0], str):
63
+ for ii, fgp in enumerate(fg_info_list):
64
+ if isinstance(fgp, str):
65
+ fg_info_list[ii] = {'file_path': fgp, 'tag_string': [], 'danbooru': False, 'category_id': 0}
66
+
67
+ if person_paste_prob > 0:
68
+ personbbox_sampler = PersonBBoxSampler(
69
+ 'data/cocoperson_bbox_samples.json', fg_info_list,
70
+ fg_transform=fg_distort_transform if is_train else None, is_train=is_train)
71
+
72
+ total = tqdm(range(syn_num))
73
+ for fin in total:
74
+ try:
75
+ paste_method = paste_method_sampler.sample()
76
+
77
+ fgs = []
78
+ if paste_method == 'regular':
79
+ num_fg = numfg_regpaste_sampler.sample()
80
+ size = regpaste_size_sampler.sample()
81
+ while len(fgs) < num_fg:
82
+ tgt_height = int(random.uniform(0.7, 1.2) * size)
83
+ fg, fginfo = random_load_valid_fg(fg_info_list)
84
+ fg = resize2height(fg, tgt_height)
85
+ if is_train:
86
+ fg = fg_distort_transform(image=fg)['image']
87
+ rotate_deg = random.randint(-40, 40)
88
+ else:
89
+ rotate_deg = random.randint(-30, 30)
90
+ if random.random() < 0.3:
91
+ fg = rotate_image(fg, rotate_deg, alpha_crop=True)
92
+ fgs.append({'image': fg, 'fginfo': fginfo})
93
+ while len(fgs) < num_fg and random.random() < 0.15:
94
+ fgs.append({'image': fg, 'fginfo': fginfo})
95
+ elif paste_method == 'personbbox':
96
+ fgs = personbbox_sampler.sample_matchfg(tgt_size)
97
+ else:
98
+ num_fg = numfg_sampler.sample()
99
+ fgs = []
100
+ for ii in range(num_fg):
101
+ fg, fginfo = random_load_valid_fg(fg_info_list)
102
+ fg = fg_transform(image=fg)['image']
103
+ h, w = fg.shape[:2]
104
+ if num_fg > 6:
105
+ downscale = min(tgt_size / 2.5 / w, tgt_size / 2.5 / h)
106
+ if downscale < 1:
107
+ fg = cv2.resize(fg, (int(w * downscale), int(h * downscale)), interpolation=cv2.INTER_AREA)
108
+ fgs.append({'image': fg, 'fginfo': fginfo})
109
+
110
+ bg = random_load_valid_bg(bg_list)
111
+ bg = bg_transform(image=bg)['image']
112
+
113
+ color_correct = color_correction_sampler.sample()
114
+
115
+ if color_correct == 'hist_match':
116
+ fgbg_hist_matching(fgs, bg)
117
+
118
+ bg: Image = Image.fromarray(bg)
119
+
120
+ if paste_method == 'regular':
121
+ segments_info, segments = regular_paste(fgs, bg, regen_bboxes=True)
122
+ elif paste_method == 'personbbox':
123
+ segments_info, segments = regular_paste(fgs, bg, regen_bboxes=False)
124
+ elif paste_method == 'partition':
125
+ segments_info, segments = partition_paste(fgs, bg, )
126
+ else:
127
+ print(f'invalid paste method: {paste_method}')
128
+ raise NotImplementedError
129
+
130
+ image = np.array(bg)
131
+ if color_correct == 'quantize':
132
+ mask = cv2.inRange(segments, np.array([0,0,0]), np.array([0,0,0]))
133
+ # cv2.imshow("mask", mask)
134
+ image = quantize_image(image, random.choice([12, 16, 32]), 'kmeans', mask=mask)[0]
135
+
136
+ # postprocess & check if instance is valid
137
+ for ii, segi in enumerate(segments_info):
138
+ if segi['area'] == 0:
139
+ continue
140
+ x, y, w, h = segi['bbox']
141
+ x2, y2 = x+w, y+h
142
+ c = segments[y: y2, x: x2]
143
+ pan_png = rgb2id(c)
144
+ cmask = (pan_png == segi['id'])
145
+ area = cmask.sum()
146
+
147
+ if paste_method != 'partition' and \
148
+ area / (fgs[ii]['image'][..., 3] > 30).sum() < 0.25:
149
+ # cv2.imshow('im', fgs[ii]['image'])
150
+ # cv2.imshow('mask', fgs[ii]['image'][..., 3])
151
+ # cv2.imshow('seg', segments)
152
+ # cv2.waitKey(0)
153
+ cmask_ids = np.where(cmask)
154
+ segments[y: y2, x: x2][cmask_ids] = 0
155
+ image[y: y2, x: x2][cmask_ids] = (127, 127, 127)
156
+ continue
157
+
158
+ cmask = cmask.astype(np.uint8) * 255
159
+ dx, dy, w, h = cv2.boundingRect(cv2.findNonZero(cmask))
160
+ _bbox = [dx + x, dy + y, w, h]
161
+
162
+ seg = cv2.copyMakeBorder(cmask, y, tgt_size-y2, x, tgt_size-x2, cv2.BORDER_CONSTANT) > 0
163
+ assert seg.shape[0] == tgt_size and seg.shape[1] == tgt_size
164
+ segmentation = mask2rle(seg)
165
+
166
+ det_annotations.append({
167
+ 'id': obj_id,
168
+ 'category_id': fgs[ii]['fginfo']['category_id'],
169
+ 'iscrowd': 0,
170
+ 'segmentation': segmentation,
171
+ 'image_id': image_id,
172
+ 'area': area,
173
+ 'tag_string': fgs[ii]['fginfo']['tag_string'],
174
+ 'tag_string_character': fgs[ii]['fginfo']['tag_string_character'],
175
+ 'bbox': [float(c) for c in _bbox]
176
+ })
177
+
178
+ obj_id += 1
179
+ # cv2.imshow('c', cv2.cvtColor(c, cv2.COLOR_RGB2BGR))
180
+ # cv2.imshow('cmask', cmask)
181
+ # cv2.waitKey(0)
182
+
183
+ image_id_str = str(image_id).zfill(IMAGE_ID_ZFILL)
184
+ image_file_name = image_id_str + '.jpg'
185
+ image_meta.append({
186
+ "id": image_id,"height": tgt_size,"width": tgt_size, "file_name": image_file_name, "id": image_id
187
+ })
188
+
189
+ # LOGGER.info(f'paste method: {paste_method} color correct: {color_correct}')
190
+ # cv2.imshow('image', cv2.cvtColor(image, cv2.COLOR_RGB2BGR))
191
+ # cv2.imshow('segments', cv2.cvtColor(segments, cv2.COLOR_RGB2BGR))
192
+ # cv2.waitKey(0)
193
+
194
+ imageio.imwrite(osp.join(image_save_dir, image_file_name), image, quality=random.choice(jpg_save_quality))
195
+ image_id += 1
196
+
197
+ except:
198
+ LOGGER.error(traceback.format_exc())
199
+ continue
200
+
201
+ det_meta = {
202
+ "info": {},
203
+ "licenses": [],
204
+ "images": image_meta,
205
+ "annotations": det_annotations,
206
+ "categories": CATEGORIES
207
+ }
208
+
209
+ detp = osp.join(ann_save_dir, f'det_{policy}.json')
210
+ dict2json(det_meta, detp)
211
+ LOGGER.info(f'annotations saved to {detp}')
212
+
213
+ return image_id, obj_id
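A sketch of how syn_animecoco_dataset might be driven; the paths below are assumptions, and each foreground record is expected to carry 'file_path', 'category_id', 'tag_string' and 'tag_string_character':

# Sketch only: hypothetical background/foreground sources.
import json
from glob import glob

bg_list = glob('data/backgrounds/*.jpg')
with open('data/fg_annotations.json', 'r', encoding='utf8') as f:
    fg_info_list = json.load(f)

# Generate a training split, then reuse the returned counters so the
# validation split keeps globally unique image and annotation ids.
image_id, obj_id = syn_animecoco_dataset(
    bg_list, fg_info_list, dataset_save_dir='data/syn_coco',
    policy='train', tgt_size=640, max_syn_num=1000)
syn_animecoco_dataset(
    bg_list, fg_info_list, dataset_save_dir='data/syn_coco',
    policy='val', max_syn_num=100,
    image_id_start=image_id, obj_id_start=obj_id)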
animeinsseg/data/transforms.py ADDED
@@ -0,0 +1,299 @@
1
+ import albumentations as A
2
+ from albumentations import DualIAATransform, to_tuple
3
+ import imgaug.augmenters as iaa
4
+ import cv2
5
+ from tqdm import tqdm
6
+ from sklearn.cluster import KMeans
7
+ from sklearn.metrics import pairwise_distances_argmin
8
+ from sklearn.utils import shuffle
9
+ import numpy as np
10
+
11
+ class IAAAffine2(DualIAATransform):
12
+ """Place a regular grid of points on the input and randomly move the neighbourhood of these point around
13
+ via affine transformations.
14
+ Note: This class introduce interpolation artifacts to mask if it has values other than {0;1}
15
+ Args:
16
+ p (float): probability of applying the transform. Default: 0.5.
17
+ Targets:
18
+ image, mask
19
+ """
20
+
21
+ def __init__(
22
+ self,
23
+ scale=(0.7, 1.3),
24
+ translate_percent=None,
25
+ translate_px=None,
26
+ rotate=0.0,
27
+ shear=(-0.1, 0.1),
28
+ order=1,
29
+ cval=0,
30
+ mode="reflect",
31
+ always_apply=False,
32
+ p=0.5,
33
+ ):
34
+ super(IAAAffine2, self).__init__(always_apply, p)
35
+ self.scale = dict(x=scale, y=scale)
36
+ self.translate_percent = to_tuple(translate_percent, 0)
37
+ self.translate_px = to_tuple(translate_px, 0)
38
+ self.rotate = to_tuple(rotate)
39
+ self.shear = dict(x=shear, y=shear)
40
+ self.order = order
41
+ self.cval = cval
42
+ self.mode = mode
43
+
44
+ @property
45
+ def processor(self):
46
+ return iaa.Affine(
47
+ self.scale,
48
+ self.translate_percent,
49
+ self.translate_px,
50
+ self.rotate,
51
+ self.shear,
52
+ self.order,
53
+ self.cval,
54
+ self.mode,
55
+ )
56
+
57
+ def get_transform_init_args_names(self):
58
+ return ("scale", "translate_percent", "translate_px", "rotate", "shear", "order", "cval", "mode")
59
+
60
+
61
+ class IAAPerspective2(DualIAATransform):
62
+ """Perform a random four point perspective transform of the input.
63
+ Note: This class introduces interpolation artifacts to the mask if it has values other than {0;1}.
64
+ Args:
65
+ scale ((float, float)): standard deviation of the normal distributions. These are used to sample
66
+ the random distances of the subimage's corners from the full image's corners. Default: (0.05, 0.1).
67
+ p (float): probability of applying the transform. Default: 0.5.
68
+ Targets:
69
+ image, mask
70
+ """
71
+
72
+ def __init__(self, scale=(0.05, 0.1), keep_size=True, always_apply=False, p=0.5,
73
+ order=1, cval=0, mode="replicate"):
74
+ super(IAAPerspective2, self).__init__(always_apply, p)
75
+ self.scale = to_tuple(scale, 1.0)
76
+ self.keep_size = keep_size
77
+ self.cval = cval
78
+ self.mode = mode
79
+
80
+ @property
81
+ def processor(self):
82
+ return iaa.PerspectiveTransform(self.scale, keep_size=self.keep_size, mode=self.mode, cval=self.cval)
83
+
84
+ def get_transform_init_args_names(self):
85
+ return ("scale", "keep_size")
86
+
87
+
88
+ def get_bg_transforms(transform_variant, out_size):
89
+ max_size = int(out_size * 1.2)
90
+ if transform_variant == 'train':
91
+ transform = [
92
+ A.SmallestMaxSize(max_size, always_apply=True, interpolation=cv2.INTER_AREA),
93
+ A.RandomResizedCrop(out_size, out_size, scale=(0.9, 1.5), p=1, ratio=(0.9, 1.1)),
94
+ ]
95
+ else:
96
+ transform = [
97
+ A.SmallestMaxSize(out_size, always_apply=True),
98
+ A.RandomCrop(out_size, out_size, True),
99
+ ]
100
+ return A.Compose(transform)
101
+
102
+
103
+ def get_fg_transforms(out_size, scale_limit=(-0.85, -0.3), transform_variant='train'):
104
+ if transform_variant == 'train':
105
+ transform = [
106
+ A.LongestMaxSize(out_size),
107
+ A.RandomScale(scale_limit=scale_limit, always_apply=True, interpolation=cv2.INTER_AREA),
108
+ IAAAffine2(scale=(1, 1),
109
+ rotate=(-15, 15),
110
+ shear=(-0.1, 0.1), p=0.3, mode='constant'),
111
+ IAAPerspective2(scale=(0.0, 0.06), p=0.3, mode='constant'),
112
+ A.HorizontalFlip(),
113
+ A.ElasticTransform(alpha=0.3, sigma=15, alpha_affine=15, border_mode=cv2.BORDER_CONSTANT, p=0.3),
114
+ A.GridDistortion(border_mode=cv2.BORDER_CONSTANT, p=0.3)
115
+ ]
116
+ elif transform_variant == 'distort_only':
117
+ transform = [
118
+ IAAAffine2(scale=(1, 1),
119
+ shear=(-0.1, 0.1), p=0.3, mode='constant'),
120
+ IAAPerspective2(scale=(0.0, 0.06), p=0.3, mode='constant'),
121
+ A.HorizontalFlip(),
122
+ A.ElasticTransform(alpha=0.3, sigma=15, alpha_affine=15, border_mode=cv2.BORDER_CONSTANT, p=0.3),
123
+ A.GridDistortion(border_mode=cv2.BORDER_CONSTANT, p=0.3)
124
+ ]
125
+ else:
126
+ transform = [
127
+ A.LongestMaxSize(out_size),
128
+ A.RandomScale(scale_limit=scale_limit, always_apply=True, interpolation=cv2.INTER_LINEAR)
129
+ ]
130
+ return A.Compose(transform)
131
+
132
+
133
+ def get_transforms(transform_variant, out_size, to_float=True):
134
+ if transform_variant == 'distortions':
135
+ transform = [
136
+ IAAAffine2(scale=(1, 1.3),
137
+ rotate=(-20, 20),
138
+ shear=(-0.1, 0.1), p=1, mode='constant'),
139
+ IAAPerspective2(scale=(0.0, 0.06), p=0.3, mode='constant'),
140
+ A.OpticalDistortion(),
141
+ A.HorizontalFlip(),
142
+ A.Sharpen(p=0.3),
143
+ A.CLAHE(),
144
+ A.GaussNoise(p=0.3),
145
+ A.Posterize(),
146
+ A.ElasticTransform(alpha=0.3, sigma=15, alpha_affine=15, border_mode=cv2.BORDER_CONSTANT),
147
+ ]
148
+ elif transform_variant == 'default':
149
+ transform = [
150
+ A.HorizontalFlip(),
151
+ A.Rotate(20, p=0.3)
152
+ ]
153
+ elif transform_variant == 'identity':
154
+ transform = []
155
+ else:
156
+ raise ValueError(f'Unexpected transform_variant {transform_variant}')
157
+ if to_float:
158
+ transform.append(A.ToFloat())
159
+ return A.Compose(transform)
160
+
161
+
162
+ def get_template_transforms(transform_variant, out_size, to_float=True):
163
+ if transform_variant == 'distortions':
164
+ transform = [
165
+ A.Cutout(p=0.3, max_w_size=30, max_h_size=30, num_holes=1),
166
+ IAAAffine2(scale=(1, 1.3),
167
+ rotate=(-20, 20),
168
+ shear=(-0.1, 0.1), p=1, mode='constant'),
169
+ IAAPerspective2(scale=(0.0, 0.06), p=0.3, mode='constant'),
170
+ A.OpticalDistortion(),
171
+ A.HorizontalFlip(),
172
+ A.Sharpen(p=0.3),
173
+ A.CLAHE(),
174
+ A.GaussNoise(p=0.3),
175
+ A.Posterize(),
176
+ A.ElasticTransform(alpha=0.3, sigma=15, alpha_affine=15, border_mode=cv2.BORDER_CONSTANT),
177
+ ]
178
+ elif transform_variant == 'identity':
179
+ transform = []
180
+ else:
181
+ raise ValueError(f'Unexpected transform_variant {transform_variant}')
182
+ if to_float:
183
+ transform.append(A.ToFloat())
184
+ return A.Compose(transform)
185
+
186
+
187
+ def rotate_image(mat: np.ndarray, angle: float, alpha_crop: bool = False) -> np.ndarray:
188
+ """
189
+ Rotates an image (angle in degrees) and expands image to avoid cropping
190
+ # https://stackoverflow.com/questions/43892506/opencv-python-rotate-image-without-cropping-sides
191
+ """
192
+
193
+ height, width = mat.shape[:2] # image shape has 3 dimensions
194
+ image_center = (width/2, height/2) # getRotationMatrix2D needs coordinates in reverse order (width, height) compared to shape
195
+
196
+ rotation_mat = cv2.getRotationMatrix2D(image_center, angle, 1.)
197
+
198
+ # rotation calculates the cos and sin, taking absolutes of those.
199
+ abs_cos = abs(rotation_mat[0,0])
200
+ abs_sin = abs(rotation_mat[0,1])
201
+
202
+ # find the new width and height bounds
203
+ bound_w = int(height * abs_sin + width * abs_cos)
204
+ bound_h = int(height * abs_cos + width * abs_sin)
205
+
206
+ # subtract the old image center (bringing the image back to the origin) and add the new image center coordinates
207
+ rotation_mat[0, 2] += bound_w/2 - image_center[0]
208
+ rotation_mat[1, 2] += bound_h/2 - image_center[1]
209
+
210
+ # rotate image with the new bounds and translated rotation matrix
211
+ rotated_mat = cv2.warpAffine(mat, rotation_mat, (bound_w, bound_h))
212
+
213
+ if alpha_crop and len(rotated_mat.shape) == 3 and rotated_mat.shape[-1] == 4:
214
+ x, y, w, h = cv2.boundingRect(rotated_mat[..., -1])
215
+ rotated_mat = rotated_mat[y: y+h, x: x+w]
216
+
217
+ return rotated_mat
218
+
219
+
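As a quick check of the expansion above: for a rotation by angle theta, bound_w and bound_h come out as w*|cos(theta)| + h*|sin(theta)| and w*|sin(theta)| + h*|cos(theta)|, so nothing is clipped. A small sketch on a synthetic RGBA canvas:

# Sketch only: a 200x100 (w x h) RGBA canvas rotated by 90 degrees should come
# back as roughly 100x200; alpha_crop then trims the transparent border down
# to the opaque region.
import numpy as np

rgba = np.zeros((100, 200, 4), dtype=np.uint8)
rgba[20:80, 50:150] = 255
print(rotate_image(rgba, 90).shape)                   # ~ (200, 100, 4)
print(rotate_image(rgba, 90, alpha_crop=True).shape)  # tight box around the alpha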
220
+ def recreate_image(codebook, labels, w, h):
221
+ """Recreate the (compressed) image from the code book & labels"""
222
+ return (codebook[labels].reshape(w, h, -1) * 255).astype(np.uint8)
223
+
224
+ def quantize_image(image: np.ndarray, n_colors: int, method='kmeans', mask=None):
225
+ # https://scikit-learn.org/stable/auto_examples/cluster/plot_color_quantization.html
226
+ image = np.array(image, dtype=np.float64) / 255
227
+
228
+ if len(image.shape) == 3:
229
+ w, h, d = tuple(image.shape)
230
+ else:
231
+ w, h = image.shape
232
+ d = 1
233
+
234
+ # assert d == 3
235
+ image_array = image.reshape(-1, d)
236
+
237
+ if method == 'kmeans':
238
+
239
+ image_array_sample = None
240
+ if mask is not None:
241
+ ids = np.where(mask)
242
+ if len(ids[0]) > 10:
243
+ bg = image[ids][::2]
244
+ fg = image[np.where(mask == 0)]
245
+ max_bg_num = int(fg.shape[0] * 1.5)
246
+ if bg.shape[0] > max_bg_num:
247
+ bg = shuffle(bg, random_state=0, n_samples=max_bg_num)
248
+ image_array_sample = np.concatenate((fg, bg), axis=0)
249
+ if image_array_sample.shape[0] > 2048:
250
+ image_array_sample = shuffle(image_array_sample, random_state=0, n_samples=2048)
251
+ else:
252
+ image_array_sample = None
253
+
254
+ if image_array_sample is None:
255
+ image_array_sample = shuffle(image_array, random_state=0, n_samples=2048)
256
+
257
+ kmeans = KMeans(n_clusters=n_colors, n_init=10, random_state=0).fit(
258
+ image_array_sample
259
+ )
260
+
261
+ labels = kmeans.predict(image_array)
262
+ quantized = recreate_image(kmeans.cluster_centers_, labels, w, h)
263
+ return quantized, kmeans.cluster_centers_, labels
264
+
265
+ else:
266
+
267
+ codebook_random = shuffle(image_array, random_state=0, n_samples=n_colors)
268
+ labels_random = pairwise_distances_argmin(codebook_random, image_array, axis=0)
269
+
270
+ return [recreate_image(codebook_random, labels_random, w, h)]
271
+
272
+
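quantize_image fits KMeans on a pixel subsample (biased towards the unmasked region when a mask is supplied) and then snaps every pixel to its nearest cluster centre. A minimal sketch on a random image:

# Sketch only: reduce a random RGB image to 16 colours.
import numpy as np

img = np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8)
quantized, centers, labels = quantize_image(img, 16, method='kmeans')
print(quantized.shape, centers.shape)        # (64, 64, 3) and (16, 3)
print(len(np.unique(labels)) <= 16)          # at most 16 distinct colours remain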
273
+ def resize2height(img: np.ndarray, height: int):
274
+ im_h, im_w = img.shape[:2]
275
+ if im_h > height:
276
+ interpolation = cv2.INTER_AREA
277
+ else:
278
+ interpolation = cv2.INTER_LINEAR
279
+ if im_h != height:
280
+ img = cv2.resize(img, (int(height / im_h * im_w), height), interpolation=interpolation)
281
+ return img
282
+
283
+ if __name__ == '__main__':
284
+ import os.path as osp
285
+
286
+ img_path = r'tmp\megumin.png'
287
+ save_dir = r'tmp'
288
+ sample_num = 24
289
+
290
+ tv = 'distortions'
291
+ out_size = 224
292
+ transforms = get_transforms(tv, out_size ,to_float=False)
293
+ img = cv2.imread(img_path)
294
+ for idx in tqdm(range(sample_num)):
295
+ transformed = transforms(image=img)['image']
296
+ print(transformed.shape)
297
+ cv2.imwrite(osp.join(save_dir, str(idx)+'-transform.jpg'), transformed)
298
+ # cv2.waitKey(0)
299
+ pass
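The factories above return plain albumentations Compose pipelines, so a single foreground/background pair can be augmented directly on in-memory arrays. A sketch, assuming a hypothetical RGBA cutout and background image on disk:

# Sketch only: file paths are assumptions; fg is expected to keep its alpha channel.
fg = cv2.imread('tmp/fg.png', cv2.IMREAD_UNCHANGED)
bg = cv2.imread('tmp/bg.jpg')

fg_aug = get_fg_transforms(640, transform_variant='train')(image=fg)['image']
bg_aug = get_bg_transforms('train', 640)(image=bg)['image']
print(fg_aug.shape, bg_aug.shape)   # bg_aug is always cropped to 640x640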
animeinsseg/inpainting/__init__.py ADDED
File without changes
animeinsseg/inpainting/ldm_inpaint.py ADDED
@@ -0,0 +1,353 @@
1
+ import torch
2
+ import numpy as np
3
+ from tqdm import tqdm
4
+ from omegaconf import OmegaConf
5
+ import safetensors
6
+ import os
7
+ import einops
8
+ import cv2
9
+ from PIL import Image, ImageFilter, ImageOps
10
+ from utils.io_utils import resize_pad2divisior
11
+ import os
12
+ from utils.io_utils import submit_request, img2b64
13
+ import json
14
+ # Debug by Francis
15
+ # from ldm.util import instantiate_from_config
16
+ # from ldm.models.diffusion.ddpm import LatentDiffusion
17
+ # from ldm.models.diffusion.ddim import DDIMSampler
18
+ # from ldm.modules.diffusionmodules.util import noise_like
19
+ import io
20
+ import base64
21
+ from requests.auth import HTTPBasicAuth
22
+
23
+ # Debug by Francis
24
+ # def create_model(config_path):
25
+ # config = OmegaConf.load(config_path)
26
+ # model = instantiate_from_config(config.model).cpu()
27
+ # return model
28
+ #
29
+ # def get_state_dict(d):
30
+ # return d.get('state_dict', d)
31
+ #
32
+ # def load_state_dict(ckpt_path, location='cpu'):
33
+ # _, extension = os.path.splitext(ckpt_path)
34
+ # if extension.lower() == ".safetensors":
35
+ # import safetensors.torch
36
+ # state_dict = safetensors.torch.load_file(ckpt_path, device=location)
37
+ # else:
38
+ # state_dict = get_state_dict(torch.load(ckpt_path, map_location=torch.device(location)))
39
+ # state_dict = get_state_dict(state_dict)
40
+ # return state_dict
41
+ #
42
+ #
43
+ # def load_ldm_sd(model, path) :
44
+ # if path.endswith('.safetensor') :
45
+ # sd = safetensors.torch.load_file(path)
46
+ # else :
47
+ # sd = load_state_dict(path)
48
+ # model.load_state_dict(sd, strict = False)
49
+ #
50
+ # def fill_mask_input(image, mask):
51
+ # """fills masked regions with colors from image using blur. Not extremely effective."""
52
+ #
53
+ # image_mod = Image.new('RGBA', (image.width, image.height))
54
+ #
55
+ # image_masked = Image.new('RGBa', (image.width, image.height))
56
+ # image_masked.paste(image.convert("RGBA").convert("RGBa"), mask=ImageOps.invert(mask.convert('L')))
57
+ #
58
+ # image_masked = image_masked.convert('RGBa')
59
+ #
60
+ # for radius, repeats in [(256, 1), (64, 1), (16, 2), (4, 4), (2, 2), (0, 1)]:
61
+ # blurred = image_masked.filter(ImageFilter.GaussianBlur(radius)).convert('RGBA')
62
+ # for _ in range(repeats):
63
+ # image_mod.alpha_composite(blurred)
64
+ #
65
+ # return image_mod.convert("RGB")
66
+ #
67
+ #
68
+ # def get_inpainting_image_condition(model, image, mask) :
69
+ # conditioning_mask = np.array(mask.convert("L"))
70
+ # conditioning_mask = conditioning_mask.astype(np.float32) / 255.0
71
+ # conditioning_mask = torch.from_numpy(conditioning_mask[None, None])
72
+ # conditioning_mask = torch.round(conditioning_mask)
73
+ # conditioning_mask = conditioning_mask.to(device=image.device, dtype=image.dtype)
74
+ # conditioning_image = torch.lerp(
75
+ # image,
76
+ # image * (1.0 - conditioning_mask),
77
+ # 1
78
+ # )
79
+ # conditioning_image = model.get_first_stage_encoding(model.encode_first_stage(conditioning_image))
80
+ # conditioning_mask = torch.nn.functional.interpolate(conditioning_mask, size=conditioning_image.shape[-2:])
81
+ # conditioning_mask = conditioning_mask.expand(conditioning_image.shape[0], -1, -1, -1)
82
+ # image_conditioning = torch.cat([conditioning_mask, conditioning_image], dim=1)
83
+ # return image_conditioning
84
+ #
85
+ #
86
+ # class GuidedLDM(LatentDiffusion):
87
+ # def __init__(self, *args, **kwargs):
88
+ # super().__init__(*args, **kwargs)
89
+ #
90
+ # @torch.no_grad()
91
+ # def img2img_inpaint(
92
+ # self,
93
+ # image: Image.Image,
94
+ # c_text: str,
95
+ # uc_text: str,
96
+ # mask: Image.Image,
97
+ # ddim_steps = 50,
98
+ # mask_blur: int = 0,
99
+ # use_cuda: bool = True,
100
+ # **kwargs) -> Image.Image :
101
+ # ddim_sampler = GuidedDDIMSample(self)
102
+ # if use_cuda :
103
+ # self.cond_stage_model.cuda()
104
+ # self.first_stage_model.cuda()
105
+ # c_text = self.get_learned_conditioning([c_text])
106
+ # uc_text = self.get_learned_conditioning([uc_text])
107
+ # cond = {"c_crossattn": [c_text]}
108
+ # uc_cond = {"c_crossattn": [uc_text]}
109
+ #
110
+ # if use_cuda :
111
+ # device = torch.device('cuda:0')
112
+ # else :
113
+ # device = torch.device('cpu')
114
+ #
115
+ # image_mask = mask
116
+ # image_mask = image_mask.convert('L')
117
+ # image_mask = image_mask.filter(ImageFilter.GaussianBlur(mask_blur))
118
+ # latent_mask = image_mask
119
+ # # image = fill_mask_input(image, latent_mask)
120
+ # # image.save('image_fill.png')
121
+ # image = np.array(image).astype(np.float32) / 127.5 - 1.0
122
+ # image = np.moveaxis(image, 2, 0)
123
+ # image = torch.from_numpy(image).to(device)[None]
124
+ # init_latent = self.get_first_stage_encoding(self.encode_first_stage(image))
125
+ # init_mask = latent_mask
126
+ # latmask = init_mask.convert('RGB').resize((init_latent.shape[3], init_latent.shape[2]))
127
+ # latmask = np.moveaxis(np.array(latmask, dtype=np.float32), 2, 0) / 255
128
+ # latmask = latmask[0]
129
+ # latmask = np.around(latmask)
130
+ # latmask = np.tile(latmask[None], (4, 1, 1))
131
+ # nmask = torch.asarray(latmask).to(init_latent.device).float()
132
+ # init_latent = (1 - nmask) * init_latent + nmask * torch.randn_like(init_latent)
133
+ #
134
+ # denoising_strength = 1
135
+ # if self.model.conditioning_key == 'hybrid' :
136
+ # image_cdt = get_inpainting_image_condition(self, image, image_mask)
137
+ # cond["c_concat"] = [image_cdt]
138
+ # uc_cond["c_concat"] = [image_cdt]
139
+ #
140
+ # steps = ddim_steps
141
+ # t_enc = int(min(denoising_strength, 0.999) * steps)
142
+ # eta = 0
143
+ #
144
+ # noise = torch.randn_like(init_latent)
145
+ # ddim_sampler.make_schedule(ddim_num_steps=steps, ddim_eta=eta, ddim_discretize="uniform", verbose=False)
146
+ # x1 = ddim_sampler.stochastic_encode(init_latent, torch.tensor([t_enc] * int(init_latent.shape[0])).to(device), noise=noise)
147
+ #
148
+ # if use_cuda :
149
+ # self.cond_stage_model.cpu()
150
+ # self.first_stage_model.cpu()
151
+ #
152
+ # if use_cuda :
153
+ # self.model.cuda()
154
+ # decoded = ddim_sampler.decode(x1, cond,t_enc,init_latent=init_latent,nmask=nmask,unconditional_guidance_scale=7,unconditional_conditioning=uc_cond)
155
+ # if use_cuda :
156
+ # self.model.cpu()
157
+ #
158
+ # if mask is not None :
159
+ # decoded = init_latent * (1 - nmask) + decoded * nmask
160
+ #
161
+ # if use_cuda :
162
+ # self.first_stage_model.cuda()
163
+ # with torch.cuda.amp.autocast(enabled=False):
164
+ # x_samples = self.decode_first_stage(decoded.to(torch.float32))
165
+ # if use_cuda :
166
+ # self.first_stage_model.cpu()
167
+ # return torch.clip(x_samples, -1, 1)
168
+ #
169
+ #
170
+ #
171
+ # class GuidedDDIMSample(DDIMSampler) :
172
+ # def __init__(self, *args, **kwargs):
173
+ # super().__init__(*args, **kwargs)
174
+ #
175
+ # @torch.no_grad()
176
+ # def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
177
+ # temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
178
+ # unconditional_guidance_scale=1., unconditional_conditioning=None,
179
+ # dynamic_threshold=None):
180
+ # b, *_, device = *x.shape, x.device
181
+ #
182
+ # if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
183
+ # model_output = self.model.apply_model(x, t, c)
184
+ # else:
185
+ # x_in = torch.cat([x] * 2)
186
+ # t_in = torch.cat([t] * 2)
187
+ # if isinstance(c, dict):
188
+ # assert isinstance(unconditional_conditioning, dict)
189
+ # c_in = dict()
190
+ # for k in c:
191
+ # if isinstance(c[k], list):
192
+ # c_in[k] = [torch.cat([
193
+ # unconditional_conditioning[k][i],
194
+ # c[k][i]]) for i in range(len(c[k]))]
195
+ # else:
196
+ # c_in[k] = torch.cat([
197
+ # unconditional_conditioning[k],
198
+ # c[k]])
199
+ # elif isinstance(c, list):
200
+ # c_in = list()
201
+ # assert isinstance(unconditional_conditioning, list)
202
+ # for i in range(len(c)):
203
+ # c_in.append(torch.cat([unconditional_conditioning[i], c[i]]))
204
+ # else:
205
+ # c_in = torch.cat([unconditional_conditioning, c])
206
+ # model_uncond, model_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
207
+ # model_output = model_uncond + unconditional_guidance_scale * (model_t - model_uncond)
208
+ #
209
+ # e_t = model_output
210
+ #
211
+ # alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
212
+ # alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
213
+ # sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
214
+ # sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
215
+ # # select parameters corresponding to the currently considered timestep
216
+ # a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
217
+ # a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
218
+ # sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
219
+ # sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
220
+ #
221
+ # # current prediction for x_0
222
+ # pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
223
+ #
224
+ # # direction pointing to x_t
225
+ # dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
226
+ # noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
227
+ # if noise_dropout > 0.:
228
+ # noise = torch.nn.functional.dropout(noise, p=noise_dropout)
229
+ # x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
230
+ # return x_prev, pred_x0
231
+ #
232
+ # @torch.no_grad()
233
+ # def decode(self, x_latent, cond, t_start, init_latent=None, nmask=None, unconditional_guidance_scale=1.0, unconditional_conditioning=None,
234
+ # use_original_steps=False, callback=None):
235
+ #
236
+ # timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps
237
+ # total_steps = len(timesteps)
238
+ # timesteps = timesteps[:t_start]
239
+ #
240
+ # time_range = np.flip(timesteps)
241
+ # total_steps = timesteps.shape[0]
242
+ # print(f"Running Guided DDIM Sampling with {len(timesteps)} timesteps, t_start={t_start}")
243
+ # iterator = tqdm(time_range, desc='Decoding image', total=total_steps)
244
+ # x_dec = x_latent
245
+ # for i, step in enumerate(iterator):
246
+ # p = (i + (total_steps - t_start) + 1) / (total_steps)
247
+ # index = total_steps - i - 1
248
+ # ts = torch.full((x_latent.shape[0],), step, device=x_latent.device, dtype=torch.long)
249
+ # if nmask is not None :
250
+ # noised_input = self.model.q_sample(init_latent.to(x_latent.device), ts.to(x_latent.device))
251
+ # x_dec = (1 - nmask) * noised_input + nmask * x_dec
252
+ # x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
253
+ # unconditional_guidance_scale=unconditional_guidance_scale,
254
+ # unconditional_conditioning=unconditional_conditioning)
255
+ # if callback: callback(i)
256
+ # return x_dec
257
+ #
258
+ #
259
+ # def ldm_inpaint(model, img, mask, inpaint_size=720, pos_prompt='', neg_prompt = '', use_cuda=True):
260
+ # img_original = np.copy(img)
261
+ # im_h, im_w = img.shape[:2]
262
+ # img_resized, (pad_h, pad_w) = resize_pad2divisior(img, inpaint_size)
263
+ #
264
+ # mask_original = np.copy(mask)
265
+ # mask_original[mask_original < 127] = 0
266
+ # mask_original[mask_original >= 127] = 1
267
+ # mask_original = mask_original[:, :, None]
268
+ # mask, _ = resize_pad2divisior(mask, inpaint_size)
269
+ #
270
+ # # cv2.imwrite('img_resized.png', img_resized)
271
+ # # cv2.imwrite('mask_resized.png', mask)
272
+ #
273
+ #
274
+ # if use_cuda :
275
+ # with torch.autocast(enabled = True, device_type = 'cuda') :
276
+ # img = model.img2img_inpaint(
277
+ # image = Image.fromarray(img_resized),
278
+ # c_text = pos_prompt,
279
+ # uc_text = neg_prompt,
280
+ # mask = Image.fromarray(mask),
281
+ # use_cuda = True
282
+ # )
283
+ # else :
284
+ # img = model.img2img_inpaint(
285
+ # image = Image.fromarray(img_resized),
286
+ # c_text = pos_prompt,
287
+ # uc_text = neg_prompt,
288
+ # mask = Image.fromarray(mask),
289
+ # use_cuda = False
290
+ # )
291
+ #
292
+ # img_inpainted = (einops.rearrange(img, '1 c h w -> h w c').cpu().numpy() * 127.5 + 127.5).astype(np.uint8)
293
+ # if pad_h != 0:
294
+ # img_inpainted = img_inpainted[:-pad_h]
295
+ # if pad_w != 0:
296
+ # img_inpainted = img_inpainted[:, :-pad_w]
297
+ #
298
+ #
299
+ # if img_inpainted.shape[0] != im_h or img_inpainted.shape[1] != im_w:
300
+ # img_inpainted = cv2.resize(img_inpainted, (im_w, im_h), interpolation = cv2.INTER_LINEAR)
301
+ # ans = img_inpainted * mask_original + img_original * (1 - mask_original)
302
+ # ans = img_inpainted
303
+ # return ans
304
+
305
+
306
+
307
+
308
+ import requests
309
+ from PIL import Image
310
+ def ldm_inpaint_webui(
311
+ img, mask, resolution: int, url: str, prompt: str = '', neg_prompt: str = '',
312
+ **inpaint_ldm_options):
313
+ if isinstance(img, np.ndarray):
314
+ img = Image.fromarray(img)
315
+
316
+ im_h, im_w = img.height, img.width
317
+
318
+ if img.height > img.width:
319
+ W = resolution
320
+ H = (img.height / img.width * resolution) // 32 * 32
321
+ H = int(H)
322
+ else:
323
+ H = resolution
324
+ W = (img.width / img.height * resolution) // 32 * 32
325
+ W = int(W)
326
+
327
+ auth = None
328
+ if 'username' in inpaint_ldm_options:
329
+ username = inpaint_ldm_options.pop('username')
330
+ password = inpaint_ldm_options.pop('password')
331
+ auth = HTTPBasicAuth(username, password)
332
+
333
+ img_b64 = img2b64(img)
334
+ mask_b64 = img2b64(mask)
335
+ data = {
336
+ "init_images": [img_b64],
337
+ "mask": mask_b64,
338
+ "prompt": prompt,
339
+ "negative_prompt": neg_prompt,
340
+ "width": W,
341
+ "height": H,
342
+ **inpaint_ldm_options,
343
+ }
344
+ data = json.dumps(data)
345
+
346
+ response = submit_request(url, data, auth=auth)
347
+
348
+ inpainted_b64 = response.json()['images'][0]
349
+ inpainted = Image.open(io.BytesIO(base64.b64decode(inpainted_b64)))
350
+ if inpainted.height != im_h or inpainted.width != im_w:
351
+ inpainted = inpainted.resize((im_w, im_h), resample=Image.Resampling.LANCZOS)
352
+ inpainted = np.array(inpainted)
353
+ return inpainted
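ldm_inpaint_webui only handles base64 packing and resolution rounding; the actual diffusion runs in an external Stable Diffusion webui instance. A hedged call sketch (the URL points at the usual AUTOMATIC1111 img2img route, and the endpoint, file paths and extra options are assumptions):

# Sketch only: inpaint a rectangular region through a locally running webui.
import cv2
import numpy as np

img = cv2.cvtColor(cv2.imread('tmp/panel.jpg'), cv2.COLOR_BGR2RGB)
mask = np.zeros(img.shape[:2], dtype=np.uint8)
mask[100:300, 100:300] = 255   # white marks the region to repaint

result = ldm_inpaint_webui(
    img, Image.fromarray(mask), resolution=640,
    url='http://127.0.0.1:7860/sdapi/v1/img2img',
    prompt='background, no humans', neg_prompt='lowres',
    denoising_strength=0.75, steps=28)
cv2.imwrite('tmp/panel_inpainted.png', cv2.cvtColor(result, cv2.COLOR_RGB2BGR))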
animeinsseg/inpainting/patch_match.py ADDED
@@ -0,0 +1,203 @@
1
+ #! /usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # File : patch_match.py
4
+ # Author : Jiayuan Mao
5
+ # Email : [email protected]
6
+ # Date : 01/09/2020
7
+ #
8
+ # Distributed under terms of the MIT license.
9
+
10
+ import ctypes, os
11
+ import os.path as osp
12
+ from typing import Optional, Union
13
+
14
+ import numpy as np
15
+ from PIL import Image
16
+
17
+ # try:
18
+ # # If the Jacinle library (https://github.com/vacancy/Jacinle) is present, use its auto_travis feature.
19
+ # from jacinle.jit.cext import auto_travis
20
+ # auto_travis(__file__, required_files=['*.so'])
21
+ # except ImportError as e:
22
+ # # Otherwise, fall back to the subprocess.
23
+ # import subprocess
24
+ # print('Compiling and loading c extensions from "{}".'.format(osp.realpath(osp.dirname(__file__))))
25
+ # subprocess.check_call(['./travis.sh'], cwd=osp.dirname(__file__))
26
+
27
+
28
+ __all__ = ['set_random_seed', 'set_verbose', 'inpaint', 'inpaint_regularity']
29
+
30
+
31
+ class CShapeT(ctypes.Structure):
32
+ _fields_ = [
33
+ ('width', ctypes.c_int),
34
+ ('height', ctypes.c_int),
35
+ ('channels', ctypes.c_int),
36
+ ]
37
+
38
+ class CMatT(ctypes.Structure):
39
+ _fields_ = [
40
+ ('data_ptr', ctypes.c_void_p),
41
+ ('shape', CShapeT),
42
+ ('dtype', ctypes.c_int)
43
+ ]
44
+
45
+ import sys
46
+ if sys.platform == 'linux':
47
+ PMLIB = ctypes.CDLL('data/libs/libpatchmatch_inpaint.so')
48
+ else:
49
+ PMLIB = ctypes.CDLL('data/libs/libpatchmatch.dll')
50
+
51
+ PMLIB.PM_set_random_seed.argtypes = [ctypes.c_uint]
52
+ PMLIB.PM_set_verbose.argtypes = [ctypes.c_int]
53
+ PMLIB.PM_free_pymat.argtypes = [CMatT]
54
+ PMLIB.PM_inpaint.argtypes = [CMatT, CMatT, ctypes.c_int]
55
+ PMLIB.PM_inpaint.restype = CMatT
56
+ PMLIB.PM_inpaint_regularity.argtypes = [CMatT, CMatT, CMatT, ctypes.c_int, ctypes.c_float]
57
+ PMLIB.PM_inpaint_regularity.restype = CMatT
58
+ PMLIB.PM_inpaint2.argtypes = [CMatT, CMatT, CMatT, ctypes.c_int]
59
+ PMLIB.PM_inpaint2.restype = CMatT
60
+ PMLIB.PM_inpaint2_regularity.argtypes = [CMatT, CMatT, CMatT, CMatT, ctypes.c_int, ctypes.c_float]
61
+ PMLIB.PM_inpaint2_regularity.restype = CMatT
62
+
63
+
64
+ def set_random_seed(seed: int):
65
+ PMLIB.PM_set_random_seed(ctypes.c_uint(seed))
66
+
67
+
68
+ def set_verbose(verbose: bool):
69
+ PMLIB.PM_set_verbose(ctypes.c_int(verbose))
70
+
71
+
72
+ def inpaint(
73
+ image: Union[np.ndarray, Image.Image],
74
+ mask: Optional[Union[np.ndarray, Image.Image]] = None,
75
+ *,
76
+ global_mask: Optional[Union[np.ndarray, Image.Image]] = None,
77
+ patch_size: int = 15
78
+ ) -> np.ndarray:
79
+ """
80
+ PatchMatch based inpainting proposed in:
81
+
82
+ PatchMatch : A Randomized Correspondence Algorithm for Structural Image Editing
83
+ C.Barnes, E.Shechtman, A.Finkelstein and Dan B.Goldman
84
+ SIGGRAPH 2009
85
+
86
+ Args:
87
+ image (Union[np.ndarray, Image.Image]): the input image, should be 3-channel RGB/BGR.
88
+ mask (Union[np.array, Image.Image], optional): the mask of the hole(s) to be filled, should be 1-channel.
89
+ If not provided (None), the algorithm will treat all purely white pixels as the holes (255, 255, 255).
90
+ global_mask (Union[np.array, Image.Image], optional): the target mask of the output image.
91
+ patch_size (int): the patch size for the inpainting algorithm.
92
+
93
+ Return:
94
+ result (np.ndarray): the repaired image, of the same size as the input image.
95
+ """
96
+
97
+ if isinstance(image, Image.Image):
98
+ image = np.array(image)
99
+ image = np.ascontiguousarray(image)
100
+ assert image.ndim == 3 and image.shape[2] == 3 and image.dtype == 'uint8'
101
+
102
+ if mask is None:
103
+ mask = (image == (255, 255, 255)).all(axis=2, keepdims=True).astype('uint8')
104
+ mask = np.ascontiguousarray(mask)
105
+ else:
106
+ mask = _canonize_mask_array(mask)
107
+
108
+ if global_mask is None:
109
+ ret_pymat = PMLIB.PM_inpaint(np_to_pymat(image), np_to_pymat(mask), ctypes.c_int(patch_size))
110
+ else:
111
+ global_mask = _canonize_mask_array(global_mask)
112
+ ret_pymat = PMLIB.PM_inpaint2(np_to_pymat(image), np_to_pymat(mask), np_to_pymat(global_mask), ctypes.c_int(patch_size))
113
+
114
+ ret_npmat = pymat_to_np(ret_pymat)
115
+ PMLIB.PM_free_pymat(ret_pymat)
116
+
117
+ return ret_npmat
118
+
119
+
120
+ def inpaint_regularity(
121
+ image: Union[np.ndarray, Image.Image],
122
+ mask: Optional[Union[np.ndarray, Image.Image]],
123
+ ijmap: np.ndarray,
124
+ *,
125
+ global_mask: Optional[Union[np.ndarray, Image.Image]] = None,
126
+ patch_size: int = 15, guide_weight: float = 0.25
127
+ ) -> np.ndarray:
128
+ if isinstance(image, Image.Image):
129
+ image = np.array(image)
130
+ image = np.ascontiguousarray(image)
131
+
132
+ assert isinstance(ijmap, np.ndarray) and ijmap.ndim == 3 and ijmap.shape[2] == 3 and ijmap.dtype == 'float32'
133
+ ijmap = np.ascontiguousarray(ijmap)
134
+
135
+ assert image.ndim == 3 and image.shape[2] == 3 and image.dtype == 'uint8'
136
+ if mask is None:
137
+ mask = (image == (255, 255, 255)).all(axis=2, keepdims=True).astype('uint8')
138
+ mask = np.ascontiguousarray(mask)
139
+ else:
140
+ mask = _canonize_mask_array(mask)
141
+
142
+
143
+ if global_mask is None:
144
+ ret_pymat = PMLIB.PM_inpaint_regularity(np_to_pymat(image), np_to_pymat(mask), np_to_pymat(ijmap), ctypes.c_int(patch_size), ctypes.c_float(guide_weight))
145
+ else:
146
+ global_mask = _canonize_mask_array(global_mask)
147
+ ret_pymat = PMLIB.PM_inpaint2_regularity(np_to_pymat(image), np_to_pymat(mask), np_to_pymat(global_mask), np_to_pymat(ijmap), ctypes.c_int(patch_size), ctypes.c_float(guide_weight))
148
+
149
+ ret_npmat = pymat_to_np(ret_pymat)
150
+ PMLIB.PM_free_pymat(ret_pymat)
151
+
152
+ return ret_npmat
153
+
154
+
155
+ def _canonize_mask_array(mask):
156
+ if isinstance(mask, Image.Image):
157
+ mask = np.array(mask)
158
+ if mask.ndim == 2 and mask.dtype == 'uint8':
159
+ mask = mask[..., np.newaxis]
160
+ assert mask.ndim == 3 and mask.shape[2] == 1 and mask.dtype == 'uint8'
161
+ return np.ascontiguousarray(mask)
162
+
163
+
164
+ dtype_pymat_to_ctypes = [
165
+ ctypes.c_uint8,
166
+ ctypes.c_int8,
167
+ ctypes.c_uint16,
168
+ ctypes.c_int16,
169
+ ctypes.c_int32,
170
+ ctypes.c_float,
171
+ ctypes.c_double,
172
+ ]
173
+
174
+
175
+ dtype_np_to_pymat = {
176
+ 'uint8': 0,
177
+ 'int8': 1,
178
+ 'uint16': 2,
179
+ 'int16': 3,
180
+ 'int32': 4,
181
+ 'float32': 5,
182
+ 'float64': 6,
183
+ }
184
+
185
+
186
+ def np_to_pymat(npmat):
187
+ assert npmat.ndim == 3
188
+ return CMatT(
189
+ ctypes.cast(npmat.ctypes.data, ctypes.c_void_p),
190
+ CShapeT(npmat.shape[1], npmat.shape[0], npmat.shape[2]),
191
+ dtype_np_to_pymat[str(npmat.dtype)]
192
+ )
193
+
194
+
195
+ def pymat_to_np(pymat):
196
+ npmat = np.ctypeslib.as_array(
197
+ ctypes.cast(pymat.data_ptr, ctypes.POINTER(dtype_pymat_to_ctypes[pymat.dtype])),
198
+ (pymat.shape.height, pymat.shape.width, pymat.shape.channels)
199
+ )
200
+ ret = np.empty(npmat.shape, npmat.dtype)
201
+ ret[:] = npmat
202
+ return ret
203
+
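A usage sketch for the PatchMatch bindings above; it assumes the shared library under data/libs/ has been built, and uses a hypothetical image path plus a hand-made hole mask:

# Sketch only: fill a rectangular hole with PatchMatch.
import cv2
import numpy as np

set_random_seed(42)
set_verbose(False)

image = cv2.imread('tmp/panel.jpg')              # 3-channel uint8 input
mask = np.zeros(image.shape[:2], dtype=np.uint8)
mask[200:260, 150:400] = 255                     # non-zero pixels are the hole

repaired = inpaint(image, mask, patch_size=15)
cv2.imwrite('tmp/panel_patchmatch.png', repaired)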
animeinsseg/models/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ import torch
2
+ import numpy as np
3
+ import cv2
4
+ from typing import Union
5
+
6
+
7
+
animeinsseg/models/animeseg_refine/__init__.py ADDED
@@ -0,0 +1,189 @@
1
+ # modified from https://github.com/SkyTNT/anime-segmentation/blob/main/train.py
2
+ import os
3
+
4
+ import argparse
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from pytorch_lightning import Trainer
8
+ from pytorch_lightning.callbacks import ModelCheckpoint
9
+ from torch.utils.data import Dataset, DataLoader
10
+ import torch.optim as optim
11
+ import numpy as np
12
+ import cv2
13
+ from torch.cuda import amp
14
+
15
+ from utils.constants import DEFAULT_DEVICE
16
+ # from data_loader import create_training_datasets
17
+
18
+
19
+ import pytorch_lightning as pl
20
+ import warnings
21
+
22
+ from .isnet import ISNetDIS, ISNetGTEncoder
23
+ from .u2net import U2NET, U2NET_full, U2NET_full2, U2NET_lite2
24
+ from .modnet import MODNet
25
+
26
+ # warnings.filterwarnings("ignore")
27
+
28
+ def get_net(net_name):
29
+ if net_name == "isnet":
30
+ return ISNetDIS()
31
+ elif net_name == "isnet_is":
32
+ return ISNetDIS()
33
+ elif net_name == "isnet_gt":
34
+ return ISNetGTEncoder()
35
+ elif net_name == "u2net":
36
+ return U2NET_full2()
37
+ elif net_name == "u2netl":
38
+ return U2NET_lite2()
39
+ elif net_name == "modnet":
40
+ return MODNet()
41
+ raise NotImplementedError
42
+
43
+
44
+ def f1_torch(pred, gt):
45
+ # micro F1-score
46
+ pred = pred.float().view(pred.shape[0], -1)
47
+ gt = gt.float().view(gt.shape[0], -1)
48
+ tp1 = torch.sum(pred * gt, dim=1)
49
+ tp_fp1 = torch.sum(pred, dim=1)
50
+ tp_fn1 = torch.sum(gt, dim=1)
51
+ pred = 1 - pred
52
+ gt = 1 - gt
53
+ tp2 = torch.sum(pred * gt, dim=1)
54
+ tp_fp2 = torch.sum(pred, dim=1)
55
+ tp_fn2 = torch.sum(gt, dim=1)
56
+ precision = (tp1 + tp2) / (tp_fp1 + tp_fp2 + 0.0001)
57
+ recall = (tp1 + tp2) / (tp_fn1 + tp_fn2 + 0.0001)
58
+ f1 = (1 + 0.3) * precision * recall / (0.3 * precision + recall + 0.0001)
59
+ return precision, recall, f1
60
+
61
+
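f1_torch folds foreground and background counts into one micro score per sample and uses a beta^2 of 0.3, so precision is weighted more heavily than recall. A quick sketch on dummy masks:

# Sketch only: per-sample precision/recall/F-score on random binary masks.
pred = (torch.rand(2, 1, 64, 64) > 0.5).float()
gt = (torch.rand(2, 1, 64, 64) > 0.5).float()
precision, recall, f1 = f1_torch(pred, gt)
print(precision.shape, f1.mean().item())   # one value per sample in the batch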
62
+ class AnimeSegmentation(pl.LightningModule):
63
+
64
+ def __init__(self, net_name):
65
+ super().__init__()
66
+ assert net_name in ["isnet_is", "isnet", "isnet_gt", "u2net", "u2netl", "modnet"]
67
+ self.net = get_net(net_name)
68
+ if net_name == "isnet_is":
69
+ self.gt_encoder = get_net("isnet_gt")
70
+ self.gt_encoder.requires_grad_(False)
71
+ else:
72
+ self.gt_encoder = None
73
+
74
+ @classmethod
75
+ def try_load(cls, net_name, ckpt_path, map_location=None):
76
+ state_dict = torch.load(ckpt_path, map_location=map_location)
77
+ if "epoch" in state_dict:
78
+ return cls.load_from_checkpoint(ckpt_path, net_name=net_name, map_location=map_location)
79
+ else:
80
+ model = cls(net_name)
81
+ if any([k.startswith("net.") for k, v in state_dict.items()]):
82
+ model.load_state_dict(state_dict)
83
+ else:
84
+ model.net.load_state_dict(state_dict)
85
+ return model
86
+
87
+ def configure_optimizers(self):
88
+ optimizer = optim.Adam(self.net.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
89
+ return optimizer
90
+
91
+ def forward(self, x):
92
+ if isinstance(self.net, ISNetDIS):
93
+ return self.net(x)[0][0].sigmoid()
94
+ if isinstance(self.net, ISNetGTEncoder):
95
+ return self.net(x)[0][0].sigmoid()
96
+ elif isinstance(self.net, U2NET):
97
+ return self.net(x)[0].sigmoid()
98
+ elif isinstance(self.net, MODNet):
99
+ return self.net(x, True)[2]
100
+ raise NotImplementedError
101
+
102
+ def training_step(self, batch, batch_idx):
103
+ images, labels = batch["image"], batch["label"]
104
+ if isinstance(self.net, ISNetDIS):
105
+ ds, dfs = self.net(images)
106
+ loss_args = [ds, dfs, labels]
107
+ elif isinstance(self.net, ISNetGTEncoder):
108
+ ds = self.net(labels)[0]
109
+ loss_args = [ds, labels]
110
+ elif isinstance(self.net, U2NET):
111
+ ds = self.net(images)
112
+ loss_args = [ds, labels]
113
+ elif isinstance(self.net, MODNet):
114
+ trimaps = batch["trimap"]
115
+ pred_semantic, pred_detail, pred_matte = self.net(images, False)
116
+ loss_args = [pred_semantic, pred_detail, pred_matte, images, trimaps, labels]
117
+ else:
118
+ raise NotImplementedError
119
+ if self.gt_encoder is not None:
120
+ fs = self.gt_encoder(labels)[1]
121
+ loss_args.append(fs)
122
+
123
+ loss0, loss = self.net.compute_loss(loss_args)
124
+ self.log_dict({"train/loss": loss, "train/loss_tar": loss0})
125
+ return loss
126
+
127
+ def validation_step(self, batch, batch_idx):
128
+ images, labels = batch["image"], batch["label"]
129
+ if isinstance(self.net, ISNetGTEncoder):
130
+ preds = self.forward(labels)
131
+ else:
132
+ preds = self.forward(images)
133
+ pre, rec, f1, = f1_torch(preds.nan_to_num(nan=0, posinf=1, neginf=0), labels)
134
+ mae_m = F.l1_loss(preds, labels, reduction="mean")
135
+ pre_m = pre.mean()
136
+ rec_m = rec.mean()
137
+ f1_m = f1.mean()
138
+ self.log_dict({"val/precision": pre_m, "val/recall": rec_m, "val/f1": f1_m, "val/mae": mae_m}, sync_dist=True)
139
+
140
+
141
+ def get_gt_encoder(train_dataloader, val_dataloader, opt):
142
+ print("---start train ground truth encoder---")
143
+ gt_encoder = AnimeSegmentation("isnet_gt")
144
+ trainer = Trainer(precision=32 if opt.fp32 else 16, accelerator=opt.accelerator,
145
+ devices=opt.devices, max_epochs=opt.gt_epoch,
146
+ benchmark=opt.benchmark, accumulate_grad_batches=opt.acc_step,
147
+ check_val_every_n_epoch=opt.val_epoch, log_every_n_steps=opt.log_step,
148
+ strategy="ddp_find_unused_parameters_false" if opt.devices > 1 else None,
149
+ )
150
+ trainer.fit(gt_encoder, train_dataloader, val_dataloader)
151
+ return gt_encoder.net
152
+
153
+
154
+ def load_refinenet(refine_method = 'animeseg', device: str = None) -> AnimeSegmentation:
155
+ if device is None:
156
+ device = DEFAULT_DEVICE
157
+ if refine_method == 'animeseg':
158
+ model = AnimeSegmentation.try_load('isnet_is', 'models/anime-seg/isnetis.ckpt', device)
159
+ elif refine_method == 'refinenet_isnet':
160
+ model = ISNetDIS(in_ch=4)
161
+ sd = torch.load('models/AnimeInstanceSegmentation/refine_last.ckpt', map_location='cpu')
162
+ # sd = torch.load('models/AnimeInstanceSegmentation/refine_noweight_dist.ckpt', map_location='cpu')
163
+ # sd = torch.load('models/AnimeInstanceSegmentation/refine_f3loss.ckpt', map_location='cpu')
164
+ model.load_state_dict(sd)
165
+ else:
166
+ raise NotImplementedError
167
+ return model.eval().to(device)
168
+
169
+ def get_mask(model, input_img, use_amp=True, s=640):
170
+ h0, w0 = h, w = input_img.shape[0], input_img.shape[1]
171
+ if h > w:
172
+ h, w = s, int(s * w / h)
173
+ else:
174
+ h, w = int(s * h / w), s
175
+ ph, pw = s - h, s - w
176
+ tmpImg = np.zeros([s, s, 3], dtype=np.float32)
177
+ tmpImg[ph // 2:ph // 2 + h, pw // 2:pw // 2 + w] = cv2.resize(input_img, (w, h)) / 255
178
+ tmpImg = tmpImg.transpose((2, 0, 1))
179
+ tmpImg = torch.from_numpy(tmpImg).unsqueeze(0).type(torch.FloatTensor).to(model.device)
180
+ with torch.no_grad():
181
+ if use_amp:
182
+ with amp.autocast():
183
+ pred = model(tmpImg)
184
+ pred = pred.to(dtype=torch.float32)
185
+ else:
186
+ pred = model(tmpImg)
187
+ pred = pred[0, :, ph // 2:ph // 2 + h, pw // 2:pw // 2 + w]
188
+ pred = cv2.resize(pred.cpu().numpy().transpose((1, 2, 0)), (w0, h0))[:, :, np.newaxis]
189
+ return pred
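Inference with the refiner is short once the checkpoints under models/ are in place. A sketch using the 'animeseg' variant (the input path is an assumption; the checkpoint locations follow the ones hard-coded in load_refinenet):

# Sketch only: run the anime-segmentation matting model on one RGB image.
import cv2

model = load_refinenet(refine_method='animeseg')
img = cv2.cvtColor(cv2.imread('tmp/character.png'), cv2.COLOR_BGR2RGB)
alpha = get_mask(model, img, use_amp=True, s=640)      # float matte in [0, 1], HxWx1
cv2.imwrite('tmp/character_alpha.png', (alpha * 255).astype('uint8'))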
animeinsseg/models/animeseg_refine/encoders.py ADDED
@@ -0,0 +1,51 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.utils.checkpoint import checkpoint
4
+
5
+ from transformers import T5Tokenizer, T5EncoderModel, CLIPTokenizer, CLIPTextModel
6
+
7
+
8
+ class AbstractEncoder(nn.Module):
9
+ def __init__(self):
10
+ super().__init__()
11
+
12
+ def encode(self, *args, **kwargs):
13
+ raise NotImplementedError
14
+
15
+
16
+ class IdentityEncoder(AbstractEncoder):
17
+
18
+ def encode(self, x):
19
+ return x
20
+
21
+
22
+ class ClassEmbedder(nn.Module):
23
+ def __init__(self, embed_dim, n_classes=1000, key='class', ucg_rate=0.1):
24
+ super().__init__()
25
+ self.key = key
26
+ self.embedding = nn.Embedding(n_classes, embed_dim)
27
+ self.n_classes = n_classes
28
+ self.ucg_rate = ucg_rate
29
+
30
+ def forward(self, batch, key=None, disable_dropout=False):
31
+ if key is None:
32
+ key = self.key
33
+ # this is for use in crossattn
34
+ c = batch[key][:, None]
35
+ if self.ucg_rate > 0. and not disable_dropout:
36
+ mask = 1. - torch.bernoulli(torch.ones_like(c) * self.ucg_rate)
37
+ c = mask * c + (1-mask) * torch.ones_like(c)*(self.n_classes-1)
38
+ c = c.long()
39
+ c = self.embedding(c)
40
+ return c
41
+
42
+ def get_unconditional_conditioning(self, bs, device="cuda"):
43
+ uc_class = self.n_classes - 1 # 1000 classes --> 0 ... 999, one extra class for ucg (class 1000)
44
+ uc = torch.ones((bs,), device=device) * uc_class
45
+ uc = {self.key: uc}
46
+ return uc
47
+
48
+
49
+ class DanbooruEmbedder(AbstractEncoder):
50
+ def __init__(self):
51
+ super().__init__()
animeinsseg/models/animeseg_refine/isnet.py ADDED
@@ -0,0 +1,645 @@
1
+ # Codes are borrowed from
2
+ # https://github.com/xuebinqin/DIS/blob/main/IS-Net/models/isnet.py
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from torchvision import models
7
+ import torch.nn.functional as F
8
+
9
+ _bce_loss = nn.BCEWithLogitsLoss(reduction="mean")
10
+ _bce_loss_none = nn.BCEWithLogitsLoss(reduction='none')
11
+
12
+ def bce_loss(p, t, weights=None):
13
+ if weights is None:
14
+ return _bce_loss(p, t)
15
+ else:
16
+ loss = _bce_loss_none(p, t)
17
+ loss = loss * weights
18
+ return loss.mean()
19
+
20
+
21
+ _fea_loss = nn.MSELoss(reduction="mean")
22
+ _fea_loss_none = nn.MSELoss(reduction="none")
23
+
24
+ def fea_loss(p, t, weights=None):
25
+ return _fea_loss(p, t)
26
+
27
+ kl_loss = nn.KLDivLoss(reduction="mean")
28
+ l1_loss = nn.L1Loss(reduction="mean")
29
+ smooth_l1_loss = nn.SmoothL1Loss(reduction="mean")
30
+
31
+
32
+ def structure_loss(pred, mask):
33
+ weit = 1+5*torch.abs(F.avg_pool2d(mask, kernel_size=15, stride=1, padding=7)-mask)
34
+ wbce = F.binary_cross_entropy_with_logits(pred, mask, reduction='none')
35
+ wbce = (weit*wbce).sum(dim=(2,3))/weit.sum(dim=(2,3))
36
+
37
+ pred = torch.sigmoid(pred)
38
+ inter = ((pred*mask)*weit).sum(dim=(2,3))
39
+ union = ((pred+mask)*weit).sum(dim=(2,3))
40
+ wiou = 1-(inter+1)/(union-inter+1)
41
+ return (wbce+wiou).mean()
42
+
43
+
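structure_loss combines a boundary-weighted BCE with a weighted IoU term: weit boosts pixels that disagree with their 15x15 neighbourhood average, i.e. pixels near mask edges, and both terms are normalised by that weight map. A tiny numeric sketch (it expects raw logits and a {0,1} mask of the same shape):

# Sketch only: near-perfect logits should give a much smaller loss than random ones.
logits = torch.randn(2, 1, 64, 64)
mask = (torch.rand(2, 1, 64, 64) > 0.5).float()
print(structure_loss(logits, mask).item())
print(structure_loss(mask * 20 - 10, mask).item())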
44
+ def muti_loss_fusion(preds, target, dist_weight=None, loss0_weight=1.0):
45
+ loss0 = 0.0
46
+ loss = 0.0
47
+
48
+ for i in range(0, len(preds)):
49
+ weight = dist_weight if i == 0 else None
50
+ if preds[i].shape[2] != target.shape[2] or preds[i].shape[3] != target.shape[3]:
51
+ tmp_target = F.interpolate(target, size=preds[i].size()[2:], mode='bilinear', align_corners=True)
52
+ loss = loss + structure_loss(preds[i], tmp_target)
53
+ else:
54
+ # loss = loss + bce_loss(preds[i], target, weight)
55
+ loss = loss + structure_loss(preds[i], target)
56
+ if i == 0:
57
+ loss *= loss0_weight
58
+ loss0 = loss
59
+ return loss0, loss
60
+
61
+
62
+
63
+ def muti_loss_fusion_kl(preds, target, dfs, fs, mode='MSE', dist_weight=None, loss0_weight=1.0):
64
+ loss0 = 0.0
65
+ loss = 0.0
66
+
67
+ for i in range(0, len(preds)):
68
+ weight = dist_weight if i == 0 else None
69
+ if preds[i].shape[2] != target.shape[2] or preds[i].shape[3] != target.shape[3]:
70
+ tmp_target = F.interpolate(target, size=preds[i].size()[2:], mode='bilinear', align_corners=True)
71
+ # loss = loss + bce_loss(preds[i], tmp_target, weight)
72
+ loss = loss + structure_loss(preds[i], tmp_target)
73
+ else:
74
+ # loss = loss + bce_loss(preds[i], target, weight)
75
+ loss = loss + structure_loss(preds[i], target)
76
+ if i == 0:
77
+ loss *= loss0_weight
78
+ loss0 = loss
79
+
80
+ for i in range(0, len(dfs)):
81
+ df = dfs[i]
82
+ fs_i = fs[i]
83
+ if mode == 'MSE':
84
+ loss = loss + fea_loss(df, fs_i, dist_weight) ### add the mse loss of features as additional constraints
85
+ elif mode == 'KL':
86
+ loss = loss + kl_loss(F.log_softmax(df, dim=1), F.softmax(fs_i, dim=1))
87
+ elif mode == 'MAE':
88
+ loss = loss + l1_loss(df, fs_i)
89
+ elif mode == 'SmoothL1':
90
+ loss = loss + smooth_l1_loss(df, fs_i)
91
+
92
+ return loss0, loss
93
+
94
+
95
+ class REBNCONV(nn.Module):
96
+ def __init__(self, in_ch=3, out_ch=3, dirate=1, stride=1):
97
+ super(REBNCONV, self).__init__()
98
+
99
+ self.conv_s1 = nn.Conv2d(in_ch, out_ch, 3, padding=1 * dirate, dilation=1 * dirate, stride=stride)
100
+ self.bn_s1 = nn.BatchNorm2d(out_ch)
101
+ self.relu_s1 = nn.ReLU(inplace=True)
102
+
103
+ def forward(self, x):
104
+ hx = x
105
+ xout = self.relu_s1(self.bn_s1(self.conv_s1(hx)))
106
+
107
+ return xout
108
+
109
+
110
+ ## upsample tensor 'src' to have the same spatial size as tensor 'tar'
111
+ def _upsample_like(src, tar):
112
+ src = F.interpolate(src, size=tar.shape[2:], mode='bilinear', align_corners=False)
113
+
114
+ return src
115
+
116
+
117
+ ### RSU-7 ###
118
+ class RSU7(nn.Module):
119
+
120
+ def __init__(self, in_ch=3, mid_ch=12, out_ch=3, img_size=512):
121
+ super(RSU7, self).__init__()
122
+
123
+ self.in_ch = in_ch
124
+ self.mid_ch = mid_ch
125
+ self.out_ch = out_ch
126
+
127
+ self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) ## 1 -> 1/2
128
+
129
+ self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
130
+ self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
131
+
132
+ self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
133
+ self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
134
+
135
+ self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
136
+ self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
137
+
138
+ self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
139
+ self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
140
+
141
+ self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1)
142
+ self.pool5 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
143
+
144
+ self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=1)
145
+
146
+ self.rebnconv7 = REBNCONV(mid_ch, mid_ch, dirate=2)
147
+
148
+ self.rebnconv6d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
149
+ self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
150
+ self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
151
+ self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
152
+ self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
153
+ self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)
154
+
155
+ def forward(self, x):
156
+ b, c, h, w = x.shape
157
+
158
+ hx = x
159
+ hxin = self.rebnconvin(hx)
160
+
161
+ hx1 = self.rebnconv1(hxin)
162
+ hx = self.pool1(hx1)
163
+
164
+ hx2 = self.rebnconv2(hx)
165
+ hx = self.pool2(hx2)
166
+
167
+ hx3 = self.rebnconv3(hx)
168
+ hx = self.pool3(hx3)
169
+
170
+ hx4 = self.rebnconv4(hx)
171
+ hx = self.pool4(hx4)
172
+
173
+ hx5 = self.rebnconv5(hx)
174
+ hx = self.pool5(hx5)
175
+
176
+ hx6 = self.rebnconv6(hx)
177
+
178
+ hx7 = self.rebnconv7(hx6)
179
+
180
+ hx6d = self.rebnconv6d(torch.cat((hx7, hx6), 1))
181
+ hx6dup = _upsample_like(hx6d, hx5)
182
+
183
+ hx5d = self.rebnconv5d(torch.cat((hx6dup, hx5), 1))
184
+ hx5dup = _upsample_like(hx5d, hx4)
185
+
186
+ hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1))
187
+ hx4dup = _upsample_like(hx4d, hx3)
188
+
189
+ hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
190
+ hx3dup = _upsample_like(hx3d, hx2)
191
+
192
+ hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
193
+ hx2dup = _upsample_like(hx2d, hx1)
194
+
195
+ hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
196
+
197
+ return hx1d + hxin
198
+
199
+
200
+ ### RSU-6 ###
201
+ class RSU6(nn.Module):
202
+
203
+ def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
204
+ super(RSU6, self).__init__()
205
+
206
+ self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
207
+
208
+ self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
209
+ self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
210
+
211
+ self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
212
+ self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
213
+
214
+ self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
215
+ self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
216
+
217
+ self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
218
+ self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
219
+
220
+ self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1)
221
+
222
+ self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=2)
223
+
224
+ self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
225
+ self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
226
+ self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
227
+ self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
228
+ self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)
229
+
230
+ def forward(self, x):
231
+ hx = x
232
+
233
+ hxin = self.rebnconvin(hx)
234
+
235
+ hx1 = self.rebnconv1(hxin)
236
+ hx = self.pool1(hx1)
237
+
238
+ hx2 = self.rebnconv2(hx)
239
+ hx = self.pool2(hx2)
240
+
241
+ hx3 = self.rebnconv3(hx)
242
+ hx = self.pool3(hx3)
243
+
244
+ hx4 = self.rebnconv4(hx)
245
+ hx = self.pool4(hx4)
246
+
247
+ hx5 = self.rebnconv5(hx)
248
+
249
+ hx6 = self.rebnconv6(hx5)
250
+
251
+ hx5d = self.rebnconv5d(torch.cat((hx6, hx5), 1))
252
+ hx5dup = _upsample_like(hx5d, hx4)
253
+
254
+ hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1))
255
+ hx4dup = _upsample_like(hx4d, hx3)
256
+
257
+ hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
258
+ hx3dup = _upsample_like(hx3d, hx2)
259
+
260
+ hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
261
+ hx2dup = _upsample_like(hx2d, hx1)
262
+
263
+ hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
264
+
265
+ return hx1d + hxin
266
+
267
+
268
+ ### RSU-5 ###
269
+ class RSU5(nn.Module):
270
+
271
+ def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
272
+ super(RSU5, self).__init__()
273
+
274
+ self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
275
+
276
+ self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
277
+ self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
278
+
279
+ self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
280
+ self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
281
+
282
+ self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
283
+ self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
284
+
285
+ self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
286
+
287
+ self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=2)
288
+
289
+ self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
290
+ self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
291
+ self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
292
+ self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)
293
+
294
+ def forward(self, x):
295
+ hx = x
296
+
297
+ hxin = self.rebnconvin(hx)
298
+
299
+ hx1 = self.rebnconv1(hxin)
300
+ hx = self.pool1(hx1)
301
+
302
+ hx2 = self.rebnconv2(hx)
303
+ hx = self.pool2(hx2)
304
+
305
+ hx3 = self.rebnconv3(hx)
306
+ hx = self.pool3(hx3)
307
+
308
+ hx4 = self.rebnconv4(hx)
309
+
310
+ hx5 = self.rebnconv5(hx4)
311
+
312
+ hx4d = self.rebnconv4d(torch.cat((hx5, hx4), 1))
313
+ hx4dup = _upsample_like(hx4d, hx3)
314
+
315
+ hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
316
+ hx3dup = _upsample_like(hx3d, hx2)
317
+
318
+ hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
319
+ hx2dup = _upsample_like(hx2d, hx1)
320
+
321
+ hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
322
+
323
+ return hx1d + hxin
324
+
325
+
326
+ ### RSU-4 ###
327
+ class RSU4(nn.Module):
328
+
329
+ def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
330
+ super(RSU4, self).__init__()
331
+
332
+ self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
333
+
334
+ self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
335
+ self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
336
+
337
+ self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
338
+ self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
339
+
340
+ self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
341
+
342
+ self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=2)
343
+
344
+ self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
345
+ self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
346
+ self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)
347
+
348
+ def forward(self, x):
349
+ hx = x
350
+
351
+ hxin = self.rebnconvin(hx)
352
+
353
+ hx1 = self.rebnconv1(hxin)
354
+ hx = self.pool1(hx1)
355
+
356
+ hx2 = self.rebnconv2(hx)
357
+ hx = self.pool2(hx2)
358
+
359
+ hx3 = self.rebnconv3(hx)
360
+
361
+ hx4 = self.rebnconv4(hx3)
362
+
363
+ hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1))
364
+ hx3dup = _upsample_like(hx3d, hx2)
365
+
366
+ hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
367
+ hx2dup = _upsample_like(hx2d, hx1)
368
+
369
+ hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
370
+
371
+ return hx1d + hxin
372
+
373
+
374
+ ### RSU-4F ###
375
+ class RSU4F(nn.Module):
376
+
377
+ def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
378
+ super(RSU4F, self).__init__()
379
+
380
+ self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
381
+
382
+ self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
383
+ self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=2)
384
+ self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=4)
385
+
386
+ self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=8)
387
+
388
+ self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=4)
389
+ self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=2)
390
+ self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)
391
+
392
+ def forward(self, x):
393
+ hx = x
394
+
395
+ hxin = self.rebnconvin(hx)
396
+
397
+ hx1 = self.rebnconv1(hxin)
398
+ hx2 = self.rebnconv2(hx1)
399
+ hx3 = self.rebnconv3(hx2)
400
+
401
+ hx4 = self.rebnconv4(hx3)
402
+
403
+ hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1))
404
+ hx2d = self.rebnconv2d(torch.cat((hx3d, hx2), 1))
405
+ hx1d = self.rebnconv1d(torch.cat((hx2d, hx1), 1))
406
+
407
+ return hx1d + hxin
408
+
409
+
410
+ class myrebnconv(nn.Module):
411
+ def __init__(self, in_ch=3,
412
+ out_ch=1,
413
+ kernel_size=3,
414
+ stride=1,
415
+ padding=1,
416
+ dilation=1,
417
+ groups=1):
418
+ super(myrebnconv, self).__init__()
419
+
420
+ self.conv = nn.Conv2d(in_ch,
421
+ out_ch,
422
+ kernel_size=kernel_size,
423
+ stride=stride,
424
+ padding=padding,
425
+ dilation=dilation,
426
+ groups=groups)
427
+ self.bn = nn.BatchNorm2d(out_ch)
428
+ self.rl = nn.ReLU(inplace=True)
429
+
430
+ def forward(self, x):
431
+ return self.rl(self.bn(self.conv(x)))
432
+
433
+
434
+ class ISNetGTEncoder(nn.Module):
435
+
436
+ def __init__(self, in_ch=1, out_ch=1):
437
+ super(ISNetGTEncoder, self).__init__()
438
+
439
+ self.conv_in = myrebnconv(in_ch, 16, 3, stride=2, padding=1) # nn.Conv2d(in_ch,64,3,stride=2,padding=1)
440
+
441
+ self.stage1 = RSU7(16, 16, 64)
442
+ self.pool12 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
443
+
444
+ self.stage2 = RSU6(64, 16, 64)
445
+ self.pool23 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
446
+
447
+ self.stage3 = RSU5(64, 32, 128)
448
+ self.pool34 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
449
+
450
+ self.stage4 = RSU4(128, 32, 256)
451
+ self.pool45 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
452
+
453
+ self.stage5 = RSU4F(256, 64, 512)
454
+ self.pool56 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
455
+
456
+ self.stage6 = RSU4F(512, 64, 512)
457
+
458
+ self.side1 = nn.Conv2d(64, out_ch, 3, padding=1)
459
+ self.side2 = nn.Conv2d(64, out_ch, 3, padding=1)
460
+ self.side3 = nn.Conv2d(128, out_ch, 3, padding=1)
461
+ self.side4 = nn.Conv2d(256, out_ch, 3, padding=1)
462
+ self.side5 = nn.Conv2d(512, out_ch, 3, padding=1)
463
+ self.side6 = nn.Conv2d(512, out_ch, 3, padding=1)
464
+
465
+ @staticmethod
466
+ def compute_loss(args, dist_weight=None):
467
+ preds, targets = args
468
+ return muti_loss_fusion(preds, targets, dist_weight)
469
+
470
+ def forward(self, x):
471
+ hx = x
472
+
473
+ hxin = self.conv_in(hx)
474
+ # hx = self.pool_in(hxin)
475
+
476
+ # stage 1
477
+ hx1 = self.stage1(hxin)
478
+ hx = self.pool12(hx1)
479
+
480
+ # stage 2
481
+ hx2 = self.stage2(hx)
482
+ hx = self.pool23(hx2)
483
+
484
+ # stage 3
485
+ hx3 = self.stage3(hx)
486
+ hx = self.pool34(hx3)
487
+
488
+ # stage 4
489
+ hx4 = self.stage4(hx)
490
+ hx = self.pool45(hx4)
491
+
492
+ # stage 5
493
+ hx5 = self.stage5(hx)
494
+ hx = self.pool56(hx5)
495
+
496
+ # stage 6
497
+ hx6 = self.stage6(hx)
498
+
499
+ # side output
500
+ d1 = self.side1(hx1)
501
+ d1 = _upsample_like(d1, x)
502
+
503
+ d2 = self.side2(hx2)
504
+ d2 = _upsample_like(d2, x)
505
+
506
+ d3 = self.side3(hx3)
507
+ d3 = _upsample_like(d3, x)
508
+
509
+ d4 = self.side4(hx4)
510
+ d4 = _upsample_like(d4, x)
511
+
512
+ d5 = self.side5(hx5)
513
+ d5 = _upsample_like(d5, x)
514
+
515
+ d6 = self.side6(hx6)
516
+ d6 = _upsample_like(d6, x)
517
+
518
+ # d0 = self.outconv(torch.cat((d1,d2,d3,d4,d5,d6),1))
519
+
520
+ # return [torch.sigmoid(d1), torch.sigmoid(d2), torch.sigmoid(d3), torch.sigmoid(d4), torch.sigmoid(d5), torch.sigmoid(d6)], [hx1, hx2, hx3, hx4, hx5, hx6]
521
+ return [d1, d2, d3, d4, d5, d6], [hx1, hx2, hx3, hx4, hx5, hx6]
522
+
523
+
524
+ class ISNetDIS(nn.Module):
525
+
526
+ def __init__(self, in_ch=3, out_ch=1):
527
+ super(ISNetDIS, self).__init__()
528
+
529
+ self.conv_in = nn.Conv2d(in_ch, 64, 3, stride=2, padding=1)
530
+ self.pool_in = nn.MaxPool2d(2, stride=2, ceil_mode=True)
531
+
532
+ self.stage1 = RSU7(64, 32, 64)
533
+ self.pool12 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
534
+
535
+ self.stage2 = RSU6(64, 32, 128)
536
+ self.pool23 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
537
+
538
+ self.stage3 = RSU5(128, 64, 256)
539
+ self.pool34 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
540
+
541
+ self.stage4 = RSU4(256, 128, 512)
542
+ self.pool45 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
543
+
544
+ self.stage5 = RSU4F(512, 256, 512)
545
+ self.pool56 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
546
+
547
+ self.stage6 = RSU4F(512, 256, 512)
548
+
549
+ # decoder
550
+ self.stage5d = RSU4F(1024, 256, 512)
551
+ self.stage4d = RSU4(1024, 128, 256)
552
+ self.stage3d = RSU5(512, 64, 128)
553
+ self.stage2d = RSU6(256, 32, 64)
554
+ self.stage1d = RSU7(128, 16, 64)
555
+
556
+ self.side1 = nn.Conv2d(64, out_ch, 3, padding=1)
557
+ self.side2 = nn.Conv2d(64, out_ch, 3, padding=1)
558
+ self.side3 = nn.Conv2d(128, out_ch, 3, padding=1)
559
+ self.side4 = nn.Conv2d(256, out_ch, 3, padding=1)
560
+ self.side5 = nn.Conv2d(512, out_ch, 3, padding=1)
561
+ self.side6 = nn.Conv2d(512, out_ch, 3, padding=1)
562
+
563
+ # self.outconv = nn.Conv2d(6*out_ch,out_ch,1)
564
+
565
+ @staticmethod
566
+ def compute_loss_kl(preds, targets, dfs, fs, mode='MSE'):
567
+ return muti_loss_fusion_kl(preds, targets, dfs, fs, mode=mode, loss0_weight=5.0)
568
+
569
+ @staticmethod
570
+ def compute_loss(args, dist_weight=None):
571
+ if len(args) == 3:
572
+ ds, dfs, labels = args
573
+ return muti_loss_fusion(ds, labels, dist_weight, loss0_weight=5.0)
574
+ else:
575
+ ds, dfs, labels, fs = args
576
+ return muti_loss_fusion_kl(ds, labels, dfs, fs, mode="MSE", dist_weight=dist_weight, loss0_weight=5.0)
577
+
578
+ def forward(self, x):
579
+ hx = x
580
+
581
+ hxin = self.conv_in(hx)
582
+ hx = self.pool_in(hxin)
583
+
584
+ # stage 1
585
+ hx1 = self.stage1(hxin)
586
+ hx = self.pool12(hx1)
587
+
588
+ # stage 2
589
+ hx2 = self.stage2(hx)
590
+ hx = self.pool23(hx2)
591
+
592
+ # stage 3
593
+ hx3 = self.stage3(hx)
594
+ hx = self.pool34(hx3)
595
+
596
+ # stage 4
597
+ hx4 = self.stage4(hx)
598
+ hx = self.pool45(hx4)
599
+
600
+ # stage 5
601
+ hx5 = self.stage5(hx)
602
+ hx = self.pool56(hx5)
603
+
604
+ # stage 6
605
+ hx6 = self.stage6(hx)
606
+ hx6up = _upsample_like(hx6, hx5)
607
+
608
+ # -------------------- decoder --------------------
609
+ hx5d = self.stage5d(torch.cat((hx6up, hx5), 1))
610
+ hx5dup = _upsample_like(hx5d, hx4)
611
+
612
+ hx4d = self.stage4d(torch.cat((hx5dup, hx4), 1))
613
+ hx4dup = _upsample_like(hx4d, hx3)
614
+
615
+ hx3d = self.stage3d(torch.cat((hx4dup, hx3), 1))
616
+ hx3dup = _upsample_like(hx3d, hx2)
617
+
618
+ hx2d = self.stage2d(torch.cat((hx3dup, hx2), 1))
619
+ hx2dup = _upsample_like(hx2d, hx1)
620
+
621
+ hx1d = self.stage1d(torch.cat((hx2dup, hx1), 1))
622
+
623
+ # side output
624
+ d1 = self.side1(hx1d)
625
+ d1 = _upsample_like(d1, x)
626
+
627
+ d2 = self.side2(hx2d)
628
+ d2 = _upsample_like(d2, x)
629
+
630
+ d3 = self.side3(hx3d)
631
+ d3 = _upsample_like(d3, x)
632
+
633
+ d4 = self.side4(hx4d)
634
+ d4 = _upsample_like(d4, x)
635
+
636
+ d5 = self.side5(hx5d)
637
+ d5 = _upsample_like(d5, x)
638
+
639
+ d6 = self.side6(hx6)
640
+ d6 = _upsample_like(d6, x)
641
+
642
+ # d0 = self.outconv(torch.cat((d1,d2,d3,d4,d5,d6),1))
643
+
644
+ # return [torch.sigmoid(d1), torch.sigmoid(d2), torch.sigmoid(d3), torch.sigmoid(d4), torch.sigmoid(d5), torch.sigmoid(d6)], [hx1d, hx2d, hx3d, hx4d, hx5d, hx6]
645
+ return [d1, d2, d3, d4, d5, d6], [hx1d, hx2d, hx3d, hx4d, hx5d, hx6]
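ISNetDIS.forward returns six side-output logits, each already upsampled to the input resolution, plus the intermediate decoder features used by the feature-distillation variant of the loss. A minimal inference sketch with an assumed input size:

# Illustrative sketch: random weights, assumed 320x320 input.
import torch

net = ISNetDIS(in_ch=3, out_ch=1).eval()
with torch.no_grad():
    x = torch.randn(1, 3, 320, 320)
    side_logits, dec_feats = net(x)              # 6 logit maps + 6 feature maps
    mask = torch.sigmoid(side_logits[0])[0, 0]   # finest prediction d1, at input resolution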
animeinsseg/models/animeseg_refine/models.py ADDED
File without changes
animeinsseg/models/animeseg_refine/modnet.py ADDED
@@ -0,0 +1,667 @@
1
+ # Code is borrowed from
2
+ # https://github.com/ZHKKKe/MODNet/blob/master/src/trainer.py
3
+ # https://github.com/ZHKKKe/MODNet/blob/master/src/models/backbones/mobilenetv2.py
4
+ # https://github.com/ZHKKKe/MODNet/blob/master/src/models/modnet.py
5
+
6
+ import numpy as np
7
+ import scipy
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ import os
12
+ import math
13
+ import torch
14
+ from scipy.ndimage import gaussian_filter
15
+
16
+
17
+ # ----------------------------------------------------------------------------------
18
+ # Loss Functions
19
+ # ----------------------------------------------------------------------------------
20
+
21
+
22
+ class GaussianBlurLayer(nn.Module):
23
+ """ Apply Gaussian blur to a 4D tensor.
24
+ This layer takes a 4D tensor of {N, C, H, W} as input.
25
+ The Gaussian blur is applied to each of the given channels (C) separately.
26
+ """
27
+
28
+ def __init__(self, channels, kernel_size):
29
+ """
30
+ Arguments:
31
+ channels (int): Channel for input tensor
32
+ kernel_size (int): Size of the kernel used in blurring
33
+ """
34
+
35
+ super(GaussianBlurLayer, self).__init__()
36
+ self.channels = channels
37
+ self.kernel_size = kernel_size
38
+ assert self.kernel_size % 2 != 0
39
+
40
+ self.op = nn.Sequential(
41
+ nn.ReflectionPad2d(math.floor(self.kernel_size / 2)),
42
+ nn.Conv2d(channels, channels, self.kernel_size,
43
+ stride=1, padding=0, bias=None, groups=channels)
44
+ )
45
+
46
+ self._init_kernel()
47
+
48
+ def forward(self, x):
49
+ """
50
+ Arguments:
51
+ x (torch.Tensor): input 4D tensor
52
+ Returns:
53
+ torch.Tensor: Blurred version of the input
54
+ """
55
+
56
+ if not len(list(x.shape)) == 4:
57
+ print('\'GaussianBlurLayer\' requires a 4D tensor as input\n')
58
+ exit()
59
+ elif not x.shape[1] == self.channels:
60
+ print('In \'GaussianBlurLayer\', the required channel ({0}) is'
61
+ ' not the same as input ({1})\n'.format(self.channels, x.shape[1]))
62
+ exit()
63
+
64
+ return self.op(x)
65
+
66
+ def _init_kernel(self):
67
+ sigma = 0.3 * ((self.kernel_size - 1) * 0.5 - 1) + 0.8
68
+
69
+ n = np.zeros((self.kernel_size, self.kernel_size))
70
+ i = math.floor(self.kernel_size / 2)
71
+ n[i, i] = 1
72
+ kernel = gaussian_filter(n, sigma)
73
+
74
+ for name, param in self.named_parameters():
75
+ param.data.copy_(torch.from_numpy(kernel))
76
+ param.requires_grad = False
77
+
78
+
79
+ blurer = GaussianBlurLayer(1, 3)
80
+
81
+
82
+ def loss_func(pred_semantic, pred_detail, pred_matte, image, trimap, gt_matte,
83
+ semantic_scale=10.0, detail_scale=10.0, matte_scale=1.0):
84
+ """ loss of MODNet
85
+ Arguments:
86
+ blurer (module-level GaussianBlurLayer): used to blur the semantic target; not a function argument
87
+ pred_semantic: model output
88
+ pred_detail: model output
89
+ pred_matte: model output
90
+ image : input RGB image; its pixel values should be normalized
91
+ trimap : trimap used to calculate the losses
92
+ its pixel values can be 0, 0.5, or 1
93
+ (foreground=1, background=0, unknown=0.5)
94
+ gt_matte: ground truth alpha matte; its pixel values are in [0, 1]
95
+ semantic_scale (float): scale of the semantic loss
96
+ NOTE: please adjust according to your dataset
97
+ detail_scale (float): scale of the detail loss
98
+ NOTE: please adjust according to your dataset
99
+ matte_scale (float): scale of the matte loss
100
+ NOTE: please adjust according to your dataset
101
+
102
+ Returns:
103
+ semantic_loss (torch.Tensor): loss of the semantic estimation [Low-Resolution (LR) Branch]
104
+ detail_loss (torch.Tensor): loss of the detail prediction [High-Resolution (HR) Branch]
105
+ matte_loss (torch.Tensor): loss of the semantic-detail fusion [Fusion Branch]
106
+ """
107
+
108
+ trimap = trimap.float()
109
+ # calculate the boundary mask from the trimap
110
+ boundaries = (trimap < 0.5) + (trimap > 0.5)
111
+
112
+ # calculate the semantic loss
113
+ gt_semantic = F.interpolate(gt_matte, scale_factor=1 / 16, mode='bilinear')
114
+ gt_semantic = blurer(gt_semantic)
115
+ semantic_loss = torch.mean(F.mse_loss(pred_semantic, gt_semantic))
116
+ semantic_loss = semantic_scale * semantic_loss
117
+
118
+ # calculate the detail loss
119
+ pred_boundary_detail = torch.where(boundaries, trimap, pred_detail.float())
120
+ gt_detail = torch.where(boundaries, trimap, gt_matte.float())
121
+ detail_loss = torch.mean(F.l1_loss(pred_boundary_detail, gt_detail.float()))
122
+ detail_loss = detail_scale * detail_loss
123
+
124
+ # calculate the matte loss
125
+ pred_boundary_matte = torch.where(boundaries, trimap, pred_matte.float())
126
+ matte_l1_loss = F.l1_loss(pred_matte, gt_matte) + 4.0 * F.l1_loss(pred_boundary_matte, gt_matte)
127
+ matte_compositional_loss = F.l1_loss(image * pred_matte, image * gt_matte) \
128
+ + 4.0 * F.l1_loss(image * pred_boundary_matte, image * gt_matte)
129
+ matte_loss = torch.mean(matte_l1_loss + matte_compositional_loss)
130
+ matte_loss = matte_scale * matte_loss
131
+
132
+ return semantic_loss, detail_loss, matte_loss
133
+
134
+
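A minimal sketch of calling loss_func with dummy tensors; the shapes are assumptions chosen so that the 1/16-scale semantic prediction lines up with the blurred, downsampled ground truth (H and W divisible by 16):

# Illustrative sketch with assumed shapes.
import torch

B, H, W = 2, 512, 512
image    = torch.randn(B, 3, H, W)
gt_matte = torch.rand(B, 1, H, W)
trimap   = torch.full((B, 1, H, W), 0.5)        # every pixel marked "unknown"
pred_sem = torch.rand(B, 1, H // 16, W // 16)   # low-resolution branch output
pred_det = torch.rand(B, 1, H, W)               # high-resolution branch output
pred_mat = torch.rand(B, 1, H, W)               # fusion branch output
sem_l, det_l, mat_l = loss_func(pred_sem, pred_det, pred_mat, image, trimap, gt_matte)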
135
+ # ------------------------------------------------------------------------------
136
+ # Useful functions
137
+ # ------------------------------------------------------------------------------
138
+
139
+ def _make_divisible(v, divisor, min_value=None):
140
+ if min_value is None:
141
+ min_value = divisor
142
+ new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
143
+ # Make sure that round down does not go down by more than 10%.
144
+ if new_v < 0.9 * v:
145
+ new_v += divisor
146
+ return new_v
147
+
148
+
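_make_divisible rounds a (possibly width-scaled) channel count to the nearest multiple of divisor, and bumps it up whenever rounding would drop more than 10% below the requested value; two illustrative calls:

# Illustrative values only.
_make_divisible(32 * 0.75, 8)   # -> 24, already a multiple of 8
_make_divisible(19, 8)          # -> 24, since 16 would be more than 10% below 19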
149
+ def conv_bn(inp, oup, stride):
150
+ return nn.Sequential(
151
+ nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
152
+ nn.BatchNorm2d(oup),
153
+ nn.ReLU6(inplace=True)
154
+ )
155
+
156
+
157
+ def conv_1x1_bn(inp, oup):
158
+ return nn.Sequential(
159
+ nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
160
+ nn.BatchNorm2d(oup),
161
+ nn.ReLU6(inplace=True)
162
+ )
163
+
164
+
165
+ # ------------------------------------------------------------------------------
166
+ # Class of Inverted Residual block
167
+ # ------------------------------------------------------------------------------
168
+
169
+ class InvertedResidual(nn.Module):
170
+ def __init__(self, inp, oup, stride, expansion, dilation=1):
171
+ super(InvertedResidual, self).__init__()
172
+ self.stride = stride
173
+ assert stride in [1, 2]
174
+
175
+ hidden_dim = round(inp * expansion)
176
+ self.use_res_connect = self.stride == 1 and inp == oup
177
+
178
+ if expansion == 1:
179
+ self.conv = nn.Sequential(
180
+ # dw
181
+ nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, dilation=dilation, bias=False),
182
+ nn.BatchNorm2d(hidden_dim),
183
+ nn.ReLU6(inplace=True),
184
+ # pw-linear
185
+ nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
186
+ nn.BatchNorm2d(oup),
187
+ )
188
+ else:
189
+ self.conv = nn.Sequential(
190
+ # pw
191
+ nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
192
+ nn.BatchNorm2d(hidden_dim),
193
+ nn.ReLU6(inplace=True),
194
+ # dw
195
+ nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, dilation=dilation, bias=False),
196
+ nn.BatchNorm2d(hidden_dim),
197
+ nn.ReLU6(inplace=True),
198
+ # pw-linear
199
+ nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
200
+ nn.BatchNorm2d(oup),
201
+ )
202
+
203
+ def forward(self, x):
204
+ if self.use_res_connect:
205
+ return x + self.conv(x)
206
+ else:
207
+ return self.conv(x)
208
+
209
+
210
+ # ------------------------------------------------------------------------------
211
+ # Class of MobileNetV2
212
+ # ------------------------------------------------------------------------------
213
+
214
+ class MobileNetV2(nn.Module):
215
+ def __init__(self, in_channels, alpha=1.0, expansion=6, num_classes=1000):
216
+ super(MobileNetV2, self).__init__()
217
+ self.in_channels = in_channels
218
+ self.num_classes = num_classes
219
+ input_channel = 32
220
+ last_channel = 1280
221
+ interverted_residual_setting = [
222
+ # t, c, n, s
223
+ [1, 16, 1, 1],
224
+ [expansion, 24, 2, 2],
225
+ [expansion, 32, 3, 2],
226
+ [expansion, 64, 4, 2],
227
+ [expansion, 96, 3, 1],
228
+ [expansion, 160, 3, 2],
229
+ [expansion, 320, 1, 1],
230
+ ]
231
+
232
+ # building first layer
233
+ input_channel = _make_divisible(input_channel * alpha, 8)
234
+ self.last_channel = _make_divisible(last_channel * alpha, 8) if alpha > 1.0 else last_channel
235
+ self.features = [conv_bn(self.in_channels, input_channel, 2)]
236
+
237
+ # building inverted residual blocks
238
+ for t, c, n, s in interverted_residual_setting:
239
+ output_channel = _make_divisible(int(c * alpha), 8)
240
+ for i in range(n):
241
+ if i == 0:
242
+ self.features.append(InvertedResidual(input_channel, output_channel, s, expansion=t))
243
+ else:
244
+ self.features.append(InvertedResidual(input_channel, output_channel, 1, expansion=t))
245
+ input_channel = output_channel
246
+
247
+ # building last several layers
248
+ self.features.append(conv_1x1_bn(input_channel, self.last_channel))
249
+
250
+ # make it nn.Sequential
251
+ self.features = nn.Sequential(*self.features)
252
+
253
+ # building classifier
254
+ if self.num_classes is not None:
255
+ self.classifier = nn.Sequential(
256
+ nn.Dropout(0.2),
257
+ nn.Linear(self.last_channel, num_classes),
258
+ )
259
+
260
+ # Initialize weights
261
+ self._init_weights()
262
+
263
+ def forward(self, x):
264
+ # Stage1
265
+ x = self.features[0](x)
266
+ x = self.features[1](x)
267
+ # Stage2
268
+ x = self.features[2](x)
269
+ x = self.features[3](x)
270
+ # Stage3
271
+ x = self.features[4](x)
272
+ x = self.features[5](x)
273
+ x = self.features[6](x)
274
+ # Stage4
275
+ x = self.features[7](x)
276
+ x = self.features[8](x)
277
+ x = self.features[9](x)
278
+ x = self.features[10](x)
279
+ x = self.features[11](x)
280
+ x = self.features[12](x)
281
+ x = self.features[13](x)
282
+ # Stage5
283
+ x = self.features[14](x)
284
+ x = self.features[15](x)
285
+ x = self.features[16](x)
286
+ x = self.features[17](x)
287
+ x = self.features[18](x)
288
+
289
+ # Classification
290
+ if self.num_classes is not None:
291
+ x = x.mean(dim=(2, 3))
292
+ x = self.classifier(x)
293
+
294
+ # Output
295
+ return x
296
+
297
+ def _load_pretrained_model(self, pretrained_file):
298
+ pretrain_dict = torch.load(pretrained_file, map_location='cpu')
299
+ model_dict = {}
300
+ state_dict = self.state_dict()
301
+ print("[MobileNetV2] Loading pretrained model...")
302
+ for k, v in pretrain_dict.items():
303
+ if k in state_dict:
304
+ model_dict[k] = v
305
+ else:
306
+ print(k, "is ignored")
307
+ state_dict.update(model_dict)
308
+ self.load_state_dict(state_dict)
309
+
310
+ def _init_weights(self):
311
+ for m in self.modules():
312
+ if isinstance(m, nn.Conv2d):
313
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
314
+ m.weight.data.normal_(0, math.sqrt(2. / n))
315
+ if m.bias is not None:
316
+ m.bias.data.zero_()
317
+ elif isinstance(m, nn.BatchNorm2d):
318
+ m.weight.data.fill_(1)
319
+ m.bias.data.zero_()
320
+ elif isinstance(m, nn.Linear):
321
+ n = m.weight.size(1)
322
+ m.weight.data.normal_(0, 0.01)
323
+ m.bias.data.zero_()
324
+
325
+
326
+ class BaseBackbone(nn.Module):
327
+ """ Superclass of Replaceable Backbone Model for Semantic Estimation
328
+ """
329
+
330
+ def __init__(self, in_channels):
331
+ super(BaseBackbone, self).__init__()
332
+ self.in_channels = in_channels
333
+
334
+ self.model = None
335
+ self.enc_channels = []
336
+
337
+ def forward(self, x):
338
+ raise NotImplementedError
339
+
340
+ def load_pretrained_ckpt(self):
341
+ raise NotImplementedError
342
+
343
+
344
+ class MobileNetV2Backbone(BaseBackbone):
345
+ """ MobileNetV2 Backbone
346
+ """
347
+
348
+ def __init__(self, in_channels):
349
+ super(MobileNetV2Backbone, self).__init__(in_channels)
350
+
351
+ self.model = MobileNetV2(self.in_channels, alpha=1.0, expansion=6, num_classes=None)
352
+ self.enc_channels = [16, 24, 32, 96, 1280]
353
+
354
+ def forward(self, x):
355
+ # x = reduce(lambda x, n: self.model.features[n](x), list(range(0, 2)), x)
356
+ x = self.model.features[0](x)
357
+ x = self.model.features[1](x)
358
+ enc2x = x
359
+
360
+ # x = reduce(lambda x, n: self.model.features[n](x), list(range(2, 4)), x)
361
+ x = self.model.features[2](x)
362
+ x = self.model.features[3](x)
363
+ enc4x = x
364
+
365
+ # x = reduce(lambda x, n: self.model.features[n](x), list(range(4, 7)), x)
366
+ x = self.model.features[4](x)
367
+ x = self.model.features[5](x)
368
+ x = self.model.features[6](x)
369
+ enc8x = x
370
+
371
+ # x = reduce(lambda x, n: self.model.features[n](x), list(range(7, 14)), x)
372
+ x = self.model.features[7](x)
373
+ x = self.model.features[8](x)
374
+ x = self.model.features[9](x)
375
+ x = self.model.features[10](x)
376
+ x = self.model.features[11](x)
377
+ x = self.model.features[12](x)
378
+ x = self.model.features[13](x)
379
+ enc16x = x
380
+
381
+ # x = reduce(lambda x, n: self.model.features[n](x), list(range(14, 19)), x)
382
+ x = self.model.features[14](x)
383
+ x = self.model.features[15](x)
384
+ x = self.model.features[16](x)
385
+ x = self.model.features[17](x)
386
+ x = self.model.features[18](x)
387
+ enc32x = x
388
+ return [enc2x, enc4x, enc8x, enc16x, enc32x]
389
+
390
+ def load_pretrained_ckpt(self):
391
+ # the pre-trained model is provided by https://github.com/thuyngch/Human-Segmentation-PyTorch
392
+ ckpt_path = './pretrained/mobilenetv2_human_seg.ckpt'
393
+ if not os.path.exists(ckpt_path):
394
+ print('cannot find the pretrained mobilenetv2 backbone')
395
+ exit()
396
+
397
+ ckpt = torch.load(ckpt_path)
398
+ self.model.load_state_dict(ckpt)
399
+
400
+
401
+ SUPPORTED_BACKBONES = {
402
+ 'mobilenetv2': MobileNetV2Backbone,
403
+ }
404
+
405
+
406
+ # ------------------------------------------------------------------------------
407
+ # MODNet Basic Modules
408
+ # ------------------------------------------------------------------------------
409
+
410
+ class IBNorm(nn.Module):
411
+ """ Combine Instance Norm and Batch Norm into One Layer
412
+ """
413
+
414
+ def __init__(self, in_channels):
415
+ super(IBNorm, self).__init__()
416
+ in_channels = in_channels
417
+ self.bnorm_channels = int(in_channels / 2)
418
+ self.inorm_channels = in_channels - self.bnorm_channels
419
+
420
+ self.bnorm = nn.BatchNorm2d(self.bnorm_channels, affine=True)
421
+ self.inorm = nn.InstanceNorm2d(self.inorm_channels, affine=False)
422
+
423
+ def forward(self, x):
424
+ bn_x = self.bnorm(x[:, :self.bnorm_channels, ...].contiguous())
425
+ in_x = self.inorm(x[:, self.bnorm_channels:, ...].contiguous())
426
+
427
+ return torch.cat((bn_x, in_x), 1)
428
+
429
+
430
+ class Conv2dIBNormRelu(nn.Module):
431
+ """ Convolution + IBNorm + ReLU
432
+ """
433
+
434
+ def __init__(self, in_channels, out_channels, kernel_size,
435
+ stride=1, padding=0, dilation=1, groups=1, bias=True,
436
+ with_ibn=True, with_relu=True):
437
+ super(Conv2dIBNormRelu, self).__init__()
438
+
439
+ layers = [
440
+ nn.Conv2d(in_channels, out_channels, kernel_size,
441
+ stride=stride, padding=padding, dilation=dilation,
442
+ groups=groups, bias=bias)
443
+ ]
444
+
445
+ if with_ibn:
446
+ layers.append(IBNorm(out_channels))
447
+ if with_relu:
448
+ layers.append(nn.ReLU(inplace=True))
449
+
450
+ self.layers = nn.Sequential(*layers)
451
+
452
+ def forward(self, x):
453
+ return self.layers(x)
454
+
455
+
456
+ class SEBlock(nn.Module):
457
+ """ SE Block Proposed in https://arxiv.org/pdf/1709.01507.pdf
458
+ """
459
+
460
+ def __init__(self, in_channels, out_channels, reduction=1):
461
+ super(SEBlock, self).__init__()
462
+ self.pool = nn.AdaptiveAvgPool2d(1)
463
+ self.fc = nn.Sequential(
464
+ nn.Linear(in_channels, int(in_channels // reduction), bias=False),
465
+ nn.ReLU(inplace=True),
466
+ nn.Linear(int(in_channels // reduction), out_channels, bias=False),
467
+ nn.Sigmoid()
468
+ )
469
+
470
+ def forward(self, x):
471
+ b, c, _, _ = x.size()
472
+ w = self.pool(x).view(b, c)
473
+ w = self.fc(w).view(b, c, 1, 1)
474
+
475
+ return x * w.expand_as(x)
476
+
477
+
478
+ # ------------------------------------------------------------------------------
479
+ # MODNet Branches
480
+ # ------------------------------------------------------------------------------
481
+
482
+ class LRBranch(nn.Module):
483
+ """ Low Resolution Branch of MODNet
484
+ """
485
+
486
+ def __init__(self, backbone):
487
+ super(LRBranch, self).__init__()
488
+
489
+ enc_channels = backbone.enc_channels
490
+
491
+ self.backbone = backbone
492
+ self.se_block = SEBlock(enc_channels[4], enc_channels[4], reduction=4)
493
+ self.conv_lr16x = Conv2dIBNormRelu(enc_channels[4], enc_channels[3], 5, stride=1, padding=2)
494
+ self.conv_lr8x = Conv2dIBNormRelu(enc_channels[3], enc_channels[2], 5, stride=1, padding=2)
495
+ self.conv_lr = Conv2dIBNormRelu(enc_channels[2], 1, kernel_size=3, stride=2, padding=1, with_ibn=False,
496
+ with_relu=False)
497
+
498
+ def forward(self, img, inference):
499
+ enc_features = self.backbone.forward(img)
500
+ enc2x, enc4x, enc32x = enc_features[0], enc_features[1], enc_features[4]
501
+
502
+ enc32x = self.se_block(enc32x)
503
+ lr16x = F.interpolate(enc32x, scale_factor=2, mode='bilinear', align_corners=False)
504
+ lr16x = self.conv_lr16x(lr16x)
505
+ lr8x = F.interpolate(lr16x, scale_factor=2, mode='bilinear', align_corners=False)
506
+ lr8x = self.conv_lr8x(lr8x)
507
+
508
+ pred_semantic = None
509
+ if not inference:
510
+ lr = self.conv_lr(lr8x)
511
+ pred_semantic = torch.sigmoid(lr)
512
+
513
+ return pred_semantic, lr8x, [enc2x, enc4x]
514
+
515
+
516
+ class HRBranch(nn.Module):
517
+ """ High Resolution Branch of MODNet
518
+ """
519
+
520
+ def __init__(self, hr_channels, enc_channels):
521
+ super(HRBranch, self).__init__()
522
+
523
+ self.tohr_enc2x = Conv2dIBNormRelu(enc_channels[0], hr_channels, 1, stride=1, padding=0)
524
+ self.conv_enc2x = Conv2dIBNormRelu(hr_channels + 3, hr_channels, 3, stride=2, padding=1)
525
+
526
+ self.tohr_enc4x = Conv2dIBNormRelu(enc_channels[1], hr_channels, 1, stride=1, padding=0)
527
+ self.conv_enc4x = Conv2dIBNormRelu(2 * hr_channels, 2 * hr_channels, 3, stride=1, padding=1)
528
+
529
+ self.conv_hr4x = nn.Sequential(
530
+ Conv2dIBNormRelu(3 * hr_channels + 3, 2 * hr_channels, 3, stride=1, padding=1),
531
+ Conv2dIBNormRelu(2 * hr_channels, 2 * hr_channels, 3, stride=1, padding=1),
532
+ Conv2dIBNormRelu(2 * hr_channels, hr_channels, 3, stride=1, padding=1),
533
+ )
534
+
535
+ self.conv_hr2x = nn.Sequential(
536
+ Conv2dIBNormRelu(2 * hr_channels, 2 * hr_channels, 3, stride=1, padding=1),
537
+ Conv2dIBNormRelu(2 * hr_channels, hr_channels, 3, stride=1, padding=1),
538
+ Conv2dIBNormRelu(hr_channels, hr_channels, 3, stride=1, padding=1),
539
+ Conv2dIBNormRelu(hr_channels, hr_channels, 3, stride=1, padding=1),
540
+ )
541
+
542
+ self.conv_hr = nn.Sequential(
543
+ Conv2dIBNormRelu(hr_channels + 3, hr_channels, 3, stride=1, padding=1),
544
+ Conv2dIBNormRelu(hr_channels, 1, kernel_size=1, stride=1, padding=0, with_ibn=False, with_relu=False),
545
+ )
546
+
547
+ def forward(self, img, enc2x, enc4x, lr8x, inference):
548
+ img2x = F.interpolate(img, scale_factor=1 / 2, mode='bilinear', align_corners=False)
549
+ img4x = F.interpolate(img, scale_factor=1 / 4, mode='bilinear', align_corners=False)
550
+
551
+ enc2x = self.tohr_enc2x(enc2x)
552
+ hr4x = self.conv_enc2x(torch.cat((img2x, enc2x), dim=1))
553
+
554
+ enc4x = self.tohr_enc4x(enc4x)
555
+ hr4x = self.conv_enc4x(torch.cat((hr4x, enc4x), dim=1))
556
+
557
+ lr4x = F.interpolate(lr8x, scale_factor=2, mode='bilinear', align_corners=False)
558
+ hr4x = self.conv_hr4x(torch.cat((hr4x, lr4x, img4x), dim=1))
559
+
560
+ hr2x = F.interpolate(hr4x, scale_factor=2, mode='bilinear', align_corners=False)
561
+ hr2x = self.conv_hr2x(torch.cat((hr2x, enc2x), dim=1))
562
+
563
+ pred_detail = None
564
+ if not inference:
565
+ hr = F.interpolate(hr2x, scale_factor=2, mode='bilinear', align_corners=False)
566
+ hr = self.conv_hr(torch.cat((hr, img), dim=1))
567
+ pred_detail = torch.sigmoid(hr)
568
+
569
+ return pred_detail, hr2x
570
+
571
+
572
+ class FusionBranch(nn.Module):
573
+ """ Fusion Branch of MODNet
574
+ """
575
+
576
+ def __init__(self, hr_channels, enc_channels):
577
+ super(FusionBranch, self).__init__()
578
+ self.conv_lr4x = Conv2dIBNormRelu(enc_channels[2], hr_channels, 5, stride=1, padding=2)
579
+
580
+ self.conv_f2x = Conv2dIBNormRelu(2 * hr_channels, hr_channels, 3, stride=1, padding=1)
581
+ self.conv_f = nn.Sequential(
582
+ Conv2dIBNormRelu(hr_channels + 3, int(hr_channels / 2), 3, stride=1, padding=1),
583
+ Conv2dIBNormRelu(int(hr_channels / 2), 1, 1, stride=1, padding=0, with_ibn=False, with_relu=False),
584
+ )
585
+
586
+ def forward(self, img, lr8x, hr2x):
587
+ lr4x = F.interpolate(lr8x, scale_factor=2, mode='bilinear', align_corners=False)
588
+ lr4x = self.conv_lr4x(lr4x)
589
+ lr2x = F.interpolate(lr4x, scale_factor=2, mode='bilinear', align_corners=False)
590
+
591
+ f2x = self.conv_f2x(torch.cat((lr2x, hr2x), dim=1))
592
+ f = F.interpolate(f2x, scale_factor=2, mode='bilinear', align_corners=False)
593
+ f = self.conv_f(torch.cat((f, img), dim=1))
594
+ pred_matte = torch.sigmoid(f)
595
+
596
+ return pred_matte
597
+
598
+
599
+ # ------------------------------------------------------------------------------
600
+ # MODNet
601
+ # ------------------------------------------------------------------------------
602
+
603
+ class MODNet(nn.Module):
604
+ """ Architecture of MODNet
605
+ """
606
+
607
+ def __init__(self, in_channels=3, hr_channels=32, backbone_arch='mobilenetv2', backbone_pretrained=False):
608
+ super(MODNet, self).__init__()
609
+
610
+ self.in_channels = in_channels
611
+ self.hr_channels = hr_channels
612
+ self.backbone_arch = backbone_arch
613
+ self.backbone_pretrained = backbone_pretrained
614
+
615
+ self.backbone = SUPPORTED_BACKBONES[self.backbone_arch](self.in_channels)
616
+
617
+ self.lr_branch = LRBranch(self.backbone)
618
+ self.hr_branch = HRBranch(self.hr_channels, self.backbone.enc_channels)
619
+ self.f_branch = FusionBranch(self.hr_channels, self.backbone.enc_channels)
620
+
621
+ for m in self.modules():
622
+ if isinstance(m, nn.Conv2d):
623
+ self._init_conv(m)
624
+ elif isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.InstanceNorm2d):
625
+ self._init_norm(m)
626
+
627
+ if self.backbone_pretrained:
628
+ self.backbone.load_pretrained_ckpt()
629
+
630
+ def forward(self, img, inference):
631
+ pred_semantic, lr8x, [enc2x, enc4x] = self.lr_branch(img, inference)
632
+ pred_detail, hr2x = self.hr_branch(img, enc2x, enc4x, lr8x, inference)
633
+ pred_matte = self.f_branch(img, lr8x, hr2x)
634
+
635
+ return pred_semantic, pred_detail, pred_matte
636
+
637
+ @staticmethod
638
+ def compute_loss(args):
639
+ pred_semantic, pred_detail, pred_matte, image, trimap, gt_matte = args
640
+ semantic_loss, detail_loss, matte_loss = loss_func(pred_semantic, pred_detail, pred_matte,
641
+ image, trimap, gt_matte)
642
+ loss = semantic_loss + detail_loss + matte_loss
643
+ return matte_loss, loss
644
+
645
+ def freeze_norm(self):
646
+ norm_types = [nn.BatchNorm2d, nn.InstanceNorm2d]
647
+ for m in self.modules():
648
+ for n in norm_types:
649
+ if isinstance(m, n):
650
+ m.eval()
651
+ continue
652
+
653
+ def _init_conv(self, conv):
654
+ nn.init.kaiming_uniform_(
655
+ conv.weight, a=0, mode='fan_in', nonlinearity='relu')
656
+ if conv.bias is not None:
657
+ nn.init.constant_(conv.bias, 0)
658
+
659
+ def _init_norm(self, norm):
660
+ if norm.weight is not None:
661
+ nn.init.constant_(norm.weight, 1)
662
+ nn.init.constant_(norm.bias, 0)
663
+
664
+ def _apply(self, fn):
665
+ super(MODNet, self)._apply(fn)
666
+ blurer._apply(fn) # let blurer's device same as modnet
667
+ return self
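A minimal MODNet inference sketch with random weights and an assumed 512x512 input; with inference=True the semantic and detail heads are skipped, so only the fused matte is returned:

# Illustrative sketch: the input size should be divisible by 32 so the
# MobileNetV2 strides and the branch upsampling line up cleanly.
import torch

modnet = MODNet(in_channels=3, hr_channels=32, backbone_pretrained=False).eval()
with torch.no_grad():
    img = torch.randn(1, 3, 512, 512)
    _, _, pred_matte = modnet(img, inference=True)   # pred_matte: (1, 1, 512, 512)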
animeinsseg/models/animeseg_refine/u2net.py ADDED
@@ -0,0 +1,228 @@
1
+ # Code is borrowed from
2
+ # https://github.com/xuebinqin/U-2-Net/blob/master/model/u2net_refactor.py
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ import math
8
+
9
+ __all__ = ['U2NET_full', 'U2NET_full2', 'U2NET_lite', 'U2NET_lite2', "U2NET"]
10
+
11
+ bce_loss = nn.BCEWithLogitsLoss(reduction='mean')
12
+
13
+
14
+ def _upsample_like(x, size):
15
+ return F.interpolate(x, size=size, mode='bilinear', align_corners=False)
16
+
17
+
18
+ def _size_map(x, height):
19
+ # {height: size} for Upsample
20
+ size = list(x.shape[-2:])
21
+ sizes = {}
22
+ for h in range(1, height):
23
+ sizes[h] = size
24
+ size = [math.ceil(w / 2) for w in size]
25
+ return sizes
26
+
27
+
28
+ class REBNCONV(nn.Module):
29
+ def __init__(self, in_ch=3, out_ch=3, dilate=1):
30
+ super(REBNCONV, self).__init__()
31
+
32
+ self.conv_s1 = nn.Conv2d(in_ch, out_ch, 3, padding=1 * dilate, dilation=1 * dilate)
33
+ self.bn_s1 = nn.BatchNorm2d(out_ch)
34
+ self.relu_s1 = nn.ReLU(inplace=True)
35
+
36
+ def forward(self, x):
37
+ return self.relu_s1(self.bn_s1(self.conv_s1(x)))
38
+
39
+
40
+ class RSU(nn.Module):
41
+ def __init__(self, name, height, in_ch, mid_ch, out_ch, dilated=False):
42
+ super(RSU, self).__init__()
43
+ self.name = name
44
+ self.height = height
45
+ self.dilated = dilated
46
+ self._make_layers(height, in_ch, mid_ch, out_ch, dilated)
47
+
48
+ def forward(self, x):
49
+ sizes = _size_map(x, self.height)
50
+ x = self.rebnconvin(x)
51
+
52
+ # U-Net like symmetric encoder-decoder structure
53
+ def unet(x, height=1):
54
+ if height < self.height:
55
+ x1 = getattr(self, f'rebnconv{height}')(x)
56
+ if not self.dilated and height < self.height - 1:
57
+ x2 = unet(getattr(self, 'downsample')(x1), height + 1)
58
+ else:
59
+ x2 = unet(x1, height + 1)
60
+
61
+ x = getattr(self, f'rebnconv{height}d')(torch.cat((x2, x1), 1))
62
+ return _upsample_like(x, sizes[height - 1]) if not self.dilated and height > 1 else x
63
+ else:
64
+ return getattr(self, f'rebnconv{height}')(x)
65
+
66
+ return x + unet(x)
67
+
68
+ def _make_layers(self, height, in_ch, mid_ch, out_ch, dilated=False):
69
+ self.add_module('rebnconvin', REBNCONV(in_ch, out_ch))
70
+ self.add_module('downsample', nn.MaxPool2d(2, stride=2, ceil_mode=True))
71
+
72
+ self.add_module(f'rebnconv1', REBNCONV(out_ch, mid_ch))
73
+ self.add_module(f'rebnconv1d', REBNCONV(mid_ch * 2, out_ch))
74
+
75
+ for i in range(2, height):
76
+ dilate = 1 if not dilated else 2 ** (i - 1)
77
+ self.add_module(f'rebnconv{i}', REBNCONV(mid_ch, mid_ch, dilate=dilate))
78
+ self.add_module(f'rebnconv{i}d', REBNCONV(mid_ch * 2, mid_ch, dilate=dilate))
79
+
80
+ dilate = 2 if not dilated else 2 ** (height - 1)
81
+ self.add_module(f'rebnconv{height}', REBNCONV(mid_ch, mid_ch, dilate=dilate))
82
+
83
+
84
+ class U2NET(nn.Module):
85
+ def __init__(self, cfgs, out_ch):
86
+ super(U2NET, self).__init__()
87
+ self.out_ch = out_ch
88
+ self._make_layers(cfgs)
89
+
90
+ def forward(self, x):
91
+ sizes = _size_map(x, self.height)
92
+ maps = [] # storage for maps
93
+
94
+ # side saliency map
95
+ def unet(x, height=1):
96
+ if height < 6:
97
+ x1 = getattr(self, f'stage{height}')(x)
98
+ x2 = unet(getattr(self, 'downsample')(x1), height + 1)
99
+ x = getattr(self, f'stage{height}d')(torch.cat((x2, x1), 1))
100
+ side(x, height)
101
+ return _upsample_like(x, sizes[height - 1]) if height > 1 else x
102
+ else:
103
+ x = getattr(self, f'stage{height}')(x)
104
+ side(x, height)
105
+ return _upsample_like(x, sizes[height - 1])
106
+
107
+ def side(x, h):
108
+ # side output saliency map (before sigmoid)
109
+ x = getattr(self, f'side{h}')(x)
110
+ x = _upsample_like(x, sizes[1])
111
+ maps.append(x)
112
+
113
+ def fuse():
114
+ # fuse saliency probability maps
115
+ maps.reverse()
116
+ x = torch.cat(maps, 1)
117
+ x = getattr(self, 'outconv')(x)
118
+ maps.insert(0, x)
119
+ # return [torch.sigmoid(x) for x in maps]
120
+ return [x for x in maps]
121
+
122
+ unet(x)
123
+ maps = fuse()
124
+ return maps
125
+
126
+ @staticmethod
127
+ def compute_loss(args):
128
+ preds, labels_v = args
129
+ d0, d1, d2, d3, d4, d5, d6 = preds
130
+ loss0 = bce_loss(d0, labels_v)
131
+ loss1 = bce_loss(d1, labels_v)
132
+ loss2 = bce_loss(d2, labels_v)
133
+ loss3 = bce_loss(d3, labels_v)
134
+ loss4 = bce_loss(d4, labels_v)
135
+ loss5 = bce_loss(d5, labels_v)
136
+ loss6 = bce_loss(d6, labels_v)
137
+
138
+ loss = loss0 + loss1 + loss2 + loss3 + loss4 + loss5 + loss6
139
+
140
+ return loss0, loss
141
+
142
+ def _make_layers(self, cfgs):
143
+ self.height = int((len(cfgs) + 1) / 2)
144
+ self.add_module('downsample', nn.MaxPool2d(2, stride=2, ceil_mode=True))
145
+ for k, v in cfgs.items():
146
+ # build rsu block
147
+ self.add_module(k, RSU(v[0], *v[1]))
148
+ if v[2] > 0:
149
+ # build side layer
150
+ self.add_module(f'side{v[0][-1]}', nn.Conv2d(v[2], self.out_ch, 3, padding=1))
151
+ # build fuse layer
152
+ self.add_module('outconv', nn.Conv2d(int(self.height * self.out_ch), self.out_ch, 1))
153
+
154
+
155
+ def U2NET_full():
156
+ full = {
157
+ # cfgs for building RSUs and sides
158
+ # {stage : [name, (height(L), in_ch, mid_ch, out_ch, dilated), side]}
159
+ 'stage1': ['En_1', (7, 3, 32, 64), -1],
160
+ 'stage2': ['En_2', (6, 64, 32, 128), -1],
161
+ 'stage3': ['En_3', (5, 128, 64, 256), -1],
162
+ 'stage4': ['En_4', (4, 256, 128, 512), -1],
163
+ 'stage5': ['En_5', (4, 512, 256, 512, True), -1],
164
+ 'stage6': ['En_6', (4, 512, 256, 512, True), 512],
165
+ 'stage5d': ['De_5', (4, 1024, 256, 512, True), 512],
166
+ 'stage4d': ['De_4', (4, 1024, 128, 256), 256],
167
+ 'stage3d': ['De_3', (5, 512, 64, 128), 128],
168
+ 'stage2d': ['De_2', (6, 256, 32, 64), 64],
169
+ 'stage1d': ['De_1', (7, 128, 16, 64), 64],
170
+ }
171
+ return U2NET(cfgs=full, out_ch=1)
172
+
173
+
174
+ def U2NET_full2():
175
+ full = {
176
+ # cfgs for building RSUs and sides
177
+ # {stage : [name, (height(L), in_ch, mid_ch, out_ch, dilated), side]}
178
+ 'stage1': ['En_1', (8, 3, 32, 64), -1],
179
+ 'stage2': ['En_2', (7, 64, 32, 128), -1],
180
+ 'stage3': ['En_3', (6, 128, 64, 256), -1],
181
+ 'stage4': ['En_4', (5, 256, 128, 512), -1],
182
+ 'stage5': ['En_5', (5, 512, 256, 512, True), -1],
183
+ 'stage6': ['En_6', (5, 512, 256, 512, True), 512],
184
+ 'stage5d': ['De_5', (5, 1024, 256, 512, True), 512],
185
+ 'stage4d': ['De_4', (5, 1024, 128, 256), 256],
186
+ 'stage3d': ['De_3', (6, 512, 64, 128), 128],
187
+ 'stage2d': ['De_2', (7, 256, 32, 64), 64],
188
+ 'stage1d': ['De_1', (8, 128, 16, 64), 64],
189
+ }
190
+ return U2NET(cfgs=full, out_ch=1)
191
+
192
+
193
+ def U2NET_lite():
194
+ lite = {
195
+ # cfgs for building RSUs and sides
196
+ # {stage : [name, (height(L), in_ch, mid_ch, out_ch, dilated), side]}
197
+ 'stage1': ['En_1', (7, 3, 16, 64), -1],
198
+ 'stage2': ['En_2', (6, 64, 16, 64), -1],
199
+ 'stage3': ['En_3', (5, 64, 16, 64), -1],
200
+ 'stage4': ['En_4', (4, 64, 16, 64), -1],
201
+ 'stage5': ['En_5', (4, 64, 16, 64, True), -1],
202
+ 'stage6': ['En_6', (4, 64, 16, 64, True), 64],
203
+ 'stage5d': ['De_5', (4, 128, 16, 64, True), 64],
204
+ 'stage4d': ['De_4', (4, 128, 16, 64), 64],
205
+ 'stage3d': ['De_3', (5, 128, 16, 64), 64],
206
+ 'stage2d': ['De_2', (6, 128, 16, 64), 64],
207
+ 'stage1d': ['De_1', (7, 128, 16, 64), 64],
208
+ }
209
+ return U2NET(cfgs=lite, out_ch=1)
210
+
211
+
212
+ def U2NET_lite2():
213
+ lite = {
214
+ # cfgs for building RSUs and sides
215
+ # {stage : [name, (height(L), in_ch, mid_ch, out_ch, dilated), side]}
216
+ 'stage1': ['En_1', (8, 3, 16, 64), -1],
217
+ 'stage2': ['En_2', (7, 64, 16, 64), -1],
218
+ 'stage3': ['En_3', (6, 64, 16, 64), -1],
219
+ 'stage4': ['En_4', (5, 64, 16, 64), -1],
220
+ 'stage5': ['En_5', (5, 64, 16, 64, True), -1],
221
+ 'stage6': ['En_6', (5, 64, 16, 64, True), 64],
222
+ 'stage5d': ['De_5', (5, 128, 16, 64, True), 64],
223
+ 'stage4d': ['De_4', (5, 128, 16, 64), 64],
224
+ 'stage3d': ['De_3', (6, 128, 16, 64), 64],
225
+ 'stage2d': ['De_2', (7, 128, 16, 64), 64],
226
+ 'stage1d': ['De_1', (8, 128, 16, 64), 64],
227
+ }
228
+ return U2NET(cfgs=lite, out_ch=1)
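The four factory functions differ only in the RSU configuration dict ({stage: [name, (height, in_ch, mid_ch, out_ch, dilated), side_channels]}). A minimal sketch of running the lite variant with an assumed input size; the forward pass returns the fused map first, followed by the six side outputs, all at input resolution and before sigmoid:

# Illustrative sketch with random weights.
import torch

net = U2NET_lite().eval()
with torch.no_grad():
    x = torch.randn(1, 3, 320, 320)
    maps = net(x)                    # 7 tensors, each (1, 1, 320, 320)
    fused = torch.sigmoid(maps[0])   # fused saliency probability map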
animeinsseg/models/rtmdet_inshead_custom.py ADDED
@@ -0,0 +1,370 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import copy
3
+ import math
4
+ from typing import List, Optional, Tuple
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from mmcv.cnn import ConvModule, is_norm
10
+ from mmcv.ops import batched_nms
11
+ from mmengine.model import (BaseModule, bias_init_with_prob, constant_init,
12
+ normal_init)
13
+ from mmengine.structures import InstanceData
14
+ from torch import Tensor
15
+
16
+ from mmdet.models.layers.transformer import inverse_sigmoid
17
+ from mmdet.models.utils import (filter_scores_and_topk, multi_apply,
18
+ select_single_mlvl, sigmoid_geometric_mean)
19
+ from mmdet.registry import MODELS
20
+ from mmdet.structures.bbox import (cat_boxes, distance2bbox, get_box_tensor,
21
+ get_box_wh, scale_boxes)
22
+ from mmdet.utils import ConfigType, InstanceList, OptInstanceList, reduce_mean
23
+ from mmdet.models.dense_heads.rtmdet_head import RTMDetHead
24
+ from mmdet.models.dense_heads.rtmdet_ins_head import RTMDetInsHead, RTMDetInsSepBNHead, MaskFeatModule
25
+
26
+ from mmdet.utils import AvoidCUDAOOM
27
+
28
+
29
+
30
+ def sthgoeswrong(logits):
31
+ return torch.any(torch.isnan(logits)) or torch.any(torch.isinf(logits))
32
+
33
+ from time import time
34
+
35
+ @MODELS.register_module(force=True)
36
+ class RTMDetInsHeadCustom(RTMDetInsHead):
37
+
38
+ def loss_by_feat(self,
39
+ cls_scores: List[Tensor],
40
+ bbox_preds: List[Tensor],
41
+ kernel_preds: List[Tensor],
42
+ mask_feat: Tensor,
43
+ batch_gt_instances: InstanceList,
44
+ batch_img_metas: List[dict],
45
+ batch_gt_instances_ignore: OptInstanceList = None):
46
+ """Compute losses of the head.
47
+
48
+ Args:
49
+ cls_scores (list[Tensor]): Box scores for each scale level
50
+ Has shape (N, num_anchors * num_classes, H, W)
51
+ bbox_preds (list[Tensor]): Decoded box for each scale
52
+ level with shape (N, num_anchors * 4, H, W) in
53
+ [tl_x, tl_y, br_x, br_y] format.
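+ kernel_preds (list[Tensor]): Dynamic kernel parameters for each
+ scale level with shape (N, num_gen_params, H, W).
+ mask_feat (Tensor): Mask prototype features shared by all
+ instances, with shape (N, num_prototypes, H, W).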
54
+ batch_gt_instances (list[:obj:`InstanceData`]): Batch of
55
+ gt_instance. It usually includes ``bboxes`` and ``labels``
56
+ attributes.
57
+ batch_img_metas (list[dict]): Meta information of each image, e.g.,
58
+ image size, scaling factor, etc.
59
+ batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
60
+ Batch of gt_instances_ignore. It includes ``bboxes`` attribute
61
+ data that is ignored during training and testing.
62
+ Defaults to None.
63
+
64
+ Returns:
65
+ dict[str, Tensor]: A dictionary of loss components.
66
+ """
67
+ num_imgs = len(batch_img_metas)
68
+ featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
69
+ assert len(featmap_sizes) == self.prior_generator.num_levels
70
+
71
+ device = cls_scores[0].device
72
+ anchor_list, valid_flag_list = self.get_anchors(
73
+ featmap_sizes, batch_img_metas, device=device)
74
+ flatten_cls_scores = torch.cat([
75
+ cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
76
+ self.cls_out_channels)
77
+ for cls_score in cls_scores
78
+ ], 1)
79
+ flatten_kernels = torch.cat([
80
+ kernel_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1,
81
+ self.num_gen_params)
82
+ for kernel_pred in kernel_preds
83
+ ], 1)
84
+ decoded_bboxes = []
85
+ for anchor, bbox_pred in zip(anchor_list[0], bbox_preds):
86
+ anchor = anchor.reshape(-1, 4)
87
+ bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
88
+ bbox_pred = distance2bbox(anchor, bbox_pred)
89
+ decoded_bboxes.append(bbox_pred)
90
+
91
+ flatten_bboxes = torch.cat(decoded_bboxes, 1)
92
+ for gt_instances in batch_gt_instances:
93
+ gt_instances.masks = gt_instances.masks.to_tensor(
94
+ dtype=torch.bool, device=device)
95
+
96
+ cls_reg_targets = self.get_targets(
97
+ flatten_cls_scores,
98
+ flatten_bboxes,
99
+ anchor_list,
100
+ valid_flag_list,
101
+ batch_gt_instances,
102
+ batch_img_metas,
103
+ batch_gt_instances_ignore=batch_gt_instances_ignore)
104
+ (anchor_list, labels_list, label_weights_list, bbox_targets_list,
105
+ assign_metrics_list, sampling_results_list) = cls_reg_targets
106
+
107
+ losses_cls, losses_bbox,\
108
+ cls_avg_factors, bbox_avg_factors = multi_apply(
109
+ self.loss_by_feat_single,
110
+ cls_scores,
111
+ decoded_bboxes,
112
+ labels_list,
113
+ label_weights_list,
114
+ bbox_targets_list,
115
+ assign_metrics_list,
116
+ self.prior_generator.strides)
117
+
118
+ cls_avg_factor = reduce_mean(sum(cls_avg_factors)).clamp_(min=1).item()
119
+ losses_cls = list(map(lambda x: x / cls_avg_factor, losses_cls))
120
+
121
+ bbox_avg_factor = reduce_mean(
122
+ sum(bbox_avg_factors)).clamp_(min=1).item()
123
+ losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox))
124
+
125
+ loss_mask = self.loss_mask_by_feat(mask_feat, flatten_kernels,
126
+ sampling_results_list,
127
+ batch_gt_instances)
128
+ loss = dict(
129
+ loss_cls=losses_cls, loss_bbox=losses_bbox, loss_mask=loss_mask)
130
+
131
+ return loss
132
+
133
+
134
+ def _mask_predict_by_feat_single(self, mask_feat: Tensor, kernels: Tensor,
135
+ priors: Tensor) -> Tensor:
136
+
137
+ ori_maskfeat = mask_feat
138
+
139
+ num_inst = priors.shape[0]
140
+ h, w = mask_feat.size()[-2:]
141
+ if num_inst < 1:
142
+ return torch.empty(
143
+ size=(num_inst, h, w),
144
+ dtype=mask_feat.dtype,
145
+ device=mask_feat.device)
146
+ if len(mask_feat.shape) < 4:
147
+ mask_feat = mask_feat.unsqueeze(0)  # assign the result; unsqueeze is not in-place
148
+
149
+ coord = self.prior_generator.single_level_grid_priors(
150
+ (h, w), level_idx=0, device=mask_feat.device).reshape(1, -1, 2)
151
+ num_inst = priors.shape[0]
152
+ points = priors[:, :2].reshape(-1, 1, 2)
153
+ strides = priors[:, 2:].reshape(-1, 1, 2)
154
+ relative_coord = (points - coord).permute(0, 2, 1) / (
155
+ strides[..., 0].reshape(-1, 1, 1) * 8)
156
+ relative_coord = relative_coord.reshape(num_inst, 2, h, w)
157
+
158
+ mask_feat = torch.cat(
159
+ [relative_coord,
160
+ mask_feat.repeat(num_inst, 1, 1, 1)], dim=1)
161
+ weights, biases = self.parse_dynamic_params(kernels)
162
+
163
+ fp16_used = weights[0].dtype == torch.float16
164
+
165
+ n_layers = len(weights)
166
+ x = mask_feat.reshape(1, -1, h, w)
167
+ for i, (weight, bias) in enumerate(zip(weights, biases)):
168
+ with torch.cuda.amp.autocast(enabled=False):
169
+ if fp16_used:
170
+ weight = weight.to(torch.float32)
171
+ bias = bias.to(torch.float32)
172
+ x = F.conv2d(
173
+ x, weight, bias=bias, stride=1, padding=0, groups=num_inst)
174
+ if i < n_layers - 1:
175
+ x = F.relu(x)
176
+
177
+ if fp16_used:
178
+ x = torch.clip(x, -8192, 8192)
179
+ if sthgoeswrong(x):
180
+ torch.save({'mask_feat': ori_maskfeat, 'kernels': kernels, 'priors': priors}, 'maskhead_nan_input.pt')
181
+ raise Exception('Mask Head NaN')
182
+
183
+ x = x.reshape(num_inst, h, w)
184
+ return x
185
+
186
+ def loss_mask_by_feat(self, mask_feats: Tensor, flatten_kernels: Tensor,
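_mask_predict_by_feat_single evaluates every instance's private dynamic kernels in a single call by stacking the per-instance feature copies into the channel dimension and using a grouped convolution; a stripped-down sketch of that trick with hypothetical shapes (independent of the head's parse_dynamic_params layout):

# Illustrative sketch: one grouped 1x1 convolution applies a different kernel
# set to each instance's copy of the shared features.
import torch
import torch.nn.functional as F

num_inst, c, h, w = 3, 10, 40, 40
feats = torch.randn(1, num_inst * c, h, w)      # per-instance feature copies
weight = torch.randn(num_inst * 8, c, 1, 1)     # 8 output channels per instance
bias = torch.randn(num_inst * 8)
out = F.conv2d(feats, weight, bias=bias, groups=num_inst)   # (1, num_inst * 8, 40, 40)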
187
+ sampling_results_list: list,
188
+ batch_gt_instances: InstanceList) -> Tensor:
189
+ batch_pos_mask_logits = []
190
+ pos_gt_masks = []
191
+ ignore_masks = []
192
+ for idx, (mask_feat, kernels, sampling_results,
193
+ gt_instances) in enumerate(
194
+ zip(mask_feats, flatten_kernels, sampling_results_list,
195
+ batch_gt_instances)):
196
+ pos_priors = sampling_results.pos_priors
197
+ pos_inds = sampling_results.pos_inds
198
+ pos_kernels = kernels[pos_inds] # n_pos, num_gen_params
199
+ pos_mask_logits = self._mask_predict_by_feat_single(
200
+ mask_feat, pos_kernels, pos_priors)
201
+ if gt_instances.masks.numel() == 0:
202
+ gt_masks = torch.empty_like(gt_instances.masks)
203
+ if gt_masks.shape[0] > 0:
204
+ ignore = torch.zeros(gt_masks.shape[0], dtype=torch.bool).to(device=gt_masks.device)
205
+ ignore_masks.append(ignore)
206
+ else:
207
+ gt_masks = gt_instances.masks[
208
+ sampling_results.pos_assigned_gt_inds, :]
209
+ ignore_masks.append(gt_instances.ignore_mask[sampling_results.pos_assigned_gt_inds])
210
+ batch_pos_mask_logits.append(pos_mask_logits)
211
+ pos_gt_masks.append(gt_masks)
212
+
213
+ pos_gt_masks = torch.cat(pos_gt_masks, 0)
214
+ batch_pos_mask_logits = torch.cat(batch_pos_mask_logits, 0)
215
+ ignore_masks = torch.logical_not(torch.cat(ignore_masks, 0))
216
+
217
+ pos_gt_masks = pos_gt_masks[ignore_masks]
218
+ batch_pos_mask_logits = batch_pos_mask_logits[ignore_masks]
219
+
220
+
221
+ # avg_factor
222
+ num_pos = batch_pos_mask_logits.shape[0]
223
+ num_pos = reduce_mean(mask_feats.new_tensor([num_pos
224
+ ])).clamp_(min=1).item()
225
+
226
+ if batch_pos_mask_logits.shape[0] == 0:
227
+ return mask_feats.sum() * 0
228
+
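+ # bring both sides to the same resolution: upsample the predicted logits, subsample the GT masks at mask_loss_stride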
229
+ scale = self.prior_generator.strides[0][0] // self.mask_loss_stride
230
+ # upsample pred masks
231
+ batch_pos_mask_logits = F.interpolate(
232
+ batch_pos_mask_logits.unsqueeze(0),
233
+ scale_factor=scale,
234
+ mode='bilinear',
235
+ align_corners=False).squeeze(0)
236
+ # downsample gt masks
237
+ pos_gt_masks = pos_gt_masks[:, self.mask_loss_stride //
238
+ 2::self.mask_loss_stride,
239
+ self.mask_loss_stride //
240
+ 2::self.mask_loss_stride]
241
+
242
+ loss_mask = self.loss_mask(
243
+ batch_pos_mask_logits,
244
+ pos_gt_masks,
245
+ weight=None,
246
+ avg_factor=num_pos)
247
+
248
+ return loss_mask
249
+
250
+
251
+ @MODELS.register_module()
252
+ class RTMDetInsSepBNHeadCustom(RTMDetInsSepBNHead):
253
+ def _mask_predict_by_feat_single(self, mask_feat: Tensor, kernels: Tensor,
254
+ priors: Tensor) -> Tensor:
255
+
256
+ ori_maskfeat = mask_feat
257
+
258
+ num_inst = priors.shape[0]
259
+ h, w = mask_feat.size()[-2:]
260
+ if num_inst < 1:
261
+ return torch.empty(
262
+ size=(num_inst, h, w),
263
+ dtype=mask_feat.dtype,
264
+ device=mask_feat.device)
265
+ if len(mask_feat.shape) < 4:
266
+ mask_feat = mask_feat.unsqueeze(0)
267
+
268
+ coord = self.prior_generator.single_level_grid_priors(
269
+ (h, w), level_idx=0, device=mask_feat.device).reshape(1, -1, 2)
270
+ num_inst = priors.shape[0]
271
+ points = priors[:, :2].reshape(-1, 1, 2)
272
+ strides = priors[:, 2:].reshape(-1, 1, 2)
273
+ relative_coord = (points - coord).permute(0, 2, 1) / (
274
+ strides[..., 0].reshape(-1, 1, 1) * 8)
275
+ relative_coord = relative_coord.reshape(num_inst, 2, h, w)
276
+
277
+ mask_feat = torch.cat(
278
+ [relative_coord,
279
+ mask_feat.repeat(num_inst, 1, 1, 1)], dim=1)
280
+ weights, biases = self.parse_dynamic_params(kernels)
281
+
282
+ fp16_used = weights[0].dtype == torch.float16
283
+
284
+ n_layers = len(weights)
285
+ x = mask_feat.reshape(1, -1, h, w)
286
+ for i, (weight, bias) in enumerate(zip(weights, biases)):
287
+ with torch.cuda.amp.autocast(enabled=False):
288
+ if fp16_used:
289
+ weight = weight.to(torch.float32)
290
+ bias = bias.to(torch.float32)
291
+ x = F.conv2d(
292
+ x, weight, bias=bias, stride=1, padding=0, groups=num_inst)
293
+ if i < n_layers - 1:
294
+ x = F.relu(x)
295
+
296
+ if fp16_used:
297
+ x = torch.clip(x, -8192, 8192)
298
+ if sthgoeswrong(x):
299
+ torch.save({'mask_feat': ori_maskfeat, 'kernels': kernels, 'priors': priors}, 'maskhead_nan_input.pt')
300
+ raise Exception('Mask Head NaN')
301
+
302
+ x = x.reshape(num_inst, h, w)
303
+ return x
304
+
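+ # mmdet helper: on CUDA OOM it empties the cache and retries, then retries with fp16 inputs, and finally falls back to CPU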
305
+ @AvoidCUDAOOM.retry_if_cuda_oom
306
+ def loss_mask_by_feat(self, mask_feats: Tensor, flatten_kernels: Tensor,
307
+ sampling_results_list: list,
308
+ batch_gt_instances: InstanceList) -> Tensor:
309
+ batch_pos_mask_logits = []
310
+ pos_gt_masks = []
311
+ ignore_masks = []
312
+ for idx, (mask_feat, kernels, sampling_results,
313
+ gt_instances) in enumerate(
314
+ zip(mask_feats, flatten_kernels, sampling_results_list,
315
+ batch_gt_instances)):
316
+ pos_priors = sampling_results.pos_priors
317
+ pos_inds = sampling_results.pos_inds
318
+ pos_kernels = kernels[pos_inds] # n_pos, num_gen_params
319
+ pos_mask_logits = self._mask_predict_by_feat_single(
320
+ mask_feat, pos_kernels, pos_priors)
321
+ if gt_instances.masks.numel() == 0:
322
+ gt_masks = torch.empty_like(gt_instances.masks)
323
+ # if gt_masks.shape[0] > 0:
324
+ # ignore = torch.zeros(gt_masks.shape[0], dtype=torch.bool).to(device=gt_masks.device)
325
+ # ignore_masks.append(ignore)
326
+ else:
327
+ msk = torch.logical_not(gt_instances.ignore_mask[sampling_results.pos_assigned_gt_inds])
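+ # unlike the head above, ignored GT instances are filtered out per image here, before concatenation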
328
+ gt_masks = gt_instances.masks[
329
+ sampling_results.pos_assigned_gt_inds, :][msk]
330
+ pos_mask_logits = pos_mask_logits[msk]
331
+ # ignore_masks.append(gt_instances.ignore_mask[sampling_results.pos_assigned_gt_inds])
332
+ batch_pos_mask_logits.append(pos_mask_logits)
333
+ pos_gt_masks.append(gt_masks)
334
+
335
+ pos_gt_masks = torch.cat(pos_gt_masks, 0)
336
+ batch_pos_mask_logits = torch.cat(batch_pos_mask_logits, 0)
337
+ # ignore_masks = torch.logical_not(torch.cat(ignore_masks, 0))
338
+
339
+ # pos_gt_masks = pos_gt_masks[ignore_masks]
340
+ # batch_pos_mask_logits = batch_pos_mask_logits[ignore_masks]
341
+
342
+
343
+ # avg_factor
344
+ num_pos = batch_pos_mask_logits.shape[0]
345
+ num_pos = reduce_mean(mask_feats.new_tensor([num_pos
346
+ ])).clamp_(min=1).item()
347
+
348
+ if batch_pos_mask_logits.shape[0] == 0:
349
+ return mask_feats.sum() * 0
350
+
351
+ scale = self.prior_generator.strides[0][0] // self.mask_loss_stride
352
+ # upsample pred masks
353
+ batch_pos_mask_logits = F.interpolate(
354
+ batch_pos_mask_logits.unsqueeze(0),
355
+ scale_factor=scale,
356
+ mode='bilinear',
357
+ align_corners=False).squeeze(0)
358
+ # downsample gt masks
359
+ pos_gt_masks = pos_gt_masks[:, self.mask_loss_stride //
360
+ 2::self.mask_loss_stride,
361
+ self.mask_loss_stride //
362
+ 2::self.mask_loss_stride]
363
+
364
+ loss_mask = self.loss_mask(
365
+ batch_pos_mask_logits,
366
+ pos_gt_masks,
367
+ weight=None,
368
+ avg_factor=num_pos)
369
+
370
+ return loss_mask
app.py ADDED
@@ -0,0 +1,67 @@
1
+ import gradio as gr
2
+
3
+ import cv2
4
+ from PIL import Image
5
+ import numpy as np
6
+
7
+ from animeinsseg import AnimeInsSeg, AnimeInstances
8
+ from animeinsseg.anime_instances import get_color
9
+
10
+ import os
11
+
12
+ if not os.path.exists("models"):
13
+ os.mkdir("models")
14
+
15
+ os.system("huggingface-cli lfs-enable-largefiles .")
16
+ os.system("git clone https://huggingface.co/dreMaz/AnimeInstanceSegmentation models/AnimeInstanceSegmentation")
17
+
18
+ ckpt = r'models/AnimeInstanceSegmentation/rtmdetl_e60.ckpt'
19
+
20
+ mask_thres = 0.3
21
+ instance_thres = 0.3
22
+ refine_kwargs = {'refine_method': 'refinenet_isnet'} # set to None if not using refinenet
23
+ # refine_kwargs = None
24
+
25
+ net = AnimeInsSeg(ckpt, mask_thr=mask_thres, refine_kwargs=refine_kwargs)
26
+
27
+ def fn(image):
28
+ img = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
29
+ instances: AnimeInstances = net.infer(
30
+ img,
31
+ output_type='numpy',
32
+ pred_score_thr=instance_thres
33
+ )
34
+
35
+ drawed = img.copy()
36
+ im_h, im_w = img.shape[:2]
37
+
38
+ # instances.bboxes, instances.masks will be None, None if no obj is detected
39
+ if instances.bboxes is None or instances.masks is None:
+ return Image.fromarray(drawed[..., ::-1])
40
+ for ii, (xywh, mask) in enumerate(zip(instances.bboxes, instances.masks)):
41
+ color = get_color(ii)
42
+
43
+ mask_alpha = 0.5
44
+ linewidth = max(round(sum(img.shape) / 2 * 0.003), 2)
45
+
46
+ # draw bbox
47
+ p1, p2 = (int(xywh[0]), int(xywh[1])), (int(xywh[2] + xywh[0]), int(xywh[3] + xywh[1]))
48
+ cv2.rectangle(drawed, p1, p2, color, thickness=linewidth, lineType=cv2.LINE_AA)
49
+
50
+ # draw mask
51
+ p = mask.astype(np.float32)
52
+ blend_mask = np.full((im_h, im_w, 3), color, dtype=np.float32)
53
+ alpha_msk = (mask_alpha * p)[..., None]
54
+ alpha_ori = 1 - alpha_msk
55
+ drawed = drawed * alpha_ori + alpha_msk * blend_mask
56
+
57
+ drawed = drawed.astype(np.uint8)
58
+
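+ # drawed is in OpenCV BGR order; flip channels back to RGB for PIL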
59
+ return Image.fromarray(drawed[..., ::-1])
60
+
61
+ iface = gr.Interface(
62
+ inputs=gr.Image(type="numpy"),
63
+ outputs="Image",
64
+ fn=fn
65
+ )
66
+
67
+ iface.launch()
requirements.txt ADDED
@@ -0,0 +1,15 @@
1
+ einops
2
+ imageio
3
+ git+https://github.com/cocodataset/panopticapi.git
4
+ pytorch-lightning
5
+ albumentations
6
+ huggingface_hub
7
+
8
+ # For Web UI
9
+ gradio
10
+ torch
11
+ torchvision
12
+ openmim
13
+ mmengine
14
+ mmcv>=2.0.0
15
+ mmdet