diff --git a/densepose/__init__.py b/densepose/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b50a3da91dd0d2a69502af9d5d62f2f4280d973f --- /dev/null +++ b/densepose/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +from .data.datasets import builtin # just to register data +from .converters import builtin as builtin_converters # register converters +from .config import ( + add_densepose_config, + add_densepose_head_config, + add_hrnet_config, + add_dataset_category_config, + add_bootstrap_config, + load_bootstrap_config, +) +from .structures import DensePoseDataRelative, DensePoseList, DensePoseTransformData +from .evaluation import DensePoseCOCOEvaluator +from .modeling.roi_heads import DensePoseROIHeads +from .modeling.test_time_augmentation import ( + DensePoseGeneralizedRCNNWithTTA, + DensePoseDatasetMapperTTA, +) +from .utils.transform import load_from_cfg +from .modeling.hrfpn import build_hrfpn_backbone diff --git a/densepose/config.py b/densepose/config.py new file mode 100644 index 0000000000000000000000000000000000000000..2a06a09c80865ab987773511b2acc71e232b26ac --- /dev/null +++ b/densepose/config.py @@ -0,0 +1,277 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. 
+# pyre-ignore-all-errors + + from detectron2.config import CfgNode as CN + + +def add_dataset_category_config(cfg: CN) -> None: + """ + Add config for additional category-related dataset options + - category whitelisting + - category mapping + """ + _C = cfg + _C.DATASETS.CATEGORY_MAPS = CN(new_allowed=True) + _C.DATASETS.WHITELISTED_CATEGORIES = CN(new_allowed=True) + # class to mesh mapping + _C.DATASETS.CLASS_TO_MESH_NAME_MAPPING = CN(new_allowed=True) + + +def add_evaluation_config(cfg: CN) -> None: + _C = cfg + _C.DENSEPOSE_EVALUATION = CN() + # evaluator type, possible values: + # - "iou": evaluator for models that produce iou data + # - "cse": evaluator for models that produce cse data + _C.DENSEPOSE_EVALUATION.TYPE = "iou" + # storage for DensePose results, possible values: + # - "none": no explicit storage, all the results are stored in the + # dictionary with predictions, memory intensive; + # historically the default storage type + # - "ram": RAM storage, uses per-process RAM storage, which is + # reduced to a single process storage on later stages, + # less memory intensive + # - "file": file storage, uses per-process file-based storage, + # the least memory intensive, but may create bottlenecks + # on file system accesses + _C.DENSEPOSE_EVALUATION.STORAGE = "none" + # minimum threshold for IOU values: the lower its value is, + # the more matches are produced (and the higher the AP score) + _C.DENSEPOSE_EVALUATION.MIN_IOU_THRESHOLD = 0.5 + # Non-distributed inference is slower (at inference time) but can avoid RAM OOM + _C.DENSEPOSE_EVALUATION.DISTRIBUTED_INFERENCE = True + # evaluate mesh alignment based on vertex embeddings, only makes sense in CSE context + _C.DENSEPOSE_EVALUATION.EVALUATE_MESH_ALIGNMENT = False + # meshes to compute mesh alignment for + _C.DENSEPOSE_EVALUATION.MESH_ALIGNMENT_MESH_NAMES = [] + + +def add_bootstrap_config(cfg: CN) -> None: + """Add config for bootstrap datasets and the bootstrap model (weights, device)""" + _C = cfg + _C.BOOTSTRAP_DATASETS = [] + _C.BOOTSTRAP_MODEL = CN() + 
_C.BOOTSTRAP_MODEL.WEIGHTS = "" + _C.BOOTSTRAP_MODEL.DEVICE = "cuda" + + +def get_bootstrap_dataset_config() -> CN: + _C = CN() + _C.DATASET = "" + # ratio used to mix data loaders + _C.RATIO = 0.1 + # image loader + _C.IMAGE_LOADER = CN(new_allowed=True) + _C.IMAGE_LOADER.TYPE = "" + _C.IMAGE_LOADER.BATCH_SIZE = 4 + _C.IMAGE_LOADER.NUM_WORKERS = 4 + _C.IMAGE_LOADER.CATEGORIES = [] + _C.IMAGE_LOADER.MAX_COUNT_PER_CATEGORY = 1_000_000 + _C.IMAGE_LOADER.CATEGORY_TO_CLASS_MAPPING = CN(new_allowed=True) + # inference + _C.INFERENCE = CN() + # batch size for model inputs + _C.INFERENCE.INPUT_BATCH_SIZE = 4 + # batch size to group model outputs + _C.INFERENCE.OUTPUT_BATCH_SIZE = 2 + # sampled data + _C.DATA_SAMPLER = CN(new_allowed=True) + _C.DATA_SAMPLER.TYPE = "" + _C.DATA_SAMPLER.USE_GROUND_TRUTH_CATEGORIES = False + # filter + _C.FILTER = CN(new_allowed=True) + _C.FILTER.TYPE = "" + return _C + + +def load_bootstrap_config(cfg: CN) -> None: + """ + Bootstrap datasets are given as a list of `dict` that are not automatically + converted into CfgNode. 
This method processes all bootstrap dataset entries + and ensures that they are in CfgNode format and comply with the specification + """ + if not cfg.BOOTSTRAP_DATASETS: + return + + bootstrap_datasets_cfgnodes = [] + for dataset_cfg in cfg.BOOTSTRAP_DATASETS: + _C = get_bootstrap_dataset_config().clone() + _C.merge_from_other_cfg(CN(dataset_cfg)) + bootstrap_datasets_cfgnodes.append(_C) + cfg.BOOTSTRAP_DATASETS = bootstrap_datasets_cfgnodes + + +def add_densepose_head_cse_config(cfg: CN) -> None: + """ + Add configuration options for Continuous Surface Embeddings (CSE) + """ + _C = cfg + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE = CN() + # Dimensionality D of the embedding space + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE = 16 + # Embedder specifications for various mesh IDs + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDERS = CN(new_allowed=True) + # normalization coefficient for embedding distances + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDING_DIST_GAUSS_SIGMA = 0.01 + # normalization coefficient for geodesic distances + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.GEODESIC_DIST_GAUSS_SIGMA = 0.01 + # embedding loss weight + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_LOSS_WEIGHT = 0.6 + # embedding loss name, currently the following options are supported: + # - EmbeddingLoss: cross-entropy on vertex labels + # - SoftEmbeddingLoss: cross-entropy on vertex label combined with + # Gaussian penalty on distance between vertices + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_LOSS_NAME = "EmbeddingLoss" + # optimizer hyperparameters + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.FEATURES_LR_FACTOR = 1.0 + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDING_LR_FACTOR = 1.0 + # Shape to shape cycle consistency loss parameters: + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS = CN({"ENABLED": False}) + # shape to shape cycle consistency loss weight + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.WEIGHT = 0.025 + # norm type used for loss computation + 
_C.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.NORM_P = 2 + # normalization term for embedding similarity matrices + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.TEMPERATURE = 0.05 + # maximum number of vertices to include into shape to shape cycle loss + # if negative or zero, all vertices are considered + # if positive, random subset of vertices of given size is considered + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.MAX_NUM_VERTICES = 4936 + # Pixel to shape cycle consistency loss parameters: + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS = CN({"ENABLED": False}) + # pixel to shape cycle consistency loss weight + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.WEIGHT = 0.0001 + # norm type used for loss computation + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.NORM_P = 2 + # map images to all meshes and back (if false, use only gt meshes from the batch) + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.USE_ALL_MESHES_NOT_GT_ONLY = False + # Randomly select at most this number of pixels from every instance + # if negative or zero, all vertices are considered + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.NUM_PIXELS_TO_SAMPLE = 100 + # normalization factor for pixel to pixel distances (higher value = smoother distribution) + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.PIXEL_SIGMA = 5.0 + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.TEMPERATURE_PIXEL_TO_VERTEX = 0.05 + _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.TEMPERATURE_VERTEX_TO_PIXEL = 0.05 + + +def add_densepose_head_config(cfg: CN) -> None: + """ + Add config for densepose head. 
+ """ + _C = cfg + + _C.MODEL.DENSEPOSE_ON = True + + _C.MODEL.ROI_DENSEPOSE_HEAD = CN() + _C.MODEL.ROI_DENSEPOSE_HEAD.NAME = "" + _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS = 8 + # Number of parts used for point labels + _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES = 24 + _C.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL = 4 + _C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM = 512 + _C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL = 3 + _C.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE = 2 + _C.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE = 112 + _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE = "ROIAlignV2" + _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION = 28 + _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO = 2 + _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS = 2 # 15 or 2 + # Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD) + _C.MODEL.ROI_DENSEPOSE_HEAD.FG_IOU_THRESHOLD = 0.7 + # Loss weights for annotation masks.(14 Parts) + _C.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS = 5.0 + # Loss weights for surface parts. (24 Parts) + _C.MODEL.ROI_DENSEPOSE_HEAD.PART_WEIGHTS = 1.0 + # Loss weights for UV regression. 
+ _C.MODEL.ROI_DENSEPOSE_HEAD.POINT_REGRESSION_WEIGHTS = 0.01 + # Coarse segmentation is trained using instance segmentation task data + _C.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS = False + # For Decoder + _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON = True + _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES = 256 + _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS = 256 + _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM = "" + _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE = 4 + # For DeepLab head + _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB = CN() + _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NORM = "GN" + _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NONLOCAL_ON = 0 + # Predictor class name, must be registered in DENSEPOSE_PREDICTOR_REGISTRY + # Some registered predictors: + # "DensePoseChartPredictor": predicts segmentation and UV coordinates for predefined charts + # "DensePoseChartWithConfidencePredictor": predicts segmentation, UV coordinates + # and associated confidences for predefined charts (default) + # "DensePoseEmbeddingWithConfidencePredictor": predicts segmentation, embeddings + # and associated confidences for CSE + _C.MODEL.ROI_DENSEPOSE_HEAD.PREDICTOR_NAME = "DensePoseChartWithConfidencePredictor" + # Loss class name, must be registered in DENSEPOSE_LOSS_REGISTRY + # Some registered losses: + # "DensePoseChartLoss": loss for chart-based models that estimate + # segmentation and UV coordinates + # "DensePoseChartWithConfidenceLoss": loss for chart-based models that estimate + # segmentation, UV coordinates and the corresponding confidences (default) + _C.MODEL.ROI_DENSEPOSE_HEAD.LOSS_NAME = "DensePoseChartWithConfidenceLoss" + # Confidences + # Enable learning UV confidences (variances) along with the actual values + _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE = CN({"ENABLED": False}) + # UV confidence lower bound + _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.EPSILON = 0.01 + # Enable learning segmentation confidences (variances) along with the actual values + 
_C.MODEL.ROI_DENSEPOSE_HEAD.SEGM_CONFIDENCE = CN({"ENABLED": False}) + # Segmentation confidence lower bound + _C.MODEL.ROI_DENSEPOSE_HEAD.SEGM_CONFIDENCE.EPSILON = 0.01 + # Statistical model type for confidence learning, possible values: + # - "iid_iso": statistically independent identically distributed residuals + # with isotropic covariance + # - "indep_aniso": statistically independent residuals with anisotropic + # covariances + _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.TYPE = "iid_iso" + # List of angles for rotation in data augmentation during training + _C.INPUT.ROTATION_ANGLES = [0] + _C.TEST.AUG.ROTATION_ANGLES = () # Rotation TTA + + add_densepose_head_cse_config(cfg) + + +def add_hrnet_config(cfg: CN) -> None: + """ + Add config for HRNet backbone. + """ + _C = cfg + + # For HigherHRNet w32 + _C.MODEL.HRNET = CN() + _C.MODEL.HRNET.STEM_INPLANES = 64 + _C.MODEL.HRNET.STAGE2 = CN() + _C.MODEL.HRNET.STAGE2.NUM_MODULES = 1 + _C.MODEL.HRNET.STAGE2.NUM_BRANCHES = 2 + _C.MODEL.HRNET.STAGE2.BLOCK = "BASIC" + _C.MODEL.HRNET.STAGE2.NUM_BLOCKS = [4, 4] + _C.MODEL.HRNET.STAGE2.NUM_CHANNELS = [32, 64] + _C.MODEL.HRNET.STAGE2.FUSE_METHOD = "SUM" + _C.MODEL.HRNET.STAGE3 = CN() + _C.MODEL.HRNET.STAGE3.NUM_MODULES = 4 + _C.MODEL.HRNET.STAGE3.NUM_BRANCHES = 3 + _C.MODEL.HRNET.STAGE3.BLOCK = "BASIC" + _C.MODEL.HRNET.STAGE3.NUM_BLOCKS = [4, 4, 4] + _C.MODEL.HRNET.STAGE3.NUM_CHANNELS = [32, 64, 128] + _C.MODEL.HRNET.STAGE3.FUSE_METHOD = "SUM" + _C.MODEL.HRNET.STAGE4 = CN() + _C.MODEL.HRNET.STAGE4.NUM_MODULES = 3 + _C.MODEL.HRNET.STAGE4.NUM_BRANCHES = 4 + _C.MODEL.HRNET.STAGE4.BLOCK = "BASIC" + _C.MODEL.HRNET.STAGE4.NUM_BLOCKS = [4, 4, 4, 4] + _C.MODEL.HRNET.STAGE4.NUM_CHANNELS = [32, 64, 128, 256] + _C.MODEL.HRNET.STAGE4.FUSE_METHOD = "SUM" + + _C.MODEL.HRNET.HRFPN = CN() + _C.MODEL.HRNET.HRFPN.OUT_CHANNELS = 256 + + +def add_densepose_config(cfg: CN) -> None: + add_densepose_head_config(cfg) + add_hrnet_config(cfg) + add_bootstrap_config(cfg) + 
add_dataset_category_config(cfg) + add_evaluation_config(cfg) diff --git a/densepose/converters/__init__.py b/densepose/converters/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..930339e13f408ad46d0504fac557ef8cf0a57a56 --- /dev/null +++ b/densepose/converters/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +from .hflip import HFlipConverter +from .to_mask import ToMaskConverter +from .to_chart_result import ToChartResultConverter, ToChartResultConverterWithConfidences +from .segm_to_mask import ( + predictor_output_with_fine_and_coarse_segm_to_mask, + predictor_output_with_coarse_segm_to_mask, + resample_fine_and_coarse_segm_to_bbox, +) +from .chart_output_to_chart_result import ( + densepose_chart_predictor_output_to_result, + densepose_chart_predictor_output_to_result_with_confidences, +) +from .chart_output_hflip import densepose_chart_predictor_output_hflip diff --git a/densepose/converters/base.py b/densepose/converters/base.py new file mode 100644 index 0000000000000000000000000000000000000000..c9dbe56cecff6dbbc1a1fda5a89c5f917513dcd8 --- /dev/null +++ b/densepose/converters/base.py @@ -0,0 +1,93 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +from typing import Any, Tuple, Type +import torch + + +class BaseConverter: + """ + Converter base class to be reused by various converters. + Converter allows one to convert data from various source types to a particular + destination type. Each source type needs to register its converter. The + registration for each source type is valid for all descendants of that type. + """ + + @classmethod + def register(cls, from_type: Type, converter: Any = None): + """ + Registers a converter for the specified type. + Can be used as a decorator (if converter is None), or called as a method. 
+ + Args: + from_type (type): type to register the converter for; + all instances of this type will use the same converter + converter (callable): converter to be registered for the given + type; if None, this method is assumed to be a decorator for the converter + """ + + if converter is not None: + cls._do_register(from_type, converter) + + def wrapper(converter: Any) -> Any: + cls._do_register(from_type, converter) + return converter + + return wrapper + + @classmethod + def _do_register(cls, from_type: Type, converter: Any): + cls.registry[from_type] = converter # pyre-ignore[16] + + @classmethod + def _lookup_converter(cls, from_type: Type) -> Any: + """ + Perform recursive lookup for the given type + to find registered converter. If a converter was found for some base + class, it gets registered for this class to save on further lookups. + + Args: + from_type: type for which to find a converter + Return: + callable or None - registered converter or None + if no suitable entry was found in the registry + """ + if from_type in cls.registry: # pyre-ignore[16] + return cls.registry[from_type] + for base in from_type.__bases__: + converter = cls._lookup_converter(base) + if converter is not None: + cls._do_register(from_type, converter) + return converter + return None + + @classmethod + def convert(cls, instance: Any, *args, **kwargs): + """ + Convert an instance to the destination type using some registered + converter. Does recursive lookup for base classes, so there's no need + for explicit registration for derived classes. 
+ + Args: + instance: source instance to convert to the destination type + Return: + An instance of the destination type obtained from the source instance + Raises KeyError, if no suitable converter found + """ + instance_type = type(instance) + converter = cls._lookup_converter(instance_type) + if converter is None: + if cls.dst_type is None: # pyre-ignore[16] + output_type_str = "itself" + else: + output_type_str = cls.dst_type + raise KeyError(f"Could not find converter from {instance_type} to {output_type_str}") + return converter(instance, *args, **kwargs) + + +IntTupleBox = Tuple[int, int, int, int] + + +def make_int_box(box: torch.Tensor) -> IntTupleBox: + int_box = [0, 0, 0, 0] + int_box[0], int_box[1], int_box[2], int_box[3] = tuple(box.long().tolist()) + return int_box[0], int_box[1], int_box[2], int_box[3] diff --git a/densepose/converters/builtin.py b/densepose/converters/builtin.py new file mode 100644 index 0000000000000000000000000000000000000000..3bd48f8f7afc49cf38bf410f01bc673d446f37d7 --- /dev/null +++ b/densepose/converters/builtin.py @@ -0,0 +1,31 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +from ..structures import DensePoseChartPredictorOutput, DensePoseEmbeddingPredictorOutput +from . 
import ( + HFlipConverter, + ToChartResultConverter, + ToChartResultConverterWithConfidences, + ToMaskConverter, + densepose_chart_predictor_output_hflip, + densepose_chart_predictor_output_to_result, + densepose_chart_predictor_output_to_result_with_confidences, + predictor_output_with_coarse_segm_to_mask, + predictor_output_with_fine_and_coarse_segm_to_mask, +) + +ToMaskConverter.register( + DensePoseChartPredictorOutput, predictor_output_with_fine_and_coarse_segm_to_mask +) +ToMaskConverter.register( + DensePoseEmbeddingPredictorOutput, predictor_output_with_coarse_segm_to_mask +) + +ToChartResultConverter.register( + DensePoseChartPredictorOutput, densepose_chart_predictor_output_to_result +) + +ToChartResultConverterWithConfidences.register( + DensePoseChartPredictorOutput, densepose_chart_predictor_output_to_result_with_confidences +) + +HFlipConverter.register(DensePoseChartPredictorOutput, densepose_chart_predictor_output_hflip) diff --git a/densepose/converters/chart_output_hflip.py b/densepose/converters/chart_output_hflip.py new file mode 100644 index 0000000000000000000000000000000000000000..17d294841264c248cf7fa9e3d2d2b4efdbb9a5e8 --- /dev/null +++ b/densepose/converters/chart_output_hflip.py @@ -0,0 +1,71 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +from dataclasses import fields +import torch + +from densepose.structures import DensePoseChartPredictorOutput, DensePoseTransformData + + +def densepose_chart_predictor_output_hflip( + densepose_predictor_output: DensePoseChartPredictorOutput, + transform_data: DensePoseTransformData, +) -> DensePoseChartPredictorOutput: + """ + Change to take into account a Horizontal flip. 
+ """ + if len(densepose_predictor_output) > 0: + + PredictorOutput = type(densepose_predictor_output) + output_dict = {} + + for field in fields(densepose_predictor_output): + field_value = getattr(densepose_predictor_output, field.name) + # flip tensors + if isinstance(field_value, torch.Tensor): + setattr(densepose_predictor_output, field.name, torch.flip(field_value, [3])) + + densepose_predictor_output = _flip_iuv_semantics_tensor( + densepose_predictor_output, transform_data + ) + densepose_predictor_output = _flip_segm_semantics_tensor( + densepose_predictor_output, transform_data + ) + + for field in fields(densepose_predictor_output): + output_dict[field.name] = getattr(densepose_predictor_output, field.name) + + return PredictorOutput(**output_dict) + else: + return densepose_predictor_output + + +def _flip_iuv_semantics_tensor( + densepose_predictor_output: DensePoseChartPredictorOutput, + dp_transform_data: DensePoseTransformData, +) -> DensePoseChartPredictorOutput: + point_label_symmetries = dp_transform_data.point_label_symmetries + uv_symmetries = dp_transform_data.uv_symmetries + + N, C, H, W = densepose_predictor_output.u.shape + u_loc = (densepose_predictor_output.u[:, 1:, :, :].clamp(0, 1) * 255).long() + v_loc = (densepose_predictor_output.v[:, 1:, :, :].clamp(0, 1) * 255).long() + Iindex = torch.arange(C - 1, device=densepose_predictor_output.u.device)[ + None, :, None, None + ].expand(N, C - 1, H, W) + densepose_predictor_output.u[:, 1:, :, :] = uv_symmetries["U_transforms"][Iindex, v_loc, u_loc] + densepose_predictor_output.v[:, 1:, :, :] = uv_symmetries["V_transforms"][Iindex, v_loc, u_loc] + + for el in ["fine_segm", "u", "v"]: + densepose_predictor_output.__dict__[el] = densepose_predictor_output.__dict__[el][ + :, point_label_symmetries, :, : + ] + return densepose_predictor_output + + +def _flip_segm_semantics_tensor( + densepose_predictor_output: DensePoseChartPredictorOutput, dp_transform_data +): + if 
densepose_predictor_output.coarse_segm.shape[1] > 2: + densepose_predictor_output.coarse_segm = densepose_predictor_output.coarse_segm[ + :, dp_transform_data.mask_label_symmetries, :, : + ] + return densepose_predictor_output diff --git a/densepose/converters/chart_output_to_chart_result.py b/densepose/converters/chart_output_to_chart_result.py new file mode 100644 index 0000000000000000000000000000000000000000..4248f6c91b641a4ad1d00d0316ee82d701f9152f --- /dev/null +++ b/densepose/converters/chart_output_to_chart_result.py @@ -0,0 +1,188 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +from typing import Dict +import torch +from torch.nn import functional as F + +from detectron2.structures.boxes import Boxes, BoxMode + +from ..structures import ( + DensePoseChartPredictorOutput, + DensePoseChartResult, + DensePoseChartResultWithConfidences, +) +from . import resample_fine_and_coarse_segm_to_bbox +from .base import IntTupleBox, make_int_box + + +def resample_uv_tensors_to_bbox( + u: torch.Tensor, + v: torch.Tensor, + labels: torch.Tensor, + box_xywh_abs: IntTupleBox, +) -> torch.Tensor: + """ + Resamples U and V coordinate estimates for the given bounding box + + Args: + u (tensor [1, C, H, W] of float): U coordinates + v (tensor [1, C, H, W] of float): V coordinates + labels (tensor [H, W] of long): labels obtained by resampling segmentation + outputs for the given bounding box + box_xywh_abs (tuple of 4 int): bounding box that corresponds to predictor outputs + Return: + Resampled U and V coordinates - a tensor [2, H, W] of float + """ + x, y, w, h = box_xywh_abs + w = max(int(w), 1) + h = max(int(h), 1) + u_bbox = F.interpolate(u, (h, w), mode="bilinear", align_corners=False) + v_bbox = F.interpolate(v, (h, w), mode="bilinear", align_corners=False) + uv = torch.zeros([2, h, w], dtype=torch.float32, device=u.device) + for part_id in range(1, u_bbox.size(1)): + uv[0][labels == part_id] = u_bbox[0, part_id][labels == part_id] + uv[1][labels == part_id] = 
v_bbox[0, part_id][labels == part_id] + return uv + + +def resample_uv_to_bbox( + predictor_output: DensePoseChartPredictorOutput, + labels: torch.Tensor, + box_xywh_abs: IntTupleBox, +) -> torch.Tensor: + """ + Resamples U and V coordinate estimates for the given bounding box + + Args: + predictor_output (DensePoseChartPredictorOutput): DensePose predictor + output to be resampled + labels (tensor [H, W] of long): labels obtained by resampling segmentation + outputs for the given bounding box + box_xywh_abs (tuple of 4 int): bounding box that corresponds to predictor outputs + Return: + Resampled U and V coordinates - a tensor [2, H, W] of float + """ + return resample_uv_tensors_to_bbox( + predictor_output.u, + predictor_output.v, + labels, + box_xywh_abs, + ) + + +def densepose_chart_predictor_output_to_result( + predictor_output: DensePoseChartPredictorOutput, boxes: Boxes +) -> DensePoseChartResult: + """ + Convert densepose chart predictor outputs to results + + Args: + predictor_output (DensePoseChartPredictorOutput): DensePose predictor + output to be converted to results, must contain only 1 output + boxes (Boxes): bounding box that corresponds to the predictor output, + must contain only 1 bounding box + Return: + DensePose chart-based result (DensePoseChartResult) + """ + assert len(predictor_output) == 1 and len(boxes) == 1, ( + f"Predictor output to result conversion can operate only single outputs" + f", got {len(predictor_output)} predictor outputs and {len(boxes)} boxes" + ) + + boxes_xyxy_abs = boxes.tensor.clone() + boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + box_xywh = make_int_box(boxes_xywh_abs[0]) + + labels = resample_fine_and_coarse_segm_to_bbox(predictor_output, box_xywh).squeeze(0) + uv = resample_uv_to_bbox(predictor_output, labels, box_xywh) + return DensePoseChartResult(labels=labels, uv=uv) + + +def resample_confidences_to_bbox( + predictor_output: DensePoseChartPredictorOutput, + labels: 
torch.Tensor, + box_xywh_abs: IntTupleBox, +) -> Dict[str, torch.Tensor]: + """ + Resamples confidences for the given bounding box + + Args: + predictor_output (DensePoseChartPredictorOutput): DensePose predictor + output to be resampled + labels (tensor [H, W] of long): labels obtained by resampling segmentation + outputs for the given bounding box + box_xywh_abs (tuple of 4 int): bounding box that corresponds to predictor outputs + Return: + Resampled confidences - a dict of [H, W] tensors of float + """ + + x, y, w, h = box_xywh_abs + w = max(int(w), 1) + h = max(int(h), 1) + + confidence_names = [ + "sigma_1", + "sigma_2", + "kappa_u", + "kappa_v", + "fine_segm_confidence", + "coarse_segm_confidence", + ] + confidence_results = {key: None for key in confidence_names} + confidence_names = [ + key for key in confidence_names if getattr(predictor_output, key) is not None + ] + confidence_base = torch.zeros([h, w], dtype=torch.float32, device=predictor_output.u.device) + + # assign data from channels that correspond to the labels + for key in confidence_names: + resampled_confidence = F.interpolate( + getattr(predictor_output, key), + (h, w), + mode="bilinear", + align_corners=False, + ) + result = confidence_base.clone() + for part_id in range(1, predictor_output.u.size(1)): + if resampled_confidence.size(1) != predictor_output.u.size(1): + # confidence is not part-based, don't try to fill it part by part + continue + result[labels == part_id] = resampled_confidence[0, part_id][labels == part_id] + + if resampled_confidence.size(1) != predictor_output.u.size(1): + # confidence is not part-based, fill the data with the first channel + # (targeted for segmentation confidences that have only 1 channel) + result = resampled_confidence[0, 0] + + confidence_results[key] = result + + return confidence_results # pyre-ignore[7] + + +def densepose_chart_predictor_output_to_result_with_confidences( + predictor_output: DensePoseChartPredictorOutput, boxes: Boxes +) -> 
DensePoseChartResultWithConfidences: + """ + Convert densepose chart predictor outputs to results + + Args: + predictor_output (DensePoseChartPredictorOutput): DensePose predictor + output with confidences to be converted to results, must contain only 1 output + boxes (Boxes): bounding box that corresponds to the predictor output, + must contain only 1 bounding box + Return: + DensePose chart-based result with confidences (DensePoseChartResultWithConfidences) + """ + assert len(predictor_output) == 1 and len(boxes) == 1, ( + f"Predictor output to result conversion can operate only single outputs" + f", got {len(predictor_output)} predictor outputs and {len(boxes)} boxes" + ) + + boxes_xyxy_abs = boxes.tensor.clone() + boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + box_xywh = make_int_box(boxes_xywh_abs[0]) + + labels = resample_fine_and_coarse_segm_to_bbox(predictor_output, box_xywh).squeeze(0) + uv = resample_uv_to_bbox(predictor_output, labels, box_xywh) + confidences = resample_confidences_to_bbox(predictor_output, labels, box_xywh) + return DensePoseChartResultWithConfidences(labels=labels, uv=uv, **confidences) diff --git a/densepose/converters/hflip.py b/densepose/converters/hflip.py new file mode 100644 index 0000000000000000000000000000000000000000..6df144280b2b84308acbb607e3313d0992faa68c --- /dev/null +++ b/densepose/converters/hflip.py @@ -0,0 +1,34 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +from typing import Any + +from .base import BaseConverter + + +class HFlipConverter(BaseConverter): + """ + Applies a horizontal flip to various DensePose predictor outputs. + Each DensePose predictor output type has to register its conversion strategy. + """ + + registry = {} + dst_type = None + + @classmethod + # pyre-fixme[14]: `convert` overrides method defined in `BaseConverter` + # inconsistently. 
+ def convert(cls, predictor_outputs: Any, transform_data: Any, *args, **kwargs): + """ + Performs a horizontal flip on DensePose predictor outputs. + Does recursive lookup for base classes, so there's no need + for explicit registration for derived classes. + + Args: + predictor_outputs: DensePose predictor output to be flipped + transform_data: Anything useful for the flip + Return: + An instance of the same type as predictor_outputs + """ + return super(HFlipConverter, cls).convert( + predictor_outputs, transform_data, *args, **kwargs + ) diff --git a/densepose/converters/segm_to_mask.py b/densepose/converters/segm_to_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..6433d5dec75c3d6141252af144b61d8999077bb7 --- /dev/null +++ b/densepose/converters/segm_to_mask.py @@ -0,0 +1,150 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +from typing import Any +import torch +from torch.nn import functional as F + +from detectron2.structures import BitMasks, Boxes, BoxMode + +from .base import IntTupleBox, make_int_box +from .to_mask import ImageSizeType + + +def resample_coarse_segm_tensor_to_bbox(coarse_segm: torch.Tensor, box_xywh_abs: IntTupleBox): + """ + Resample coarse segmentation tensor to the given + bounding box and derive labels for each pixel of the bounding box + + Args: + coarse_segm: float tensor of shape [1, K, Hout, Wout] + box_xywh_abs (tuple of 4 int): bounding box given by its upper-left + corner coordinates, width (W) and height (H) + Return: + Labels for each pixel of the bounding box, a long tensor of size [1, H, W] + """ + x, y, w, h = box_xywh_abs + w = max(int(w), 1) + h = max(int(h), 1) + labels = F.interpolate(coarse_segm, (h, w), mode="bilinear", align_corners=False).argmax(dim=1) + return labels + + +def resample_fine_and_coarse_segm_tensors_to_bbox( + fine_segm: torch.Tensor, coarse_segm: torch.Tensor, box_xywh_abs: IntTupleBox +): + """ + Resample fine and coarse segmentation tensors to the 
given + bounding box and derive labels for each pixel of the bounding box + + Args: + fine_segm: float tensor of shape [1, C, Hout, Wout] + coarse_segm: float tensor of shape [1, K, Hout, Wout] + box_xywh_abs (tuple of 4 int): bounding box given by its upper-left + corner coordinates, width (W) and height (H) + Return: + Labels for each pixel of the bounding box, a long tensor of size [1, H, W] + """ + x, y, w, h = box_xywh_abs + w = max(int(w), 1) + h = max(int(h), 1) + # coarse segmentation + coarse_segm_bbox = F.interpolate( + coarse_segm, + (h, w), + mode="bilinear", + align_corners=False, + ).argmax(dim=1) + # combined coarse and fine segmentation + labels = ( + F.interpolate(fine_segm, (h, w), mode="bilinear", align_corners=False).argmax(dim=1) + * (coarse_segm_bbox > 0).long() + ) + return labels + + +def resample_fine_and_coarse_segm_to_bbox(predictor_output: Any, box_xywh_abs: IntTupleBox): + """ + Resample fine and coarse segmentation outputs from a predictor to the given + bounding box and derive labels for each pixel of the bounding box + + Args: + predictor_output: DensePose predictor output that contains segmentation + results to be resampled + box_xywh_abs (tuple of 4 int): bounding box given by its upper-left + corner coordinates, width (W) and height (H) + Return: + Labels for each pixel of the bounding box, a long tensor of size [1, H, W] + """ + return resample_fine_and_coarse_segm_tensors_to_bbox( + predictor_output.fine_segm, + predictor_output.coarse_segm, + box_xywh_abs, + ) + + +def predictor_output_with_coarse_segm_to_mask( + predictor_output: Any, boxes: Boxes, image_size_hw: ImageSizeType +) -> BitMasks: + """ + Convert predictor output with coarse segmentation to a mask. 
+ Assumes that predictor output has the following attributes: + - coarse_segm (tensor of size [N, D, H, W]): coarse segmentation + unnormalized scores for N instances; D is the number of coarse + segmentation labels, H and W is the resolution of the estimate + + Args: + predictor_output: DensePose predictor output to be converted to mask + boxes (Boxes): bounding boxes that correspond to the DensePose + predictor outputs + image_size_hw (tuple [int, int]): image height Himg and width Wimg + Return: + BitMasks that contain a bool tensor of size [N, Himg, Wimg] with + a mask of the size of the image for each instance + """ + H, W = image_size_hw + boxes_xyxy_abs = boxes.tensor.clone() + boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + N = len(boxes_xywh_abs) + masks = torch.zeros((N, H, W), dtype=torch.bool, device=boxes.tensor.device) + for i in range(len(boxes_xywh_abs)): + box_xywh = make_int_box(boxes_xywh_abs[i]) + box_mask = resample_coarse_segm_tensor_to_bbox(predictor_output[i].coarse_segm, box_xywh) + x, y, w, h = box_xywh + masks[i, y : y + h, x : x + w] = box_mask + + return BitMasks(masks) + + +def predictor_output_with_fine_and_coarse_segm_to_mask( + predictor_output: Any, boxes: Boxes, image_size_hw: ImageSizeType +) -> BitMasks: + """ + Convert predictor output with coarse and fine segmentation to a mask. 
class ToChartResultConverter(BaseConverter):
    """
    Converter from DensePose predictor outputs to chart-based DensePose
    results (`DensePoseChartResult`). A conversion strategy has to be
    registered for every DensePose predictor output type.
    """

    # converters registered for this destination type
    registry = {}
    dst_type = DensePoseChartResult

    @classmethod
    # pyre-fixme[14]: `convert` overrides method defined in `BaseConverter`
    #  inconsistently.
    def convert(cls, predictor_outputs: Any, boxes: Boxes, *args, **kwargs) -> DensePoseChartResult:
        """
        Convert DensePose predictor outputs to `DensePoseChartResult` by
        delegating to `BaseConverter.convert`. The base class is documented
        to look up registered converters recursively through base classes,
        so derived output types need no explicit registration.

        Args:
            predictor_outputs: DensePose predictor output to be converted
                to a chart result
            boxes (Boxes): bounding boxes that correspond to the DensePose
                predictor outputs
        Return:
            An instance of `DensePoseChartResult`. If no suitable converter
            was found, KeyError is expected to be raised by the base class.
        """
        base_converter = super()
        return base_converter.convert(predictor_outputs, boxes, *args, **kwargs)
class ToMaskConverter(BaseConverter):
    """
    Converter from DensePose predictor outputs to masks in bit mask format
    (see `BitMasks`). A conversion strategy has to be registered for every
    DensePose predictor output type.
    """

    # converters registered for this destination type
    registry = {}
    dst_type = BitMasks

    @classmethod
    # pyre-fixme[14]: `convert` overrides method defined in `BaseConverter`
    #  inconsistently.
    def convert(
        cls,
        densepose_predictor_outputs: Any,
        boxes: Boxes,
        image_size_hw: ImageSizeType,
        *args,
        **kwargs
    ) -> BitMasks:
        """
        Convert DensePose predictor outputs to `BitMasks` by delegating to
        `BaseConverter.convert`. The base class is documented to look up
        registered converters recursively through base classes, so derived
        output types need no explicit registration.

        Args:
            densepose_predictor_outputs: DensePose predictor output to be
                converted to BitMasks
            boxes (Boxes): bounding boxes that correspond to the DensePose
                predictor outputs
            image_size_hw (tuple [int, int]): image height and width
        Return:
            An instance of `BitMasks`. If no suitable converter was found,
            KeyError is expected to be raised by the base class.
        """
        base_converter = super()
        return base_converter.convert(
            densepose_predictor_outputs, boxes, image_size_hw, *args, **kwargs
        )
def _compute_num_images_per_worker(cfg: CfgNode) -> int:
    """
    Split the global batch size evenly between all workers.

    Args:
        cfg (CfgNode): config that provides `SOLVER.IMS_PER_BATCH`
    Return:
        Number of images each worker processes per batch, i.e.
        `SOLVER.IMS_PER_BATCH` divided by the world size
    """
    world_size = get_world_size()
    batch_size = cfg.SOLVER.IMS_PER_BATCH
    per_worker, leftover = divmod(batch_size, world_size)
    # the global batch has to be split without a remainder ...
    assert (
        leftover == 0
    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
        batch_size, world_size
    )
    # ... and every worker has to receive at least one image
    assert (
        batch_size >= world_size
    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
        batch_size, world_size
    )
    return per_worker
def _maybe_create_general_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
    """
    Create the generic predicate that decides whether a dataset instance
    (an image record) is kept, based on the presence of usable annotations.

    Args:
        cfg (CfgNode): config that provides `DATALOADER.FILTER_EMPTY_ANNOTATIONS`
    Return:
        None if `DATALOADER.FILTER_EMPTY_ANNOTATIONS` is disabled; otherwise a
        predicate that returns True for instances that have an "annotations"
        entry with at least one non-crowd annotation
    """

    def has_annotations(instance: Instance) -> bool:
        return "annotations" in instance

    def has_only_crowd_annotations(instance: Instance) -> bool:
        # Detectron2 dataset dicts store the crowd flag under "iscrowd".
        # The previously used "is_crowd" key is still honoured as a fallback,
        # so data that relied on it keeps working. Looking only at "is_crowd"
        # made every annotation appear as non-crowd, thus crowd-only images
        # were never filtered out.
        # An instance with an empty annotation list counts as "only crowd".
        return all(
            ann.get("iscrowd", ann.get("is_crowd", 0)) != 0 for ann in instance["annotations"]
        )

    def general_keep_instance_predicate(instance: Instance) -> bool:
        return has_annotations(instance) and not has_only_crowd_annotations(instance)

    if not cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS:
        return None
    return general_keep_instance_predicate
def _maybe_create_densepose_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
    """
    Create the predicate that keeps instances with DensePose annotations.

    Args:
        cfg (CfgNode): config that provides `MODEL.DENSEPOSE_ON` and
            `MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS`
    Return:
        None if `MODEL.DENSEPOSE_ON` is disabled; otherwise a predicate that
        returns True if at least one annotation of the instance contains all
        IUV keys, or all CSE keys, or (when coarse segmentation is trained
        by masks) a "segmentation" entry
    """
    if not cfg.MODEL.DENSEPOSE_ON:
        return None

    # read once, at predicate creation time
    masks_are_sufficient = cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS

    def _is_usable_annotation(ann) -> bool:
        if all(key in ann for key in DENSEPOSE_IUV_KEYS_WITHOUT_MASK):
            return True
        if all(key in ann for key in DENSEPOSE_CSE_KEYS_WITHOUT_MASK):
            return True
        # a plain mask is enough only if coarse segmentation is trained by masks
        return masks_are_sufficient and "segmentation" in ann

    def has_densepose_annotations(instance: Instance) -> bool:
        return any(_is_usable_annotation(ann) for ann in instance["annotations"])

    return has_densepose_annotations
def _maybe_filter_and_map_categories(
    dataset_name: str, dataset_dicts: List[Instance]
) -> List[Instance]:
    """
    Drop annotations of unknown categories and remap the category IDs of the
    remaining ones to contiguous IDs.

    The mapping is taken from the `thing_dataset_id_to_contiguous_id`
    metadata of the dataset. Annotations and dataset dicts are modified
    in place; every dataset dict is kept, even if all of its annotations
    were dropped.

    Args:
        dataset_name (str): name of the dataset the dicts belong to
        dataset_dicts (List[Instance]): dataset dicts with "annotations" entries
    Return:
        A new list that holds the same (modified) dataset dicts
    """
    contiguous_ids = MetadataCatalog.get(dataset_name).thing_dataset_id_to_contiguous_id
    processed_dicts = []
    for record in dataset_dicts:
        kept_annotations = []
        for annotation in record["annotations"]:
            original_id = annotation["category_id"]
            if original_id in contiguous_ids:
                annotation["category_id"] = contiguous_ids[original_id]
                kept_annotations.append(annotation)
        record["annotations"] = kept_annotations
        processed_dicts.append(record)
    return processed_dicts
def _add_category_info_to_bootstrapping_metadata(dataset_name: str, dataset_cfg: CfgNode) -> None:
    """
    Store category-related bootstrapping options in the dataset metadata.

    Sets `category_to_class_mapping`, `categories` and
    `max_count_per_category` on the metadata of `dataset_name` and logs
    the resulting category to class mapping.

    Args:
        dataset_name (str): name of the bootstrap dataset
        dataset_cfg (CfgNode): config that provides `CATEGORIES`,
            `MAX_COUNT_PER_CATEGORY` and the data consumed by
            `get_category_to_class_mapping`
    """
    metadata = MetadataCatalog.get(dataset_name)
    class_mapping = get_category_to_class_mapping(dataset_cfg)
    metadata.category_to_class_mapping = class_mapping
    metadata.categories = dataset_cfg.CATEGORIES
    metadata.max_count_per_category = dataset_cfg.MAX_COUNT_PER_CATEGORY
    # report the mapping as it is stored in the metadata
    message = "Category to class mapping for dataset {}: {}".format(
        dataset_name, metadata.category_to_class_mapping
    )
    logging.getLogger(__name__).info(message)
def combine_detection_dataset_dicts(
    dataset_names: Collection[str],
    keep_instance_predicate: Optional[InstancePredicate] = None,
    proposal_files: Optional[Collection[str]] = None,
) -> List[Instance]:
    """
    Load and prepare dataset dicts for training / testing

    Args:
        dataset_names (Collection[str]): a list of dataset names
        keep_instance_predicate (Callable: Dict[str, Any] -> bool): predicate
            applied to instance dicts which defines whether to keep the instance
        proposal_files (Collection[str]): if given, a list of object proposal files
            that match each dataset in `dataset_names`.
    Return:
        A single flat list with the dataset dicts of all the datasets, in the
        order of `dataset_names`; category IDs of the annotations are mapped
        to contiguous IDs and instances rejected by `keep_instance_predicate`
        are left out
    """
    assert len(dataset_names)
    if proposal_files is None:
        proposal_files = [None] * len(dataset_names)
    assert len(dataset_names) == len(proposal_files)
    # load datasets and metadata
    dataset_name_to_dicts = {}
    for dataset_name in dataset_names:
        dataset_dicts = DatasetCatalog.get(dataset_name)
        # Check the dataset that has just been loaded. Checking the
        # name -> dicts mapping instead would never fail, since the mapping
        # has at least one entry at this point.
        assert len(dataset_dicts), f"Dataset '{dataset_name}' is empty!"
        dataset_name_to_dicts[dataset_name] = dataset_dicts
    # merge categories, requires category metadata to be loaded
    # mapped category ID -> list of _DatasetCategory entries merged into it
    merged_categories = _merge_categories(dataset_names)
    _warn_if_merged_different_categories(merged_categories)
    merged_category_names = [
        merged_categories[cat_id][0].mapped_name for cat_id in sorted(merged_categories)
    ]
    # map to contiguous category IDs
    _add_category_id_to_contiguous_id_maps_to_metadata(merged_categories)
    # load annotations and dataset metadata
    for dataset_name, proposal_file in zip(dataset_names, proposal_files):
        dataset_dicts = dataset_name_to_dicts[dataset_name]
        if proposal_file is not None:
            dataset_dicts = load_proposals_into_dataset(dataset_dicts, proposal_file)
        dataset_dicts = _maybe_filter_and_map_categories(dataset_name, dataset_dicts)
        print_instances_class_histogram(dataset_dicts, merged_category_names)
        dataset_name_to_dicts[dataset_name] = dataset_dicts

    all_datasets_dicts = itertools.chain.from_iterable(dataset_name_to_dicts.values())
    if keep_instance_predicate is not None:
        return [d for d in all_datasets_dicts if keep_instance_predicate(d)]
    return list(all_datasets_dicts)
def build_detection_test_loader(cfg, dataset_name, mapper=None):
    """
    Test-time counterpart of `build_detection_train_loader`: the dataset is
    selected through the `dataset_name` argument rather than through the
    names stored in the config. The original documentation states that
    batch size 1 is used.

    Args:
        cfg: a detectron2 CfgNode
        dataset_name (str): a name of the dataset that's available in the DatasetCatalog
        mapper (callable): a callable which takes a sample (dict) from dataset
           and returns the format to be consumed by the model.
           By default it will be `DatasetMapper(cfg, False)`.

    Returns:
        DataLoader: a torch DataLoader, that loads the given detection
        dataset, with test-time transformation and batching.
    """
    # make sure category-related metadata is in place before loading the data
    _add_category_whitelists_to_metadata(cfg)
    _add_category_maps_to_metadata(cfg)
    _maybe_add_class_to_mesh_name_map_to_metadata([dataset_name], cfg)

    keep_predicate = _get_test_keep_instance_predicate(cfg)
    if cfg.MODEL.LOAD_PROPOSALS:
        # the proposal file is located at the same position as the dataset
        # name within DATASETS.TEST
        proposal_files = [
            cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(dataset_name)]
        ]
    else:
        proposal_files = None
    dataset_dicts = combine_detection_dataset_dicts(
        [dataset_name],
        keep_instance_predicate=keep_predicate,
        proposal_files=proposal_files,
    )

    # a sequential sampler is passed only for non-distributed inference;
    # otherwise the choice of sampler is left to detectron2
    if cfg.DENSEPOSE_EVALUATION.DISTRIBUTED_INFERENCE:
        sampler = None
    else:
        sampler = torch.utils.data.SequentialSampler(dataset_dicts)

    if mapper is None:
        mapper = DatasetMapper(cfg, False)
    return d2_build_detection_test_loader(
        dataset_dicts, mapper=mapper, num_workers=cfg.DATALOADER.NUM_WORKERS, sampler=sampler
    )
def build_data_sampler(cfg: CfgNode, sampler_cfg: CfgNode, embedder: Optional[torch.nn.Module]):
    """
    Build a sampler that transforms DensePose predictions into ground truth data.

    Every supported sampler type yields a `PredictionToGroundTruthSampler`
    with two registered transformations:
     - "pred_densepose" -> "gt_densepose", performed by a sampler that is
       specific to `sampler_cfg.TYPE`
     - "pred_densepose" -> "gt_masks", performed by `MaskFromDensePoseSampler`

    Args:
        cfg (CfgNode): full config, passed on to the CSE samplers
        sampler_cfg (CfgNode): sampler config; `TYPE` selects the sampler,
            `COUNT_PER_CLASS` is used by all the samplers and
            `USE_GROUND_TRUTH_CATEGORIES` by the CSE samplers
        embedder (Optional[torch.nn.Module]): embedder, mandatory for
            the CSE sampler types
    Return:
        The configured `PredictionToGroundTruthSampler`
    Raises:
        ValueError: if `sampler_cfg.TYPE` is not a known sampler type
    """
    # chart-based confidence samplers differ only in the confidence channel
    chart_confidence_channels = {
        "densepose_UV_confidence": "sigma_2",
        "densepose_fine_segm_confidence": "fine_segm_confidence",
        "densepose_coarse_segm_confidence": "coarse_segm_confidence",
    }
    sampler_type = sampler_cfg.TYPE
    if sampler_type == "densepose_uniform":
        densepose_sampler = DensePoseUniformSampler(count_per_class=sampler_cfg.COUNT_PER_CLASS)
    elif sampler_type in chart_confidence_channels:
        densepose_sampler = DensePoseConfidenceBasedSampler(
            confidence_channel=chart_confidence_channels[sampler_type],
            count_per_class=sampler_cfg.COUNT_PER_CLASS,
            search_proportion=0.5,
        )
    elif sampler_type == "densepose_cse_uniform":
        assert embedder is not None
        densepose_sampler = DensePoseCSEUniformSampler(
            cfg=cfg,
            use_gt_categories=sampler_cfg.USE_GROUND_TRUTH_CATEGORIES,
            embedder=embedder,
            count_per_class=sampler_cfg.COUNT_PER_CLASS,
        )
    elif sampler_type == "densepose_cse_coarse_segm_confidence":
        assert embedder is not None
        densepose_sampler = DensePoseCSEConfidenceBasedSampler(
            cfg=cfg,
            use_gt_categories=sampler_cfg.USE_GROUND_TRUTH_CATEGORIES,
            embedder=embedder,
            confidence_channel="coarse_segm_confidence",
            count_per_class=sampler_cfg.COUNT_PER_CLASS,
            search_proportion=0.5,
        )
    else:
        raise ValueError(f"Unknown data sampler type {sampler_cfg.TYPE}")

    data_sampler = PredictionToGroundTruthSampler()
    # transform densepose pred -> gt
    data_sampler.register_sampler("pred_densepose", "gt_densepose", densepose_sampler)
    data_sampler.register_sampler("pred_densepose", "gt_masks", MaskFromDensePoseSampler())
    return data_sampler
def build_inference_based_loaders(
    cfg: CfgNode, model: torch.nn.Module
) -> Tuple[List[InferenceBasedLoader], List[float]]:
    """
    Builds one inference-based loader per entry of `cfg.BOOTSTRAP_DATASETS`.

    Args:
        cfg: CfgNode
            Full config; `BOOTSTRAP_DATASETS` holds the dataset specs, each of
            which is merged on top of the default bootstrap dataset config
        model: torch.nn.Module
            Model used for inference; the embedder (if any) is moved to
            the device of this model
    Returns:
        Tuple[List[InferenceBasedLoader], List[float]]
            The loaders and, in the same order, the `RATIO` value of the
            corresponding dataset config
    """
    loaders = []
    ratios = []
    embedder = build_densepose_embedder(cfg)
    # NOTE(review): the loader / sampler builders accept the embedder as
    # Optional and only the CSE samplers assert that it is set, so presumably
    # no embedder is built for non-CSE configs -- confirm. Guarding here avoids
    # calling `.to` on None in that case.
    if embedder is not None:
        embedder = embedder.to(device=model.device)  # pyre-ignore[16]
    for dataset_spec in cfg.BOOTSTRAP_DATASETS:
        dataset_cfg = get_bootstrap_dataset_config().clone()
        dataset_cfg.merge_from_other_cfg(CfgNode(dataset_spec))
        loader = build_inference_based_loader(cfg, dataset_cfg, model, embedder)
        loaders.append(loader)
        ratios.append(dataset_cfg.RATIO)
    return loaders, ratios
Loader = Iterable[Any]


def _pooled_next(iterator: Iterator[Any], pool: Deque[Any]):
    """
    Returns the next single item of a loader, refilling `pool` with the next
    batch of `iterator` when the pool has run dry.

    Args:
        iterator: Iterator[Any]
            Iterator over the batches (iterables of items) of one loader
        pool: Deque[Any]
            Buffer with the not yet consumed items of that loader
    Returns:
        The oldest buffered item
    Raises:
        StopIteration, if the pool is empty and the iterator is exhausted
    """
    if not pool:
        pool.extend(next(iterator))
    return pool.popleft()


class CombinedDataLoader:
    """
    Combines data loaders using the provided sampling ratios
    """

    # number of batches for which loader indices are drawn at once
    BATCH_COUNT = 100

    def __init__(self, loaders: Collection[Loader], batch_size: int, ratios: Sequence[float]):
        """
        Args:
            loaders: Collection[Loader]
                Loaders to combine, each one yields batches of items
            batch_size: int
                Number of items in every combined batch
            ratios: Sequence[float]
                Relative sampling weights, one per loader
        """
        self.loaders = loaders
        self.batch_size = batch_size
        self.ratios = ratios

    def __iter__(self) -> Iterator[List[Any]]:
        """
        Yields batches of `batch_size` items; the source loader of every item
        is drawn at random according to `ratios`. Iteration stops as soon as
        a loader that is asked for an item is exhausted; the incomplete
        batch is dropped.
        """
        iters = [iter(loader) for loader in self.loaders]
        indices = []
        # One independent buffer per loader. `[deque()] * n` would put the
        # same deque into every slot, so items fetched from one loader would
        # be handed out for the indices of the other loaders.
        pool = [deque() for _ in iters]
        # infinite iterator, as in D2
        while True:
            if not indices:
                # just a buffer of indices, its size doesn't matter
                # as long as it's a multiple of batch_size
                k = self.batch_size * self.BATCH_COUNT
                indices = random.choices(range(len(self.loaders)), self.ratios, k=k)
            try:
                batch = [_pooled_next(iters[i], pool[i]) for i in indices[: self.batch_size]]
            except StopIteration:
                break
            indices = indices[self.batch_size :]
            yield batch
def build_augmentation(cfg, is_train):
    """
    Builds the list of augmentations: the ones produced by
    `utils.build_augmentation`, followed in training mode by a random
    rotation with an angle chosen from `cfg.INPUT.ROTATION_ANGLES`.

    Args:
        cfg: config, forwarded to `utils.build_augmentation`
        is_train: whether the augmentations are built for training
    Returns:
        list of augmentations
    """
    log = logging.getLogger(__name__)
    augmentations = utils.build_augmentation(cfg, is_train)
    if not is_train:
        return augmentations
    # the rotation is the only DensePose-specific addition
    rotation = T.RandomRotation(cfg.INPUT.ROTATION_ANGLES, expand=False, sample_style="choice")
    augmentations.append(rotation)
    log.info("DensePose-specific augmentation used in training: " + str(rotation))
    return augmentations
is the same for + # all the datasets. Otherwise one would have to pass DB ID with + # each entry to select proper transformation data. For now, since + # all DensePose annotated data uses the same data semantics, we + # omit this check. + densepose_transform_data_fpath = PathManager.get_local_path(densepose_transform_srcs[0]) + self.densepose_transform_data = DensePoseTransformData.load( + densepose_transform_data_fpath + ) + + self.is_train = is_train + + def __call__(self, dataset_dict): + """ + Args: + dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. + + Returns: + dict: a format that builtin models in detectron2 accept + """ + dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below + image = utils.read_image(dataset_dict["file_name"], format=self.img_format) + utils.check_image_size(dataset_dict, image) + + image, transforms = T.apply_transform_gens(self.augmentation, image) + image_shape = image.shape[:2] # h, w + dataset_dict["image"] = torch.as_tensor(image.transpose(2, 0, 1).astype("float32")) + + if not self.is_train: + dataset_dict.pop("annotations", None) + return dataset_dict + + for anno in dataset_dict["annotations"]: + if not self.mask_on: + anno.pop("segmentation", None) + if not self.keypoint_on: + anno.pop("keypoints", None) + + # USER: Implement additional transformations if you have other types of data + # USER: Don't call transpose_densepose if you don't need + annos = [ + self._transform_densepose( + utils.transform_instance_annotations( + obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices + ), + transforms, + ) + for obj in dataset_dict.pop("annotations") + if obj.get("iscrowd", 0) == 0 + ] + + if self.mask_on: + self._add_densepose_masks_as_segmentation(annos, image_shape) + + instances = utils.annotations_to_instances(annos, image_shape, mask_format="bitmask") + densepose_annotations = [obj.get("densepose") for obj in annos] + if densepose_annotations and not 
all(v is None for v in densepose_annotations): + instances.gt_densepose = DensePoseList( + densepose_annotations, instances.gt_boxes, image_shape + ) + + dataset_dict["instances"] = instances[instances.gt_boxes.nonempty()] + return dataset_dict + + def _transform_densepose(self, annotation, transforms): + if not self.densepose_on: + return annotation + + # Handle densepose annotations + is_valid, reason_not_valid = DensePoseDataRelative.validate_annotation(annotation) + if is_valid: + densepose_data = DensePoseDataRelative(annotation, cleanup=True) + densepose_data.apply_transform(transforms, self.densepose_transform_data) + annotation["densepose"] = densepose_data + else: + # logger = logging.getLogger(__name__) + # logger.debug("Could not load DensePose annotation: {}".format(reason_not_valid)) + DensePoseDataRelative.cleanup_annotation(annotation) + # NOTE: annotations for certain instances may be unavailable. + # 'None' is accepted by the DensePostList data structure. + annotation["densepose"] = None + return annotation + + def _add_densepose_masks_as_segmentation( + self, annotations: List[Dict[str, Any]], image_shape_hw: Tuple[int, int] + ): + for obj in annotations: + if ("densepose" not in obj) or ("segmentation" in obj): + continue + # DP segmentation: torch.Tensor [S, S] of float32, S=256 + segm_dp = torch.zeros_like(obj["densepose"].segm) + segm_dp[obj["densepose"].segm > 0] = 1 + segm_h, segm_w = segm_dp.shape + bbox_segm_dp = torch.tensor((0, 0, segm_h - 1, segm_w - 1), dtype=torch.float32) + # image bbox + x0, y0, x1, y1 = ( + v.item() for v in BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) + ) + segm_aligned = ( + ROIAlign((y1 - y0, x1 - x0), 1.0, 0, aligned=True) + .forward(segm_dp.view(1, 1, *segm_dp.shape), bbox_segm_dp) + .squeeze() + ) + image_mask = torch.zeros(*image_shape_hw, dtype=torch.float32) + image_mask[y0:y1, x0:x1] = segm_aligned + # segmentation for BitMask: np.array [H, W] of bool + obj["segmentation"] = image_mask 
>= 0.5 diff --git a/densepose/data/datasets/__init__.py b/densepose/data/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..260ccb9c43e5aa2d0f1fd28cfcbdd4f31913d16a --- /dev/null +++ b/densepose/data/datasets/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +from . import builtin # ensure the builtin datasets are registered + +__all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] diff --git a/densepose/data/datasets/builtin.py b/densepose/data/datasets/builtin.py new file mode 100644 index 0000000000000000000000000000000000000000..7572cd6abc550fdce9d1fd079a7af4870de303bb --- /dev/null +++ b/densepose/data/datasets/builtin.py @@ -0,0 +1,16 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +from .chimpnsee import register_dataset as register_chimpnsee_dataset +from .coco import BASE_DATASETS as BASE_COCO_DATASETS +from .coco import DATASETS as COCO_DATASETS +from .coco import register_datasets as register_coco_datasets +from .lvis import DATASETS as LVIS_DATASETS +from .lvis import register_datasets as register_lvis_datasets + +DEFAULT_DATASETS_ROOT = "datasets" + + +register_coco_datasets(COCO_DATASETS, DEFAULT_DATASETS_ROOT) +register_coco_datasets(BASE_COCO_DATASETS, DEFAULT_DATASETS_ROOT) +register_lvis_datasets(LVIS_DATASETS, DEFAULT_DATASETS_ROOT) + +register_chimpnsee_dataset(DEFAULT_DATASETS_ROOT) # pyre-ignore[19] diff --git a/densepose/data/datasets/chimpnsee.py b/densepose/data/datasets/chimpnsee.py new file mode 100644 index 0000000000000000000000000000000000000000..61e0b506dc4ed6ad78c9c4ce4677415a27f5f6cd --- /dev/null +++ b/densepose/data/datasets/chimpnsee.py @@ -0,0 +1,29 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
def register_dataset(datasets_root: Optional[str] = None) -> None:
    """
    Registers the Chimp&See video list dataset in the dataset and
    metadata catalogs

    Args:
        datasets_root: Optional[str]
            Datasets root folder (default: None)
    """
    base_path = maybe_prepend_base_path(datasets_root, "chimpnsee/cdna.eva.mpg.de")
    list_path = maybe_prepend_base_path(
        datasets_root, "chimpnsee/cdna.eva.mpg.de/video_list.txt"
    )

    def empty_load_callback():
        # the catalog entry is only a placeholder; the dataset is described
        # by the metadata set below
        return None

    DatasetCatalog.register(CHIMPNSEE_DATASET_NAME, empty_load_callback)
    metadata = MetadataCatalog.get(CHIMPNSEE_DATASET_NAME)
    metadata.set(
        dataset_type=DatasetType.VIDEO_LIST,
        video_list_fpath=list_path,
        video_base_path=base_path,
        category="chimpanzee",
    )
@dataclass
class CocoDatasetInfo:
    """
    Description of a dataset stored in COCO format: the name to register it
    under and the locations of its images and annotations. `register_dataset`
    combines both locations with the datasets root.
    """

    # name under which the dataset is registered in the catalogs
    name: str
    # folder that contains the images
    images_root: str
    # path to the JSON file with the annotations
    annotations_fpath: str
images_root="coco/val2014", + annotations_fpath="coco_cse/densepose_minival2014_100_cse.json", + ), + CocoDatasetInfo( + name="densepose_coco_2014_valminusminival_cse", + images_root="coco/val2014", + annotations_fpath="coco_cse/densepose_valminusminival2014_cse.json", + ), + CocoDatasetInfo( + name="densepose_chimps", + images_root="densepose_chimps/images", + annotations_fpath="densepose_chimps/densepose_chimps_densepose.json", + ), + CocoDatasetInfo( + name="densepose_chimps_cse_train", + images_root="densepose_chimps/images", + annotations_fpath="densepose_chimps/densepose_chimps_cse_train.json", + ), + CocoDatasetInfo( + name="densepose_chimps_cse_val", + images_root="densepose_chimps/images", + annotations_fpath="densepose_chimps/densepose_chimps_cse_val.json", + ), + CocoDatasetInfo( + name="posetrack2017_train", + images_root="posetrack2017/posetrack_data_2017", + annotations_fpath="posetrack2017/densepose_posetrack_train2017.json", + ), + CocoDatasetInfo( + name="posetrack2017_val", + images_root="posetrack2017/posetrack_data_2017", + annotations_fpath="posetrack2017/densepose_posetrack_val2017.json", + ), + CocoDatasetInfo( + name="lvis_v05_train", + images_root="coco/train2017", + annotations_fpath="lvis/lvis_v0.5_plus_dp_train.json", + ), + CocoDatasetInfo( + name="lvis_v05_val", + images_root="coco/val2017", + annotations_fpath="lvis/lvis_v0.5_plus_dp_val.json", + ), +] + + +BASE_DATASETS = [ + CocoDatasetInfo( + name="base_coco_2017_train", + images_root="coco/train2017", + annotations_fpath="coco/annotations/instances_train2017.json", + ), + CocoDatasetInfo( + name="base_coco_2017_val", + images_root="coco/val2017", + annotations_fpath="coco/annotations/instances_val2017.json", + ), + CocoDatasetInfo( + name="base_coco_2017_val_100", + images_root="coco/val2017", + annotations_fpath="coco/annotations/instances_val2017_100.json", + ), +] + + +def get_metadata(base_path: Optional[str]) -> Dict[str, Any]: + """ + Returns metadata associated with COCO 
DensePose datasets + + Args: + base_path: Optional[str] + Base path used to load metadata from + + Returns: + Dict[str, Any] + Metadata in the form of a dictionary + """ + meta = { + "densepose_transform_src": maybe_prepend_base_path(base_path, "UV_symmetry_transforms.mat"), + "densepose_smpl_subdiv": maybe_prepend_base_path(base_path, "SMPL_subdiv.mat"), + "densepose_smpl_subdiv_transform": maybe_prepend_base_path( + base_path, + "SMPL_SUBDIV_TRANSFORM.mat", + ), + } + return meta + + +def _load_coco_annotations(json_file: str): + """ + Load COCO annotations from a JSON file + + Args: + json_file: str + Path to the file to load annotations from + Returns: + Instance of `pycocotools.coco.COCO` that provides access to annotations + data + """ + from pycocotools.coco import COCO + + logger = logging.getLogger(__name__) + timer = Timer() + with contextlib.redirect_stdout(io.StringIO()): + coco_api = COCO(json_file) + if timer.seconds() > 1: + logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds())) + return coco_api + + +def _add_categories_metadata(dataset_name: str, categories: List[Dict[str, Any]]): + meta = MetadataCatalog.get(dataset_name) + meta.categories = {c["id"]: c["name"] for c in categories} + logger = logging.getLogger(__name__) + logger.info("Dataset {} categories: {}".format(dataset_name, meta.categories)) + + +def _verify_annotations_have_unique_ids(json_file: str, anns: List[List[Dict[str, Any]]]): + if "minival" in json_file: + # Skip validation on COCO2014 valminusminival and minival annotations + # The ratio of buggy annotations there is tiny and does not affect accuracy + # Therefore we explicitly white-list them + return + ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] + assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format( + json_file + ) + + +def _maybe_add_bbox(obj: Dict[str, Any], ann_dict: Dict[str, Any]): + if "bbox" not in ann_dict: + return + 
obj["bbox"] = ann_dict["bbox"] + obj["bbox_mode"] = BoxMode.XYWH_ABS + + +def _maybe_add_segm(obj: Dict[str, Any], ann_dict: Dict[str, Any]): + if "segmentation" not in ann_dict: + return + segm = ann_dict["segmentation"] + if not isinstance(segm, dict): + # filter out invalid polygons (< 3 points) + segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6] + if len(segm) == 0: + return + obj["segmentation"] = segm + + +def _maybe_add_keypoints(obj: Dict[str, Any], ann_dict: Dict[str, Any]): + if "keypoints" not in ann_dict: + return + keypts = ann_dict["keypoints"] # list[int] + for idx, v in enumerate(keypts): + if idx % 3 != 2: + # COCO's segmentation coordinates are floating points in [0, H or W], + # but keypoint coordinates are integers in [0, H-1 or W-1] + # Therefore we assume the coordinates are "pixel indices" and + # add 0.5 to convert to floating point coordinates. + keypts[idx] = v + 0.5 + obj["keypoints"] = keypts + + +def _maybe_add_densepose(obj: Dict[str, Any], ann_dict: Dict[str, Any]): + for key in DENSEPOSE_ALL_POSSIBLE_KEYS: + if key in ann_dict: + obj[key] = ann_dict[key] + + +def _combine_images_with_annotations( + dataset_name: str, + image_root: str, + img_datas: Iterable[Dict[str, Any]], + ann_datas: Iterable[Iterable[Dict[str, Any]]], +): + + ann_keys = ["iscrowd", "category_id"] + dataset_dicts = [] + contains_video_frame_info = False + + for img_dict, ann_dicts in zip(img_datas, ann_datas): + record = {} + record["file_name"] = os.path.join(image_root, img_dict["file_name"]) + record["height"] = img_dict["height"] + record["width"] = img_dict["width"] + record["image_id"] = img_dict["id"] + record["dataset"] = dataset_name + if "frame_id" in img_dict: + record["frame_id"] = img_dict["frame_id"] + record["video_id"] = img_dict.get("vid_id", None) + contains_video_frame_info = True + objs = [] + for ann_dict in ann_dicts: + assert ann_dict["image_id"] == record["image_id"] + assert ann_dict.get("ignore", 0) == 0 + obj = 
def get_contiguous_id_to_category_id_map(metadata):
    """
    Inverts `metadata.thing_dataset_id_to_contiguous_id`

    Args:
        metadata: object with the `thing_dataset_id_to_contiguous_id` mapping
            from category IDs to contiguous IDs
    Returns:
        dict that maps every contiguous ID to a category ID; if several
        categories share a contiguous ID, the one that comes first in the
        original mapping is used
    """
    inverse = {}
    for cat_id, cont_id in metadata.thing_dataset_id_to_contiguous_id.items():
        # keeps the first category seen for a contiguous ID
        inverse.setdefault(cont_id, cat_id)
    return inverse
def create_video_frame_mapping(dataset_name, dataset_dicts):
    """
    Stores a video ID -> {frame ID -> file name} mapping as the
    `video_frame_mapping` metadata entry of the dataset

    Args:
        dataset_name: name of the dataset whose metadata is updated
        dataset_dicts: dataset records; records without a "video_id"
            (or with `None` as its value) are skipped
    """
    frames_by_video = defaultdict(dict)
    for record in dataset_dicts:
        video_id = record.get("video_id")
        if video_id is not None:
            frames_by_video[video_id][record["frame_id"]] = record["file_name"]
    MetadataCatalog.get(dataset_name).set(video_frame_mapping=frames_by_video)
def register_dataset(dataset_data: CocoDatasetInfo, datasets_root: Optional[str] = None):
    """
    Registers provided COCO DensePose dataset

    Args:
        dataset_data: CocoDatasetInfo
            Dataset data
        datasets_root: Optional[str]
            Datasets root folder (default: None)
    """
    ann_path = maybe_prepend_base_path(datasets_root, dataset_data.annotations_fpath)
    img_root = maybe_prepend_base_path(datasets_root, dataset_data.images_root)

    def load_annotations():
        # runs only when the catalog is asked for the dataset
        return load_coco_json(
            annotations_json_file=ann_path,
            image_root=img_root,
            dataset_name=dataset_data.name,
        )

    DatasetCatalog.register(dataset_data.name, load_annotations)
    metadata = MetadataCatalog.get(dataset_data.name)
    metadata.set(
        json_file=ann_path,
        image_root=img_root,
        **get_metadata(DENSEPOSE_METADATA_URL_PREFIX),
    )
def register_datasets(
    datasets_data: Iterable[CocoDatasetInfo], datasets_root: Optional[str] = None
):
    """
    Registers every COCO DensePose dataset of the provided collection,
    in iteration order

    Args:
        datasets_data: Iterable[CocoDatasetInfo]
            Descriptions of the datasets to register
        datasets_root: Optional[str]
            Datasets root folder, shared by all the datasets (default: None)
    """
    for info in datasets_data:
        register_dataset(info, datasets_root)
def _load_lvis_annotations(json_file: str):
    """
    Load LVIS annotations from a JSON file

    Args:
        json_file: str
            Path to the file to load annotations from
    Returns:
        Instance of `lvis.LVIS` that provides access to annotations
        data
    """
    from lvis import LVIS

    local_path = PathManager.get_local_path(json_file)
    log = logging.getLogger(__name__)
    stopwatch = Timer()
    api = LVIS(local_path)
    # report only loads that take noticeably long
    if stopwatch.seconds() > 1:
        log.info("Loading {} takes {:.2f} seconds.".format(local_path, stopwatch.seconds()))
    return api
metadict["thing_classes"] + metadata = MetadataCatalog.get(dataset_name) + metadata.categories = {i + 1: categories[i] for i in range(len(categories))} + logger = logging.getLogger(__name__) + logger.info(f"Dataset {dataset_name} has {len(categories)} categories") + + +def _verify_annotations_have_unique_ids(json_file: str, anns: List[List[Dict[str, Any]]]) -> None: + ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] + assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format( + json_file + ) + + +def _maybe_add_bbox(obj: Dict[str, Any], ann_dict: Dict[str, Any]) -> None: + if "bbox" not in ann_dict: + return + obj["bbox"] = ann_dict["bbox"] + obj["bbox_mode"] = BoxMode.XYWH_ABS + + +def _maybe_add_segm(obj: Dict[str, Any], ann_dict: Dict[str, Any]) -> None: + if "segmentation" not in ann_dict: + return + segm = ann_dict["segmentation"] + if not isinstance(segm, dict): + # filter out invalid polygons (< 3 points) + segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6] + if len(segm) == 0: + return + obj["segmentation"] = segm + + +def _maybe_add_keypoints(obj: Dict[str, Any], ann_dict: Dict[str, Any]) -> None: + if "keypoints" not in ann_dict: + return + keypts = ann_dict["keypoints"] # list[int] + for idx, v in enumerate(keypts): + if idx % 3 != 2: + # COCO's segmentation coordinates are floating points in [0, H or W], + # but keypoint coordinates are integers in [0, H-1 or W-1] + # Therefore we assume the coordinates are "pixel indices" and + # add 0.5 to convert to floating point coordinates. 
def _combine_images_with_annotations(
    dataset_name: str,
    image_root: str,
    img_datas: Iterable[Dict[str, Any]],
    ann_datas: Iterable[Iterable[Dict[str, Any]]],
):
    """
    Builds dataset records from image entries and their annotations

    Args:
        dataset_name: str
            Name stored in the "dataset" field of every record
        image_root: str
            Prefix of the image folders; the split folder taken from the
            image URL is appended to it directly
        img_datas: Iterable[Dict[str, Any]]
            Image entries
        ann_datas: Iterable[Iterable[Dict[str, Any]]]
            Annotations of each image, in the order of `img_datas`
    Returns:
        list of records, one per image, with the instances stored under
        "annotations"
    """
    records = []
    for img_dict, ann_dicts in zip(img_datas, ann_datas):
        # Determine the path including the split folder ("train2017", "val2017", "test2017") from
        # the coco_url field. Example:
        # 'coco_url': 'http://images.cocodataset.org/train2017/000000155379.jpg'
        url_parts = img_dict["coco_url"].split("/")
        split_folder, file_name = url_parts[-2:]
        record = {
            "file_name": os.path.join(image_root + split_folder, file_name),
            "height": img_dict["height"],
            "width": img_dict["width"],
            "not_exhaustive_category_ids": img_dict.get("not_exhaustive_category_ids", []),
            "neg_category_ids": img_dict.get("neg_category_ids", []),
            "image_id": img_dict["id"],
            "dataset": dataset_name,
        }

        instances = []
        for ann_dict in ann_dicts:
            assert ann_dict["image_id"] == record["image_id"]
            instance = {}
            _maybe_add_bbox(instance, ann_dict)
            instance["iscrowd"] = ann_dict.get("iscrowd", 0)
            instance["category_id"] = ann_dict["category_id"]
            _maybe_add_segm(instance, ann_dict)
            _maybe_add_keypoints(instance, ann_dict)
            _maybe_add_densepose(instance, ann_dict)
            instances.append(instance)
        record["annotations"] = instances
        records.append(record)
    return records
Postpones category mapping to a later stage to be + able to combine several datasets with different (but coherent) sets of + categories. + + Args: + + annotations_json_file: str + Path to the JSON file with annotations in COCO instances format. + image_root: str + directory that contains all the images + dataset_name: str + the name that identifies a dataset, e.g. "densepose_coco_2014_train" + extra_annotation_keys: Optional[List[str]] + If provided, these keys are used to extract additional data from + the annotations. + """ + lvis_api = _load_lvis_annotations(PathManager.get_local_path(annotations_json_file)) + + _add_categories_metadata(dataset_name) + + # sort indices for reproducible results + img_ids = sorted(lvis_api.imgs.keys()) + # imgs is a list of dicts, each looks something like: + # {'license': 4, + # 'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg', + # 'file_name': 'COCO_val2014_000000001268.jpg', + # 'height': 427, + # 'width': 640, + # 'date_captured': '2013-11-17 05:57:24', + # 'id': 1268} + imgs = lvis_api.load_imgs(img_ids) + logger = logging.getLogger(__name__) + logger.info("Loaded {} images in LVIS format from {}".format(len(imgs), annotations_json_file)) + # anns is a list[list[dict]], where each dict is an annotation + # record for an object. The inner list enumerates the objects in an image + # and the outer list enumerates over images. 
+ anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids] + + _verify_annotations_have_unique_ids(annotations_json_file, anns) + dataset_records = _combine_images_with_annotations(dataset_name, image_root, imgs, anns) + return dataset_records + + +def register_dataset(dataset_data: CocoDatasetInfo, datasets_root: Optional[str] = None) -> None: + """ + Registers provided LVIS DensePose dataset + + Args: + dataset_data: CocoDatasetInfo + Dataset data + datasets_root: Optional[str] + Datasets root folder (default: None) + """ + annotations_fpath = maybe_prepend_base_path(datasets_root, dataset_data.annotations_fpath) + images_root = maybe_prepend_base_path(datasets_root, dataset_data.images_root) + + def load_annotations(): + return load_lvis_json( + annotations_json_file=annotations_fpath, + image_root=images_root, + dataset_name=dataset_data.name, + ) + + DatasetCatalog.register(dataset_data.name, load_annotations) + MetadataCatalog.get(dataset_data.name).set( + json_file=annotations_fpath, + image_root=images_root, + evaluator_type="lvis", + **get_metadata(DENSEPOSE_METADATA_URL_PREFIX), + ) + + +def register_datasets( + datasets_data: Iterable[CocoDatasetInfo], datasets_root: Optional[str] = None +) -> None: + """ + Registers provided LVIS DensePose datasets + + Args: + datasets_data: Iterable[CocoDatasetInfo] + An iterable of dataset datas + datasets_root: Optional[str] + Datasets root folder (default: None) + """ + for dataset_data in datasets_data: + register_dataset(dataset_data, datasets_root) diff --git a/densepose/data/image_list_dataset.py b/densepose/data/image_list_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..92a95d3d5e7d4d7d6bf1d29d51295d32ae2104d2 --- /dev/null +++ b/densepose/data/image_list_dataset.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. 
import logging
import numpy as np
from typing import Any, Callable, Dict, List, Optional, Union
import torch
from torch.utils.data.dataset import Dataset

from detectron2.data.detection_utils import read_image

ImageTransform = Callable[[torch.Tensor], torch.Tensor]


class ImageListDataset(Dataset):
    """
    Dataset that provides images from a list.
    """

    # Returned (with an empty category list) when an image cannot be read
    _EMPTY_IMAGE = torch.empty((0, 3, 1, 1))

    def __init__(
        self,
        image_list: List[str],
        category_list: Union[str, List[str], None] = None,
        transform: Optional[ImageTransform] = None,
    ):
        """
        Args:
            image_list (List[str]): list of paths to image files
            category_list (Union[str, List[str], None]): list of animal categories for
                each image. If it is a string, or None, this applies to all images
            transform (Optional[ImageTransform]): optional callable applied to the
                [1, 3, H, W] float image tensor before it is returned
        """
        # isinstance (rather than an exact type comparison) also accepts list subclasses
        if isinstance(category_list, list):
            self.category_list = category_list
        else:
            # a single category (or None) is broadcast to every image
            self.category_list = [category_list] * len(image_list)
        assert len(image_list) == len(
            self.category_list
        ), "length of image and category lists must be equal"
        self.image_list = image_list
        self.transform = transform

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """
        Gets the image with the given index from the list

        Args:
            idx (int): index of the image in the image list
        Returns:
            A dictionary containing two keys:
                images (torch.Tensor): tensor of size [N, 3, H, W] (N = 1, or 0 for _EMPTY_IMAGE)
                categories (List[str]): categories of the returned images
                    (empty if the image could not be read)
        """
        categories = [self.category_list[idx]]
        fpath = self.image_list[idx]
        transform = self.transform

        try:
            image = torch.from_numpy(np.ascontiguousarray(read_image(fpath, format="BGR")))
            image = image.permute(2, 0, 1).unsqueeze(0).float()  # HWC -> NCHW
            if transform is not None:
                image = transform(image)
            return {"images": image, "categories": categories}
        except (OSError, RuntimeError) as e:
            # best effort: log the failure and fall through to the empty result
            logger = logging.getLogger(__name__)
            logger.warning(f"Error opening image file container {fpath}: {e}")

        return {"images": self._EMPTY_IMAGE, "categories": []}

    def __len__(self):
        return len(self.image_list)


# diff --git a/densepose/data/inference_based_loader.py b/densepose/data/inference_based_loader.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..cb89544500c29c4055353060ebbc8b428bd0262a
# --- /dev/null
# +++ b/densepose/data/inference_based_loader.py
# @@ -0,0 +1,172 @@
# Copyright (c) Facebook, Inc. and its affiliates.

import random
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple
import torch
from torch import nn

SampledData = Any
ModelOutput = Any


def _grouper(iterable: Iterable[Any], n: int, fillvalue=None) -> Iterator[Tuple[Any, ...]]:
    """
    Group elements of an iterable by chunks of size `n`, e.g.
    grouper(range(9), 4) ->
        (0, 1, 2, 3), (4, 5, 6, 7), (8, None, None, None)

    Args:
        iterable (Iterable[Any]): elements to group
        n (int): chunk size, must be positive
        fillvalue: value used to pad the last chunk up to size `n`
    Raises:
        ValueError: if `n` is less than 1 (raised on first iteration); without
            this check the loop below would yield empty tuples forever
    """
    if n < 1:
        raise ValueError(f"chunk size must be positive, got {n}")
    it = iter(iterable)
    while True:
        values = []
        for _ in range(n):
            try:
                value = next(it)
            except StopIteration:
                # input exhausted: emit the padded partial chunk, if there is one
                if values:
                    values.extend([fillvalue] * (n - len(values)))
                    yield tuple(values)
                return
            values.append(value)
        yield tuple(values)


class ScoreBasedFilter:
    """
    Filters entries in model output based on their scores
    Discards all entries with score less than the specified minimum
    """

    def __init__(self, min_score: float = 0.8):
        self.min_score = min_score

    def __call__(self, model_output: ModelOutput) -> ModelOutput:
        # entries are filtered in place; entries without scores are left untouched
        for model_output_i in model_output:
            instances = model_output_i["instances"]
            if not instances.has("scores"):
                continue
            instances_filtered = instances[instances.scores >= self.min_score]
            model_output_i["instances"] = instances_filtered
        return model_output


class InferenceBasedLoader:
    """
    Data loader based on results inferred by a model. Consists of:
     - a data loader that provides batches of images
     - a model that is used to infer the results
     - a data sampler that converts inferred results to annotations
    """

    def __init__(
        self,
        model: nn.Module,
        data_loader: Iterable[List[Dict[str, Any]]],
        data_sampler: Optional[Callable[[ModelOutput], List[SampledData]]] = None,
        data_filter: Optional[Callable[[ModelOutput], ModelOutput]] = None,
        shuffle: bool = True,
        batch_size: int = 4,
        inference_batch_size: int = 4,
        drop_last: bool = False,
        category_to_class_mapping: Optional[dict] = None,
    ):
        """
        Constructor

        Args:
          model (torch.nn.Module): model used to produce data
          data_loader (Iterable[List[Dict[str, Any]]]): iterable that provides
            dictionaries with "images" and "categories" fields to perform inference on
          data_sampler (Callable: ModelOutput -> SampledData): functor
              that produces annotation data from inference results;
              (optional, default: None)
          data_filter (Callable: ModelOutput -> ModelOutput): filter
              that selects model outputs for further processing
              (optional, default: None)
          shuffle (bool): if True, the input images get shuffled
          batch_size (int): batch size for the produced annotation data
          inference_batch_size (int): batch size for input images
          drop_last (bool): if True, drop the last batch if it is undersized
          category_to_class_mapping (dict): category to class mapping
        """
        self.model = model
        self.model.eval()
        self.data_loader = data_loader
        self.data_sampler = data_sampler
        self.data_filter = data_filter
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.inference_batch_size = inference_batch_size
        self.drop_last = drop_last
        if category_to_class_mapping is not None:
            self.category_to_class_mapping = category_to_class_mapping
        else:
            self.category_to_class_mapping = {}

    def __iter__(self) -> Iterator[List[SampledData]]:
        for batch in self.data_loader:
            # batch : List[Dict[str: Tensor[N, C, H, W], str: Optional[str]]]
            # images_batch : Tensor[N, C, H, W]
            # image : Tensor[C, H, W]
            images_and_categories = [
                {"image": image, "category": category}
                for element in batch
                for image, category in zip(element["images"], element["categories"])
            ]
            if not images_and_categories:
                continue
            if self.shuffle:
                random.shuffle(images_and_categories)
            yield from self._produce_data(images_and_categories)

    def _produce_data(
        self, images_and_categories: List[Dict[str, Any]]
    ) -> Iterator[List[SampledData]]:
        """
        Produce batches of data from images

        Args:
          images_and_categories (List[Dict[str, Any]]): list of dictionaries
            with keys "image" (image tensor) and "category" (its category,
            possibly None) to process

        Returns:
          Iterator over batches of data sampled from model outputs
        """
        data_batches: List[SampledData] = []
        category_to_class_mapping = self.category_to_class_mapping
        batched_images_and_categories = _grouper(images_and_categories, self.inference_batch_size)
        for batch in batched_images_and_categories:
            # drop the None padding that _grouper adds to the last chunk;
            # NOTE(review): relies on the model exposing a `device` attribute,
            # which plain nn.Module does not define - confirm against the models used
            batch = [
                {
                    "image": image_and_category["image"].to(self.model.device),
                    "category": image_and_category["category"],
                }
                for image_and_category in batch
                if image_and_category is not None
            ]
            if not batch:
                continue
            with torch.no_grad():
                model_output = self.model(batch)
            for model_output_i, batch_i in zip(model_output, batch):
                assert len(batch_i["image"].shape) == 3
                model_output_i["image"] = batch_i["image"]
                # categories missing from the mapping fall back to class 0
                instance_class = category_to_class_mapping.get(batch_i["category"], 0)
                model_output_i["instances"].dataset_classes = torch.tensor(
                    [instance_class] * len(model_output_i["instances"])
                )
            model_output_filtered = (
                model_output if self.data_filter is None else self.data_filter(model_output)
            )
            data = (
                model_output_filtered
                if self.data_sampler is None
                else self.data_sampler(model_output_filtered)
            )
            for data_i in data:
                # keep only outputs that still contain instances
                if len(data_i["instances"]):
                    data_batches.append(data_i)
            if len(data_batches) >= self.batch_size:
                yield data_batches[: self.batch_size]
                data_batches = data_batches[self.batch_size :]
        if not self.drop_last and data_batches:
            yield data_batches


# diff --git a/densepose/data/meshes/__init__.py b/densepose/data/meshes/__init__.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..1e1f0d5dc439dc58914238b23572f586dd1c693e
# --- /dev/null
# +++ b/densepose/data/meshes/__init__.py
# @@ -0,0 +1,5 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from . import builtin

__all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")]

# diff --git a/densepose/data/meshes/builtin.py b/densepose/data/meshes/builtin.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..c0b23760e8268b068149931b173a4285ba451993
# --- /dev/null
# +++ b/densepose/data/meshes/builtin.py
# @@ -0,0 +1,101 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from .catalog import MeshInfo, register_meshes

# Base location that the relative mesh paths below are registered against
DENSEPOSE_MESHES_DIR = "https://dl.fbaipublicfiles.com/densepose/meshes/"

MESHES = [
    MeshInfo(
        name="smpl_27554",
        data="smpl_27554.pkl",
        geodists="geodists/geodists_smpl_27554.pkl",
        symmetry="symmetry/symmetry_smpl_27554.pkl",
        texcoords="texcoords/texcoords_smpl_27554.pkl",
    ),
    MeshInfo(
        name="chimp_5029",
        data="chimp_5029.pkl",
        geodists="geodists/geodists_chimp_5029.pkl",
        symmetry="symmetry/symmetry_chimp_5029.pkl",
        texcoords="texcoords/texcoords_chimp_5029.pkl",
    ),
    MeshInfo(
        name="cat_5001",
        data="cat_5001.pkl",
        geodists="geodists/geodists_cat_5001.pkl",
        symmetry="symmetry/symmetry_cat_5001.pkl",
        texcoords="texcoords/texcoords_cat_5001.pkl",
    ),
    MeshInfo(
        name="cat_7466",
        data="cat_7466.pkl",
        geodists="geodists/geodists_cat_7466.pkl",
        symmetry="symmetry/symmetry_cat_7466.pkl",
        texcoords="texcoords/texcoords_cat_7466.pkl",
    ),
    MeshInfo(
        name="sheep_5004",
        data="sheep_5004.pkl",
        geodists="geodists/geodists_sheep_5004.pkl",
        symmetry="symmetry/symmetry_sheep_5004.pkl",
        texcoords="texcoords/texcoords_sheep_5004.pkl",
    ),
    MeshInfo(
        name="zebra_5002",
        data="zebra_5002.pkl",
        geodists="geodists/geodists_zebra_5002.pkl",
        symmetry="symmetry/symmetry_zebra_5002.pkl",
        texcoords="texcoords/texcoords_zebra_5002.pkl",
    ),
    MeshInfo(
        name="horse_5004",
        data="horse_5004.pkl",
        geodists="geodists/geodists_horse_5004.pkl",
        symmetry="symmetry/symmetry_horse_5004.pkl",
        # NOTE(review): points at the zebra texture coordinates, unlike every other
        # entry - possibly intentional, confirm before changing
        texcoords="texcoords/texcoords_zebra_5002.pkl",
    ),
    MeshInfo(
        name="giraffe_5002",
        data="giraffe_5002.pkl",
        geodists="geodists/geodists_giraffe_5002.pkl",
        symmetry="symmetry/symmetry_giraffe_5002.pkl",
        texcoords="texcoords/texcoords_giraffe_5002.pkl",
    ),
    MeshInfo(
        name="elephant_5002",
        data="elephant_5002.pkl",
        geodists="geodists/geodists_elephant_5002.pkl",
        symmetry="symmetry/symmetry_elephant_5002.pkl",
        texcoords="texcoords/texcoords_elephant_5002.pkl",
    ),
    MeshInfo(
        name="dog_5002",
        data="dog_5002.pkl",
        geodists="geodists/geodists_dog_5002.pkl",
        symmetry="symmetry/symmetry_dog_5002.pkl",
        texcoords="texcoords/texcoords_dog_5002.pkl",
    ),
    MeshInfo(
        name="dog_7466",
        data="dog_7466.pkl",
        geodists="geodists/geodists_dog_7466.pkl",
        symmetry="symmetry/symmetry_dog_7466.pkl",
        texcoords="texcoords/texcoords_dog_7466.pkl",
    ),
    MeshInfo(
        name="cow_5002",
        data="cow_5002.pkl",
        geodists="geodists/geodists_cow_5002.pkl",
        symmetry="symmetry/symmetry_cow_5002.pkl",
        texcoords="texcoords/texcoords_cow_5002.pkl",
    ),
    MeshInfo(
        name="bear_4936",
        data="bear_4936.pkl",
        geodists="geodists/geodists_bear_4936.pkl",
        symmetry="symmetry/symmetry_bear_4936.pkl",
        texcoords="texcoords/texcoords_bear_4936.pkl",
    ),
]

register_meshes(MESHES, DENSEPOSE_MESHES_DIR)

# diff --git a/densepose/data/meshes/catalog.py b/densepose/data/meshes/catalog.py
# new file mode 100644
# index
0000000000000000000000000000000000000000..b258f3ce11a90666b9c764541ce299384cfddf4e --- /dev/null +++ b/densepose/data/meshes/catalog.py @@ -0,0 +1,71 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import logging +from collections import UserDict +from dataclasses import dataclass +from typing import Iterable, Optional + +from ..utils import maybe_prepend_base_path + + +@dataclass +class MeshInfo: + name: str + data: str + geodists: Optional[str] = None + symmetry: Optional[str] = None + texcoords: Optional[str] = None + + +class _MeshCatalog(UserDict): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.mesh_ids = {} + self.mesh_names = {} + self.max_mesh_id = -1 + + def __setitem__(self, key, value): + if key in self: + logger = logging.getLogger(__name__) + logger.warning( + f"Overwriting mesh catalog entry '{key}': old value {self[key]}" + f", new value {value}" + ) + mesh_id = self.mesh_ids[key] + else: + self.max_mesh_id += 1 + mesh_id = self.max_mesh_id + super().__setitem__(key, value) + self.mesh_ids[key] = mesh_id + self.mesh_names[mesh_id] = key + + def get_mesh_id(self, shape_name: str) -> int: + return self.mesh_ids[shape_name] + + def get_mesh_name(self, mesh_id: int) -> str: + return self.mesh_names[mesh_id] + + +MeshCatalog = _MeshCatalog() + + +def register_mesh(mesh_info: MeshInfo, base_path: Optional[str]) -> None: + geodists, symmetry, texcoords = mesh_info.geodists, mesh_info.symmetry, mesh_info.texcoords + if geodists: + geodists = maybe_prepend_base_path(base_path, geodists) + if symmetry: + symmetry = maybe_prepend_base_path(base_path, symmetry) + if texcoords: + texcoords = maybe_prepend_base_path(base_path, texcoords) + MeshCatalog[mesh_info.name] = MeshInfo( + name=mesh_info.name, + data=maybe_prepend_base_path(base_path, mesh_info.data), + geodists=geodists, + symmetry=symmetry, + texcoords=texcoords, + ) + + +def register_meshes(mesh_infos: Iterable[MeshInfo], base_path: 
Optional[str]) -> None: + for mesh_info in mesh_infos: + register_mesh(mesh_info, base_path) diff --git a/densepose/data/samplers/__init__.py b/densepose/data/samplers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7dba87ea1c6f37ab56071d2f5d715bd78fe8816f --- /dev/null +++ b/densepose/data/samplers/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +from .densepose_uniform import DensePoseUniformSampler +from .densepose_confidence_based import DensePoseConfidenceBasedSampler +from .densepose_cse_uniform import DensePoseCSEUniformSampler +from .densepose_cse_confidence_based import DensePoseCSEConfidenceBasedSampler +from .mask_from_densepose import MaskFromDensePoseSampler +from .prediction_to_gt import PredictionToGroundTruthSampler diff --git a/densepose/data/samplers/densepose_base.py b/densepose/data/samplers/densepose_base.py new file mode 100644 index 0000000000000000000000000000000000000000..4d499d8f20d811fb8197d7bdae358540bb5b0dfc --- /dev/null +++ b/densepose/data/samplers/densepose_base.py @@ -0,0 +1,203 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +from typing import Any, Dict, List, Tuple +import torch +from torch.nn import functional as F + +from detectron2.structures import BoxMode, Instances + +from densepose.converters import ToChartResultConverter +from densepose.converters.base import IntTupleBox, make_int_box +from densepose.structures import DensePoseDataRelative, DensePoseList + + +class DensePoseBaseSampler: + """ + Base DensePose sampler to produce DensePose data from DensePose predictions. + Samples for each class are drawn according to some distribution over all pixels estimated + to belong to that class. 
+ """ + + def __init__(self, count_per_class: int = 8): + """ + Constructor + + Args: + count_per_class (int): the sampler produces at most `count_per_class` + samples for each category + """ + self.count_per_class = count_per_class + + def __call__(self, instances: Instances) -> DensePoseList: + """ + Convert DensePose predictions (an instance of `DensePoseChartPredictorOutput`) + into DensePose annotations data (an instance of `DensePoseList`) + """ + boxes_xyxy_abs = instances.pred_boxes.tensor.clone().cpu() + boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + dp_datas = [] + for i in range(len(boxes_xywh_abs)): + annotation_i = self._sample(instances[i], make_int_box(boxes_xywh_abs[i])) + annotation_i[DensePoseDataRelative.S_KEY] = self._resample_mask( # pyre-ignore[6] + instances[i].pred_densepose + ) + dp_datas.append(DensePoseDataRelative(annotation_i)) + # create densepose annotations on CPU + dp_list = DensePoseList(dp_datas, boxes_xyxy_abs, instances.image_size) + return dp_list + + def _sample(self, instance: Instances, bbox_xywh: IntTupleBox) -> Dict[str, List[Any]]: + """ + Sample DensPoseDataRelative from estimation results + """ + labels, dp_result = self._produce_labels_and_results(instance) + annotation = { + DensePoseDataRelative.X_KEY: [], + DensePoseDataRelative.Y_KEY: [], + DensePoseDataRelative.U_KEY: [], + DensePoseDataRelative.V_KEY: [], + DensePoseDataRelative.I_KEY: [], + } + n, h, w = dp_result.shape + for part_id in range(1, DensePoseDataRelative.N_PART_LABELS + 1): + # indices - tuple of 3 1D tensors of size k + # 0: index along the first dimension N + # 1: index along H dimension + # 2: index along W dimension + indices = torch.nonzero(labels.expand(n, h, w) == part_id, as_tuple=True) + # values - an array of size [n, k] + # n: number of channels (U, V, confidences) + # k: number of points labeled with part_id + values = dp_result[indices].view(n, -1) + k = values.shape[1] + count = 
min(self.count_per_class, k) + if count <= 0: + continue + index_sample = self._produce_index_sample(values, count) + sampled_values = values[:, index_sample] + sampled_y = indices[1][index_sample] + 0.5 + sampled_x = indices[2][index_sample] + 0.5 + # prepare / normalize data + x = (sampled_x / w * 256.0).cpu().tolist() + y = (sampled_y / h * 256.0).cpu().tolist() + u = sampled_values[0].clamp(0, 1).cpu().tolist() + v = sampled_values[1].clamp(0, 1).cpu().tolist() + fine_segm_labels = [part_id] * count + # extend annotations + annotation[DensePoseDataRelative.X_KEY].extend(x) + annotation[DensePoseDataRelative.Y_KEY].extend(y) + annotation[DensePoseDataRelative.U_KEY].extend(u) + annotation[DensePoseDataRelative.V_KEY].extend(v) + annotation[DensePoseDataRelative.I_KEY].extend(fine_segm_labels) + return annotation + + def _produce_index_sample(self, values: torch.Tensor, count: int): + """ + Abstract method to produce a sample of indices to select data + To be implemented in descendants + + Args: + values (torch.Tensor): an array of size [n, k] that contains + estimated values (U, V, confidences); + n: number of channels (U, V, confidences) + k: number of points labeled with part_id + count (int): number of samples to produce, should be positive and <= k + + Return: + list(int): indices of values (along axis 1) selected as a sample + """ + raise NotImplementedError + + def _produce_labels_and_results(self, instance: Instances) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Method to get labels and DensePose results from an instance + + Args: + instance (Instances): an instance of `DensePoseChartPredictorOutput` + + Return: + labels (torch.Tensor): shape [H, W], DensePose segmentation labels + dp_result (torch.Tensor): shape [2, H, W], stacked DensePose results u and v + """ + converter = ToChartResultConverter + chart_result = converter.convert(instance.pred_densepose, instance.pred_boxes) + labels, dp_result = chart_result.labels.cpu(), chart_result.uv.cpu() + 
return labels, dp_result + + def _resample_mask(self, output: Any) -> torch.Tensor: + """ + Convert DensePose predictor output to segmentation annotation - tensors of size + (256, 256) and type `int64`. + + Args: + output: DensePose predictor output with the following attributes: + - coarse_segm: tensor of size [N, D, H, W] with unnormalized coarse + segmentation scores + - fine_segm: tensor of size [N, C, H, W] with unnormalized fine + segmentation scores + Return: + Tensor of size (S, S) and type `int64` with coarse segmentation annotations, + where S = DensePoseDataRelative.MASK_SIZE + """ + sz = DensePoseDataRelative.MASK_SIZE + S = ( + F.interpolate(output.coarse_segm, (sz, sz), mode="bilinear", align_corners=False) + .argmax(dim=1) + .long() + ) + I = ( + ( + F.interpolate( + output.fine_segm, + (sz, sz), + mode="bilinear", + align_corners=False, + ).argmax(dim=1) + * (S > 0).long() + ) + .squeeze() + .cpu() + ) + # Map fine segmentation results to coarse segmentation ground truth + # TODO: extract this into separate classes + # coarse segmentation: 1 = Torso, 2 = Right Hand, 3 = Left Hand, + # 4 = Left Foot, 5 = Right Foot, 6 = Upper Leg Right, 7 = Upper Leg Left, + # 8 = Lower Leg Right, 9 = Lower Leg Left, 10 = Upper Arm Left, + # 11 = Upper Arm Right, 12 = Lower Arm Left, 13 = Lower Arm Right, + # 14 = Head + # fine segmentation: 1, 2 = Torso, 3 = Right Hand, 4 = Left Hand, + # 5 = Left Foot, 6 = Right Foot, 7, 9 = Upper Leg Right, + # 8, 10 = Upper Leg Left, 11, 13 = Lower Leg Right, + # 12, 14 = Lower Leg Left, 15, 17 = Upper Arm Left, + # 16, 18 = Upper Arm Right, 19, 21 = Lower Arm Left, + # 20, 22 = Lower Arm Right, 23, 24 = Head + FINE_TO_COARSE_SEGMENTATION = { + 1: 1, + 2: 1, + 3: 2, + 4: 3, + 5: 4, + 6: 5, + 7: 6, + 8: 7, + 9: 6, + 10: 7, + 11: 8, + 12: 9, + 13: 8, + 14: 9, + 15: 10, + 16: 11, + 17: 10, + 18: 11, + 19: 12, + 20: 13, + 21: 12, + 22: 13, + 23: 14, + 24: 14, + } + mask = torch.zeros((sz, sz), dtype=torch.int64, 
device=torch.device("cpu")) + for i in range(DensePoseDataRelative.N_PART_LABELS): + mask[I == i + 1] = FINE_TO_COARSE_SEGMENTATION[i + 1] + return mask diff --git a/densepose/data/samplers/densepose_confidence_based.py b/densepose/data/samplers/densepose_confidence_based.py new file mode 100644 index 0000000000000000000000000000000000000000..48e325b06e46817dafc0da2d984a8626d754e119 --- /dev/null +++ b/densepose/data/samplers/densepose_confidence_based.py @@ -0,0 +1,108 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +import random +from typing import Optional, Tuple +import torch + +from densepose.converters import ToChartResultConverterWithConfidences + +from .densepose_base import DensePoseBaseSampler + + +class DensePoseConfidenceBasedSampler(DensePoseBaseSampler): + """ + Samples DensePose data from DensePose predictions. + Samples for each class are drawn using confidence value estimates. + """ + + def __init__( + self, + confidence_channel: str, + count_per_class: int = 8, + search_count_multiplier: Optional[float] = None, + search_proportion: Optional[float] = None, + ): + """ + Constructor + + Args: + confidence_channel (str): confidence channel to use for sampling; + possible values: + "sigma_2": confidences for UV values + "fine_segm_confidence": confidences for fine segmentation + "coarse_segm_confidence": confidences for coarse segmentation + (default: "sigma_2") + count_per_class (int): the sampler produces at most `count_per_class` + samples for each category (default: 8) + search_count_multiplier (float or None): if not None, the total number + of the most confident estimates of a given class to consider is + defined as `min(search_count_multiplier * count_per_class, N)`, + where `N` is the total number of estimates of the class; cannot be + specified together with `search_proportion` (default: None) + search_proportion (float or None): if not None, the total number of the + of the most confident estimates of a given class to consider is + 
defined as `min(max(search_proportion * N, count_per_class), N)`, + where `N` is the total number of estimates of the class; cannot be + specified together with `search_count_multiplier` (default: None) + """ + super().__init__(count_per_class) + self.confidence_channel = confidence_channel + self.search_count_multiplier = search_count_multiplier + self.search_proportion = search_proportion + assert (search_count_multiplier is None) or (search_proportion is None), ( + f"Cannot specify both search_count_multiplier (={search_count_multiplier})" + f"and search_proportion (={search_proportion})" + ) + + def _produce_index_sample(self, values: torch.Tensor, count: int): + """ + Produce a sample of indices to select data based on confidences + + Args: + values (torch.Tensor): an array of size [n, k] that contains + estimated values (U, V, confidences); + n: number of channels (U, V, confidences) + k: number of points labeled with part_id + count (int): number of samples to produce, should be positive and <= k + + Return: + list(int): indices of values (along axis 1) selected as a sample + """ + k = values.shape[1] + if k == count: + index_sample = list(range(k)) + else: + # take the best count * search_count_multiplier pixels, + # sample from them uniformly + # (here best = smallest variance) + _, sorted_confidence_indices = torch.sort(values[2]) + if self.search_count_multiplier is not None: + search_count = min(int(count * self.search_count_multiplier), k) + elif self.search_proportion is not None: + search_count = min(max(int(k * self.search_proportion), count), k) + else: + search_count = min(count, k) + sample_from_top = random.sample(range(search_count), count) + index_sample = sorted_confidence_indices[:search_count][sample_from_top] + return index_sample + + def _produce_labels_and_results(self, instance) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Method to get labels and DensePose results from an instance, with confidences + + Args: + instance (Instances): an 
instance of `DensePoseChartPredictorOutputWithConfidences` + + Return: + labels (torch.Tensor): shape [H, W], DensePose segmentation labels + dp_result (torch.Tensor): shape [3, H, W], DensePose results u and v + stacked with the confidence channel + """ + converter = ToChartResultConverterWithConfidences + chart_result = converter.convert(instance.pred_densepose, instance.pred_boxes) + labels, dp_result = chart_result.labels.cpu(), chart_result.uv.cpu() + dp_result = torch.cat( + (dp_result, getattr(chart_result, self.confidence_channel)[None].cpu()) + ) + + return labels, dp_result diff --git a/densepose/data/samplers/densepose_cse_base.py b/densepose/data/samplers/densepose_cse_base.py new file mode 100644 index 0000000000000000000000000000000000000000..845545c1438b9d2a4fbb4c6dac0642461a7e539f --- /dev/null +++ b/densepose/data/samplers/densepose_cse_base.py @@ -0,0 +1,139 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +from typing import Any, Dict, List, Tuple +import torch +from torch.nn import functional as F + +from detectron2.config import CfgNode +from detectron2.structures import Instances + +from densepose.converters.base import IntTupleBox +from densepose.data.utils import get_class_to_mesh_name_mapping +from densepose.modeling.cse.utils import squared_euclidean_distance_matrix +from densepose.structures import DensePoseDataRelative + +from .densepose_base import DensePoseBaseSampler + + +class DensePoseCSEBaseSampler(DensePoseBaseSampler): + """ + Base DensePose sampler to produce DensePose data from DensePose predictions. + Samples for each class are drawn according to some distribution over all pixels estimated + to belong to that class. 
def _sample(self, instance: Instances, bbox_xywh: IntTupleBox) -> Dict[str, List[Any]]:
    """
    Sample DensePoseDataRelative-style point annotations from estimation results.

    Args:
        instance (Instances): a single prediction; its class is read from
            `dataset_classes` when `self.use_gt_categories` is set and from
            `pred_classes` otherwise
        bbox_xywh (IntTupleBox): bounding box of the instance, only its width and
            height are used here

    Return:
        Dict[str, List[Any]]: X / Y coordinates (scaled to the 0..256 range relative
            to the box size), the closest mesh vertex ID for every sampled point and
            the mesh name; the three lists stay empty if nothing can be sampled
    """
    class_tensor = instance.dataset_classes if self.use_gt_categories else instance.pred_classes
    mesh_name = self.class_to_mesh_name[class_tensor.tolist()[0]]

    annotation = {
        DensePoseDataRelative.X_KEY: [],
        DensePoseDataRelative.Y_KEY: [],
        DensePoseDataRelative.VERTEX_IDS_KEY: [],
        DensePoseDataRelative.MESH_NAME_KEY: mesh_name,
    }

    mask, embeddings, other_values = self._produce_mask_and_results(instance, bbox_xywh)
    # (row indices, column indices) of all pixels selected by the mask
    fg_pixels = torch.nonzero(mask, as_tuple=True)
    fg_embeddings = embeddings.permute(1, 2, 0)[fg_pixels].cpu()
    fg_values = other_values[:, fg_pixels[0], fg_pixels[1]]

    count = min(self.count_per_class, fg_values.shape[1])
    if count <= 0:
        # nothing to sample from: return the empty annotation
        return annotation

    picked = self._produce_index_sample(fg_values, count)
    # for every sampled pixel find the mesh vertex with the nearest embedding
    distances = squared_euclidean_distance_matrix(fg_embeddings[picked], self.embedder(mesh_name))
    vertex_ids = torch.argmin(distances, dim=1)

    # shift to pixel centers, then normalize by the box size
    _, _, w, h = bbox_xywh
    y_scaled = ((fg_pixels[0][picked] + 0.5) / h * 256.0).cpu().tolist()
    x_scaled = ((fg_pixels[1][picked] + 0.5) / w * 256.0).cpu().tolist()

    annotation[DensePoseDataRelative.X_KEY].extend(x_scaled)
    annotation[DensePoseDataRelative.Y_KEY].extend(y_scaled)
    annotation[DensePoseDataRelative.VERTEX_IDS_KEY].extend(vertex_ids.cpu().tolist())
    return annotation
class DensePoseCSEConfidenceBasedSampler(DensePoseCSEBaseSampler):
    """
    Samples DensePose data from DensePose predictions.
    Samples for each class are drawn using confidence value estimates.
    """

    def __init__(
        self,
        cfg: CfgNode,
        use_gt_categories: bool,
        embedder: torch.nn.Module,
        confidence_channel: str,
        count_per_class: int = 8,
        search_count_multiplier: Optional[float] = None,
        search_proportion: Optional[float] = None,
    ):
        """
        Constructor

        Args:
          cfg (CfgNode): the config of the model
          use_gt_categories (bool): passed through to the base sampler
          embedder (torch.nn.Module): necessary to compute mesh vertex embeddings
          confidence_channel (str): name of the predictor output attribute that
              holds the confidences used for sampling,
              e.g. "coarse_segm_confidence"
          count_per_class (int): the sampler produces at most `count_per_class`
              samples for each category (default: 8)
          search_count_multiplier (float or None): if not None, the total number
              of the most confident estimates of a given class to consider is
              defined as `min(max(search_count_multiplier * count, count), N)`,
              where `count` is the number of samples requested and `N` is the
              total number of estimates of the class; cannot be
              specified together with `search_proportion` (default: None)
          search_proportion (float or None): if not None, the total number of
              the most confident estimates of a given class to consider is
              defined as `min(max(search_proportion * N, count), N)`,
              where `N` is the total number of estimates of the class; cannot be
              specified together with `search_count_multiplier` (default: None)
        """
        super().__init__(cfg, use_gt_categories, embedder, count_per_class)
        self.confidence_channel = confidence_channel
        self.search_count_multiplier = search_count_multiplier
        self.search_proportion = search_proportion
        assert (search_count_multiplier is None) or (search_proportion is None), (
            f"Cannot specify both search_count_multiplier (={search_count_multiplier}) "
            f"and search_proportion (={search_proportion})"
        )

    def _produce_index_sample(self, values: torch.Tensor, count: int):
        """
        Produce a sample of indices to select data based on confidences

        Args:
            values (torch.Tensor): a tensor of size [n, k]; the first row
                `values[0]` is used as the confidences
                k: number of candidate points
            count (int): number of samples to produce, should be positive and <= k

        Return:
            indices of values (along axis 1) selected as a sample: a list of ints
            when `k == count`, a tensor of indices otherwise
        """
        k = values.shape[1]
        if k == count:
            # every candidate is needed, no ranking required
            return list(range(k))

        # torch.sort is ascending, so the most confident candidates end up
        # at the tail of `sorted_confidence_indices`
        _, sorted_confidence_indices = torch.sort(values[0])
        if self.search_count_multiplier is not None:
            # never search among fewer than `count` candidates: a multiplier
            # below 1 would otherwise make random.sample raise ValueError
            search_count = min(max(int(count * self.search_count_multiplier), count), k)
        elif self.search_proportion is not None:
            search_count = min(max(int(k * self.search_proportion), count), k)
        else:
            search_count = min(count, k)
        # sample uniformly among the `search_count` most confident candidates
        sample_from_top = random.sample(range(search_count), count)
        return sorted_confidence_indices[-search_count:][sample_from_top]

    def _produce_mask_and_results(
        self, instance: Instances, bbox_xywh: IntTupleBox
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Method to get labels and DensePose results from an instance

        Args:
            instance (Instances): an instance of
                `DensePoseEmbeddingPredictorOutputWithConfidences`
            bbox_xywh (IntTupleBox): the corresponding bounding box

        Return:
            mask (torch.Tensor): shape [H, W], DensePose segmentation mask
            embeddings (torch.Tensor): a tensor of shape [D, H, W],
                DensePose CSE Embeddings
            other_values (torch.Tensor): confidences read from the configured
                confidence channel, resized to the box size and moved to CPU
        """
        _, _, w, h = bbox_xywh
        densepose_output = instance.pred_densepose
        # mask and embeddings come from the base class; its placeholder
        # `other_values` is replaced by the resized confidence channel
        mask, embeddings, _ = super()._produce_mask_and_results(instance, bbox_xywh)
        other_values = F.interpolate(
            getattr(densepose_output, self.confidence_channel),
            size=(h, w),
            mode="bilinear",
        )[0].cpu()
        return mask, embeddings, other_values
class DensePoseUniformSampler(DensePoseBaseSampler):
    """
    Sampler that turns DensePose predictions into DensePose data by picking,
    for each class, points uniformly at random among all pixels estimated
    to belong to that class.
    """

    def __init__(self, count_per_class: int = 8):
        """
        Constructor

        Args:
          count_per_class (int): upper bound on the number of samples
              produced for each category
        """
        super().__init__(count_per_class)

    def _produce_index_sample(self, values: torch.Tensor, count: int):
        """
        Draw `count` distinct indices uniformly at random

        Args:
            values (torch.Tensor): an array of size [n, k] that contains
                estimated values (U, V, confidences);
                n: number of channels (U, V, confidences)
                k: number of points labeled with part_id
            count (int): number of samples to produce, should be positive and <= k

        Return:
            list(int): indices of values (along axis 1) selected as a sample
        """
        candidate_indices = range(values.shape[1])
        return random.sample(candidate_indices, count)
class MaskFromDensePoseSampler:
    """
    Produce mask GT from DensePose predictions.
    This sampler simply hands the DensePose predictions, the predicted boxes
    and the image size over to `ToMaskConverter` and returns its result.
    """

    def __call__(self, instances: Instances) -> BitMasks:
        """
        Convert predicted data from `instances` into the GT mask data

        Args:
            instances (Instances): predicted results, expected to have the
                `pred_densepose`, `pred_boxes` and `image_size` fields

        Returns:
            the masks produced by `ToMaskConverter.convert` for the given
            predictions
        """
        densepose_predictions = instances.pred_densepose
        predicted_boxes = instances.pred_boxes
        image_size = instances.image_size
        return ToMaskConverter.convert(densepose_predictions, predicted_boxes, image_size)
def __call__(self, model_output: List[ModelOutput]) -> List[SampledData]:
    """
    Turn model output into ground truth data by applying the registered samplers

    Args:
        model_output (List[Dict[str, Any]]): per-image model outputs, each one
            holding the predictions under the "instances" key
    Returns:
        List[Dict[str, Any]]: the same list, modified in place: sampled fields
            are added to the instances, consumed source fields are removed and
            the "dataset" entry is set to the sampler's dataset name
    """
    for output in model_output:
        instances: Instances = output["instances"]
        # pass 1: fill every destination field whose source field is present
        for sampler in self._samplers.values():
            if instances.has(sampler.src) and sampler.dst is not None:
                # without a sampling function the source value is copied by reference
                sampled = (
                    instances.get(sampler.src)
                    if sampler.func is None
                    else sampler.func(instances)
                )
                instances.set(sampler.dst, sampled)
        # pass 2: drop the source fields; done separately so that several
        # samplers can read the same source field during pass 1
        for sampler in self._samplers.values():
            if sampler.src != sampler.dst and instances.has(sampler.src):
                instances.remove(sampler.src)
        output["dataset"] = self.dataset_name
    return model_output
class ImageResizeTransform:
    """
    Resizes a batch of images loaded from a dataset
    (BGR data in NCHW channel order, typically uint8) and converts it to the
    format consumed by DensePose training (BGR float32 data in NCHW channel order)
    """

    def __init__(self, min_size: int = 800, max_size: int = 1333):
        self.min_size = min_size
        self.max_size = max_size

    def __call__(self, images: torch.Tensor) -> torch.Tensor:
        """
        Args:
            images (torch.Tensor): tensor of size [N, 3, H, W] that contains
                BGR data (typically in uint8)
        Returns:
            images (torch.Tensor): tensor of size [N, 3, H1, W1] where
                H1 and W1 result from a single scale factor applied to both
                sides, chosen so that neither the min nor the max size is
                exceeded; the data type is `torch.float32`
        """
        float_images = images.float()
        spatial_dims = float_images.shape[-2:]
        shortest_side = min(spatial_dims)
        longest_side = max(spatial_dims)
        # the most restrictive of the two limits defines the scale
        scale = min(self.min_size / shortest_side, self.max_size / longest_side)
        return torch.nn.functional.interpolate(
            float_images,
            scale_factor=scale,
            mode="bilinear",
            align_corners=False,
        )
def is_relative_local_path(path: str) -> bool:
    """
    Tell whether `path` is a relative path without a URI scheme separator.

    Args:
        path (str): path to check
    Returns:
        bool: False if the path contains "://" or is absolute, True otherwise
    """
    decoded = os.fsdecode(path)
    if "://" in decoded:
        # looks like a URI (scheme://...), not a local path
        return False
    return not os.path.isabs(path)
class RandomKFramesSelector(Callable):  # pyre-ignore[39]
    """
    Selector that keeps a random subset of at most `k` frames
    """

    def __init__(self, k: int):
        self.k = k

    def __call__(self, frame_tss: FrameTsList) -> FrameTsList:
        """
        Select `k` random frames

        Args:
          frame_tss (List[int]): timestamps of input frames
        Returns:
          List[int]: timestamps of selected frames; all input timestamps
            (in random order) if there are fewer than `k` of them
        """
        sample_size = min(self.k, len(frame_tss))
        return random.sample(frame_tss, sample_size)
class LastKFramesSelector(Callable):  # pyre-ignore[39]
    """
    Selector that retains at most `k` last frames from video data
    """

    def __init__(self, k: int):
        self.k = k

    def __call__(self, frame_tss: FrameTsList) -> FrameTsList:
        """
        Select `k` last frames

        Args:
          frame_tss (List[int]): timestamps of input frames
        Returns:
          List[int]: timestamps of selected frames; empty if `k` is not positive
        """
        if self.k <= 0:
            # `frame_tss[-0:]` is the whole list, so a non-positive `k`
            # has to be handled explicitly to select nothing
            return frame_tss[:0]
        return frame_tss[-self.k :]
+ + Args: + video_fpath (str): Video file path + video_stream_idx (int): Video stream index (default: 0) + Returns: + List[int]: list of keyframe timestaps (timestamp is a count in timebase + units) + """ + try: + with PathManager.open(video_fpath, "rb") as io: + container = av.open(io, mode="r") + stream = container.streams.video[video_stream_idx] + keyframes = [] + pts = -1 + # Note: even though we request forward seeks for keyframes, sometimes + # a keyframe in backwards direction is returned. We introduce tolerance + # as a max count of ignored backward seeks + tolerance_backward_seeks = 2 + while True: + try: + container.seek(pts + 1, backward=False, any_frame=False, stream=stream) + except av.AVError as e: + # the exception occurs when the video length is exceeded, + # we then return whatever data we've already collected + logger = logging.getLogger(__name__) + logger.debug( + f"List keyframes: Error seeking video file {video_fpath}, " + f"video stream {video_stream_idx}, pts {pts + 1}, AV error: {e}" + ) + return keyframes + except OSError as e: + logger = logging.getLogger(__name__) + logger.warning( + f"List keyframes: Error seeking video file {video_fpath}, " + f"video stream {video_stream_idx}, pts {pts + 1}, OS error: {e}" + ) + return [] + packet = next(container.demux(video=video_stream_idx)) + if packet.pts is not None and packet.pts <= pts: + logger = logging.getLogger(__name__) + logger.warning( + f"Video file {video_fpath}, stream {video_stream_idx}: " + f"bad seek for packet {pts + 1} (got packet {packet.pts}), " + f"tolerance {tolerance_backward_seeks}." 
def read_keyframes(
    video_fpath: str, keyframes: FrameTsList, video_stream_idx: int = 0
) -> FrameList:  # pyre-ignore[11]
    """
    Reads keyframe data from a video file.

    Reading stops at the first timestamp that cannot be sought or decoded; the
    frames collected up to that point are returned. Errors are logged as
    warnings instead of being raised.

    Args:
        video_fpath (str): Video file path
        keyframes (List[int]): List of keyframe timestamps (as counts in
            timebase units to be used in container seek operations)
        video_stream_idx (int): Video stream index (default: 0)
    Returns:
        List[Frame]: list of frames that correspond to the specified timestamps;
            an empty list if the container could not be opened
    """
    logger = logging.getLogger(__name__)
    try:
        with PathManager.open(video_fpath, "rb") as io:
            container = av.open(io)
            try:
                stream = container.streams.video[video_stream_idx]
                frames = []
                for pts in keyframes:
                    try:
                        container.seek(pts, any_frame=False, stream=stream)
                        # decode from the same stream that was used for seeking
                        frame = next(container.decode(video=video_stream_idx))
                        frames.append(frame)
                    except av.AVError as e:
                        logger.warning(
                            f"Read keyframes: Error seeking video file {video_fpath}, "
                            f"video stream {video_stream_idx}, pts {pts}, AV error: {e}"
                        )
                        return frames
                    except OSError as e:
                        logger.warning(
                            f"Read keyframes: Error seeking video file {video_fpath}, "
                            f"video stream {video_stream_idx}, pts {pts}, OS error: {e}"
                        )
                        return frames
                    except StopIteration:
                        logger.warning(
                            f"Read keyframes: Error decoding frame from {video_fpath}, "
                            f"video stream {video_stream_idx}, pts {pts}"
                        )
                        return frames
                return frames
            finally:
                # close the container on every exit path, including unexpected errors
                container.close()
    except OSError as e:
        logger.warning(
            f"Read keyframes: Error opening video file container {video_fpath}, OS error: {e}"
        )
    except RuntimeError as e:
        logger.warning(
            f"Read keyframes: Error opening video file container {video_fpath}, Runtime error: {e}"
        )
    return []
class VideoKeyframeDataset(Dataset):
    """
    Dataset that provides keyframes for a set of videos.
    """

    # returned as "images" when no keyframes could be listed or read
    _EMPTY_FRAMES = torch.empty((0, 3, 1, 1))

    def __init__(
        self,
        video_list: List[str],
        category_list: Union[str, List[str], None] = None,
        frame_selector: Optional[FrameSelector] = None,
        transform: Optional[FrameTransform] = None,
        keyframe_helper_fpath: Optional[str] = None,
    ):
        """
        Dataset constructor

        Args:
            video_list (List[str]): list of paths to video files
            category_list (Union[str, List[str], None]): list of animal categories for each
                video file. If it is a string, or None, this applies to all videos
            frame_selector (Callable: KeyFrameList -> KeyFrameList):
                selects keyframes to process, keyframes are given by
                packet timestamps in timebase counts. If None, all keyframes
                are selected (default: None)
            transform (Callable: torch.Tensor -> torch.Tensor):
                transforms a batch of BGR float images (tensors of size
                [B, 3, H, W]). If None, no transform is applied (default: None)
            keyframe_helper_fpath (Optional[str]): path to a CSV file with
                precomputed keyframes, read with `read_keyframe_helper_data`.
                If None, keyframes are always listed from the video files
                (default: None)
        """
        # isinstance (rather than an exact type comparison) also accepts
        # list subclasses as per-video category lists
        if isinstance(category_list, list):
            self.category_list = category_list
        else:
            self.category_list = [category_list] * len(video_list)
        assert len(video_list) == len(
            self.category_list
        ), "length of video and category lists must be equal"
        self.video_list = video_list
        self.frame_selector = frame_selector
        self.transform = transform
        self.keyframe_helper_data = (
            read_keyframe_helper_data(keyframe_helper_fpath)
            if keyframe_helper_fpath is not None
            else None
        )

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """
        Gets selected keyframes from a given video

        Args:
            idx (int): video index in the video list file
        Returns:
            A dictionary containing two keys:
                images (torch.Tensor): float tensor of size [N, 3, H, W] with
                    BGR keyframe data, or whatever the transform returns for it;
                    an empty tensor if no keyframes could be listed or read
                categories (List[str]): single-element list with the category
                    of the video; empty if no frames are returned
        """
        categories = [self.category_list[idx]]
        fpath = self.video_list[idx]
        # use precomputed keyframes when the helper data has an entry for
        # this index, otherwise scan the video file
        keyframes = (
            list_keyframes(fpath)
            if self.keyframe_helper_data is None or idx not in self.keyframe_helper_data
            else self.keyframe_helper_data[idx]
        )
        transform = self.transform
        frame_selector = self.frame_selector
        if not keyframes:
            return {"images": self._EMPTY_FRAMES, "categories": []}
        if frame_selector is not None:
            keyframes = frame_selector(keyframes)
        frames = read_keyframes(fpath, keyframes)
        if not frames:
            return {"images": self._EMPTY_FRAMES, "categories": []}
        frames = np.stack([frame.to_rgb().to_ndarray() for frame in frames])
        frames = torch.as_tensor(frames, device=torch.device("cpu"))
        frames = frames[..., [2, 1, 0]]  # RGB -> BGR
        frames = frames.permute(0, 3, 1, 2).float()  # NHWC -> NCHW
        if transform is not None:
            frames = transform(frames)
        return {"images": frames, "categories": categories}

    def __len__(self):
        return len(self.video_list)
# TODO: the only reason to copy the base class code here is to pass the embedder from
# the model to the evaluator; that should be refactored to avoid unnecessary copy-pasting
@classmethod
def test(
    cls,
    cfg: CfgNode,
    model: nn.Module,
    evaluators: Optional[Union[DatasetEvaluator, List[DatasetEvaluator]]] = None,
):
    """
    Run inference on every dataset in ``cfg.DATASETS.TEST`` and collect the metrics.

    Args:
        cfg (CfgNode):
        model (nn.Module):
        evaluators (DatasetEvaluator, list[DatasetEvaluator] or None): if None, will call
            :meth:`build_evaluator`. Otherwise, must have the same length as
            ``cfg.DATASETS.TEST``.

    Returns:
        dict: a dict of result metrics; keyed by dataset name if there are
            several test datasets, the metrics of the only dataset otherwise
    """
    logger = logging.getLogger(__name__)
    if isinstance(evaluators, DatasetEvaluator):
        evaluators = [evaluators]
    if evaluators is not None:
        assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format(
            len(cfg.DATASETS.TEST), len(evaluators)
        )

    results = OrderedDict()
    for idx, dataset_name in enumerate(cfg.DATASETS.TEST):
        data_loader = cls.build_test_loader(cfg, dataset_name)
        # When evaluators are passed in as arguments,
        # implicitly assume that evaluators can be created before data_loader.
        if evaluators is not None:
            evaluator = evaluators[idx]
        else:
            try:
                # the embedder (if the model has one) is handed to the evaluator
                embedder = cls.extract_embedder_from_model(model)
                evaluator = cls.build_evaluator(cfg, dataset_name, embedder=embedder)
            except NotImplementedError:
                # `Logger.warn` is a deprecated alias, use `warning`
                logger.warning(
                    "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, "
                    "or implement its `build_evaluator` method."
                )
                results[dataset_name] = {}
                continue
        # with non-distributed inference only the main process evaluates
        if cfg.DENSEPOSE_EVALUATION.DISTRIBUTED_INFERENCE or comm.is_main_process():
            results_i = inference_on_dataset(model, data_loader, evaluator)
        else:
            results_i = {}
        results[dataset_name] = results_i
        if comm.is_main_process():
            assert isinstance(
                results_i, dict
            ), "Evaluator must return a dict on the main process. Got {} instead.".format(
                results_i
            )
            logger.info("Evaluation results for {} in csv format:".format(dataset_name))
            print_csv_format(results_i)

    if len(results) == 1:
        results = list(results.values())[0]
    return results
LVIS bbox evaluator could also be used + # with an adapter to properly handle filtered / mapped categories + # evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type + # if evaluator_type == "coco": + # evaluators.append(COCOEvaluator(dataset_name, output_dir=output_folder)) + # elif evaluator_type == "lvis": + # evaluators.append(LVISEvaluator(dataset_name, output_dir=output_folder)) + evaluators.append( + Detectron2COCOEvaluatorAdapter( + dataset_name, output_dir=output_folder, distributed=distributed + ) + ) + if cfg.MODEL.DENSEPOSE_ON: + storage = build_densepose_evaluator_storage(cfg, output_folder) + evaluators.append( + DensePoseCOCOEvaluator( + dataset_name, + distributed, + output_folder, + evaluator_type=cfg.DENSEPOSE_EVALUATION.TYPE, + min_iou_threshold=cfg.DENSEPOSE_EVALUATION.MIN_IOU_THRESHOLD, + storage=storage, + embedder=embedder, + should_evaluate_mesh_alignment=cfg.DENSEPOSE_EVALUATION.EVALUATE_MESH_ALIGNMENT, + mesh_alignment_mesh_names=cfg.DENSEPOSE_EVALUATION.MESH_ALIGNMENT_MESH_NAMES, + ) + ) + return DatasetEvaluators(evaluators) + + @classmethod + def build_optimizer(cls, cfg: CfgNode, model: nn.Module): + params = get_default_optimizer_params( + model, + base_lr=cfg.SOLVER.BASE_LR, + weight_decay_norm=cfg.SOLVER.WEIGHT_DECAY_NORM, + bias_lr_factor=cfg.SOLVER.BIAS_LR_FACTOR, + weight_decay_bias=cfg.SOLVER.WEIGHT_DECAY_BIAS, + overrides={ + "features": { + "lr": cfg.SOLVER.BASE_LR * cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.FEATURES_LR_FACTOR, + }, + "embeddings": { + "lr": cfg.SOLVER.BASE_LR * cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDING_LR_FACTOR, + }, + }, + ) + optimizer = torch.optim.SGD( + params, + cfg.SOLVER.BASE_LR, + momentum=cfg.SOLVER.MOMENTUM, + nesterov=cfg.SOLVER.NESTEROV, + weight_decay=cfg.SOLVER.WEIGHT_DECAY, + ) + # pyre-fixme[6]: For 2nd param expected `Type[Optimizer]` but got `SGD`. 
+ return maybe_add_gradient_clipping(cfg, optimizer) + + @classmethod + def build_test_loader(cls, cfg: CfgNode, dataset_name): + return build_detection_test_loader(cfg, dataset_name, mapper=DatasetMapper(cfg, False)) + + @classmethod + def build_train_loader(cls, cfg: CfgNode): + data_loader = build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True)) + if not has_inference_based_loaders(cfg): + return data_loader + model = cls.build_model(cfg) + model.to(cfg.BOOTSTRAP_MODEL.DEVICE) + DetectionCheckpointer(model).resume_or_load(cfg.BOOTSTRAP_MODEL.WEIGHTS, resume=False) + inference_based_loaders, ratios = build_inference_based_loaders(cfg, model) + loaders = [data_loader] + inference_based_loaders + ratios = [1.0] + ratios + combined_data_loader = build_combined_loader(cfg, loaders, ratios) + sample_counting_loader = SampleCountingLoader(combined_data_loader) + return sample_counting_loader + + def build_writers(self): + writers = super().build_writers() + writers.append(SampleCountMetricPrinter()) + return writers + + @classmethod + def test_with_TTA(cls, cfg: CfgNode, model): + logger = logging.getLogger("detectron2.trainer") + # In the end of training, run an evaluation with TTA + # Only support some R-CNN models. 
+ logger.info("Running inference with test-time augmentation ...") + transform_data = load_from_cfg(cfg) + model = DensePoseGeneralizedRCNNWithTTA( + cfg, model, transform_data, DensePoseDatasetMapperTTA(cfg) + ) + evaluators = [ + cls.build_evaluator( + cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA") + ) + for name in cfg.DATASETS.TEST + ] + res = cls.test(cfg, model, evaluators) # pyre-ignore[6] + res = OrderedDict({k + "_TTA": v for k, v in res.items()}) + return res diff --git a/densepose/evaluation/__init__.py b/densepose/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e5ae1f20cdc822ebf3c870f1289a0ad210c57ae7 --- /dev/null +++ b/densepose/evaluation/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +from .evaluator import DensePoseCOCOEvaluator diff --git a/densepose/evaluation/d2_evaluator_adapter.py b/densepose/evaluation/d2_evaluator_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..1fbc526059a191f9414231c1b21ed3e8b7b58580 --- /dev/null +++ b/densepose/evaluation/d2_evaluator_adapter.py @@ -0,0 +1,50 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
def _maybe_add_iscrowd_annotations(cocoapi) -> None:
    """
    Make sure every annotation of a COCO API object has an "iscrowd" entry.

    Annotations in ``cocoapi.dataset["annotations"]`` that lack the key get
    ``iscrowd = 0``; existing values are kept. The annotations are modified
    in place.
    """
    for annotation in cocoapi.dataset["annotations"]:
        annotation.setdefault("iscrowd", 0)
class DensePoseDataMode(str, Enum):
    """
    Source of the I (part label), U and V values used during evaluation.

    "DT" stands for estimated (detection) data and "GT" for ground truth
    data; the "UV_0" modes replace the UV coordinates with zeros.
    Members are also `str` instances, so they compare equal to their
    string values (e.g. ``DensePoseDataMode.IUV_DT == "iuvdt"``).
    """

    # use estimated IUV data (default mode)
    IUV_DT = "iuvdt"
    # use ground truth IUV data
    IUV_GT = "iuvgt"
    # use ground truth labels I and set UV to 0
    I_GT_UV_0 = "igtuv0"
    # use ground truth labels I and estimated UV coordinates
    I_GT_UV_DT = "igtuvdt"
    # use estimated labels I and set UV to 0
    I_DT_UV_0 = "idtuv0"
# load dataset and results + # E = CocoEval(cocoGt,cocoDt); # initialize CocoEval object + # E.params.recThrs = ...; # set parameters as desired + # E.evaluate(); # run per image evaluation + # E.accumulate(); # accumulate per image results + # E.summarize(); # display summary metrics of results + # For example usage see evalDemo.m and http://mscoco.org/. + # + # The evaluation parameters are as follows (defaults in brackets): + # imgIds - [all] N img ids to use for evaluation + # catIds - [all] K cat ids to use for evaluation + # iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation + # recThrs - [0:.01:1] R=101 recall thresholds for evaluation + # areaRng - [...] A=4 object area ranges for evaluation + # maxDets - [1 10 100] M=3 thresholds on max detections per image + # iouType - ['segm'] set iouType to 'segm', 'bbox', 'keypoints' or 'densepose' + # iouType replaced the now DEPRECATED useSegm parameter. + # useCats - [1] if true use category labels for evaluation + # Note: if useCats=0 category labels are ignored as in proposal scoring. + # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified. 
+ # + # evaluate(): evaluates detections on every image and every category and + # concats the results into the "evalImgs" with fields: + # dtIds - [1xD] id for each of the D detections (dt) + # gtIds - [1xG] id for each of the G ground truths (gt) + # dtMatches - [TxD] matching gt id at each IoU or 0 + # gtMatches - [TxG] matching dt id at each IoU or 0 + # dtScores - [1xD] confidence of each dt + # gtIgnore - [1xG] ignore flag for each gt + # dtIgnore - [TxD] ignore flag for each dt at each IoU + # + # accumulate(): accumulates the per-image, per-category evaluation + # results in "evalImgs" into the dictionary "eval" with fields: + # params - parameters used for evaluation + # date - date evaluation was performed + # counts - [T,R,K,A,M] parameter dimensions (see above) + # precision - [TxRxKxAxM] precision for every evaluation setting + # recall - [TxKxAxM] max recall for every evaluation setting + # Note: precision and recall==-1 for settings with no gt objects. + # + # See also coco, mask, pycocoDemo, pycocoEvalDemo + # + # Microsoft COCO Toolbox. version 2.0 + # Data, paper, and tutorials available at: http://mscoco.org/ + # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
+ # Licensed under the Simplified BSD License [see coco/license.txt] + def __init__( + self, + cocoGt=None, + cocoDt=None, + iouType: str = "densepose", + multi_storage=None, + embedder=None, + dpEvalMode: DensePoseEvalMode = DensePoseEvalMode.GPS, + dpDataMode: DensePoseDataMode = DensePoseDataMode.IUV_DT, + ): + """ + Initialize CocoEval using coco APIs for gt and dt + :param cocoGt: coco object with ground truth annotations + :param cocoDt: coco object with detection results + :return: None + """ + self.cocoGt = cocoGt # ground truth COCO API + self.cocoDt = cocoDt # detections COCO API + self.multi_storage = multi_storage + self.embedder = embedder + self._dpEvalMode = dpEvalMode + self._dpDataMode = dpDataMode + self.evalImgs = defaultdict(list) # per-image per-category eval results [KxAxI] + self.eval = {} # accumulated evaluation results + self._gts = defaultdict(list) # gt for evaluation + self._dts = defaultdict(list) # dt for evaluation + self.params = Params(iouType=iouType) # parameters + self._paramsEval = {} # parameters for evaluation + self.stats = [] # result summarization + self.ious = {} # ious between all gts and dts + if cocoGt is not None: + self.params.imgIds = sorted(cocoGt.getImgIds()) + self.params.catIds = sorted(cocoGt.getCatIds()) + self.ignoreThrBB = 0.7 + self.ignoreThrUV = 0.9 + + def _loadGEval(self): + smpl_subdiv_fpath = PathManager.get_local_path( + "https://dl.fbaipublicfiles.com/densepose/data/SMPL_subdiv.mat" + ) + pdist_transform_fpath = PathManager.get_local_path( + "https://dl.fbaipublicfiles.com/densepose/data/SMPL_SUBDIV_TRANSFORM.mat" + ) + pdist_matrix_fpath = PathManager.get_local_path( + "https://dl.fbaipublicfiles.com/densepose/data/Pdist_matrix.pkl", timeout_sec=120 + ) + SMPL_subdiv = loadmat(smpl_subdiv_fpath) + self.PDIST_transform = loadmat(pdist_transform_fpath) + self.PDIST_transform = self.PDIST_transform["index"].squeeze() + UV = np.array([SMPL_subdiv["U_subdiv"], SMPL_subdiv["V_subdiv"]]).squeeze() + 
ClosestVertInds = np.arange(UV.shape[1]) + 1 + self.Part_UVs = [] + self.Part_ClosestVertInds = [] + for i in np.arange(24): + self.Part_UVs.append(UV[:, SMPL_subdiv["Part_ID_subdiv"].squeeze() == (i + 1)]) + self.Part_ClosestVertInds.append( + ClosestVertInds[SMPL_subdiv["Part_ID_subdiv"].squeeze() == (i + 1)] + ) + + with open(pdist_matrix_fpath, "rb") as hFile: + arrays = pickle.load(hFile, encoding="latin1") + self.Pdist_matrix = arrays["Pdist_matrix"] + self.Part_ids = np.array(SMPL_subdiv["Part_ID_subdiv"].squeeze()) + # Mean geodesic distances for parts. + self.Mean_Distances = np.array([0, 0.351, 0.107, 0.126, 0.237, 0.173, 0.142, 0.128, 0.150]) + # Coarse Part labels. + self.CoarseParts = np.array( + [0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8] + ) + + def _prepare(self): + """ + Prepare ._gts and ._dts for evaluation based on params + :return: None + """ + + def _toMask(anns, coco): + # modify ann['segmentation'] by reference + for ann in anns: + # safeguard for invalid segmentation annotation; + # annotations containing empty lists exist in the posetrack + # dataset. 
This is not a correct segmentation annotation + # in terms of COCO format; we need to deal with it somehow + segm = ann["segmentation"] + if type(segm) == list and len(segm) == 0: + ann["segmentation"] = None + continue + rle = coco.annToRLE(ann) + ann["segmentation"] = rle + + def _getIgnoreRegion(iid, coco): + img = coco.imgs[iid] + + if "ignore_regions_x" not in img.keys(): + return None + + if len(img["ignore_regions_x"]) == 0: + return None + + rgns_merged = [ + [v for xy in zip(region_x, region_y) for v in xy] + for region_x, region_y in zip(img["ignore_regions_x"], img["ignore_regions_y"]) + ] + rles = maskUtils.frPyObjects(rgns_merged, img["height"], img["width"]) + rle = maskUtils.merge(rles) + return maskUtils.decode(rle) + + def _checkIgnore(dt, iregion): + if iregion is None: + return True + + bb = np.array(dt["bbox"]).astype(int) + x1, y1, x2, y2 = bb[0], bb[1], bb[0] + bb[2], bb[1] + bb[3] + x2 = min([x2, iregion.shape[1]]) + y2 = min([y2, iregion.shape[0]]) + + if bb[2] * bb[3] == 0: + return False + + crop_iregion = iregion[y1:y2, x1:x2] + + if crop_iregion.sum() == 0: + return True + + if "densepose" not in dt.keys(): # filtering boxes + return crop_iregion.sum() / bb[2] / bb[3] < self.ignoreThrBB + + # filtering UVs + ignoremask = np.require(crop_iregion, requirements=["F"]) + mask = self._extract_mask(dt) + uvmask = np.require(np.asarray(mask > 0), dtype=np.uint8, requirements=["F"]) + uvmask_ = maskUtils.encode(uvmask) + ignoremask_ = maskUtils.encode(ignoremask) + uviou = maskUtils.iou([uvmask_], [ignoremask_], [1])[0] + return uviou < self.ignoreThrUV + + p = self.params + + if p.useCats: + gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) + dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) + else: + gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds)) + dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds)) + + imns = self.cocoGt.loadImgs(p.imgIds) + 
self.size_mapping = {} + for im in imns: + self.size_mapping[im["id"]] = [im["height"], im["width"]] + + # if iouType == 'uv', add point gt annotations + if p.iouType == "densepose": + self._loadGEval() + + # convert ground truth to mask if iouType == 'segm' + if p.iouType == "segm": + _toMask(gts, self.cocoGt) + _toMask(dts, self.cocoDt) + + # set ignore flag + for gt in gts: + gt["ignore"] = gt["ignore"] if "ignore" in gt else 0 + gt["ignore"] = "iscrowd" in gt and gt["iscrowd"] + if p.iouType == "keypoints": + gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"] + if p.iouType == "densepose": + gt["ignore"] = ("dp_x" in gt) == 0 + if p.iouType == "segm": + gt["ignore"] = gt["segmentation"] is None + + self._gts = defaultdict(list) # gt for evaluation + self._dts = defaultdict(list) # dt for evaluation + self._igrgns = defaultdict(list) + + for gt in gts: + iid = gt["image_id"] + if iid not in self._igrgns.keys(): + self._igrgns[iid] = _getIgnoreRegion(iid, self.cocoGt) + if _checkIgnore(gt, self._igrgns[iid]): + self._gts[iid, gt["category_id"]].append(gt) + for dt in dts: + iid = dt["image_id"] + if (iid not in self._igrgns) or _checkIgnore(dt, self._igrgns[iid]): + self._dts[iid, dt["category_id"]].append(dt) + + self.evalImgs = defaultdict(list) # per-image per-category evaluation results + self.eval = {} # accumulated evaluation results + + def evaluate(self): + """ + Run per image evaluation on given images and store results (a list of dict) in self.evalImgs + :return: None + """ + tic = time.time() + logger.info("Running per image DensePose evaluation... {}".format(self.params.iouType)) + p = self.params + # add backward compatibility if useSegm is specified in params + if p.useSegm is not None: + p.iouType = "segm" if p.useSegm == 1 else "bbox" + logger.info("useSegm (deprecated) is not None. 
Running DensePose evaluation") + p.imgIds = list(np.unique(p.imgIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params = p + + self._prepare() + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + if p.iouType in ["segm", "bbox"]: + computeIoU = self.computeIoU + elif p.iouType == "keypoints": + computeIoU = self.computeOks + elif p.iouType == "densepose": + computeIoU = self.computeOgps + if self._dpEvalMode in {DensePoseEvalMode.GPSM, DensePoseEvalMode.IOU}: + self.real_ious = { + (imgId, catId): self.computeDPIoU(imgId, catId) + for imgId in p.imgIds + for catId in catIds + } + + self.ious = { + (imgId, catId): computeIoU(imgId, catId) for imgId in p.imgIds for catId in catIds + } + + evaluateImg = self.evaluateImg + maxDet = p.maxDets[-1] + self.evalImgs = [ + evaluateImg(imgId, catId, areaRng, maxDet) + for catId in catIds + for areaRng in p.areaRng + for imgId in p.imgIds + ] + self._paramsEval = copy.deepcopy(self.params) + toc = time.time() + logger.info("DensePose evaluation DONE (t={:0.2f}s).".format(toc - tic)) + + def getDensePoseMask(self, polys): + maskGen = np.zeros([256, 256]) + stop = min(len(polys) + 1, 15) + for i in range(1, stop): + if polys[i - 1]: + currentMask = maskUtils.decode(polys[i - 1]) + maskGen[currentMask > 0] = i + return maskGen + + def _generate_rlemask_on_image(self, mask, imgId, data): + bbox_xywh = np.array(data["bbox"]) + x, y, w, h = bbox_xywh + im_h, im_w = self.size_mapping[imgId] + im_mask = np.zeros((im_h, im_w), dtype=np.uint8) + if mask is not None: + x0 = max(int(x), 0) + x1 = min(int(x + w), im_w, int(x) + mask.shape[1]) + y0 = max(int(y), 0) + y1 = min(int(y + h), im_h, int(y) + mask.shape[0]) + y = int(y) + x = int(x) + im_mask[y0:y1, x0:x1] = mask[y0 - y : y1 - y, x0 - x : x1 - x] + im_mask = np.require(np.asarray(im_mask > 0), dtype=np.uint8, requirements=["F"]) + rle_mask = maskUtils.encode(np.array(im_mask[:, 
:, np.newaxis], order="F"))[0] + return rle_mask + + def computeDPIoU(self, imgId, catId): + p = self.params + if p.useCats: + gt = self._gts[imgId, catId] + dt = self._dts[imgId, catId] + else: + gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]] + dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]] + if len(gt) == 0 and len(dt) == 0: + return [] + inds = np.argsort([-d["score"] for d in dt], kind="mergesort") + dt = [dt[i] for i in inds] + if len(dt) > p.maxDets[-1]: + dt = dt[0 : p.maxDets[-1]] + + gtmasks = [] + for g in gt: + if DensePoseDataRelative.S_KEY in g: + # convert DensePose mask to a binary mask + mask = np.minimum(self.getDensePoseMask(g[DensePoseDataRelative.S_KEY]), 1.0) + _, _, w, h = g["bbox"] + scale_x = float(max(w, 1)) / mask.shape[1] + scale_y = float(max(h, 1)) / mask.shape[0] + mask = spzoom(mask, (scale_y, scale_x), order=1, prefilter=False) + mask = np.array(mask > 0.5, dtype=np.uint8) + rle_mask = self._generate_rlemask_on_image(mask, imgId, g) + elif "segmentation" in g: + segmentation = g["segmentation"] + if isinstance(segmentation, list) and segmentation: + # polygons + im_h, im_w = self.size_mapping[imgId] + rles = maskUtils.frPyObjects(segmentation, im_h, im_w) + rle_mask = maskUtils.merge(rles) + elif isinstance(segmentation, dict): + if isinstance(segmentation["counts"], list): + # uncompressed RLE + im_h, im_w = self.size_mapping[imgId] + rle_mask = maskUtils.frPyObjects(segmentation, im_h, im_w) + else: + # compressed RLE + rle_mask = segmentation + else: + rle_mask = self._generate_rlemask_on_image(None, imgId, g) + else: + rle_mask = self._generate_rlemask_on_image(None, imgId, g) + gtmasks.append(rle_mask) + + dtmasks = [] + for d in dt: + mask = self._extract_mask(d) + mask = np.require(np.asarray(mask > 0), dtype=np.uint8, requirements=["F"]) + rle_mask = self._generate_rlemask_on_image(mask, imgId, d) + dtmasks.append(rle_mask) + + # compute iou between each dt and gt region + iscrowd = 
[int(o.get("iscrowd", 0)) for o in gt] + iousDP = maskUtils.iou(dtmasks, gtmasks, iscrowd) + return iousDP + + def computeIoU(self, imgId, catId): + p = self.params + if p.useCats: + gt = self._gts[imgId, catId] + dt = self._dts[imgId, catId] + else: + gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]] + dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]] + if len(gt) == 0 and len(dt) == 0: + return [] + inds = np.argsort([-d["score"] for d in dt], kind="mergesort") + dt = [dt[i] for i in inds] + if len(dt) > p.maxDets[-1]: + dt = dt[0 : p.maxDets[-1]] + + if p.iouType == "segm": + g = [g["segmentation"] for g in gt if g["segmentation"] is not None] + d = [d["segmentation"] for d in dt if d["segmentation"] is not None] + elif p.iouType == "bbox": + g = [g["bbox"] for g in gt] + d = [d["bbox"] for d in dt] + else: + raise Exception("unknown iouType for iou computation") + + # compute iou between each dt and gt region + iscrowd = [int(o.get("iscrowd", 0)) for o in gt] + ious = maskUtils.iou(d, g, iscrowd) + return ious + + def computeOks(self, imgId, catId): + p = self.params + # dimension here should be Nxm + gts = self._gts[imgId, catId] + dts = self._dts[imgId, catId] + inds = np.argsort([-d["score"] for d in dts], kind="mergesort") + dts = [dts[i] for i in inds] + if len(dts) > p.maxDets[-1]: + dts = dts[0 : p.maxDets[-1]] + # if len(gts) == 0 and len(dts) == 0: + if len(gts) == 0 or len(dts) == 0: + return [] + ious = np.zeros((len(dts), len(gts))) + sigmas = ( + np.array( + [ + 0.26, + 0.25, + 0.25, + 0.35, + 0.35, + 0.79, + 0.79, + 0.72, + 0.72, + 0.62, + 0.62, + 1.07, + 1.07, + 0.87, + 0.87, + 0.89, + 0.89, + ] + ) + / 10.0 + ) + vars = (sigmas * 2) ** 2 + k = len(sigmas) + # compute oks between each detection and ground truth object + for j, gt in enumerate(gts): + # create bounds for ignore regions(double the gt bbox) + g = np.array(gt["keypoints"]) + xg = g[0::3] + yg = g[1::3] + vg = g[2::3] + k1 = np.count_nonzero(vg > 0) + bb = 
gt["bbox"] + x0 = bb[0] - bb[2] + x1 = bb[0] + bb[2] * 2 + y0 = bb[1] - bb[3] + y1 = bb[1] + bb[3] * 2 + for i, dt in enumerate(dts): + d = np.array(dt["keypoints"]) + xd = d[0::3] + yd = d[1::3] + if k1 > 0: + # measure the per-keypoint distance if keypoints visible + dx = xd - xg + dy = yd - yg + else: + # measure minimum distance to keypoints in (x0,y0) & (x1,y1) + z = np.zeros(k) + dx = np.max((z, x0 - xd), axis=0) + np.max((z, xd - x1), axis=0) + dy = np.max((z, y0 - yd), axis=0) + np.max((z, yd - y1), axis=0) + e = (dx**2 + dy**2) / vars / (gt["area"] + np.spacing(1)) / 2 + if k1 > 0: + e = e[vg > 0] + ious[i, j] = np.sum(np.exp(-e)) / e.shape[0] + return ious + + def _extract_mask(self, dt: Dict[str, Any]) -> np.ndarray: + if "densepose" in dt: + densepose_results_quantized = dt["densepose"] + return densepose_results_quantized.labels_uv_uint8[0].numpy() + elif "cse_mask" in dt: + return dt["cse_mask"] + elif "coarse_segm" in dt: + dy = max(int(dt["bbox"][3]), 1) + dx = max(int(dt["bbox"][2]), 1) + return ( + F.interpolate( + dt["coarse_segm"].unsqueeze(0), + (dy, dx), + mode="bilinear", + align_corners=False, + ) + .squeeze(0) + .argmax(0) + .numpy() + .astype(np.uint8) + ) + elif "record_id" in dt: + assert ( + self.multi_storage is not None + ), f"Storage record id encountered in a detection {dt}, but no storage provided!" 
+ record = self.multi_storage.get(dt["rank"], dt["record_id"]) + coarse_segm = record["coarse_segm"] + dy = max(int(dt["bbox"][3]), 1) + dx = max(int(dt["bbox"][2]), 1) + return ( + F.interpolate( + coarse_segm.unsqueeze(0), + (dy, dx), + mode="bilinear", + align_corners=False, + ) + .squeeze(0) + .argmax(0) + .numpy() + .astype(np.uint8) + ) + else: + raise Exception(f"No mask data in the detection: {dt}") + raise ValueError('The prediction dict needs to contain either "densepose" or "cse_mask"') + + def _extract_iuv( + self, densepose_data: np.ndarray, py: np.ndarray, px: np.ndarray, gt: Dict[str, Any] + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Extract arrays of I, U and V values at given points as numpy arrays + given the data mode stored in self._dpDataMode + """ + if self._dpDataMode == DensePoseDataMode.IUV_DT: + # estimated labels and UV (default) + ipoints = densepose_data[0, py, px] + upoints = densepose_data[1, py, px] / 255.0 # convert from uint8 by /255. + vpoints = densepose_data[2, py, px] / 255.0 + elif self._dpDataMode == DensePoseDataMode.IUV_GT: + # ground truth + ipoints = np.array(gt["dp_I"]) + upoints = np.array(gt["dp_U"]) + vpoints = np.array(gt["dp_V"]) + elif self._dpDataMode == DensePoseDataMode.I_GT_UV_0: + # ground truth labels, UV = 0 + ipoints = np.array(gt["dp_I"]) + upoints = upoints * 0.0 + vpoints = vpoints * 0.0 + elif self._dpDataMode == DensePoseDataMode.I_GT_UV_DT: + # ground truth labels, estimated UV + ipoints = np.array(gt["dp_I"]) + upoints = densepose_data[1, py, px] / 255.0 # convert from uint8 by /255. 
+ vpoints = densepose_data[2, py, px] / 255.0 + elif self._dpDataMode == DensePoseDataMode.I_DT_UV_0: + # estimated labels, UV = 0 + ipoints = densepose_data[0, py, px] + upoints = upoints * 0.0 + vpoints = vpoints * 0.0 + else: + raise ValueError(f"Unknown data mode: {self._dpDataMode}") + return ipoints, upoints, vpoints + + def computeOgps_single_pair(self, dt, gt, py, px, pt_mask): + if "densepose" in dt: + ipoints, upoints, vpoints = self.extract_iuv_from_quantized(dt, gt, py, px, pt_mask) + return self.computeOgps_single_pair_iuv(dt, gt, ipoints, upoints, vpoints) + elif "u" in dt: + ipoints, upoints, vpoints = self.extract_iuv_from_raw(dt, gt, py, px, pt_mask) + return self.computeOgps_single_pair_iuv(dt, gt, ipoints, upoints, vpoints) + elif "record_id" in dt: + assert ( + self.multi_storage is not None + ), f"Storage record id encountered in detection {dt}, but no storage provided!" + record = self.multi_storage.get(dt["rank"], dt["record_id"]) + record["bbox"] = dt["bbox"] + if "u" in record: + ipoints, upoints, vpoints = self.extract_iuv_from_raw(record, gt, py, px, pt_mask) + return self.computeOgps_single_pair_iuv(dt, gt, ipoints, upoints, vpoints) + elif "embedding" in record: + return self.computeOgps_single_pair_cse( + dt, + gt, + py, + px, + pt_mask, + record["coarse_segm"], + record["embedding"], + record["bbox"], + ) + else: + raise Exception(f"Unknown record format: {record}") + elif "embedding" in dt: + return self.computeOgps_single_pair_cse( + dt, gt, py, px, pt_mask, dt["coarse_segm"], dt["embedding"], dt["bbox"] + ) + raise Exception(f"Unknown detection format: {dt}") + + def extract_iuv_from_quantized(self, dt, gt, py, px, pt_mask): + densepose_results_quantized = dt["densepose"] + ipoints, upoints, vpoints = self._extract_iuv( + densepose_results_quantized.labels_uv_uint8.numpy(), py, px, gt + ) + ipoints[pt_mask == -1] = 0 + return ipoints, upoints, vpoints + + def extract_iuv_from_raw(self, dt, gt, py, px, pt_mask): + labels_dt = 
resample_fine_and_coarse_segm_tensors_to_bbox( + dt["fine_segm"].unsqueeze(0), + dt["coarse_segm"].unsqueeze(0), + dt["bbox"], + ) + uv = resample_uv_tensors_to_bbox( + dt["u"].unsqueeze(0), dt["v"].unsqueeze(0), labels_dt.squeeze(0), dt["bbox"] + ) + labels_uv_uint8 = torch.cat((labels_dt.byte(), (uv * 255).clamp(0, 255).byte())) + ipoints, upoints, vpoints = self._extract_iuv(labels_uv_uint8.numpy(), py, px, gt) + ipoints[pt_mask == -1] = 0 + return ipoints, upoints, vpoints + + def computeOgps_single_pair_iuv(self, dt, gt, ipoints, upoints, vpoints): + cVertsGT, ClosestVertsGTTransformed = self.findAllClosestVertsGT(gt) + cVerts = self.findAllClosestVertsUV(upoints, vpoints, ipoints) + # Get pairwise geodesic distances between gt and estimated mesh points. + dist = self.getDistancesUV(ClosestVertsGTTransformed, cVerts) + # Compute the Ogps measure. + # Find the mean geodesic normalization distance for + # each GT point, based on which part it is on. + Current_Mean_Distances = self.Mean_Distances[ + self.CoarseParts[self.Part_ids[cVertsGT[cVertsGT > 0].astype(int) - 1]] + ] + return dist, Current_Mean_Distances + + def computeOgps_single_pair_cse( + self, dt, gt, py, px, pt_mask, coarse_segm, embedding, bbox_xywh_abs + ): + # 0-based mesh vertex indices + cVertsGT = torch.as_tensor(gt["dp_vertex"], dtype=torch.int64) + # label for each pixel of the bbox, [H, W] tensor of long + labels_dt = resample_coarse_segm_tensor_to_bbox( + coarse_segm.unsqueeze(0), bbox_xywh_abs + ).squeeze(0) + x, y, w, h = bbox_xywh_abs + # embedding for each pixel of the bbox, [D, H, W] tensor of float32 + embedding = F.interpolate( + embedding.unsqueeze(0), (int(h), int(w)), mode="bilinear", align_corners=False + ).squeeze(0) + # valid locations py, px + py_pt = torch.from_numpy(py[pt_mask > -1]) + px_pt = torch.from_numpy(px[pt_mask > -1]) + cVerts = torch.ones_like(cVertsGT) * -1 + cVerts[pt_mask > -1] = self.findClosestVertsCse( + embedding, py_pt, px_pt, labels_dt, gt["ref_model"] + 
def computeOgps(self, imgId, catId):
    """
    Compute the GPS-based similarity between every detection and every
    ground truth stored for one image / category pair.

    Detections are ordered by descending score (stable mergesort) and cut
    to the largest value in `params.maxDets` before they are scored.

    :param imgId: image id used to index `self._gts` / `self._dts`
    :param catId: category id used to index `self._gts` / `self._dts`
    :return: an empty list when there are no ground truths or no
        detections; otherwise a tuple `(ious, ious_bb)` where `ious` is a
        `len(dts) x len(gts)` array holding the mean GPS value per pair
        (columns of ignored ground truths stay 0) and `ious_bb` is the
        value returned by `maskUtils.iou` for the corresponding boxes
    """
    p = self.params
    # dimension here should be Nxm
    g = self._gts[imgId, catId]
    d = self._dts[imgId, catId]
    # stable sort so that equally scored detections keep their input order
    inds = np.argsort([-d_["score"] for d_ in d], kind="mergesort")
    d = [d[i] for i in inds]
    if len(d) > p.maxDets[-1]:
        d = d[0 : p.maxDets[-1]]
    # if len(gts) == 0 and len(dts) == 0:
    if len(g) == 0 or len(d) == 0:
        return []
    ious = np.zeros((len(d), len(g)))
    # compute opgs between each detection and ground truth object
    # sigma = self.sigma #0.255 # dist = 0.3m corresponds to ogps = 0.5
    # 1 # dist = 0.3m corresponds to ogps = 0.96
    # 1.45 # dist = 1.7m (person height) corresponds to ogps = 0.5)
    for j, gt in enumerate(g):
        if not gt["ignore"]:
            g_ = gt["bbox"]
            for i, dt in enumerate(d):
                # size of the detection box (presumably bbox is [x, y, w, h]
                # -- TODO confirm against the annotation format)
                dy = int(dt["bbox"][3])
                dx = int(dt["bbox"][2])
                # annotated points are stored on a 0..255 grid relative to
                # the GT box; rescale them by the GT box size
                dp_x = np.array(gt["dp_x"]) * g_[2] / 255.0
                dp_y = np.array(gt["dp_y"]) * g_[3] / 255.0
                # shift the points into the coordinate frame of the detection box
                py = (dp_y + g_[1] - dt["bbox"][1]).astype(int)
                px = (dp_x + g_[0] - dt["bbox"][0]).astype(int)
                # pts marks with -1 every point that falls outside the detection box
                pts = np.zeros(len(px))
                pts[px >= dx] = -1
                pts[py >= dy] = -1
                pts[px < 0] = -1
                pts[py < 0] = -1
                if len(pts) < 1:
                    # no annotated points at all
                    ogps = 0.0
                elif np.max(pts) == -1:
                    # every annotated point lies outside the detection box
                    ogps = 0.0
                else:
                    # clamp outside points to a valid index; they stay
                    # flagged through `pts`
                    px[pts == -1] = 0
                    py[pts == -1] = 0
                    dists_between_matches, dist_norm_coeffs = self.computeOgps_single_pair(
                        dt, gt, py, px, pts
                    )
                    # Compute gps
                    ogps_values = np.exp(
                        -(dists_between_matches**2) / (2 * (dist_norm_coeffs**2))
                    )
                    # average over the points, 0 when nothing was returned
                    ogps = np.mean(ogps_values) if len(ogps_values) > 0 else 0.0
                ious[i, j] = ogps

    gbb = [gt["bbox"] for gt in g]
    dbb = [dt["bbox"] for dt in d]

    # compute iou between each dt and gt region
    iscrowd = [int(o.get("iscrowd", 0)) for o in g]
    ious_bb = maskUtils.iou(dbb, gbb, iscrowd)
    return ious, ious_bb
np.array([g["_ignore"] for g in gt]) + dtIg = np.zeros((T, D)) + if np.all(gtIg) and p.iouType == "densepose": + dtIg = np.logical_or(dtIg, True) + + if len(ious) > 0: # and not p.iouType == 'densepose': + for tind, t in enumerate(p.iouThrs): + for dind, d in enumerate(dt): + # information about best match so far (m=-1 -> unmatched) + iou = min([t, 1 - 1e-10]) + m = -1 + for gind, _g in enumerate(gt): + # if this gt already matched, and not a crowd, continue + if gtm[tind, gind] > 0 and not iscrowd[gind]: + continue + # if dt matched to reg gt, and on ignore gt, stop + if m > -1 and gtIg[m] == 0 and gtIg[gind] == 1: + break + if p.iouType == "densepose": + if self._dpEvalMode == DensePoseEvalMode.GPSM: + new_iou = np.sqrt(iousM[dind, gind] * ious[dind, gind]) + elif self._dpEvalMode == DensePoseEvalMode.IOU: + new_iou = iousM[dind, gind] + elif self._dpEvalMode == DensePoseEvalMode.GPS: + new_iou = ious[dind, gind] + else: + new_iou = ious[dind, gind] + if new_iou < iou: + continue + if new_iou == 0.0: + continue + # if match successful and best so far, store appropriately + iou = new_iou + m = gind + # if match made store id of match for both dt and gt + if m == -1: + continue + dtIg[tind, dind] = gtIg[m] + dtm[tind, dind] = gt[m]["id"] + gtm[tind, m] = d["id"] + + if p.iouType == "densepose": + if not len(ioubs) == 0: + for dind, d in enumerate(dt): + # information about best match so far (m=-1 -> unmatched) + if dtm[tind, dind] == 0: + ioub = 0.8 + m = -1 + for gind, _g in enumerate(gt): + # if this gt already matched, and not a crowd, continue + if gtm[tind, gind] > 0 and not iscrowd[gind]: + continue + # continue to next gt unless better match made + if ioubs[dind, gind] < ioub: + continue + # if match successful and best so far, store appropriately + ioub = ioubs[dind, gind] + m = gind + # if match made store id of match for both dt and gt + if m > -1: + dtIg[:, dind] = gtIg[m] + if gtIg[m]: + dtm[tind, dind] = gt[m]["id"] + gtm[tind, m] = d["id"] + # set 
def accumulate(self, p=None):
    """
    Accumulate per image evaluation results and store the result in self.eval

    The result dictionary holds `precision` with shape [T, R, K, A, M] and
    `recall` with shape [T, K, A, M], where T, R, K, A, M are the numbers of
    IoU thresholds, recall thresholds, categories, area ranges and
    max-detection settings. Entries that were never evaluated stay at -1.

    :param p: input params for evaluation; `self.params` is used when None.
        Note that `p.catIds` is overwritten with `[-1]` when categories are
        not used.
    :return: None
    """
    logger.info("Accumulating evaluation results...")
    tic = time.time()
    if not self.evalImgs:
        logger.info("Please run evaluate() first")
    # allows input customized parameters
    if p is None:
        p = self.params
    p.catIds = p.catIds if p.useCats == 1 else [-1]
    T = len(p.iouThrs)
    R = len(p.recThrs)
    K = len(p.catIds) if p.useCats else 1
    A = len(p.areaRng)
    M = len(p.maxDets)
    precision = -(np.ones((T, R, K, A, M)))  # -1 for the precision of absent categories
    recall = -(np.ones((T, K, A, M)))

    # create dictionary for future indexing
    logger.info("Categories: {}".format(p.catIds))
    _pe = self._paramsEval
    catIds = _pe.catIds if _pe.useCats else [-1]
    setK = set(catIds)
    setA = set(map(tuple, _pe.areaRng))
    setM = set(_pe.maxDets)
    setI = set(_pe.imgIds)
    # get inds to evaluate
    k_list = [n for n, k in enumerate(p.catIds) if k in setK]
    m_list = [m for m in p.maxDets if m in setM]
    a_list = [n for n, a in enumerate(map(tuple, p.areaRng)) if a in setA]
    i_list = [n for n, i in enumerate(p.imgIds) if i in setI]
    I0 = len(_pe.imgIds)
    A0 = len(_pe.areaRng)
    # retrieve E at each category, area range, and max number of detections
    for k, k0 in enumerate(k_list):
        Nk = k0 * A0 * I0
        for a, a0 in enumerate(a_list):
            Na = a0 * I0
            for m, maxDet in enumerate(m_list):
                E = [self.evalImgs[Nk + Na + i] for i in i_list]
                E = [e for e in E if e is not None]
                if len(E) == 0:
                    continue
                dtScores = np.concatenate([e["dtScores"][0:maxDet] for e in E])

                # different sorting method generates slightly different results.
                # mergesort is used to be consistent as Matlab implementation.
                inds = np.argsort(-dtScores, kind="mergesort")

                dtm = np.concatenate([e["dtMatches"][:, 0:maxDet] for e in E], axis=1)[:, inds]
                dtIg = np.concatenate([e["dtIgnore"][:, 0:maxDet] for e in E], axis=1)[:, inds]
                gtIg = np.concatenate([e["gtIgnore"] for e in E])
                npig = np.count_nonzero(gtIg == 0)
                if npig == 0:
                    continue
                tps = np.logical_and(dtm, np.logical_not(dtIg))
                fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg))
                tp_sum = np.cumsum(tps, axis=1).astype(dtype=float)
                fp_sum = np.cumsum(fps, axis=1).astype(dtype=float)
                for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
                    tp = np.array(tp)
                    fp = np.array(fp)
                    nd = len(tp)
                    rc = tp / npig
                    pr = tp / (fp + tp + np.spacing(1))
                    q = np.zeros((R,))

                    if nd:
                        recall[t, k, a, m] = rc[-1]
                    else:
                        recall[t, k, a, m] = 0

                    # numpy is slow without cython optimization for accessing elements
                    # use python array gets significant speed improvement
                    pr = pr.tolist()
                    q = q.tolist()

                    # make the precision curve monotonically non-increasing
                    for i in range(nd - 1, 0, -1):
                        if pr[i] > pr[i - 1]:
                            pr[i - 1] = pr[i]

                    inds = np.searchsorted(rc, p.recThrs, side="left")
                    # searchsorted yields nd for recall thresholds that are
                    # never reached; those entries keep precision 0. The
                    # explicit bound check replaces a blanket
                    # `except Exception: pass` that would also have hidden
                    # unrelated errors.
                    for ri, pi in enumerate(inds):
                        if pi >= nd:
                            break
                        q[ri] = pr[pi]
                    precision[t, :, k, a, m] = np.array(q)
    logger.info(
        "Final: max precision {}, min precision {}".format(np.max(precision), np.min(precision))
    )
    self.eval = {
        "params": p,
        "counts": [T, R, K, A, M],
        "date": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "precision": precision,
        "recall": recall,
    }
    toc = time.time()
    logger.info("DONE (t={:0.2f}s).".format(toc - tic))
stats[7] = _summarize(0, maxDets=self.params.maxDets[1]) + stats[8] = _summarize(0, maxDets=self.params.maxDets[2]) + stats[9] = _summarize(0, areaRng="small", maxDets=self.params.maxDets[2]) + stats[10] = _summarize(0, areaRng="medium", maxDets=self.params.maxDets[2]) + stats[11] = _summarize(0, areaRng="large", maxDets=self.params.maxDets[2]) + return stats + + def _summarizeKps(): + stats = np.zeros((10,)) + stats[0] = _summarize(1, maxDets=20) + stats[1] = _summarize(1, maxDets=20, iouThr=0.5) + stats[2] = _summarize(1, maxDets=20, iouThr=0.75) + stats[3] = _summarize(1, maxDets=20, areaRng="medium") + stats[4] = _summarize(1, maxDets=20, areaRng="large") + stats[5] = _summarize(0, maxDets=20) + stats[6] = _summarize(0, maxDets=20, iouThr=0.5) + stats[7] = _summarize(0, maxDets=20, iouThr=0.75) + stats[8] = _summarize(0, maxDets=20, areaRng="medium") + stats[9] = _summarize(0, maxDets=20, areaRng="large") + return stats + + def _summarizeUvs(): + stats = [_summarize(1, maxDets=self.params.maxDets[0])] + min_threshold = self.params.iouThrs.min() + if min_threshold <= 0.201: + stats += [_summarize(1, maxDets=self.params.maxDets[0], iouThr=0.2)] + if min_threshold <= 0.301: + stats += [_summarize(1, maxDets=self.params.maxDets[0], iouThr=0.3)] + if min_threshold <= 0.401: + stats += [_summarize(1, maxDets=self.params.maxDets[0], iouThr=0.4)] + stats += [ + _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.5), + _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.75), + _summarize(1, maxDets=self.params.maxDets[0], areaRng="medium"), + _summarize(1, maxDets=self.params.maxDets[0], areaRng="large"), + _summarize(0, maxDets=self.params.maxDets[0]), + _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.5), + _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.75), + _summarize(0, maxDets=self.params.maxDets[0], areaRng="medium"), + _summarize(0, maxDets=self.params.maxDets[0], areaRng="large"), + ] + return np.array(stats) + + def _summarizeUvsOld(): + 
def __str__(self):
    """
    Run `summarize()` and return the resulting statistics as a string.

    `__str__` has to return a `str`; returning None (as a bare call to
    `summarize()` did) makes `str(obj)` / `print(obj)` raise TypeError.

    :return: string form of `self.stats`, which `summarize()` populates
    """
    self.summarize()
    return str(self.stats)
def findClosestVertsCse(self, embedding, py, px, mask, mesh_name):
    """
    Find, for each queried pixel, the mesh vertex whose embedding is closest
    to the pixel embedding.

    :param embedding: per-pixel embeddings; indexed as `embedding[:, py, px]`,
        i.e. channels first followed by two spatial axes
    :param py: row indices of the queried pixels
    :param px: column indices of the queried pixels
    :param mask: per-pixel labels indexed as `mask[py, px]`; pixels with a
        value <= 0 get no vertex
    :param mesh_name: mesh name passed to `self.embedder` to obtain the
        vertex embeddings
    :return: CPU tensor with one vertex index per queried pixel, -1 where
        the mask value is <= 0
    """
    mesh_vertex_embeddings = self.embedder(mesh_name)
    # Follow the device of the mesh embeddings instead of hard-coding "cuda":
    # identical when the embedder lives on the GPU, and it keeps CPU-only
    # evaluation working.
    pixel_embeddings = embedding[:, py, px].t().to(device=mesh_vertex_embeddings.device)
    mask_vals = mask[py, px]
    edm = squared_euclidean_distance_matrix(pixel_embeddings, mesh_vertex_embeddings)
    vertex_indices = edm.argmin(dim=1).cpu()
    vertex_indices[mask_vals <= 0] = -1
    return vertex_indices
class Params:
    """
    Params for coco evaluation api

    The attributes set depend on `iouType`:
      - "segm" / "bbox": `setDetParams`
      - "keypoints":     `setKpParams`
      - "densepose":     `setUvParams`
    """

    def setDetParams(self):
        """Set the defaults used for "segm" and "bbox" evaluation."""
        self.imgIds = []
        self.catIds = []
        # np.arange causes trouble. the data point on arange is slightly larger than the true value
        self.iouThrs = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True)
        self.recThrs = np.linspace(0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01)) + 1, endpoint=True)
        self.maxDets = [1, 10, 100]
        self.areaRng = [
            [0**2, 1e5**2],
            [0**2, 32**2],
            [32**2, 96**2],
            [96**2, 1e5**2],
        ]
        self.areaRngLbl = ["all", "small", "medium", "large"]
        self.useCats = 1

    def setKpParams(self):
        """Set the defaults used for "keypoints" evaluation."""
        self.imgIds = []
        self.catIds = []
        # np.arange causes trouble. the data point on arange is slightly larger than the true value
        # `num` has to be an integer: np.round returns a float, which
        # np.linspace rejects with a TypeError, hence the int(...) cast
        # (same as in setDetParams / setUvParams).
        self.iouThrs = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True)
        self.recThrs = np.linspace(0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01)) + 1, endpoint=True)
        self.maxDets = [20]
        self.areaRng = [[0**2, 1e5**2], [32**2, 96**2], [96**2, 1e5**2]]
        self.areaRngLbl = ["all", "medium", "large"]
        self.useCats = 1

    def setUvParams(self):
        """Set the defaults used for "densepose" evaluation."""
        self.imgIds = []
        self.catIds = []
        self.iouThrs = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True)
        self.recThrs = np.linspace(0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01)) + 1, endpoint=True)
        self.maxDets = [20]
        self.areaRng = [[0**2, 1e5**2], [32**2, 96**2], [96**2, 1e5**2]]
        self.areaRngLbl = ["all", "medium", "large"]
        self.useCats = 1

    def __init__(self, iouType="segm"):
        """
        :param iouType: one of "segm", "bbox", "keypoints", "densepose"
        :raises Exception: if `iouType` is none of the supported values
        """
        if iouType == "segm" or iouType == "bbox":
            self.setDetParams()
        elif iouType == "keypoints":
            self.setKpParams()
        elif iouType == "densepose":
            self.setUvParams()
        else:
            raise Exception("iouType not supported")
        self.iouType = iouType
        # useSegm is deprecated
        self.useSegm = None
class DensePoseCOCOEvaluator(DatasetEvaluator):
    """
    Dataset evaluator that collects DensePose predictions and scores them
    against COCO-style ground truth (GPS, GPSM and segmentation IOU metrics,
    plus optional mesh alignment metrics).
    """

    def __init__(
        self,
        dataset_name,
        distributed,
        output_dir=None,
        evaluator_type: str = "iuv",
        min_iou_threshold: float = 0.5,
        storage: Optional[SingleProcessTensorStorage] = None,
        embedder=None,
        should_evaluate_mesh_alignment: bool = False,
        mesh_alignment_mesh_names: Optional[List[str]] = None,
    ):
        """
        Args:
            dataset_name: name used to look up the dataset metadata
                (its `json_file` is loaded through the COCO API)
            distributed: if True, predictions are gathered from all
                processes in `evaluate`
            output_dir: optional directory where the raw predictions are
                saved as "coco_densepose_predictions.pth"
            evaluator_type: stored on the instance; not read elsewhere in
                this class
            min_iou_threshold: lowest IOU threshold passed to the evaluation
            storage: optional tensor storage; when given, the fields of its
                data schema are moved from the prediction dicts into it
            embedder: vertex embedder, required for mesh alignment evaluation
            should_evaluate_mesh_alignment: whether to also compute mesh
                alignment metrics
            mesh_alignment_mesh_names: meshes to compute mesh alignment for
        """
        self._embedder = embedder
        self._distributed = distributed
        self._output_dir = output_dir
        self._evaluator_type = evaluator_type
        self._storage = storage
        self._should_evaluate_mesh_alignment = should_evaluate_mesh_alignment
        # Start with an empty buffer so that process() / evaluate() also work
        # when reset() has not been called yet.
        self._predictions = []

        assert not (
            should_evaluate_mesh_alignment and embedder is None
        ), "Mesh alignment evaluation is activated, but no vertex embedder provided!"
        if should_evaluate_mesh_alignment:
            self._mesh_alignment_evaluator = MeshAlignmentEvaluator(
                embedder,
                mesh_alignment_mesh_names,
            )

        self._cpu_device = torch.device("cpu")
        self._logger = logging.getLogger(__name__)

        self._metadata = MetadataCatalog.get(dataset_name)
        self._min_threshold = min_iou_threshold
        json_file = PathManager.get_local_path(self._metadata.json_file)
        # the COCO constructor output is captured so it does not reach stdout
        with contextlib.redirect_stdout(io.StringIO()):
            self._coco_api = COCO(json_file)
        maybe_filter_and_map_categories_cocoapi(dataset_name, self._coco_api)

    def reset(self):
        """Drop all predictions collected so far."""
        self._predictions = []

    def process(self, inputs, outputs):
        """
        Args:
            inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
                It is a list of dict. Each dict corresponds to an image and
                contains keys like "height", "width", "file_name", "image_id".
            outputs: the outputs of a COCO model. It is a list of dicts with key
                "instances" that contains :class:`Instances`.
                The :class:`Instances` object needs to have `densepose` field.
        """
        for image_input, output in zip(inputs, outputs):
            instances = output["instances"].to(self._cpu_device)
            if not instances.has("pred_densepose"):
                continue
            prediction_list = prediction_to_dict(
                instances,
                image_input["image_id"],
                self._embedder,
                self._metadata.class_to_mesh_name,
                self._storage is not None,
            )
            if self._storage is not None:
                # Move the schema fields into the storage and keep only a
                # (record_id, rank) reference in the prediction dict.
                for prediction_dict in prediction_list:
                    dict_to_store = {}
                    for field_name in self._storage.data_schema:
                        dict_to_store[field_name] = prediction_dict[field_name]
                    record_id = self._storage.put(dict_to_store)
                    prediction_dict["record_id"] = record_id
                    prediction_dict["rank"] = get_rank()
                    for field_name in self._storage.data_schema:
                        del prediction_dict[field_name]
            self._predictions.extend(prediction_list)

    def evaluate(self, img_ids=None):
        """
        Gather the collected predictions and evaluate them.

        Args:
            img_ids: optional image ids to restrict the evaluation to

        Returns:
            a deep copy of the results dict on the main process,
            None on every other process
        """
        if self._distributed:
            synchronize()
            predictions = gather(self._predictions)
            predictions = list(itertools.chain(*predictions))
        else:
            predictions = self._predictions

        multi_storage = storage_gather(self._storage) if self._storage is not None else None

        if not is_main_process():
            return
        return copy.deepcopy(self._eval_predictions(predictions, multi_storage, img_ids))

    def _eval_predictions(self, predictions, multi_storage=None, img_ids=None):
        """
        Evaluate predictions on densepose.
        Return results with the metrics of the tasks.
        """
        self._logger.info("Preparing results for COCO format ...")

        if self._output_dir:
            PathManager.mkdirs(self._output_dir)
            file_path = os.path.join(self._output_dir, "coco_densepose_predictions.pth")
            with PathManager.open(file_path, "wb") as f:
                torch.save(predictions, f)

        self._logger.info("Evaluating predictions ...")
        res = OrderedDict()
        results_gps, results_gpsm, results_segm = _evaluate_predictions_on_coco(
            self._coco_api,
            predictions,
            multi_storage,
            self._embedder,
            class_names=self._metadata.get("thing_classes"),
            min_threshold=self._min_threshold,
            img_ids=img_ids,
        )
        res["densepose_gps"] = results_gps
        res["densepose_gpsm"] = results_gpsm
        res["densepose_segm"] = results_segm
        if self._should_evaluate_mesh_alignment:
            res["densepose_mesh_alignment"] = self._evaluate_mesh_alignment()
        return res

    def _evaluate_mesh_alignment(self):
        """
        Run the mesh alignment evaluator and return its metrics scaled by 100:
        the means under "GE" / "GPS" and the per-mesh values under
        "<metric name>-<mesh name>".
        """
        self._logger.info("Mesh alignment evaluation ...")
        mean_ge, mean_gps, per_mesh_metrics = self._mesh_alignment_evaluator.evaluate()
        results = {
            "GE": mean_ge * 100,
            "GPS": mean_gps * 100,
        }
        mesh_names = set()
        for metric_name in per_mesh_metrics:
            for mesh_name, value in per_mesh_metrics[metric_name].items():
                results[f"{metric_name}-{mesh_name}"] = value * 100
                mesh_names.add(mesh_name)
        self._print_mesh_alignment_results(results, mesh_names)
        return results

    def _print_mesh_alignment_results(self, results: Dict[str, float], mesh_names: Iterable[str]):
        """
        Log the mesh alignment metrics as a table, one row per mesh followed
        by the mean values. Meshes are listed in sorted order so that the
        output does not depend on set iteration order.
        """
        self._logger.info("Evaluation results for densepose, mesh alignment:")
        self._logger.info(f'| {"Mesh":13s} | {"GErr":7s} | {"GPS":7s} |')
        self._logger.info("| :-----------: | :-----: | :-----: |")
        for mesh_name in sorted(mesh_names):
            ge_key = f"GE-{mesh_name}"
            ge_str = f"{results[ge_key]:.4f}" if ge_key in results else " "
            gps_key = f"GPS-{mesh_name}"
            gps_str = f"{results[gps_key]:.4f}" if gps_key in results else " "
            self._logger.info(f"| {mesh_name:13s} | {ge_str:7s} | {gps_str:7s} |")
        self._logger.info("| :-------------------------------: |")
        ge_key = "GE"
        ge_str = f"{results[ge_key]:.4f}" if ge_key in results else " "
        gps_key = "GPS"
        gps_str = f"{results[gps_key]:.4f}" if gps_key in results else " "
        self._logger.info(f'| {"MEAN":13s} | {ge_str:7s} | {gps_str:7s} |')
def densepose_cse_predictions_to_dict(instances, embedder, class_to_mesh_name, use_storage):
    """
    Convert CSE predictor outputs into one plain dict per instance.

    Args:
        instances: object supporting `len()` whose `pred_densepose[k]` exposes
            `coarse_segm` and `embedding`
        embedder: not used by this function
        class_to_mesh_name: not used by this function
        use_storage: not used by this function

    Returns:
        list[dict]: for every instance, the first element of its
        `coarse_segm` and `embedding` along the leading axis, moved to CPU
    """

    def _as_record(predictor_output):
        return {
            "coarse_segm": predictor_output.coarse_segm[0].cpu(),
            "embedding": predictor_output.embedding[0].cpu(),
        }

    return [_as_record(instances.pred_densepose[idx]) for idx in range(len(instances))]
def _get_densepose_metrics(min_threshold: float = 0.5):
    """
    List the metric names reported for a given minimum IOU threshold.

    The low-threshold entries "AP20", "AP30" and "AP40" are included only
    when `min_threshold` does not exceed 0.201, 0.301 and 0.401 respectively;
    all other names are always present.

    Args:
        min_threshold: lowest IOU threshold used by the evaluation

    Returns:
        list[str]: metric names, starting with "AP"
    """
    low_threshold_metrics = ((0.201, "AP20"), (0.301, "AP30"), (0.401, "AP40"))
    names = ["AP"]
    names.extend(name for cutoff, name in low_threshold_metrics if min_threshold <= cutoff)
    names += ["AP50", "AP75", "APm", "APl", "AR", "AR50", "AR75", "ARm", "ARl"]
    return names
def build_densepose_evaluator_storage(cfg: CfgNode, output_folder: str):
    """
    Create a per-process tensor storage for DensePose evaluation results.

    Args:
        cfg (CfgNode): configuration options; reads `DENSEPOSE_EVALUATION.STORAGE`,
            `DENSEPOSE_EVALUATION.TYPE` and the `MODEL.ROI_DENSEPOSE_HEAD` sizes
        output_folder (str): folder in which the storage file is created
            (used only for the "file" storage)
    Return:
        None if the storage is "none", otherwise a single process RAM ("ram")
        or file ("file") tensor storage with a schema that matches the evaluator type
    Raises ValueError, in case of unknown evaluator type or storage specification
    """
    storage_spec = cfg.DENSEPOSE_EVALUATION.STORAGE
    if storage_spec == "none":
        return None
    evaluator_type = cfg.DENSEPOSE_EVALUATION.TYPE
    head_cfg = cfg.MODEL.ROI_DENSEPOSE_HEAD
    # common output tensor sizes
    hout = head_cfg.HEATMAP_SIZE
    wout = head_cfg.HEATMAP_SIZE
    n_csc = head_cfg.NUM_COARSE_SEGM_CHANNELS
    # specific output tensors
    # NOTE: the evaluation config documents (and defaults to) the type "iou",
    # while only "iuv" used to be recognised here, so the default configuration
    # raised as soon as a storage was requested; both spellings are accepted
    if evaluator_type in ("iuv", "iou"):
        n_fsc = head_cfg.NUM_PATCHES + 1
        schema = {
            "coarse_segm": SizeData(dtype="float32", shape=(n_csc, hout, wout)),
            "fine_segm": SizeData(dtype="float32", shape=(n_fsc, hout, wout)),
            "u": SizeData(dtype="float32", shape=(n_fsc, hout, wout)),
            "v": SizeData(dtype="float32", shape=(n_fsc, hout, wout)),
        }
    elif evaluator_type == "cse":
        embed_size = head_cfg.CSE.EMBED_SIZE
        schema = {
            "coarse_segm": SizeData(dtype="float32", shape=(n_csc, hout, wout)),
            "embedding": SizeData(dtype="float32", shape=(embed_size, hout, wout)),
        }
    else:
        raise ValueError(f"Unknown evaluator type: {evaluator_type}")
    # storage types
    if storage_spec == "ram":
        storage = SingleProcessRamTensorStorage(schema, io.BytesIO())
    elif storage_spec == "file":
        # one file per process, the rank is a part of the file name
        fpath = os.path.join(output_folder, f"DensePoseEvaluatorStorage.{get_rank()}.bin")
        PathManager.mkdirs(output_folder)
        storage = SingleProcessFileTensorStorage(schema, fpath, "wb")
    else:
        raise ValueError(f"Unknown storage specification: {storage_spec}")
    return storage
+ for mesh_name_2 in self.mesh_names: + if mesh_name_1 == mesh_name_2: + continue + embeddings_2 = self.embedder(mesh_name_2) + keyvertices_2 = self.mesh_keyvertices[mesh_name_2] + sim_matrix_12 = embeddings_1[keyvertex_indices_1].mm(embeddings_2.T) + vertices_2_matching_keyvertices_1 = sim_matrix_12.argmax(axis=1) + mesh_2 = create_mesh(mesh_name_2, embeddings_2.device) + geodists = mesh_2.geodists[ + vertices_2_matching_keyvertices_1, + [keyvertices_2[name] for name in keyvertex_names_1], + ] + Current_Mean_Distances = 0.255 + gps = (-(geodists**2) / (2 * (Current_Mean_Distances**2))).exp() + avg_errors.append(geodists.mean().item()) + avg_gps.append(gps.mean().item()) + + ge_mean = torch.as_tensor(avg_errors).mean().item() + gps_mean = torch.as_tensor(avg_gps).mean().item() + ge_per_mesh[mesh_name_1] = ge_mean + gps_per_mesh[mesh_name_1] = gps_mean + ge_mean_global = torch.as_tensor(list(ge_per_mesh.values())).mean().item() + gps_mean_global = torch.as_tensor(list(gps_per_mesh.values())).mean().item() + per_mesh_metrics = { + "GE": ge_per_mesh, + "GPS": gps_per_mesh, + } + return ge_mean_global, gps_mean_global, per_mesh_metrics diff --git a/densepose/evaluation/tensor_storage.py b/densepose/evaluation/tensor_storage.py new file mode 100644 index 0000000000000000000000000000000000000000..db57c6ac73a423e39b1ed2e5a4a1f824aa233737 --- /dev/null +++ b/densepose/evaluation/tensor_storage.py @@ -0,0 +1,239 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+ +import io +import numpy as np +import os +from dataclasses import dataclass +from functools import reduce +from operator import mul +from typing import BinaryIO, Dict, Optional, Tuple +import torch + +from detectron2.utils.comm import gather, get_rank +from detectron2.utils.file_io import PathManager + + +@dataclass +class SizeData: + dtype: str + shape: Tuple[int] + + +def _calculate_record_field_size_b(data_schema: Dict[str, SizeData], field_name: str) -> int: + schema = data_schema[field_name] + element_size_b = np.dtype(schema.dtype).itemsize + record_field_size_b = reduce(mul, schema.shape) * element_size_b + return record_field_size_b + + +def _calculate_record_size_b(data_schema: Dict[str, SizeData]) -> int: + record_size_b = 0 + for field_name in data_schema: + record_field_size_b = _calculate_record_field_size_b(data_schema, field_name) + record_size_b += record_field_size_b + return record_size_b + + +def _calculate_record_field_sizes_b(data_schema: Dict[str, SizeData]) -> Dict[str, int]: + field_sizes_b = {} + for field_name in data_schema: + field_sizes_b[field_name] = _calculate_record_field_size_b(data_schema, field_name) + return field_sizes_b + + +class SingleProcessTensorStorage: + """ + Compact tensor storage to keep tensor data of predefined size and type. + """ + + def __init__(self, data_schema: Dict[str, SizeData], storage_impl: BinaryIO): + """ + Construct tensor storage based on information on data shape and size. + Internally uses numpy to interpret the type specification. + The storage must support operations `seek(offset, whence=os.SEEK_SET)` and + `read(size)` to be able to perform the `get` operation. + The storage must support operation `write(bytes)` to be able to perform + the `put` operation. + + Args: + data_schema (dict: str -> SizeData): dictionary which maps tensor name + to its size data (shape and data type), e.g. 
def get(self, record_id: int) -> Dict[str, torch.Tensor]:
    """
    Load tensors from the storage by record ID

    Args:
        record_id (int): Record ID, for which to load the data

    Return:
        dict: str -> tensor: tensor name mapped to tensor data, recorded under the provided ID
    """
    self.storage_impl.seek(record_id * self.record_size_b, os.SEEK_SET)
    data_bytes = self.storage_impl.read(self.record_size_b)
    assert len(data_bytes) == self.record_size_b, (
        f"Expected data size {self.record_size_b} B could not be read: "
        f"got {len(data_bytes)} B"
    )
    record = {}
    offset_b = 0
    # it's important to read and write in the same order
    for field_name in sorted(self.data_schema):
        schema = self.data_schema[field_name]
        # initial value 1, so that scalar fields (empty shape) are supported
        n_elements = reduce(mul, schema.shape, 1)
        # view into the record bytes at the field offset, no intermediate slice
        field_view = np.frombuffer(
            data_bytes, dtype=schema.dtype, count=n_elements, offset=offset_b
        ).reshape(schema.shape)
        # the view over `bytes` is read-only, torch tensors must be backed
        # by writable memory, hence the copy
        record[field_name] = torch.from_numpy(field_view.copy())
        offset_b += self.record_field_sizes_b[field_name]
    return record
class SingleProcessFileTensorStorage(SingleProcessTensorStorage):
    """
    Single process tensor storage backed by a file: records are written to
    or read from the file handle opened for the given path
    """

    def __init__(self, data_schema: Dict[str, SizeData], fpath: str, mode: str):
        """
        Open the file and construct the storage on top of the file handle

        Args:
            data_schema (dict: str -> SizeData): maps tensor names to size data
            fpath (str): path of the storage file, kept in `self.fpath`
            mode (str): file mode, must be binary and contain either "w" or "r"
        Raises ValueError, if the mode contains neither "w" nor "r"
        """
        self.fpath = fpath
        assert "b" in mode, f"Tensor storage should be opened in binary mode, got '{mode}'"
        is_write = "w" in mode
        if not is_write and "r" not in mode:
            raise ValueError(f"Unsupported file mode {mode}, supported modes: rb, wb")
        if is_write:
            # pyre-fixme[6]: For 2nd argument expected `Union[typing_extensions.Liter...
            handle = PathManager.open(fpath, mode)
        else:
            # reading goes through a local copy of the file opened with the builtin `open`
            handle = open(PathManager.get_local_path(fpath), mode)
        super().__init__(data_schema, handle)  # pyre-ignore[6]
def _ram_storage_gather(
    storage: SingleProcessRamTensorStorage, dst_rank: int = 0
) -> Optional[MultiProcessRamTensorStorage]:
    """
    Gather the contents of per-process RAM storages on the destination process

    Args:
        storage (SingleProcessRamTensorStorage): storage of the current process
        dst_rank (int): rank of the process that receives the data
    Return:
        Multi process RAM storage on the process with rank `dst_rank`
        (one buffer per gathered entry, keyed by the entry index), None otherwise
    """
    # rewind, so that `read()` returns the whole buffer
    storage.storage_impl.seek(0, os.SEEK_SET)
    payload = storage.storage_impl.read()
    # TODO: overhead, pickling a bytes object, can just pass bytes in a tensor directly
    # see detectron2/utils.comm.py
    gathered = gather(payload, dst=dst_rank)
    if get_rank() != dst_rank:
        return None
    # presumably entries are ordered by process rank - confirm against `gather`
    rank_to_buffer = {rank: io.BytesIO(blob) for rank, blob in enumerate(gathered)}
    return MultiProcessRamTensorStorage(storage.data_schema, rank_to_buffer)
def build_densepose_predictor(cfg: CfgNode, input_channels: int):
    """
    Instantiate the DensePose predictor selected in the configuration.

    Args:
        cfg (CfgNode): configuration options; the predictor is looked up by
            `MODEL.ROI_DENSEPOSE_HEAD.PREDICTOR_NAME`
        input_channels (int): input tensor size along the channel dimension
    Return:
        An instance of DensePose predictor
    """
    # imported at call time, as in the other builders of this module
    from .predictors import DENSEPOSE_PREDICTOR_REGISTRY

    name = cfg.MODEL.ROI_DENSEPOSE_HEAD.PREDICTOR_NAME
    predictor_factory = DENSEPOSE_PREDICTOR_REGISTRY.get(name)
    return predictor_factory(cfg, input_channels)
class DensePoseUVConfidenceType(Enum):
    """
    Statistical model type for confidence learning, possible values:
     - "iid_iso": statistically independent identically distributed residuals
         with isotropic covariance
     - "indep_aniso": statistically independent residuals with anisotropic
         covariances
    For details, see:
    N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
    Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
    """

    # member values are the strings used to construct the enum from the config
    # fmt: off
    IID_ISO = "iid_iso"
    INDEP_ANISO = "indep_aniso"
    # fmt: on
def create_embedder(embedder_spec: CfgNode, embedder_dim: int) -> nn.Module:
    """
    Build a vertex embedder from its configuration: construct it, optionally
    initialize it from a file and freeze it if it is not trainable

    Args:
        embedder_spec (CfgNode): embedder configuration
        embedder_dim (int): embedding space dimensionality
    Return:
        An embedder instance for the specified configuration
    Raises ValueError, in case of unexpected embedder type
    """
    kind = EmbedderType(embedder_spec.TYPE)
    if kind == EmbedderType.VERTEX_DIRECT:
        result = VertexDirectEmbedder(
            num_vertices=embedder_spec.NUM_VERTICES,
            embed_dim=embedder_dim,
        )
    elif kind == EmbedderType.VERTEX_FEATURE:
        result = VertexFeatureEmbedder(
            num_vertices=embedder_spec.NUM_VERTICES,
            feature_dim=embedder_spec.FEATURE_DIM,
            embed_dim=embedder_dim,
            train_features=embedder_spec.FEATURES_TRAINABLE,
        )
    else:
        raise ValueError(f"Unexpected embedder type {kind}")

    # initialization from a file is the same for all the embedder types
    if embedder_spec.INIT_FILE != "":
        result.load(embedder_spec.INIT_FILE)

    if not embedder_spec.IS_TRAINABLE:
        result.requires_grad_(False)

    return result
state_dict_local[key[len(prefix) :]] = v_key + # non-strict loading to finetune on different meshes + self.load_state_dict(state_dict_local, strict=False) + + def forward(self, mesh_name: str) -> torch.Tensor: + """ + Produce vertex embeddings for the specific mesh; vertex embeddings are + a tensor of shape [N, D] where: + N = number of vertices + D = number of dimensions in the embedding space + Args: + mesh_name (str): name of a mesh for which to obtain vertex embeddings + Return: + Vertex embeddings, a tensor of shape [N, D] + """ + return getattr(self, f"embedder_{mesh_name}")() + + def has_embeddings(self, mesh_name: str) -> bool: + return hasattr(self, f"embedder_{mesh_name}") diff --git a/densepose/modeling/cse/utils.py b/densepose/modeling/cse/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6e70d25df7c8e2c1c408866cf7a6f0156b64114a --- /dev/null +++ b/densepose/modeling/cse/utils.py @@ -0,0 +1,81 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
def squared_euclidean_distance_matrix(pts1: torch.Tensor, pts2: torch.Tensor) -> torch.Tensor:
    """
    Pairwise squared Euclidean distances between two sets of points,
    computed as -2 * <p1, p2> + (|p1|^2 + |p2|^2)

    Args:
        pts1: Tensor [M x D], M is the number of points, D is feature dimensionality
        pts2: Tensor [N x D], N is the number of points, D is feature dimensionality

    Return:
        Tensor [M, N]: matrix of squared Euclidean distances; at index (m, n)
        it contains || pts1[m] - pts2[n] ||^2
    """
    cross_term = torch.mm(-2 * pts1, pts2.t())
    sq_norms_1 = (pts1 * pts1).sum(1, keepdim=True)  # [M, 1]
    sq_norms_2 = (pts2 * pts2).sum(1, keepdim=True)  # [N, 1]
    # norms are summed first and then added to the cross term
    return (cross_term + (sq_norms_1 + sq_norms_2.t())).contiguous()
def get_closest_vertices_mask_from_ES(
    E: torch.Tensor,
    S: torch.Tensor,
    h: int,
    w: int,
    mesh_vertex_embeddings: torch.Tensor,
    device: torch.device,
):
    """
    Resize embeddings and segmentation to the bounding box size, then find
    the closest mesh vertex for every foreground point of the box

    Args:
        E (tensor [1, D, H, W]): D-dimensional embedding vectors for every point of the
            default-sized box
        S (tensor [1, 2, H, W]): 2-dimensional segmentation mask for every point of the
            default-sized box
        h (int): height of the target bounding box
        w (int): width of the target bounding box
        mesh_vertex_embeddings (tensor [N, D]): vertex embeddings for a chosen mesh
            N is the number of vertices in the mesh, D is feature dimensionality
        device (torch.device): device to move the tensors to
    Return:
        Closest Vertices (tensor [h, w]), int, for every point of the resulting box
            (zeros outside of the mask)
        Segmentation mask (tensor [h, w]), boolean, for every point of the resulting box
    """
    box_size = (h, w)
    embeddings_hw = F.interpolate(E, size=box_size, mode="bilinear")[0].to(device)
    segm_hw = F.interpolate(S, size=box_size, mode="bilinear")[0].to(device)
    # foreground: points where a channel other than 0 has the highest score
    mask = segm_hw.argmax(0) > 0
    closest_vertices = torch.zeros(mask.shape, dtype=torch.long, device=device)
    fg_embeddings = embeddings_hw[:, mask].t()
    n_fg = len(fg_embeddings)
    if n_fg == 0:
        return closest_vertices, mask
    size_chunk = 10_000  # Chunking to avoid possible OOM
    nearest_per_chunk = [
        torch.argmin(
            squared_euclidean_distance_matrix(
                fg_embeddings[start : start + size_chunk], mesh_vertex_embeddings
            ),
            dim=1,
        )
        for start in range(0, n_fg, size_chunk)
    ]
    closest_vertices[mask] = torch.cat(nearest_per_chunk)
    return closest_vertices, mask
class VertexDirectEmbedder(nn.Module):
    """
    Class responsible for embedding vertices. Vertex embeddings take
    the form of a tensor of size [N, D], where
        N = number of vertices
        D = number of dimensions in the embedding space
    """

    def __init__(self, num_vertices: int, embed_dim: int):
        """
        Initialize embedder, embeddings are set to zeros

        Args:
            num_vertices (int): number of vertices to embed
            embed_dim (int): number of dimensions in the embedding space
        """
        super().__init__()
        # allocated uninitialized, values are set by `reset_parameters`
        self.embeddings = nn.Parameter(torch.empty(num_vertices, embed_dim))
        self.reset_parameters()

    @torch.no_grad()
    def reset_parameters(self):
        """
        Reset embeddings to zeros
        """
        self.embeddings.zero_()

    def forward(self) -> torch.Tensor:
        """
        Produce vertex embeddings, a tensor of shape [N, D] where:
            N = number of vertices
            D = number of dimensions in the embedding space

        Return:
            Full vertex embeddings passed through `normalize_embeddings`,
            a tensor of shape [N, D]
        """
        return normalize_embeddings(self.embeddings)

    @torch.no_grad()
    def load(self, fpath: str):
        """
        Load data from a file; the file is expected to contain a pickled
        mapping, the entry "embeddings" (if present) is copied into the
        embeddings parameter

        Args:
            fpath (str): file path to load data from
        """
        # NOTE: unpickling can execute arbitrary code, load only trusted files
        with PathManager.open(fpath, "rb") as hFile:
            data = pickle.load(hFile)
        if "embeddings" in data:
            self.embeddings.copy_(
                torch.tensor(data["embeddings"]).float().to(device=self.embeddings.device)
            )
class VertexFeatureEmbedder(nn.Module):
    """
    Class responsible for embedding vertex features. Mapping from
    feature space to the embedding space is a tensor of size [K, D], where
        K = number of dimensions in the feature space
        D = number of dimensions in the embedding space
    Vertex features is a tensor of size [N, K], where
        N = number of vertices
        K = number of dimensions in the feature space
    Vertex embeddings are computed as F * E = tensor of size [N, D]
    """

    def __init__(
        self, num_vertices: int, feature_dim: int, embed_dim: int, train_features: bool = False
    ):
        """
        Initialize embedder, features and embeddings are set to zeros

        Args:
            num_vertices (int): number of vertices to embed
            feature_dim (int): number of dimensions in the feature space
            embed_dim (int): number of dimensions in the embedding space
            train_features (bool): determines whether vertex features should
                be trained (default: False)
        """
        super().__init__()
        # tensors are allocated uninitialized, values are set by `reset_parameters`
        if train_features:
            self.features = nn.Parameter(torch.empty(num_vertices, feature_dim))
        else:
            # a buffer is a part of the module state, but not a parameter
            self.register_buffer("features", torch.empty(num_vertices, feature_dim))
        self.embeddings = nn.Parameter(torch.empty(feature_dim, embed_dim))
        self.reset_parameters()

    @torch.no_grad()
    def reset_parameters(self):
        """
        Reset features and embeddings to zeros
        """
        self.features.zero_()
        self.embeddings.zero_()

    def forward(self) -> torch.Tensor:
        """
        Produce vertex embeddings, a tensor of shape [N, D] where:
            N = number of vertices
            D = number of dimensions in the embedding space

        Return:
            Full vertex embeddings (product of features and embeddings passed
            through `normalize_embeddings`), a tensor of shape [N, D]
        """
        return normalize_embeddings(torch.mm(self.features, self.embeddings))

    @torch.no_grad()
    def load(self, fpath: str):
        """
        Load data from a file; the file is expected to contain a pickled
        mapping, the entries "features" and "embeddings" (if present) are
        copied into the corresponding tensors

        Args:
            fpath (str): file path to load data from
        """
        # NOTE: unpickling can execute arbitrary code, load only trusted files
        with PathManager.open(fpath, "rb") as hFile:
            data = pickle.load(hFile)
        for name in ["features", "embeddings"]:
            if name in data:
                target = getattr(self, name)
                target.copy_(torch.tensor(data[name]).float().to(device=target.device))
class DensePoseDataFilter:
    """
    Callable that drops proposals which cannot contribute to DensePose
    training: proposals whose IoU with the matched GT box is not above the
    configured threshold, and proposals with neither a DensePose annotation
    nor (when masks are kept) a mask annotation.
    """

    def __init__(self, cfg: CfgNode):
        head_cfg = cfg.MODEL.ROI_DENSEPOSE_HEAD
        self.iou_threshold = head_cfg.FG_IOU_THRESHOLD
        self.keep_masks = head_cfg.COARSE_SEGM_TRAINED_BY_MASKS

    @torch.no_grad()
    def __call__(self, features: List[torch.Tensor], proposals_with_targets: List[Instances]):
        """
        Filters proposals with targets to keep only the ones relevant for
        DensePose training

        Args:
            features (list[Tensor]): input data as a list of features,
                each feature is a tensor. Axis 0 represents the number of
                images `N` in the input data; axes 1-3 are channels,
                height, and width, which may vary between features
                (e.g., if a feature pyramid is used).
            proposals_with_targets (list[Instances]): length `N` list of
                `Instances`. The i-th `Instances` contains instances
                (proposals, GT) for the i-th input image,
        Returns:
            list[Tensor]: the input features, returned unchanged
            list[Instances]: filtered proposals; images without usable
                ground truth are omitted from this list
        """
        # TODO: an earlier variant also sliced the image features with a
        # per-image boolean mask so that images without valid DensePose GT
        # were removed from `features` as well. This led to performance
        # deterioration, both in terms of runtime and in terms of evaluation
        # results, so the features are passed through untouched.
        kept_proposals = []
        for proposals_per_image in proposals_with_targets:
            has_usable_gt = proposals_per_image.has("gt_densepose") or (
                proposals_per_image.has("gt_masks") and self.keep_masks
            )
            if not has_usable_gt:
                continue

            # apply match threshold for densepose head
            overlaps = matched_pairwise_iou(
                proposals_per_image.gt_boxes, proposals_per_image.proposal_boxes
            )
            proposals_per_image = proposals_per_image[  # pyre-ignore[6]
                overlaps > self.iou_threshold
            ]

            N_gt_boxes = len(proposals_per_image.gt_boxes)
            assert N_gt_boxes == len(proposals_per_image.proposal_boxes), (
                f"The number of GT boxes {N_gt_boxes} is different from the "
                f"number of proposal boxes {len(proposals_per_image.proposal_boxes)}"
            )

            # filter out any target without suitable annotation
            placeholders = [None] * N_gt_boxes
            gt_masks = placeholders
            if self.keep_masks and hasattr(proposals_per_image, "gt_masks"):
                gt_masks = proposals_per_image.gt_masks
            gt_densepose = placeholders
            if hasattr(proposals_per_image, "gt_densepose"):
                gt_densepose = proposals_per_image.gt_densepose
            assert len(gt_masks) == N_gt_boxes
            assert len(gt_densepose) == N_gt_boxes

            annotated = [
                idx
                for idx, (dp_target, mask_target) in enumerate(zip(gt_densepose, gt_masks))
                if not (dp_target is None and mask_target is None)
            ]
            if len(annotated) != N_gt_boxes:
                proposals_per_image = proposals_per_image[annotated]  # pyre-ignore[6]
            assert len(proposals_per_image.gt_boxes) == len(proposals_per_image.proposal_boxes)
            kept_proposals.append(proposals_per_image)
        return features, kept_proposals
class HRFPN(Backbone):
    """HRFPN (High Resolution Feature Pyramids)
    Transforms outputs of HRNet backbone so they are suitable for the ROI_heads
    arXiv: https://arxiv.org/abs/1904.04514
    Adapted from https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/necks/hrfpn.py
    Args:
        bottom_up: HRNet backbone; called on the network input, it must return
            a mapping from feature name to feature tensor
        in_features (list): names of the input features (output of HRNet)
        n_out_features (int): number of output stages
        in_channels (list): number of channels for each branch
        out_channels (int): output channels of feature pyramids
        pooling (str): pooling for generating feature pyramids (from {MAX, AVG});
            the selected function is stored in `self.pooling`, `forward` does
            not use it
        share_conv (bool): Have one conv per output, or share one with all the outputs
    """

    def __init__(
        self,
        bottom_up,
        in_features,
        n_out_features,
        in_channels,
        out_channels,
        pooling="AVG",
        share_conv=False,
    ):
        super().__init__()
        assert isinstance(in_channels, list)
        self.bottom_up = bottom_up
        self.in_features = in_features
        self.n_out_features = n_out_features
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_ins = len(in_channels)
        self.share_conv = share_conv

        # NOTE: submodules are created in a fixed order (fpn_conv, interp_conv,
        # reduction_pooling_conv) to keep parameter names and random
        # initialization identical to existing checkpoints.
        if self.share_conv:
            self.fpn_conv = nn.Conv2d(
                in_channels=out_channels, out_channels=out_channels, kernel_size=3, padding=1
            )
        else:
            self.fpn_conv = nn.ModuleList()
            for _ in range(self.n_out_features):
                self.fpn_conv.append(
                    nn.Conv2d(
                        in_channels=out_channels,
                        out_channels=out_channels,
                        kernel_size=3,
                        padding=1,
                    )
                )

        # Custom change: Replaces a simple bilinear interpolation
        # (branch i is upsampled by a transposed conv with stride 2**i)
        self.interp_conv = nn.ModuleList()
        for i in range(len(self.in_features)):
            self.interp_conv.append(
                nn.Sequential(
                    nn.ConvTranspose2d(
                        in_channels=in_channels[i],
                        out_channels=in_channels[i],
                        kernel_size=4,
                        stride=2**i,
                        padding=0,
                        output_padding=0,
                        bias=False,
                    ),
                    nn.BatchNorm2d(in_channels[i], momentum=0.1),
                    nn.ReLU(inplace=True),
                )
            )

        # Custom change: Replaces a couple (reduction conv + pooling) by one conv
        # (output level i uses kernel size and stride 2**i)
        self.reduction_pooling_conv = nn.ModuleList()
        for i in range(self.n_out_features):
            self.reduction_pooling_conv.append(
                nn.Sequential(
                    nn.Conv2d(sum(in_channels), out_channels, kernel_size=2**i, stride=2**i),
                    nn.BatchNorm2d(out_channels, momentum=0.1),
                    nn.ReLU(inplace=True),
                )
            )

        if pooling == "MAX":
            self.pooling = F.max_pool2d
        else:
            self.pooling = F.avg_pool2d

        self._out_features = []
        self._out_feature_channels = {}
        self._out_feature_strides = {}

        for i in range(self.n_out_features):
            name = "p%d" % (i + 1)
            self._out_features.append(name)
            self._out_feature_channels[name] = self.out_channels
            self._out_feature_strides[name] = 2 ** (i + 2)

    # default init_weights for conv(msra) and norm in ConvModule
    def init_weights(self):
        """
        Re-initialize every `nn.Conv2d` reachable from this module (including
        those of `bottom_up`): Kaiming-normal weights and zero biases.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, a=1)
                # convolutions created with bias=False have `bias is None`;
                # skip them instead of crashing inside `constant_`
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, inputs):
        """
        Run the bottom-up backbone and build the feature pyramid.

        Args:
            inputs: input passed as is to `bottom_up`
        Return:
            dict: output feature name ("p1", "p2", ...) -> feature tensor
        """
        bottom_up_features = self.bottom_up(inputs)
        assert len(bottom_up_features) == len(self.in_features)
        inputs = [bottom_up_features[f] for f in self.in_features]

        # bring every branch to (roughly) a common resolution ...
        outs = [interp(branch) for interp, branch in zip(self.interp_conv, inputs)]
        # ... and crop to the smallest spatial size before concatenating channels
        shape_2 = min(o.shape[2] for o in outs)
        shape_3 = min(o.shape[3] for o in outs)
        out = torch.cat([o[:, :, :shape_2, :shape_3] for o in outs], dim=1)

        outs = [reduce_conv(out) for reduce_conv in self.reduction_pooling_conv]

        # Make shapes consistent: the i-th level counted from the coarsest one
        # is cropped to 2**i times the coarsest level's height and width
        if outs:
            coarse_h, coarse_w = outs[-1].shape[2], outs[-1].shape[3]
            for i in range(len(outs)):
                outs[-1 - i] = outs[-1 - i][:, :, : coarse_h * 2**i, : coarse_w * 2**i]

        if self.share_conv:
            outputs = [self.fpn_conv(o) for o in outs]
        else:
            outputs = [self.fpn_conv[i](o) for i, o in enumerate(outs)]

        assert len(self._out_features) == len(outputs)
        return dict(zip(self._out_features, outputs))
def conv3x3(in_planes, out_planes, stride=1):
    """Build a bias-free 3x3 convolution with a padding of 1"""
    conv_kwargs = {"kernel_size": 3, "stride": stride, "padding": 1, "bias": False}
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)
class HighResolutionModule(nn.Module):
    """HighResolutionModule
    Building block of the PoseHigherResolutionNet (see lower)
    arXiv: https://arxiv.org/abs/1908.10357
    Args:
        num_branches (int): number of branches of the module
        blocks (type): block class used in every branch; it is instantiated as
            `blocks(inplanes, planes[, stride, downsample])` and must expose
            an `expansion` attribute
        num_blocks (list): number of blocks for each branch
        num_inchannels (list): number of input channels for each branch;
            NOTE: this list is updated in place while the branches are built
        num_channels (list): number of channels of each branch
        multi_scale_output (bool): only used by the last module of PoseHigherResolutionNet
    """

    def __init__(
        self,
        num_branches,
        blocks,
        num_blocks,
        num_inchannels,
        num_channels,
        multi_scale_output=True,
    ):
        super(HighResolutionModule, self).__init__()
        self._check_branches(num_branches, blocks, num_blocks, num_inchannels, num_channels)

        self.num_inchannels = num_inchannels
        self.num_branches = num_branches

        self.multi_scale_output = multi_scale_output

        # branches must be built before the fuse layers: building them updates
        # `self.num_inchannels`, which `_make_fuse_layers` reads
        self.branches = self._make_branches(num_branches, blocks, num_blocks, num_channels)
        self.fuse_layers = self._make_fuse_layers()
        self.relu = nn.ReLU(True)

    def _check_branches(self, num_branches, blocks, num_blocks, num_inchannels, num_channels):
        """
        Validate that the per-branch lists all have `num_branches` entries.

        Raises:
            ValueError: if `num_blocks`, `num_channels` or `num_inchannels`
                has a length different from `num_branches`
        """
        if num_branches != len(num_blocks):
            error_msg = "NUM_BRANCHES({}) <> NUM_BLOCKS({})".format(num_branches, len(num_blocks))
            logger.error(error_msg)
            raise ValueError(error_msg)

        if num_branches != len(num_channels):
            error_msg = "NUM_BRANCHES({}) <> NUM_CHANNELS({})".format(
                num_branches, len(num_channels)
            )
            logger.error(error_msg)
            raise ValueError(error_msg)

        if num_branches != len(num_inchannels):
            error_msg = "NUM_BRANCHES({}) <> NUM_INCHANNELS({})".format(
                num_branches, len(num_inchannels)
            )
            logger.error(error_msg)
            raise ValueError(error_msg)

    def _make_one_branch(self, branch_index, block, num_blocks, num_channels, stride=1):
        """
        Build the sequence of `num_blocks[branch_index]` blocks of one branch.
        Side effect: sets `self.num_inchannels[branch_index]` to the branch's
        output channel count.
        """
        downsample = None
        # a 1x1 conv projection on the residual path is needed whenever the
        # first block changes the resolution or the number of channels
        if (
            stride != 1
            or self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion
        ):
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.num_inchannels[branch_index],
                    num_channels[branch_index] * block.expansion,
                    kernel_size=1,
                    stride=stride,
                    bias=False,
                ),
                nn.BatchNorm2d(num_channels[branch_index] * block.expansion, momentum=BN_MOMENTUM),
            )

        layers = []
        layers.append(
            block(self.num_inchannels[branch_index], num_channels[branch_index], stride, downsample)
        )
        # the remaining blocks of the branch take the expanded channel count as input
        self.num_inchannels[branch_index] = num_channels[branch_index] * block.expansion
        for _ in range(1, num_blocks[branch_index]):
            layers.append(block(self.num_inchannels[branch_index], num_channels[branch_index]))

        return nn.Sequential(*layers)

    def _make_branches(self, num_branches, block, num_blocks, num_channels):
        """Build all branches and return them as a `nn.ModuleList`."""
        branches = []

        for i in range(num_branches):
            branches.append(self._make_one_branch(i, block, num_blocks, num_channels))

        return nn.ModuleList(branches)

    def _make_fuse_layers(self):
        """
        Build the layers that exchange information between branches.
        `fuse_layers[i][j]` maps the output of branch `j` to the channel count
        of branch `i`; the entry is `None` for `j == i`. Returns `None` for a
        single-branch module.
        """
        if self.num_branches == 1:
            return None

        num_branches = self.num_branches
        num_inchannels = self.num_inchannels
        fuse_layers = []
        # without multi-scale output only the fusion into branch 0 is built
        for i in range(num_branches if self.multi_scale_output else 1):
            fuse_layer = []
            for j in range(num_branches):
                if j > i:
                    # source branch index is larger: 1x1 conv to match the
                    # channels, then nearest-neighbor upsampling by 2 ** (j - i)
                    fuse_layer.append(
                        nn.Sequential(
                            nn.Conv2d(num_inchannels[j], num_inchannels[i], 1, 1, 0, bias=False),
                            nn.BatchNorm2d(num_inchannels[i]),
                            nn.Upsample(scale_factor=2 ** (j - i), mode="nearest"),
                        )
                    )
                elif j == i:
                    fuse_layer.append(None)
                else:
                    # source branch index is smaller: chain (i - j) stride-2
                    # 3x3 convs; only the last one switches to the target
                    # channel count and it has no ReLU
                    conv3x3s = []
                    for k in range(i - j):
                        if k == i - j - 1:
                            num_outchannels_conv3x3 = num_inchannels[i]
                            conv3x3s.append(
                                nn.Sequential(
                                    nn.Conv2d(
                                        num_inchannels[j],
                                        num_outchannels_conv3x3,
                                        3,
                                        2,
                                        1,
                                        bias=False,
                                    ),
                                    nn.BatchNorm2d(num_outchannels_conv3x3),
                                )
                            )
                        else:
                            num_outchannels_conv3x3 = num_inchannels[j]
                            conv3x3s.append(
                                nn.Sequential(
                                    nn.Conv2d(
                                        num_inchannels[j],
                                        num_outchannels_conv3x3,
                                        3,
                                        2,
                                        1,
                                        bias=False,
                                    ),
                                    nn.BatchNorm2d(num_outchannels_conv3x3),
                                    nn.ReLU(True),
                                )
                            )
                    fuse_layer.append(nn.Sequential(*conv3x3s))
            fuse_layers.append(nn.ModuleList(fuse_layer))

        return nn.ModuleList(fuse_layers)

    def get_num_inchannels(self):
        """Return the per-branch channel counts as updated while building the branches."""
        return self.num_inchannels

    def forward(self, x):
        """
        Args:
            x (list): one input per branch; NOTE: for multi-branch modules the
                list entries are overwritten in place with the branch outputs
        Return:
            list: fused outputs, one per entry of `self.fuse_layers` (a single
                element list for a single-branch module)
        """
        if self.num_branches == 1:
            return [self.branches[0](x[0])]

        for i in range(self.num_branches):
            x[i] = self.branches[i](x[i])

        x_fuse = []

        for i in range(len(self.fuse_layers)):
            # start from branch 0 (transformed unless it is the target branch)
            # and accumulate the contributions of all other branches
            y = x[0] if i == 0 else self.fuse_layers[i][0](x[0])
            for j in range(1, self.num_branches):
                if i == j:
                    y = y + x[j]
                else:
                    # crop to the accumulator's height/width before adding
                    z = self.fuse_layers[i][j](x[j])[:, :, : y.shape[2], : y.shape[3]]
                    y = y + z
            x_fuse.append(self.relu(y))

        return x_fuse
+ self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.layer1 = self._make_layer(Bottleneck, 64, 4) + + self.stage2_cfg = cfg.MODEL.HRNET.STAGE2 + num_channels = self.stage2_cfg.NUM_CHANNELS + block = blocks_dict[self.stage2_cfg.BLOCK] + num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))] + self.transition1 = self._make_transition_layer([256], num_channels) + self.stage2, pre_stage_channels = self._make_stage(self.stage2_cfg, num_channels) + + self.stage3_cfg = cfg.MODEL.HRNET.STAGE3 + num_channels = self.stage3_cfg.NUM_CHANNELS + block = blocks_dict[self.stage3_cfg.BLOCK] + num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))] + self.transition2 = self._make_transition_layer(pre_stage_channels, num_channels) + self.stage3, pre_stage_channels = self._make_stage(self.stage3_cfg, num_channels) + + self.stage4_cfg = cfg.MODEL.HRNET.STAGE4 + num_channels = self.stage4_cfg.NUM_CHANNELS + block = blocks_dict[self.stage4_cfg.BLOCK] + num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))] + self.transition3 = self._make_transition_layer(pre_stage_channels, num_channels) + self.stage4, pre_stage_channels = self._make_stage( + self.stage4_cfg, num_channels, multi_scale_output=True + ) + + self._out_features = [] + self._out_feature_channels = {} + self._out_feature_strides = {} + + for i in range(cfg.MODEL.HRNET.STAGE4.NUM_BRANCHES): + self._out_features.append("p%d" % (i + 1)) + self._out_feature_channels.update( + {self._out_features[-1]: cfg.MODEL.HRNET.STAGE4.NUM_CHANNELS[i]} + ) + self._out_feature_strides.update({self._out_features[-1]: 1}) + + def _get_deconv_cfg(self, deconv_kernel): + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + 
output_padding = 0 + + return deconv_kernel, padding, output_padding + + def _make_transition_layer(self, num_channels_pre_layer, num_channels_cur_layer): + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + nn.Conv2d( + num_channels_pre_layer[i], + num_channels_cur_layer[i], + 3, + 1, + 1, + bias=False, + ), + nn.BatchNorm2d(num_channels_cur_layer[i]), + nn.ReLU(inplace=True), + ) + ) + else: + transition_layers.append(None) + else: + conv3x3s = [] + for j in range(i + 1 - num_branches_pre): + inchannels = num_channels_pre_layer[-1] + outchannels = ( + num_channels_cur_layer[i] if j == i - num_branches_pre else inchannels + ) + conv3x3s.append( + nn.Sequential( + nn.Conv2d(inchannels, outchannels, 3, 2, 1, bias=False), + nn.BatchNorm2d(outchannels), + nn.ReLU(inplace=True), + ) + ) + transition_layers.append(nn.Sequential(*conv3x3s)) + + return nn.ModuleList(transition_layers) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False, + ), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _make_stage(self, layer_config, num_inchannels, multi_scale_output=True): + num_modules = layer_config["NUM_MODULES"] + num_branches = layer_config["NUM_BRANCHES"] + num_blocks = layer_config["NUM_BLOCKS"] + num_channels = layer_config["NUM_CHANNELS"] + block = 
def forward(self, x):
    """
    Run the stem, then stages 2-4, and return the outputs of the last stage
    keyed by output feature name.
    """
    # stem: two conv/bn/relu groups followed by the first residual layer
    stem = (self.conv1, self.bn1, self.relu, self.conv2, self.bn2, self.relu, self.layer1)
    for layer in stem:
        x = layer(x)

    # stage 2: every branch is derived from the single stem output
    branch_inputs = []
    for idx in range(self.stage2_cfg.NUM_BRANCHES):
        transition = self.transition1[idx]
        branch_inputs.append(x if transition is None else transition(x))
    branch_outputs = self.stage2(branch_inputs)

    # stages 3 and 4: a branch without a transition reuses the previous
    # stage's output at the same index; a branch with a transition is
    # derived from the previous stage's last output
    later_stages = (
        (self.stage3_cfg, self.transition2, self.stage3),
        (self.stage4_cfg, self.transition3, self.stage4),
    )
    for stage_cfg, transitions, stage in later_stages:
        branch_inputs = []
        for idx in range(stage_cfg.NUM_BRANCHES):
            transition = transitions[idx]
            if transition is None:
                branch_inputs.append(branch_outputs[idx])
            else:
                branch_inputs.append(transition(branch_outputs[-1]))
        branch_outputs = stage(branch_inputs)

    assert len(self._out_features) == len(branch_outputs)
    return dict(zip(self._out_features, branch_outputs))  # final_outputs
def densepose_inference(densepose_predictor_output: Any, detections: List[Instances]) -> None:
    """
    Splits DensePose predictor outputs into chunks, each chunk corresponds to
    detections on one image. Predictor output chunks are stored in `pred_densepose`
    attribute of the corresponding `Instances` object.

    Args:
        densepose_predictor_output: a dataclass instance (can be of different types,
            depending on predictor used for inference). Each field can be `None`
            (if the corresponding output was not inferred) or a tensor of size
            [N, ...], where N = N_1 + N_2 + .. + N_k is a total number of
            detections on all images, N_1 is the number of detections on image 1,
            N_2 is the number of detections on image 2, etc. If the whole
            output is `None`, no `pred_densepose` attribute is added.
        detections: a list of objects of type `Instance`, k-th object corresponds
            to detections on k-th image.
    """
    # nothing to distribute: don't add `pred_densepose` attribute
    if densepose_predictor_output is None or not detections:
        return

    # we assume here that `densepose_predictor_output` is a dataclass object;
    # its type and field values do not depend on the image, so read them once
    PredictorOutput = type(densepose_predictor_output)
    field_values = {
        field.name: getattr(densepose_predictor_output, field.name)
        for field in fields(densepose_predictor_output)
    }

    k = 0  # index of the first row that belongs to the current image
    for detection_i in detections:
        n_i = len(detection_i)
        # slice tensors, leave others (e.g. `None`) as is
        output_i_dict = {
            name: value[k : k + n_i] if isinstance(value, torch.Tensor) else value
            for name, value in field_values.items()
        }
        detection_i.pred_densepose = PredictorOutput(**output_i_dict)
        k += n_i
+ +from .chart import DensePoseChartLoss +from .chart_with_confidences import DensePoseChartWithConfidenceLoss +from .cse import DensePoseCseLoss +from .registry import DENSEPOSE_LOSS_REGISTRY + + +__all__ = [ + "DensePoseChartLoss", + "DensePoseChartWithConfidenceLoss", + "DensePoseCseLoss", + "DENSEPOSE_LOSS_REGISTRY", +] diff --git a/densepose/modeling/losses/chart.py b/densepose/modeling/losses/chart.py new file mode 100644 index 0000000000000000000000000000000000000000..02cdae8db3a41fc197be7fcc792c7119c7a21726 --- /dev/null +++ b/densepose/modeling/losses/chart.py @@ -0,0 +1,291 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +from typing import Any, List +import torch +from torch.nn import functional as F + +from detectron2.config import CfgNode +from detectron2.structures import Instances + +from .mask_or_segm import MaskOrSegmentationLoss +from .registry import DENSEPOSE_LOSS_REGISTRY +from .utils import ( + BilinearInterpolationHelper, + ChartBasedAnnotationsAccumulator, + LossDict, + extract_packed_annotations_from_matches, +) + + +@DENSEPOSE_LOSS_REGISTRY.register() +class DensePoseChartLoss: + """ + DensePose loss for chart-based training. A mesh is split into charts, + each chart is given a label (I) and parametrized by 2 coordinates referred to + as U and V. Ground truth consists of a number of points annotated with + I, U and V values and coarse segmentation S defined for all pixels of the + object bounding box. In some cases (see `COARSE_SEGM_TRAINED_BY_MASKS`), + semantic segmentation annotations can be used as ground truth inputs as well. 
+ + Estimated values are tensors: + * U coordinates, tensor of shape [N, C, S, S] + * V coordinates, tensor of shape [N, C, S, S] + * fine segmentation estimates, tensor of shape [N, C, S, S] with raw unnormalized + scores for each fine segmentation label at each location + * coarse segmentation estimates, tensor of shape [N, D, S, S] with raw unnormalized + scores for each coarse segmentation label at each location + where N is the number of detections, C is the number of fine segmentation + labels, S is the estimate size ( = width = height) and D is the number of + coarse segmentation channels. + + The losses are: + * regression (smooth L1) loss for U and V coordinates + * cross entropy loss for fine (I) and coarse (S) segmentations + Each loss has an associated weight + """ + + def __init__(self, cfg: CfgNode): + """ + Initialize chart-based loss from configuration options + + Args: + cfg (CfgNode): configuration options + """ + # fmt: off + self.heatmap_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE + self.w_points = cfg.MODEL.ROI_DENSEPOSE_HEAD.POINT_REGRESSION_WEIGHTS + self.w_part = cfg.MODEL.ROI_DENSEPOSE_HEAD.PART_WEIGHTS + self.w_segm = cfg.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS + self.n_segm_chan = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS + # fmt: on + self.segm_trained_by_masks = cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS + self.segm_loss = MaskOrSegmentationLoss(cfg) + + def __call__( + self, proposals_with_gt: List[Instances], densepose_predictor_outputs: Any, **kwargs + ) -> LossDict: + """ + Produce chart-based DensePose losses + + Args: + proposals_with_gt (list of Instances): detections with associated ground truth data + densepose_predictor_outputs: an object of a dataclass that contains predictor outputs + with estimated values; assumed to have the following attributes: + * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S] + * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, 
S] + * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S] + * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S] + where N is the number of detections, C is the number of fine segmentation + labels, S is the estimate size ( = width = height) and D is the number of + coarse segmentation channels. + + Return: + dict: str -> tensor: dict of losses with the following entries: + * `loss_densepose_U`: smooth L1 loss for U coordinate estimates + * `loss_densepose_V`: smooth L1 loss for V coordinate estimates + * `loss_densepose_I`: cross entropy for raw unnormalized scores for fine + segmentation estimates given ground truth labels; + * `loss_densepose_S`: cross entropy for raw unnormalized scores for coarse + segmentation estimates given ground truth labels; + """ + # densepose outputs are computed for all images and all bounding boxes; + # i.e. if a batch has 4 images with (3, 1, 2, 1) proposals respectively, + # the outputs will have size(0) == 3+1+2+1 == 7 + + if not len(proposals_with_gt): + return self.produce_fake_densepose_losses(densepose_predictor_outputs) + + accumulator = ChartBasedAnnotationsAccumulator() + packed_annotations = extract_packed_annotations_from_matches(proposals_with_gt, accumulator) + + # NOTE: we need to keep the same computation graph on all the GPUs to + # perform reduction properly. Hence even if we have no data on one + # of the GPUs, we still need to generate the computation graph. + # Add fake (zero) loss in the form Tensor.sum() * 0 + if packed_annotations is None: + return self.produce_fake_densepose_losses(densepose_predictor_outputs) + + h, w = densepose_predictor_outputs.u.shape[2:] + interpolator = BilinearInterpolationHelper.from_matches( + packed_annotations, + (h, w), + ) + + j_valid_fg = interpolator.j_valid * ( # pyre-ignore[16] + packed_annotations.fine_segm_labels_gt > 0 + ) + # pyre-fixme[6]: For 1st param expected `Tensor` but got `int`. 
+ if not torch.any(j_valid_fg): + return self.produce_fake_densepose_losses(densepose_predictor_outputs) + + losses_uv = self.produce_densepose_losses_uv( + proposals_with_gt, + densepose_predictor_outputs, + packed_annotations, + interpolator, + j_valid_fg, # pyre-ignore[6] + ) + + losses_segm = self.produce_densepose_losses_segm( + proposals_with_gt, + densepose_predictor_outputs, + packed_annotations, + interpolator, + j_valid_fg, # pyre-ignore[6] + ) + + return {**losses_uv, **losses_segm} + + def produce_fake_densepose_losses(self, densepose_predictor_outputs: Any) -> LossDict: + """ + Fake losses for fine segmentation and U/V coordinates. These are used when + no suitable ground truth data was found in a batch. The loss has a value 0 + and is primarily used to construct the computation graph, so that + `DistributedDataParallel` has similar graphs on all GPUs and can perform + reduction properly. + + Args: + densepose_predictor_outputs: DensePose predictor outputs, an object + of a dataclass that is assumed to have the following attributes: + * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S] + * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S] + * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S] + Return: + dict: str -> tensor: dict of losses with the following entries: + * `loss_densepose_U`: has value 0 + * `loss_densepose_V`: has value 0 + * `loss_densepose_I`: has value 0 + * `loss_densepose_S`: has value 0 + """ + losses_uv = self.produce_fake_densepose_losses_uv(densepose_predictor_outputs) + losses_segm = self.produce_fake_densepose_losses_segm(densepose_predictor_outputs) + return {**losses_uv, **losses_segm} + + def produce_fake_densepose_losses_uv(self, densepose_predictor_outputs: Any) -> LossDict: + """ + Fake losses for U/V coordinates. These are used when no suitable ground + truth data was found in a batch. 
The loss has a value 0 + and is primarily used to construct the computation graph, so that + `DistributedDataParallel` has similar graphs on all GPUs and can perform + reduction properly. + + Args: + densepose_predictor_outputs: DensePose predictor outputs, an object + of a dataclass that is assumed to have the following attributes: + * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S] + * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S] + Return: + dict: str -> tensor: dict of losses with the following entries: + * `loss_densepose_U`: has value 0 + * `loss_densepose_V`: has value 0 + """ + return { + "loss_densepose_U": densepose_predictor_outputs.u.sum() * 0, + "loss_densepose_V": densepose_predictor_outputs.v.sum() * 0, + } + + def produce_fake_densepose_losses_segm(self, densepose_predictor_outputs: Any) -> LossDict: + """ + Fake losses for fine / coarse segmentation. These are used when + no suitable ground truth data was found in a batch. The loss has a value 0 + and is primarily used to construct the computation graph, so that + `DistributedDataParallel` has similar graphs on all GPUs and can perform + reduction properly. 
+ + Args: + densepose_predictor_outputs: DensePose predictor outputs, an object + of a dataclass that is assumed to have the following attributes: + * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S] + * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S] + Return: + dict: str -> tensor: dict of losses with the following entries: + * `loss_densepose_I`: has value 0 + * `loss_densepose_S`: has value 0, added only if `segm_trained_by_masks` is False + """ + losses = { + "loss_densepose_I": densepose_predictor_outputs.fine_segm.sum() * 0, + "loss_densepose_S": self.segm_loss.fake_value(densepose_predictor_outputs), + } + return losses + + def produce_densepose_losses_uv( + self, + proposals_with_gt: List[Instances], + densepose_predictor_outputs: Any, + packed_annotations: Any, + interpolator: BilinearInterpolationHelper, + j_valid_fg: torch.Tensor, + ) -> LossDict: + """ + Compute losses for U/V coordinates: smooth L1 loss between + estimated coordinates and the ground truth. 
+ + Args: + proposals_with_gt (list of Instances): detections with associated ground truth data + densepose_predictor_outputs: DensePose predictor outputs, an object + of a dataclass that is assumed to have the following attributes: + * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S] + * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S] + Return: + dict: str -> tensor: dict of losses with the following entries: + * `loss_densepose_U`: smooth L1 loss for U coordinate estimates + * `loss_densepose_V`: smooth L1 loss for V coordinate estimates + """ + u_gt = packed_annotations.u_gt[j_valid_fg] + u_est = interpolator.extract_at_points(densepose_predictor_outputs.u)[j_valid_fg] + v_gt = packed_annotations.v_gt[j_valid_fg] + v_est = interpolator.extract_at_points(densepose_predictor_outputs.v)[j_valid_fg] + return { + "loss_densepose_U": F.smooth_l1_loss(u_est, u_gt, reduction="sum") * self.w_points, + "loss_densepose_V": F.smooth_l1_loss(v_est, v_gt, reduction="sum") * self.w_points, + } + + def produce_densepose_losses_segm( + self, + proposals_with_gt: List[Instances], + densepose_predictor_outputs: Any, + packed_annotations: Any, + interpolator: BilinearInterpolationHelper, + j_valid_fg: torch.Tensor, + ) -> LossDict: + """ + Losses for fine / coarse segmentation: cross-entropy + for segmentation unnormalized scores given ground truth labels at + annotated points for fine segmentation and dense mask annotations + for coarse segmentation. 
+ + Args: + proposals_with_gt (list of Instances): detections with associated ground truth data + densepose_predictor_outputs: DensePose predictor outputs, an object + of a dataclass that is assumed to have the following attributes: + * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S] + * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S] + Return: + dict: str -> tensor: dict of losses with the following entries: + * `loss_densepose_I`: cross entropy for raw unnormalized scores for fine + segmentation estimates given ground truth labels + * `loss_densepose_S`: cross entropy for raw unnormalized scores for coarse + segmentation estimates given ground truth labels; + may be included if coarse segmentation is only trained + using DensePose ground truth; if additional supervision through + instance segmentation data is performed (`segm_trained_by_masks` is True), + this loss is handled by `produce_mask_losses` instead + """ + fine_segm_gt = packed_annotations.fine_segm_labels_gt[ + interpolator.j_valid # pyre-ignore[16] + ] + fine_segm_est = interpolator.extract_at_points( + densepose_predictor_outputs.fine_segm, + slice_fine_segm=slice(None), + w_ylo_xlo=interpolator.w_ylo_xlo[:, None], # pyre-ignore[16] + w_ylo_xhi=interpolator.w_ylo_xhi[:, None], # pyre-ignore[16] + w_yhi_xlo=interpolator.w_yhi_xlo[:, None], # pyre-ignore[16] + w_yhi_xhi=interpolator.w_yhi_xhi[:, None], # pyre-ignore[16] + )[interpolator.j_valid, :] + return { + "loss_densepose_I": F.cross_entropy(fine_segm_est, fine_segm_gt.long()) * self.w_part, + "loss_densepose_S": self.segm_loss( + proposals_with_gt, densepose_predictor_outputs, packed_annotations + ) + * self.w_segm, + } diff --git a/densepose/modeling/losses/chart_with_confidences.py b/densepose/modeling/losses/chart_with_confidences.py new file mode 100644 index 0000000000000000000000000000000000000000..78ce7c6cb02fa01f6319d088349ff4f422001839 --- /dev/null +++ 
b/densepose/modeling/losses/chart_with_confidences.py @@ -0,0 +1,209 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import math +from typing import Any, List +import torch +from torch import nn +from torch.nn import functional as F + +from detectron2.config import CfgNode +from detectron2.structures import Instances + +from .. import DensePoseConfidenceModelConfig, DensePoseUVConfidenceType +from .chart import DensePoseChartLoss +from .registry import DENSEPOSE_LOSS_REGISTRY +from .utils import BilinearInterpolationHelper, LossDict + + +@DENSEPOSE_LOSS_REGISTRY.register() +class DensePoseChartWithConfidenceLoss(DensePoseChartLoss): + """ """ + + def __init__(self, cfg: CfgNode): + super().__init__(cfg) + self.confidence_model_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg) + if self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.IID_ISO: + self.uv_loss_with_confidences = IIDIsotropicGaussianUVLoss( + self.confidence_model_cfg.uv_confidence.epsilon + ) + elif self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.INDEP_ANISO: + self.uv_loss_with_confidences = IndepAnisotropicGaussianUVLoss( + self.confidence_model_cfg.uv_confidence.epsilon + ) + + def produce_fake_densepose_losses_uv(self, densepose_predictor_outputs: Any) -> LossDict: + """ + Overrides fake losses for fine segmentation and U/V coordinates to + include computation graphs for additional confidence parameters. + These are used when no suitable ground truth data was found in a batch. + The loss has a value 0 and is primarily used to construct the computation graph, + so that `DistributedDataParallel` has similar graphs on all GPUs and can + perform reduction properly. 
+ + Args: + densepose_predictor_outputs: DensePose predictor outputs, an object + of a dataclass that is assumed to have the following attributes: + * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S] + * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S] + * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S] + Return: + dict: str -> tensor: dict of losses with the following entries: + * `loss_densepose_U`: has value 0 + * `loss_densepose_V`: has value 0 + * `loss_densepose_I`: has value 0 + """ + conf_type = self.confidence_model_cfg.uv_confidence.type + if self.confidence_model_cfg.uv_confidence.enabled: + loss_uv = ( + densepose_predictor_outputs.u.sum() + densepose_predictor_outputs.v.sum() + ) * 0 + if conf_type == DensePoseUVConfidenceType.IID_ISO: + loss_uv += densepose_predictor_outputs.sigma_2.sum() * 0 + elif conf_type == DensePoseUVConfidenceType.INDEP_ANISO: + loss_uv += ( + densepose_predictor_outputs.sigma_2.sum() + + densepose_predictor_outputs.kappa_u.sum() + + densepose_predictor_outputs.kappa_v.sum() + ) * 0 + return {"loss_densepose_UV": loss_uv} + else: + return super().produce_fake_densepose_losses_uv(densepose_predictor_outputs) + + def produce_densepose_losses_uv( + self, + proposals_with_gt: List[Instances], + densepose_predictor_outputs: Any, + packed_annotations: Any, + interpolator: BilinearInterpolationHelper, + j_valid_fg: torch.Tensor, + ) -> LossDict: + conf_type = self.confidence_model_cfg.uv_confidence.type + if self.confidence_model_cfg.uv_confidence.enabled: + u_gt = packed_annotations.u_gt[j_valid_fg] + u_est = interpolator.extract_at_points(densepose_predictor_outputs.u)[j_valid_fg] + v_gt = packed_annotations.v_gt[j_valid_fg] + v_est = interpolator.extract_at_points(densepose_predictor_outputs.v)[j_valid_fg] + sigma_2_est = interpolator.extract_at_points(densepose_predictor_outputs.sigma_2)[ + j_valid_fg + ] + if conf_type == DensePoseUVConfidenceType.IID_ISO: + 
return { + "loss_densepose_UV": ( + self.uv_loss_with_confidences(u_est, v_est, sigma_2_est, u_gt, v_gt) + * self.w_points + ) + } + elif conf_type in [DensePoseUVConfidenceType.INDEP_ANISO]: + kappa_u_est = interpolator.extract_at_points(densepose_predictor_outputs.kappa_u)[ + j_valid_fg + ] + kappa_v_est = interpolator.extract_at_points(densepose_predictor_outputs.kappa_v)[ + j_valid_fg + ] + return { + "loss_densepose_UV": ( + self.uv_loss_with_confidences( + u_est, v_est, sigma_2_est, kappa_u_est, kappa_v_est, u_gt, v_gt + ) + * self.w_points + ) + } + return super().produce_densepose_losses_uv( + proposals_with_gt, + densepose_predictor_outputs, + packed_annotations, + interpolator, + j_valid_fg, + ) + + +class IIDIsotropicGaussianUVLoss(nn.Module): + """ + Loss for the case of iid residuals with isotropic covariance: + $Sigma_i = sigma_i^2 I$ + The loss (negative log likelihood) is then: + $1/2 sum_{i=1}^n (log(2 pi) + 2 log sigma_i^2 + ||delta_i||^2 / sigma_i^2)$, + where $delta_i=(u - u', v - v')$ is a 2D vector containing UV coordinates + difference between estimated and ground truth UV values + For details, see: + N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning + Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019 + """ + + def __init__(self, sigma_lower_bound: float): + super(IIDIsotropicGaussianUVLoss, self).__init__() + self.sigma_lower_bound = sigma_lower_bound + self.log2pi = math.log(2 * math.pi) + + def forward( + self, + u: torch.Tensor, + v: torch.Tensor, + sigma_u: torch.Tensor, + target_u: torch.Tensor, + target_v: torch.Tensor, + ): + # compute $\sigma_i^2$ + # use sigma_lower_bound to avoid degenerate solution for variance + # (sigma -> 0) + sigma2 = F.softplus(sigma_u) + self.sigma_lower_bound + # compute \|delta_i\|^2 + # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and `int`. 
+ delta_t_delta = (u - target_u) ** 2 + (v - target_v) ** 2 + # the total loss from the formula above: + loss = 0.5 * (self.log2pi + 2 * torch.log(sigma2) + delta_t_delta / sigma2) + return loss.sum() + + +class IndepAnisotropicGaussianUVLoss(nn.Module): + """ + Loss for the case of independent residuals with anisotropic covariances: + $Sigma_i = sigma_i^2 I + r_i r_i^T$ + The loss (negative log likelihood) is then: + $1/2 sum_{i=1}^n (log(2 pi) + + log sigma_i^2 (sigma_i^2 + ||r_i||^2) + + ||delta_i||^2 / sigma_i^2 + - ^2 / (sigma_i^2 * (sigma_i^2 + ||r_i||^2)))$, + where $delta_i=(u - u', v - v')$ is a 2D vector containing UV coordinates + difference between estimated and ground truth UV values + For details, see: + N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning + Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019 + """ + + def __init__(self, sigma_lower_bound: float): + super(IndepAnisotropicGaussianUVLoss, self).__init__() + self.sigma_lower_bound = sigma_lower_bound + self.log2pi = math.log(2 * math.pi) + + def forward( + self, + u: torch.Tensor, + v: torch.Tensor, + sigma_u: torch.Tensor, + kappa_u_est: torch.Tensor, + kappa_v_est: torch.Tensor, + target_u: torch.Tensor, + target_v: torch.Tensor, + ): + # compute $\sigma_i^2$ + sigma2 = F.softplus(sigma_u) + self.sigma_lower_bound + # compute \|r_i\|^2 + # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and `int`. + r_sqnorm2 = kappa_u_est**2 + kappa_v_est**2 + delta_u = u - target_u + delta_v = v - target_v + # compute \|delta_i\|^2 + # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and `int`. + delta_sqnorm = delta_u**2 + delta_v**2 + delta_u_r_u = delta_u * kappa_u_est + delta_v_r_v = delta_v * kappa_v_est + # compute the scalar product + delta_r = delta_u_r_u + delta_v_r_v + # compute squared scalar product ^2 + # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and `int`. 
+ delta_r_sqnorm = delta_r**2 + denom2 = sigma2 * (sigma2 + r_sqnorm2) + loss = 0.5 * ( + self.log2pi + torch.log(denom2) + delta_sqnorm / sigma2 - delta_r_sqnorm / denom2 + ) + return loss.sum() diff --git a/densepose/modeling/losses/cse.py b/densepose/modeling/losses/cse.py new file mode 100644 index 0000000000000000000000000000000000000000..dd561ad518f42c769fd9a5c8517409ddc33edf6f --- /dev/null +++ b/densepose/modeling/losses/cse.py @@ -0,0 +1,115 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from typing import Any, List +from torch import nn + +from detectron2.config import CfgNode +from detectron2.structures import Instances + +from .cycle_pix2shape import PixToShapeCycleLoss +from .cycle_shape2shape import ShapeToShapeCycleLoss +from .embed import EmbeddingLoss +from .embed_utils import CseAnnotationsAccumulator +from .mask_or_segm import MaskOrSegmentationLoss +from .registry import DENSEPOSE_LOSS_REGISTRY +from .soft_embed import SoftEmbeddingLoss +from .utils import BilinearInterpolationHelper, LossDict, extract_packed_annotations_from_matches + + +@DENSEPOSE_LOSS_REGISTRY.register() +class DensePoseCseLoss: + """ """ + + _EMBED_LOSS_REGISTRY = { + EmbeddingLoss.__name__: EmbeddingLoss, + SoftEmbeddingLoss.__name__: SoftEmbeddingLoss, + } + + def __init__(self, cfg: CfgNode): + """ + Initialize CSE loss from configuration options + + Args: + cfg (CfgNode): configuration options + """ + self.w_segm = cfg.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS + self.w_embed = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_LOSS_WEIGHT + self.segm_loss = MaskOrSegmentationLoss(cfg) + self.embed_loss = DensePoseCseLoss.create_embed_loss(cfg) + self.do_shape2shape = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.ENABLED + if self.do_shape2shape: + self.w_shape2shape = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.WEIGHT + self.shape2shape_loss = ShapeToShapeCycleLoss(cfg) + self.do_pix2shape = 
cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.ENABLED + if self.do_pix2shape: + self.w_pix2shape = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.WEIGHT + self.pix2shape_loss = PixToShapeCycleLoss(cfg) + + @classmethod + def create_embed_loss(cls, cfg: CfgNode): + # registry not used here, since embedding losses are currently local + # and are not used anywhere else + return cls._EMBED_LOSS_REGISTRY[cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_LOSS_NAME](cfg) + + def __call__( + self, + proposals_with_gt: List[Instances], + densepose_predictor_outputs: Any, + embedder: nn.Module, + ) -> LossDict: + if not len(proposals_with_gt): + return self.produce_fake_losses(densepose_predictor_outputs, embedder) + accumulator = CseAnnotationsAccumulator() + packed_annotations = extract_packed_annotations_from_matches(proposals_with_gt, accumulator) + if packed_annotations is None: + return self.produce_fake_losses(densepose_predictor_outputs, embedder) + h, w = densepose_predictor_outputs.embedding.shape[2:] + interpolator = BilinearInterpolationHelper.from_matches( + packed_annotations, + (h, w), + ) + meshid_to_embed_losses = self.embed_loss( + proposals_with_gt, + densepose_predictor_outputs, + packed_annotations, + interpolator, + embedder, + ) + embed_loss_dict = { + f"loss_densepose_E{meshid}": self.w_embed * meshid_to_embed_losses[meshid] + for meshid in meshid_to_embed_losses + } + all_loss_dict = { + "loss_densepose_S": self.w_segm + * self.segm_loss(proposals_with_gt, densepose_predictor_outputs, packed_annotations), + **embed_loss_dict, + } + if self.do_shape2shape: + all_loss_dict["loss_shape2shape"] = self.w_shape2shape * self.shape2shape_loss(embedder) + if self.do_pix2shape: + all_loss_dict["loss_pix2shape"] = self.w_pix2shape * self.pix2shape_loss( + proposals_with_gt, densepose_predictor_outputs, packed_annotations, embedder + ) + return all_loss_dict + + def produce_fake_losses( + self, densepose_predictor_outputs: Any, embedder: nn.Module + ) 
-> LossDict: + meshname_to_embed_losses = self.embed_loss.fake_values( + densepose_predictor_outputs, embedder=embedder + ) + embed_loss_dict = { + f"loss_densepose_E{mesh_name}": meshname_to_embed_losses[mesh_name] + for mesh_name in meshname_to_embed_losses + } + all_loss_dict = { + "loss_densepose_S": self.segm_loss.fake_value(densepose_predictor_outputs), + **embed_loss_dict, + } + if self.do_shape2shape: + all_loss_dict["loss_shape2shape"] = self.shape2shape_loss.fake_value(embedder) + if self.do_pix2shape: + all_loss_dict["loss_pix2shape"] = self.pix2shape_loss.fake_value( + densepose_predictor_outputs, embedder + ) + return all_loss_dict diff --git a/densepose/modeling/losses/cycle_pix2shape.py b/densepose/modeling/losses/cycle_pix2shape.py new file mode 100644 index 0000000000000000000000000000000000000000..e305d29850ef04a712a0a3e7bdbffba887257777 --- /dev/null +++ b/densepose/modeling/losses/cycle_pix2shape.py @@ -0,0 +1,152 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +from typing import Any, List +import torch +from torch import nn +from torch.nn import functional as F + +from detectron2.config import CfgNode +from detectron2.structures import Instances + +from densepose.data.meshes.catalog import MeshCatalog +from densepose.modeling.cse.utils import normalize_embeddings, squared_euclidean_distance_matrix + +from .embed_utils import PackedCseAnnotations +from .mask import extract_data_for_mask_loss_from_matches + + +def _create_pixel_dist_matrix(grid_size: int) -> torch.Tensor: + rows = torch.arange(grid_size) + cols = torch.arange(grid_size) + # at index `i` contains [row, col], where + # row = i // grid_size + # col = i % grid_size + pix_coords = ( + torch.stack(torch.meshgrid(rows, cols), -1).reshape((grid_size * grid_size, 2)).float() + ) + return squared_euclidean_distance_matrix(pix_coords, pix_coords) + + +def _sample_fg_pixels_randperm(fg_mask: torch.Tensor, sample_size: int) -> torch.Tensor: + fg_mask_flattened = fg_mask.reshape((-1,)) + num_pixels = int(fg_mask_flattened.sum().item()) + fg_pixel_indices = fg_mask_flattened.nonzero(as_tuple=True)[0] + if (sample_size <= 0) or (num_pixels <= sample_size): + return fg_pixel_indices + sample_indices = torch.randperm(num_pixels, device=fg_mask.device)[:sample_size] + return fg_pixel_indices[sample_indices] + + +def _sample_fg_pixels_multinomial(fg_mask: torch.Tensor, sample_size: int) -> torch.Tensor: + fg_mask_flattened = fg_mask.reshape((-1,)) + num_pixels = int(fg_mask_flattened.sum().item()) + if (sample_size <= 0) or (num_pixels <= sample_size): + return fg_mask_flattened.nonzero(as_tuple=True)[0] + return fg_mask_flattened.float().multinomial(sample_size, replacement=False) + + +class PixToShapeCycleLoss(nn.Module): + """ + Cycle loss for pixel-vertex correspondence + """ + + def __init__(self, cfg: CfgNode): + super().__init__() + self.shape_names = list(cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDERS.keys()) + self.embed_size = 
cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE + self.norm_p = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.NORM_P + self.use_all_meshes_not_gt_only = ( + cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.USE_ALL_MESHES_NOT_GT_ONLY + ) + self.num_pixels_to_sample = ( + cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.NUM_PIXELS_TO_SAMPLE + ) + self.pix_sigma = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.PIXEL_SIGMA + self.temperature_pix_to_vertex = ( + cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.TEMPERATURE_PIXEL_TO_VERTEX + ) + self.temperature_vertex_to_pix = ( + cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.TEMPERATURE_VERTEX_TO_PIXEL + ) + self.pixel_dists = _create_pixel_dist_matrix(cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE) + + def forward( + self, + proposals_with_gt: List[Instances], + densepose_predictor_outputs: Any, + packed_annotations: PackedCseAnnotations, + embedder: nn.Module, + ): + """ + Args: + proposals_with_gt (list of Instances): detections with associated + ground truth data; each item corresponds to instances detected + on 1 image; the number of items corresponds to the number of + images in a batch + densepose_predictor_outputs: an object of a dataclass that contains predictor + outputs with estimated values; assumed to have the following attributes: + * embedding - embedding estimates, tensor of shape [N, D, S, S], where + N = number of instances (= sum N_i, where N_i is the number of + instances on image i) + D = embedding space dimensionality (MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE) + S = output size (width and height) + packed_annotations (PackedCseAnnotations): contains various data useful + for loss computation, each data is packed into a single tensor + embedder (nn.Module): module that computes vertex embeddings for different meshes + """ + pix_embeds = densepose_predictor_outputs.embedding + if self.pixel_dists.device != pix_embeds.device: + # should normally be done 
only once + self.pixel_dists = self.pixel_dists.to(device=pix_embeds.device) + with torch.no_grad(): + mask_loss_data = extract_data_for_mask_loss_from_matches( + proposals_with_gt, densepose_predictor_outputs.coarse_segm + ) + # GT masks - tensor of shape [N, S, S] of int64 + masks_gt = mask_loss_data.masks_gt.long() # pyre-ignore[16] + assert len(pix_embeds) == len(masks_gt), ( + f"Number of instances with embeddings {len(pix_embeds)} != " + f"number of instances with GT masks {len(masks_gt)}" + ) + losses = [] + mesh_names = ( + self.shape_names + if self.use_all_meshes_not_gt_only + else [ + MeshCatalog.get_mesh_name(mesh_id.item()) + for mesh_id in packed_annotations.vertex_mesh_ids_gt.unique() + ] + ) + for pixel_embeddings, mask_gt in zip(pix_embeds, masks_gt): + # pixel_embeddings [D, S, S] + # mask_gt [S, S] + for mesh_name in mesh_names: + mesh_vertex_embeddings = embedder(mesh_name) + # pixel indices [M] + pixel_indices_flattened = _sample_fg_pixels_randperm( + mask_gt, self.num_pixels_to_sample + ) + # pixel distances [M, M] + pixel_dists = self.pixel_dists.to(pixel_embeddings.device)[ + torch.meshgrid(pixel_indices_flattened, pixel_indices_flattened) + ] + # pixel embeddings [M, D] + pixel_embeddings_sampled = normalize_embeddings( + pixel_embeddings.reshape((self.embed_size, -1))[:, pixel_indices_flattened].T + ) + # pixel-vertex similarity [M, K] + sim_matrix = pixel_embeddings_sampled.mm(mesh_vertex_embeddings.T) + c_pix_vertex = F.softmax(sim_matrix / self.temperature_pix_to_vertex, dim=1) + c_vertex_pix = F.softmax(sim_matrix.T / self.temperature_vertex_to_pix, dim=1) + c_cycle = c_pix_vertex.mm(c_vertex_pix) + loss_cycle = torch.norm(pixel_dists * c_cycle, p=self.norm_p) + losses.append(loss_cycle) + + if len(losses) == 0: + return pix_embeds.sum() * 0 + return torch.stack(losses, dim=0).mean() + + def fake_value(self, densepose_predictor_outputs: Any, embedder: nn.Module): + losses = [embedder(mesh_name).sum() * 0 for mesh_name in 
embedder.mesh_names] + losses.append(densepose_predictor_outputs.embedding.sum() * 0) + return torch.mean(torch.stack(losses)) diff --git a/densepose/modeling/losses/cycle_shape2shape.py b/densepose/modeling/losses/cycle_shape2shape.py new file mode 100644 index 0000000000000000000000000000000000000000..f71dbab7c5bd7484cd9001c3c15059971ff0f0cf --- /dev/null +++ b/densepose/modeling/losses/cycle_shape2shape.py @@ -0,0 +1,117 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import random +from typing import Tuple +import torch +from torch import nn +from torch.nn import functional as F + +from detectron2.config import CfgNode + +from densepose.structures.mesh import create_mesh + +from .utils import sample_random_indices + + +class ShapeToShapeCycleLoss(nn.Module): + """ + Cycle Loss for Shapes. + Inspired by: + "Mapping in a Cycle: Sinkhorn Regularized Unsupervised Learning for Point Cloud Shapes". + """ + + def __init__(self, cfg: CfgNode): + super().__init__() + self.shape_names = list(cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDERS.keys()) + self.all_shape_pairs = [ + (x, y) for i, x in enumerate(self.shape_names) for y in self.shape_names[i + 1 :] + ] + random.shuffle(self.all_shape_pairs) + self.cur_pos = 0 + self.norm_p = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.NORM_P + self.temperature = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.TEMPERATURE + self.max_num_vertices = ( + cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.MAX_NUM_VERTICES + ) + + def _sample_random_pair(self) -> Tuple[str, str]: + """ + Produce a random pair of different mesh names + + Return: + tuple(str, str): a pair of different mesh names + """ + if self.cur_pos >= len(self.all_shape_pairs): + random.shuffle(self.all_shape_pairs) + self.cur_pos = 0 + shape_pair = self.all_shape_pairs[self.cur_pos] + self.cur_pos += 1 + return shape_pair + + def forward(self, embedder: nn.Module): + """ + Do a forward pass with a random pair 
(src, dst) pair of shapes + Args: + embedder (nn.Module): module that computes vertex embeddings for different meshes + """ + src_mesh_name, dst_mesh_name = self._sample_random_pair() + return self._forward_one_pair(embedder, src_mesh_name, dst_mesh_name) + + def fake_value(self, embedder: nn.Module): + losses = [] + for mesh_name in embedder.mesh_names: + losses.append(embedder(mesh_name).sum() * 0) + return torch.mean(torch.stack(losses)) + + def _get_embeddings_and_geodists_for_mesh( + self, embedder: nn.Module, mesh_name: str + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Produces embeddings and geodesic distance tensors for a given mesh. May subsample + the mesh, if it contains too many vertices (controlled by + SHAPE_CYCLE_LOSS_MAX_NUM_VERTICES parameter). + Args: + embedder (nn.Module): module that computes embeddings for mesh vertices + mesh_name (str): mesh name + Return: + embeddings (torch.Tensor of size [N, D]): embeddings for selected mesh + vertices (N = number of selected vertices, D = embedding space dim) + geodists (torch.Tensor of size [N, N]): geodesic distances for the selected + mesh vertices (N = number of selected vertices) + """ + embeddings = embedder(mesh_name) + indices = sample_random_indices( + embeddings.shape[0], self.max_num_vertices, embeddings.device + ) + mesh = create_mesh(mesh_name, embeddings.device) + geodists = mesh.geodists + if indices is not None: + embeddings = embeddings[indices] + geodists = geodists[torch.meshgrid(indices, indices)] + return embeddings, geodists + + def _forward_one_pair( + self, embedder: nn.Module, mesh_name_1: str, mesh_name_2: str + ) -> torch.Tensor: + """ + Do a forward pass with a selected pair of meshes + Args: + embedder (nn.Module): module that computes vertex embeddings for different meshes + mesh_name_1 (str): first mesh name + mesh_name_2 (str): second mesh name + Return: + Tensor containing the loss value + """ + embeddings_1, geodists_1 = 
self._get_embeddings_and_geodists_for_mesh(embedder, mesh_name_1) + embeddings_2, geodists_2 = self._get_embeddings_and_geodists_for_mesh(embedder, mesh_name_2) + sim_matrix_12 = embeddings_1.mm(embeddings_2.T) + + c_12 = F.softmax(sim_matrix_12 / self.temperature, dim=1) + c_21 = F.softmax(sim_matrix_12.T / self.temperature, dim=1) + c_11 = c_12.mm(c_21) + c_22 = c_21.mm(c_12) + + loss_cycle_11 = torch.norm(geodists_1 * c_11, p=self.norm_p) + loss_cycle_22 = torch.norm(geodists_2 * c_22, p=self.norm_p) + + return loss_cycle_11 + loss_cycle_22 diff --git a/densepose/modeling/losses/embed.py b/densepose/modeling/losses/embed.py new file mode 100644 index 0000000000000000000000000000000000000000..1e3a069763ca6fab0acc7c455b416b9634ceaedf --- /dev/null +++ b/densepose/modeling/losses/embed.py @@ -0,0 +1,119 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from typing import Any, Dict, List +import torch +from torch import nn +from torch.nn import functional as F + +from detectron2.config import CfgNode +from detectron2.structures import Instances + +from densepose.data.meshes.catalog import MeshCatalog +from densepose.modeling.cse.utils import normalize_embeddings, squared_euclidean_distance_matrix + +from .embed_utils import PackedCseAnnotations +from .utils import BilinearInterpolationHelper + + +class EmbeddingLoss: + """ + Computes losses for estimated embeddings given annotated vertices. + Instances in a minibatch that correspond to the same mesh are grouped + together. For each group, loss is computed as cross-entropy for + unnormalized scores given ground truth mesh vertex ids. + Scores are based on squared distances between estimated vertex embeddings + and mesh vertex embeddings. 
+ """ + + def __init__(self, cfg: CfgNode): + """ + Initialize embedding loss from config + """ + self.embdist_gauss_sigma = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDING_DIST_GAUSS_SIGMA + + def __call__( + self, + proposals_with_gt: List[Instances], + densepose_predictor_outputs: Any, + packed_annotations: PackedCseAnnotations, + interpolator: BilinearInterpolationHelper, + embedder: nn.Module, + ) -> Dict[int, torch.Tensor]: + """ + Produces losses for estimated embeddings given annotated vertices. + Embeddings for all the vertices of a mesh are computed by the embedder. + Embeddings for observed pixels are estimated by a predictor. + Losses are computed as cross-entropy for squared distances between + observed vertex embeddings and all mesh vertex embeddings given + ground truth vertex IDs. + + Args: + proposals_with_gt (list of Instances): detections with associated + ground truth data; each item corresponds to instances detected + on 1 image; the number of items corresponds to the number of + images in a batch + densepose_predictor_outputs: an object of a dataclass that contains predictor + outputs with estimated values; assumed to have the following attributes: + * embedding - embedding estimates, tensor of shape [N, D, S, S], where + N = number of instances (= sum N_i, where N_i is the number of + instances on image i) + D = embedding space dimensionality (MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE) + S = output size (width and height) + packed_annotations (PackedCseAnnotations): contains various data useful + for loss computation, each data is packed into a single tensor + interpolator (BilinearInterpolationHelper): bilinear interpolation helper + embedder (nn.Module): module that computes vertex embeddings for different meshes + Return: + dict(int -> tensor): losses for different mesh IDs + """ + losses = {} + for mesh_id_tensor in packed_annotations.vertex_mesh_ids_gt.unique(): + mesh_id = mesh_id_tensor.item() + mesh_name = MeshCatalog.get_mesh_name(mesh_id) 
+ # valid points are those that fall into estimated bbox + # and correspond to the current mesh + j_valid = interpolator.j_valid * ( # pyre-ignore[16] + packed_annotations.vertex_mesh_ids_gt == mesh_id + ) + if not torch.any(j_valid): + continue + # extract estimated embeddings for valid points + # -> tensor [J, D] + vertex_embeddings_i = normalize_embeddings( + interpolator.extract_at_points( + densepose_predictor_outputs.embedding, + slice_fine_segm=slice(None), + w_ylo_xlo=interpolator.w_ylo_xlo[:, None], # pyre-ignore[16] + w_ylo_xhi=interpolator.w_ylo_xhi[:, None], # pyre-ignore[16] + w_yhi_xlo=interpolator.w_yhi_xlo[:, None], # pyre-ignore[16] + w_yhi_xhi=interpolator.w_yhi_xhi[:, None], # pyre-ignore[16] + )[j_valid, :] + ) + # extract vertex ids for valid points + # -> tensor [J] + vertex_indices_i = packed_annotations.vertex_ids_gt[j_valid] + # embeddings for all mesh vertices + # -> tensor [K, D] + mesh_vertex_embeddings = embedder(mesh_name) + # unnormalized scores for valid points + # -> tensor [J, K] + scores = squared_euclidean_distance_matrix( + vertex_embeddings_i, mesh_vertex_embeddings + ) / (-self.embdist_gauss_sigma) + losses[mesh_name] = F.cross_entropy(scores, vertex_indices_i, ignore_index=-1) + + for mesh_name in embedder.mesh_names: + if mesh_name not in losses: + losses[mesh_name] = self.fake_value( + densepose_predictor_outputs, embedder, mesh_name + ) + return losses + + def fake_values(self, densepose_predictor_outputs: Any, embedder: nn.Module): + losses = {} + for mesh_name in embedder.mesh_names: + losses[mesh_name] = self.fake_value(densepose_predictor_outputs, embedder, mesh_name) + return losses + + def fake_value(self, densepose_predictor_outputs: Any, embedder: nn.Module, mesh_name: str): + return densepose_predictor_outputs.embedding.sum() * 0 + embedder(mesh_name).sum() * 0 diff --git a/densepose/modeling/losses/embed_utils.py b/densepose/modeling/losses/embed_utils.py new file mode 100644 index 
0000000000000000000000000000000000000000..f2ca16fd3809b89e1c05636242a84d02d3a42d88 --- /dev/null +++ b/densepose/modeling/losses/embed_utils.py @@ -0,0 +1,137 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from dataclasses import dataclass +from typing import Any, Optional +import torch + +from detectron2.structures import BoxMode, Instances + +from .utils import AnnotationsAccumulator + + +@dataclass +class PackedCseAnnotations: + x_gt: torch.Tensor + y_gt: torch.Tensor + coarse_segm_gt: Optional[torch.Tensor] + vertex_mesh_ids_gt: torch.Tensor + vertex_ids_gt: torch.Tensor + bbox_xywh_gt: torch.Tensor + bbox_xywh_est: torch.Tensor + point_bbox_with_dp_indices: torch.Tensor + point_bbox_indices: torch.Tensor + bbox_indices: torch.Tensor + + +class CseAnnotationsAccumulator(AnnotationsAccumulator): + """ + Accumulates annotations by batches that correspond to objects detected on + individual images. Can pack them together into single tensors. + """ + + def __init__(self): + self.x_gt = [] + self.y_gt = [] + self.s_gt = [] + self.vertex_mesh_ids_gt = [] + self.vertex_ids_gt = [] + self.bbox_xywh_gt = [] + self.bbox_xywh_est = [] + self.point_bbox_with_dp_indices = [] + self.point_bbox_indices = [] + self.bbox_indices = [] + self.nxt_bbox_with_dp_index = 0 + self.nxt_bbox_index = 0 + + def accumulate(self, instances_one_image: Instances): + """ + Accumulate instances data for one image + + Args: + instances_one_image (Instances): instances data to accumulate + """ + boxes_xywh_est = BoxMode.convert( + instances_one_image.proposal_boxes.tensor.clone(), BoxMode.XYXY_ABS, BoxMode.XYWH_ABS + ) + boxes_xywh_gt = BoxMode.convert( + instances_one_image.gt_boxes.tensor.clone(), BoxMode.XYXY_ABS, BoxMode.XYWH_ABS + ) + n_matches = len(boxes_xywh_gt) + assert n_matches == len( + boxes_xywh_est + ), f"Got {len(boxes_xywh_est)} proposal boxes and {len(boxes_xywh_gt)} GT boxes" + if not n_matches: + # no detection - GT matches + return + if ( + not 
hasattr(instances_one_image, "gt_densepose") + or instances_one_image.gt_densepose is None + ): + # no densepose GT for the detections, just increase the bbox index + self.nxt_bbox_index += n_matches + return + for box_xywh_est, box_xywh_gt, dp_gt in zip( + boxes_xywh_est, boxes_xywh_gt, instances_one_image.gt_densepose + ): + if (dp_gt is not None) and (len(dp_gt.x) > 0): + # pyre-fixme[6]: For 1st argument expected `Tensor` but got `float`. + # pyre-fixme[6]: For 2nd argument expected `Tensor` but got `float`. + self._do_accumulate(box_xywh_gt, box_xywh_est, dp_gt) + self.nxt_bbox_index += 1 + + def _do_accumulate(self, box_xywh_gt: torch.Tensor, box_xywh_est: torch.Tensor, dp_gt: Any): + """ + Accumulate instances data for one image, given that the data is not empty + + Args: + box_xywh_gt (tensor): GT bounding box + box_xywh_est (tensor): estimated bounding box + dp_gt: GT densepose data with the following attributes: + - x: normalized X coordinates + - y: normalized Y coordinates + - segm: tensor of size [S, S] with coarse segmentation + - + """ + self.x_gt.append(dp_gt.x) + self.y_gt.append(dp_gt.y) + if hasattr(dp_gt, "segm"): + self.s_gt.append(dp_gt.segm.unsqueeze(0)) + self.vertex_ids_gt.append(dp_gt.vertex_ids) + self.vertex_mesh_ids_gt.append(torch.full_like(dp_gt.vertex_ids, dp_gt.mesh_id)) + self.bbox_xywh_gt.append(box_xywh_gt.view(-1, 4)) + self.bbox_xywh_est.append(box_xywh_est.view(-1, 4)) + self.point_bbox_with_dp_indices.append( + torch.full_like(dp_gt.vertex_ids, self.nxt_bbox_with_dp_index) + ) + self.point_bbox_indices.append(torch.full_like(dp_gt.vertex_ids, self.nxt_bbox_index)) + self.bbox_indices.append(self.nxt_bbox_index) + self.nxt_bbox_with_dp_index += 1 + + def pack(self) -> Optional[PackedCseAnnotations]: + """ + Pack data into tensors + """ + if not len(self.x_gt): + # TODO: + # returning proper empty annotations would require + # creating empty tensors of appropriate shape and + # type on an appropriate device; + # we return None 
so far to indicate empty annotations + return None + return PackedCseAnnotations( + x_gt=torch.cat(self.x_gt, 0), + y_gt=torch.cat(self.y_gt, 0), + vertex_mesh_ids_gt=torch.cat(self.vertex_mesh_ids_gt, 0), + vertex_ids_gt=torch.cat(self.vertex_ids_gt, 0), + # ignore segmentation annotations, if not all the instances contain those + coarse_segm_gt=torch.cat(self.s_gt, 0) + if len(self.s_gt) == len(self.bbox_xywh_gt) + else None, + bbox_xywh_gt=torch.cat(self.bbox_xywh_gt, 0), + bbox_xywh_est=torch.cat(self.bbox_xywh_est, 0), + point_bbox_with_dp_indices=torch.cat(self.point_bbox_with_dp_indices, 0), + point_bbox_indices=torch.cat(self.point_bbox_indices, 0), + bbox_indices=torch.as_tensor( + self.bbox_indices, dtype=torch.long, device=self.x_gt[0].device + ), + ) diff --git a/densepose/modeling/losses/mask.py b/densepose/modeling/losses/mask.py new file mode 100644 index 0000000000000000000000000000000000000000..c16b15c53de9f02dc734148e05f2bde799046aa0 --- /dev/null +++ b/densepose/modeling/losses/mask.py @@ -0,0 +1,125 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from dataclasses import dataclass +from typing import Any, Iterable, List, Optional +import torch +from torch.nn import functional as F + +from detectron2.structures import Instances + + +@dataclass +class DataForMaskLoss: + """ + Contains mask GT and estimated data for proposals from multiple images: + """ + + # tensor of size (K, H, W) containing GT labels + masks_gt: Optional[torch.Tensor] = None + # tensor of size (K, C, H, W) containing estimated scores + masks_est: Optional[torch.Tensor] = None + + +def extract_data_for_mask_loss_from_matches( + proposals_targets: Iterable[Instances], estimated_segm: torch.Tensor +) -> DataForMaskLoss: + """ + Extract data for mask loss from instances that contain matched GT and + estimated bounding boxes. 
+ Args: + proposals_targets: Iterable[Instances] + matched GT and estimated results, each item in the iterable + corresponds to data in 1 image + estimated_segm: tensor(K, C, S, S) of float - raw unnormalized + segmentation scores, here S is the size to which GT masks are + to be resized + Return: + masks_est: tensor(K, C, S, S) of float - class scores + masks_gt: tensor(K, S, S) of int64 - labels + """ + data = DataForMaskLoss() + masks_gt = [] + offset = 0 + assert estimated_segm.shape[2] == estimated_segm.shape[3], ( + f"Expected estimated segmentation to have a square shape, " + f"but the actual shape is {estimated_segm.shape[2:]}" + ) + mask_size = estimated_segm.shape[2] + num_proposals = sum(inst.proposal_boxes.tensor.size(0) for inst in proposals_targets) + num_estimated = estimated_segm.shape[0] + assert ( + num_proposals == num_estimated + ), "The number of proposals {} must be equal to the number of estimates {}".format( + num_proposals, num_estimated + ) + + for proposals_targets_per_image in proposals_targets: + n_i = proposals_targets_per_image.proposal_boxes.tensor.size(0) + if not n_i: + continue + gt_masks_per_image = proposals_targets_per_image.gt_masks.crop_and_resize( + proposals_targets_per_image.proposal_boxes.tensor, mask_size + ).to(device=estimated_segm.device) + masks_gt.append(gt_masks_per_image) + offset += n_i + if masks_gt: + data.masks_est = estimated_segm + data.masks_gt = torch.cat(masks_gt, dim=0) + return data + + +class MaskLoss: + """ + Mask loss as cross-entropy for raw unnormalized scores given ground truth labels. + Mask ground truth labels are defined for the whole image and not only the + bounding box of interest. They are stored as objects that are assumed to implement + the `crop_and_resize` interface (e.g. BitMasks, PolygonMasks). 
+ """ + + def __call__( + self, proposals_with_gt: List[Instances], densepose_predictor_outputs: Any + ) -> torch.Tensor: + """ + Computes segmentation loss as cross-entropy for raw unnormalized + scores given ground truth labels. + + Args: + proposals_with_gt (list of Instances): detections with associated ground truth data + densepose_predictor_outputs: an object of a dataclass that contains predictor outputs + with estimated values; assumed to have the following attribute: + * coarse_segm (tensor of shape [N, D, S, S]): coarse segmentation estimates + as raw unnormalized scores + where N is the number of detections, S is the estimate size ( = width = height) + and D is the number of coarse segmentation channels. + Return: + Cross entropy for raw unnormalized scores for coarse segmentation given + ground truth labels from masks + """ + if not len(proposals_with_gt): + return self.fake_value(densepose_predictor_outputs) + # densepose outputs are computed for all images and all bounding boxes; + # i.e. if a batch has 4 images with (3, 1, 2, 1) proposals respectively, + # the outputs will have size(0) == 3+1+2+1 == 7 + with torch.no_grad(): + mask_loss_data = extract_data_for_mask_loss_from_matches( + proposals_with_gt, densepose_predictor_outputs.coarse_segm + ) + if (mask_loss_data.masks_gt is None) or (mask_loss_data.masks_est is None): + return self.fake_value(densepose_predictor_outputs) + return F.cross_entropy(mask_loss_data.masks_est, mask_loss_data.masks_gt.long()) + + def fake_value(self, densepose_predictor_outputs: Any) -> torch.Tensor: + """ + Fake segmentation loss used when no suitable ground truth data + was found in a batch. The loss has a value 0 and is primarily used to + construct the computation graph, so that `DistributedDataParallel` + has similar graphs on all GPUs and can perform reduction properly. 
+ + Args: + densepose_predictor_outputs: DensePose predictor outputs, an object + of a dataclass that is assumed to have `coarse_segm` + attribute + Return: + Zero value loss with proper computation graph + """ + return densepose_predictor_outputs.coarse_segm.sum() * 0 diff --git a/densepose/modeling/losses/mask_or_segm.py b/densepose/modeling/losses/mask_or_segm.py new file mode 100644 index 0000000000000000000000000000000000000000..98b773d99fd29a48cbdfa94c5882c9c3d94003ee --- /dev/null +++ b/densepose/modeling/losses/mask_or_segm.py @@ -0,0 +1,72 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from typing import Any, List +import torch + +from detectron2.config import CfgNode +from detectron2.structures import Instances + +from .mask import MaskLoss +from .segm import SegmentationLoss + + +class MaskOrSegmentationLoss: + """ + Mask or segmentation loss as cross-entropy for raw unnormalized scores + given ground truth labels. Ground truth labels are either defined by coarse + segmentation annotation, or by mask annotation, depending on the config + value MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS + """ + + def __init__(self, cfg: CfgNode): + """ + Initialize segmentation loss from configuration options + + Args: + cfg (CfgNode): configuration options + """ + self.segm_trained_by_masks = cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS + if self.segm_trained_by_masks: + self.mask_loss = MaskLoss() + self.segm_loss = SegmentationLoss(cfg) + + def __call__( + self, + proposals_with_gt: List[Instances], + densepose_predictor_outputs: Any, + packed_annotations: Any, + ) -> torch.Tensor: + """ + Compute segmentation loss as cross-entropy between aligned unnormalized + score estimates and ground truth; with ground truth given + either by masks, or by coarse segmentation annotations. 
+ + Args: + proposals_with_gt (list of Instances): detections with associated ground truth data + densepose_predictor_outputs: an object of a dataclass that contains predictor outputs + with estimated values; assumed to have the following attributes: + * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S] + packed_annotations: packed annotations for efficient loss computation + Return: + tensor: loss value as cross-entropy for raw unnormalized scores + given ground truth labels + """ + if self.segm_trained_by_masks: + return self.mask_loss(proposals_with_gt, densepose_predictor_outputs) + return self.segm_loss(proposals_with_gt, densepose_predictor_outputs, packed_annotations) + + def fake_value(self, densepose_predictor_outputs: Any) -> torch.Tensor: + """ + Fake segmentation loss used when no suitable ground truth data + was found in a batch. The loss has a value 0 and is primarily used to + construct the computation graph, so that `DistributedDataParallel` + has similar graphs on all GPUs and can perform reduction properly. + + Args: + densepose_predictor_outputs: DensePose predictor outputs, an object + of a dataclass that is assumed to have `coarse_segm` + attribute + Return: + Zero value loss with proper computation graph + """ + return densepose_predictor_outputs.coarse_segm.sum() * 0 diff --git a/densepose/modeling/losses/registry.py b/densepose/modeling/losses/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..d9c8817a743e42b2aec382818f0cc1bb39a66004 --- /dev/null +++ b/densepose/modeling/losses/registry.py @@ -0,0 +1,5 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+ +from detectron2.utils.registry import Registry + +DENSEPOSE_LOSS_REGISTRY = Registry("DENSEPOSE_LOSS") diff --git a/densepose/modeling/losses/segm.py b/densepose/modeling/losses/segm.py new file mode 100644 index 0000000000000000000000000000000000000000..1962b886e1946fa4896776da8a007ae0a9a4fab3 --- /dev/null +++ b/densepose/modeling/losses/segm.py @@ -0,0 +1,83 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from typing import Any, List +import torch +from torch.nn import functional as F + +from detectron2.config import CfgNode +from detectron2.structures import Instances + +from .utils import resample_data + + +class SegmentationLoss: + """ + Segmentation loss as cross-entropy for raw unnormalized scores given ground truth + labels. Segmentation ground truth labels are defined for the bounding box of + interest at some fixed resolution [S, S], where + S = MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE. + """ + + def __init__(self, cfg: CfgNode): + """ + Initialize segmentation loss from configuration options + + Args: + cfg (CfgNode): configuration options + """ + self.heatmap_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE + self.n_segm_chan = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS + + def __call__( + self, + proposals_with_gt: List[Instances], + densepose_predictor_outputs: Any, + packed_annotations: Any, + ) -> torch.Tensor: + """ + Compute segmentation loss as cross-entropy on aligned segmentation + ground truth and estimated scores. 
+ + Args: + proposals_with_gt (list of Instances): detections with associated ground truth data + densepose_predictor_outputs: an object of a dataclass that contains predictor outputs + with estimated values; assumed to have the following attributes: + * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S] + packed_annotations: packed annotations for efficient loss computation; + the following attributes are used: + - coarse_segm_gt + - bbox_xywh_gt + - bbox_xywh_est + """ + if packed_annotations.coarse_segm_gt is None: + return self.fake_value(densepose_predictor_outputs) + coarse_segm_est = densepose_predictor_outputs.coarse_segm[packed_annotations.bbox_indices] + with torch.no_grad(): + coarse_segm_gt = resample_data( + packed_annotations.coarse_segm_gt.unsqueeze(1), + packed_annotations.bbox_xywh_gt, + packed_annotations.bbox_xywh_est, + self.heatmap_size, + self.heatmap_size, + mode="nearest", + padding_mode="zeros", + ).squeeze(1) + if self.n_segm_chan == 2: + coarse_segm_gt = coarse_segm_gt > 0 + return F.cross_entropy(coarse_segm_est, coarse_segm_gt.long()) + + def fake_value(self, densepose_predictor_outputs: Any) -> torch.Tensor: + """ + Fake segmentation loss used when no suitable ground truth data + was found in a batch. The loss has a value 0 and is primarily used to + construct the computation graph, so that `DistributedDataParallel` + has similar graphs on all GPUs and can perform reduction properly. 
+ + Args: + densepose_predictor_outputs: DensePose predictor outputs, an object + of a dataclass that is assumed to have `coarse_segm` + attribute + Return: + Zero value loss with proper computation graph + """ + return densepose_predictor_outputs.coarse_segm.sum() * 0 diff --git a/densepose/modeling/losses/soft_embed.py b/densepose/modeling/losses/soft_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..03b69ec36a59ae0d69bb77efa77f93c6f95fad97 --- /dev/null +++ b/densepose/modeling/losses/soft_embed.py @@ -0,0 +1,133 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from typing import Any, Dict, List +import torch +from torch import nn +from torch.nn import functional as F + +from detectron2.config import CfgNode +from detectron2.structures import Instances + +from densepose.data.meshes.catalog import MeshCatalog +from densepose.modeling.cse.utils import normalize_embeddings, squared_euclidean_distance_matrix +from densepose.structures.mesh import create_mesh + +from .embed_utils import PackedCseAnnotations +from .utils import BilinearInterpolationHelper + + +class SoftEmbeddingLoss: + """ + Computes losses for estimated embeddings given annotated vertices. + Instances in a minibatch that correspond to the same mesh are grouped + together. For each group, loss is computed as cross-entropy for + unnormalized scores given ground truth mesh vertex ids. 
+ Scores are based on: + 1) squared distances between estimated vertex embeddings + and mesh vertex embeddings; + 2) geodesic distances between vertices of a mesh + """ + + def __init__(self, cfg: CfgNode): + """ + Initialize embedding loss from config + """ + self.embdist_gauss_sigma = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDING_DIST_GAUSS_SIGMA + self.geodist_gauss_sigma = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.GEODESIC_DIST_GAUSS_SIGMA + + def __call__( + self, + proposals_with_gt: List[Instances], + densepose_predictor_outputs: Any, + packed_annotations: PackedCseAnnotations, + interpolator: BilinearInterpolationHelper, + embedder: nn.Module, + ) -> Dict[int, torch.Tensor]: + """ + Produces losses for estimated embeddings given annotated vertices. + Embeddings for all the vertices of a mesh are computed by the embedder. + Embeddings for observed pixels are estimated by a predictor. + Losses are computed as cross-entropy for unnormalized scores given + ground truth vertex IDs. + 1) squared distances between estimated vertex embeddings + and mesh vertex embeddings; + 2) geodesic distances between vertices of a mesh + + Args: + proposals_with_gt (list of Instances): detections with associated + ground truth data; each item corresponds to instances detected + on 1 image; the number of items corresponds to the number of + images in a batch + densepose_predictor_outputs: an object of a dataclass that contains predictor + outputs with estimated values; assumed to have the following attributes: + * embedding - embedding estimates, tensor of shape [N, D, S, S], where + N = number of instances (= sum N_i, where N_i is the number of + instances on image i) + D = embedding space dimensionality (MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE) + S = output size (width and height) + packed_annotations (PackedCseAnnotations): contains various data useful + for loss computation, each data is packed into a single tensor + interpolator (BilinearInterpolationHelper): bilinear interpolation 
helper + embedder (nn.Module): module that computes vertex embeddings for different meshes + Return: + dict(int -> tensor): losses for different mesh IDs + """ + losses = {} + for mesh_id_tensor in packed_annotations.vertex_mesh_ids_gt.unique(): + mesh_id = mesh_id_tensor.item() + mesh_name = MeshCatalog.get_mesh_name(mesh_id) + # valid points are those that fall into estimated bbox + # and correspond to the current mesh + j_valid = interpolator.j_valid * ( # pyre-ignore[16] + packed_annotations.vertex_mesh_ids_gt == mesh_id + ) + if not torch.any(j_valid): + continue + # extract estimated embeddings for valid points + # -> tensor [J, D] + vertex_embeddings_i = normalize_embeddings( + interpolator.extract_at_points( + densepose_predictor_outputs.embedding, + slice_fine_segm=slice(None), + w_ylo_xlo=interpolator.w_ylo_xlo[:, None], # pyre-ignore[16] + w_ylo_xhi=interpolator.w_ylo_xhi[:, None], # pyre-ignore[16] + w_yhi_xlo=interpolator.w_yhi_xlo[:, None], # pyre-ignore[16] + w_yhi_xhi=interpolator.w_yhi_xhi[:, None], # pyre-ignore[16] + )[j_valid, :] + ) + # extract vertex ids for valid points + # -> tensor [J] + vertex_indices_i = packed_annotations.vertex_ids_gt[j_valid] + # embeddings for all mesh vertices + # -> tensor [K, D] + mesh_vertex_embeddings = embedder(mesh_name) + # softmax values of geodesic distances for GT mesh vertices + # -> tensor [J, K] + mesh = create_mesh(mesh_name, mesh_vertex_embeddings.device) + geodist_softmax_values = F.softmax( + mesh.geodists[vertex_indices_i] / (-self.geodist_gauss_sigma), dim=1 + ) + # logsoftmax values for valid points + # -> tensor [J, K] + embdist_logsoftmax_values = F.log_softmax( + squared_euclidean_distance_matrix(vertex_embeddings_i, mesh_vertex_embeddings) + / (-self.embdist_gauss_sigma), + dim=1, + ) + losses[mesh_name] = (-geodist_softmax_values * embdist_logsoftmax_values).sum(1).mean() + + for mesh_name in embedder.mesh_names: + if mesh_name not in losses: + losses[mesh_name] = self.fake_value( + 
densepose_predictor_outputs, embedder, mesh_name + ) + return losses + + def fake_values(self, densepose_predictor_outputs: Any, embedder: nn.Module): + losses = {} + for mesh_name in embedder.mesh_names: + losses[mesh_name] = self.fake_value(densepose_predictor_outputs, embedder, mesh_name) + return losses + + def fake_value(self, densepose_predictor_outputs: Any, embedder: nn.Module, mesh_name: str): + return densepose_predictor_outputs.embedding.sum() * 0 + embedder(mesh_name).sum() * 0 diff --git a/densepose/modeling/losses/utils.py b/densepose/modeling/losses/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ceea981d11650af80cb007fe129a3ee4864fc48f --- /dev/null +++ b/densepose/modeling/losses/utils.py @@ -0,0 +1,443 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple +import torch +from torch.nn import functional as F + +from detectron2.structures import BoxMode, Instances + +from densepose import DensePoseDataRelative + +LossDict = Dict[str, torch.Tensor] + + +def _linear_interpolation_utilities(v_norm, v0_src, size_src, v0_dst, size_dst, size_z): + """ + Computes utility values for linear interpolation at points v. + The points are given as normalized offsets in the source interval + (v0_src, v0_src + size_src), more precisely: + v = v0_src + v_norm * size_src / 256.0 + The computed utilities include lower points v_lo, upper points v_hi, + interpolation weights v_w and flags j_valid indicating whether the + points falls into the destination interval (v0_dst, v0_dst + size_dst). 
+ + Args: + v_norm (:obj: `torch.Tensor`): tensor of size N containing + normalized point offsets + v0_src (:obj: `torch.Tensor`): tensor of size N containing + left bounds of source intervals for normalized points + size_src (:obj: `torch.Tensor`): tensor of size N containing + source interval sizes for normalized points + v0_dst (:obj: `torch.Tensor`): tensor of size N containing + left bounds of destination intervals + size_dst (:obj: `torch.Tensor`): tensor of size N containing + destination interval sizes + size_z (int): interval size for data to be interpolated + + Returns: + v_lo (:obj: `torch.Tensor`): int tensor of size N containing + indices of lower values used for interpolation, all values are + integers from [0, size_z - 1] + v_hi (:obj: `torch.Tensor`): int tensor of size N containing + indices of upper values used for interpolation, all values are + integers from [0, size_z - 1] + v_w (:obj: `torch.Tensor`): float tensor of size N containing + interpolation weights + j_valid (:obj: `torch.Tensor`): uint8 tensor of size N containing + 0 for points outside the estimation interval + (v0_est, v0_est + size_est) and 1 otherwise + """ + v = v0_src + v_norm * size_src / 256.0 + j_valid = (v - v0_dst >= 0) * (v - v0_dst < size_dst) + v_grid = (v - v0_dst) * size_z / size_dst + v_lo = v_grid.floor().long().clamp(min=0, max=size_z - 1) + v_hi = (v_lo + 1).clamp(max=size_z - 1) + v_grid = torch.min(v_hi.float(), v_grid) + v_w = v_grid - v_lo.float() + return v_lo, v_hi, v_w, j_valid + + +class BilinearInterpolationHelper: + """ + Args: + packed_annotations: object that contains packed annotations + j_valid (:obj: `torch.Tensor`): uint8 tensor of size M containing + 0 for points to be discarded and 1 for points to be selected + y_lo (:obj: `torch.Tensor`): int tensor of indices of upper values + in z_est for each point + y_hi (:obj: `torch.Tensor`): int tensor of indices of lower values + in z_est for each point + x_lo (:obj: `torch.Tensor`): int tensor of 
indices of left values + in z_est for each point + x_hi (:obj: `torch.Tensor`): int tensor of indices of right values + in z_est for each point + w_ylo_xlo (:obj: `torch.Tensor`): float tensor of size M; + contains upper-left value weight for each point + w_ylo_xhi (:obj: `torch.Tensor`): float tensor of size M; + contains upper-right value weight for each point + w_yhi_xlo (:obj: `torch.Tensor`): float tensor of size M; + contains lower-left value weight for each point + w_yhi_xhi (:obj: `torch.Tensor`): float tensor of size M; + contains lower-right value weight for each point + """ + + def __init__( + self, + packed_annotations: Any, + j_valid: torch.Tensor, + y_lo: torch.Tensor, + y_hi: torch.Tensor, + x_lo: torch.Tensor, + x_hi: torch.Tensor, + w_ylo_xlo: torch.Tensor, + w_ylo_xhi: torch.Tensor, + w_yhi_xlo: torch.Tensor, + w_yhi_xhi: torch.Tensor, + ): + for k, v in locals().items(): + if k != "self": + setattr(self, k, v) + + @staticmethod + def from_matches( + packed_annotations: Any, densepose_outputs_size_hw: Tuple[int, int] + ) -> "BilinearInterpolationHelper": + """ + Args: + packed_annotations: annotations packed into tensors, the following + attributes are required: + - bbox_xywh_gt + - bbox_xywh_est + - x_gt + - y_gt + - point_bbox_with_dp_indices + - point_bbox_indices + densepose_outputs_size_hw (tuple [int, int]): resolution of + DensePose predictor outputs (H, W) + Return: + An instance of `BilinearInterpolationHelper` used to perform + interpolation for the given annotation points and output resolution + """ + + zh, zw = densepose_outputs_size_hw + x0_gt, y0_gt, w_gt, h_gt = packed_annotations.bbox_xywh_gt[ + packed_annotations.point_bbox_with_dp_indices + ].unbind(dim=1) + x0_est, y0_est, w_est, h_est = packed_annotations.bbox_xywh_est[ + packed_annotations.point_bbox_with_dp_indices + ].unbind(dim=1) + x_lo, x_hi, x_w, jx_valid = _linear_interpolation_utilities( + packed_annotations.x_gt, x0_gt, w_gt, x0_est, w_est, zw + ) + y_lo, y_hi, y_w, 
jy_valid = _linear_interpolation_utilities( + packed_annotations.y_gt, y0_gt, h_gt, y0_est, h_est, zh + ) + j_valid = jx_valid * jy_valid + + w_ylo_xlo = (1.0 - x_w) * (1.0 - y_w) + w_ylo_xhi = x_w * (1.0 - y_w) + w_yhi_xlo = (1.0 - x_w) * y_w + w_yhi_xhi = x_w * y_w + + return BilinearInterpolationHelper( + packed_annotations, + j_valid, + y_lo, + y_hi, + x_lo, + x_hi, + w_ylo_xlo, # pyre-ignore[6] + w_ylo_xhi, + # pyre-fixme[6]: Expected `Tensor` for 9th param but got `float`. + w_yhi_xlo, + w_yhi_xhi, + ) + + def extract_at_points( + self, + z_est, + slice_fine_segm=None, + w_ylo_xlo=None, + w_ylo_xhi=None, + w_yhi_xlo=None, + w_yhi_xhi=None, + ): + """ + Extract ground truth values z_gt for valid point indices and estimated + values z_est using bilinear interpolation over top-left (y_lo, x_lo), + top-right (y_lo, x_hi), bottom-left (y_hi, x_lo) and bottom-right + (y_hi, x_hi) values in z_est with corresponding weights: + w_ylo_xlo, w_ylo_xhi, w_yhi_xlo and w_yhi_xhi. + Use slice_fine_segm to slice dim=1 in z_est + """ + slice_fine_segm = ( + self.packed_annotations.fine_segm_labels_gt + if slice_fine_segm is None + else slice_fine_segm + ) + w_ylo_xlo = self.w_ylo_xlo if w_ylo_xlo is None else w_ylo_xlo + w_ylo_xhi = self.w_ylo_xhi if w_ylo_xhi is None else w_ylo_xhi + w_yhi_xlo = self.w_yhi_xlo if w_yhi_xlo is None else w_yhi_xlo + w_yhi_xhi = self.w_yhi_xhi if w_yhi_xhi is None else w_yhi_xhi + + index_bbox = self.packed_annotations.point_bbox_indices + z_est_sampled = ( + z_est[index_bbox, slice_fine_segm, self.y_lo, self.x_lo] * w_ylo_xlo + + z_est[index_bbox, slice_fine_segm, self.y_lo, self.x_hi] * w_ylo_xhi + + z_est[index_bbox, slice_fine_segm, self.y_hi, self.x_lo] * w_yhi_xlo + + z_est[index_bbox, slice_fine_segm, self.y_hi, self.x_hi] * w_yhi_xhi + ) + return z_est_sampled + + +def resample_data( + z, bbox_xywh_src, bbox_xywh_dst, wout, hout, mode: str = "nearest", padding_mode: str = "zeros" +): + """ + Args: + z (:obj: `torch.Tensor`): tensor of 
size (N,C,H,W) with data to be + resampled + bbox_xywh_src (:obj: `torch.Tensor`): tensor of size (N,4) containing + source bounding boxes in format XYWH + bbox_xywh_dst (:obj: `torch.Tensor`): tensor of size (N,4) containing + destination bounding boxes in format XYWH + Return: + zresampled (:obj: `torch.Tensor`): tensor of size (N, C, Hout, Wout) + with resampled values of z, where D is the discretization size + """ + n = bbox_xywh_src.size(0) + assert n == bbox_xywh_dst.size(0), ( + "The number of " + "source ROIs for resampling ({}) should be equal to the number " + "of destination ROIs ({})".format(bbox_xywh_src.size(0), bbox_xywh_dst.size(0)) + ) + x0src, y0src, wsrc, hsrc = bbox_xywh_src.unbind(dim=1) + x0dst, y0dst, wdst, hdst = bbox_xywh_dst.unbind(dim=1) + x0dst_norm = 2 * (x0dst - x0src) / wsrc - 1 + y0dst_norm = 2 * (y0dst - y0src) / hsrc - 1 + x1dst_norm = 2 * (x0dst + wdst - x0src) / wsrc - 1 + y1dst_norm = 2 * (y0dst + hdst - y0src) / hsrc - 1 + grid_w = torch.arange(wout, device=z.device, dtype=torch.float) / wout + grid_h = torch.arange(hout, device=z.device, dtype=torch.float) / hout + grid_w_expanded = grid_w[None, None, :].expand(n, hout, wout) + grid_h_expanded = grid_h[None, :, None].expand(n, hout, wout) + dx_expanded = (x1dst_norm - x0dst_norm)[:, None, None].expand(n, hout, wout) + dy_expanded = (y1dst_norm - y0dst_norm)[:, None, None].expand(n, hout, wout) + x0_expanded = x0dst_norm[:, None, None].expand(n, hout, wout) + y0_expanded = y0dst_norm[:, None, None].expand(n, hout, wout) + grid_x = grid_w_expanded * dx_expanded + x0_expanded + grid_y = grid_h_expanded * dy_expanded + y0_expanded + grid = torch.stack((grid_x, grid_y), dim=3) + # resample Z from (N, C, H, W) into (N, C, Hout, Wout) + zresampled = F.grid_sample(z, grid, mode=mode, padding_mode=padding_mode, align_corners=True) + return zresampled + + +class AnnotationsAccumulator(ABC): + """ + Abstract class for an accumulator for annotations that can produce + dense annotations 
@dataclass
class PackedChartBasedAnnotations:
    """
    Packed annotations for chart-based model training: per-point and
    per-box ground truth concatenated over all instances.

    The following attributes are defined:
     - fine_segm_labels_gt (tensor [K] of `int64`): GT fine segmentation point labels
     - x_gt (tensor [K] of `float32`): GT normalized X point coordinates
     - y_gt (tensor [K] of `float32`): GT normalized Y point coordinates
     - u_gt (tensor [K] of `float32`): GT point U values
     - v_gt (tensor [K] of `float32`): GT point V values
     - coarse_segm_gt (tensor [N, S, S] of `float32`): GT segmentation for bounding
         boxes; the field is optional and may be `None`
     - bbox_xywh_gt (tensor [N, 4] of `float32`): selected GT bounding boxes in
         XYWH format
     - bbox_xywh_est (tensor [N, 4] of `float32`): selected matching estimated
         bounding boxes in XYWH format
     - point_bbox_with_dp_indices (tensor [K] of `int64`): indices of bounding boxes
         with DensePose annotations that correspond to the point data
     - point_bbox_indices (tensor [K] of `int64`): indices of bounding boxes
         (not necessarily the selected ones with DensePose data) that correspond
         to the point data
     - bbox_indices (tensor [N] of `int64`): global indices of selected bounding
         boxes with DensePose annotations; these indices could be used to access
         features that are computed for all bounding boxes, not only the ones with
         DensePose annotations.
    Here K is the total number of points and N is the total number of instances
    with DensePose annotations.
    """

    # per-point data, K entries each
    fine_segm_labels_gt: torch.Tensor
    x_gt: torch.Tensor
    y_gt: torch.Tensor
    u_gt: torch.Tensor
    v_gt: torch.Tensor
    # per-box data, N entries each
    coarse_segm_gt: Optional[torch.Tensor]
    bbox_xywh_gt: torch.Tensor
    bbox_xywh_est: torch.Tensor
    # index tensors linking points to boxes, and selected boxes to all boxes
    point_bbox_with_dp_indices: torch.Tensor
    point_bbox_indices: torch.Tensor
    bbox_indices: torch.Tensor
def _do_accumulate(
    self, box_xywh_gt: torch.Tensor, box_xywh_est: torch.Tensor, dp_gt: DensePoseDataRelative
):
    """
    Record the annotations of a single instance that is known to carry
    DensePose ground truth.

    Args:
        box_xywh_gt (tensor): GT bounding box
        box_xywh_est (tensor): estimated bounding box
        dp_gt (DensePoseDataRelative): GT densepose data
    """
    point_labels = dp_gt.i

    # per-point ground truth
    self.i_gt.append(point_labels)
    self.x_gt.append(dp_gt.x)
    self.y_gt.append(dp_gt.y)
    self.u_gt.append(dp_gt.u)
    self.v_gt.append(dp_gt.v)

    # the segmentation is optional; it is stored with a leading axis of size 1
    if hasattr(dp_gt, "segm"):
        self.s_gt.append(dp_gt.segm.unsqueeze(0))

    # per-box data, reshaped into rows of 4 values
    self.bbox_xywh_gt.append(box_xywh_gt.view(-1, 4))
    self.bbox_xywh_est.append(box_xywh_est.view(-1, 4))

    # every point remembers both the index among boxes with DensePose data
    # and the index among all boxes seen so far
    dp_box_index = self.nxt_bbox_with_dp_index
    any_box_index = self.nxt_bbox_index
    self.point_bbox_with_dp_indices.append(torch.full_like(point_labels, dp_box_index))
    self.point_bbox_indices.append(torch.full_like(point_labels, any_box_index))
    self.bbox_indices.append(any_box_index)

    # only the DensePose counter advances here; this method leaves
    # `nxt_bbox_index` untouched
    self.nxt_bbox_with_dp_index = dp_box_index + 1
def sample_random_indices(
    n_indices: int, n_samples: int, device: Optional[torch.device] = None
) -> Optional[torch.Tensor]:
    """
    Samples `n_samples` random indices from range `[0..n_indices - 1]`.
    Returns `None`, meaning that all indices are selected, when `n_samples`
    is not positive or when `n_indices` does not exceed `n_samples`.
    Args:
        n_indices (int): total number of indices
        n_samples (int): number of indices to sample
        device (torch.device): the desired device of returned tensor
    Return:
        Tensor of selected vertex indices, or `None`, if all vertices are selected
    """
    # a non-positive request disables subsampling
    if n_samples <= 0:
        return None
    # not more indices than requested samples: keep everything
    if n_indices <= n_samples:
        return None
    shuffled = torch.randperm(n_indices, device=device)
    return shuffled[:n_samples]
@DENSEPOSE_PREDICTOR_REGISTRY.register()
class DensePoseChartPredictor(nn.Module):
    """
    Predictor (last layers of a DensePose model) that takes DensePose head outputs as an input
    and produces 4 tensors which represent DensePose results for predefined body parts
    (patches / charts):
     * coarse segmentation, a tensor of shape [N, K, Hout, Wout]
     * fine segmentation, a tensor of shape [N, C, Hout, Wout]
     * U coordinates, a tensor of shape [N, C, Hout, Wout]
     * V coordinates, a tensor of shape [N, C, Hout, Wout]
    where
     - N is the number of instances
     - K is the number of coarse segmentation channels (
         2 = foreground / background,
         15 = one of 14 body parts / background)
     - C is the number of fine segmentation channels (
         24 fine body parts / background)
     - Hout and Wout are height and width of predictions
    """

    def __init__(self, cfg: CfgNode, input_channels: int):
        """
        Build the four transposed-convolution output layers from the config

        Args:
            cfg (CfgNode): configuration options
            input_channels (int): input tensor size along the channel dimension
        """
        super().__init__()
        head_cfg = cfg.MODEL.ROI_DENSEPOSE_HEAD
        n_coarse_channels = head_cfg.NUM_COARSE_SEGM_CHANNELS
        # one extra channel on top of the configured number of patches
        n_patch_channels = head_cfg.NUM_PATCHES + 1
        deconv_kernel = head_cfg.DECONV_KERNEL
        deconv_padding = int(deconv_kernel / 2 - 1)

        def make_deconv(n_out: int):
            # all output layers share kernel size, stride and padding
            return ConvTranspose2d(
                input_channels, n_out, deconv_kernel, stride=2, padding=deconv_padding
            )

        # coarse segmentation
        self.ann_index_lowres = make_deconv(n_coarse_channels)
        # fine segmentation
        self.index_uv_lowres = make_deconv(n_patch_channels)
        # U
        self.u_lowres = make_deconv(n_patch_channels)
        # V
        self.v_lowres = make_deconv(n_patch_channels)
        self.scale_factor = head_cfg.UP_SCALE
        initialize_module_params(self)

    def interp2d(self, tensor_nchw: torch.Tensor):
        """
        Upscale a tensor with bilinear interpolation

        Args:
            tensor_nchw (tensor): tensor of shape (N, C, H, W)
        Return:
            tensor of shape (N, C, Hout, Wout), where Hout and Wout are computed
            by applying the scale factor to H and W
        """
        upscaled = interpolate(
            tensor_nchw, scale_factor=self.scale_factor, mode="bilinear", align_corners=False
        )
        return upscaled

    def forward(self, head_outputs: torch.Tensor):
        """
        Run every output layer on the head outputs and upscale the results

        Args:
            head_outputs (tensor): DensePose head outputs, tensor of shape [N, D, H, W]
        Return:
           An instance of DensePoseChartPredictorOutput
        """
        coarse_segm = self.interp2d(self.ann_index_lowres(head_outputs))
        fine_segm = self.interp2d(self.index_uv_lowres(head_outputs))
        u = self.interp2d(self.u_lowres(head_outputs))
        v = self.interp2d(self.v_lowres(head_outputs))
        return DensePoseChartPredictorOutput(
            coarse_segm=coarse_segm, fine_segm=fine_segm, u=u, v=v
        )
class DensePoseChartConfidencePredictorMixin:
    """
    Predictor contains the last layers of a DensePose model that take DensePose head
    outputs as an input and produce model outputs. Confidence predictor mixin is used
    to generate confidences for segmentation and UV tensors estimated by some
    base predictor. Several assumptions need to hold for the base predictor:
    1) the `forward` method must return SIUV tuple as the first result (
        S = coarse segmentation, I = fine segmentation, U and V are intrinsic
        chart coordinates)
    2) `interp2d` method must be defined to perform bilinear interpolation;
        the same method is typically used for SIUV and confidences
    Confidence predictor mixin provides confidence estimates, as described in:
        N. Neverova et al., Correlated Uncertainty for Learning Dense Correspondences
            from Noisy Labels, NeurIPS 2019
        A. Sanakoyeu et al., Transferring Dense Pose to Proximal Animal Classes, CVPR 2020
    """

    def __init__(self, cfg: CfgNode, input_channels: int):
        """
        Initialize confidence predictor using configuration options.

        Args:
            cfg (CfgNode): configuration options
            input_channels (int): number of input channels
        """
        # we rely on base predictor to call nn.Module.__init__
        super().__init__(cfg, input_channels)  # pyre-ignore[19]
        self.confidence_model_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg)
        self._initialize_confidence_estimation_layers(cfg, input_channels)
        self._registry = {}
        initialize_module_params(self)  # pyre-ignore[6]

    def _initialize_confidence_estimation_layers(self, cfg: CfgNode, dim_in: int):
        """
        Initialize confidence estimation layers based on configuration options

        Args:
            cfg (CfgNode): configuration options
            dim_in (int): number of input channels
        Raises:
            ValueError: if UV confidence is enabled with a type other than
                IID_ISO or INDEP_ANISO
        """
        dim_out_patches = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES + 1
        kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL
        uv_confidence = self.confidence_model_cfg.uv_confidence

        def make_deconv(dim_out: int):
            # all confidence layers share kernel size, stride and padding
            return ConvTranspose2d(
                dim_in, dim_out, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
            )

        if uv_confidence.enabled:
            if uv_confidence.type == DensePoseUVConfidenceType.IID_ISO:
                self.sigma_2_lowres = make_deconv(dim_out_patches)  # pyre-ignore[16]
            elif uv_confidence.type == DensePoseUVConfidenceType.INDEP_ANISO:
                self.sigma_2_lowres = make_deconv(dim_out_patches)
                self.kappa_u_lowres = make_deconv(dim_out_patches)  # pyre-ignore[16]
                self.kappa_v_lowres = make_deconv(dim_out_patches)  # pyre-ignore[16]
            else:
                # report the same value the branches above test, so building
                # the message cannot itself fail on a different attribute
                raise ValueError(f"Unknown confidence model type: {uv_confidence.type}")
        if self.confidence_model_cfg.segm_confidence.enabled:
            # a single confidence channel for each of the two segmentations
            self.fine_segm_confidence_lowres = make_deconv(1)  # pyre-ignore[16]
            self.coarse_segm_confidence_lowres = make_deconv(1)  # pyre-ignore[16]

    def forward(self, head_outputs: torch.Tensor):
        """
        Perform forward operation on head outputs used as inputs for the predictor.
        Calls forward method from the base predictor and uses its outputs to compute
        confidences.

        Args:
            head_outputs (Tensor): head outputs used as predictor inputs
        Return:
            An instance of outputs with confidences,
            see `decorate_predictor_output_class_with_confidences`
        Raises:
            ValueError: if UV confidence is enabled with a type other than
                IID_ISO or INDEP_ANISO
        """
        # assuming base class returns SIUV estimates in its first result
        base_predictor_outputs = super().forward(head_outputs)  # pyre-ignore[16]

        # create output instance by extending base predictor outputs:
        output = self._create_output_instance(base_predictor_outputs)

        uv_confidence = self.confidence_model_cfg.uv_confidence
        if uv_confidence.enabled:
            if uv_confidence.type == DensePoseUVConfidenceType.IID_ISO:
                # assuming base class defines interp2d method for bilinear interpolation
                output.sigma_2 = self.interp2d(self.sigma_2_lowres(head_outputs))  # pyre-ignore[16]
            elif uv_confidence.type == DensePoseUVConfidenceType.INDEP_ANISO:
                # assuming base class defines interp2d method for bilinear interpolation
                output.sigma_2 = self.interp2d(self.sigma_2_lowres(head_outputs))
                output.kappa_u = self.interp2d(self.kappa_u_lowres(head_outputs))  # pyre-ignore[16]
                output.kappa_v = self.interp2d(self.kappa_v_lowres(head_outputs))  # pyre-ignore[16]
            else:
                raise ValueError(f"Unknown confidence model type: {uv_confidence.type}")

        segm_confidence = self.confidence_model_cfg.segm_confidence
        if segm_confidence.enabled:
            # base predictor outputs are assumed to have `fine_segm` and `coarse_segm` attributes
            # base predictor is assumed to define `interp2d` method for bilinear interpolation
            # softplus plus epsilon keeps both confidences strictly above epsilon
            output.fine_segm_confidence = (
                F.softplus(
                    self.interp2d(self.fine_segm_confidence_lowres(head_outputs))  # pyre-ignore[16]
                )
                + segm_confidence.epsilon
            )
            # the single confidence channel is repeated to scale every channel
            output.fine_segm = base_predictor_outputs.fine_segm * torch.repeat_interleave(
                output.fine_segm_confidence, base_predictor_outputs.fine_segm.shape[1], dim=1
            )
            output.coarse_segm_confidence = (
                F.softplus(
                    self.interp2d(
                        self.coarse_segm_confidence_lowres(head_outputs)  # pyre-ignore[16]
                    )
                )
                + segm_confidence.epsilon
            )
            output.coarse_segm = base_predictor_outputs.coarse_segm * torch.repeat_interleave(
                output.coarse_segm_confidence, base_predictor_outputs.coarse_segm.shape[1], dim=1
            )

        return output

    def _create_output_instance(self, base_predictor_outputs: Any):
        """
        Create an instance of predictor outputs by copying the outputs from the
        base predictor and initializing confidence

        Args:
            base_predictor_outputs: an instance of base predictor outputs
                (the outputs type is assumed to be a dataclass)
        Return:
           An instance of outputs with confidences
        """
        PredictorOutput = decorate_predictor_output_class_with_confidences(
            type(base_predictor_outputs)  # pyre-ignore[6]
        )
        # base_predictor_outputs is assumed to be a dataclass
        # reassign all the fields from base_predictor_outputs (no deep copy!), add new fields
        output = PredictorOutput(
            **base_predictor_outputs.__dict__,
            coarse_segm_confidence=None,
            fine_segm_confidence=None,
            sigma_1=None,
            sigma_2=None,
            kappa_u=None,
            kappa_v=None,
        )
        return output
@DENSEPOSE_PREDICTOR_REGISTRY.register()
class DensePoseEmbeddingPredictor(nn.Module):
    """
    Last layers of a DensePose model that take DensePose head outputs as an input
    and produce model outputs for continuous surface embeddings (CSE).
    """

    def __init__(self, cfg: CfgNode, input_channels: int):
        """
        Build the coarse segmentation and embedding output layers from the config

        Args:
            cfg (CfgNode): configuration options
            input_channels (int): input tensor size along the channel dimension
        """
        super().__init__()
        head_cfg = cfg.MODEL.ROI_DENSEPOSE_HEAD
        n_coarse_channels = head_cfg.NUM_COARSE_SEGM_CHANNELS
        embedding_dim = head_cfg.CSE.EMBED_SIZE
        deconv_kernel = head_cfg.DECONV_KERNEL
        deconv_padding = int(deconv_kernel / 2 - 1)

        def make_deconv(n_out: int):
            # both output layers share kernel size, stride and padding
            return ConvTranspose2d(
                input_channels, n_out, deconv_kernel, stride=2, padding=deconv_padding
            )

        # coarse segmentation
        self.coarse_segm_lowres = make_deconv(n_coarse_channels)
        # embedding
        self.embed_lowres = make_deconv(embedding_dim)
        self.scale_factor = head_cfg.UP_SCALE
        initialize_module_params(self)

    def interp2d(self, tensor_nchw: torch.Tensor):
        """
        Upscale a tensor with bilinear interpolation

        Args:
            tensor_nchw (tensor): tensor of shape (N, C, H, W)
        Return:
            tensor of shape (N, C, Hout, Wout), where Hout and Wout are computed
            by applying the scale factor to H and W
        """
        upscaled = interpolate(
            tensor_nchw, scale_factor=self.scale_factor, mode="bilinear", align_corners=False
        )
        return upscaled

    def forward(self, head_outputs):
        """
        Run both output layers on the head outputs and upscale the results

        Args:
            head_outputs (tensor): DensePose head outputs, tensor of shape [N, D, H, W]
        Return:
            An instance of DensePoseEmbeddingPredictorOutput
        """
        embedding_lowres = self.embed_lowres(head_outputs)
        segm_lowres = self.coarse_segm_lowres(head_outputs)
        return DensePoseEmbeddingPredictorOutput(
            embedding=self.interp2d(embedding_lowres),
            coarse_segm=self.interp2d(segm_lowres),
        )
class DensePoseEmbeddingConfidencePredictorMixin:
    """
    Predictor contains the last layers of a DensePose model that take DensePose head
    outputs as an input and produce model outputs. Confidence predictor mixin is used
    to generate confidences for coarse segmentation estimated by some
    base predictor. Several assumptions need to hold for the base predictor:
    1) the `forward` method must return CSE DensePose head outputs,
        tensor of shape [N, D, H, W]
    2) `interp2d` method must be defined to perform bilinear interpolation;
        the same method is typically used for masks and confidences
    Confidence predictor mixin provides confidence estimates, as described in:
        N. Neverova et al., Correlated Uncertainty for Learning Dense Correspondences
            from Noisy Labels, NeurIPS 2019
        A. Sanakoyeu et al., Transferring Dense Pose to Proximal Animal Classes, CVPR 2020
    """

    def __init__(self, cfg: CfgNode, input_channels: int):
        """
        Set up the base predictor first, then add the confidence layers.

        Args:
            cfg (CfgNode): configuration options
            input_channels (int): number of input channels
        """
        # we rely on base predictor to call nn.Module.__init__
        super().__init__(cfg, input_channels)  # pyre-ignore[19]
        self.confidence_model_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg)
        self._initialize_confidence_estimation_layers(cfg, input_channels)
        self._registry = {}
        initialize_module_params(self)  # pyre-ignore[6]

    def _initialize_confidence_estimation_layers(self, cfg: CfgNode, dim_in: int):
        """
        Create the coarse segmentation confidence layer if it is enabled

        Args:
            cfg (CfgNode): configuration options
            dim_in (int): number of input channels
        """
        deconv_kernel = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL
        if not self.confidence_model_cfg.segm_confidence.enabled:
            return
        # a single confidence channel
        self.coarse_segm_confidence_lowres = ConvTranspose2d(  # pyre-ignore[16]
            dim_in, 1, deconv_kernel, stride=2, padding=int(deconv_kernel / 2 - 1)
        )

    def forward(self, head_outputs: torch.Tensor):
        """
        Perform forward operation on head outputs used as inputs for the predictor.
        Calls forward method from the base predictor and uses its outputs to compute
        confidences.

        Args:
            head_outputs (Tensor): head outputs used as predictor inputs
        Return:
            An instance of outputs with confidences,
            see `decorate_cse_predictor_output_class_with_confidences`
        """
        # the base predictor outputs must expose a `coarse_segm` attribute
        base_outputs = super().forward(head_outputs)  # pyre-ignore[16]

        # extend the base predictor outputs with the confidence field
        result = self._create_output_instance(base_outputs)

        segm_confidence = self.confidence_model_cfg.segm_confidence
        if not segm_confidence.enabled:
            return result

        # base predictor is assumed to define `interp2d` method for bilinear interpolation
        confidence_lowres = self.coarse_segm_confidence_lowres(head_outputs)  # pyre-ignore[16]
        confidence_upscaled = self.interp2d(confidence_lowres)  # pyre-ignore[16]
        result.coarse_segm_confidence = F.softplus(confidence_upscaled) + segm_confidence.epsilon

        # repeat the single confidence channel to scale every segmentation channel
        n_segm_channels = base_outputs.coarse_segm.shape[1]
        result.coarse_segm = base_outputs.coarse_segm * torch.repeat_interleave(
            result.coarse_segm_confidence, n_segm_channels, dim=1
        )
        return result

    def _create_output_instance(self, base_predictor_outputs: Any):
        """
        Create an instance of predictor outputs by copying the outputs from the
        base predictor and initializing confidence

        Args:
            base_predictor_outputs: an instance of base predictor outputs
                (the outputs type is assumed to be a dataclass)
        Return:
           An instance of outputs with confidences
        """
        output_class = decorate_cse_predictor_output_class_with_confidences(
            type(base_predictor_outputs)  # pyre-ignore[6]
        )
        # base_predictor_outputs is assumed to be a dataclass
        # reassign all the fields from base_predictor_outputs (no deep copy!), add new fields
        return output_class(
            **base_predictor_outputs.__dict__,
            coarse_segm_confidence=None,
        )
@DENSEPOSE_PREDICTOR_REGISTRY.register()
class DensePoseEmbeddingWithConfidencePredictor(
    DensePoseEmbeddingConfidencePredictorMixin, DensePoseEmbeddingPredictor
):
    """
    CSE predictor extended with confidence estimation.

    The class adds nothing of its own: it only combines the confidence mixin
    (listed first among the bases) with the CSE predictor.
    """
def __init__(self, cfg: CfgNode, input_channels: int):
    """
    Build the head: an ASPP module, an optional non-local block and a
    stack of convolution layers.

    Args:
        cfg (CfgNode): configuration options
        input_channels (int): number of channels of the input features
    """
    super(DensePoseDeepLabHead, self).__init__()
    # fmt: off
    hidden_dim           = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM
    kernel_size          = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL
    norm                 = cfg.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NORM
    self.n_stacked_convs = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS
    self.use_nonlocal    = cfg.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NONLOCAL_ON
    # fmt: on
    pad_size = kernel_size // 2
    n_channels = input_channels

    # Assigning a module to an attribute already registers it as a
    # submodule, so no separate `add_module` call is needed for these two.
    self.ASPP = ASPP(input_channels, [6, 12, 56], n_channels)
    if self.use_nonlocal:
        self.NLBlock = NONLocalBlock2D(input_channels, bn_layer=True)

    for i in range(self.n_stacked_convs):
        # only the "GN" setting creates a normalization layer
        norm_module = nn.GroupNorm(32, hidden_dim) if norm == "GN" else None
        layer = Conv2d(
            n_channels,
            hidden_dim,
            kernel_size,
            stride=1,
            padding=pad_size,
            # the bias is dropped whenever a norm setting is configured
            bias=not norm,
            norm=norm_module,
        )
        weight_init.c2_msra_fill(layer)
        # every layer after the first takes `hidden_dim` input channels
        n_channels = hidden_dim
        # layers are registered under generated names, hence `add_module`
        self.add_module(self._get_layer_name(i), layer)
    self.n_out_channels = hidden_dim
def _get_layer_name(self, i: int): + layer_name = "body_conv_fcn{}".format(i + 1) + return layer_name + + +# Copied from +# https://github.com/pytorch/vision/blob/master/torchvision/models/segmentation/deeplabv3.py +# See https://arxiv.org/pdf/1706.05587.pdf for details +class ASPPConv(nn.Sequential): + def __init__(self, in_channels, out_channels, dilation): + modules = [ + nn.Conv2d( + in_channels, out_channels, 3, padding=dilation, dilation=dilation, bias=False + ), + nn.GroupNorm(32, out_channels), + nn.ReLU(), + ] + super(ASPPConv, self).__init__(*modules) + + +class ASPPPooling(nn.Sequential): + def __init__(self, in_channels, out_channels): + super(ASPPPooling, self).__init__( + nn.AdaptiveAvgPool2d(1), + nn.Conv2d(in_channels, out_channels, 1, bias=False), + nn.GroupNorm(32, out_channels), + nn.ReLU(), + ) + + def forward(self, x): + size = x.shape[-2:] + x = super(ASPPPooling, self).forward(x) + return F.interpolate(x, size=size, mode="bilinear", align_corners=False) + + +class ASPP(nn.Module): + def __init__(self, in_channels, atrous_rates, out_channels): + super(ASPP, self).__init__() + modules = [] + modules.append( + nn.Sequential( + nn.Conv2d(in_channels, out_channels, 1, bias=False), + nn.GroupNorm(32, out_channels), + nn.ReLU(), + ) + ) + + rate1, rate2, rate3 = tuple(atrous_rates) + modules.append(ASPPConv(in_channels, out_channels, rate1)) + modules.append(ASPPConv(in_channels, out_channels, rate2)) + modules.append(ASPPConv(in_channels, out_channels, rate3)) + modules.append(ASPPPooling(in_channels, out_channels)) + + self.convs = nn.ModuleList(modules) + + self.project = nn.Sequential( + nn.Conv2d(5 * out_channels, out_channels, 1, bias=False), + # nn.BatchNorm2d(out_channels), + nn.ReLU() + # nn.Dropout(0.5) + ) + + def forward(self, x): + res = [] + for conv in self.convs: + res.append(conv(x)) + res = torch.cat(res, dim=1) + return self.project(res) + + +# copied from +# 
# copied from
# https://github.com/AlexHex7/Non-local_pytorch/blob/master/lib/non_local_embedded_gaussian.py
# See https://arxiv.org/abs/1711.07971 for details
class _NonLocalBlockND(nn.Module):
    """
    Non-local block (embedded Gaussian form) for 1D, 2D or 3D inputs.

    The last layer of `W` is initialized with zero weight and bias, so a
    freshly constructed block returns its input unchanged.
    """

    def __init__(
        self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True
    ):
        """
        Args:
            in_channels: number of input (and output) channels
            inter_channels: number of channels of the embeddings; when None,
                `in_channels // 2` is used (1 if that would be 0)
            dimension: 1, 2 or 3, selects Conv1d/2d/3d and the matching max pooling
            sub_sample: if True, `g` and `phi` are followed by max pooling
            bn_layer: if True, `W` is a convolution followed by GroupNorm(32, in_channels)
        """
        super().__init__()

        assert dimension in [1, 2, 3]

        if inter_channels is None:
            half = in_channels // 2
            inter_channels = 1 if half == 0 else half

        self.dimension = dimension
        self.sub_sample = sub_sample
        self.in_channels = in_channels
        self.inter_channels = inter_channels

        if dimension == 3:
            conv_nd, max_pool_layer = nn.Conv3d, nn.MaxPool3d(kernel_size=(1, 2, 2))
        elif dimension == 2:
            conv_nd, max_pool_layer = nn.Conv2d, nn.MaxPool2d(kernel_size=(2, 2))
        else:
            conv_nd, max_pool_layer = nn.Conv1d, nn.MaxPool1d(kernel_size=2)
        # GroupNorm is used for every dimension in place of BatchNorm
        norm_cls = nn.GroupNorm

        def pointwise(n_in, n_out):
            # 1x1 convolution of the selected dimensionality
            return conv_nd(
                in_channels=n_in, out_channels=n_out, kernel_size=1, stride=1, padding=0
            )

        # Submodules are created in the order g, W, theta, phi
        self.g = pointwise(self.in_channels, self.inter_channels)

        if bn_layer:
            self.W = nn.Sequential(
                pointwise(self.inter_channels, self.in_channels),
                norm_cls(32, self.in_channels),
            )
            zero_init_target = self.W[1]
        else:
            self.W = pointwise(self.inter_channels, self.in_channels)
            zero_init_target = self.W
        nn.init.constant_(zero_init_target.weight, 0)
        nn.init.constant_(zero_init_target.bias, 0)

        self.theta = pointwise(self.in_channels, self.inter_channels)
        self.phi = pointwise(self.in_channels, self.inter_channels)

        if sub_sample:
            # The same (parameter-free) pooling layer is shared by g and phi
            self.g = nn.Sequential(self.g, max_pool_layer)
            self.phi = nn.Sequential(self.phi, max_pool_layer)

    def forward(self, x):
        """
        :param x: (b, c, t, h, w)
        :return: tensor of the same shape as `x` (`W(y) + x`)
        """
        n = x.size(0)
        c = self.inter_channels

        # (n, c, L) -> (n, L, c) for g and theta; phi stays (n, c, L')
        values = self.g(x).view(n, c, -1).permute(0, 2, 1)
        queries = self.theta(x).view(n, c, -1).permute(0, 2, 1)
        keys = self.phi(x).view(n, c, -1)

        attention = F.softmax(torch.matmul(queries, keys), dim=-1)

        y = torch.matmul(attention, values).permute(0, 2, 1).contiguous()
        y = y.view(n, c, *x.size()[2:])
        return self.W(y) + x


class NONLocalBlock2D(_NonLocalBlockND):
    """Non-local block for 2D feature maps (`dimension=2`)."""

    def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
        super().__init__(
            in_channels,
            inter_channels=inter_channels,
            dimension=2,
            sub_sample=sub_sample,
            bn_layer=bn_layer,
        )
class Decoder(nn.Module):
    """
    A semantic segmentation head described in detail in the Panoptic Feature Pyramid Networks paper
    (https://arxiv.org/abs/1901.02446). It takes FPN features as input and merges information from
    all levels of the FPN into single output.
    """

    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec], in_features):
        """
        Args:
            cfg: configuration; the `MODEL.ROI_DENSEPOSE_HEAD.DECODER_*` options are read
            input_shape (dict[str, ShapeSpec]): stride and channels of every feature map
            in_features: names of the feature maps to merge, in the order in
                which they are later passed to `forward`
        """
        super(Decoder, self).__init__()

        # fmt: off
        self.in_features      = in_features
        feature_strides       = {k: v.stride for k, v in input_shape.items()}
        feature_channels      = {k: v.channels for k, v in input_shape.items()}
        num_classes           = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES
        conv_dims             = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS
        self.common_stride    = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE
        norm                  = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM
        # fmt: on

        self.scale_heads = []
        for in_feature in self.in_features:
            head_ops = []
            # One conv (+ 2x upsample) per factor of two between the feature
            # stride and the common stride; at least one conv per level
            head_length = max(
                1, int(np.log2(feature_strides[in_feature]) - np.log2(self.common_stride))
            )
            for k in range(head_length):
                conv = Conv2d(
                    feature_channels[in_feature] if k == 0 else conv_dims,
                    conv_dims,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    bias=not norm,
                    norm=get_norm(norm, conv_dims),
                    activation=F.relu,
                )
                weight_init.c2_msra_fill(conv)
                head_ops.append(conv)
                if feature_strides[in_feature] != self.common_stride:
                    head_ops.append(
                        nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
                    )
            self.scale_heads.append(nn.Sequential(*head_ops))
            # The list above is a plain Python list; registration happens here
            self.add_module(in_feature, self.scale_heads[-1])
        self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
        weight_init.c2_msra_fill(self.predictor)

    def forward(self, features: List[torch.Tensor]):
        """
        Sum the outputs of the per-level heads (features[i] goes through the
        i-th head) and apply the final 1x1 predictor.
        """
        for i, _ in enumerate(self.in_features):
            if i == 0:
                x = self.scale_heads[i](features[i])
            else:
                x = x + self.scale_heads[i](features[i])
        x = self.predictor(x)
        return x


@ROI_HEADS_REGISTRY.register()
class DensePoseROIHeads(StandardROIHeads):
    """
    A Standard ROIHeads which contains an addition of DensePose head.
    """

    def __init__(self, cfg, input_shape):
        super().__init__(cfg, input_shape)
        self._init_densepose_head(cfg, input_shape)

    def _init_densepose_head(self, cfg, input_shape):
        """
        Build the DensePose branch (data filter, optional decoder, pooler,
        head, predictor, losses and embedder). Does nothing beyond setting
        `self.densepose_on` when `cfg.MODEL.DENSEPOSE_ON` is false.
        """
        # fmt: off
        self.densepose_on          = cfg.MODEL.DENSEPOSE_ON
        if not self.densepose_on:
            return
        self.densepose_data_filter = build_densepose_data_filter(cfg)
        dp_pooler_resolution       = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION
        dp_pooler_sampling_ratio   = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO
        dp_pooler_type             = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE
        self.use_decoder           = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON
        # fmt: on
        if self.use_decoder:
            # The decoder merges all levels into one map, so a single scale
            # (that of the first input feature) is used for pooling
            dp_pooler_scales = (1.0 / input_shape[self.in_features[0]].stride,)
        else:
            dp_pooler_scales = tuple(1.0 / input_shape[k].stride for k in self.in_features)
        in_channels = [input_shape[f].channels for f in self.in_features][0]

        if self.use_decoder:
            self.decoder = Decoder(cfg, input_shape, self.in_features)

        self.densepose_pooler = ROIPooler(
            output_size=dp_pooler_resolution,
            scales=dp_pooler_scales,
            sampling_ratio=dp_pooler_sampling_ratio,
            pooler_type=dp_pooler_type,
        )
        self.densepose_head = build_densepose_head(cfg, in_channels)
        self.densepose_predictor = build_densepose_predictor(
            cfg, self.densepose_head.n_out_channels
        )
        self.densepose_losses = build_densepose_losses(cfg)
        self.embedder = build_densepose_embedder(cfg)

    def _forward_densepose(self, features: Dict[str, torch.Tensor], instances: List[Instances]):
        """
        Forward logic of the densepose prediction branch.

        Args:
            features (dict[str, Tensor]): input data as a mapping from feature
                map name to tensor. Axis 0 represents the number of images `N` in
                the input data; axes 1-3 are channels, height, and width, which may
                vary between feature maps (e.g., if a feature pyramid is used).
            instances (list[Instances]): length `N` list of `Instances`. The i-th
                `Instances` contains instances for the i-th input image,
                In training, they can be the proposals.
                In inference, they can be the predicted boxes.

        Returns:
            In training, a dict of losses (empty when the DensePose branch is
            disabled or no proposals remain after filtering).
            In inference, update `instances` with new fields "densepose" and return it.
        """
        if not self.densepose_on:
            return {} if self.training else instances

        features_list = [features[f] for f in self.in_features]
        if self.training:
            proposals, _ = select_foreground_proposals(instances, self.num_classes)
            features_list, proposals = self.densepose_data_filter(features_list, proposals)
            if len(proposals) == 0:
                # Nothing to train on. Return an empty loss dict rather than
                # falling through to an implicit None, which would make
                # `losses.update(...)` in `forward` raise TypeError.
                return {}
            proposal_boxes = [x.proposal_boxes for x in proposals]

            if self.use_decoder:
                features_list = [self.decoder(features_list)]

            features_dp = self.densepose_pooler(features_list, proposal_boxes)
            densepose_head_outputs = self.densepose_head(features_dp)
            densepose_predictor_outputs = self.densepose_predictor(densepose_head_outputs)
            densepose_loss_dict = self.densepose_losses(
                proposals, densepose_predictor_outputs, embedder=self.embedder
            )
            return densepose_loss_dict

        pred_boxes = [x.pred_boxes for x in instances]

        if self.use_decoder:
            features_list = [self.decoder(features_list)]

        features_dp = self.densepose_pooler(features_list, pred_boxes)
        if len(features_dp) > 0:
            densepose_head_outputs = self.densepose_head(features_dp)
            densepose_predictor_outputs = self.densepose_predictor(densepose_head_outputs)
        else:
            # No boxes were pooled; densepose_inference receives None
            densepose_predictor_outputs = None

        densepose_inference(densepose_predictor_outputs, instances)
        return instances

    def forward(
        self,
        images: ImageList,
        features: Dict[str, torch.Tensor],
        proposals: List[Instances],
        targets: Optional[List[Instances]] = None,
    ):
        """
        Run the standard ROI heads and, in training, add the DensePose losses
        to the returned loss dict.

        Returns:
            the `(instances, losses)` pair produced by the parent `forward`,
            with `losses` extended in training mode
        """
        instances, losses = super().forward(images, features, proposals, targets)
        del targets, images

        if self.training:
            losses.update(self._forward_densepose(features, instances))
        return instances, losses

    def forward_with_given_boxes(
        self, features: Dict[str, torch.Tensor], instances: List[Instances]
    ):
        """
        Use the given boxes in `instances` to produce other (non-box) per-ROI outputs.

        This is useful for downstream tasks where a box is known, but need to obtain
        other attributes (outputs of other heads).
        Test-time augmentation also uses this.

        Args:
            features: same as in `forward()`
            instances (list[Instances]): instances to predict other outputs. Expect the keys
                "pred_boxes" and "pred_classes" to exist.

        Returns:
            instances (list[Instances]):
                the same `Instances` objects, with extra
                fields such as `pred_masks` or `pred_keypoints`.
        """

        instances = super().forward_with_given_boxes(features, instances)
        instances = self._forward_densepose(features, instances)
        return instances
@ROI_DENSEPOSE_HEAD_REGISTRY.register()
class DensePoseV1ConvXHead(nn.Module):
    """
    Fully convolutional DensePose head: a stack of convolutions, each
    followed by a ReLU.
    """

    def __init__(self, cfg: CfgNode, input_channels: int):
        """
        Initialize DensePose fully convolutional head

        Args:
            cfg (CfgNode): configuration options
            input_channels (int): number of input channels
        """
        super().__init__()
        head_cfg = cfg.MODEL.ROI_DENSEPOSE_HEAD
        hidden_dim = head_cfg.CONV_HEAD_DIM
        kernel_size = head_cfg.CONV_HEAD_KERNEL
        self.n_stacked_convs = head_cfg.NUM_STACKED_CONVS

        padding = kernel_size // 2
        in_dim = input_channels
        for idx in range(self.n_stacked_convs):
            conv = Conv2d(in_dim, hidden_dim, kernel_size, stride=1, padding=padding)
            self.add_module(self._get_layer_name(idx), conv)
            in_dim = hidden_dim
        # Equals input_channels when there are no stacked convolutions
        self.n_out_channels = in_dim
        initialize_module_params(self)

    def forward(self, features: torch.Tensor):
        """
        Apply DensePose fully convolutional head to the input features

        Args:
            features (tensor): input features
        Result:
            A tensor of DensePose head outputs
        """
        x = features
        for idx in range(self.n_stacked_convs):
            conv = getattr(self, self._get_layer_name(idx))
            x = F.relu(conv(x))
        return x

    def _get_layer_name(self, i: int):
        """Name under which the i-th (0-based) convolution is registered."""
        return "body_conv_fcn{}".format(i + 1)
class DensePoseDatasetMapperTTA(DatasetMapperTTA):
    """
    Dataset mapper for test-time augmentation that, on top of the augmented
    inputs produced by `DatasetMapperTTA`, appends one rotated copy of the
    image per angle in `cfg.TEST.AUG.ROTATION_ANGLES`.
    """

    def __init__(self, cfg):
        super().__init__(cfg=cfg)
        self.angles = cfg.TEST.AUG.ROTATION_ANGLES

    def __call__(self, dataset_dict):
        augmented = super().__call__(dataset_dict=dataset_dict)
        hwc_image = dataset_dict["image"].permute(1, 2, 0).numpy()
        for angle in self.angles:
            rotation = RandomRotation(angle=angle, expand=True)
            rotated_hwc, rotation_tfms = apply_transform_gens([rotation], np.copy(hwc_image))
            rotated_chw = torch.from_numpy(np.ascontiguousarray(rotated_hwc.transpose(2, 0, 1)))
            entry = copy.deepcopy(dataset_dict)
            # In DatasetMapperTTA, there is a pre_tfm transform (resize or no-op) that is
            # added at the beginning of each TransformList. That's '.transforms[0]'.
            pre_tfm = augmented[-1]["transforms"].transforms[0]
            entry["transforms"] = TransformList([pre_tfm] + rotation_tfms.transforms)
            entry["image"] = rotated_chw
            augmented.append(entry)
        return augmented


class DensePoseGeneralizedRCNNWithTTA(GeneralizedRCNNWithTTA):
    def __init__(self, cfg, model, transform_data, tta_mapper=None, batch_size=1):
        """
        Args:
            cfg (CfgNode):
            model (GeneralizedRCNN): a GeneralizedRCNN to apply TTA on.
            transform_data (DensePoseTransformData): contains symmetry label
                transforms used for horizontal flip
            tta_mapper (callable): takes a dataset dict and returns a list of
                augmented versions of the dataset dict. Defaults to
                `DatasetMapperTTA(cfg)`.
            batch_size (int): batch the augmented images into this batch size for inference.
        """
        self._transform_data = transform_data.to(model.device)
        super().__init__(cfg=cfg, model=model, tta_mapper=tta_mapper, batch_size=batch_size)

    # the implementation follows closely the one from detectron2/modeling
    def _inference_one_image(self, input):
        """
        Args:
            input (dict): one dataset dict with "image" field being a CHW tensor

        Returns:
            dict: one output dict
        """
        orig_shape = (input["height"], input["width"])
        # For some reason, resize with uint8 slightly increases box AP but decreases densepose AP
        input["image"] = input["image"].to(torch.uint8)
        augmented_inputs, tfms = self._get_augmented_inputs(input)
        # Detect boxes from all augmented versions, with the other roi heads
        # temporarily disabled
        with self._turn_off_roi_heads(["mask_on", "keypoint_on", "densepose_on"]):
            all_boxes, all_scores, all_classes = self._get_augmented_boxes(augmented_inputs, tfms)
        merged_instances = self._merge_detections(all_boxes, all_scores, all_classes, orig_shape)

        if not (self.cfg.MODEL.MASK_ON or self.cfg.MODEL.DENSEPOSE_ON):
            return {"instances": merged_instances}

        # Use the detected boxes to obtain new fields
        augmented_instances = self._rescale_detected_boxes(
            augmented_inputs, merged_instances, tfms
        )
        # run forward on the detected boxes
        outputs = self._batch_inference(augmented_inputs, augmented_instances)
        # Delete now useless variables to avoid being out of memory
        del augmented_inputs, augmented_instances
        # average the predictions
        if self.cfg.MODEL.MASK_ON:
            merged_instances.pred_masks = self._reduce_pred_masks(outputs, tfms)
        if self.cfg.MODEL.DENSEPOSE_ON:
            merged_instances.pred_densepose = self._reduce_pred_densepose(outputs, tfms)
        # postprocess
        merged_instances = detector_postprocess(merged_instances, *orig_shape)
        return {"instances": merged_instances}

    def _get_augmented_boxes(self, augmented_inputs, tfms):
        # Heavily based on detectron2/modeling/test_time_augmentation.py
        # Only difference is that RotationTransform is excluded from bbox computation
        # 1: forward with all augmented images
        outputs = self._batch_inference(augmented_inputs)
        # 2: union the results
        all_boxes, all_scores, all_classes = [], [], []
        for output, tfm in zip(outputs, tfms):
            if any(isinstance(t, RotationTransform) for t in tfm.transforms):
                # Some transforms can't compute bbox correctly
                continue
            # Need to inverse the transforms on boxes, to obtain results on original image
            boxes_tensor = output.pred_boxes.tensor
            restored = tfm.inverse().apply_box(boxes_tensor.cpu().numpy())
            all_boxes.append(torch.from_numpy(restored).to(boxes_tensor.device))
            all_scores.extend(output.scores)
            all_classes.extend(output.pred_classes)
        all_boxes = torch.cat(all_boxes, dim=0)
        return all_boxes, all_scores, all_classes

    def _reduce_pred_densepose(self, outputs, tfms):
        # Should apply inverse transforms on densepose preds.
        # We assume only rotation, resize & flip are used. pred_masks is a scale-invariant
        # representation, so we handle the other ones specially
        chart_attrs = ["coarse_segm", "fine_segm", "u", "v"]
        for idx, (output, tfm) in enumerate(zip(outputs, tfms)):
            for t in tfm.transforms:
                # _inverse_rotation is a no-op for anything but RotationTransform
                for attr in chart_attrs:
                    rotated_back = _inverse_rotation(
                        getattr(output.pred_densepose, attr), output.pred_boxes.tensor, t
                    )
                    setattr(output.pred_densepose, attr, rotated_back)
            if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
                output.pred_densepose = HFlipConverter.convert(
                    output.pred_densepose, self._transform_data
                )
            self._incremental_avg_dp(outputs[0].pred_densepose, output.pred_densepose, idx)
        return outputs[0].pred_densepose

    # incrementally computed average: u_(n + 1) = u_n + (x_(n+1) - u_n) / (n + 1).
    def _incremental_avg_dp(self, avg, new_el, idx):
        for attr in ["coarse_segm", "fine_segm", "u", "v"]:
            running = getattr(avg, attr)
            setattr(avg, attr, (running * idx + getattr(new_el, attr)) / (idx + 1))
            if idx:
                # Deletion of the > 0 index intermediary values to prevent GPU OOM
                setattr(new_el, attr, None)
        return avg


def _inverse_rotation(densepose_attrs, boxes, transform):
    """
    Resample outputs to image size and rotate back the densepose preds
    on the rotated images to the space of the original image.

    Returns `densepose_attrs` untouched when there are no boxes or when
    `transform` is not a `RotationTransform`; otherwise `densepose_attrs` is
    updated in place, entry by entry, and returned.
    """
    if len(boxes) == 0 or not isinstance(transform, RotationTransform):
        return densepose_attrs
    boxes = boxes.int().cpu().numpy()
    wh_boxes = boxes[:, 2:] - boxes[:, :2]  # bboxes in the rotated space
    inv_boxes = rotate_box_inverse(transform, boxes).astype(int)  # bboxes in original image
    wh_diff = (inv_boxes[:, 2:] - inv_boxes[:, :2] - wh_boxes) // 2  # diff between new/old bboxes
    rotation_matrix = torch.tensor([transform.rm_image]).to(device=densepose_attrs.device).float()
    rotation_matrix[:, :, -1] = 0
    # To apply grid_sample for rotation, we need to have enough space to fit the original and
    # rotated bboxes. l_bds and r_bds are the left/right bounds that will be used to
    # crop the difference once the rotation is done
    l_bds = np.maximum(0, -wh_diff)
    for i in range(len(densepose_attrs)):
        if min(wh_boxes[i]) <= 0:
            continue
        attr_i = densepose_attrs[[i]].clone()
        # 1. Interpolate densepose attribute to size of the rotated bbox
        attr_i = F.interpolate(attr_i, wh_boxes[i].tolist()[::-1], mode="bilinear")
        # 2. Pad the interpolated attribute so it has room for the original + rotated bbox
        attr_i = F.pad(attr_i, tuple(np.repeat(np.maximum(0, wh_diff[i]), 2)))
        # 3. Compute rotation grid and transform
        grid = F.affine_grid(rotation_matrix, size=attr_i.shape)
        attr_i = F.grid_sample(attr_i, grid)
        # 4. Compute right bounds and crop the densepose_attr to the size of the original bbox
        r_bds = attr_i.shape[2:][::-1] - l_bds[i]
        attr_i = attr_i[:, :, l_bds[i][1] : r_bds[1], l_bds[i][0] : r_bds[0]]
        if min(attr_i.shape) > 0:
            # Interpolate back to the original size of the densepose attribute
            attr_i = F.interpolate(attr_i, densepose_attrs.shape[-2:], mode="bilinear")
            # Adding a very small probability to the background class to fill padded zones
            attr_i[:, 0] += 1e-10
            densepose_attrs[i] = attr_i
    return densepose_attrs


def rotate_box_inverse(rot_tfm, rotated_box):
    """
    rotated_box is a N * 4 array of [x0, y0, x1, y1] boxes
    When a bbox is rotated, it gets bigger, because we need to surround the tilted bbox
    So when a bbox is rotated then inverse-rotated, it is much bigger than the original
    This function aims to invert the rotation on the box, but also resize it to its original size
    """
    # 1. Compute the inverse rotation of the rotated bboxes (bigger than the original)
    invrot_box = rot_tfm.inverse().apply_box(rotated_box)
    h = rotated_box[:, 3] - rotated_box[:, 1]
    w = rotated_box[:, 2] - rotated_box[:, 0]
    ih = invrot_box[:, 3] - invrot_box[:, 1]
    iw = invrot_box[:, 2] - invrot_box[:, 0]
    assert 2 * rot_tfm.abs_sin**2 != 1, "45 degrees angle can't be inverted"
    # 2. Inverse the corresponding computation in the rotation transform
    #    to get the original height/width of the rotated boxes
    denom = 1 - 2 * rot_tfm.abs_sin**2
    orig_h = (h * rot_tfm.abs_cos - w * rot_tfm.abs_sin) / denom
    orig_w = (w * rot_tfm.abs_cos - h * rot_tfm.abs_sin) / denom
    # 3. Resize the inverse-rotated bboxes to their original size
    x_margin = (iw - orig_w) / 2
    y_margin = (ih - orig_h) / 2
    invrot_box[:, 0] += x_margin
    invrot_box[:, 1] += y_margin
    invrot_box[:, 2] -= x_margin
    invrot_box[:, 3] -= y_margin

    return invrot_box


def initialize_module_params(module: nn.Module) -> None:
    """
    Initialize the parameters of `module` in place: parameters whose name
    contains "bias" are set to 0; otherwise parameters whose name contains
    "weight" get Kaiming-normal initialization (fan_out, ReLU).
    """
    for param_name, tensor in module.named_parameters():
        if "bias" in param_name:
            nn.init.constant_(tensor, 0)
        elif "weight" in param_name:
            nn.init.kaiming_normal_(tensor, mode="fan_out", nonlinearity="relu")
@dataclass
class DensePoseChartPredictorOutput:
    """
    Predictor output that contains segmentation and inner coordinates predictions for predefined
    body parts:
     * coarse segmentation, a tensor of shape [N, K, Hout, Wout]
     * fine segmentation, a tensor of shape [N, C, Hout, Wout]
     * U coordinates, a tensor of shape [N, C, Hout, Wout]
     * V coordinates, a tensor of shape [N, C, Hout, Wout]
    where
     - N is the number of instances
     - K is the number of coarse segmentation channels (
         2 = foreground / background,
         15 = one of 14 body parts / background)
     - C is the number of fine segmentation channels (
         24 fine body parts / background)
     - Hout and Wout are height and width of predictions
    """

    coarse_segm: torch.Tensor
    fine_segm: torch.Tensor
    u: torch.Tensor
    v: torch.Tensor

    def __len__(self):
        """
        Number of instances (N) in the output
        """
        return self.coarse_segm.size(0)

    def __getitem__(
        self, item: Union[int, slice, torch.BoolTensor]
    ) -> "DensePoseChartPredictorOutput":
        """
        Get outputs for the selected instance(s)

        Args:
            item (int or slice or tensor): selected items
        """
        # Indexing with an int drops the instance axis; put it back
        restore_instance_axis = isinstance(item, int)

        def select(tensor):
            chosen = tensor[item]
            return chosen.unsqueeze(0) if restore_instance_axis else chosen

        return DensePoseChartPredictorOutput(
            coarse_segm=select(self.coarse_segm),
            fine_segm=select(self.fine_segm),
            u=select(self.u),
            v=select(self.v),
        )

    def to(self, device: torch.device):
        """
        Transfers all tensors to the given device
        """
        return DensePoseChartPredictorOutput(
            coarse_segm=self.coarse_segm.to(device),
            fine_segm=self.fine_segm.to(device),
            u=self.u.to(device),
            v=self.v.to(device),
        )


@lru_cache(maxsize=None)
def decorate_predictor_output_class_with_confidences(BasePredictorOutput: type) -> type:
    """
    Create a new output class from an existing one by adding new attributes
    related to confidence estimation:
    - sigma_1 (tensor)
    - sigma_2 (tensor)
    - kappa_u (tensor)
    - kappa_v (tensor)
    - fine_segm_confidence (tensor)
    - coarse_segm_confidence (tensor)

    Details on confidence estimation parameters can be found in:
    N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
        Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
    A. Sanakoyeu et al., Transferring Dense Pose to Proximal Animal Classes, CVPR 2020

    The new class inherits the provided `BasePredictorOutput` class,
    its name is composed of the name of the provided class and
    "WithConfidences" suffix. Results are cached per base class, so the
    same class object is returned for repeated calls.

    Args:
        BasePredictorOutput (type): output type to which confidence data
            is to be added, assumed to be a dataclass
    Return:
        New dataclass derived from the provided one that has attributes
        for confidence estimation
    """
    confidence_fields = (
        "sigma_1",
        "sigma_2",
        "kappa_u",
        "kappa_v",
        "fine_segm_confidence",
        "coarse_segm_confidence",
    )

    PredictorOutput = make_dataclass(
        BasePredictorOutput.__name__ + "WithConfidences",
        fields=[(name, Optional[torch.Tensor], None) for name in confidence_fields],
        bases=(BasePredictorOutput,),
    )

    # add possibility to index PredictorOutput

    def slice_if_not_none(data, item):
        if data is None:
            return None
        chosen = data[item]
        return chosen.unsqueeze(0) if isinstance(item, int) else chosen

    def PredictorOutput_getitem(self, item):
        cls = type(self)
        # The base __getitem__ returns an instance of the base class; its
        # fields are forwarded and the confidence fields are sliced here.
        base_sliced = super(cls, self).__getitem__(item)
        confidences = {
            name: slice_if_not_none(getattr(self, name), item) for name in confidence_fields
        }
        return cls(**base_sliced.__dict__, **confidences)

    PredictorOutput.__getitem__ = PredictorOutput_getitem

    def PredictorOutput_to(self, device: torch.device):
        """
        Transfers all tensors to the given device
        """
        cls = type(self)
        base_moved = super(cls, self).to(device)  # pyre-ignore[16]

        def to_device_if_tensor(var: Any):
            return var.to(device) if isinstance(var, torch.Tensor) else var

        confidences = {
            name: to_device_if_tensor(getattr(self, name)) for name in confidence_fields
        }
        return cls(**base_moved.__dict__, **confidences)

    PredictorOutput.to = PredictorOutput_to
    return PredictorOutput
+ Thus the results are represented by two tensors: + - labels (tensor [H, W] of long): contains estimated label for each pixel of + the detection bounding box of size (H, W) + - uv (tensor [2, H, W] of float): contains estimated U and V coordinates + for each pixel of the detection bounding box of size (H, W) + """ + + labels: torch.Tensor + uv: torch.Tensor + + def to(self, device: torch.device): + """ + Transfers all tensors to the given device + """ + labels = self.labels.to(device) + uv = self.uv.to(device) + return DensePoseChartResult(labels=labels, uv=uv) + + +@dataclass +class DensePoseChartResultWithConfidences: + """ + We add confidence values to DensePoseChartResult + Thus the results are represented by two tensors: + - labels (tensor [H, W] of long): contains estimated label for each pixel of + the detection bounding box of size (H, W) + - uv (tensor [2, H, W] of float): contains estimated U and V coordinates + for each pixel of the detection bounding box of size (H, W) + Plus one [H, W] tensor of float for each confidence type + """ + + labels: torch.Tensor + uv: torch.Tensor + sigma_1: Optional[torch.Tensor] = None + sigma_2: Optional[torch.Tensor] = None + kappa_u: Optional[torch.Tensor] = None + kappa_v: Optional[torch.Tensor] = None + fine_segm_confidence: Optional[torch.Tensor] = None + coarse_segm_confidence: Optional[torch.Tensor] = None + + def to(self, device: torch.device): + """ + Transfers all tensors to the given device, except if their value is None + """ + + def to_device_if_tensor(var: Any): + if isinstance(var, torch.Tensor): + return var.to(device) + return var + + return DensePoseChartResultWithConfidences( + labels=self.labels.to(device), + uv=self.uv.to(device), + sigma_1=to_device_if_tensor(self.sigma_1), + sigma_2=to_device_if_tensor(self.sigma_2), + kappa_u=to_device_if_tensor(self.kappa_u), + kappa_v=to_device_if_tensor(self.kappa_v), + fine_segm_confidence=to_device_if_tensor(self.fine_segm_confidence), + 
@dataclass
class DensePoseChartResultQuantized:
    """
    Chart-based DensePose result with labels and quantized inner coordinates
    (U, V) packed into a single tensor. Each chart is a 2D manifold with an
    associated label, parameterized by U and V, both in [0, 1].
    The quantized coordinates Uq and Vq are uint8 values obtained as:
      Uq = U * 255 (hence 0 <= Uq <= 255)
      Vq = V * 255 (hence 0 <= Vq <= 255)
    The result is therefore a single tensor:
     - labels_uv_uint8 (tensor [3, H, W] of uint8): estimated label and
         quantized coordinates Uq and Vq for each pixel of the detection
         bounding box of size (H, W)
    """

    labels_uv_uint8: torch.Tensor

    def to(self, device: torch.device):
        """
        Build a new quantized result whose tensor is placed on the given device

        Args:
            device (torch.device): target device
        Return:
            A new `DensePoseChartResultQuantized`; the current instance is
            not modified
        """
        return DensePoseChartResultQuantized(labels_uv_uint8=self.labels_uv_uint8.to(device))
def compress_quantized_densepose_chart_result(
    result: DensePoseChartResultQuantized,
) -> DensePoseChartResultCompressed:
    """
    Encode a quantized DensePose chart-based result as a Base64 PNG string

    The [3, H, W] uint8 tensor is moved to CPU, rearranged to [H, W, 3],
    saved as an optimized PNG into an in-memory buffer and the PNG bytes
    are Base64-encoded.

    Args:
        result (DensePoseChartResultQuantized): quantized DensePose chart-based result
    Return:
        Compressed DensePose chart-based result (DensePoseChartResultCompressed)
    """
    import base64
    import numpy as np
    from io import BytesIO
    from PIL import Image

    chw_array = result.labels_uv_uint8.cpu().numpy()
    # PIL expects channels last, the tensor stores them first
    hwc_array = np.moveaxis(chw_array, 0, -1)
    png_buffer = BytesIO()
    Image.fromarray(hwc_array).save(png_buffer, format="png", optimize=True)
    encoded_png = base64.encodebytes(png_buffer.getvalue()).decode()
    return DensePoseChartResultCompressed(
        labels_uv_str=encoded_png, shape_chw=chw_array.shape
    )
@dataclass
class DensePoseEmbeddingPredictorOutput:
    """
    Predictor output holding embedding and coarse segmentation data:
     * embedding: float tensor of size [N, D, H, W], contains estimated embeddings
     * coarse_segm: float tensor of size [N, K, H, W]
    Here D = MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE
         K = MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
    """

    embedding: torch.Tensor
    coarse_segm: torch.Tensor

    def __len__(self):
        """
        Number of instances (N) in the output, taken from `coarse_segm`
        """
        return self.coarse_segm.size(0)

    def __getitem__(
        self, item: Union[int, slice, torch.BoolTensor]
    ) -> "DensePoseEmbeddingPredictorOutput":
        """
        Select outputs for one or several instances

        Args:
            item (int or slice or tensor): selected items
        Return:
            A new output restricted to the selection; an integer index keeps
            a leading instance dimension of size 1
        """
        picked_segm = self.coarse_segm[item]
        picked_embedding = self.embedding[item]
        if isinstance(item, int):
            # integer indexing drops the instance dimension, put it back
            picked_segm = picked_segm.unsqueeze(0)
            picked_embedding = picked_embedding.unsqueeze(0)
        return DensePoseEmbeddingPredictorOutput(
            coarse_segm=picked_segm, embedding=picked_embedding
        )

    def to(self, device: torch.device):
        """
        Build a new output whose tensors are placed on the given device
        """
        return DensePoseEmbeddingPredictorOutput(
            coarse_segm=self.coarse_segm.to(device), embedding=self.embedding.to(device)
        )
@lru_cache(maxsize=None)
def decorate_cse_predictor_output_class_with_confidences(BasePredictorOutput: type) -> type:
    """
    Derive a new output class from an existing one, extended with attributes
    related to confidence estimation:
    - coarse_segm_confidence (tensor)

    Details on confidence estimation parameters can be found in:
    N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
        Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
    A. Sanakoyeu et al., Transferring Dense Pose to Proximal Animal Classes, CVPR 2020

    The new class inherits the provided `BasePredictorOutput` class,
    its name is composed of the name of the provided class and
    "WithConfidences" suffix. Results are memoized per base class.

    Args:
        BasePredictorOutput (type): output type to which confidence data
            is to be added, assumed to be a dataclass
    Return:
        New dataclass derived from the provided one that has attributes
        for confidence estimation
    """

    confidence_fields = [
        ("coarse_segm_confidence", Optional[torch.Tensor], None),
    ]
    PredictorOutput = make_dataclass(
        BasePredictorOutput.__name__ + "WithConfidences",
        fields=confidence_fields,
        bases=(BasePredictorOutput,),
    )

    # indexing support: base fields are sliced by the base class,
    # the confidence field is sliced here

    def slice_if_not_none(data, item):
        if data is None:
            return None
        selected = data[item]
        # integer indexing drops the leading dimension, restore it
        return selected.unsqueeze(0) if isinstance(item, int) else selected

    def PredictorOutput_getitem(self, item):
        cls = type(self)
        base_sliced = super(cls, self).__getitem__(item)
        return cls(
            **base_sliced.__dict__,
            coarse_segm_confidence=slice_if_not_none(self.coarse_segm_confidence, item),
        )

    PredictorOutput.__getitem__ = PredictorOutput_getitem

    def PredictorOutput_to(self, device: torch.device):
        """
        Transfers all tensors to the given device
        """
        cls = type(self)
        base_on_device = super(cls, self).to(device)  # pyre-ignore[16]

        confidence = self.coarse_segm_confidence
        if isinstance(confidence, torch.Tensor):
            confidence = confidence.to(device)

        return cls(**base_on_device.__dict__, coarse_segm_confidence=confidence)

    PredictorOutput.to = PredictorOutput_to
    return PredictorOutput
class DensePoseDataRelative:
    """
    Dense pose relative annotations that can be applied to any bounding box:
        x - normalized X coordinates [0, 255] of annotated points
        y - normalized Y coordinates [0, 255] of annotated points
        i - body part labels 0,...,24 for annotated points
        u - body part U coordinates [0, 1] for annotated points
        v - body part V coordinates [0, 1] for annotated points
        segm - 256x256 segmentation mask with values 0,...,14
    To obtain absolute x and y data wrt some bounding box one needs to first
    divide the data by 256, multiply by the respective bounding box size
    and add bounding box offset:
        x_img = x0 + x_norm * w / 256.0
        y_img = y0 + y_norm * h / 256.0
    Segmentation masks are typically sampled to get image-based masks.

    Attributes `i`, `u`, `v`, `vertex_ids`, `mesh_id` and `segm` are set only
    when the corresponding keys are present in the annotation, so callers
    (and the methods below) probe for them with `hasattr`.
    """

    # Key for normalized X coordinates in annotation dict
    X_KEY = "dp_x"
    # Key for normalized Y coordinates in annotation dict
    Y_KEY = "dp_y"
    # Key for U part coordinates in annotation dict (used in chart-based annotations)
    U_KEY = "dp_U"
    # Key for V part coordinates in annotation dict (used in chart-based annotations)
    V_KEY = "dp_V"
    # Key for I point labels in annotation dict (used in chart-based annotations)
    I_KEY = "dp_I"
    # Key for segmentation mask in annotation dict
    S_KEY = "dp_masks"
    # Key for vertex ids (used in continuous surface embeddings annotations)
    VERTEX_IDS_KEY = "dp_vertex"
    # Key for mesh id (used in continuous surface embeddings annotations)
    MESH_NAME_KEY = "ref_model"
    # Number of body parts in segmentation masks
    N_BODY_PARTS = 14
    # Number of parts in point labels
    N_PART_LABELS = 24
    MASK_SIZE = 256

    def __init__(self, annotation, cleanup=False):
        """
        Args:
            annotation (dict): annotation with DensePose data; must contain
                X_KEY and Y_KEY, other keys are optional
            cleanup (bool): if True, DensePose keys are removed from
                `annotation` once the data has been extracted
        """
        self.x = torch.as_tensor(annotation[DensePoseDataRelative.X_KEY])
        self.y = torch.as_tensor(annotation[DensePoseDataRelative.Y_KEY])
        # chart-based (IUV) data is only stored when all three keys are present
        if (
            DensePoseDataRelative.I_KEY in annotation
            and DensePoseDataRelative.U_KEY in annotation
            and DensePoseDataRelative.V_KEY in annotation
        ):
            self.i = torch.as_tensor(annotation[DensePoseDataRelative.I_KEY])
            self.u = torch.as_tensor(annotation[DensePoseDataRelative.U_KEY])
            self.v = torch.as_tensor(annotation[DensePoseDataRelative.V_KEY])
        # CSE data is only stored when both vertex ids and mesh name are present
        if (
            DensePoseDataRelative.VERTEX_IDS_KEY in annotation
            and DensePoseDataRelative.MESH_NAME_KEY in annotation
        ):
            self.vertex_ids = torch.as_tensor(
                annotation[DensePoseDataRelative.VERTEX_IDS_KEY], dtype=torch.long
            )
            self.mesh_id = MeshCatalog.get_mesh_id(annotation[DensePoseDataRelative.MESH_NAME_KEY])
        if DensePoseDataRelative.S_KEY in annotation:
            self.segm = DensePoseDataRelative.extract_segmentation_mask(annotation)
        self.device = torch.device("cpu")
        if cleanup:
            DensePoseDataRelative.cleanup_annotation(annotation)

    def to(self, device):
        """
        Return the data on the given device; returns `self` if the data
        is already recorded as being on that device, a new instance otherwise
        """
        if self.device == device:
            return self
        # bypass __init__: there is no annotation dict to parse here
        new_data = DensePoseDataRelative.__new__(DensePoseDataRelative)
        new_data.x = self.x.to(device)
        new_data.y = self.y.to(device)
        for attr in ["i", "u", "v", "vertex_ids", "segm"]:
            if hasattr(self, attr):
                setattr(new_data, attr, getattr(self, attr).to(device))
        if hasattr(self, "mesh_id"):
            new_data.mesh_id = self.mesh_id
        new_data.device = device
        return new_data

    @staticmethod
    def extract_segmentation_mask(annotation):
        """
        Build a MASK_SIZE x MASK_SIZE float mask from the S_KEY annotation entry

        The entry can be a tensor (returned as is), a single encoded mask
        given as a dict (foreground pixels get label 1) or a sequence of
        encoded masks (pixels of the k-th non-empty mask get label k + 1,
        later masks overwrite earlier ones).
        """
        import pycocotools.mask as mask_utils

        # TODO: annotation instance is accepted if it contains either
        # DensePose segmentation or instance segmentation. However, here we
        # only rely on DensePose segmentation
        poly_specs = annotation[DensePoseDataRelative.S_KEY]
        if isinstance(poly_specs, torch.Tensor):
            # data is already given as mask tensors, no need to decode
            return poly_specs
        segm = torch.zeros((DensePoseDataRelative.MASK_SIZE,) * 2, dtype=torch.float32)
        if isinstance(poly_specs, dict):
            if poly_specs:
                mask = mask_utils.decode(poly_specs)
                segm[mask > 0] = 1
        else:
            for i, poly_i in enumerate(poly_specs):
                if poly_i:
                    mask_i = mask_utils.decode(poly_i)
                    segm[mask_i > 0] = i + 1
        return segm

    @staticmethod
    def validate_annotation(annotation):
        """
        Check that the annotation has the keys required to build an instance

        Return:
            Tuple (is_valid, reason): `reason` is None for valid annotations
            and a human-readable message otherwise
        """
        for key in [
            DensePoseDataRelative.X_KEY,
            DensePoseDataRelative.Y_KEY,
        ]:
            if key not in annotation:
                return False, "no {key} data in the annotation".format(key=key)
        valid_for_iuv_setting = all(
            key in annotation
            for key in [
                DensePoseDataRelative.I_KEY,
                DensePoseDataRelative.U_KEY,
                DensePoseDataRelative.V_KEY,
            ]
        )
        valid_for_cse_setting = all(
            key in annotation
            for key in [
                DensePoseDataRelative.VERTEX_IDS_KEY,
                DensePoseDataRelative.MESH_NAME_KEY,
            ]
        )
        if not valid_for_iuv_setting and not valid_for_cse_setting:
            return (
                False,
                "expected either {} (IUV setting) or {} (CSE setting) annotations".format(
                    ", ".join(
                        [
                            DensePoseDataRelative.I_KEY,
                            DensePoseDataRelative.U_KEY,
                            DensePoseDataRelative.V_KEY,
                        ]
                    ),
                    ", ".join(
                        [
                            DensePoseDataRelative.VERTEX_IDS_KEY,
                            DensePoseDataRelative.MESH_NAME_KEY,
                        ]
                    ),
                ),
            )
        return True, None

    @staticmethod
    def cleanup_annotation(annotation):
        """
        Remove all DensePose-related keys from the annotation, in place
        """
        for key in [
            DensePoseDataRelative.X_KEY,
            DensePoseDataRelative.Y_KEY,
            DensePoseDataRelative.I_KEY,
            DensePoseDataRelative.U_KEY,
            DensePoseDataRelative.V_KEY,
            DensePoseDataRelative.S_KEY,
            DensePoseDataRelative.VERTEX_IDS_KEY,
            DensePoseDataRelative.MESH_NAME_KEY,
        ]:
            if key in annotation:
                del annotation[key]

    def apply_transform(self, transforms, densepose_transform_data):
        """
        Apply image transforms to the annotated points and, if present,
        to the segmentation mask; the instance is modified in place
        """
        self._transform_pts(transforms, densepose_transform_data)
        if hasattr(self, "segm"):
            self._transform_segm(transforms, densepose_transform_data)

    def _transform_pts(self, transforms, dp_transform_data):
        import detectron2.data.transforms as T

        # NOTE: This assumes that HorizFlipTransform is the only one that does flip
        # an even number of flips cancels out, only the parity matters
        do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
        if do_hflip:
            self.x = self.MASK_SIZE - self.x
            if hasattr(self, "i"):
                self._flip_iuv_semantics(dp_transform_data)
            if hasattr(self, "vertex_ids"):
                self._flip_vertices()

        for t in transforms.transforms:
            if isinstance(t, T.RotationTransform):
                # rotate in the transform's (w, h) coordinates, then map back
                # to the normalized MASK_SIZE grid
                xy_scale = np.array((t.w, t.h)) / DensePoseDataRelative.MASK_SIZE
                xy = t.apply_coords(np.stack((self.x, self.y), axis=1) * xy_scale)
                self.x, self.y = torch.tensor(xy / xy_scale, dtype=self.x.dtype).T

    def _flip_iuv_semantics(self, dp_transform_data: DensePoseTransformData) -> None:
        # labels are compared against a snapshot so that points relabeled
        # in an earlier iteration are not picked up again
        i_old = self.i.clone()
        uv_symmetries = dp_transform_data.uv_symmetries
        pt_label_symmetries = dp_transform_data.point_label_symmetries
        for i in range(self.N_PART_LABELS):
            if i + 1 in i_old:
                annot_indices_i = i_old == i + 1
                if pt_label_symmetries[i + 1] != i + 1:
                    self.i[annot_indices_i] = pt_label_symmetries[i + 1]
                # U, V in [0, 1] are turned into integer lookup indices
                u_loc = (self.u[annot_indices_i] * 255).long()
                v_loc = (self.v[annot_indices_i] * 255).long()
                self.u[annot_indices_i] = uv_symmetries["U_transforms"][i][v_loc, u_loc].to(
                    device=self.u.device
                )
                self.v[annot_indices_i] = uv_symmetries["V_transforms"][i][v_loc, u_loc].to(
                    device=self.v.device
                )

    def _flip_vertices(self):
        mesh_name = MeshCatalog.get_mesh_name(self.mesh_id)
        mesh_info = MeshCatalog[mesh_name]
        if mesh_info.symmetry is None:
            # without symmetry data the vertex ids cannot be remapped; fail
            # with an explicit message instead of indexing into None
            raise ValueError(
                "Cannot flip vertex ids: no symmetry data for mesh {!r}".format(mesh_name)
            )
        mesh_symmetry = load_mesh_symmetry(mesh_info.symmetry)
        self.vertex_ids = mesh_symmetry["vertex_transforms"][self.vertex_ids]

    def _transform_segm(self, transforms, dp_transform_data):
        import detectron2.data.transforms as T

        # NOTE: This assumes that HorizFlipTransform is the only one that does flip
        do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
        if do_hflip:
            self.segm = torch.flip(self.segm, [1])
            self._flip_segm_semantics(dp_transform_data)

        for t in transforms.transforms:
            if isinstance(t, T.RotationTransform):
                self._transform_segm_rotation(t)

    def _flip_segm_semantics(self, dp_transform_data):
        # relabel against a snapshot so that swapped labels are not swapped back
        old_segm = self.segm.clone()
        mask_label_symmetries = dp_transform_data.mask_label_symmetries
        for i in range(self.N_BODY_PARTS):
            if mask_label_symmetries[i + 1] != i + 1:
                self.segm[old_segm == i + 1] = mask_label_symmetries[i + 1]

    def _transform_segm_rotation(self, rotation):
        # resize to the rotation's (h, w), rotate, then resize back to MASK_SIZE
        self.segm = F.interpolate(self.segm[None, None, :], (rotation.h, rotation.w)).numpy()
        self.segm = torch.tensor(rotation.apply_segmentation(self.segm[0, 0]))[None, None, :]
        self.segm = F.interpolate(self.segm, [DensePoseDataRelative.MASK_SIZE] * 2)[0, 0]
+import torch + +from densepose.structures.data_relative import DensePoseDataRelative + + +class DensePoseList: + + _TORCH_DEVICE_CPU = torch.device("cpu") + + def __init__(self, densepose_datas, boxes_xyxy_abs, image_size_hw, device=_TORCH_DEVICE_CPU): + assert len(densepose_datas) == len( + boxes_xyxy_abs + ), "Attempt to initialize DensePoseList with {} DensePose datas " "and {} boxes".format( + len(densepose_datas), len(boxes_xyxy_abs) + ) + self.densepose_datas = [] + for densepose_data in densepose_datas: + assert isinstance(densepose_data, DensePoseDataRelative) or densepose_data is None, ( + "Attempt to initialize DensePoseList with DensePose datas " + "of type {}, expected DensePoseDataRelative".format(type(densepose_data)) + ) + densepose_data_ondevice = ( + densepose_data.to(device) if densepose_data is not None else None + ) + self.densepose_datas.append(densepose_data_ondevice) + self.boxes_xyxy_abs = boxes_xyxy_abs.to(device) + self.image_size_hw = image_size_hw + self.device = device + + def to(self, device): + if self.device == device: + return self + return DensePoseList(self.densepose_datas, self.boxes_xyxy_abs, self.image_size_hw, device) + + def __iter__(self): + return iter(self.densepose_datas) + + def __len__(self): + return len(self.densepose_datas) + + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_instances={}, ".format(len(self.densepose_datas)) + s += "image_width={}, ".format(self.image_size_hw[1]) + s += "image_height={})".format(self.image_size_hw[0]) + return s + + def __getitem__(self, item): + if isinstance(item, int): + densepose_data_rel = self.densepose_datas[item] + return densepose_data_rel + elif isinstance(item, slice): + densepose_datas_rel = self.densepose_datas[item] + boxes_xyxy_abs = self.boxes_xyxy_abs[item] + return DensePoseList( + densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device + ) + elif isinstance(item, torch.Tensor) and (item.dtype == torch.bool): + densepose_datas_rel = 
class Mesh:
    """
    Triangular mesh with optional auxiliary data (geodesic distances,
    symmetry, texture coordinates). Attributes that were not passed to the
    constructor are loaded on first access from the files referenced by
    `mesh_info`, when `mesh_info` is provided.
    """

    def __init__(
        self,
        vertices: Optional[torch.Tensor] = None,
        faces: Optional[torch.Tensor] = None,
        geodists: Optional[torch.Tensor] = None,
        symmetry: Optional[Dict[str, torch.Tensor]] = None,
        texcoords: Optional[torch.Tensor] = None,
        mesh_info: Optional[MeshInfo] = None,
        device: Optional[torch.device] = None,
    ):
        """
        Args:
            vertices (tensor [N, 3] of float32): vertex coordinates in 3D
            faces (tensor [M, 3] of long): triangular face represented as 3
                vertex indices
            geodists (tensor [N, N] of float32): geodesic distances from
                vertex `i` to vertex `j` (optional, default: None)
            symmetry (dict: str -> tensor): various mesh symmetry data:
                - "vertex_transforms": vertex mapping under horizontal flip,
                  tensor of size [N] of type long; vertex `i` is mapped to
                  vertex `tensor[i]` (optional, default: None)
            texcoords (tensor [N, 2] of float32): texture coordinates, i.e. global
                and normalized mesh UVs (optional, default: None)
            mesh_info (MeshInfo type): necessary to load the attributes on-the-go,
                can be used instead of passing all the variables one by one
            device (torch.device): device of the Mesh. If not provided, will use
                the device of the vertices
        """
        self._vertices = vertices
        self._faces = faces
        self._geodists = geodists
        self._symmetry = symmetry
        self._texcoords = texcoords
        self.mesh_info = mesh_info
        self.device = device

        assert self._vertices is not None or self.mesh_info is not None

        all_fields = [self._vertices, self._faces, self._geodists, self._texcoords]

        # device resolution order: explicit argument, first provided tensor,
        # first symmetry tensor, CPU
        if self.device is None:
            for field in all_fields:
                if field is not None:
                    self.device = field.device
                    break
            if self.device is None and symmetry is not None:
                for key in symmetry:
                    self.device = symmetry[key].device
                    break
            self.device = torch.device("cpu") if self.device is None else self.device

        assert all([var.device == self.device for var in all_fields if var is not None])
        if symmetry:
            assert all(symmetry[key].device == self.device for key in symmetry)
        # compare against None: the truth value of a tensor with more than
        # one element is ambiguous and raises RuntimeError
        if texcoords is not None and vertices is not None:
            assert len(vertices) == len(texcoords)

    def to(self, device: torch.device):
        """
        Return a new Mesh with all already available tensors copied to the
        given device; attributes that are not loaded yet stay lazy
        """
        device_symmetry = self._symmetry
        if device_symmetry:
            device_symmetry = {key: value.to(device) for key, value in device_symmetry.items()}
        return Mesh(
            _maybe_copy_to_device(self._vertices, device),
            _maybe_copy_to_device(self._faces, device),
            _maybe_copy_to_device(self._geodists, device),
            device_symmetry,
            _maybe_copy_to_device(self._texcoords, device),
            self.mesh_info,
            device,
        )

    @property
    def vertices(self):
        # loaded lazily from `mesh_info.data` on first access
        if self._vertices is None and self.mesh_info is not None:
            self._vertices = load_mesh_data(self.mesh_info.data, "vertices", self.device)
        return self._vertices

    @property
    def faces(self):
        # loaded lazily from `mesh_info.data` on first access
        if self._faces is None and self.mesh_info is not None:
            self._faces = load_mesh_data(self.mesh_info.data, "faces", self.device)
        return self._faces

    @property
    def geodists(self):
        # loaded lazily from `mesh_info.geodists` on first access
        if self._geodists is None and self.mesh_info is not None:
            self._geodists = load_mesh_auxiliary_data(self.mesh_info.geodists, self.device)
        return self._geodists

    @property
    def symmetry(self):
        # loaded lazily from `mesh_info.symmetry` on first access
        if self._symmetry is None and self.mesh_info is not None:
            self._symmetry = load_mesh_symmetry(self.mesh_info.symmetry, self.device)
        return self._symmetry

    @property
    def texcoords(self):
        # loaded lazily from `mesh_info.texcoords` on first access
        if self._texcoords is None and self.mesh_info is not None:
            self._texcoords = load_mesh_auxiliary_data(self.mesh_info.texcoords, self.device)
        return self._texcoords

    def get_geodists(self):
        """
        Return geodesic distances, computing them if they are not available
        (the computation is not implemented yet and yields None)
        """
        geodists = self.geodists
        if geodists is None:
            # `geodists` is a read-only property, the value has to be stored
            # in the backing attribute
            geodists = self._geodists = self._compute_geodists()
        return geodists

    def _compute_geodists(self):
        # TODO: compute using Laplace-Beltrami
        geodists = None
        return geodists
def normalized_coords_transform(x0, y0, w, h):
    """
    Create a point transform for a box with top left corner (x0, y0),
    width w and height h: the top left corner is mapped to (-1, -1) and
    the bottom right corner to (1, 1). Used for torch.grid_sample to
    initialize the grid

    Return:
        Function that takes a point `p` (indexable, x at 0 and y at 1)
        and returns the tuple of its normalized coordinates
    """

    def f(p):
        x_normalized = 2 * (p[0] - x0) / w - 1
        y_normalized = 2 * (p[1] - y0) / h - 1
        return (x_normalized, y_normalized)

    return f
def _resolve_field_type(typespec: Optional[str]):
    """
    Return the callable used to convert a textual field value for comparison.

    Args:
        typespec: name of a builtin (e.g. "int", "str") or None
    Returns:
        The builtin with that name, or `str` when no type was specified
    Raises:
        AttributeError: if `typespec` is not the name of a builtin
    """
    import builtins

    # NOTE(review): any builtin name is accepted here, not only the
    # documented "int" / "str"; a specifier such as `x:eval=...` would make
    # the predicate call `eval` on the value. Left as is to keep existing
    # specifiers working - restrict if specifiers can come from untrusted input.
    return getattr(builtins, typespec) if typespec is not None else str


class EntrySelector:
    """
    Base class for entry selectors
    """

    @staticmethod
    def from_string(spec: str) -> "EntrySelector":
        """
        Create a selector from its textual specifier.

        Args:
            spec: "*" to accept all entries, otherwise a field specifier
                understood by `FieldEntrySelector`
        Returns:
            `AllEntrySelector` for "*", `FieldEntrySelector` otherwise
        Raises:
            ValueError: if the field specifier is malformed
        """
        if spec == AllEntrySelector.SPECIFIER:
            return AllEntrySelector()
        return FieldEntrySelector(spec)


class AllEntrySelector(EntrySelector):
    """
    Selector that accepts all entries
    """

    SPECIFIER = "*"

    def __call__(self, entry):
        return True


class FieldEntrySelector(EntrySelector):
    """
    Selector that accepts only entries that match provided field
    specifier(s). Only a limited set of specifiers is supported for now:
      <specifiers>::=<specifier>[<comma><specifiers>]
      <specifier>::=<field_name>[<type_delim><type>]<equal><value_or_range>
      <field_name> is a valid identifier
      <type> ::= "int" | "str"
      <equal> ::= "="
      <comma> ::= ","
      <type_delim> ::= ":"
      <value_or_range> ::= <value> | <range>
      <range> ::= <value><range_delim><value>
      <range_delim> ::= "-"
      <value> is a string without spaces and special symbols
        (e.g. <comma>, <equal>, <type_delim>, <range_delim>)

    Example: "id:int=3-7,name=foo" accepts entries whose "id" is an integer
    between 3 and 7 (inclusive) and whose "name" equals "foo".
    """

    _SPEC_DELIM = ","
    _TYPE_DELIM = ":"
    _RANGE_DELIM = "-"
    _EQUAL = "="
    _ERROR_PREFIX = "Invalid field selector specifier"

    class _FieldEntryValuePredicate:
        """
        Predicate that checks strict equality for the specified entry field
        """

        def __init__(self, name: str, typespec: Optional[str], value: str):
            self.name = name
            self.type = _resolve_field_type(typespec)
            self.value = value

        def __call__(self, entry):
            return entry[self.name] == self.type(self.value)

    class _FieldEntryRangePredicate:
        """
        Predicate that checks whether an entry field falls into the specified range
        """

        def __init__(self, name: str, typespec: Optional[str], vmin: str, vmax: str):
            self.name = name
            self.type = _resolve_field_type(typespec)
            self.vmin = vmin
            self.vmax = vmax

        def __call__(self, entry):
            # both range ends are inclusive
            return (entry[self.name] >= self.type(self.vmin)) and (
                entry[self.name] <= self.type(self.vmax)
            )

    def __init__(self, spec: str):
        """
        Args:
            spec: comma-separated field specifiers, see the class docstring
        Raises:
            ValueError: if a specifier is malformed
        """
        self._predicates = self._parse_specifier_into_predicates(spec)

    def __call__(self, entry: Dict[str, Any]):
        """
        Return True if the entry satisfies every parsed predicate.
        """
        return all(predicate(entry) for predicate in self._predicates)

    def _parse_specifier_into_predicates(self, spec: str):
        """
        Split the specifier on commas and build one predicate per part.
        """
        predicates = []
        for subspec in spec.split(self._SPEC_DELIM):
            eq_idx = subspec.find(self._EQUAL)
            if eq_idx == 0:
                self._parse_error(f'"{subspec}", field name is empty!')
            if eq_idx < 0:
                self._parse_error(
                    f'"{subspec}", should have format <field>=<value_or_range>!'
                )
            field_name, field_type = self._parse_field_name_type(subspec[:eq_idx])
            field_value_or_range = subspec[eq_idx + 1 :]
            if self._is_range_spec(field_value_or_range):
                vmin, vmax = self._get_range_spec(field_value_or_range)
                predicate = FieldEntrySelector._FieldEntryRangePredicate(
                    field_name, field_type, vmin, vmax
                )
            else:
                predicate = FieldEntrySelector._FieldEntryValuePredicate(
                    field_name, field_type, field_value_or_range
                )
            predicates.append(predicate)
        return predicates

    def _parse_field_name_type(self, field_name_with_type: str) -> Tuple[str, Optional[str]]:
        """
        Split "<name>[:<type>]" into the field name and the optional type name.
        """
        type_delim_idx = field_name_with_type.find(self._TYPE_DELIM)
        if type_delim_idx == 0:
            self._parse_error(f'"{field_name_with_type}", field name is empty!')
        if type_delim_idx < 0:
            return field_name_with_type, None
        field_name = field_name_with_type[:type_delim_idx]
        field_type = field_name_with_type[type_delim_idx + 1 :]
        return field_name, field_type

    def _is_range_spec(self, field_value_or_range):
        """
        Return True if the value contains a range delimiter after its first
        character (a leading "-" is not treated as a range delimiter).
        """
        delim_idx = field_value_or_range.find(self._RANGE_DELIM)
        return delim_idx > 0

    def _get_range_spec(self, field_value_or_range):
        """
        Split "<vmin>-<vmax>" at the first range delimiter.

        Raises:
            ValueError: if the value is not a range
        """
        if not self._is_range_spec(field_value_or_range):
            # report the offending value itself, not the variable name
            self._parse_error(f'"{field_value_or_range}", range of values expected!')
        delim_idx = field_value_or_range.find(self._RANGE_DELIM)
        vmin = field_value_or_range[:delim_idx]
        vmax = field_value_or_range[delim_idx + 1 :]
        return vmin, vmax

    def _parse_error(self, msg):
        """
        Raise a ValueError carrying the common error prefix.
        """
        raise ValueError(f"{self._ERROR_PREFIX}: {msg}")
def verbosity_to_level(verbosity) -> int:
    """
    Translate a verbosity count into a `logging` level.

    Args:
        verbosity: None or a number; 0 maps to WARNING, 1 to INFO and
            anything >= 2 to DEBUG
    Returns:
        The matching `logging` level; WARNING for None and for any value
        not covered above
    """
    if verbosity is None or verbosity == 0:
        return logging.WARNING
    if verbosity == 1:
        return logging.INFO
    return logging.DEBUG if verbosity >= 2 else logging.WARNING
class MatrixVisualizer:
    """
    Base visualizer for matrix data
    """

    def __init__(
        self,
        inplace=True,
        cmap=cv2.COLORMAP_PARULA,
        val_scale=1.0,
        alpha=0.7,
        interp_method_matrix=cv2.INTER_LINEAR,
        interp_method_mask=cv2.INTER_NEAREST,
    ):
        """
        Args:
            inplace: if False, the image is blanked before drawing
                (see `visualize`)
            cmap: OpenCV colormap applied to the scaled matrix
            val_scale: factor applied to matrix values before they are
                clipped to [0, 255]
            alpha: weight of the colormapped matrix in the blend
            interp_method_matrix: interpolation used to resize the matrix
            interp_method_mask: interpolation used to resize the mask
        """
        self.inplace = inplace
        self.cmap = cmap
        self.val_scale = val_scale
        self.alpha = alpha
        self.interp_method_matrix = interp_method_matrix
        self.interp_method_mask = interp_method_mask

    def visualize(self, image_bgr, mask, matrix, bbox_xywh):
        """
        Blend the colormapped matrix into the bounding box region of the image.

        Args:
            image_bgr: uint8 image with 3 channels; it is modified in place
            mask: 2D uint8 array, zero marks background
            matrix: 2D array with the values to visualize
            bbox_xywh: bounding box (x, y, w, h), truncated to integers
        Returns:
            The input image itself if the box has no area, otherwise a uint8
            copy of the modified image
        """
        self._check_image(image_bgr)
        self._check_mask_matrix(mask, matrix)
        if self.inplace:
            image_target_bgr = image_bgr
        else:
            # NOTE(review): this is not a copy - the caller's array is zeroed
            # in place, on every call. Kept as is; confirm that callers which
            # discard the return value rely on this.
            image_target_bgr = image_bgr
            image_target_bgr *= 0
        x, y, w, h = [int(v) for v in bbox_xywh]
        if w <= 0 or h <= 0:
            return image_bgr
        mask, matrix = self._resize(mask, matrix, w, h)
        mask_bg = np.tile((mask == 0)[:, :, np.newaxis], [1, 1, 3])
        matrix_scaled = matrix.astype(np.float32) * self.val_scale
        _EPSILON = 1e-6
        if np.any(matrix_scaled > 255 + _EPSILON):
            logger = logging.getLogger(__name__)
            logger.warning(
                f"Matrix has values > {255 + _EPSILON} after " f"scaling, clipping to [0..255]"
            )
        matrix_scaled_8u = matrix_scaled.clip(0, 255).astype(np.uint8)
        matrix_vis = cv2.applyColorMap(matrix_scaled_8u, self.cmap)
        # background pixels keep the image content, so blending leaves them unchanged
        matrix_vis[mask_bg] = image_target_bgr[y : y + h, x : x + w, :][mask_bg]
        image_target_bgr[y : y + h, x : x + w, :] = (
            image_target_bgr[y : y + h, x : x + w, :] * (1.0 - self.alpha) + matrix_vis * self.alpha
        )
        return image_target_bgr.astype(np.uint8)

    def _resize(self, mask, matrix, w, h):
        """
        Resize mask and matrix to (w, h) if their sizes differ.
        """
        # The interpolation flag must be passed by keyword: the third
        # positional parameter of cv2.resize is `dst`, not `interpolation`.
        if (w != mask.shape[1]) or (h != mask.shape[0]):
            mask = cv2.resize(mask, (w, h), interpolation=self.interp_method_mask)
        if (w != matrix.shape[1]) or (h != matrix.shape[0]):
            matrix = cv2.resize(matrix, (w, h), interpolation=self.interp_method_matrix)
        return mask, matrix

    def _check_image(self, image_rgb):
        """
        Check that the image is a 3-channel uint8 array.
        """
        assert len(image_rgb.shape) == 3
        assert image_rgb.shape[2] == 3
        assert image_rgb.dtype == np.uint8

    def _check_mask_matrix(self, mask, matrix):
        """
        Check that mask and matrix are 2D and that the mask is uint8.
        """
        assert len(matrix.shape) == 2
        assert len(mask.shape) == 2
        assert mask.dtype == np.uint8
class CompoundVisualizer:
    """
    Applies a sequence of visualizers one after another to the same image.
    """

    def __init__(self, visualizers):
        self.visualizers = visualizers

    def visualize(self, image_bgr, data):
        """
        Feed the image through every visualizer, giving the i-th visualizer
        the i-th element of `data`; each one receives the previous output.
        """
        n_data = len(data)
        n_visualizers = len(self.visualizers)
        assert (
            n_data == n_visualizers
        ), f"The number of datas {n_data} should match the number of visualizers {n_visualizers}"
        result = image_bgr
        for position, current in enumerate(self.visualizers):
            result = current.visualize(result, data[position])
        return result

    def __str__(self):
        names = [str(item) for item in self.visualizers]
        return "Compound Visualizer [{}]".format(", ".join(names))
class BoundingBoxVisualizer:
    """
    Draws every bounding box of a collection with a `RectangleVisualizer`.
    """

    def __init__(self):
        self.rectangle_visualizer = RectangleVisualizer()

    def visualize(self, image_bgr, boxes_xywh):
        """
        Draw each box in turn and return the resulting image.
        """
        canvas = image_bgr
        for box in boxes_xywh:
            canvas = self.rectangle_visualizer.visualize(canvas, box)
        return canvas


class ScoredBoundingBoxVisualizer:
    """
    Draws bounding boxes together with their scores, the score text being
    placed at the (x, y) of each box.
    """

    def __init__(self, bbox_visualizer_params=None, score_visualizer_params=None, **kwargs):
        # None stands for "no extra parameters"
        rectangle_kwargs = {} if bbox_visualizer_params is None else bbox_visualizer_params
        text_kwargs = {} if score_visualizer_params is None else score_visualizer_params
        self.visualizer_bbox = RectangleVisualizer(**rectangle_kwargs)
        self.visualizer_score = TextVisualizer(**text_kwargs)

    def visualize(self, image_bgr, scored_bboxes):
        """
        Args:
            image_bgr: image to draw on
            scored_bboxes: pair (boxes_xywh, box_scores) of equal length
        Returns:
            The image returned by the last visualizer call
        """
        boxes_xywh, box_scores = scored_bboxes
        n_boxes = len(boxes_xywh)
        n_scores = len(box_scores)
        assert (
            n_boxes == n_scores
        ), f"Number of bounding boxes {n_boxes} should be equal to the number of scores {n_scores}"
        canvas = image_bgr
        for index, box in enumerate(boxes_xywh):
            canvas = self.visualizer_bbox.visualize(canvas, box)
            label = f"{box_scores[index]:6.4f}"
            anchor_xy = (box[0], box[1])
            canvas = self.visualizer_score.visualize(canvas, label, anchor_xy)
        return canvas
class DensePoseDataPointsVisualizer:
    """
    Draws DensePose annotation points, optionally colored by a per-point value.
    """

    def __init__(self, densepose_data_to_value_fn=None, cmap=cv2.COLORMAP_PARULA, **kwargs):
        """
        Args:
            densepose_data_to_value_fn: optional callable that maps DensePose
                data to an array accepted by `cv2.applyColorMap`; if None,
                all points are drawn in the default color
            cmap: OpenCV colormap used to turn the values into colors
        """
        self.points_visualizer = PointsVisualizer()
        self.densepose_data_to_value_fn = densepose_data_to_value_fn
        self.cmap = cmap

    def visualize(
        self,
        image_bgr: Image,
        bbox_densepose_datas: Optional[Tuple[Iterable[Boxes], Iterable[DensePoseDataRelative]]],
    ) -> Image:
        """
        Args:
            image_bgr: image to draw on
            bbox_densepose_datas: pair (boxes in XYWH format, DensePose data),
                or None if there is nothing to draw
        Returns:
            The image with the points drawn
        """
        if bbox_densepose_datas is None:
            return image_bgr
        for bbox_xywh, densepose_data in zip(*bbox_densepose_datas):
            x0, y0, w, h = bbox_xywh.numpy()
            # point coordinates are rescaled by w / 255 and h / 255 and
            # shifted by the box origin
            x = densepose_data.x.numpy() * w / 255.0 + x0
            y = densepose_data.y.numpy() * h / 255.0 + y0
            # cv2.circle needs integer pixel coordinates; truncate and
            # convert to plain Python ints
            pts_xy = list(zip(x.astype(int).tolist(), y.astype(int).tolist()))
            if self.densepose_data_to_value_fn is None:
                image_bgr = self.points_visualizer.visualize(image_bgr, pts_xy)
            else:
                v = self.densepose_data_to_value_fn(densepose_data)
                img_colors_bgr = cv2.applyColorMap(v, self.cmap)
                # one flat list of integer channel values per point
                colors_bgr = [
                    [int(channel) for channel in img_color_bgr.ravel()]
                    for img_color_bgr in img_colors_bgr
                ]
                image_bgr = self.points_visualizer.visualize(image_bgr, pts_xy, colors_bgr)
        return image_bgr
class DensePoseOutputsVisualizer:
    """
    Visualizes one of the I (fine segmentation), U or V channels of chart
    predictor outputs inside the corresponding bounding boxes.
    """

    def __init__(
        self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7, to_visualize=None, **kwargs
    ):
        """
        Args:
            inplace, cmap, alpha: forwarded to `MatrixVisualizer`
            to_visualize: which channel to show, one of "I", "U", "V"
        """
        # Membership in a tuple, not in the string "IUV": the substring test
        # accepted "", "IU", "UV", ... and raised TypeError for None.
        assert to_visualize in ("I", "U", "V"), "can only visualize IUV"
        self.to_visualize = to_visualize

        if self.to_visualize == "I":
            # spread part labels over the [0, 255] colormap range
            val_scale = 255.0 / DensePoseDataRelative.N_PART_LABELS
        else:
            val_scale = 1.0
        self.mask_visualizer = MatrixVisualizer(
            inplace=inplace, cmap=cmap, val_scale=val_scale, alpha=alpha
        )

    def visualize(
        self,
        image_bgr: Image,
        dp_output_with_bboxes: Tuple[Optional[DensePoseChartPredictorOutput], Optional[Boxes]],
    ) -> Image:
        """
        Args:
            image_bgr: image to draw on
            dp_output_with_bboxes: pair (predictor output, boxes in XYWH
                format); if either is None the image is returned unchanged
        Returns:
            The image with the selected channel visualized for every box
        """
        densepose_output, bboxes_xywh = dp_output_with_bboxes
        if densepose_output is None or bboxes_xywh is None:
            return image_bgr

        assert isinstance(
            densepose_output, DensePoseChartPredictorOutput
        ), "DensePoseChartPredictorOutput expected, {} encountered".format(type(densepose_output))

        S = densepose_output.coarse_segm
        I = densepose_output.fine_segm  # noqa
        U = densepose_output.u
        V = densepose_output.v
        N = S.size(0)
        assert N == I.size(
            0
        ), "densepose outputs S {} and I {}" " should have equal first dim size".format(
            S.size(), I.size()
        )
        assert N == U.size(
            0
        ), "densepose outputs S {} and U {}" " should have equal first dim size".format(
            S.size(), U.size()
        )
        assert N == V.size(
            0
        ), "densepose outputs S {} and V {}" " should have equal first dim size".format(
            S.size(), V.size()
        )
        assert N == len(
            bboxes_xywh
        ), "number of bounding boxes {}" " should be equal to first dim size of outputs {}".format(
            len(bboxes_xywh), N
        )
        # channel to read for the "U" / "V" modes; unused for "I"
        uv_source = U if self.to_visualize == "U" else V
        for n in range(N):
            Sn = S[n].argmax(dim=0)
            # fine segmentation label, zeroed where the coarse segmentation is background
            In = I[n].argmax(dim=0) * (Sn > 0).long()
            segmentation = In.cpu().numpy().astype(np.uint8)
            mask = np.zeros(segmentation.shape, dtype=np.uint8)
            mask[segmentation > 0] = 1
            bbox_xywh = bboxes_xywh[n]

            if self.to_visualize == "I":
                vis = segmentation
            else:
                U_or_Vn = uv_source[n].cpu().numpy().astype(np.float32)
                vis = np.zeros(segmentation.shape, dtype=np.float32)
                # for every part take the values from that part's own channel
                for partId in range(U_or_Vn.shape[0]):
                    part_mask = segmentation == partId
                    vis[part_mask] = U_or_Vn[partId][part_mask].clip(0, 1) * 255

            image_bgr = self.mask_visualizer.visualize(image_bgr, mask, vis, bbox_xywh)

        return image_bgr
All Rights Reserved +import json +import numpy as np +from functools import lru_cache +from typing import Dict, List, Optional, Tuple +import cv2 +import torch + +from detectron2.utils.file_io import PathManager + +from densepose.modeling import build_densepose_embedder +from densepose.modeling.cse.utils import get_closest_vertices_mask_from_ES + +from ..data.utils import get_class_to_mesh_name_mapping +from ..structures import DensePoseEmbeddingPredictorOutput +from ..structures.mesh import create_mesh +from .base import Boxes, Image, MatrixVisualizer +from .densepose_results_textures import get_texture_atlas + + +@lru_cache() +def get_xyz_vertex_embedding(mesh_name: str, device: torch.device): + if mesh_name == "smpl_27554": + embed_path = PathManager.get_local_path( + "https://dl.fbaipublicfiles.com/densepose/data/cse/mds_d=256.npy" + ) + embed_map, _ = np.load(embed_path, allow_pickle=True) + embed_map = torch.tensor(embed_map).float()[:, 0] + embed_map -= embed_map.min() + embed_map /= embed_map.max() + else: + mesh = create_mesh(mesh_name, device) + embed_map = mesh.vertices.sum(dim=1) + embed_map -= embed_map.min() + embed_map /= embed_map.max() + embed_map = embed_map**2 + return embed_map + + +class DensePoseOutputsVertexVisualizer: + def __init__( + self, + cfg, + inplace=True, + cmap=cv2.COLORMAP_JET, + alpha=0.7, + device="cuda", + default_class=0, + **kwargs, + ): + self.mask_visualizer = MatrixVisualizer( + inplace=inplace, cmap=cmap, val_scale=1.0, alpha=alpha + ) + self.class_to_mesh_name = get_class_to_mesh_name_mapping(cfg) + self.embedder = build_densepose_embedder(cfg) + self.device = torch.device(device) + self.default_class = default_class + + self.mesh_vertex_embeddings = { + mesh_name: self.embedder(mesh_name).to(self.device) + for mesh_name in self.class_to_mesh_name.values() + if self.embedder.has_embeddings(mesh_name) + } + + def visualize( + self, + image_bgr: Image, + outputs_boxes_xywh_classes: Tuple[ + 
def extract_and_check_outputs_and_boxes(self, outputs_boxes_xywh_classes):
    """
    Unpack predictor outputs, boxes and classes and check that their sizes agree.

    Args:
        outputs_boxes_xywh_classes: triple (embedding predictor output,
            boxes in XYWH format, predicted classes or None); when the
            classes are None, `self.default_class` is used for every box
    Returns:
        Tuple (S, E, N, bboxes_xywh, pred_classes): coarse segmentation,
        embeddings, number of instances, the boxes and the classes
    Raises:
        AssertionError: if the output has an unexpected type or the sizes
            of outputs, boxes and classes do not match
    """
    densepose_output, bboxes_xywh, pred_classes = outputs_boxes_xywh_classes

    if pred_classes is None:
        pred_classes = [self.default_class] * len(bboxes_xywh)

    assert isinstance(
        densepose_output, DensePoseEmbeddingPredictorOutput
    ), "DensePoseEmbeddingPredictorOutput expected, {} encountered".format(
        type(densepose_output)
    )

    S = densepose_output.coarse_segm
    E = densepose_output.embedding
    N = S.size(0)
    assert N == E.size(
        0
    ), "CSE coarse_segm {} and embeddings {}" " should have equal first dim size".format(
        S.size(), E.size()
    )
    assert N == len(
        bboxes_xywh
    ), "number of bounding boxes {}" " should be equal to first dim size of outputs {}".format(
        len(bboxes_xywh), N
    )
    # report the number of classes here (the message used to print the
    # number of boxes, hiding the actual mismatch)
    assert N == len(pred_classes), (
        "number of predicted classes {}"
        " should be equal to first dim size of outputs {}".format(len(pred_classes), N)
    )

    return S, E, N, bboxes_xywh, pred_classes
representing a mesh_name -> texture_atlas_path dictionary + """ + if json_str is None: + return None + + paths = json.loads(json_str) + return {mesh_name: get_texture_atlas(path) for mesh_name, path in paths.items()} + + +class DensePoseOutputsTextureVisualizer(DensePoseOutputsVertexVisualizer): + def __init__( + self, + cfg, + texture_atlases_dict, + device="cuda", + default_class=0, + **kwargs, + ): + self.embedder = build_densepose_embedder(cfg) + + self.texture_image_dict = {} + self.alpha_dict = {} + + for mesh_name in texture_atlases_dict.keys(): + if texture_atlases_dict[mesh_name].shape[-1] == 4: # Image with alpha channel + self.alpha_dict[mesh_name] = texture_atlases_dict[mesh_name][:, :, -1] / 255.0 + self.texture_image_dict[mesh_name] = texture_atlases_dict[mesh_name][:, :, :3] + else: + self.alpha_dict[mesh_name] = texture_atlases_dict[mesh_name].sum(axis=-1) > 0 + self.texture_image_dict[mesh_name] = texture_atlases_dict[mesh_name] + + self.device = torch.device(device) + self.class_to_mesh_name = get_class_to_mesh_name_mapping(cfg) + self.default_class = default_class + + self.mesh_vertex_embeddings = { + mesh_name: self.embedder(mesh_name).to(self.device) + for mesh_name in self.class_to_mesh_name.values() + } + + def visualize( + self, + image_bgr: Image, + outputs_boxes_xywh_classes: Tuple[ + Optional[DensePoseEmbeddingPredictorOutput], Optional[Boxes], Optional[List[int]] + ], + ) -> Image: + image_target_bgr = image_bgr.copy() + if outputs_boxes_xywh_classes[0] is None: + return image_target_bgr + + S, E, N, bboxes_xywh, pred_classes = self.extract_and_check_outputs_and_boxes( + outputs_boxes_xywh_classes + ) + + meshes = { + p: create_mesh(self.class_to_mesh_name[p], self.device) for p in np.unique(pred_classes) + } + + for n in range(N): + x, y, w, h = bboxes_xywh[n].int().cpu().numpy() + mesh_name = self.class_to_mesh_name[pred_classes[n]] + closest_vertices, mask = get_closest_vertices_mask_from_ES( + E[[n]], + S[[n]], + h, + w, + 
def generate_image_with_texture(self, bbox_image_bgr, uv_array, mask, mesh_name):
    """
    Alpha-blend the texture of the given mesh onto the masked pixels of a crop.

    Args:
        bbox_image_bgr: image crop to texture; it is not modified
        uv_array: pair (U, V) of arrays with texture coordinates, one value
            per crop pixel, scaled by the texture width / height to obtain
            texel indices
        mask: boolean array selecting the crop pixels to texture
        mesh_name: key into `self.alpha_dict` and `self.texture_image_dict`
    Returns:
        A uint8 copy of the crop with the texture blended in, or None if no
        texture or alpha is registered for the mesh
    """
    alpha = self.alpha_dict.get(mesh_name)
    texture_image = self.texture_image_dict.get(mesh_name)
    if alpha is None or texture_image is None:
        return None
    U, V = uv_array
    tex_h, tex_w = texture_image.shape[:2]
    # A coordinate of exactly 1.0 scales to `size`, one past the last valid
    # index; clamp so that it addresses the last texel instead of raising.
    x_index = np.clip((U * tex_w).astype(int), 0, tex_w - 1)
    y_index = np.clip((V * tex_h).astype(int), 0, tex_h - 1)
    local_texture = texture_image[y_index, x_index][mask]
    # trailing axis so that alpha broadcasts over the color channels
    local_alpha = np.expand_dims(alpha[y_index, x_index][mask], -1)
    output_image = bbox_image_bgr.copy()
    output_image[mask] = output_image[mask] * (1 - local_alpha) + local_texture * local_alpha
    return output_image.astype(np.uint8)
+import logging +import numpy as np +from typing import List, Optional, Tuple +import cv2 +import torch + +from densepose.structures import DensePoseDataRelative + +from ..structures import DensePoseChartResult +from .base import Boxes, Image, MatrixVisualizer + + +class DensePoseResultsVisualizer: + def visualize( + self, + image_bgr: Image, + results_and_boxes_xywh: Tuple[Optional[List[DensePoseChartResult]], Optional[Boxes]], + ) -> Image: + densepose_result, boxes_xywh = results_and_boxes_xywh + if densepose_result is None or boxes_xywh is None: + return image_bgr + + boxes_xywh = boxes_xywh.cpu().numpy() + context = self.create_visualization_context(image_bgr) + for i, result in enumerate(densepose_result): + iuv_array = torch.cat( + (result.labels[None].type(torch.float32), result.uv * 255.0) + ).type(torch.uint8) + self.visualize_iuv_arr(context, iuv_array.cpu().numpy(), boxes_xywh[i]) + image_bgr = self.context_to_image_bgr(context) + return image_bgr + + def create_visualization_context(self, image_bgr: Image): + return image_bgr + + def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh) -> None: + pass + + def context_to_image_bgr(self, context): + return context + + def get_image_bgr_from_context(self, context): + return context + + +class DensePoseMaskedColormapResultsVisualizer(DensePoseResultsVisualizer): + def __init__( + self, + data_extractor, + segm_extractor, + inplace=True, + cmap=cv2.COLORMAP_PARULA, + alpha=0.7, + val_scale=1.0, + **kwargs, + ): + self.mask_visualizer = MatrixVisualizer( + inplace=inplace, cmap=cmap, val_scale=val_scale, alpha=alpha + ) + self.data_extractor = data_extractor + self.segm_extractor = segm_extractor + + def context_to_image_bgr(self, context): + return context + + def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh) -> None: + image_bgr = self.get_image_bgr_from_context(context) + matrix = self.data_extractor(iuv_arr) + segm = self.segm_extractor(iuv_arr) + mask = 
class DensePoseResultsMplContourVisualizer(DensePoseResultsVisualizer):
    """
    Contour visualization of U and V values drawn with matplotlib
    """

    def __init__(self, levels=10, **kwargs):
        """
        Args:
            levels: passed to `plt.contour` as the contour levels argument
            kwargs: extra keyword arguments forwarded to `plt.contour`
        """
        self.levels = levels
        self.plot_args = kwargs

    def create_visualization_context(self, image_bgr: Image):
        """
        Create a figure of the image size and draw the image onto it.

        Returns:
            dict with keys "image_bgr", "fig" and "canvas"
        """
        import matplotlib.pyplot as plt
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas

        context = {}
        context["image_bgr"] = image_bgr
        dpi = 100
        # figure size in inches such that one image pixel is one figure pixel
        height_inches = float(image_bgr.shape[0]) / dpi
        width_inches = float(image_bgr.shape[1]) / dpi
        fig = plt.figure(figsize=(width_inches, height_inches), dpi=dpi)
        # axes fill the whole figure, no ticks or frame
        plt.axes([0, 0, 1, 1])
        plt.axis("off")
        context["fig"] = fig
        canvas = FigureCanvas(fig)
        context["canvas"] = canvas
        extent = (0, image_bgr.shape[1], image_bgr.shape[0], 0)
        # matplotlib expects RGB, hence the channel reversal
        plt.imshow(image_bgr[:, :, ::-1], extent=extent)
        return context

    def context_to_image_bgr(self, context):
        """
        Render the figure stored in the context and return it as a BGR image.

        The figure is closed afterwards, so the context must not be rendered
        a second time.
        """
        import matplotlib.pyplot as plt

        fig = context["fig"]
        w, h = map(int, fig.get_size_inches() * fig.get_dpi())
        canvas = context["canvas"]
        canvas.draw()
        if hasattr(canvas, "tostring_rgb"):
            # np.frombuffer replaces the deprecated binary mode of np.fromstring
            image_1d = np.frombuffer(canvas.tostring_rgb(), dtype="uint8")
            image_rgb = image_1d.reshape(h, w, 3)
        else:
            # NOTE(review): fallback for Matplotlib releases without
            # `tostring_rgb`; assumes `buffer_rgba` yields an (h, w, 4)
            # buffer - confirm against the Matplotlib version in use
            image_rgb = np.asarray(canvas.buffer_rgba())[:, :, :3]
        image_bgr = image_rgb[:, :, ::-1].copy()
        # release the figure; pyplot keeps every figure alive until closed
        plt.close(fig)
        return image_bgr

    def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh: Boxes) -> None:
        """
        Draw contours of the U and V channels within the bounding box.
        """
        import matplotlib.pyplot as plt

        u = _extract_u_from_iuvarr(iuv_arr).astype(float) / 255.0
        v = _extract_v_from_iuvarr(iuv_arr).astype(float) / 255.0
        # (left, right, first row, last row) of the box in image coordinates
        extent = (
            bbox_xywh[0],
            bbox_xywh[0] + bbox_xywh[2],
            bbox_xywh[1],
            bbox_xywh[1] + bbox_xywh[3],
        )
        plt.contour(u, self.levels, extent=extent, **self.plot_args)
        plt.contour(v, self.levels, extent=extent, **self.plot_args)
DensePoseResultsCustomContourVisualizer(DensePoseResultsVisualizer): + """ + Contour visualization using marching squares + """ + + def __init__(self, levels=10, **kwargs): + # TODO: colormap is hardcoded + cmap = cv2.COLORMAP_PARULA + if isinstance(levels, int): + self.levels = np.linspace(0, 1, levels) + else: + self.levels = levels + if "linewidths" in kwargs: + self.linewidths = kwargs["linewidths"] + else: + self.linewidths = [1] * len(self.levels) + self.plot_args = kwargs + img_colors_bgr = cv2.applyColorMap((self.levels * 255).astype(np.uint8), cmap) + self.level_colors_bgr = [ + [int(v) for v in img_color_bgr.ravel()] for img_color_bgr in img_colors_bgr + ] + + def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh: Boxes) -> None: + image_bgr = self.get_image_bgr_from_context(context) + segm = _extract_i_from_iuvarr(iuv_arr) + u = _extract_u_from_iuvarr(iuv_arr).astype(float) / 255.0 + v = _extract_v_from_iuvarr(iuv_arr).astype(float) / 255.0 + self._contours(image_bgr, u, segm, bbox_xywh) + self._contours(image_bgr, v, segm, bbox_xywh) + + def _contours(self, image_bgr, arr, segm, bbox_xywh): + for part_idx in range(1, DensePoseDataRelative.N_PART_LABELS + 1): + mask = segm == part_idx + if not np.any(mask): + continue + arr_min = np.amin(arr[mask]) + arr_max = np.amax(arr[mask]) + I, J = np.nonzero(mask) + i0 = np.amin(I) + i1 = np.amax(I) + 1 + j0 = np.amin(J) + j1 = np.amax(J) + 1 + if (j1 == j0 + 1) or (i1 == i0 + 1): + continue + Nw = arr.shape[1] - 1 + Nh = arr.shape[0] - 1 + for level_idx, level in enumerate(self.levels): + if (level < arr_min) or (level > arr_max): + continue + vp = arr[i0:i1, j0:j1] >= level + bin_codes = vp[:-1, :-1] + vp[1:, :-1] * 2 + vp[1:, 1:] * 4 + vp[:-1, 1:] * 8 + mp = mask[i0:i1, j0:j1] + bin_mask_codes = mp[:-1, :-1] + mp[1:, :-1] * 2 + mp[1:, 1:] * 4 + mp[:-1, 1:] * 8 + it = np.nditer(bin_codes, flags=["multi_index"]) + color_bgr = self.level_colors_bgr[level_idx] + linewidth = self.linewidths[level_idx] 
def _bin_code_2_lines(self, arr, v, bin_code, multi_idx, Nw, Nh, offset):
    """
    Turn the marching-squares code of one grid cell into line segments.

    The cell's corners are arr[i, j], arr[i + 1, j], arr[i + 1, j + 1] and
    arr[i, j + 1], where (i, j) is `multi_idx` shifted by `offset`. Segment
    end points are obtained by linearly interpolating the level `v` along
    the cell edges and are expressed in coordinates where column index is
    divided by `Nw` and row index by `Nh`.

    Args:
        arr: 2D array of values the contour is computed for
        v: contour level
        bin_code: 4-bit cell code (0..15)
        multi_idx: (row, column) of the cell relative to `offset`
        Nw: divisor applied to column coordinates
        Nh: divisor applied to row coordinates
        offset: (row, column) added to `multi_idx`
    Returns:
        list of ((x, y), (x, y)) segments: empty for codes 0 and 15, two
        segments for the ambiguous codes 5 and 10, one segment otherwise
    """
    (row, col), (row_shift, col_shift) = multi_idx, offset
    row = row + row_shift
    col = col + col_shift
    top_left = arr[row, col]
    bottom_left = arr[row + 1, col]
    bottom_right = arr[row + 1, col + 1]
    top_right = arr[row, col + 1]
    x_left = float(col) / Nw
    y_top = float(row) / Nh
    cell_h = 1.0 / Nh
    cell_w = 1.0 / Nw

    # Edge crossings are evaluated lazily: only the edges that a given code
    # actually crosses are interpolated (the others may have equal corners).
    def left():
        return (x_left, y_top + (v - top_left) / (bottom_left - top_left) * cell_h)

    def top():
        return (x_left + (v - top_left) / (top_right - top_left) * cell_w, y_top)

    def bottom():
        return (
            x_left + (v - bottom_left) / (bottom_right - bottom_left) * cell_w,
            y_top + cell_h,
        )

    def right():
        return (
            x_left + cell_w,
            y_top + (v - top_right) / (bottom_right - top_right) * cell_h,
        )

    # `bin_code` may be an unhashable 0-d array, hence membership tests on
    # tuples rather than a dict lookup.
    crossings = (
        ((1, 14), ((left, top),)),
        ((2, 13), ((left, bottom),)),
        ((3, 12), ((top, bottom),)),
        ((4, 11), ((bottom, right),)),
        ((6, 9), ((left, right),)),
        ((7, 8), ((top, right),)),
        ((5,), ((left, bottom), (top, right))),
        ((10,), ((top, left), (bottom, right))),
    )
    for codes, segments in crossings:
        if bin_code in codes:
            return [(start(), end()) for start, end in segments]
    return []
def get_texture_atlas(path: Optional[str]) -> Optional[np.ndarray]:
    """
    Load a texture atlas image and reverse the order of its first 3 channels.

    Args:
        path: location of the atlas image; None means "no atlas"
    Returns:
        None if `path` is None, otherwise a copy of the image returned by
        `read_image` whose channels 0..2 are swapped to 2..0; any further
        channel (e.g. alpha) is kept in place
    """
    if path is None:
        return None

    # `read_image` downsamples 16-bit images to 8-bit. Should 16-bit atlases
    # be required, cv2.imread with cv2.IMREAD_UNCHANGED could be used instead
    # (with cv2 the flag is also needed to keep alpha channels), and the rest
    # of the pipeline would have to be adapted to 16-bit images as well.
    atlas = np.copy(read_image(path))
    # Reverse only the first three channels, so an alpha channel survives
    atlas[:, :, :3] = atlas[:, :, 2::-1]
    return atlas
def get_texture(self):
    """
    Cut the texture atlas into 24 per-part square tiles and derive alpha.

    Tile `6 * c + r` is taken from tile row `r` (0..5) and tile column `c`
    (0..3) of the atlas, each tile being `body_part_size` pixels wide and
    high.

    Returns:
        (texture_image, alpha): for a 4-channel atlas, `texture_image` holds
        the first 3 channels and `alpha` is the last channel divided by 255;
        otherwise `texture_image` holds all channels and `alpha` is a boolean
        array that is True where the channel sum is positive
    """
    side = self.body_part_size
    atlas = self.texture_atlas
    n_channels = atlas.shape[-1]
    tiles = np.zeros([24, side, side, n_channels])
    for part_idx in range(24):
        tile_col, tile_row = divmod(part_idx, 6)
        tiles[part_idx, :, :, :] = atlas[
            side * tile_row : side * (tile_row + 1),
            side * tile_col : side * (tile_col + 1),
            :,
        ]

    if n_channels != 4:
        # No alpha channel: a texel is opaque wherever it is not pure black
        return tiles, tiles.sum(axis=-1) > 0

    # Image with alpha channel
    opacity = tiles[:, :, :, -1] / 255.0
    return tiles[:, :, :, :3], opacity
def extract_boxes_xywh_from_instances(instances: Instances, select=None):
    """
    Return the predicted boxes of `instances` converted to XYWH.

    Columns 2 and 3 of a copy of `instances.pred_boxes.tensor` are reduced by
    columns 0 and 1 respectively; the tensor stored in `instances` is left
    untouched.

    Args:
        instances: object queried through `has("pred_boxes")` and
            `pred_boxes.tensor`
        select: optional index applied to the rows of the converted tensor
    Returns:
        the converted (and optionally indexed) tensor, or None if
        `instances` has no "pred_boxes" field
    """
    if not instances.has("pred_boxes"):
        return None
    xywh = instances.pred_boxes.tensor.clone()
    # x1, y1 -> width, height
    xywh[:, 2:4] -= xywh[:, 0:2]
    if select is None:
        return xywh
    return xywh[select]
class BoundingBoxExtractor:
    """
    Extracts bounding boxes from instances
    """

    def __call__(self, instances: Instances, select=None):
        """
        Args:
            instances: detection results to read "pred_boxes" from
            select: optional index / mask restricting the returned boxes;
                accepted so that this extractor can be invoked the same way
                as the other extractors in this module, which are called
                with `(instances, select)`
        Returns:
            boxes in XYWH format (restricted by `select` if given), or None
            if `instances` has no "pred_boxes" field
        """
        return extract_boxes_xywh_from_instances(instances, select)
class DensePoseOutputsExtractor:
    """
    Extracts DensePose result from instances
    """

    def __call__(
        self,
        instances: Instances,
        select=None,
    ) -> Tuple[
        Optional[DensePoseEmbeddingPredictorOutput], Optional[torch.Tensor], Optional[List[int]]
    ]:
        """
        Args:
            instances: detection results; must have both "pred_densepose"
                and "pred_boxes" for anything to be extracted
            select: optional index / mask; when given, it is applied
                consistently to the DensePose outputs, the boxes and the
                classes, so that the three returned values stay aligned
        Returns:
            (DensePose outputs, XYWH boxes, classes as a list), where
            classes is None if `instances` has no "pred_classes" field;
            (None, None, None) if "pred_densepose" or "pred_boxes" is missing
        """
        if not (instances.has("pred_densepose") and instances.has("pred_boxes")):
            return None, None, None

        dpout = instances.pred_densepose
        # Let the helper apply `select`, so boxes match the selected outputs
        boxes_xywh = extract_boxes_xywh_from_instances(instances, select)
        classes = instances.pred_classes if instances.has("pred_classes") else None

        if select is not None:
            dpout = dpout[select]
            if classes is not None:
                # Index while still a tensor: a Python list cannot be
                # indexed with a mask or an index tensor
                classes = classes[select]

        if classes is not None:
            classes = classes.tolist()

        return dpout, boxes_xywh, classes
class ScoreThresholdedExtractor:
    """
    Extracts data in the format accepted by ScoreThresholdedVisualizer
    """

    def __init__(self, extractor, min_score):
        """
        Args:
            extractor: callable invoked as `extractor(instances, select=...)`
            min_score: only entries with a score strictly greater than this
                value are selected
        """
        self.min_score = min_score
        self.extractor = extractor

    def __call__(self, instances: Instances, select=None):
        """
        Run the wrapped extractor on the entries whose score exceeds
        `min_score`, additionally restricted by `select` if it is given.

        Returns:
            the wrapped extractor's output, or None if `instances` carries
            no scores
        """
        scores = extract_scores_from_instances(instances)
        if scores is None:
            return None
        passing = scores > self.min_score
        if select is not None:
            # Keep only entries allowed by both the caller and the threshold
            passing = select & passing
        return self.extractor(instances, select=passing)