IDM-VTON

Running on Zero

IDM-VTON / detectron2 /data /detection_utils.py

IDM-VTON

update IDM-VTON Demo

938e515 8 months ago

23.2 kB

	# -*- coding: utf-8 -*-
	# Copyright (c) Facebook, Inc. and its affiliates.

	"""
	Common data processing utilities that are used in a
	typical object detection data pipeline.
	"""
	import logging
	import numpy as np
	from typing import List, Union
	import pycocotools.mask as mask_util
	import torch
	from PIL import Image

	from detectron2.structures import (
	BitMasks,
	Boxes,
	BoxMode,
	Instances,
	Keypoints,
	PolygonMasks,
	RotatedBoxes,
	polygons_to_bitmask,
	)
	from detectron2.utils.file_io import PathManager

	from . import transforms as T
	from .catalog import MetadataCatalog

	# Names exported by a star-import of this module. Functions defined in this
	# file but not listed here are still reachable by explicit import.
	__all__ = [
	"SizeMismatchError",
	"convert_image_to_rgb",
	"check_image_size",
	"transform_proposals",
	"transform_instance_annotations",
	"annotations_to_instances",
	"annotations_to_instances_rotated",
	"build_augmentation",
	"build_transform_gen",
	"create_keypoint_hflip_indices",
	"filter_empty_instances",
	"read_image",
	]


	class SizeMismatchError(ValueError):
	    """
	    Raised when a loaded image has a different width/height than the size
	    recorded in its annotation.
	    """


	# 3x3 colour-space matrices for the "YUV-BT.601" format, taken from
	# https://en.wikipedia.org/wiki/YUV#SDTV_with_BT.601
	# `convert_PIL_to_numpy` multiplies RGB pixels scaled to 0-1 by the transpose of
	# _M_RGB2YUV; `convert_image_to_rgb` applies the transpose of _M_YUV2RGB and
	# rescales the result by 255.
	_M_RGB2YUV = [[0.299, 0.587, 0.114], [-0.14713, -0.28886, 0.436], [0.615, -0.51499, -0.10001]]
	_M_YUV2RGB = [[1.0, 0.0, 1.13983], [1.0, -0.39465, -0.58060], [1.0, 2.03211, 0.0]]

	# Numeric id of the exif tag looked up by `_apply_exif_orientation`; see
	# https://www.exiv2.org/tags.html
	_EXIF_ORIENT = 274 # exif 'Orientation' tag


	def convert_PIL_to_numpy(image, format):
	    """
	    Turn a PIL image into a numpy array laid out in the requested format.

	    Args:
	        image (PIL.Image): a PIL image
	        format (str or None): target format. ``None`` keeps the image's own
	            mode; "BGR" and "YUV-BT.601" are handled here on top of the
	            modes that PIL itself can convert to.

	    Returns:
	        (np.ndarray): also see `read_image`
	    """
	    if format is not None:
	        # "BGR" and "YUV-BT.601" are not PIL modes: decode as RGB first and
	        # derive the final layout from the RGB array below.
	        pil_mode = "RGB" if format in ["BGR", "YUV-BT.601"] else format
	        image = image.convert(pil_mode)
	    array = np.asarray(image)

	    if format == "L":
	        # PIL drops the channel axis for single-channel images; restore HWC.
	        return np.expand_dims(array, -1)
	    if format == "BGR":
	        # Reverse the channel axis (RGB -> BGR).
	        return array[:, :, ::-1]
	    if format == "YUV-BT.601":
	        return np.dot(array / 255.0, np.array(_M_RGB2YUV).T)
	    return array


	def convert_image_to_rgb(image, format):
	    """
	    Bring an image stored in `format` back to RGB.

	    Args:
	        image (np.ndarray or Tensor): an HWC image
	        format (str): the format of input image, also see `read_image`

	    Returns:
	        (np.ndarray): (H,W,3) RGB image in 0-255 range, can be either float or uint8
	    """
	    if isinstance(image, torch.Tensor):
	        image = image.cpu().numpy()

	    if format == "BGR":
	        # Pick the channels in reversed order.
	        return image[:, :, [2, 1, 0]]
	    if format == "YUV-BT.601":
	        rgb_unit_range = np.dot(image, np.array(_M_YUV2RGB).T)
	        return rgb_unit_range * 255.0

	    # Anything else is handed to PIL as a mode name.
	    if format == "L":
	        # PIL expects a 2D array for single-channel images.
	        image = image[:, :, 0]
	    pil_image = Image.fromarray(image.astype(np.uint8), mode=format)
	    return np.asarray(pil_image.convert("RGB"))


	def _apply_exif_orientation(image):
	"""
	Applies the exif orientation correctly.

	This code exists per the bug:
	https://github.com/python-pillow/Pillow/issues/3973
	with the function `ImageOps.exif_transpose`. The Pillow source raises errors with
	various methods, especially `tobytes`

	Function based on:
	https://github.com/wkentaro/labelme/blob/v4.5.4/labelme/utils/image.py#L59
	https://github.com/python-pillow/Pillow/blob/7.1.2/src/PIL/ImageOps.py#L527

	Args:
	image (PIL.Image): a PIL image

	Returns:
	(PIL.Image): the PIL image with exif orientation applied, if applicable
	"""
	if not hasattr(image, "getexif"):
	return image

	try:
	exif = image.getexif()
	except Exception: # https://github.com/facebookresearch/detectron2/issues/1885
	exif = None

	if exif is None:
	return image

	orientation = exif.get(_EXIF_ORIENT)

	method = {
	2: Image.FLIP_LEFT_RIGHT,
	3: Image.ROTATE_180,
	4: Image.FLIP_TOP_BOTTOM,
	5: Image.TRANSPOSE,
	6: Image.ROTATE_270,
	7: Image.TRANSVERSE,
	8: Image.ROTATE_90,
	}.get(orientation)

	if method is not None:
	return image.transpose(method)
	return image


	def read_image(file_name, format=None):
	    """
	    Load an image file and return it as an array in the given format.
	    Rotation and flipping are applied if the image carries such exif information.

	    Args:
	        file_name (str): image file path
	        format (str): one of the supported image modes in PIL, or "BGR" or "YUV-BT.601".

	    Returns:
	        image (np.ndarray):
	            an HWC image in the given format, which is 0-255, uint8 for
	            supported image modes in PIL or "BGR"; float (0-1 for Y) for YUV-BT.601.
	    """
	    with PathManager.open(file_name, "rb") as stream:
	        # The conversion happens while the stream is still open.
	        # Orientation is applied by hand to work around
	        # https://github.com/python-pillow/Pillow/issues/3973
	        pil_image = _apply_exif_orientation(Image.open(stream))
	        return convert_PIL_to_numpy(pil_image, format)


	def check_image_size(dataset_dict, image):
	    """
	    Raise an error if the image does not match the size specified in the dict.

	    Only the keys that are present in `dataset_dict` are validated, so a dict
	    that carries just "width" or just "height" is checked on that dimension
	    alone. Missing keys are then filled in from the image.

	    Args:
	        dataset_dict (dict): may contain "width", "height" and "file_name".
	            Modified in-place: "width"/"height" are added when absent.
	        image: an object whose `shape` starts with (height, width)

	    Raises:
	        SizeMismatchError: if a recorded width/height differs from the image.
	    """
	    actual_w, actual_h = image.shape[1], image.shape[0]
	    image_wh = (actual_w, actual_h)
	    # A missing key falls back to the actual value, so it can never cause a
	    # mismatch (and never a KeyError when only one of the two keys exists).
	    expected_wh = (
	        dataset_dict.get("width", actual_w),
	        dataset_dict.get("height", actual_h),
	    )
	    if image_wh != expected_wh:
	        raise SizeMismatchError(
	            "Mismatched image shape{}, got {}, expect {}.".format(
	                " for image " + dataset_dict["file_name"]
	                if "file_name" in dataset_dict
	                else "",
	                image_wh,
	                expected_wh,
	            )
	            + " Please check the width/height in your annotation."
	        )

	    # To ensure bbox always remap to original image size
	    dataset_dict.setdefault("width", actual_w)
	    dataset_dict.setdefault("height", actual_h)


	def transform_proposals(dataset_dict, image_shape, transforms, *, proposal_topk, min_box_size=0):
	    """
	    Apply transformations to the precomputed proposals in dataset_dict, if any.

	    Args:
	        dataset_dict (dict): a dict read from the dataset, possibly
	            contains fields "proposal_boxes", "proposal_objectness_logits", "proposal_bbox_mode"
	        image_shape (tuple): height, width
	        transforms (TransformList):
	        proposal_topk (int): only keep top-K scoring proposals
	        min_box_size (int): proposals with either side smaller than this
	            threshold are removed

	    The input dict is modified in-place, with abovementioned keys removed. A new
	    key "proposals" will be added. Its value is an `Instances`
	    object which contains the transformed proposals in its field
	    "proposal_boxes" and "objectness_logits".
	    If the dict has no "proposal_boxes" key, nothing is changed.
	    """
	    if "proposal_boxes" not in dataset_dict:
	        return

	    # Pull the raw proposal fields out of the dict and move the boxes into
	    # XYXY_ABS before applying the transforms.
	    raw_boxes = dataset_dict.pop("proposal_boxes")
	    raw_mode = dataset_dict.pop("proposal_bbox_mode")
	    xyxy_boxes = BoxMode.convert(raw_boxes, raw_mode, BoxMode.XYXY_ABS)
	    boxes = Boxes(transforms.apply_box(xyxy_boxes))
	    raw_logits = dataset_dict.pop("proposal_objectness_logits")
	    objectness_logits = torch.as_tensor(raw_logits.astype("float32"))

	    # Clip to the image, then drop boxes that are too small.
	    boxes.clip(image_shape)
	    keep = boxes.nonempty(threshold=min_box_size)
	    boxes, objectness_logits = boxes[keep], objectness_logits[keep]

	    proposals = Instances(image_shape)
	    proposals.proposal_boxes = boxes[:proposal_topk]
	    proposals.objectness_logits = objectness_logits[:proposal_topk]
	    dataset_dict["proposals"] = proposals


	def get_bbox(annotation):
	    """
	    Return the bounding box of one instance annotation in XYXY_ABS mode.

	    Args:
	        annotation (dict): dict of instance annotations for a single instance.
	            Its "bbox" is interpreted according to its "bbox_mode".
	    Returns:
	        bbox (ndarray): x1, y1, x2, y2 coordinates
	    """
	    # bbox is 1d (per-instance bounding box)
	    return BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)


	def transform_instance_annotations(
	    annotation, transforms, image_size, *, keypoint_hflip_indices=None
	):
	    """
	    Apply transforms to box, segmentation and keypoints annotations of a single instance.

	    The box goes through `transforms.apply_box`, polygons through
	    `transforms.apply_polygons`, RLE masks through
	    `transforms.apply_segmentation` (after decoding), and keypoints through
	    `transform_keypoint_annotations`.
	    If you need anything more specially designed for each data structure,
	    you'll need to implement your own version of this function or the transforms.

	    Args:
	        annotation (dict): dict of instance annotations for a single instance.
	            It will be modified in-place.
	        transforms (TransformList or list[Transform]):
	        image_size (tuple): the height, width of the transformed image
	        keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.

	    Returns:
	        dict:
	            the same input dict with fields "bbox", "segmentation", "keypoints"
	            transformed according to `transforms`.
	            The "bbox_mode" field will be set to XYXY_ABS.

	    Raises:
	        ValueError: if "segmentation" is neither a list nor a dict.
	    """
	    if isinstance(transforms, (tuple, list)):
	        transforms = T.TransformList(transforms)

	    # bbox is 1d (per-instance bounding box)
	    xyxy = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)
	    # Transform, then clip to the image: first at 0, then at (w, h, w, h).
	    xyxy = transforms.apply_box(np.array([xyxy]))[0].clip(min=0)
	    upper_bound = list(image_size + image_size)[::-1]
	    annotation["bbox"] = np.minimum(xyxy, upper_bound)
	    annotation["bbox_mode"] = BoxMode.XYXY_ABS

	    if "segmentation" in annotation:
	        # each instance contains 1 or more polygons
	        segm = annotation["segmentation"]
	        if isinstance(segm, list):
	            # polygons: flat coordinate lists -> (N, 2) arrays -> flat again
	            point_arrays = [np.asarray(poly).reshape(-1, 2) for poly in segm]
	            moved = transforms.apply_polygons(point_arrays)
	            annotation["segmentation"] = [poly.reshape(-1) for poly in moved]
	        elif isinstance(segm, dict):
	            # RLE: decode to a mask and transform that
	            mask = transforms.apply_segmentation(mask_util.decode(segm))
	            assert tuple(mask.shape[:2]) == image_size
	            annotation["segmentation"] = mask
	        else:
	            raise ValueError(
	                "Cannot transform segmentation of type '{}'!"
	                "Supported types are: polygons as list[list[float] or ndarray],"
	                " COCO-style RLE as a dict.".format(type(segm))
	            )

	    if "keypoints" in annotation:
	        annotation["keypoints"] = transform_keypoint_annotations(
	            annotation["keypoints"], transforms, image_size, keypoint_hflip_indices
	        )

	    return annotation


	def transform_keypoint_annotations(keypoints, transforms, image_size, keypoint_hflip_indices=None):
	    """
	    Transform keypoint annotations of an image.
	    If a keypoint is transformed out of image boundary, it will be marked "unlabeled" (visibility=0)

	    Args:
	        keypoints (list[float]): Nx3 float in Detectron2's Dataset format.
	            Each point is represented by (x, y, visibility).
	        transforms (TransformList):
	        image_size (tuple): the height, width of the transformed image
	        keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.
	            When `transforms` includes horizontal flip, will use the index
	            mapping to flip keypoints.

	    Returns:
	        ndarray: (N, 3) float64 array of transformed (x, y, visibility) rows.

	    Raises:
	        ValueError: if a flip is needed but `keypoint_hflip_indices` is missing
	            or has a different length than the keypoints.
	    """
	    # (N*3,) -> (N, 3)
	    kpts = np.asarray(keypoints, dtype="float64").reshape(-1, 3)
	    moved_xy = transforms.apply_coords(kpts[:, :2])

	    # Points outside [0, w] x [0, h] become "unlabeled".
	    lower_ok = moved_xy >= np.array([0, 0])
	    upper_ok = moved_xy <= np.array(image_size[::-1])
	    in_bounds = (lower_ok & upper_ok).all(axis=1)
	    kpts[:, :2] = moved_xy
	    kpts[~in_bounds, 2] = 0

	    # This assumes that HorizFlipTransform is the only one that does flip.
	    # An odd number of them means the image ends up mirrored.
	    hflip_count = sum(1 for t in transforms.transforms if isinstance(t, T.HFlipTransform))
	    if hflip_count % 2 == 1:
	        # Swap each keypoint with its opposite-handed equivalent.
	        if keypoint_hflip_indices is None:
	            raise ValueError("Cannot flip keypoints without providing flip indices!")
	        if len(kpts) != len(keypoint_hflip_indices):
	            raise ValueError(
	                "Keypoint data has {} points, but metadata "
	                "contains {} points!".format(len(kpts), len(keypoint_hflip_indices))
	            )
	        kpts = kpts[np.asarray(keypoint_hflip_indices, dtype=np.int32), :]

	    # Maintain COCO convention that if visibility == 0 (unlabeled), then x, y = 0
	    unlabeled = kpts[:, 2] == 0
	    kpts[unlabeled] = 0
	    return kpts


	def annotations_to_instances(annos, image_size, mask_format="polygon"):
	    """
	    Build the :class:`Instances` object consumed by the models
	    from the instance annotations of one image.

	    Args:
	        annos (list[dict]): a list of instance annotations in one image, each
	            element for one instance.
	        image_size (tuple): height, width
	        mask_format (str): "polygon" stores masks as `PolygonMasks`;
	            "bitmask" rasterizes/decodes every mask into `BitMasks`.

	    Returns:
	        Instances:
	            It will contain fields "gt_boxes", "gt_classes",
	            "gt_masks", "gt_keypoints", if they can be obtained from `annos`.
	            This is the format that builtin models expect.
	    """
	    if len(annos):
	        boxes = np.stack(
	            [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]
	        )
	    else:
	        boxes = np.zeros((0, 4))
	    target = Instances(image_size)
	    target.gt_boxes = Boxes(boxes)

	    category_ids = [int(obj["category_id"]) for obj in annos]
	    target.gt_classes = torch.tensor(category_ids, dtype=torch.int64)

	    # Masks/keypoints are only looked for when the first annotation has them.
	    if len(annos) and "segmentation" in annos[0]:
	        segms = [obj["segmentation"] for obj in annos]
	        if mask_format == "polygon":
	            try:
	                masks = PolygonMasks(segms)
	            except ValueError as e:
	                raise ValueError(
	                    "Failed to use mask_format=='polygon' from the given annotations!"
	                ) from e
	        else:
	            assert mask_format == "bitmask", mask_format
	            bitmaps = []
	            for segm in segms:
	                if isinstance(segm, list):
	                    # polygon
	                    bitmap = polygons_to_bitmask(segm, *image_size)
	                elif isinstance(segm, dict):
	                    # COCO RLE
	                    bitmap = mask_util.decode(segm)
	                elif isinstance(segm, np.ndarray):
	                    assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
	                        segm.ndim
	                    )
	                    # mask array
	                    bitmap = segm
	                else:
	                    raise ValueError(
	                        "Cannot convert segmentation of type '{}' to BitMasks!"
	                        "Supported types are: polygons as list[list[float] or ndarray],"
	                        " COCO-style RLE as a dict, or a binary segmentation mask "
	                        " in a 2D numpy array of shape HxW.".format(type(segm))
	                    )
	                bitmaps.append(bitmap)
	            # torch.from_numpy does not support array with negative stride.
	            as_tensors = [torch.from_numpy(np.ascontiguousarray(x)) for x in bitmaps]
	            masks = BitMasks(torch.stack(as_tensors))
	        target.gt_masks = masks

	    if len(annos) and "keypoints" in annos[0]:
	        target.gt_keypoints = Keypoints([obj.get("keypoints", []) for obj in annos])

	    return target


	def annotations_to_instances_rotated(annos, image_size):
	    """
	    Build the :class:`Instances` object consumed by the models
	    from the instance annotations of one image.
	    Compared to `annotations_to_instances`, this function is for rotated boxes only

	    Args:
	        annos (list[dict]): a list of instance annotations in one image, each
	            element for one instance.
	        image_size (tuple): height, width

	    Returns:
	        Instances:
	            Containing fields "gt_boxes", "gt_classes",
	            if they can be obtained from `annos`.
	            This is the format that builtin models expect.
	    """
	    raw_boxes = [obj["bbox"] for obj in annos]
	    target = Instances(image_size)
	    rotated = RotatedBoxes(raw_boxes)
	    target.gt_boxes = rotated
	    # Clips the same box object that was just stored on the target.
	    rotated.clip(image_size)

	    category_ids = [obj["category_id"] for obj in annos]
	    target.gt_classes = torch.tensor(category_ids, dtype=torch.int64)

	    return target


	def filter_empty_instances(
	    instances, by_box=True, by_mask=True, box_threshold=1e-5, return_mask=False
	):
	    """
	    Filter out empty instances in an `Instances` object.

	    Args:
	        instances (Instances):
	        by_box (bool): whether to filter out instances with empty boxes
	        by_mask (bool): whether to filter out instances with empty masks
	        box_threshold (float): minimum width and height to be considered non-empty
	        return_mask (bool): whether to return boolean mask of filtered instances

	    Returns:
	        Instances: the filtered instances.
	        tensor[bool], optional: boolean mask of filtered instances. Returned
	            as the second element of a tuple whenever `return_mask` is True,
	            including when no filter could be applied (then it is all True).
	    """
	    assert by_box or by_mask
	    criteria = []
	    if by_box:
	        criteria.append(instances.gt_boxes.nonempty(threshold=box_threshold))
	    if instances.has("gt_masks") and by_mask:
	        criteria.append(instances.gt_masks.nonempty())

	    # TODO: can also filter visible keypoints

	    if not criteria:
	        # Nothing to filter on (mask filtering requested, but no "gt_masks").
	        # Still honour `return_mask` so callers can always unpack two values.
	        # NOTE(review): the mask is created on the default (CPU) device.
	        if return_mask:
	            return instances, torch.ones(len(instances), dtype=torch.bool)
	        return instances

	    keep = criteria[0]
	    for extra in criteria[1:]:
	        keep = keep & extra
	    if return_mask:
	        return instances[keep], keep
	    return instances[keep]


	def create_keypoint_hflip_indices(dataset_names: Union[str, List[str]]) -> List[int]:
	    """
	    Compute, for every keypoint, the index of its horizontally-flipped partner.

	    Args:
	        dataset_names: list of dataset names (a single name is also accepted).
	            Their "keypoint_names" and "keypoint_flip_map" metadata must agree;
	            the values of the first dataset are used.

	    Returns:
	        list[int]: a list of size=#keypoints, storing the
	        horizontally-flipped keypoint indices.
	    """
	    if isinstance(dataset_names, str):
	        dataset_names = [dataset_names]

	    for required_key in ("keypoint_names", "keypoint_flip_map"):
	        check_metadata_consistency(required_key, dataset_names)

	    meta = MetadataCatalog.get(dataset_names[0])
	    names = meta.keypoint_names
	    # TODO flip -> hflip
	    forward = dict(meta.keypoint_flip_map)
	    # Make the mapping symmetric: every (a -> b) also yields (b -> a).
	    partner_of = dict(forward)
	    for left, right in forward.items():
	        partner_of[right] = left
	    # Keypoints without a partner map to themselves.
	    return [names.index(partner_of.get(name, name)) for name in names]


	def get_fed_loss_cls_weights(dataset_names: Union[str, List[str]], freq_weight_power=1.0):
	    """
	    Get frequency weight for each class sorted by class id.
	    The weight is each class's image_count raised to the power freq_weight_power.

	    Args:
	        dataset_names: list of dataset names (a single name is also accepted).
	            Their "class_image_count" metadata must agree; the value of the
	            first dataset is used.
	        freq_weight_power: power value

	    Returns:
	        a float tensor with one weight per class, ordered by class "id".
	    """
	    if isinstance(dataset_names, str):
	        dataset_names = [dataset_names]

	    check_metadata_consistency("class_image_count", dataset_names)

	    meta = MetadataCatalog.get(dataset_names[0])
	    entries_by_id = sorted(meta.class_image_count, key=lambda entry: entry["id"])
	    image_counts = torch.tensor([entry["image_count"] for entry in entries_by_id])
	    return image_counts.float() ** freq_weight_power


	def gen_crop_transform_with_instance(crop_size, image_size, instance):
	    """
	    Generate a randomly placed CropTransform whose cropping region contains
	    the center of the given instance.

	    Args:
	        crop_size (tuple): h, w in pixels
	        image_size (tuple): h, w
	        instance (dict): an annotation dict of one instance, in Detectron2's
	            dataset format.

	    Returns:
	        the `T.CropTransform` for the sampled crop window.
	    """
	    crop_size = np.asarray(crop_size, dtype=np.int32)
	    bbox = BoxMode.convert(instance["bbox"], instance["bbox_mode"], BoxMode.XYXY_ABS)
	    center_y = (bbox[1] + bbox[3]) * 0.5
	    center_x = (bbox[0] + bbox[2]) * 0.5
	    center_yx = (center_y, center_x)
	    assert (
	        image_size[0] >= center_y and image_size[1] >= center_x
	    ), "The annotation bounding box is outside of the image!"
	    assert (
	        image_size[0] >= crop_size[0] and image_size[1] >= crop_size[1]
	    ), "Crop size is larger than image size!"

	    # The top-left corner may be at most one crop size before the center ...
	    lowest_yx = np.maximum(np.floor(center_yx).astype(np.int32) - crop_size, 0)
	    # ... must keep the crop inside the image, and must not pass the center.
	    highest_yx = np.maximum(np.asarray(image_size, dtype=np.int32) - crop_size, 0)
	    highest_yx = np.minimum(highest_yx, np.ceil(center_yx).astype(np.int32))

	    # y is drawn before x, each from an inclusive range.
	    y0 = np.random.randint(lowest_yx[0], highest_yx[0] + 1)
	    x0 = np.random.randint(lowest_yx[1], highest_yx[1] + 1)
	    return T.CropTransform(x0, y0, crop_size[1], crop_size[0])


	def check_metadata_consistency(key, dataset_names):
	    """
	    Verify that all given datasets share the same value for one metadata key.

	    Args:
	        key (str): a metadata key
	        dataset_names (list[str]): a list of dataset names; an empty list
	            passes trivially.

	    Raises:
	        AttributeError: if the key does not exist in the metadata
	        ValueError: if the given datasets do not have the same metadata values defined by key
	    """
	    if len(dataset_names) == 0:
	        return

	    logger = logging.getLogger(__name__)
	    entries = [getattr(MetadataCatalog.get(name), key) for name in dataset_names]
	    reference = entries[0]
	    for name, entry in zip(dataset_names, entries):
	        if entry != reference:
	            # Log the offending value and the reference value before failing.
	            logger.error("Metadata '{}' for dataset '{}' is '{}'".format(key, name, str(entry)))
	            logger.error(
	                "Metadata '{}' for dataset '{}' is '{}'".format(
	                    key, dataset_names[0], str(reference)
	                )
	            )
	            raise ValueError("Datasets have different metadata '{}'!".format(key))


	def build_augmentation(cfg, is_train):
	    """
	    Create a list of default :class:`Augmentation` from config.
	    Now it includes resizing and, for training only, random flipping.

	    Args:
	        cfg: config object; the `cfg.INPUT.*` size and flip settings are read.
	        is_train (bool): selects the train or test size settings.

	    Returns:
	        list[Augmentation]
	    """
	    if is_train:
	        min_size, max_size, sample_style = (
	            cfg.INPUT.MIN_SIZE_TRAIN,
	            cfg.INPUT.MAX_SIZE_TRAIN,
	            cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
	        )
	    else:
	        min_size, max_size, sample_style = (
	            cfg.INPUT.MIN_SIZE_TEST,
	            cfg.INPUT.MAX_SIZE_TEST,
	            "choice",
	        )
	    augmentation = [T.ResizeShortestEdge(min_size, max_size, sample_style)]

	    if is_train:
	        flip_mode = cfg.INPUT.RANDOM_FLIP
	        if flip_mode != "none":
	            random_flip = T.RandomFlip(
	                horizontal=flip_mode == "horizontal",
	                vertical=flip_mode == "vertical",
	            )
	            augmentation.append(random_flip)
	    return augmentation


	# `build_transform_gen` is bound to the very same function object as
	# `build_augmentation`; both names are listed in `__all__`.
	build_transform_gen = build_augmentation
	"""
	Alias for backward-compatibility.
	"""