doubutsu-2b-pt-756 / utils.py
import math
from typing import List, Tuple

from PIL import Image
def generate_grid_configurations(size: int) -> List[Tuple[int, int]]:
    """
    Generates the candidate grid resolutions for a base patch size.

    Args:
        size (int): The side length of a single square patch.

    Returns:
        List[Tuple[int, int]]: Candidate (width, height) resolutions, each a
        multiple of `size`, covering 2x2, 1xN, and Nx1 layouts.
    """
    grid_configs = [
        (2 * size, 2 * size),
        (1 * size, 2 * size),
        (1 * size, 3 * size),
        (1 * size, 4 * size),
        (4 * size, 1 * size),
        (3 * size, 1 * size),
        (2 * size, 1 * size),
    ]
    return grid_configs
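# Illustrative values: generate_grid_configurations(378) (the default patch size
# used by slice_anyres_image below) yields
# [(756, 756), (378, 756), (378, 1134), (378, 1512),
#  (1512, 378), (1134, 378), (756, 378)].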
def select_best_resolution(original_size, possible_resolutions):
    """
    Selects the best resolution from a list of possible resolutions based on the original size.

    Args:
        original_size (tuple): The original size of the image in the format (width, height).
        possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].

    Returns:
        tuple: The best fit resolution in the format (width, height).
    """
    original_width, original_height = original_size
    best_fit = None
    max_effective_resolution = 0
    min_wasted_resolution = float("inf")

    for width, height in possible_resolutions:
        scale = min(width / original_width, height / original_height)
        downscaled_width, downscaled_height = (
            int(original_width * scale),
            int(original_height * scale),
        )
        effective_resolution = min(
            downscaled_width * downscaled_height, original_width * original_height
        )
        wasted_resolution = (width * height) - effective_resolution

        if effective_resolution > max_effective_resolution or (
            effective_resolution == max_effective_resolution
            and wasted_resolution < min_wasted_resolution
        ):
            max_effective_resolution = effective_resolution
            min_wasted_resolution = wasted_resolution
            best_fit = (width, height)

    return best_fit
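# Worked example (using the size=378 grid above): for a 1000x800 image, the
# candidate (756, 756) gives the largest downscaled area (756x604), so
# select_best_resolution((1000, 800), generate_grid_configurations(378))
# returns (756, 756).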
def resize_and_pad_image(image, target_resolution):
    """
    Resize and pad an image to a target resolution while maintaining aspect ratio.

    Args:
        image (PIL.Image.Image): The input image.
        target_resolution (tuple): The target resolution (width, height) of the image.

    Returns:
        PIL.Image.Image: The resized and padded image.
    """
    original_width, original_height = image.size
    target_width, target_height = target_resolution

    scale_w = target_width / original_width
    scale_h = target_height / original_height

    if scale_w < scale_h:
        new_width = target_width
        new_height = min(math.ceil(original_height * scale_w), target_height)
    else:
        new_height = target_height
        new_width = min(math.ceil(original_width * scale_h), target_width)

    # Resize the image
    resized_image = image.resize((new_width, new_height))

    new_image = Image.new("RGB", (target_width, target_height), (0, 0, 0))
    paste_x = (target_width - new_width) // 2
    paste_y = (target_height - new_height) // 2
    new_image.paste(resized_image, (paste_x, paste_y))

    return new_image
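# Continuing the example: resize_and_pad_image resizes the 1000x800 image to
# 756x605 (aspect ratio preserved, height rounded up by math.ceil) and centers
# it on a black 756x756 canvas, padding the remaining rows top and bottom.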
def divide_to_patches(image, patch_size):
    """
    Divides an image into patches of a specified size.

    Args:
        image (PIL.Image.Image): The input image.
        patch_size (int): The size of each patch.

    Returns:
        list: A list of PIL.Image.Image objects representing the patches.
    """
    patches = []
    width, height = image.size
    for i in range(0, height, patch_size):
        for j in range(0, width, patch_size):
            box = (j, i, j + patch_size, i + patch_size)
            patch = image.crop(box)
            patches.append(patch)

    return patches
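# Example: a 756x756 padded image with patch_size=378 is split into four 378x378
# patches, in row-major order (top-left, top-right, bottom-left, bottom-right).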
def slice_anyres_image(image, patch_size=378):
    """
    Slices an image using the "anyres" strategy: resize and pad the image to the
    best-fitting grid resolution, divide it into square patches, and prepend a
    (patch_size x patch_size) resized copy of the full image.

    Returns:
        list: PIL.Image.Image objects; the resized full image first, then the patches.
    """
    grid_pinpoints = generate_grid_configurations(patch_size)
    best_resolution = select_best_resolution(image.size, grid_pinpoints)
    image_padded = resize_and_pad_image(image, best_resolution)
    patches = divide_to_patches(image_padded, patch_size)
    size = {"shortest_edge": patch_size}
    image_original_resize = image.resize((size["shortest_edge"], size["shortest_edge"]))
    image_patches = [image_original_resize] + patches
    return image_patches
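# Minimal usage sketch (illustrative only; the image path is a placeholder, and
# how the returned patches are encoded downstream depends on the model's image
# processor):
if __name__ == "__main__":
    img = Image.open("example.jpg").convert("RGB")
    patches = slice_anyres_image(img, patch_size=378)
    # patches[0] is the full image resized to 378x378; the remaining entries are
    # the 378x378 tiles of the padded best-fit resolution.
    print(len(patches), [p.size for p in patches])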