# ------------------------------------------
# TextDiffuser: Diffusion Models as Text Painters
# Paper Link: https://arxiv.org/abs/2305.10855
# Code Link: https://github.com/microsoft/unilm/tree/master/textdiffuser
# Copyright (c) Microsoft Corporation.
# This file defines a set of commonly used utility functions.
# ------------------------------------------

import os
import re
import cv2
import math
import shutil
import string
import textwrap
import numpy as np
from PIL import Image, ImageFont, ImageDraw, ImageOps
from typing import *

# define alphabet and alphabet_dic
# digits + lowercase + uppercase + punctuation + space -> 95 characters
alphabet = string.digits + string.ascii_lowercase + string.ascii_uppercase + string.punctuation + ' '
# map each character to a 1-based index; index 0 is reserved for "non-character"
alphabet_dic = {c: index + 1 for index, c in enumerate(alphabet)}


def transform_mask_pil(mask_root, size):
    """
    Extract the masked area from a mask image supplied as image data.

    Args:
        mask_root (PIL.Image / np.ndarray): The mask image itself (despite the
            name this variant takes image data, not a file path — compare
            transform_mask below, which reads from disk).
            * The (near-)white area is the unmasked area
            * The gray area is the masked area

    Returns:
        np.ndarray: float32 array of shape (size, size) with 1.0 on masked
        pixels and 0.0 on unmasked pixels.
    """
    img = np.array(mask_root)
    img = cv2.resize(img, (size, size), interpolation=cv2.INTER_NEAREST)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # pixels brighter than 250 (white / unmasked) -> 255, everything else -> 0
    ret, binary = cv2.threshold(gray, 250, 255, cv2.THRESH_BINARY)
    # invert: masked (non-white) pixels become 1.0
    return 1 - (binary.astype(np.float32) / 255)


def transform_mask(mask_root, size):
    """
    Extract the masked area from a mask image stored on disk.

    Args:
        mask_root (str): The path of the mask image.
            * The (near-)white area is the unmasked area
            * The gray area is the masked area

    Returns:
        np.ndarray: float32 array of shape (size, size) with 1.0 on masked
        pixels and 0.0 on unmasked pixels.
    """
    img = cv2.imread(mask_root)
    img = cv2.resize(img, (size, size), interpolation=cv2.INTER_NEAREST)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # pixels brighter than 250 (white / unmasked) -> 255, everything else -> 0
    ret, binary = cv2.threshold(gray, 250, 255, cv2.THRESH_BINARY)
    return 1 - (binary.astype(np.float32) / 255)


def segmentation_mask_visualization(font_path: str, segmentation_mask: np.array):
    """
    Visualize a character-level segmentation mask by drawing the decoded
    character of each 8x8 cell onto a 512x512 canvas.

    Args:
        font_path (str): The path of the font. We recommend Arial.ttf.
        segmentation_mask (np.array): The character-level segmentation mask;
            each entry is an index into `alphabet` (offset by 1, 0 = none).

    Returns:
        PIL.Image: 512x512 RGB image with the characters rendered in green.
    """
    segmentation_mask = cv2.resize(segmentation_mask, (64, 64), interpolation=cv2.INTER_NEAREST)
    font = ImageFont.truetype(font_path, 8)
    blank = Image.new('RGB', (512, 512), (0, 0, 0))
    d = ImageDraw.Draw(blank)
    for i in range(64):
        for j in range(64):
            label = int(segmentation_mask[i][j])
            # skip the "non-character" label (0) and out-of-range predictions
            if label == 0 or label - 1 >= len(alphabet):
                continue
            d.text((j * 8, i * 8), alphabet[label - 1], font=font, fill=(0, 255, 0))
    return blank


def make_caption_pil(font_path: str, captions: List[str]):
    """
    Convert text captions into small PIL images (one 512x48 strip per caption).

    Args:
        font_path (str): The path of the font. We recommend Arial.ttf.
        captions (List[str]): List of captions.

    Returns:
        List[PIL.Image]: One rendered caption strip per input caption.
    """
    caption_pil_list = []
    font = ImageFont.truetype(font_path, 18)
    for caption in captions:
        border_size = 2
        # inner canvas is shrunk so the bordered result is exactly 512x48
        img = Image.new('RGB', (512 - 4, 48 - 4), (255, 255, 255))
        img = ImageOps.expand(img, border=(border_size, border_size, border_size, border_size), fill=(127, 127, 127))
        draw = ImageDraw.Draw(img)

        lines = textwrap.wrap(caption, width=40)
        x, y = 4, 4
        # NOTE(review): font.getsize was removed in Pillow >= 10; this file
        # targets an older Pillow — use font.getbbox when upgrading.
        line_height = font.getsize('A')[1] + 4
        for line in lines:
            draw.text((x, y), line, font=font, fill=(200, 127, 0))
            y += line_height

        caption_pil_list.append(img)
    return caption_pil_list


def filter_segmentation_mask(segmentation_mask: np.array):
    """
    Remove noisy predictions from a segmentation mask by zeroing out the
    labels for '-' and ' ' (both are treated as non-characters).

    Args:
        segmentation_mask (np.array): The character-level segmentation mask.

    Returns:
        np.array: The same array, modified in place.
    """
    segmentation_mask[segmentation_mask == alphabet_dic['-']] = 0
    segmentation_mask[segmentation_mask == alphabet_dic[' ']] = 0
    return segmentation_mask


def combine_image(args, resolution, sub_output_dir: str, pred_image_list: List, image_pil: Image, character_mask_pil: Image, character_mask_highlight_pil: Image, caption_pil_list: List):
    """
    Tile 1-4 predicted images into a single grid image.

    Args:
        args (argparse.Namespace): The arguments (unused here, kept for API
            compatibility).
        resolution (int): Side length of each individual image, in pixels.
        sub_output_dir (str): Output sub-directory (unused here).
        pred_image_list (List): List of 1-4 predicted images.
        image_pil (Image): The original image (unused here).
        character_mask_pil (Image): The character-level segmentation mask
            (unused here).
        character_mask_highlight_pil (Image): The mask highlighting character
            regions in green (unused here).
        caption_pil_list (List): List of caption images (unused here).

    Returns:
        PIL.Image: The combined image (1x1, 2x1, 3x1 or 2x2 layout).

    Raises:
        ValueError: If pred_image_list does not contain 1-4 images.
    """
    size = len(pred_image_list)
    if size == 1:
        return pred_image_list[0]
    elif size == 2:
        blank = Image.new('RGB', (resolution * 2, resolution), (0, 0, 0))
        blank.paste(pred_image_list[0], (0, 0))
        blank.paste(pred_image_list[1], (resolution, 0))
    elif size == 3:
        blank = Image.new('RGB', (resolution * 3, resolution), (0, 0, 0))
        blank.paste(pred_image_list[0], (0, 0))
        blank.paste(pred_image_list[1], (resolution, 0))
        blank.paste(pred_image_list[2], (resolution * 2, 0))
    elif size == 4:
        blank = Image.new('RGB', (resolution * 2, resolution * 2), (0, 0, 0))
        blank.paste(pred_image_list[0], (0, 0))
        blank.paste(pred_image_list[1], (resolution, 0))
        blank.paste(pred_image_list[2], (0, resolution))
        blank.paste(pred_image_list[3], (resolution, resolution))
    else:
        # the original code crashed with UnboundLocalError here; fail clearly
        raise ValueError(f'combine_image supports 1-4 images, got {size}')
    return blank


def combine_image_gradio(args, size, sub_output_dir: str, pred_image_list: List, image_pil: Image, character_mask_pil: Image, character_mask_highlight_pil: Image, caption_pil_list: List):
    """
    Tile 1-4 predicted images into a single grid image (gradio demo variant).

    Args:
        args (argparse.Namespace): The arguments (unused here).
        size (int): Side length of each individual image, in pixels.
        sub_output_dir (str): Output sub-directory (unused here).
        pred_image_list (List): List of 1-4 predicted images.
        image_pil (Image): The original image (unused here).
        character_mask_pil (Image): The character-level segmentation mask
            (unused here).
        character_mask_highlight_pil (Image): The mask highlighting character
            regions in green (unused here).
        caption_pil_list (List): List of caption images (unused here).

    Returns:
        PIL.Image: The combined image (1x1, 2x1, 3x1 or 2x2 layout).

    Raises:
        ValueError: If pred_image_list does not contain 1-4 images.
    """
    # BUG FIX: the original shadowed the `size` (resolution) parameter with
    # len(pred_image_list), so canvases were built only 2-4 pixels wide and
    # paste offsets were 2-4 px. Use a separate name for the image count.
    num_images = len(pred_image_list)
    if num_images == 1:
        return pred_image_list[0]
    elif num_images == 2:
        blank = Image.new('RGB', (size * 2, size), (0, 0, 0))
        blank.paste(pred_image_list[0], (0, 0))
        blank.paste(pred_image_list[1], (size, 0))
    elif num_images == 3:
        blank = Image.new('RGB', (size * 3, size), (0, 0, 0))
        blank.paste(pred_image_list[0], (0, 0))
        blank.paste(pred_image_list[1], (size, 0))
        blank.paste(pred_image_list[2], (size * 2, 0))
    elif num_images == 4:
        blank = Image.new('RGB', (size * 2, size * 2), (0, 0, 0))
        blank.paste(pred_image_list[0], (0, 0))
        blank.paste(pred_image_list[1], (size, 0))
        blank.paste(pred_image_list[2], (0, size))
        blank.paste(pred_image_list[3], (size, size))
    else:
        raise ValueError(f'combine_image_gradio supports 1-4 images, got {num_images}')
    return blank


def get_width(font_path, text):
    """
    Calculate the pixel width of `text` rendered at font size 24.

    Args:
        font_path (str): The path of the font.
        text (str): The text to measure.

    Returns:
        int: The rendered width in pixels.
    """
    font = ImageFont.truetype(font_path, 24)
    # NOTE(review): font.getsize was removed in Pillow >= 10; this file
    # targets an older Pillow — use font.getbbox when upgrading.
    width, _ = font.getsize(text)
    return width


def get_key_words(text: str):
    """
    Detect keywords (enclosed by single quotes) in a user prompt. The
    keywords are used to guide the layout generation.

    Args:
        text (str): The user prompt.

    Returns:
        List[str]: The individual keyword tokens, or [] when 8 or more
        tokens are found (too many keywords to lay out).
    """
    words = []
    matches = re.findall(r"'(.*?)'", text)  # find the keywords enclosed by ''
    if matches:
        for match in matches:
            words.extend(match.split())

    # too many keywords — bail out and let the caller fall back
    if len(words) >= 8:
        return []

    return words


def adjust_overlap_box(box_output, current_index):
    """
    Nudge the current predicted box so it no longer overlaps the previous one.

    Args:
        box_output: Array-like of shape (1, N, 4) holding normalized
            (xmin, ymin, xmax, ymax) boxes.
        current_index (int): The index of the current box.

    Returns:
        The (possibly modified, in place) box_output.
    """
    if current_index == 0:
        return box_output
    else:
        # judge whether it contains overlap with the last output
        last_box = box_output[0, current_index - 1, :]
        xmin_last, ymin_last, xmax_last, ymax_last = last_box
        current_box = box_output[0, current_index, :]
        xmin, ymin, xmax, ymax = current_box

        if xmin_last <= xmin <= xmax_last and ymin_last <= ymin <= ymax_last:
            print('adjust overlapping')
            distance_x = xmax_last - xmin
            distance_y = ymax_last - ymin
            if distance_x <= distance_y:
                # push the box to the right of the last one (plus a margin)
                new_x_min = xmax_last + 0.025
                new_x_max = xmax - xmin + xmax_last + 0.025
                box_output[0, current_index, 0] = new_x_min
                box_output[0, current_index, 2] = new_x_max
            else:
                # push the box below the last one (plus a margin)
                new_y_min = ymax_last + 0.025
                new_y_max = ymax - ymin + ymax_last + 0.025
                box_output[0, current_index, 1] = new_y_min
                box_output[0, current_index, 3] = new_y_max

        elif xmin_last <= xmin <= xmax_last and ymin_last <= ymax <= ymax_last:
            # bottom edge of the current box falls inside the last box
            print('adjust overlapping')
            new_x_min = xmax_last + 0.05
            new_x_max = xmax - xmin + xmax_last + 0.05
            box_output[0, current_index, 0] = new_x_min
            box_output[0, current_index, 2] = new_x_max

        return box_output


def shrink_box(box, scale_factor=0.9):
    """
    Shrink a box towards its center by a scale factor.

    Args:
        box (tuple): The (x1, y1, x2, y2) box.
        scale_factor (float): The scale factor of shrinking (0.9 keeps 90%
            of each side length).

    Returns:
        tuple: The shrunk (x1, y1, x2, y2) box.
    """
    x1, y1, x2, y2 = box
    x1_new = x1 + (x2 - x1) * (1 - scale_factor) / 2
    y1_new = y1 + (y2 - y1) * (1 - scale_factor) / 2
    x2_new = x2 - (x2 - x1) * (1 - scale_factor) / 2
    y2_new = y2 - (y2 - y1) * (1 - scale_factor) / 2
    return (x1_new, y1_new, x2_new, y2_new)


def adjust_font_size(args, width, height, draw, text):
    """
    Find the largest font size (starting from `height`) at which `text`
    rendered with args.font_path fits within `width` pixels.

    Args:
        args (argparse.Namespace): The arguments; args.font_path is used.
        width (int): The maximum allowed text width.
        height (int): The initial (maximum) font size to try.
        draw (ImageDraw): The ImageDraw object used for measuring.
        text (str): The text.

    Returns:
        int: The chosen font size (at least 1).
    """
    size_start = height
    # the original looped forever when even size 1 was too wide; bound at 1
    while size_start > 1:
        font = ImageFont.truetype(args.font_path, size_start)
        # NOTE(review): draw.textsize was removed in Pillow >= 10; this file
        # targets an older Pillow — use draw.textlength when upgrading.
        text_width, _ = draw.textsize(text, font=font)
        if text_width >= width:
            size_start = size_start - 1
        else:
            return size_start
    return 1


def inpainting_merge_image(original_image, mask_image, inpainting_image):
    """
    Merge the original image and the inpainting result using the mask:
    masked (dark) regions are taken from the inpainting image, the rest
    from the original.

    Args:
        original_image (PIL.Image): The original image.
        mask_image (PIL.Image): The mask image (near-white = keep original).
        inpainting_image (PIL.Image): The inpainting result.

    Returns:
        PIL.Image: The merged 512x512 image.
    """
    original_image = original_image.resize((512, 512))
    mask_image = mask_image.resize((512, 512))
    inpainting_image = inpainting_image.resize((512, 512))

    # BUG FIX: PIL images are immutable — the original discarded the result
    # of convert('L'), so .point() below received an RGB image and failed
    # (a 256-entry table only matches a single-band image).
    mask_image = mask_image.convert('L')

    # pixels darker than the threshold (the masked area) map to 1 in the
    # binary mask, selecting the inpainting image in Image.composite
    threshold = 250
    table = [1 if i < threshold else 0 for i in range(256)]
    mask_image = mask_image.point(table, "1")

    merged_image = Image.composite(inpainting_image, original_image, mask_image)
    return merged_image