wanghaofan's picture
Upload 14 files
897c6c0 verified
raw
history blame
9.9 kB
import gradio as gr
import spaces
import os
import sys
import subprocess
import numpy as np
from PIL import Image
import cv2
import torch
import random
from controlnet_aux import OpenposeDetector, CannyDetector
from depth_anything_v2.dpt import DepthAnythingV2
MAX_SEED = np.iinfo(np.int32).max
def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
if randomize_seed:
seed = random.randint(0, MAX_SEED)
return seed
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
model_configs = {
'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
}
encoder = 'vitl'
model = DepthAnythingV2(**model_configs[encoder])
filepath = hf_hub_download(repo_id=f"depth-anything/Depth-Anything-V2-Large", filename=f"depth_anything_v2_vitl.pth", repo_type="model")
state_dict = torch.load(filepath, map_location="cpu")
model.load_state_dict(state_dict)
model = model.to(DEVICE).eval()
import torch
from diffusers.utils import load_image
from diffusers import FluxControlNetPipeline, FluxControlNetModel, FluxMultiControlNetModel
base_model = 'black-forest-labs/FLUX.1-dev'
controlnet_model = 'Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro'
controlnet = FluxControlNetModel.from_pretrained(controlnet_model, torch_dtype=torch.bfloat16)
controlnet = FluxMultiControlNetModel([controlnet])
pipe = FluxControlNetPipeline.from_pretrained(base_model, controlnet=controlnet, torch_dtype=torch.bfloat16)
pipe.to("cuda")
mode_mapping = {"canny":0, "tile":1, "depth":2, "blur":3, "openpose":4, "gray":5, "low quality": 6}
strength_mapping = {"canny":0.65, "tile":0.45, "depth":0.55, "blur":0.45, "openpose":0.55, "gray":0.45, "low quality": 0.4}
canny = CannyDetector()
open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators")
def convert_from_image_to_cv2(img: Image) -> np.ndarray:
return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
def convert_from_cv2_to_image(img: np.ndarray) -> Image:
return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
def extract_depth(image):
image = np.asarray(image)
depth = model.infer_image(image[:, :, ::-1])
depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
depth = depth.astype(np.uint8)
gray_depth = Image.fromarray(depth).convert('RGB')
return gray_depth
def extract_openpose(img):
processed_image_open_pose = open_pose(img, hand_and_face=True)
return processed_image_open_pose
def extract_canny(image):
processed_image_canny = canny(image)
return processed_image_canny
def apply_gaussian_blur(image, kernel_size=(21, 21)):
image = convert_from_image_to_cv2(image)
blurred_image = convert_from_cv2_to_image(cv2.GaussianBlur(image, kernel_size, 0))
return blurred_image
def convert_to_grayscale(image):
image = convert_from_image_to_cv2(image)
gray_image = convert_from_cv2_to_image(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))
return gray_image
def add_gaussian_noise(image, mean=0, sigma=10):
image = convert_from_image_to_cv2(image)
noise = np.random.normal(mean, sigma, image.shape)
noisy_image = convert_from_cv2_to_image(np.clip(image.astype(np.float32) + noise, 0, 255).astype(np.uint8))
return noisy_image
def tile(input_image, resolution=1024):
input_image = convert_from_image_to_cv2(input_image)
H, W, C = input_image.shape
H = float(H)
W = float(W)
k = float(resolution) / min(H, W)
H *= k
W *= k
H = int(np.round(H / 64.0)) * 64
W = int(np.round(W / 64.0)) * 64
img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
img = convert_from_cv2_to_image(img)
return img
def resize_img(input_image, max_side=1024, min_side=768, size=None,
pad_to_max_side=False, mode=Image.BILINEAR, base_pixel_number=64):
w, h = input_image.size
if size is not None:
w_resize_new, h_resize_new = size
else:
ratio = min_side / min(h, w)
w, h = round(ratio*w), round(ratio*h)
ratio = max_side / max(h, w)
input_image = input_image.resize([round(ratio*w), round(ratio*h)], mode)
w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
input_image = input_image.resize([w_resize_new, h_resize_new], mode)
if pad_to_max_side:
res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
offset_x = (max_side - w_resize_new) // 2
offset_y = (max_side - h_resize_new) // 2
res[offset_y:offset_y+h_resize_new, offset_x:offset_x+w_resize_new] = np.array(input_image)
input_image = Image.fromarray(res)
return input_image
@spaces.GPU()
def infer(cond_in, image_in, prompt, inference_steps, guidance_scale, control_mode, control_strength, seed, progress=gr.Progress(track_tqdm=True)):
control_mode_num = mode_mapping[control_mode]
if cond_in is None:
if image_in is not None:
image_in = resize_img(load_image(image_in))
if control_mode == "canny":
control_image = extract_canny(image_in)
elif control_mode == "depth":
control_image = extract_depth(image_in)
elif control_mode == "openpose":
control_image = extract_openpose(image_in)
elif control_mode == "blur":
control_image = apply_gaussian_blur(image_in)
elif control_mode == "low quality":
control_image = add_gaussian_noise(image_in)
elif control_mode == "gray":
control_image = convert_to_grayscale(image_in)
elif control_mode == "tile":
control_image = tile(image_in)
else:
control_image = resize_img(load_image(cond_in))
width, height = control_image.size
image = pipe(
prompt,
control_image=[control_image],
control_mode=[control_mode_num],
width=width,
height=height,
controlnet_conditioning_scale=[control_strength],
num_inference_steps=inference_steps,
guidance_scale=guidance_scale,
generator=torch.manual_seed(seed),
).images[0]
return image, control_image, gr.update(visible=True)
css="""
#col-container{
margin: 0 auto;
max-width: 1080px;
}
"""
with gr.Blocks(css=css) as demo:
with gr.Column(elem_id="col-container"):
gr.Markdown("""
# FLUX.1-dev-ControlNet-Union-Pro
A unified ControlNet for FLUX.1-dev model from the InstantX team and Shakker Labs. Model card: [Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro](https://huggingface.co/Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro). <br />
The recommended strength: {"canny":0.65, "tile":0.45, "depth":0.55, "blur":0.45, "openpose":0.55, "gray":0.45, "low quality": 0.4}. Long prompt is preferred by FLUX.1.
""")
with gr.Column():
with gr.Row():
with gr.Column():
with gr.Row(equal_height=True):
cond_in = gr.Image(label="Upload a processed control image", sources=["upload"], type="filepath")
image_in = gr.Image(label="Extract condition from a reference image (Optional)", sources=["upload"], type="filepath")
prompt = gr.Textbox(label="Prompt", value="best quality")
with gr.Accordion("Controlnet"):
control_mode = gr.Radio(
["canny", "depth", "openpose", "gray", "blur", "tile", "low quality"], label="Mode", value="gray",
info="select the control mode, one for all"
)
control_strength = gr.Slider(
label="control strength",
minimum=0,
maximum=1.0,
step=0.05,
value=0.50,
)
seed = gr.Slider(
label="Seed",
minimum=0,
maximum=MAX_SEED,
step=1,
value=42,
)
randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
with gr.Accordion("Advanced settings", open=False):
with gr.Column():
with gr.Row():
inference_steps = gr.Slider(label="Inference steps", minimum=1, maximum=50, step=1, value=24)
guidance_scale = gr.Slider(label="Guidance scale", minimum=1.0, maximum=10.0, step=0.1, value=3.5)
submit_btn = gr.Button("Submit")
with gr.Column():
result = gr.Image(label="Result")
processed_cond = gr.Image(label="Preprocessed Cond")
submit_btn.click(
fn=randomize_seed_fn,
inputs=[seed, randomize_seed],
outputs=seed,
queue=False,
api_name=False
).then(
fn = infer,
inputs = [cond_in, image_in, prompt, inference_steps, guidance_scale, control_mode, control_strength, seed],
outputs = [result, processed_cond],
show_api=False
)
demo.queue(api_open=False)
demo.launch()