kolcontrl

Running on Zero

File size: 14,600 Bytes

d5f497d
 
 
6c91ee7
 
 
d5f497d
78d6af0
6c91ee7
 
d5f497d
 
6c91ee7
78d6af0
6c91ee7
 
 
 
3ad3d31
6c91ee7
78d6af0
d5f497d
 
 
6c91ee7
 
3ad3d31
d5f497d
 
 
 
 
6c91ee7
 
 
3ad3d31
d5f497d
6c91ee7
d5f497d
78d6af0
6c91ee7
 
 
 
d5f497d
6c91ee7
d5f497d
6c91ee7
d5f497d
78d6af0
d5f497d
 
6c91ee7
d5f497d
 
6c91ee7
d5f497d
3ad3d31
 
78d6af0
3ad3d31
 
 
 
 
 
 
78d6af0
 
 
 
 
 
 
 
 
 
6c91ee7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5f497d
3ad3d31
 
 
 
 
3936987
 
78d6af0
3ad3d31
 
d5f497d
8004741
d5f497d
 
e9f3ef9
6c91ee7
78d6af0
8f532a7
6c91ee7
 
 
 
 
 
 
78d6af0
d5f497d
 
 
78d6af0
e9f3ef9
78d6af0
6c91ee7
78d6af0
 
 
 
 
 
 
 
 
6c91ee7
 
 
78d6af0
cd4f227
e9f3ef9
 
 
78d6af0
6155537
e9f3ef9
 
 
 
 
 
 
78d6af0
e9f3ef9
 
 
78d6af0
e9f3ef9
 
 
78d6af0
 
 
 
 
 
 
 
 
e9f3ef9
 
 
78d6af0
fad18b4
3ad3d31
 
 
78d6af0
595a73a
3ad3d31
 
 
 
 
 
 
78d6af0
3ad3d31
 
 
78d6af0
595a73a
3ad3d31
 
78d6af0
 
 
 
 
 
 
 
 
3ad3d31
 
 
78d6af0
2602407
78ad020
2602407
fad18b4
2602407
fad18b4
78ad020
 
 
2602407
fad18b4
2602407
fad18b4
d5f497d
 
3ad3d31
2602407
0fb30ab
2602407
0fb30ab
3ad3d31
 
2602407
 
 
20c2217
d5f497d
 
f92dc60
 
 
 
 
2602407
d5f497d
 
 
 
2602407
78d6af0
d890da3
d5f497d
6c91ee7
2602407
 
d5f497d
2602407
 
d5f497d
2602407
d5f497d
 
2602407
d5f497d
 
 
 
 
2602407
d5f497d
 
2602407
d5f497d
 
 
6c91ee7
d5f497d
 
2602407
d5f497d
 
 
6c91ee7
d5f497d
 
6c91ee7
2602407
d5f497d
 
6c91ee7
 
 
 
2602407
6c91ee7
 
 
 
 
 
 
2602407
6c91ee7
 
 
 
d5f497d
78ad020
2602407
 
 
d5f497d
 
2602407
 
78d6af0
 
d5f497d
 
 
78d6af0
 
 
 
 
78ad020
 
 
78d6af0
 
 
 
 
d5f497d
3ad3d31
 
 
78d6af0
 
 
 
 
3ad3d31
d5f497d
78ad020
78d6af0
 
 
78ad020
 
 
78d6af0
 
 
78ad020
 
3ad3d31
78d6af0
 
 
3ad3d31
 
78d6af0

import spaces
import random
import torch
import cv2
import gradio as gr
import numpy as np
from huggingface_hub import snapshot_download
from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor
from diffusers.utils import load_image
from kolors.pipelines.pipeline_controlnet_xl_kolors_img2img import StableDiffusionXLControlNetImg2ImgPipeline
from kolors.models.modeling_chatglm import ChatGLMModel
from kolors.models.tokenization_chatglm import ChatGLMTokenizer
from kolors.models.controlnet import ControlNetModel
from diffusers import AutoencoderKL
from kolors.models.unet_2d_condition import UNet2DConditionModel
from diffusers import EulerDiscreteScheduler
from PIL import Image
from annotator.midas import MidasDetector
from annotator.dwpose import DWposeDetector
from annotator.util import resize_image, HWC3
from transformers import pipeline

# Target device for every model component loaded below.
device = "cuda"
# Fetch the base Kolors checkpoint and the three ControlNet checkpoints from
# the Hugging Face Hub. Each returned value is used below as a local
# directory path prefix (e.g. f"{ckpt_dir}/vae").
ckpt_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors")
ckpt_dir_depth = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Depth")
ckpt_dir_canny = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Canny")
ckpt_dir_pose = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Pose")

# Load the base-model components. Every torch module is converted to half
# precision with .half() and moved to `device`; the tokenizer and scheduler
# are loaded as-is. These objects are shared by all three pipelines below.
text_encoder = ChatGLMModel.from_pretrained(f'{ckpt_dir}/text_encoder', torch_dtype=torch.float16).half().to(device)
tokenizer = ChatGLMTokenizer.from_pretrained(f'{ckpt_dir}/text_encoder')
vae = AutoencoderKL.from_pretrained(f"{ckpt_dir}/vae", revision=None).half().to(device)
scheduler = EulerDiscreteScheduler.from_pretrained(f"{ckpt_dir}/scheduler")
unet = UNet2DConditionModel.from_pretrained(f"{ckpt_dir}/unet", revision=None).half().to(device)
# One ControlNet per conditioning type (depth map, Canny edges, pose).
controlnet_depth = ControlNetModel.from_pretrained(f"{ckpt_dir_depth}", revision=None).half().to(device)
controlnet_canny = ControlNetModel.from_pretrained(f"{ckpt_dir_canny}", revision=None).half().to(device)
controlnet_pose = ControlNetModel.from_pretrained(f"{ckpt_dir_pose}", revision=None).half().to(device)

# Keyword arguments common to every pipeline: the three pipelines share the
# same VAE, text encoder, tokenizer, UNet and scheduler objects and differ
# only in which ControlNet they are given.
_shared_pipeline_kwargs = dict(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=scheduler,
    force_zeros_for_empty_prompt=False,
)

# Depth-conditioned img2img pipeline.
pipe_depth = StableDiffusionXLControlNetImg2ImgPipeline(
    controlnet=controlnet_depth,
    **_shared_pipeline_kwargs,
)

# Canny-edge-conditioned img2img pipeline.
pipe_canny = StableDiffusionXLControlNetImg2ImgPipeline(
    controlnet=controlnet_canny,
    **_shared_pipeline_kwargs,
)

# Pose-conditioned img2img pipeline.
pipe_pose = StableDiffusionXLControlNetImg2ImgPipeline(
    controlnet=controlnet_pose,
    **_shared_pipeline_kwargs,
)

# Initialize the translation model (Korean -> English, per the model id).
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")

# Prompt pre-processing helper.
def process_prompt(prompt):
    """Return ``(original_prompt, english_prompt)`` for *prompt*.

    If *prompt* contains at least one character in U+3131..U+3163 or
    U+AC00..U+D7A3 (the Hangul ranges checked here), the whole prompt is
    sent through the module-level ``translator`` and its
    ``translation_text`` becomes the second element. Otherwise the prompt
    is returned unchanged in both positions.
    """
    def _is_hangul(ch):
        return '\u3131' <= ch <= '\u3163' or '\uac00' <= ch <= '\ud7a3'

    # Guard clause: nothing to translate when no Hangul character is present.
    if not any(map(_is_hangul, prompt)):
        return prompt, prompt

    english = translator(prompt)[0]['translation_text']
    return prompt, english

@spaces.GPU
def process_canny_condition(image, canny_threods=(100, 200)):
    """Build a Canny-edge control image from *image*.

    Args:
        image: Array passed to ``cv2.Canny`` (a copy is used, so the
            caller's array is not touched). Callers in this file pass
            ``np.array(init_image)``.
        canny_threods: Two-element sequence ``(threshold1, threshold2)``
            forwarded to ``cv2.Canny``. The default is an immutable tuple
            rather than a list so it cannot be shared-and-mutated across
            calls; lists are still accepted.

    Returns:
        A ``PIL.Image`` whose three channels each hold the same edge map.
    """
    low, high = canny_threods[0], canny_threods[1]
    edges = cv2.Canny(image.copy(), low, high)
    # Replicate the single-channel edge map into three identical channels.
    edges = edges[:, :, None]
    edges = np.concatenate([edges, edges, edges], axis=2)
    edges = HWC3(edges)
    return Image.fromarray(edges)

# Module-level MiDaS detector instance used by process_depth_condition_midas.
model_midas = MidasDetector()
@spaces.GPU
def process_depth_condition_midas(img, res = 1024):
    """Build a depth control image from *img* using the MiDaS detector.

    Args:
        img: Three-dimensional array (its shape is unpacked as
            ``height, width, channels``).
        res: Resolution argument forwarded to ``resize_image`` before the
            detector is run.

    Returns:
        A ``PIL.Image`` of the detector output, resized back to the
        input's original width and height.
    """
    orig_h, orig_w, _channels = img.shape
    detector_input = resize_image(HWC3(img), res)
    depth_map = HWC3(model_midas(detector_input))
    # cv2.resize takes (width, height), restoring the caller's dimensions.
    restored = cv2.resize(depth_map, (orig_w, orig_h))
    return Image.fromarray(restored)

# Module-level DWpose detector instance used by process_dwpose_condition.
model_dwpose = DWposeDetector()
@spaces.GPU
def process_dwpose_condition(image, res=1024):
    """Build a pose control image from *image* using the DWpose detector.

    Args:
        image: Three-dimensional array (its shape is unpacked as
            ``height, width, channels``).
        res: Resolution argument forwarded to ``resize_image`` before the
            detector is run.

    Returns:
        A ``PIL.Image`` of the rendered pose, resized back to the input's
        original width and height.
    """
    h, w, _ = image.shape
    img = resize_image(HWC3(image), res)
    # Run the detector on the normalized, resized image. Previously `img`
    # was computed and then ignored (the raw `image` was passed instead),
    # which made `res` a no-op and skipped the HWC3 channel normalization,
    # unlike process_depth_condition_midas.
    _, out_img = model_dwpose(img)
    result = HWC3(out_img)
    # cv2.resize takes (width, height), restoring the caller's dimensions.
    result = cv2.resize(result, (w, h))
    return Image.fromarray(result)

# Largest seed value: upper bound of the seed slider and of random seeds
# drawn in the infer_* functions (maximum of a signed 32-bit integer).
MAX_SEED = np.iinfo(np.int32).max
# Resolution passed to resize_image for input images and condition maps.
MAX_IMAGE_SIZE = 1024

@spaces.GPU
def infer_depth(prompt, 
          image = None, 
          negative_prompt = "NSFW, facial shadow, low resolution, JPEG artifacts, blurry, poor quality, blackface, neon lights.", 
          seed = 397886929, 
          randomize_seed = False,
          guidance_scale = 6.0, 
          num_inference_steps = 50,
          controlnet_conditioning_scale = 0.7,
          control_guidance_end = 0.9,
          strength = 1.0
        ):
    """Run depth-conditioned img2img generation.

    Args:
        prompt: Text prompt; translated to English by ``process_prompt``
            when it contains Hangul.
        image: Input image. Must not be ``None``.
        negative_prompt: Negative prompt; translated the same way as
            *prompt* before being passed to the pipeline.
        seed: Seed for the torch generator (ignored when
            *randomize_seed* is true).
        randomize_seed: When true, draw a fresh seed in ``[0, MAX_SEED]``.
        guidance_scale, num_inference_steps, controlnet_conditioning_scale,
        control_guidance_end, strength: Forwarded to the pipeline call.

    Returns:
        ``([condition_image, generated_image], seed, original_prompt,
        english_prompt)``.

    Raises:
        gr.Error: If no input image was supplied.
    """
    # Fail early with a user-visible message instead of crashing inside
    # resize_image when the image widget is empty.
    if image is None:
        raise gr.Error("이미지를 업로드해 주세요.")

    original_prompt, english_prompt = process_prompt(prompt or "")
    # The UI's default negative prompt is Korean, so it needs the same
    # translation step as the main prompt.
    _, english_negative_prompt = process_prompt(negative_prompt or "")

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    # Slider values may arrive as floats; manual_seed requires an int.
    seed = int(seed)
    generator = torch.Generator().manual_seed(seed)

    init_image = resize_image(image, MAX_IMAGE_SIZE)
    pipe = pipe_depth.to("cuda")
    condi_img = process_depth_condition_midas(np.array(init_image), MAX_IMAGE_SIZE)
    image = pipe(
        prompt=english_prompt,
        image=init_image,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        control_guidance_end=control_guidance_end, 
        strength=strength, 
        control_image=condi_img,
        negative_prompt=english_negative_prompt, 
        num_inference_steps=int(num_inference_steps), 
        guidance_scale=guidance_scale,
        num_images_per_prompt=1,
        generator=generator,
    ).images[0]
    return [condi_img, image], seed, original_prompt, english_prompt

@spaces.GPU
def infer_canny(prompt, 
          image = None, 
          negative_prompt = "NSFW, facial shadow, low resolution, JPEG artifacts, blurry, poor quality, blackface, neon lights.", 
          seed = 397886929, 
          randomize_seed = False,
          guidance_scale = 6.0, 
          num_inference_steps = 50,
          controlnet_conditioning_scale = 0.7,
          control_guidance_end = 0.9,
          strength = 1.0
        ):
    """Run Canny-edge-conditioned img2img generation.

    Args:
        prompt: Text prompt; translated to English by ``process_prompt``
            when it contains Hangul.
        image: Input image. Must not be ``None``.
        negative_prompt: Negative prompt; translated the same way as
            *prompt* before being passed to the pipeline.
        seed: Seed for the torch generator (ignored when
            *randomize_seed* is true).
        randomize_seed: When true, draw a fresh seed in ``[0, MAX_SEED]``.
        guidance_scale, num_inference_steps, controlnet_conditioning_scale,
        control_guidance_end, strength: Forwarded to the pipeline call.

    Returns:
        ``([condition_image, generated_image], seed, original_prompt,
        english_prompt)``.

    Raises:
        gr.Error: If no input image was supplied.
    """
    # Fail early with a user-visible message instead of crashing inside
    # resize_image when the image widget is empty.
    if image is None:
        raise gr.Error("이미지를 업로드해 주세요.")

    original_prompt, english_prompt = process_prompt(prompt or "")
    # The UI's default negative prompt is Korean, so it needs the same
    # translation step as the main prompt.
    _, english_negative_prompt = process_prompt(negative_prompt or "")

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    # Slider values may arrive as floats; manual_seed requires an int.
    seed = int(seed)
    generator = torch.Generator().manual_seed(seed)

    init_image = resize_image(image, MAX_IMAGE_SIZE)
    pipe = pipe_canny.to("cuda")
    condi_img = process_canny_condition(np.array(init_image))
    image = pipe(
        prompt=english_prompt,
        image=init_image,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        control_guidance_end=control_guidance_end, 
        strength=strength, 
        control_image=condi_img,
        negative_prompt=english_negative_prompt, 
        num_inference_steps=int(num_inference_steps), 
        guidance_scale=guidance_scale,
        num_images_per_prompt=1,
        generator=generator,
    ).images[0]
    return [condi_img, image], seed, original_prompt, english_prompt

@spaces.GPU
def infer_pose(prompt, 
          image = None, 
          negative_prompt = "NSFW, facial shadow, low resolution, JPEG artifacts, blurry, poor quality, blackface, neon lights.", 
          seed = 66, 
          randomize_seed = False,
          guidance_scale = 6.0, 
          num_inference_steps = 50,
          controlnet_conditioning_scale = 0.7,
          control_guidance_end = 0.9,
          strength = 1.0
        ):
    """Run pose-conditioned img2img generation.

    Args:
        prompt: Text prompt; translated to English by ``process_prompt``
            when it contains Hangul.
        image: Input image. Must not be ``None``.
        negative_prompt: Negative prompt; translated the same way as
            *prompt* before being passed to the pipeline.
        seed: Seed for the torch generator (ignored when
            *randomize_seed* is true).
        randomize_seed: When true, draw a fresh seed in ``[0, MAX_SEED]``.
        guidance_scale, num_inference_steps, controlnet_conditioning_scale,
        control_guidance_end, strength: Forwarded to the pipeline call.

    Returns:
        ``([condition_image, generated_image], seed, original_prompt,
        english_prompt)``.

    Raises:
        gr.Error: If no input image was supplied.
    """
    # Fail early with a user-visible message instead of crashing inside
    # resize_image when the image widget is empty.
    if image is None:
        raise gr.Error("이미지를 업로드해 주세요.")

    original_prompt, english_prompt = process_prompt(prompt or "")
    # The UI's default negative prompt is Korean, so it needs the same
    # translation step as the main prompt.
    _, english_negative_prompt = process_prompt(negative_prompt or "")

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    # Slider values may arrive as floats; manual_seed requires an int.
    seed = int(seed)
    generator = torch.Generator().manual_seed(seed)

    init_image = resize_image(image, MAX_IMAGE_SIZE)
    pipe = pipe_pose.to("cuda")
    condi_img = process_dwpose_condition(np.array(init_image), MAX_IMAGE_SIZE)
    image = pipe(
        prompt=english_prompt,
        image=init_image,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        control_guidance_end=control_guidance_end, 
        strength=strength, 
        control_image=condi_img,
        negative_prompt=english_negative_prompt, 
        num_inference_steps=int(num_inference_steps), 
        guidance_scale=guidance_scale,
        num_images_per_prompt=1,
        generator=generator,
    ).images[0]
    return [condi_img, image], seed, original_prompt, english_prompt

# Example rows for the UI. Each row is [prompt, image path], matching the
# inputs=[prompt, image] of the gr.Examples components below.

# Examples for the Canny-edge mode.
canny_examples = [
    ["아름다운 소녀, 고품질, 매우 선명, 생생한 색상, 초고해상도, 최상의 품질, 8k, 고화질, 4K",
     "image/woman_1.png"],
    ["파노라마, 컵 안에 앉아있는 귀여운 흰 강아지, 카메라를 바라보는, 애니메이션 스타일, 3D 렌더링, 옥테인 렌더",
    "image/dog.png"]
]

# Examples for the depth mode.
depth_examples = [
    ["신카이 마코토 스타일, 풍부한 색감, 초록 셔츠를 입은 여성이 들판에 서 있는, 아름다운 풍경, 맑고 밝은, 얼룩진 빛과 그림자, 최고의 품질, 초세밀, 8K 화질",
     "image/woman_2.png"],
    ["화려한 색상의 작은 새, 고품질, 매우 선명, 생생한 색상, 초고해상도, 최상의 품질, 8k, 고화질, 4K",
     "image/bird.png"]
]

# Examples for the pose mode.
pose_examples = [
    ["보라색 퍼프 슬리브 드레스를 입고 왕관과 흰색 레이스 장갑을 낀 소녀가 양 손으로 얼굴을 감싸고 있는, 고품질, 매우 선명, 생생한 색상, 초고해상도, 최상의 품질, 8k, 고화질, 4K",
     "image/woman_3.png"],
    ["검은색 스포츠 재킷과 흰색 이너를 입고 목걸이를 한 여성이 거리에 서 있는, 배경은 빨간 건물과 녹색 나무, 고품질, 매우 선명, 생생한 색상, 초고해상도, 최상의 품질, 8k, 고화질, 4K",
     "image/woman_4.png"]
]

# Custom CSS passed to gr.Blocks: hides the page footer element.
css = """
footer {
    visibility: hidden;
}
"""

def load_description(fp):
    """Read the file at *fp* as UTF-8 text and return its full contents."""
    with open(fp, encoding='utf-8') as source:
        return source.read()

# Build the Gradio UI: a left column with inputs and settings, a right column
# with outputs, example rows, and one button per ControlNet mode.
with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as Kolors:
    with gr.Row():
        # Left column: prompt, input image, advanced settings, run buttons.
        with gr.Column(elem_id="col-left"):
            with gr.Row():
                prompt = gr.Textbox(
                    label="프롬프트",
                    placeholder="프롬프트를 입력하세요 (한글 또는 영어)",
                    lines=2
                )
            with gr.Row():
                image = gr.Image(label="이미지", type="pil")
            # Collapsed-by-default panel with generation parameters.
            with gr.Accordion("고급 설정", open=False):
                # NOTE(review): this default negative prompt is Korean while
                # the infer_* function defaults are English — confirm the
                # pipeline handles it as intended.
                negative_prompt = gr.Textbox(
                    label="네거티브 프롬프트",
                    placeholder="네거티브 프롬프트를 입력하세요",
                    visible=True,
                    value="nsfw, 얼굴 그림자, 저해상도, jpeg 아티팩트, 흐릿함, 열악함, 검은 얼굴, 네온 조명"
                )
                seed = gr.Slider(
                    label="시드",
                    minimum=0,
                    maximum=MAX_SEED,
                    step=1,
                    value=0,
                )
                # Checked by default, so the slider seed is replaced by a
                # random one unless the user unchecks this.
                randomize_seed = gr.Checkbox(label="시드 무작위화", value=True)
                with gr.Row():
                    guidance_scale = gr.Slider(
                        label="가이던스 스케일",
                        minimum=0.0,
                        maximum=10.0,
                        step=0.1,
                        value=6.0,
                    )
                    num_inference_steps = gr.Slider(
                        label="추론 단계 수",
                        minimum=10,
                        maximum=50,
                        step=1,
                        value=30,
                    )
                with gr.Row():
                    controlnet_conditioning_scale = gr.Slider(
                        label="컨트롤넷 컨디셔닝 스케일",
                        minimum=0.0,
                        maximum=1.0,
                        step=0.1,
                        value=0.7,
                    )
                    control_guidance_end = gr.Slider(
                        label="컨트롤 가이던스 종료",
                        minimum=0.0,
                        maximum=1.0,
                        step=0.1,
                        value=0.9,
                    )
                with gr.Row():
                    strength = gr.Slider(
                        label="강도",
                        minimum=0.0,
                        maximum=1.0,
                        step=0.1,
                        value=1.0,
                    )
            # One button per ControlNet mode.
            # NOTE(review): all three buttons share elem_id="button"; confirm
            # that duplicate element ids are intended.
            with gr.Row():
                canny_button = gr.Button("캐니", elem_id="button")
                depth_button = gr.Button("깊이", elem_id="button")
                pose_button = gr.Button("포즈", elem_id="button")
            
        # Right column: result gallery, the seed that was used, and both
        # versions of the prompt.
        with gr.Column(elem_id="col-right"):
            result = gr.Gallery(label="결과", show_label=False, columns=2)
            seed_used = gr.Number(label="사용된 시드")
            original_prompt_display = gr.Textbox(label="원본 프롬프트")
            english_prompt_display = gr.Textbox(label="영어 프롬프트")
    
    # Example rows. Only prompt and image are supplied as inputs, so if fn is
    # run for an example the remaining infer_* parameters take their defaults.
    # NOTE(review): whether fn is actually executed depends on Gradio's
    # example-caching settings — confirm for the deployed version.
    with gr.Row():
        gr.Examples(
                fn=infer_canny,
                examples=canny_examples,
                inputs=[prompt, image],
                outputs=[result, seed_used, original_prompt_display, english_prompt_display],
                label="Canny"
            )
    with gr.Row():
        gr.Examples(
                fn=infer_depth,
                examples=depth_examples,
                inputs=[prompt, image],
                outputs=[result, seed_used, original_prompt_display, english_prompt_display],
                label="Depth"
            )
        
    with gr.Row():
        gr.Examples(
                fn=infer_pose,
                examples=pose_examples,
                inputs=[prompt, image],
                outputs=[result, seed_used, original_prompt_display, english_prompt_display],
                label="Pose"
            )

    # Wire each button to its inference function. The inputs list order
    # matches the positional parameter order of the infer_* functions.
    canny_button.click(
        fn=infer_canny,
        inputs=[prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength],
        outputs=[result, seed_used, original_prompt_display, english_prompt_display]
    )

    depth_button.click(
        fn=infer_depth,
        inputs=[prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength],
        outputs=[result, seed_used, original_prompt_display, english_prompt_display]
    )

    pose_button.click(
        fn=infer_pose,
        inputs=[prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength],
        outputs=[result, seed_used, original_prompt_display, english_prompt_display]
    )

# Enable request queuing and start the app with debug mode on.
Kolors.queue().launch(debug=True)