"""Gradio app: chat with an Omost LLM to produce a canvas, then render it with SDXL.

Importing this module loads the SDXL components and the LLM and builds the
Gradio ``demo``; running it as a script launches the UI.
"""

# NOTE(review): the import / side-effect order below is kept exactly as in the
# original file. `spaces` is imported before torch, and HF_HOME is assigned
# before any Hugging Face library is imported — presumably so those libraries
# pick up the cache directory; confirm before reordering.
import os
import spaces

os.environ['HF_HOME'] = os.path.join(os.path.dirname(__file__), 'hf_download')
HF_TOKEN = os.environ.get('hf_token')

import uuid
import time
import torch
import numpy as np
import gradio as gr
import tempfile

# Rendered images are written here so they can be referenced from the chatbot.
gradio_temp_dir = os.path.join(tempfile.gettempdir(), 'gradio')
os.makedirs(gradio_temp_dir, exist_ok=True)

from threading import Thread

# Phi3 Hijack
from transformers.models.phi3.modeling_phi3 import Phi3PreTrainedModel

Phi3PreTrainedModel._supports_sdpa = True

from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from diffusers import AutoencoderKL, UNet2DConditionModel
from diffusers.models.attention_processor import AttnProcessor2_0
from transformers import CLIPTextModel, CLIPTokenizer
from lib_omost.pipeline import StableDiffusionXLOmostPipeline
from chat_interface import ChatInterface
from transformers.generation.stopping_criteria import StoppingCriteriaList

import lib_omost.canvas as omost_canvas


# SDXL

# Previous half-precision configuration, kept for reference:
#
# sdxl_name = 'SG161222/RealVisXL_V4.0'
# # sdxl_name = 'stabilityai/stable-diffusion-xl-base-1.0'
#
# tokenizer = CLIPTokenizer.from_pretrained(
#     sdxl_name, subfolder="tokenizer")
# tokenizer_2 = CLIPTokenizer.from_pretrained(
#     sdxl_name, subfolder="tokenizer_2")
# text_encoder = CLIPTextModel.from_pretrained(
#     sdxl_name, subfolder="text_encoder", torch_dtype=torch.float16, variant="fp16", device_map="auto")
# text_encoder_2 = CLIPTextModel.from_pretrained(
#     sdxl_name, subfolder="text_encoder_2", torch_dtype=torch.float16, variant="fp16", device_map="auto")
# vae = AutoencoderKL.from_pretrained(
#     sdxl_name, subfolder="vae", torch_dtype=torch.bfloat16, variant="fp16", device_map="auto")  # bfloat16 vae
# unet = UNet2DConditionModel.from_pretrained(
#     sdxl_name, subfolder="unet", torch_dtype=torch.float16, variant="fp16", device_map="auto")

sdxl_name = 'SG161222/RealVisXL_V4.0'

tokenizer = CLIPTokenizer.from_pretrained(
    sdxl_name, subfolder="tokenizer")
tokenizer_2 = CLIPTokenizer.from_pretrained(
    sdxl_name, subfolder="tokenizer_2")
text_encoder = CLIPTextModel.from_pretrained(
    sdxl_name, subfolder="text_encoder", torch_dtype=torch.float32, device_map="auto")
text_encoder_2 = CLIPTextModel.from_pretrained(
    sdxl_name, subfolder="text_encoder_2", torch_dtype=torch.float32, device_map="auto")
vae = AutoencoderKL.from_pretrained(
    sdxl_name, subfolder="vae", torch_dtype=torch.float32, device_map="auto")
unet = UNet2DConditionModel.from_pretrained(
    sdxl_name, subfolder="unet", torch_dtype=torch.float32, device_map="auto")

unet.set_attn_processor(AttnProcessor2_0())
vae.set_attn_processor(AttnProcessor2_0())

pipeline = StableDiffusionXLOmostPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    text_encoder_2=text_encoder_2,
    tokenizer_2=tokenizer_2,
    unet=unet,
    scheduler=None,  # We completely give up diffusers sampling system and use A1111's method
)

# LLM

# model_name = 'lllyasviel/omost-phi-3-mini-128k'
llm_name = 'lllyasviel/omost-llama-3-8b'
# model_name = 'lllyasviel/omost-dolphin-2.9-llama3-8b'

llm_model = AutoModelForCausalLM.from_pretrained(
    llm_name,
    torch_dtype="auto",
    token=HF_TOKEN,
    device_map="auto",
    trust_remote_code=True,
)

llm_tokenizer = AutoTokenizer.from_pretrained(
    llm_name,
    token=HF_TOKEN
)


@torch.inference_mode()
def pytorch2numpy(imgs):
    """Convert an iterable of tensors into a list of uint8 NumPy arrays.

    Each tensor has its first dimension moved to the end, is rescaled with
    ``x * 127.5 + 127.5``, clipped to ``[0, 255]`` and cast to ``uint8``.
    """
    results = []
    for x in imgs:
        y = x.movedim(0, -1)
        y = y * 127.5 + 127.5
        y = y.detach().float().cpu().numpy().clip(0, 255).astype(np.uint8)
        results.append(y)
    return results


@torch.inference_mode()
def numpy2pytorch(imgs):
    """Stack NumPy images into one float tensor (inverse scaling of ``pytorch2numpy``).

    The stacked batch is rescaled with ``x / 127.5 - 1.0`` and its last
    dimension is moved to position 1.
    """
    h = torch.from_numpy(np.stack(imgs, axis=0)).float() / 127.5 - 1.0
    h = h.movedim(-1, 1)
    return h


def resize_without_crop(image, target_width, target_height):
    """Resize a NumPy image to exactly the target size with Lanczos resampling."""
    pil_image = Image.fromarray(image)
    resized_image = pil_image.resize((target_width, target_height), Image.LANCZOS)
    return np.array(resized_image)


@spaces.GPU(duration=120)
@torch.inference_mode()
def chat_fn(message: str, history: list, seed: int, temperature: float, top_p: float, max_new_tokens: int) -> str:
    """Stream the LLM reply to ``message`` given the previous chat ``history``.

    This is a generator: it yields ``(text_so_far, None)`` tuples as tokens
    arrive from the streamer.

    Args:
        message: The new user message.
        history: Iterable of ``(user, assistant)`` pairs; only pairs where both
            entries are non-empty strings are fed back to the model.
        seed: Seed applied to both NumPy and torch RNGs.
        temperature: Sampling temperature; ``0`` switches sampling off.
        top_p: Nucleus sampling parameter.
        max_new_tokens: Generation length limit.
    """
    print('Chat begin:', message)
    time_stamp = time.time()

    np.random.seed(int(seed))
    torch.manual_seed(int(seed))

    conversation = [{"role": "system", "content": omost_canvas.system_prompt}]

    for user, assistant in history:
        if isinstance(user, str) and isinstance(assistant, str):
            if len(user) > 0 and len(assistant) > 0:
                conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])

    conversation.append({"role": "user", "content": message})

    input_ids = llm_tokenizer.apply_chat_template(
        conversation, return_tensors="pt", add_generation_prompt=True).to(llm_model.device)

    streamer = TextIteratorStreamer(llm_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    def interactive_stopping_criteria(*args, **kwargs) -> bool:
        # Stop as soon as the flag set by `interrupter` appears on the streamer.
        if getattr(streamer, 'user_interrupted', False):
            print('User stopped generation:', message)
            return True
        else:
            return False

    stopping_criteria = StoppingCriteriaList([interactive_stopping_criteria])

    # NOTE(review): `interrupter` is defined but not handed to anyone here —
    # the generator yields `None` in the second slot below.
    def interrupter():
        streamer.user_interrupted = True
        return

    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        stopping_criteria=stopping_criteria,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )

    if temperature == 0:
        generate_kwargs['do_sample'] = False

    # Generation runs in a background thread; this generator consumes the streamer.
    Thread(target=llm_model.generate, kwargs=generate_kwargs).start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        # print(outputs)
        yield "".join(outputs), None

    print(f'Chat end at {time.time() - time_stamp:.2f} seconds:', message)
    return


@torch.inference_mode()
def post_chat(history):
    """Try to parse the last assistant reply into a canvas after a chat turn.

    Args:
        history: Chat history as ``(user, assistant)`` pairs; may be empty or
            ``None``.

    Returns:
        A 3-tuple: the processed canvas (or ``None`` when there is no valid
        canvas), an update that shows a component only when a canvas exists,
        and an update that makes a component interactive only when there is
        history.
    """
    canvas_outputs = None

    try:
        if history:
            history = [(user, assistant) for user, assistant in history if isinstance(user, str) and isinstance(assistant, str)]
            last_assistant = history[-1][1] if len(history) > 0 else None
            canvas = omost_canvas.Canvas.from_bot_response(last_assistant)
            canvas_outputs = canvas.process()
    except Exception as e:
        print('Last assistant response is not valid canvas:', e)

    # bool() instead of len(): `history` may be None, and len(None) would raise
    # here, outside the try block above.
    return canvas_outputs, gr.update(visible=canvas_outputs is not None), gr.update(interactive=bool(history))


def preprocess_product_image(image, target_width, target_height):
    """Turn a PIL image into a ``(1, 3, H, W)`` float32 tensor scaled by ``x / 127.5 - 1``.

    The image is converted to RGB and resized (Lanczos) to the target size first.
    """
    image = image.convert("RGB")
    image = image.resize((target_width, target_height), Image.LANCZOS)
    image = np.array(image)
    image = image.astype(np.float32) / 127.5 - 1.0
    image = np.transpose(image, (2, 0, 1))  # HWC -> CHW
    return torch.from_numpy(image).unsqueeze(0)


@spaces.GPU
@torch.inference_mode()
def diffusion_fn(chatbot, canvas_outputs, num_samples, seed, image_width, image_height,
                 highres_scale, steps, cfg, highres_steps, highres_denoise, negative_prompt, product_image):
    """Render the current canvas and append the resulting images to the chat.

    Args:
        chatbot: Current chatbot message list; a new list with image entries
            appended is returned.
        canvas_outputs: Processed canvas used to build the prompt conditions.
        num_samples: Number of images to generate.
        seed: Seed for the torch generator.
        image_width, image_height: Requested size; floored to a multiple of 64.
        highres_scale: Upscale factor; a second pass runs only when it exceeds
            ``1.0 + eps``.
        steps: Sampling steps of the first pass.
        cfg: Guidance scale.
        highres_steps: Sampling steps of the second pass.
        highres_denoise: ``strength`` of the second pass.
        negative_prompt: Negative prompt text.
        product_image: Optional PIL image, preprocessed and passed to the
            first pipeline call only.

    Returns:
        The chatbot list extended with one ``(None, (image_path, 'image'))``
        entry per generated image.
    """
    use_initial_latent = False
    eps = 0.05

    # UI values are cast up front, matching how `steps` and the chat seed are treated.
    num_samples = int(num_samples)
    seed = int(seed)

    image_width, image_height = int(image_width // 64) * 64, int(image_height // 64) * 64
    print("Diffusion function called . . . ")

    if product_image is not None:
        product_image = preprocess_product_image(product_image, image_width, image_height)

    rng = torch.Generator(unet.device).manual_seed(seed)

    positive_cond, positive_pooler, negative_cond, negative_pooler = pipeline.all_conds_from_canvas(canvas_outputs, negative_prompt)

    if use_initial_latent:
        initial_latent = torch.from_numpy(canvas_outputs['initial_latent'])[None].movedim(-1, 1) / 127.5 - 1.0
        initial_latent_blur = 40
        initial_latent = torch.nn.functional.avg_pool2d(
            torch.nn.functional.pad(initial_latent, (initial_latent_blur,) * 4, mode='reflect'),
            kernel_size=(initial_latent_blur * 2 + 1,) * 2, stride=(1, 1))
        initial_latent = torch.nn.functional.interpolate(initial_latent, (image_height, image_width))
        initial_latent = initial_latent.to(dtype=vae.dtype, device=vae.device)
        initial_latent = vae.encode(initial_latent).latent_dist.mode() * vae.config.scaling_factor
    else:
        initial_latent = torch.zeros(size=(num_samples, 4, image_height // 8, image_width // 8), dtype=torch.float32)

    initial_latent = initial_latent.to(dtype=unet.dtype, device=unet.device)
    print("Generating latents . . .")

    latents = pipeline(
        initial_latent=initial_latent,
        strength=1.0,
        num_inference_steps=int(steps),
        batch_size=num_samples,
        prompt_embeds=positive_cond,
        negative_prompt_embeds=negative_cond,
        pooled_prompt_embeds=positive_pooler,
        negative_pooled_prompt_embeds=negative_pooler,
        generator=rng,
        guidance_scale=float(cfg),
        product_image=product_image,
    ).images
    print("Latents generated. Processing results...")

    latents = latents.to(dtype=vae.dtype, device=vae.device) / vae.config.scaling_factor
    pixels = vae.decode(latents).sample
    _, _, H, W = pixels.shape
    pixels = pytorch2numpy(pixels)

    if highres_scale > 1.0 + eps:
        # Second pass: upscale in pixel space, re-encode, and denoise again.
        pixels = [
            resize_without_crop(
                image=p,
                target_width=int(round(W * highres_scale / 64.0) * 64),
                target_height=int(round(H * highres_scale / 64.0) * 64)
            ) for p in pixels
        ]

        pixels = numpy2pytorch(pixels).to(device=vae.device, dtype=vae.dtype)
        latents = vae.encode(pixels).latent_dist.mode() * vae.config.scaling_factor

        latents = latents.to(device=unet.device, dtype=unet.dtype)

        latents = pipeline(
            initial_latent=latents,
            strength=highres_denoise,
            num_inference_steps=int(highres_steps),
            batch_size=num_samples,
            prompt_embeds=positive_cond,
            negative_prompt_embeds=negative_cond,
            pooled_prompt_embeds=positive_pooler,
            negative_pooled_prompt_embeds=negative_pooler,
            generator=rng,
            guidance_scale=float(cfg),
        ).images

        latents = latents.to(dtype=vae.dtype, device=vae.device) / vae.config.scaling_factor
        pixels = vae.decode(latents).sample
        pixels = pytorch2numpy(pixels)

    for i, pixel_array in enumerate(pixels):
        unique_hex = uuid.uuid4().hex
        image_path = os.path.join(gradio_temp_dir, f"{unique_hex}_{i}.png")
        image = Image.fromarray(pixel_array)
        image.save(image_path)
        chatbot = chatbot + [(None, (image_path, 'image'))]

    return chatbot


css = '''
code {white-space: pre-wrap !important;}
.gradio-container {max-width: none !important;}
.outer_parent {flex: 1;}
.inner_parent {flex: 1;}
footer {display: none !important; visibility: hidden !important;}
.translucent {display: none !important; visibility: hidden !important;}
'''

from gradio.themes.utils import colors

# NOTE(review): the nesting of the layout blocks below was reconstructed from
# the component order — confirm it against the intended UI.
with gr.Blocks(
        fill_height=True, css=css,
        theme=gr.themes.Default(primary_hue=colors.blue, secondary_hue=colors.cyan, neutral_hue=colors.gray)
) as demo:
    with gr.Row(elem_classes='outer_parent'):
        # Left column: inputs and generation settings.
        with gr.Column(scale=25):
            product_image = gr.Image(label="Product Image", type="pil")

            with gr.Row():
                clear_btn = gr.Button("➕ New Chat", variant="secondary", size="sm", min_width=60)
                retry_btn = gr.Button("Retry", variant="secondary", size="sm", min_width=60, visible=False)
                undo_btn = gr.Button("✏️️ Edit Last Input", variant="secondary", size="sm", min_width=60, interactive=False)

            seed = gr.Number(label="Random Seed", value=123456, precision=0)

            with gr.Accordion(open=True, label='Language Model'):
                with gr.Group():
                    with gr.Row():
                        temperature = gr.Slider(
                            minimum=0.0,
                            maximum=2.0,
                            step=0.01,
                            value=0.6,
                            label="Temperature")
                        top_p = gr.Slider(
                            minimum=0.0,
                            maximum=1.0,
                            step=0.01,
                            value=0.9,
                            label="Top P")
                    max_new_tokens = gr.Slider(
                        minimum=128,
                        maximum=4096,
                        step=1,
                        value=4096,
                        label="Max New Tokens")

            with gr.Accordion(open=True, label='Image Diffusion Model'):
                with gr.Group():
                    with gr.Row():
                        image_width = gr.Slider(label="Image Width", minimum=256, maximum=2048, value=896, step=64)
                        image_height = gr.Slider(label="Image Height", minimum=256, maximum=2048, value=1152, step=64)

                    with gr.Row():
                        num_samples = gr.Slider(label="Image Number", minimum=1, maximum=12, value=1, step=1)
                        steps = gr.Slider(label="Sampling Steps", minimum=1, maximum=100, value=25, step=1)

            with gr.Accordion(open=False, label='Advanced'):
                cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=5.0, step=0.01)
                highres_scale = gr.Slider(label="HR-fix Scale (\"1\" is disabled)", minimum=1.0, maximum=2.0, value=1.0, step=0.01)
                highres_steps = gr.Slider(label="Highres Fix Steps", minimum=1, maximum=100, value=20, step=1)
                highres_denoise = gr.Slider(label="Highres Fix Denoise", minimum=0.1, maximum=1.0, value=0.4, step=0.01)
                n_prompt = gr.Textbox(label="Negative Prompt", value='lowres, bad anatomy, bad hands, cropped, worst quality')

            render_button = gr.Button("Render the Image!", size='lg', variant="primary", visible=False)

            examples = gr.Dataset(
                samples=[
                    ['generate an image of the fierce battle of warriors and the dragon'],
                    ['change the dragon to a dinosaur']
                ],
                components=[gr.Textbox(visible=False)],
                label='Quick Prompts'
            )

            with gr.Row():
                gr.Markdown("Omost: converting LLM's coding capability to image compositing capability.")
            with gr.Row():
                gr.Markdown("Local version (8GB VRAM): https://github.com/lllyasviel/Omost")
            # with gr.Row():
            #     gr.Markdown("Hint: You can [duplicate this space](https://huggingface.co/spaces/lllyasviel/Omost?duplicate=true) to your private account to bypass the waiting queue.")

        # Right column: the chat itself.
        with gr.Column(scale=75, elem_classes='inner_parent'):
            canvas_state = gr.State(None)
            chatbot = gr.Chatbot(label='Omost', scale=1, show_copy_button=True, layout="panel", render=False)
            chatInterface = ChatInterface(
                fn=chat_fn,
                post_fn=post_chat,
                post_fn_kwargs=dict(inputs=[chatbot], outputs=[canvas_state, render_button, undo_btn]),
                pre_fn=lambda: gr.update(visible=False),
                pre_fn_kwargs=dict(outputs=[render_button]),
                chatbot=chatbot,
                retry_btn=retry_btn,
                undo_btn=undo_btn,
                clear_btn=clear_btn,
                additional_inputs=[seed, temperature, top_p, max_new_tokens],
                examples=examples,
                show_stop_button=False
            )

    # Render, then copy the updated chatbot value into the interface's state.
    render_button.click(
        fn=diffusion_fn, inputs=[
            chatInterface.chatbot, canvas_state,
            num_samples, seed, image_width, image_height, highres_scale,
            steps, cfg, highres_steps, highres_denoise, n_prompt, product_image
        ], outputs=[chatInterface.chatbot]).then(
        fn=lambda x: x, inputs=[
            chatInterface.chatbot
        ], outputs=[chatInterface.chatbot_state])

if __name__ == "__main__":
    demo.queue().launch()