phillipinseoul committed on
Commit 47a3cb0
Parent: 31d5771

add app.py

app.py ADDED
@@ -0,0 +1,87 @@
+ """
+ app.py
+ An interactive demo for text-guided panorama generation.
+ """
+ import torch
+ import gradio as gr
+
+ from syncdiffusion.syncdiffusion_model import SyncDiffusion
+ from syncdiffusion.utils import seed_everything
+
+ # NOTE: the first five parameters are ordered to match the Gradio `inputs` list below,
+ # since Gradio passes component values to the callback positionally.
+ def run_inference(
+     prompt: str,
+     width: int = 2048,
+     sync_weight: float = 20.0,
+     sync_thres: int = 5,
+     seed: int = 0,
+     height: int = 512,
+     sync_decay_rate: float = 0.95,
+     sync_freq: int = 1,
+ ):
+     # set device
+     device = torch.device("cuda")
+
+     # set random seed
+     seed_everything(seed)
+
+     # load SyncDiffusion model
+     syncdiffusion = SyncDiffusion(device, sd_version="2.0")
+
+     img = syncdiffusion.sample_syncdiffusion(
+         prompts=prompt,
+         negative_prompts="",
+         height=height,
+         width=width,
+         num_inference_steps=50,
+         guidance_scale=7.5,
+         sync_weight=sync_weight,
+         sync_decay_rate=sync_decay_rate,
+         sync_freq=sync_freq,
+         sync_thres=sync_thres,
+         stride=16
+     )
+     return [img]
+
+ if __name__ == "__main__":
+     title = "SyncDiffusion: Text-Guided Panorama Generation"
+
+     description_text = '''
+     This demo features text-guided panorama generation from our work <a href="https://arxiv.org/abs/2306.05178">SyncDiffusion: Coherent Montage via Synchronized Joint Diffusions, NeurIPS 2023</a>.
+     Please refer to our <a href="https://syncdiffusion.github.io/">project page</a> for details.
+     '''
+
+     # create UI
+     with gr.Blocks(title=title) as demo:
+
+         # description of demo
+         gr.Markdown(description_text)
+
+         # inputs
+         with gr.Row():
+             with gr.Column():
+                 run_button = gr.Button(label="Generate")
+
+                 prompt = gr.Textbox(label="Text Prompt", value='a cinematic view of a castle in the sunset')
+                 width = gr.Slider(label="Width", minimum=512, maximum=4096, value=2048, step=128)
+                 sync_weight = gr.Slider(label="Sync Weight", minimum=0.0, maximum=30.0, value=20.0, step=5.0)
+                 sync_thres = gr.Slider(label="Sync Threshold (If N, apply SyncDiffusion for the first N steps)", minimum=0, maximum=50, value=5, step=1)
+                 seed = gr.Number(label="Seed", value=0)
+
+             with gr.Column():
+                 result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
+
+         # display examples
+         examples = gr.Examples(
+             examples=[
+                 'a cinematic view of a castle in the sunset',
+                 'natural landscape in anime style illustration',
+                 'a photo of a lake under the northern lights',
+             ],
+             inputs=[prompt],
+         )
+
+         ips = [prompt, width, sync_weight, sync_thres, seed]
+         # pass the callback itself (not its return value) to `click`
+         run_button.click(fn=run_inference, inputs=ips, outputs=[result_gallery])
+
+     demo.queue(max_size=30)
+     demo.launch()
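For quick testing outside the Gradio UI, the same pipeline can be driven directly from Python. The following is a minimal sketch based on the defaults used above; it assumes a CUDA device and an output filename of my choosing, and is not part of this commit:

import torch
from syncdiffusion.syncdiffusion_model import SyncDiffusion
from syncdiffusion.utils import seed_everything

seed_everything(0)
model = SyncDiffusion(torch.device("cuda"), sd_version="2.0")

# generate a 512x2048 panorama with the demo defaults
panorama = model.sample_syncdiffusion(
    prompts="a cinematic view of a castle in the sunset",
    negative_prompts="",
    height=512,
    width=2048,
    num_inference_steps=50,
    guidance_scale=7.5,
    sync_weight=20.0,
    sync_decay_rate=0.95,
    sync_freq=1,
    sync_thres=5,
    stride=16,
)
panorama.save("panorama.png")  # sample_syncdiffusion returns a PIL image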
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ torch==1.12.1
+ torchvision==0.13.1
+ transformers==4.28
+ diffusers==0.15.1
+ opencv-python==4.5.1.48
+ lpips
+ accelerate
+ tqdm
+ gradio
syncdiffusion/__pycache__/model.cpython-38.pyc ADDED
Binary file (5.81 kB).
syncdiffusion/__pycache__/syncdiffusion.cpython-38.pyc ADDED
Binary file (5.75 kB).
syncdiffusion/__pycache__/syncdiffusion_model.cpython-38.pyc ADDED
Binary file (5.53 kB).
syncdiffusion/__pycache__/syncdiffusion_model.cpython-39.pyc ADDED
Binary file (5.63 kB).
syncdiffusion/__pycache__/utils.cpython-38.pyc ADDED
Binary file (1.3 kB).
syncdiffusion/__pycache__/utils.cpython-39.pyc ADDED
Binary file (1.29 kB).
syncdiffusion/syncdiffusion_model.py ADDED
@@ -0,0 +1,232 @@
+ import torch
+ import torch.nn as nn
+ import torchvision.transforms as T
+ from torch.autograd import grad
+ import argparse
+ from tqdm import tqdm
+
+ from syncdiffusion.utils import *
+ import lpips
+ from transformers import CLIPTextModel, CLIPTokenizer
+ from diffusers import AutoencoderKL, UNet2DConditionModel, DDIMScheduler
+
+ class SyncDiffusion(nn.Module):
+     def __init__(self, device='cuda', sd_version='2.0', hf_key=None):
+         super().__init__()
+
+         self.device = device
+         self.sd_version = sd_version
+
+         print(f'[INFO] loading stable diffusion...')
+         if hf_key is not None:
+             print(f'[INFO] using hugging face custom model key: {hf_key}')
+             model_key = hf_key
+         elif self.sd_version == '2.1':
+             model_key = "stabilityai/stable-diffusion-2-1-base"
+         elif self.sd_version == '2.0':
+             model_key = "stabilityai/stable-diffusion-2-base"
+         elif self.sd_version == '1.5':
+             model_key = "runwayml/stable-diffusion-v1-5"
+         else:
+             raise ValueError(f'Stable-diffusion version {self.sd_version} not supported.')
+
+         # Load pretrained models from HuggingFace
+         self.vae = AutoencoderKL.from_pretrained(model_key, subfolder="vae").to(self.device)
+         self.tokenizer = CLIPTokenizer.from_pretrained(model_key, subfolder="tokenizer")
+         self.text_encoder = CLIPTextModel.from_pretrained(model_key, subfolder="text_encoder").to(self.device)
+         self.unet = UNet2DConditionModel.from_pretrained(model_key, subfolder="unet").to(self.device)
+
+         # Freeze models
+         for p in self.unet.parameters():
+             p.requires_grad_(False)
+         for p in self.vae.parameters():
+             p.requires_grad_(False)
+         for p in self.text_encoder.parameters():
+             p.requires_grad_(False)
+
+         self.unet.eval()
+         self.vae.eval()
+         self.text_encoder.eval()
+         print(f'[INFO] loaded stable diffusion!')
+
+         # Set DDIM scheduler
+         self.scheduler = DDIMScheduler.from_pretrained(model_key, subfolder="scheduler")
+
+         # load perceptual loss (LPIPS)
+         self.percept_loss = lpips.LPIPS(net='vgg').to(self.device)
+         print(f'[INFO] loaded perceptual loss!')
+
+     def get_text_embeds(self, prompt, negative_prompt):
+         # Tokenize text and get embeddings
+         text_input = self.tokenizer(prompt, padding='max_length', max_length=self.tokenizer.model_max_length,
+                                     truncation=True, return_tensors='pt')
+         text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
+
+         # Repeat for unconditional embeddings
+         uncond_input = self.tokenizer(negative_prompt, padding='max_length', max_length=self.tokenizer.model_max_length,
+                                       return_tensors='pt')
+         uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+         # Concatenate for final embeddings
+         text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+         return text_embeddings
+
+     def decode_latents(self, latents):
+         latents = 1 / 0.18215 * latents
+         imgs = self.vae.decode(latents).sample
+         imgs = (imgs / 2 + 0.5).clamp(0, 1)
+         return imgs
+
+     def sample_syncdiffusion(
+         self,
+         prompts,
+         negative_prompts="",
+         height=512,
+         width=2048,
+         latent_size=64,         # fix latent size to 64 for Stable Diffusion
+         num_inference_steps=50,
+         guidance_scale=7.5,
+         sync_weight=20,         # gradient descent weight 'w' in the paper
+         sync_freq=1,            # sync_freq=n: perform gradient descent every n steps
+         sync_thres=50,          # sync_thres=n: compute SyncDiffusion only for the first n steps
+         sync_decay_rate=0.95,   # decay rate for sync_weight, set as 0.95 in the paper
+         stride=16,              # stride for latents, set as 16 in the paper
+     ):
+         assert height >= 512 and width >= 512, 'height and width must be at least 512'
+         assert height % (stride * 8) == 0 and width % (stride * 8) == 0, 'height and width must be divisible by the stride multiplied by 8'
+         assert stride % 8 == 0 and stride < 64, 'stride must be divisible by 8 and smaller than the latent size of Stable Diffusion'
+
+         if isinstance(prompts, str):
+             prompts = [prompts]
+
+         if isinstance(negative_prompts, str):
+             negative_prompts = [negative_prompts]
+
+         # obtain text embeddings
+         text_embeds = self.get_text_embeds(prompts, negative_prompts)  # [2, 77, 768]
+
+         # define a list of windows to process in parallel
+         views = get_views(height, width, stride=stride)
+         print(f"[INFO] number of views to process: {len(views)}")
+
+         # Initialize latent
+         latent = torch.randn((1, self.unet.in_channels, height // 8, width // 8))
+
+         count = torch.zeros_like(latent, requires_grad=False, device=self.device)
+         value = torch.zeros_like(latent, requires_grad=False, device=self.device)
+         latent = latent.to(self.device)
+
+         # set DDIM scheduler
+         self.scheduler.set_timesteps(num_inference_steps)
+
+         # set the anchor view as the middle view
+         anchor_view_idx = len(views) // 2
+
+         # set SyncDiffusion scheduler
+         sync_scheduler = exponential_decay_list(
+             init_weight=sync_weight,
+             decay_rate=sync_decay_rate,
+             num_steps=num_inference_steps
+         )
+         print(f'[INFO] using exponential decay scheduler with decay rate {sync_decay_rate}')
+
+         with torch.autocast('cuda'):
+             for i, t in enumerate(tqdm(self.scheduler.timesteps)):
+                 count.zero_()
+                 value.zero_()
+
+                 '''
+                 (1) First, obtain the reference anchor view (for computing the perceptual loss)
+                 '''
+                 with torch.no_grad():
+                     if (i + 1) % sync_freq == 0 and i < sync_thres:
+                         # decode the anchor view
+                         h_start, h_end, w_start, w_end = views[anchor_view_idx]
+                         latent_view = latent[:, :, h_start:h_end, w_start:w_end].detach()
+
+                         latent_model_input = torch.cat([latent_view] * 2)  # 2 x 4 x 64 x 64
+                         noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeds)['sample']
+
+                         # perform guidance
+                         noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
+                         noise_pred_new = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
+
+                         # predict the 'foreseen denoised' latent (x0) of the anchor view
+                         latent_pred_x0 = self.scheduler.step(noise_pred_new, t, latent_view)["pred_original_sample"]
+                         decoded_image_anchor = self.decode_latents(latent_pred_x0)  # 1 x 3 x 512 x 512
+
+                 '''
+                 (2) Then perform SyncDiffusion and run a single denoising step
+                 '''
+                 for view_idx, (h_start, h_end, w_start, w_end) in enumerate(views):
+                     latent_view = latent[:, :, h_start:h_end, w_start:w_end].detach()
+
+                     ############################## BEGIN: PERFORM GRADIENT DESCENT (SyncDiffusion) ##############################
+                     latent_view_copy = latent_view.clone().detach()
+
+                     if (i + 1) % sync_freq == 0 and i < sync_thres:
+
+                         # enable gradients w.r.t. latent_view
+                         latent_view = latent_view.requires_grad_()
+
+                         # expand the latents for classifier-free guidance
+                         latent_model_input = torch.cat([latent_view] * 2)
+
+                         # predict the noise residual
+                         noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeds)['sample']
+
+                         # perform guidance
+                         noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
+                         noise_pred_new = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
+
+                         # compute the denoising step with the reference model
+                         out = self.scheduler.step(noise_pred_new, t, latent_view)
+
+                         # predict the 'foreseen denoised' latent (x0)
+                         latent_view_x0 = out['pred_original_sample']
+
+                         # decode the denoised latent
+                         decoded_x0 = self.decode_latents(latent_view_x0)  # 1 x 3 x 512 x 512
+
+                         # compute the perceptual loss (LPIPS)
+                         percept_loss = self.percept_loss(
+                             decoded_x0 * 2.0 - 1.0,
+                             decoded_image_anchor * 2.0 - 1.0
+                         )
+
+                         # compute the gradient of the perceptual loss w.r.t. the latent
+                         norm_grad = grad(outputs=percept_loss, inputs=latent_view)[0]
+
+                         # SyncDiffusion: update the original latent
+                         if view_idx != anchor_view_idx:
+                             latent_view_copy = latent_view_copy - sync_scheduler[i] * norm_grad  # 1 x 4 x 64 x 64
+                     ############################## END: PERFORM GRADIENT DESCENT (SyncDiffusion) ##############################
+
+                     # after gradient descent, perform a single denoising step
+                     with torch.no_grad():
+                         latent_model_input = torch.cat([latent_view_copy] * 2)
+                         noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeds)['sample']
+
+                         noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
+                         noise_pred_new = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
+
+                         out = self.scheduler.step(noise_pred_new, t, latent_view_copy)
+                         latent_view_denoised = out['prev_sample']
+
+                     # merge the latent views
+                     value[:, :, h_start:h_end, w_start:w_end] += latent_view_denoised
+                     count[:, :, h_start:h_end, w_start:w_end] += 1
+
+                 # take the MultiDiffusion step (average the latents)
+                 latent = torch.where(count > 0, value / count, value)
+
+         # decode latents to panorama image
+         with torch.no_grad():
+             imgs = self.decode_latents(latent)  # 1 x 3 x height x width
+             img = T.ToPILImage()(imgs[0].cpu())
+
+         print(f"[INFO] Done!")
+
+         return img
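To make the per-window update explicit: inside the loop above, every non-anchor window takes one gradient-descent step on the LPIPS distance between its 'foreseen denoised' image and the anchor's, before the usual DDIM step. A stripped-down sketch of just that update, with `decode_x0`, `lpips_fn`, and `weight` standing in for the class attributes and decayed sync weight used above (names of my choosing, not part of the commit):

from torch.autograd import grad

def sync_update(latent_view, decoded_anchor, decode_x0, lpips_fn, weight):
    # enable gradients w.r.t. this window's latent
    latent_view = latent_view.detach().requires_grad_()
    # 'foreseen denoised' image for this window (UNet prediction + scheduler x0 + VAE decode)
    decoded_x0 = decode_x0(latent_view)
    # LPIPS expects inputs scaled to [-1, 1]
    loss = lpips_fn(decoded_x0 * 2.0 - 1.0, decoded_anchor * 2.0 - 1.0)
    # one gradient-descent step toward perceptual agreement with the anchor
    g = grad(outputs=loss, inputs=latent_view)[0]
    return (latent_view - weight * g).detach()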
syncdiffusion/utils.py ADDED
@@ -0,0 +1,28 @@
+ import numpy as np
+ import torch
+
+ def seed_everything(seed):
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
+
+ def get_views(panorama_height, panorama_width, window_size=64, stride=8):
+     panorama_height /= 8
+     panorama_width /= 8
+     num_blocks_height = (panorama_height - window_size) // stride + 1
+     num_blocks_width = (panorama_width - window_size) // stride + 1
+     total_num_blocks = int(num_blocks_height * num_blocks_width)
+     views = []
+     for i in range(total_num_blocks):
+         h_start = int((i // num_blocks_width) * stride)
+         h_end = h_start + window_size
+         w_start = int((i % num_blocks_width) * stride)
+         w_end = w_start + window_size
+         views.append((h_start, h_end, w_start, w_end))
+     return views
+
+ def exponential_decay_list(init_weight, decay_rate, num_steps):
+     weights = [init_weight * (decay_rate ** i) for i in range(num_steps)]
+     return torch.tensor(weights)
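As a quick sanity check (not part of the commit), these helpers give the following for the demo defaults of a 512x2048 panorama with stride=16 and sync_weight=20:

from syncdiffusion.utils import get_views, exponential_decay_list

# the 64x256 latent is tiled into overlapping 64x64 windows, shifted by 16
views = get_views(512, 2048, window_size=64, stride=16)
print(len(views))            # 13 windows; the anchor view is views[13 // 2] = views[6]
print(views[0], views[1])    # (0, 64, 0, 64) (0, 64, 16, 80)

# per-step gradient-descent weights, decayed by 0.95 each denoising step
weights = exponential_decay_list(init_weight=20.0, decay_rate=0.95, num_steps=50)
print(weights[:3])           # approximately [20.0, 19.0, 18.05]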