Spaces:

rlawjdghek
/

StableVITON

Running on Zero

App Files Files Community

rlawjdghek committed on Apr 26

Commit

8df522a

•

1 Parent(s): ad267aa

update

Browse files

Files changed (5) hide show

app.py +97 -45
cldm/cldm.py +3 -2
ldm/models/autoencoder.py +3 -3
ldm/models/diffusion/ddpm.py +2 -2
utils_stableviton.py +14 -4

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from preprocess.detectron2.projects.DensePose.apply_net_gradio import DensePose4Gradio
 from preprocess.humanparsing.run_parsing import Parsing
 from preprocess.openpose.run_openpose import OpenPose
-import pytorch_lightning as pl
 import os
 import sys
 import time
@@ -17,14 +17,13 @@ import spaces
 from cldm.model import create_model
 from cldm.plms_hacked import PLMSSampler
-from utils_stableviton import get_batch, get_mask_location, tensor2img
-print("pip import done")
 PROJECT_ROOT = Path(__file__).absolute().parents[1].absolute()
 sys.path.insert(0, str(PROJECT_ROOT))
-IMG_H = 512
-IMG_W = 384
 openpose_model_hd = OpenPose(0)
 openpose_model_hd.preprocessor.body_estimation.model.to('cuda')
@@ -44,18 +43,27 @@ config.model.params.img_W = IMG_W
 params = config.model.params
 model = create_model(config_path=None, config=config)
-model.load_state_dict(torch.load("./checkpoints/VITONHD.ckpt", map_location="cpu")["state_dict"])
 model = model.cuda()
 model.eval()
 sampler = PLMSSampler(model)
-# #### model init <<<<
 def stable_viton_model_hd(
         batch,
         n_steps,
 ):
     z, cond = model.get_input(batch, params.first_stage_key)
     bs = z.shape[0]
     c_crossattn = cond["c_crossattn"][0][:bs]
     if c_crossattn.ndim == 4:
@@ -71,16 +79,16 @@ def stable_viton_model_hd(
     ts = torch.full((1,), 999, device=z.device, dtype=torch.long)
     start_code = model.q_sample(z, ts)
     output, _, _ = sampler.sample(
         n_steps,
         bs,
-        (4, IMG_H // 8, IMG_W // 8),
         cond,
-        x_T=start_code,
         verbose=False,
         eta=0.0,
-        unconditional_conditioning=uc_full,
     )
     output = model.decode_first_stage(output)
@@ -88,61 +96,107 @@ def stable_viton_model_hd(
     pil_output = Image.fromarray(output)
     return pil_output
-@spaces.GPU  # TODO: turn on when final upload
 @torch.no_grad()
-def process_hd(vton_img, garm_img, n_steps):
     model_type = 'hd'
     category = 0  # 0:upperbody; 1:lowerbody; 2:dress
     stt = time.time()
     print('load images... ', end='')
-    garm_img = Image.open(garm_img).resize((IMG_W, IMG_H))
-    vton_img = Image.open(vton_img).resize((IMG_W, IMG_H))
     print('%.2fs' % (time.time() - stt))
     stt = time.time()
     print('get agnostic map... ', end='')
     keypoints = openpose_model_hd(vton_img.resize((IMG_W, IMG_H)))
     model_parse, _ = parsing_model_hd(vton_img.resize((IMG_W, IMG_H)))
-    mask, mask_gray = get_mask_location(model_type, category_dict_utils[category], model_parse, keypoints)
     mask = mask.resize((IMG_W, IMG_H), Image.NEAREST)
     mask_gray = mask_gray.resize((IMG_W, IMG_H), Image.NEAREST)
     masked_vton_img = Image.composite(mask_gray, vton_img, mask)  # agnostic map
     print('%.2fs' % (time.time() - stt))
     stt = time.time()
     print('get densepose... ', end='')
     vton_img = vton_img.resize((IMG_W, IMG_H))  # size for densepose
     densepose = densepose_model_hd.execute(vton_img)  # densepose
-    # human_img_arg = _apply_exif_orientation(vton_img.resize((IMG_W, IMG_H)))
-    # human_img_arg = convert_PIL_to_numpy(human_img_arg, format="BGR")
-    # args = apply_net.create_argument_parser().parse_args(('show', './configs/densepose_rcnn_R_50_FPN_s1x.yaml', './ckpt/densepose/model_final_162be9.pkl', 'dp_segm', '-v', '--opts', 'MODEL.DEVICE', 'cuda'))
-    # verbosity = getattr(args, "verbosity", None)
-    # pose_img = args.func(args, human_img_arg)
-    # pose_img = pose_img[:, :, ::-1]
-    # pose_img = Image.fromarray(pose_img).resize((IMG_W, IMG_H))
     print('%.2fs' % (time.time() - stt))
     batch = get_batch(
-        vton_img,
-        garm_img,
-        densepose,
-        masked_vton_img,
-        mask,
-        IMG_H,
         IMG_W
     )
-    sample = stable_viton_model_hd(
-        batch,
-        n_steps
-    )
     return sample
-example_path = opj(os.path.dirname(__file__), 'examples')
 example_model_ps = sorted(glob(opj(example_path, "model/*")))
 example_garment_ps = sorted(glob(opj(example_path, "garment/*")))
@@ -151,7 +205,7 @@ with gr.Blocks(css='style.css') as demo:
         """
         <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
             <div>
-                <h1>StableVITON Demo 👕👔👗</h1>
                 <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
                     <a href='https://arxiv.org/abs/2312.01725'>
                         <img src="https://img.shields.io/badge/arXiv-2312.01725-red">
@@ -189,17 +243,15 @@ with gr.Blocks(css='style.css') as demo:
                 examples_per_page=14,
                 examples=example_garment_ps)
         with gr.Column():
-            result_gallery = gr.Image(label='Output', show_label=False, scale=1)
-            # result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery", scale=1)
     with gr.Column():
         run_button = gr.Button(value="Run")
-        # TODO: change default values (important!)
-        # n_samples = gr.Slider(label="Images", minimum=1, maximum=4, value=1, step=1)
-        n_steps = gr.Slider(label="Steps", minimum=20, maximum=70, value=25, step=1)
-        # guidance_scale = gr.Slider(label="Guidance scale", minimum=1.0, maximum=5.0, value=2.0, step=0.1)
         # seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, value=-1)
-    ips = [vton_img, garm_img, n_steps]
     run_button.click(fn=process_hd, inputs=ips, outputs=[result_gallery])
 demo.queue().launch()

 from preprocess.detectron2.projects.DensePose.apply_net_gradio import DensePose4Gradio
 from preprocess.humanparsing.run_parsing import Parsing
 from preprocess.openpose.run_openpose import OpenPose
 import os
 import sys
 import time
 from cldm.model import create_model
 from cldm.plms_hacked import PLMSSampler
+from utils_stableviton import get_mask_location, get_batch, tensor2img, center_crop
 PROJECT_ROOT = Path(__file__).absolute().parents[1].absolute()
 sys.path.insert(0, str(PROJECT_ROOT))
+IMG_H = 1024
+IMG_W = 768
 openpose_model_hd = OpenPose(0)
 openpose_model_hd.preprocessor.body_estimation.model.to('cuda')
 params = config.model.params
 model = create_model(config_path=None, config=config)
+model.load_state_dict(torch.load("./checkpoints/eternal_1024.ckpt", map_location="cpu")["state_dict"])
 model = model.cuda()
 model.eval()
 sampler = PLMSSampler(model)
# Second network: same config as `model`, but loaded from the
# VITONHD_1024 checkpoint. It gets its own sampler (`sampler2`).
model2 = create_model(config_path=None, config=config)
model2.load_state_dict(torch.load("./checkpoints/VITONHD_1024.ckpt", map_location="cpu")["state_dict"])
# Move model2 itself to the GPU. The previous `model2 = model.cuda()` rebound
# the name to the first model, silently discarding the weights loaded above,
# so both samplers ended up driving the same network.
model2 = model2.cuda()
model2.eval()
sampler2 = PLMSSampler(model2)
+# #### model init <<<<
+@spaces.GPU
+@torch.autocast("cuda")
+@torch.no_grad()
 def stable_viton_model_hd(
         batch,
         n_steps,
 ):
     z, cond = model.get_input(batch, params.first_stage_key)
+    z = z
     bs = z.shape[0]
     c_crossattn = cond["c_crossattn"][0][:bs]
     if c_crossattn.ndim == 4:
     ts = torch.full((1,), 999, device=z.device, dtype=torch.long)
     start_code = model.q_sample(z, ts)
+    torch.cuda.empty_cache()
     output, _, _ = sampler.sample(
         n_steps,
         bs,
+        (4, IMG_H//8, IMG_W//8),
         cond,
+        x_T=start_code,
         verbose=False,
         eta=0.0,
+        unconditional_conditioning=uc_full,
     )
     output = model.decode_first_stage(output)
     pil_output = Image.fromarray(output)
     return pil_output
# Decorated like its twin `stable_viton_model_hd`, which also carries
# `@spaces.GPU`; without it this path never requests a GPU on a ZeroGPU Space.
# NOTE(review): confirm the Space is still deployed on ZeroGPU hardware.
@spaces.GPU
@torch.autocast("cuda")
@torch.no_grad()
def stable_viton_model_hd2(
        batch,
        n_steps,
):
    """Run try-on sampling with the second model (`model2` / `sampler2`).

    Args:
        batch: dict passed to ``model2.get_input``. Every ``torch.Tensor``
            value is moved to CUDA *in place*, so the caller's dict is
            mutated, and the dict is also attached to ``sampler2.model.batch``.
        n_steps: number of sampling steps forwarded to ``sampler2.sample``.

    Returns:
        A ``PIL.Image`` built from the decoded sample after ``tensor2img``.
    """
    z, cond = model2.get_input(batch, params.first_stage_key)
    bs = z.shape[0]

    # Cross-attention conditioning that is still 4-D is run through the
    # model's conditioning encoder before sampling.
    c_crossattn = cond["c_crossattn"][0][:bs]
    if c_crossattn.ndim == 4:
        c_crossattn = model2.get_learned_conditioning(c_crossattn)
        cond["c_crossattn"] = [c_crossattn]

    # Unconditional branch: same concat / first-stage conditioning, with the
    # cross-attention input replaced by the unconditional embedding.
    uc_cross = model2.get_unconditional_conditioning(bs)
    uc_full = {"c_concat": cond["c_concat"], "c_crossattn": [uc_cross]}
    uc_full["first_stage_cond"] = cond["first_stage_cond"]

    # Only existing keys are reassigned, so mutating while iterating is safe.
    for k, v in batch.items():
        if isinstance(v, torch.Tensor):
            batch[k] = v.cuda()
    sampler2.model.batch = batch

    # Start code: q_sample of the input latent at timestep index 999
    # (presumably the last diffusion step -- confirm against the schedule).
    ts = torch.full((1,), 999, device=z.device, dtype=torch.long)
    start_code = model2.q_sample(z, ts)
    torch.cuda.empty_cache()

    output, _, _ = sampler2.sample(
        n_steps,
        bs,
        (4, IMG_H//8, IMG_W//8),
        cond,
        x_T=start_code,
        verbose=False,
        eta=0.0,
        unconditional_conditioning=uc_full,
    )

    output = model2.decode_first_stage(output)
    output = tensor2img(output)
    pil_output = Image.fromarray(output)
    return pil_output
+# @spaces.GPU  # TODO: turn on when final upload
+@torch.no_grad()
+def process_hd(vton_img, garm_img, n_steps, is_custom):
     model_type = 'hd'
     category = 0  # 0:upperbody; 1:lowerbody; 2:dress
     stt = time.time()
     print('load images... ', end='')
+    # garm_img = Image.open(garm_img).resize((IMG_W, IMG_H))
+    # vton_img = Image.open(vton_img).resize((IMG_W, IMG_H))
+    garm_img = Image.open(garm_img)
+    vton_img = Image.open(vton_img)
+    vton_img = center_crop(vton_img)
+    garm_img = garm_img.resize((IMG_W, IMG_H))
+    vton_img = vton_img.resize((IMG_W, IMG_H))
     print('%.2fs' % (time.time() - stt))
     stt = time.time()
     print('get agnostic map... ', end='')
     keypoints = openpose_model_hd(vton_img.resize((IMG_W, IMG_H)))
     model_parse, _ = parsing_model_hd(vton_img.resize((IMG_W, IMG_H)))
+    mask, mask_gray = get_mask_location(model_type, category_dict_utils[category], model_parse, keypoints, radius=5)
     mask = mask.resize((IMG_W, IMG_H), Image.NEAREST)
     mask_gray = mask_gray.resize((IMG_W, IMG_H), Image.NEAREST)
     masked_vton_img = Image.composite(mask_gray, vton_img, mask)  # agnostic map
     print('%.2fs' % (time.time() - stt))
+    # breakpoint()
     stt = time.time()
     print('get densepose... ', end='')
     vton_img = vton_img.resize((IMG_W, IMG_H))  # size for densepose
     densepose = densepose_model_hd.execute(vton_img)  # densepose
     print('%.2fs' % (time.time() - stt))
     batch = get_batch(
+        vton_img,
+        garm_img,
+        densepose,
+        masked_vton_img,
+        mask,
+        IMG_H,
         IMG_W
     )
+    if is_custom:
+        sample = stable_viton_model_hd(
+            batch,
+            n_steps,
+        )
+    else:
+        sample = stable_viton_model_hd2(
+            batch,
+            n_steps,
+        )
     return sample
+example_path = opj(os.path.dirname(__file__), 'examples_eternal')
 example_model_ps = sorted(glob(opj(example_path, "model/*")))
 example_garment_ps = sorted(glob(opj(example_path, "garment/*")))
         """
         <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
             <div>
+                <h1>Rdy2Wr.AI StableVITON Demo 👕👔👗</h1>
                 <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
                     <a href='https://arxiv.org/abs/2312.01725'>
                         <img src="https://img.shields.io/badge/arXiv-2312.01725-red">
                 examples_per_page=14,
                 examples=example_garment_ps)
         with gr.Column():
+            result_gallery = gr.Image(label='Output', show_label=False, preview=True, scale=1)
+            # result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery", preview=True, scale=1)
     with gr.Column():
         run_button = gr.Button(value="Run")
+        n_steps = gr.Slider(label="Steps", minimum=10, maximum=50, value=20, step=1)
+        is_custom = gr.Checkbox(label="customized model")
         # seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, value=-1)
+    ips = [vton_img, garm_img, n_steps, is_custom]
     run_button.click(fn=process_hd, inputs=ips, outputs=[result_gallery])
 demo.queue().launch()

cldm/cldm.py CHANGED Viewed

@@ -32,6 +32,7 @@ class ControlLDM(LatentDiffusion):
             *args,
             **kwargs
         ):
         self.control_stage_config = control_stage_config
         self.use_pbe_weight = use_pbe_weight
         self.u_cond_percent = u_cond_percent
@@ -62,7 +63,7 @@ class ControlLDM(LatentDiffusion):
                     control = control[:bs]
                 control = control.to(self.device)
                 control = einops.rearrange(control, 'b h w c -> b c h w')
-                control = control.to(memory_format=torch.contiguous_format).float()
                 control_lst.append(control)
             control = control_lst
         else:
@@ -71,7 +72,7 @@ class ControlLDM(LatentDiffusion):
                 control = control[:bs]
             control = control.to(self.device)
             control = einops.rearrange(control, 'b h w c -> b c h w')
-            control = control.to(memory_format=torch.contiguous_format).float()
             control = [control]
         cond_dict = dict(c_crossattn=[c], c_concat=control)
         if self.first_stage_key_cond is not None:

             *args,
             **kwargs
         ):
+        self.device = torch.device("cuda")
         self.control_stage_config = control_stage_config
         self.use_pbe_weight = use_pbe_weight
         self.u_cond_percent = u_cond_percent
                     control = control[:bs]
                 control = control.to(self.device)
                 control = einops.rearrange(control, 'b h w c -> b c h w')
+                control = control.to(memory_format=torch.contiguous_format)
                 control_lst.append(control)
             control = control_lst
         else:
                 control = control[:bs]
             control = control.to(self.device)
             control = einops.rearrange(control, 'b h w c -> b c h w')
+            control = control.to(memory_format=torch.contiguous_format)
             control = [control]
         cond_dict = dict(c_crossattn=[c], c_concat=control)
         if self.first_stage_key_cond is not None:

ldm/models/autoencoder.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import torch
-import pytorch_lightning as pl
 import torch.nn as nn
 import torch.nn.functional as F
 from contextlib import contextmanager
@@ -9,9 +9,9 @@ from ldm.modules.distributions.distributions import DiagonalGaussianDistribution
 from ldm.util import instantiate_from_config
 from ldm.modules.ema import LitEma
-class AutoencoderKL(pl.LightningModule):
     def __init__(self,
                  ddconfig,
                  lossconfig,

 import torch
+# import pytorch_lightning as pl
 import torch.nn as nn
 import torch.nn.functional as F
 from contextlib import contextmanager
 from ldm.util import instantiate_from_config
 from ldm.modules.ema import LitEma
+class AutoencoderKL(nn.Module):
     def __init__(self,
                  ddconfig,
                  lossconfig,

ldm/models/diffusion/ddpm.py CHANGED Viewed

@@ -9,7 +9,7 @@ https://github.com/CompVis/taming-transformers
 import torch
 import torch.nn as nn
 import numpy as np
-import pytorch_lightning as pl
 from torch.optim.lr_scheduler import LambdaLR
 from einops import rearrange, repeat
 from contextlib import contextmanager, nullcontext
@@ -47,7 +47,7 @@ def disabled_train(self, mode=True):
 def uniform_on_device(r1, r2, shape, device):
     return (r1 - r2) * torch.rand(*shape, device=device) + r2
-class DDPM(pl.LightningModule):
     # classic DDPM with Gaussian diffusion, in image space
     def __init__(self,
                  unet_config,

 import torch
 import torch.nn as nn
 import numpy as np
+# import pytorch_lightning as pl
 from torch.optim.lr_scheduler import LambdaLR
 from einops import rearrange, repeat
 from contextlib import contextmanager, nullcontext
 def uniform_on_device(r1, r2, shape, device):
     return (r1 - r2) * torch.rand(*shape, device=device) + r2
+class DDPM(nn.Module):
     # classic DDPM with Gaussian diffusion, in image space
     def __init__(self,
                  unet_config,

utils_stableviton.py CHANGED Viewed

@@ -24,7 +24,6 @@ label_map = {
     "scarf": 17,
 }
 def extend_arm_mask(wrist, elbow, scale):
     wrist = elbow + scale * (wrist - elbow)
     return wrist
@@ -56,7 +55,7 @@ def refine_mask(mask):
     return refine_mask
-def get_mask_location(model_type, category, model_parse: Image.Image, keypoint: dict, width=384, height=512):
     im_parse = model_parse.resize((width, height), Image.NEAREST)
     parse_array = np.array(im_parse)
@@ -149,10 +148,10 @@ def get_mask_location(model_type, category, model_parse: Image.Image, keypoint:
         parser_mask_fixed += hands_left + hands_right
     parser_mask_fixed = np.logical_or(parser_mask_fixed, parse_head)
-    parse_mask = cv2.dilate(parse_mask, np.ones((5, 5), np.uint16), iterations=5)
     if category == 'dresses' or category == 'upper_body':
         neck_mask = (parse_array == 18).astype(np.float32)
-        neck_mask = cv2.dilate(neck_mask, np.ones((5, 5), np.uint16), iterations=1)
         neck_mask = np.logical_and(neck_mask, np.logical_not(parse_head))
         parse_mask = np.logical_or(parse_mask, neck_mask)
         arm_mask = cv2.dilate(np.logical_or(im_arms_left, im_arms_right).astype('float32'), np.ones((5, 5), np.uint16), iterations=4)
@@ -204,3 +203,14 @@ def tensor2img(x):
         x = np.concatenate([x,x,x], axis=-1)
     return x

     "scarf": 17,
 }
 def extend_arm_mask(wrist, elbow, scale):
     wrist = elbow + scale * (wrist - elbow)
     return wrist
     return refine_mask
+def get_mask_location(model_type, category, model_parse: Image.Image, keypoint: dict, width=384, height=512, radius=5):
     im_parse = model_parse.resize((width, height), Image.NEAREST)
     parse_array = np.array(im_parse)
         parser_mask_fixed += hands_left + hands_right
     parser_mask_fixed = np.logical_or(parser_mask_fixed, parse_head)
+    parse_mask = cv2.dilate(parse_mask, np.ones((radius, radius), np.uint16), iterations=5)
     if category == 'dresses' or category == 'upper_body':
         neck_mask = (parse_array == 18).astype(np.float32)
+        neck_mask = cv2.dilate(neck_mask, np.ones((radius, radius), np.uint16), iterations=1)
         neck_mask = np.logical_and(neck_mask, np.logical_not(parse_head))
         parse_mask = np.logical_or(parse_mask, neck_mask)
         arm_mask = cv2.dilate(np.logical_or(im_arms_left, im_arms_right).astype('float32'), np.ones((5, 5), np.uint16), iterations=4)
         x = np.concatenate([x,x,x], axis=-1)
     return x
def center_crop(image, aspect_ratio=3 / 4):
    """Crop *image* to a full-height, horizontally centred window.

    The window keeps the whole image height and is
    ``height * aspect_ratio`` wide, centred on the image's vertical axis.
    The default ratio of 3:4 (width:height) reproduces the previous
    hard-coded behaviour.

    Args:
        image: object exposing ``size`` as ``(width, height)`` and a
            ``crop(box)`` method taking ``(left, top, right, bottom)``.
        aspect_ratio: target width / height of the window.

    Returns:
        Whatever ``image.crop`` returns for the computed box.

    Note:
        The box coordinates may be fractional, and when the image is
        narrower than the target window the box extends past the left and
        right edges. How that is handled is left to ``image.crop``.
        NOTE(review): Pillow is understood to round the coordinates and fill
        the out-of-bounds area with black -- confirm.
    """
    width, height = image.size
    new_width = height * aspect_ratio
    left = (width - new_width) / 2
    right = (width + new_width) / 2
    # The window spans the full height, so top/bottom are the image bounds.
    return image.crop((left, 0, right, height))