lev1 committed
Commit 62cb566
1 Parent(s): cdcc7cc

Depth guided generation

Files changed (5):
  1. app.py +3 -0
  2. app_canny.py +7 -7
  3. app_depth.py +77 -0
  4. model.py +71 -0
  5. utils.py +29 -9
app.py CHANGED
@@ -7,6 +7,7 @@ from app_pose import create_demo as create_demo_pose
 from app_text_to_video import create_demo as create_demo_text_to_video
 from app_pix2pix_video import create_demo as create_demo_pix2pix_video
 from app_canny_db import create_demo as create_demo_canny_db
+from app_depth import create_demo as create_demo_depth
 import argparse
 import os
 
@@ -62,6 +63,8 @@ with gr.Blocks(css='style.css') as demo:
         create_demo_canny(model)
     with gr.Tab('Edge Conditional and Dreambooth Specialized'):
         create_demo_canny_db(model)
+    with gr.Tab('Depth Conditional'):
+        create_demo_depth(model)
     '''
     '''
     gr.HTML(
app_canny.py CHANGED
@@ -7,19 +7,19 @@ on_huggingspace = os.environ.get("SPACE_AUTHOR_NAME") == "PAIR"
 def create_demo(model: Model):
 
     examples = [
-        ["__assets__/canny_videos_edge_2fps/butterfly.mp4",
+        ["__assets__/canny_videos_edge/butterfly.mp4",
            "white butterfly, a high-quality, detailed, and professional photo"],
-        ["__assets__/canny_videos_edge_2fps/deer.mp4",
+        ["__assets__/canny_videos_edge/deer.mp4",
            "oil painting of a deer, a high-quality, detailed, and professional photo"],
-        ["__assets__/canny_videos_edge_2fps/fox.mp4",
+        ["__assets__/canny_videos_edge/fox.mp4",
            "wild red fox is walking on the grass, a high-quality, detailed, and professional photo"],
-        ["__assets__/canny_videos_edge_2fps/girl_dancing.mp4",
+        ["__assets__/canny_videos_edge/girl_dancing.mp4",
            "oil painting of a girl dancing close-up, masterpiece, a high-quality, detailed, and professional photo"],
-        ["__assets__/canny_videos_edge_2fps/girl_turning.mp4",
+        ["__assets__/canny_videos_edge/girl_turning.mp4",
            "oil painting of a beautiful girl, a high-quality, detailed, and professional photo"],
-        ["__assets__/canny_videos_edge_2fps/halloween.mp4",
+        ["__assets__/canny_videos_edge/halloween.mp4",
            "beautiful girl halloween style, a high-quality, detailed, and professional photo"],
-        ["__assets__/canny_videos_edge_2fps/santa.mp4",
+        ["__assets__/canny_videos_edge/santa.mp4",
            "a santa claus, a high-quality, detailed, and professional photo"],
     ]
 
app_depth.py ADDED
@@ -0,0 +1,77 @@
+import gradio as gr
+from model import Model
+import os
+on_huggingspace = os.environ.get("SPACE_AUTHOR_NAME") == "PAIR"
+
+
+def create_demo(model: Model):
+
+    examples = [
+        ["__assets__/depth_videos/butterfly.mp4",
+            "white butterfly, a high-quality, detailed, and professional photo"],
+        ["__assets__/depth_videos/deer.mp4",
+            "oil painting of a deer, a high-quality, detailed, and professional photo"],
+        ["__assets__/depth_videos/fox.mp4",
+            "wild red fox is walking on the grass, a high-quality, detailed, and professional photo"],
+        ["__assets__/depth_videos/girl_dancing.mp4",
+            "oil painting of a girl dancing close-up, masterpiece, a high-quality, detailed, and professional photo"],
+        ["__assets__/depth_videos/girl_turning.mp4",
+            "oil painting of a beautiful girl, a high-quality, detailed, and professional photo"],
+        ["__assets__/depth_videos/halloween.mp4",
+            "beautiful girl halloween style, a high-quality, detailed, and professional photo"],
+        ["__assets__/depth_videos/santa.mp4",
+            "a santa claus, a high-quality, detailed, and professional photo"],
+    ]
+
+    with gr.Blocks() as demo:
+        with gr.Row():
+            gr.Markdown('## Text and Depth Conditional Video Generation')
+        with gr.Row():
+            gr.HTML(
+                """
+                <div style="text-align: left; auto;">
+                <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
+                Description: For performance purposes, our current preview release supports any input videos but caps output videos after 80 frames and the input videos are scaled down before processing.
+                </h3>
+                </div>
+                """)
+
+        with gr.Row():
+            with gr.Column():
+                input_video = gr.Video(
+                    label="Input Video", source='upload', format="mp4", visible=True).style(height="auto")
+            with gr.Column():
+                prompt = gr.Textbox(label='Prompt')
+                run_button = gr.Button(label='Run')
+                with gr.Accordion('Advanced options', open=False):
+                    watermark = gr.Radio(["Picsart AI Research", "Text2Video-Zero",
+                                          "None"], label="Watermark", value='Picsart AI Research')
+                    chunk_size = gr.Slider(
+                        label="Chunk size", minimum=2, maximum=16, value=8, step=1, visible=not on_huggingspace,
+                        info="Number of frames processed at once. Reduce for lower memory usage.")
+                    merging_ratio = gr.Slider(
+                        label="Merging ratio", minimum=0.0, maximum=0.9, step=0.1, value=0.0, visible=not on_huggingspace,
+                        info="Ratio of how many tokens are merged. The higher the more compression (less memory and faster inference).")
+            with gr.Column():
+                result = gr.Video(label="Generated Video").style(height="auto")
+
+        inputs = [
+            input_video,
+            prompt,
+            chunk_size,
+            watermark,
+            merging_ratio,
+        ]
+
+        gr.Examples(examples=examples,
+                    inputs=inputs,
+                    outputs=result,
+                    fn=model.process_controlnet_depth,
+                    cache_examples=on_huggingspace,
+                    run_on_click=False,
+                    )
+
+        run_button.click(fn=model.process_controlnet_depth,
+                         inputs=inputs,
+                         outputs=result,)
+    return demo
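
For reference, the new tab can also be exercised on its own, outside the multi-tab app.py. A minimal sketch, assuming the Model constructor from this repository takes a device and dtype (that signature and the CUDA/float16 choices are assumptions, not shown in this commit):

    # Standalone launcher for the depth demo (sketch; constructor arguments
    # below are assumptions, not taken from this commit).
    import torch

    from model import Model
    from app_depth import create_demo

    model = Model(device='cuda', dtype=torch.float16)  # assumed signature
    demo = create_demo(model)   # builds the Gradio Blocks defined above
    demo.queue().launch()       # serves only the depth-conditional tab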
model.py CHANGED
@@ -13,6 +13,8 @@ import gradio_utils
 import os
 on_huggingspace = os.environ.get("SPACE_AUTHOR_NAME") == "PAIR"
 
+from einops import rearrange
+
 
 class ModelType(Enum):
     Pix2Pix_Video = 1,
@@ -20,6 +22,7 @@ class ModelType(Enum):
     ControlNetCanny = 3,
     ControlNetCannyDB = 4,
     ControlNetPose = 5,
+    ControlNetDepth = 6,
 
 
 class Model:
@@ -33,6 +36,7 @@ class Model:
             ModelType.ControlNetCanny: StableDiffusionControlNetPipeline,
             ModelType.ControlNetCannyDB: StableDiffusionControlNetPipeline,
             ModelType.ControlNetPose: StableDiffusionControlNetPipeline,
+            ModelType.ControlNetDepth: StableDiffusionControlNetPipeline,
         }
         self.controlnet_attn_proc = utils.CrossFrameAttnProcessor(
             unet_chunk_size=2)
@@ -165,6 +169,73 @@ class Model:
             video_path, resolution, self.device, self.dtype, False)
         control = utils.pre_process_canny(
             video, low_threshold, high_threshold).to(self.device).to(self.dtype)
+
+        # canny_to_save = list(rearrange(control, 'f c w h -> f w h c').cpu().detach().numpy())
+        # _ = utils.create_video(canny_to_save, 4, path="ddxk.mp4", watermark=None)
+
+        f, _, h, w = video.shape
+        self.generator.manual_seed(seed)
+        latents = torch.randn((1, 4, h//8, w//8), dtype=self.dtype,
+                              device=self.device, generator=self.generator)
+        latents = latents.repeat(f, 1, 1, 1)
+        result = self.inference(image=control,
+                                prompt=prompt + ', ' + added_prompt,
+                                height=h,
+                                width=w,
+                                negative_prompt=negative_prompts,
+                                num_inference_steps=num_inference_steps,
+                                guidance_scale=guidance_scale,
+                                controlnet_conditioning_scale=controlnet_conditioning_scale,
+                                eta=eta,
+                                latents=latents,
+                                seed=seed,
+                                output_type='numpy',
+                                split_to_chunks=True,
+                                chunk_size=chunk_size,
+                                merging_ratio=merging_ratio,
+                                )
+        return utils.create_video(result, fps, path=save_path, watermark=gradio_utils.logo_name_to_path(watermark))
+
+    def process_controlnet_depth(self,
+                                 video_path,
+                                 prompt,
+                                 chunk_size=8,
+                                 watermark='Picsart AI Research',
+                                 merging_ratio=0.0,
+                                 num_inference_steps=20,
+                                 controlnet_conditioning_scale=1.0,
+                                 guidance_scale=9.0,
+                                 seed=42,
+                                 eta=0.0,
+                                 resolution=512,
+                                 use_cf_attn=True,
+                                 save_path=None):
+        print("Module Depth")
+        video_path = gradio_utils.edge_path_to_video_path(video_path)
+        if self.model_type != ModelType.ControlNetDepth:
+            controlnet = ControlNetModel.from_pretrained(
+                "lllyasviel/sd-controlnet-depth")
+            self.set_model(ModelType.ControlNetDepth,
+                           model_id="runwayml/stable-diffusion-v1-5", controlnet=controlnet)
+            self.pipe.scheduler = DDIMScheduler.from_config(
+                self.pipe.scheduler.config)
+            if use_cf_attn:
+                self.pipe.unet.set_attn_processor(
+                    processor=self.controlnet_attn_proc)
+                self.pipe.controlnet.set_attn_processor(
+                    processor=self.controlnet_attn_proc)
+
+        # added_prompt = 'best quality, extremely detailed'
+        # negative_prompts = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
+
+        video, fps = utils.prepare_video(
+            video_path, resolution, self.device, self.dtype, False)
+        control = utils.pre_process_depth(
+            video).to(self.device).to(self.dtype)
+
+        depth_map_to_save = list(rearrange(control, 'f c w h -> f w h c').cpu().detach().numpy())
+        _ = utils.create_video(depth_map_to_save, 4, path="ddxk.mp4", watermark=None)
+
         f, _, h, w = video.shape
         self.generator.manual_seed(seed)
         latents = torch.randn((1, 4, h//8, w//8), dtype=self.dtype,
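
The diff only shows the call into self.set_model; for context, a minimal sketch of the depth-conditioned diffusers pipeline that call is expected to assemble (the torch_dtype and device placement below are assumptions, not part of the commit):

    # Sketch of the ControlNet-depth pipeline behind ModelType.ControlNetDepth
    # (dtype/device choices are assumptions; the commit does not set them here).
    import torch
    from diffusers import ControlNetModel, StableDiffusionControlNetPipeline, DDIMScheduler

    controlnet = ControlNetModel.from_pretrained(
        "lllyasviel/sd-controlnet-depth", torch_dtype=torch.float16)
    pipe = StableDiffusionControlNetPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16)
    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)  # same scheduler swap as the commit
    pipe = pipe.to("cuda")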
utils.py CHANGED
@@ -12,11 +12,12 @@ from PIL import Image
 from annotator.util import resize_image, HWC3
 from annotator.canny import CannyDetector
 from annotator.openpose import OpenposeDetector
+from annotator.midas import MidasDetector
 import decord
-# decord.bridge.set_bridge('torch')
 
 apply_canny = CannyDetector()
 apply_openpose = OpenposeDetector()
+apply_midas = MidasDetector()
 
 
 def add_watermark(image, watermark_path, wm_rel_size=1/16, boundary=5):
@@ -55,6 +56,24 @@ def pre_process_canny(input_video, low_threshold=100, high_threshold=200):
     return rearrange(control, 'f h w c -> f c h w')
 
 
+def pre_process_depth(input_video, apply_depth_detect: bool = True):
+    detected_maps = []
+    for frame in input_video:
+        img = rearrange(frame, 'c h w -> h w c').cpu().numpy().astype(np.uint8)
+        img = HWC3(img)
+        if apply_depth_detect:
+            detected_map, _ = apply_midas(img)
+        else:
+            detected_map = img
+        detected_map = HWC3(detected_map)
+        H, W, C = img.shape
+        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
+        detected_maps.append(detected_map[None])
+    detected_maps = np.concatenate(detected_maps)
+    control = torch.from_numpy(detected_maps.copy()).float() / 255.0
+    return rearrange(control, 'f h w c -> f c h w')
+
+
 def pre_process_pose(input_video, apply_pose_detect: bool = True):
     detected_maps = []
     for frame in input_video:
@@ -137,14 +156,15 @@ def prepare_video(video_path:str, resolution:int, device, dtype, normalize=True,
     _, h, w, _ = video.shape
     video = rearrange(video, "f h w c -> f c h w")
     video = torch.Tensor(video).to(device).to(dtype)
-    if h > w:
-        w = int(w * resolution / h)
-        w = w - w % 8
-        h = resolution - resolution % 8
-    else:
-        h = int(h * resolution / w)
-        h = h - h % 8
-        w = resolution - resolution % 8
+
+    # Use max if you want the larger side to be equal to resolution (e.g. 512)
+    # k = float(resolution) / min(h, w)
+    k = float(resolution) / max(h, w)
+    h *= k
+    w *= k
+    h = int(np.round(h / 64.0)) * 64
+    w = int(np.round(w / 64.0)) * 64
+
     video = Resize((h, w), interpolation=InterpolationMode.BILINEAR, antialias=True)(video)
    if normalize:
         video = video / 127.5 - 1.0
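
The new sizing rule in prepare_video scales the longer side to resolution and snaps both sides to multiples of 64 (the old code used multiples of 8). A small worked example with an assumed 640x480 input (numbers are illustrative only, not from the commit):

    import numpy as np

    # Assumed input: a 640x480 (w x h) clip with resolution=512.
    resolution, h, w = 512, 480, 640
    k = float(resolution) / max(h, w)        # 512 / 640 = 0.8
    h = int(np.round(h * k / 64.0)) * 64     # 384, already a multiple of 64
    w = int(np.round(w * k / 64.0)) * 64     # 512, already a multiple of 64
    print(h, w)                              # 384 512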