myniu committed on
Commit f9cae6d
1 Parent(s): 6d12ad6
Files changed (1)
  1. app.py +59 -73
app.py CHANGED
@@ -149,93 +149,31 @@ class Drag:
 
         self.height = height
         self.width = width
-        self.pipeline = None
-        self.cmp = None
-
-    @spaces.GPU(duration=100)
-    def init_models(self, pretrained_model_name_or_path, resume_from_checkpoint, weight_dtype, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
 
-        print('start loading models...')
-        # Load scheduler, tokenizer and models.
-        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-            pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
-        )
-        vae = AutoencoderKLTemporalDecoder.from_pretrained(
-            pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
-        unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
-            pretrained_model_name_or_path,
-            subfolder="unet",
-            low_cpu_mem_usage=True,
-            variant="fp16",
-        )
+    def get_cmp_flow(self, frames, sparse_optical_flow, mask, brush_mask=None):
+
+        '''
+        frames: [b, 13, 3, 384, 384] (0, 1) tensor
+        sparse_optical_flow: [b, 13, 2, 384, 384] (-384, 384) tensor
+        mask: [b, 13, 2, 384, 384] {0, 1} tensor
+        '''
 
-        controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
 
         cmp = CMP_demo(
             './models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
             42000
-        ).to(device)
+        ).to('cuda')
         cmp.requires_grad_(False)
 
-        self.cmp = cmp
-
-        # Freeze vae and image_encoder
-        vae.requires_grad_(False)
-        image_encoder.requires_grad_(False)
-        unet.requires_grad_(False)
-        controlnet.requires_grad_(False)
-
-        # Move image_encoder and vae to gpu and cast to weight_dtype
-        image_encoder.to(device, dtype=weight_dtype)
-        vae.to(device, dtype=weight_dtype)
-        unet.to(device, dtype=weight_dtype)
-        controlnet.to(device, dtype=weight_dtype)
-
-        if enable_xformers_memory_efficient_attention:
-            if is_xformers_available():
-                import xformers
-
-                xformers_version = version.parse(xformers.__version__)
-                if xformers_version == version.parse("0.0.16"):
-                    print(
-                        "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
-                    )
-                unet.enable_xformers_memory_efficient_attention()
-            else:
-                raise ValueError(
-                    "xformers is not available. Make sure it is installed correctly")
-
-        if allow_tf32:
-            torch.backends.cuda.matmul.allow_tf32 = True
-
-        pipeline = FlowControlNetPipeline.from_pretrained(
-            pretrained_model_name_or_path,
-            unet=unet,
-            controlnet=controlnet,
-            image_encoder=image_encoder,
-            vae=vae,
-            torch_dtype=weight_dtype,
-        )
-        pipeline = pipeline.to(device)
-
-        self.pipeline = pipeline
 
-        print('models loaded.')
 
-    def get_cmp_flow(self, frames, sparse_optical_flow, mask, brush_mask=None):
-
-        '''
-        frames: [b, 13, 3, 384, 384] (0, 1) tensor
-        sparse_optical_flow: [b, 13, 2, 384, 384] (-384, 384) tensor
-        mask: [b, 13, 2, 384, 384] {0, 1} tensor
-        '''
 
         b, t, c, h, w = frames.shape
         assert h == 384 and w == 384
         frames = frames.flatten(0, 1)  # [b*13, 3, 256, 256]
         sparse_optical_flow = sparse_optical_flow.flatten(0, 1)  # [b*13, 2, 256, 256]
         mask = mask.flatten(0, 1)  # [b*13, 2, 256, 256]
-        cmp_flow = self.cmp.run(frames, sparse_optical_flow, mask)  # [b*13, 2, 256, 256]
+        cmp_flow = cmp.run(frames, sparse_optical_flow, mask)  # [b*13, 2, 256, 256]
 
         if brush_mask is not None:
             brush_mask = torch.from_numpy(brush_mask) / 255.
@@ -268,6 +206,54 @@ class Drag:
 
     @torch.no_grad()
     def forward_sample(self, input_drag_384_inmask, input_drag_384_outmask, input_first_frame, input_mask_384_inmask, input_mask_384_outmask, in_mask_flag, out_mask_flag, motion_brush_mask=None, ctrl_scale=1., outputs=dict()):
+
+
+
+
+
+
+
+        pretrained_model_name_or_path="ckpts/stable-video-diffusion-img2vid-xt-1-1"
+        resume_from_checkpoint="ckpts/controlnet"
+        # Load scheduler, tokenizer and models.
+        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+            pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
+        )
+        vae = AutoencoderKLTemporalDecoder.from_pretrained(
+            pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
+        unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
+            pretrained_model_name_or_path,
+            subfolder="unet",
+            low_cpu_mem_usage=True,
+            variant="fp16",
+        )
+        controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
+        # Freeze vae and image_encoder
+        vae.requires_grad_(False)
+        image_encoder.requires_grad_(False)
+        unet.requires_grad_(False)
+        controlnet.requires_grad_(False)
+        # Move image_encoder and vae to gpu and cast to weight_dtype
+        image_encoder.to('cuda', dtype=torch.float16)
+        vae.to('cuda', dtype=torch.float16)
+        unet.to('cuda', dtype=torch.float16)
+        controlnet.to('cuda', dtype=torch.float16)
+        # init pipeline
+        pipeline = FlowControlNetPipeline.from_pretrained(
+            pretrained_model_name_or_path,
+            unet=unet,
+            controlnet=controlnet,
+            image_encoder=image_encoder,
+            vae=vae,
+            torch_dtype=torch.float16,
+        )
+        pipeline = pipeline.to('cuda')
+
+
+
+
+
+
         '''
         input_drag: [1, 13, 320, 576, 2]
         input_drag_384: [1, 13, 384, 384, 2]
@@ -321,7 +307,7 @@ class Drag:
 
         controlnet_flow = torch.where(inmask_no_zero, flow_inmask, flow_outmask)
 
-        val_output = self.pipeline(
+        val_output = pipeline(
             input_first_frame_pil,
             input_first_frame_pil,
             controlnet_flow,
@@ -358,7 +344,7 @@ class Drag:
 
         return outputs
 
-    @spaces.GPU
+    @spaces.GPU(duration=100)
     @torch.no_grad()
     def get_cmp_flow_from_tracking_points(self, tracking_points, motion_brush_mask, first_frame_path):
 
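The hunks above drop the eager init_models path: the CMP model is now built inside get_cmp_flow, the SVD/ControlNet pipeline is built inside forward_sample, and @spaces.GPU(duration=100) moves onto get_cmp_flow_from_tracking_points. Below is a minimal, hypothetical sketch of that lazy-loading ZeroGPU pattern, not code from app.py; run_inference and _load_model are illustrative names, and it assumes the Hugging Face spaces package on a ZeroGPU Space with a CUDA device.

import spaces
import torch

def _load_model():
    # Hypothetical stand-in for the SVD + ControlNet pipeline that app.py builds inside forward_sample.
    return torch.nn.Linear(4, 4).to('cuda', dtype=torch.float16)

@spaces.GPU(duration=100)   # a GPU is attached to the process only while this call runs
@torch.no_grad()
def run_inference(x: torch.Tensor) -> torch.Tensor:
    model = _load_model()   # heavy objects are created inside the GPU-scoped call, not at import time
    return model(x.to('cuda', dtype=torch.float16))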
 
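With this layout the Space requests a GPU only for the duration of each decorated call, which is how ZeroGPU Spaces allocate hardware; the trade-off, visible in the second hunk, is that the image encoder, VAE, UNet, ControlNet and pipeline are re-created on every forward_sample call instead of once at start-up.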