Commit f9cae6d · myniu committed
Parent(s): 6d12ad6
init
app.py
CHANGED
@@ -149,93 +149,31 @@ class Drag:
 
         self.height = height
         self.width = width
-        self.pipeline = None
-        self.cmp = None
-
-    @spaces.GPU(duration=100)
-    def init_models(self, pretrained_model_name_or_path, resume_from_checkpoint, weight_dtype, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
 
-
-        # Load scheduler, tokenizer and models.
-        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-            pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
-        )
-        vae = AutoencoderKLTemporalDecoder.from_pretrained(
-            pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
-        unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
-            pretrained_model_name_or_path,
-            subfolder="unet",
-            low_cpu_mem_usage=True,
-            variant="fp16",
-        )
+    def get_cmp_flow(self, frames, sparse_optical_flow, mask, brush_mask=None):
+
+        '''
+        frames: [b, 13, 3, 384, 384] (0, 1) tensor
+        sparse_optical_flow: [b, 13, 2, 384, 384] (-384, 384) tensor
+        mask: [b, 13, 2, 384, 384] {0, 1} tensor
+        '''
 
-        controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
 
         cmp = CMP_demo(
             './models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
             42000
-        ).to(device)
+        ).to('cuda')
         cmp.requires_grad_(False)
 
-        self.cmp = cmp
-
-        # Freeze vae and image_encoder
-        vae.requires_grad_(False)
-        image_encoder.requires_grad_(False)
-        unet.requires_grad_(False)
-        controlnet.requires_grad_(False)
-
-        # Move image_encoder and vae to gpu and cast to weight_dtype
-        image_encoder.to(device, dtype=weight_dtype)
-        vae.to(device, dtype=weight_dtype)
-        unet.to(device, dtype=weight_dtype)
-        controlnet.to(device, dtype=weight_dtype)
-
-        if enable_xformers_memory_efficient_attention:
-            if is_xformers_available():
-                import xformers
-
-                xformers_version = version.parse(xformers.__version__)
-                if xformers_version == version.parse("0.0.16"):
-                    print(
-                        "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
-                    )
-                unet.enable_xformers_memory_efficient_attention()
-            else:
-                raise ValueError(
-                    "xformers is not available. Make sure it is installed correctly")
-
-        if allow_tf32:
-            torch.backends.cuda.matmul.allow_tf32 = True
-
-        pipeline = FlowControlNetPipeline.from_pretrained(
-            pretrained_model_name_or_path,
-            unet=unet,
-            controlnet=controlnet,
-            image_encoder=image_encoder,
-            vae=vae,
-            torch_dtype=weight_dtype,
-        )
-        pipeline = pipeline.to(device)
-
-        self.pipeline = pipeline
 
-        print('models loaded.')
 
-    def get_cmp_flow(self, frames, sparse_optical_flow, mask, brush_mask=None):
-
-        '''
-        frames: [b, 13, 3, 384, 384] (0, 1) tensor
-        sparse_optical_flow: [b, 13, 2, 384, 384] (-384, 384) tensor
-        mask: [b, 13, 2, 384, 384] {0, 1} tensor
-        '''
 
         b, t, c, h, w = frames.shape
         assert h == 384 and w == 384
         frames = frames.flatten(0, 1)  # [b*13, 3, 256, 256]
         sparse_optical_flow = sparse_optical_flow.flatten(0, 1)  # [b*13, 2, 256, 256]
         mask = mask.flatten(0, 1)  # [b*13, 2, 256, 256]
-        cmp_flow = self.cmp.run(frames, sparse_optical_flow, mask)  # [b*13, 2, 256, 256]
+        cmp_flow = cmp.run(frames, sparse_optical_flow, mask)  # [b*13, 2, 256, 256]
 
         if brush_mask is not None:
             brush_mask = torch.from_numpy(brush_mask) / 255.
@@ -268,6 +206,54 @@ class Drag:
 
     @torch.no_grad()
     def forward_sample(self, input_drag_384_inmask, input_drag_384_outmask, input_first_frame, input_mask_384_inmask, input_mask_384_outmask, in_mask_flag, out_mask_flag, motion_brush_mask=None, ctrl_scale=1., outputs=dict()):
+
+
+
+
+
+
+
+        pretrained_model_name_or_path="ckpts/stable-video-diffusion-img2vid-xt-1-1"
+        resume_from_checkpoint="ckpts/controlnet"
+        # Load scheduler, tokenizer and models.
+        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+            pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
+        )
+        vae = AutoencoderKLTemporalDecoder.from_pretrained(
+            pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
+        unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
+            pretrained_model_name_or_path,
+            subfolder="unet",
+            low_cpu_mem_usage=True,
+            variant="fp16",
+        )
+        controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
+        # Freeze vae and image_encoder
+        vae.requires_grad_(False)
+        image_encoder.requires_grad_(False)
+        unet.requires_grad_(False)
+        controlnet.requires_grad_(False)
+        # Move image_encoder and vae to gpu and cast to weight_dtype
+        image_encoder.to('cuda', dtype=torch.float16)
+        vae.to('cuda', dtype=torch.float16)
+        unet.to('cuda', dtype=torch.float16)
+        controlnet.to('cuda', dtype=torch.float16)
+        # init pipeline
+        pipeline = FlowControlNetPipeline.from_pretrained(
+            pretrained_model_name_or_path,
+            unet=unet,
+            controlnet=controlnet,
+            image_encoder=image_encoder,
+            vae=vae,
+            torch_dtype=torch.float16,
+        )
+        pipeline = pipeline.to('cuda')
+
+
+
+
+
+
         '''
         input_drag: [1, 13, 320, 576, 2]
         input_drag_384: [1, 13, 384, 384, 2]
@@ -321,7 +307,7 @@ class Drag:
 
         controlnet_flow = torch.where(inmask_no_zero, flow_inmask, flow_outmask)
 
-        val_output = self.pipeline(
+        val_output = pipeline(
             input_first_frame_pil,
             input_first_frame_pil,
             controlnet_flow,
@@ -358,7 +344,7 @@ class Drag:
 
         return outputs
 
-    @spaces.GPU
+    @spaces.GPU(duration=100)
     @torch.no_grad()
     def get_cmp_flow_from_tracking_points(self, tracking_points, motion_brush_mask, first_frame_path):
 
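Note on the @spaces.GPU(duration=100) decorator that appears in the last hunk: it comes from Hugging Face's spaces package, which ZeroGPU Spaces use to attach a GPU only while the decorated call is running. Below is a minimal, illustrative sketch of that pattern; the function and variable names are examples and are not taken from app.py.

# Minimal ZeroGPU sketch (illustrative only; names are not from this repo).
# On a ZeroGPU Space, CUDA is only attached while a @spaces.GPU-decorated call runs,
# so tensors are moved to 'cuda' inside the decorated function and moved back before returning.
import spaces
import torch

@spaces.GPU(duration=100)  # request a GPU for up to ~100 seconds per call
def run_on_gpu(x: torch.Tensor) -> torch.Tensor:
    x = x.to('cuda')       # safe here: the GPU is available inside the decorated call
    return (x * 2).cpu()   # hand CPU tensors back to the caller

Outside a ZeroGPU Space the decorator is a pass-through, so the same function also runs on ordinary GPU or CPU hardware.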