harlanhong committed on
Commit e418082
1 Parent(s): c45e94d
Files changed (7)
  1. .gitignore +1 -0
  2. app.py +13 -106
  3. demo_dagan.py +92 -82
  4. depth.pth +0 -3
  5. encoder.pth +0 -3
  6. generator.pt +0 -3
  7. kp_detector.pt +0 -3
.gitignore ADDED
@@ -0,0 +1 @@
+*.pyc
app.py CHANGED
@@ -3,21 +3,14 @@ import shutil
 import gradio as gr
 from PIL import Image
 import subprocess
+
 #os.chdir('Restormer')
-from demo_dagan import *
+
 # Download sample images
-import torch
-import torch.nn.functional as F
-import os
-from skimage import img_as_ubyte
-import imageio
-from skimage.transform import resize
-import numpy as np
-import modules.generator as G
-import modules.keypoint_detector as KPD
-import yaml
-from collections import OrderedDict
-import depth
+os.system("wget https://github.com/swz30/Restormer/releases/download/v1.0/sample_images.zip")
+shutil.unpack_archive('sample_images.zip')
+os.remove('sample_images.zip')
+
 
 examples = [['project/cartoon2.jpg','project/video1.mp4'],
 ['project/cartoon3.jpg','project/video2.mp4'],
@@ -25,9 +18,6 @@ examples = [['project/cartoon2.jpg','project/video1.mp4'],
 ['project/celeb2.jpg','project/video2.mp4'],
 ]
 
-
-inference_on = ['Full Resolution Image', 'Downsampled Image']
-
 title = "DaGAN"
 description = """
 Gradio demo for <b>Depth-Aware Generative Adversarial Network for Talking Head Video Generation</b>, CVPR 2022L. <a href='https://arxiv.org/abs/2203.06605'>[Paper]</a><a href='https://github.com/harlanhong/CVPR2022-DaGAN'>[Github Code]</a>\n
@@ -38,99 +28,16 @@ Gradio demo for <b>Depth-Aware Generative Adversarial Network for Talking Head V
 article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2203.06605'>Depth-Aware Generative Adversarial Network for Talking Head Video Generation</a> | <a href='https://github.com/harlanhong/CVPR2022-DaGAN'>Github Repo</a></p>"
 
 
-def inference(source_image, video):
+def inference(img, video):
     if not os.path.exists('temp'):
-        os.system('mkdir temp')
-    cmd = f"ffmpeg -y -ss 00:00:00 -i {video} -to 00:00:08 -c copy video_input.mp4"
+        os.system('mkdir temp')
+    #### Resize the longer edge of the input image
+    cmd = f"ffmpeg -y -ss 00:00:00 -i {video} -to 00:00:08 -c copy temp/driving_video.mp4"
     subprocess.run(cmd.split())
    driving_video = "video_input.mp4"
-    output = "rst.mp4"
-    with open("config/vox-adv-256.yaml") as f:
-        config = yaml.load(f)
-    generator = G.SPADEDepthAwareGenerator(**config['model_params']['generator_params'],**config['model_params']['common_params'])
-    config['model_params']['common_params']['num_channels'] = 4
-    kp_detector = KPD.KPDetector(**config['model_params']['kp_detector_params'],**config['model_params']['common_params'])
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
-
-    g_checkpoint = torch.load("generator.pt", map_location=device)
-    kp_checkpoint = torch.load("kp_detector.pt", map_location=device)
-
-    ckp_generator = OrderedDict((k.replace('module.',''),v) for k,v in g_checkpoint.items())
-    generator.load_state_dict(ckp_generator)
-    ckp_kp_detector = OrderedDict((k.replace('module.',''),v) for k,v in kp_checkpoint.items())
-    kp_detector.load_state_dict(ckp_kp_detector)
-
-    depth_encoder = depth.ResnetEncoder(18, False)
-    depth_decoder = depth.DepthDecoder(num_ch_enc=depth_encoder.num_ch_enc, scales=range(4))
-    loaded_dict_enc = torch.load('encoder.pth')
-    loaded_dict_dec = torch.load('depth.pth')
-    filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in depth_encoder.state_dict()}
-    depth_encoder.load_state_dict(filtered_dict_enc)
-    ckp_depth_decoder= {k: v for k, v in loaded_dict_dec.items() if k in depth_decoder.state_dict()}
-    depth_decoder.load_state_dict(ckp_depth_decoder)
-    depth_encoder.eval()
-    depth_decoder.eval()
-
-    # device = torch.device('cpu')
-    # stx()
-
-    generator = generator.to(device)
-    kp_detector = kp_detector.to(device)
-    depth_encoder = depth_encoder.to(device)
-    depth_decoder = depth_decoder.to(device)
-
-    generator.eval()
-    kp_detector.eval()
-    depth_encoder.eval()
-    depth_decoder.eval()
-
-    img_multiple_of = 8
-
-    with torch.inference_mode():
-        if torch.cuda.is_available():
-            torch.cuda.ipc_collect()
-            torch.cuda.empty_cache()
-        source_image = imageio.imread(source_image)
-        reader = imageio.get_reader(driving_video)
-        fps = reader.get_meta_data()['fps']
-        driving_video = []
-        try:
-            for im in reader:
-                driving_video.append(im)
-        except RuntimeError:
-            pass
-        reader.close()
-
-        source_image = resize(source_image, (256, 256))[..., :3]
-        driving_video = [resize(frame, (256, 256))[..., :3] for frame in driving_video]
-
-
-
-        i = find_best_frame(source_image, driving_video)
-        print ("Best frame: " + str(i))
-        driving_forward = driving_video[i:]
-        driving_backward = driving_video[:(i+1)][::-1]
-        sources_forward, drivings_forward, predictions_forward,depth_forward = make_animation(source_image, driving_forward, generator, kp_detector, relative=True, adapt_movement_scale=True, cpu=False)
-        sources_backward, drivings_backward, predictions_backward,depth_backward = make_animation(source_image, driving_backward, generator, kp_detector, relative=True, adapt_movement_scale=True, cpu=False)
-        predictions = predictions_backward[::-1] + predictions_forward[1:]
-        sources = sources_backward[::-1] + sources_forward[1:]
-        drivings = drivings_backward[::-1] + drivings_forward[1:]
-        depth_gray = depth_backward[::-1] + depth_forward[1:]
-
-        imageio.mimsave(output, [np.concatenate((img_as_ubyte(s),img_as_ubyte(d),img_as_ubyte(p)),1) for (s,d,p) in zip(sources, drivings, predictions)], fps=fps)
-        imageio.mimsave("gray.mp4", depth_gray, fps=fps)
-        # merge the gray video
-        animation = np.array(imageio.mimread(output,memtest=False))
-        gray = np.array(imageio.mimread("gray.mp4",memtest=False))
-
-        src_dst = animation[:,:,:512,:]
-        animate = animation[:,:,512:,:]
-        merge = np.concatenate((src_dst,gray,animate),2)
-        imageio.mimsave(output, merge, fps=fps)
-
-    return output
-
+    os.system("python demo_dagan.py --source_image {} --driving_video 'temp/driving_video.mp4' --output 'temp/rst.mp4'".format(img))
+    return f'temp/rst.mp4'
+
 gr.Interface(
 inference,
 [
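With this change, app.py no longer builds the DaGAN models in-process: the Gradio inference callback trims the uploaded driving video to eight seconds with ffmpeg and then shells out to demo_dagan.py. A minimal sketch of an equivalent call, using subprocess.run with an explicit argument list rather than the os.system string the commit actually uses (the run_dagan helper name is hypothetical; the flags mirror demo_dagan.py's argparse options shown below):

    import subprocess

    def run_dagan(source_image_path):
        # Equivalent of the os.system() call added above; raises CalledProcessError on failure.
        subprocess.run(
            ["python", "demo_dagan.py",
             "--source_image", source_image_path,
             "--driving_video", "temp/driving_video.mp4",
             "--output", "temp/rst.mp4"],
            check=True,
        )
        return "temp/rst.mp4"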
demo_dagan.py CHANGED
@@ -6,10 +6,19 @@
 import torch
 import torch.nn.functional as F
 import os
+from skimage import img_as_ubyte
+import cv2
 import argparse
+import imageio
+from skimage.transform import resize
 from scipy.spatial import ConvexHull
 from tqdm import tqdm
 import numpy as np
+import modules.generator as G
+import modules.keypoint_detector as KPD
+import yaml
+from collections import OrderedDict
+import depth
 parser = argparse.ArgumentParser(description='Test DaGAN on your own images')
 parser.add_argument('--source_image', default='./temp/source.jpg', type=str, help='Directory of input source image')
 parser.add_argument('--driving_video', default='./temp/driving.mp4', type=str, help='Directory for driving video')
@@ -62,6 +71,7 @@ def find_best_frame(source, driving, cpu=False):
             frame_num = i
     return frame_num
 
+
 def make_animation(source_image, driving_video, generator, kp_detector, relative=True, adapt_movement_scale=True, cpu=False):
     sources = []
     drivings = []
@@ -111,88 +121,88 @@ def make_animation(source_image, driving_video, generator, kp_detector, relative
             predictions.append(np.transpose(out['prediction'].data.cpu().numpy(), [0, 2, 3, 1])[0])
             depth_gray.append(gray_driving)
     return sources, drivings, predictions,depth_gray
-# with open("config/vox-adv-256.yaml") as f:
-#     config = yaml.load(f)
-# generator = G.SPADEDepthAwareGenerator(**config['model_params']['generator_params'],**config['model_params']['common_params'])
-# config['model_params']['common_params']['num_channels'] = 4
-# kp_detector = KPD.KPDetector(**config['model_params']['kp_detector_params'],**config['model_params']['common_params'])
-# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
-
-# g_checkpoint = torch.load("generator.pt", map_location=device)
-# kp_checkpoint = torch.load("kp_detector.pt", map_location=device)
-
-# ckp_generator = OrderedDict((k.replace('module.',''),v) for k,v in g_checkpoint.items())
-# generator.load_state_dict(ckp_generator)
-# ckp_kp_detector = OrderedDict((k.replace('module.',''),v) for k,v in kp_checkpoint.items())
-# kp_detector.load_state_dict(ckp_kp_detector)
-
-# depth_encoder = depth.ResnetEncoder(18, False)
-# depth_decoder = depth.DepthDecoder(num_ch_enc=depth_encoder.num_ch_enc, scales=range(4))
-# loaded_dict_enc = torch.load('encoder.pth')
-# loaded_dict_dec = torch.load('depth.pth')
-# filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in depth_encoder.state_dict()}
-# depth_encoder.load_state_dict(filtered_dict_enc)
-# ckp_depth_decoder= {k: v for k, v in loaded_dict_dec.items() if k in depth_decoder.state_dict()}
-# depth_decoder.load_state_dict(ckp_depth_decoder)
-# depth_encoder.eval()
-# depth_decoder.eval()
+with open("config/vox-adv-256.yaml") as f:
+    config = yaml.load(f)
+generator = G.SPADEDepthAwareGenerator(**config['model_params']['generator_params'],**config['model_params']['common_params'])
+config['model_params']['common_params']['num_channels'] = 4
+kp_detector = KPD.KPDetector(**config['model_params']['kp_detector_params'],**config['model_params']['common_params'])
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+
+g_checkpoint = torch.load("generator.pt", map_location=device)
+kp_checkpoint = torch.load("kp_detector.pt", map_location=device)
+
+ckp_generator = OrderedDict((k.replace('module.',''),v) for k,v in g_checkpoint.items())
+generator.load_state_dict(ckp_generator)
+ckp_kp_detector = OrderedDict((k.replace('module.',''),v) for k,v in kp_checkpoint.items())
+kp_detector.load_state_dict(ckp_kp_detector)
+
+depth_encoder = depth.ResnetEncoder(18, False)
+depth_decoder = depth.DepthDecoder(num_ch_enc=depth_encoder.num_ch_enc, scales=range(4))
+loaded_dict_enc = torch.load('encoder.pth')
+loaded_dict_dec = torch.load('depth.pth')
+filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in depth_encoder.state_dict()}
+depth_encoder.load_state_dict(filtered_dict_enc)
+ckp_depth_decoder= {k: v for k, v in loaded_dict_dec.items() if k in depth_decoder.state_dict()}
+depth_decoder.load_state_dict(ckp_depth_decoder)
+depth_encoder.eval()
+depth_decoder.eval()
 
-# # device = torch.device('cpu')
-# # stx()
-
-# generator = generator.to(device)
-# kp_detector = kp_detector.to(device)
-# depth_encoder = depth_encoder.to(device)
-# depth_decoder = depth_decoder.to(device)
-
-# generator.eval()
-# kp_detector.eval()
-# depth_encoder.eval()
-# depth_decoder.eval()
-
-# img_multiple_of = 8
-
-# with torch.inference_mode():
-#     if torch.cuda.is_available():
-#         torch.cuda.ipc_collect()
-#         torch.cuda.empty_cache()
-#     source_image = imageio.imread(args.source_image)
-#     reader = imageio.get_reader(args.driving_video)
-#     fps = reader.get_meta_data()['fps']
-#     driving_video = []
-#     try:
-#         for im in reader:
-#             driving_video.append(im)
-#     except RuntimeError:
-#         pass
-#     reader.close()
-
-#     source_image = resize(source_image, (256, 256))[..., :3]
-#     driving_video = [resize(frame, (256, 256))[..., :3] for frame in driving_video]
-
-
-
-#     i = find_best_frame(source_image, driving_video)
-#     print ("Best frame: " + str(i))
-#     driving_forward = driving_video[i:]
-#     driving_backward = driving_video[:(i+1)][::-1]
-#     sources_forward, drivings_forward, predictions_forward,depth_forward = make_animation(source_image, driving_forward, generator, kp_detector, relative=True, adapt_movement_scale=True, cpu=False)
-#     sources_backward, drivings_backward, predictions_backward,depth_backward = make_animation(source_image, driving_backward, generator, kp_detector, relative=True, adapt_movement_scale=True, cpu=False)
-#     predictions = predictions_backward[::-1] + predictions_forward[1:]
-#     sources = sources_backward[::-1] + sources_forward[1:]
-#     drivings = drivings_backward[::-1] + drivings_forward[1:]
-#     depth_gray = depth_backward[::-1] + depth_forward[1:]
-
-#     imageio.mimsave(args.output, [np.concatenate((img_as_ubyte(s),img_as_ubyte(d),img_as_ubyte(p)),1) for (s,d,p) in zip(sources, drivings, predictions)], fps=fps)
-#     imageio.mimsave("gray.mp4", depth_gray, fps=fps)
-#     # merge the gray video
-#     animation = np.array(imageio.mimread(args.output,memtest=False))
-#     gray = np.array(imageio.mimread("gray.mp4",memtest=False))
-
-#     src_dst = animation[:,:,:512,:]
-#     animate = animation[:,:,512:,:]
-#     merge = np.concatenate((src_dst,gray,animate),2)
-#     imageio.mimsave(args.output, merge, fps=fps)
+# device = torch.device('cpu')
+# stx()
+
+generator = generator.to(device)
+kp_detector = kp_detector.to(device)
+depth_encoder = depth_encoder.to(device)
+depth_decoder = depth_decoder.to(device)
+
+generator.eval()
+kp_detector.eval()
+depth_encoder.eval()
+depth_decoder.eval()
+
+img_multiple_of = 8
+
+with torch.inference_mode():
+    if torch.cuda.is_available():
+        torch.cuda.ipc_collect()
+        torch.cuda.empty_cache()
+    source_image = imageio.imread(args.source_image)
+    reader = imageio.get_reader(args.driving_video)
+    fps = reader.get_meta_data()['fps']
+    driving_video = []
+    try:
+        for im in reader:
+            driving_video.append(im)
+    except RuntimeError:
+        pass
+    reader.close()
+
+    source_image = resize(source_image, (256, 256))[..., :3]
+    driving_video = [resize(frame, (256, 256))[..., :3] for frame in driving_video]
+
+
+
+    i = find_best_frame(source_image, driving_video)
+    print ("Best frame: " + str(i))
+    driving_forward = driving_video[i:]
+    driving_backward = driving_video[:(i+1)][::-1]
+    sources_forward, drivings_forward, predictions_forward,depth_forward = make_animation(source_image, driving_forward, generator, kp_detector, relative=True, adapt_movement_scale=True, cpu=False)
+    sources_backward, drivings_backward, predictions_backward,depth_backward = make_animation(source_image, driving_backward, generator, kp_detector, relative=True, adapt_movement_scale=True, cpu=False)
+    predictions = predictions_backward[::-1] + predictions_forward[1:]
+    sources = sources_backward[::-1] + sources_forward[1:]
+    drivings = drivings_backward[::-1] + drivings_forward[1:]
+    depth_gray = depth_backward[::-1] + depth_forward[1:]
+
+    imageio.mimsave(args.output, [np.concatenate((img_as_ubyte(s),img_as_ubyte(d),img_as_ubyte(p)),1) for (s,d,p) in zip(sources, drivings, predictions)], fps=fps)
+    imageio.mimsave("gray.mp4", depth_gray, fps=fps)
+    # merge the gray video
+    animation = np.array(imageio.mimread(args.output,memtest=False))
+    gray = np.array(imageio.mimread("gray.mp4",memtest=False))
+
+    src_dst = animation[:,:,:512,:]
+    animate = animation[:,:,512:,:]
+    merge = np.concatenate((src_dst,gray,animate),2)
+    imageio.mimsave(args.output, merge, fps=fps)
 
 # print(f"\nRestored images are saved at {out_dir}")
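The code now activated in demo_dagan.py animates outward from the best-matching driving frame: one make_animation pass runs forward from frame i, a second runs over the reversed prefix, and the two prediction lists are stitched back into the original frame order. A minimal sketch of that stitching step under the same indexing assumptions (the stitch helper and placeholder frame names are illustrative, not part of the commit):

    def stitch(predictions_forward, predictions_backward):
        # The backward pass covers frames [i, i-1, ..., 0], so reverse it,
        # then drop the duplicated frame i from the forward pass [i, i+1, ..., N-1].
        return predictions_backward[::-1] + predictions_forward[1:]

    # e.g. with best frame i = 2 in a 5-frame driving video:
    forward = ["p2", "p3", "p4"]      # predictions for frames 2..4
    backward = ["p2", "p1", "p0"]     # predictions for frames 2..0 (generated in reverse)
    assert stitch(forward, backward) == ["p0", "p1", "p2", "p3", "p4"]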
depth.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:11eb72a1e520d6086d9f357b6740340a235b067acdd6d495049877de2772d1a4
3
- size 12621521
 
 
 
 
encoder.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:de3d906dac888c2947cf0dabe319b8d3a5da98dd695d8b96512891f5c5a6bca3
3
- size 46837645
 
 
 
 
generator.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:34ac6a18ca3b0d9df080990d4975d9f4db04f7216fa9dbe4d580e920ee4b2bde
3
- size 270494161
 
 
 
 
kp_detector.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f03aac403bf71445163f22cd7f883548980603065326c6b8ee08b74ad18d1bd
3
- size 57103620