Spaces:

Pie31415
/

rome

Build error

App Files Files Community

rome / app.py

Pie31415

updated layout

57885e4 almost 2 years ago

raw

history blame

5.87 kB

	import pickle
	import shutil
	import sys

	import gradio as gr
	import torch
	from easydict import EasyDict as edict
	from huggingface_hub import hf_hub_download

	sys.path.append("./rome/")
	sys.path.append('./DECA')

	from rome.infer import Infer
	from rome.src.utils.processing import process_black_shape, tensor2image

	# Fetch the two pretrained checkpoints from the 'Pie31415/rome' Hub repo.
	# hf_hub_download returns a local file path for each; the downloads happen
	# in the order listed (MODNet matting checkpoint first, then ROME).
	default_modnet_path, default_model_path = (
	    hf_hub_download('Pie31415/rome', checkpoint_name)
	    for checkpoint_name in (
	        'modnet_photographic_portrait_matting.ckpt',
	        'rome.pth',
	    )
	)

	# Inference configuration passed to ``Infer``. The keys mirror command-line
	# parser options (hence "parser configurations"); they are grouped below by
	# key prefix purely for readability -- order and values are unchanged.
	args = edict(dict(
	    # --- output / run control ---
	    save_dir=".",
	    save_render=True,
	    model_checkpoint=default_model_path,
	    modnet_path=default_modnet_path,
	    random_seed=0,
	    debug=False,
	    verbose=False,
	    # --- input size and alignment ---
	    model_image_size=256,
	    align_source=True,
	    align_target=False,
	    align_scale=1.25,
	    use_mesh_deformations=False,
	    subdivide_mesh=False,
	    # --- renderer_* ---
	    renderer_sigma=1e-08,
	    renderer_zfar=100.0,
	    renderer_type="soft_mesh",
	    renderer_texture_type="texture_uv",
	    renderer_normalized_alphas=False,
	    # --- data locations ---
	    deca_path="DECA",
	    rome_data_dir="rome/data",
	    # --- autoenc_* ---
	    autoenc_cat_alphas=False,
	    autoenc_align_inputs=False,
	    autoenc_use_warp=False,
	    autoenc_num_channels=64,
	    autoenc_max_channels=512,
	    autoenc_num_groups=4,
	    autoenc_num_bottleneck_groups=0,
	    autoenc_num_blocks=2,
	    autoenc_num_layers=4,
	    autoenc_block_type="bottleneck",
	    neural_texture_channels=8,
	    num_harmonic_encoding_funcs=6,
	    # --- unet_* ---
	    unet_num_channels=64,
	    unet_max_channels=512,
	    unet_num_groups=4,
	    unet_num_blocks=1,
	    unet_num_layers=2,
	    unet_block_type="conv",
	    unet_skip_connection_type="cat",
	    unet_use_normals_cond=True,
	    unet_use_vertex_cond=False,
	    unet_use_uvs_cond=False,
	    unet_pred_mask=False,
	    use_separate_seg_unet=True,
	    # --- layer types ---
	    norm_layer_type="gn",
	    activation_type="relu",
	    conv_layer_type="ws_conv",
	    deform_norm_layer_type="gn",
	    deform_activation_type="relu",
	    deform_conv_layer_type="ws_conv",
	    # --- segmentation / deformation options ---
	    unet_seg_weight=0.0,
	    unet_seg_type="bce_with_logits",
	    deform_face_tightness=0.0001,
	    use_whole_segmentation=False,
	    mask_hair_for_neck=False,
	    use_hair_from_avatar=False,
	    use_scalp_deforms=True,
	    use_neck_deforms=True,
	    use_basis_deformer=False,
	    use_unet_deformer=True,
	    # --- basis options ---
	    pretrained_encoder_basis_path="",
	    pretrained_vertex_basis_path="",
	    num_basis=50,
	    basis_init="pca",
	    num_vertex=5023,
	    train_basis=True,
	    path_to_deca="DECA",
	    # The next two paths were marked "N/A" by the author -- presumably the
	    # files are not shipped with this demo; TODO confirm.
	    path_to_linear_hair_model="data/linear_hair.pth",
	    path_to_mobile_model="data/disp_model.pth",
	    n_scalp=60,
	    use_distill=False,
	    use_mobile_version=False,
	    deformer_path="data/rome.pth",
	    output_unet_deformer_feats=32,
	    use_deca_details=False,
	    use_flametex=False,
	    upsample_type="nearest",
	    num_frequencies=6,
	    deform_face_scale_coef=0.0,
	    # --- execution device ---
	    device="cpu",
	))

	# Download the FLAME and DECA pretrained assets and stage them under
	# ./DECA/data/, next to the DECA sources added to sys.path above.
	generic_model_path = hf_hub_download('Pie31415/rome', 'generic_model.pkl')
	deca_model_path = hf_hub_download('Pie31415/rome', 'deca_model.tar')

	# Re-serialise generic_model.pkl: it is loaded with encoding='latin1' (the
	# setting the pickle docs give for Python 2 pickles holding NumPy arrays --
	# presumably the case here) and dumped again with the default protocol.
	# NOTE(security): pickle.load can execute arbitrary code embedded in the
	# file, so this is only as trustworthy as the 'Pie31415/rome' Hub repo.
	with open(generic_model_path, 'rb') as src:
	    generic_model = pickle.load(src, encoding='latin1')

	with open('./DECA/data/generic_model.pkl', 'wb') as dst:
	    pickle.dump(generic_model, dst)

	# Plain byte-for-byte copy of the DECA archive. shutil.copyfile replaces the
	# former line-by-line loop, which treated binary data as text lines and
	# shadowed the builtin ``input`` at module level.
	shutil.copyfile(deca_model_path, './DECA/data/deca_model.tar')

	# load ROME inference model
	# Constructed once at start-up from the ``args`` configuration; this single
	# instance is what image_inference calls for every request.
	infer = Infer(args)

	def image_inference(source_img=None, driver_img=None):
	    """Run ROME on one source/driver pair and return a side-by-side image.

	    Args:
	        source_img: Source image. The UI components feeding this function
	            are declared with ``type="pil"``, so a PIL image is expected.
	        driver_img: Driver image, delivered the same way.

	    Returns:
	        The result of ``tensor2image`` on four tensors concatenated along
	        dimension 2 (source image, target image, masked render, predicted
	        target shape image), with its last axis reversed.

	    Raises:
	        gr.Error: If either image is missing, so the user sees a readable
	            message instead of a failure inside the model.
	    """
	    # The parameters were previously annotated with ``gr.inputs.Image``, a
	    # component class rather than the type of the values received; the
	    # annotation also breaks on Gradio releases without ``gr.inputs``.
	    if source_img is None or driver_img is None:
	        raise gr.Error("Please provide both a source image and a driver image.")

	    out = infer.evaluate(source_img, driver_img, crop_center=False)
	    data_dict = out['source_information']['data_dict']
	    panels = [
	        data_dict['source_img'][0].cpu(),
	        data_dict['target_img'][0].cpu(),
	        out['render_masked'].cpu(),
	        out['pred_target_shape_img'][0].cpu(),
	    ]
	    res = tensor2image(torch.cat(panels, dim=2))
	    # Reverse the last axis -- presumably a BGR -> RGB channel swap for
	    # display; TODO confirm against tensor2image.
	    return res[..., ::-1]

	def video_inference():
	    """Placeholder for video inference: takes no arguments, returns None."""
	    return None

	# Build the Gradio UI: a title, a short description with links, one tab for
	# image inference and one for video inference, example inputs, and the
	# button callbacks. The app is started by demo.launch() at the end.
	# NOTE(review): the statements below belong inside their enclosing ``with``
	# blocks; confirm the nesting (which widgets sit inside each gr.Row) against
	# the intended layout.
	with gr.Blocks() as demo:
	gr.Markdown("# <p align='center'>ROME: Realistic one-shot mesh-based head avatars</p>")

	# NOTE(review): the text below contains ``\|``, which is not a recognised
	# escape sequence in a normal Python string literal; consider a raw string
	# or a plain ``|`` -- confirm how the separator should render.
	gr.Markdown(
	"""
	<p style='text-align: center'>
	Create a personal avatar from just a single image using ROME.
	<br> <a href='https://arxiv.org/abs/2206.08343' target='_blank'>Paper</a> \| <a href='https://samsunglabs.github.io/rome' target='_blank'>Project Page</a> \| <a href='https://github.com/SamsungLabs/rome' target='_blank'>Github</a>
	</p>
	"""
	)

	# Image tab: two PIL image inputs, one image output and a Predict button.
	with gr.Tab("Image Inference"):
	with gr.Row():
	source_img = gr.Image(type="pil", label="source image", show_label=True)
	driver_img = gr.Image(type="pil", label="driver image", show_label=True)
	image_output = gr.Image()
	image_button = gr.Button("Predict")
	# Video tab: a video input, a PIL image input, an image output and a button.
	with gr.Tab("Video Inference"):
	with gr.Row():
	source_video = gr.Video(label="source video", )
	driver_image_for_vid = gr.Image(type="pil", label="driver image", show_label=True)
	video_output = gr.Image()
	video_button = gr.Button("Predict")

	# Example source/driver pairs for the image tab, wired to image_inference.
	# With cache_examples=True Gradio is expected to pre-compute the example
	# outputs by calling image_inference -- verify the start-up cost on CPU.
	gr.Examples(
	examples=[
	["./examples/lincoln.jpg", "./examples/taras2.jpg"],
	["./examples/lincoln.jpg", "./examples/taras1.jpg"]
	],
	inputs=[source_img, driver_img],
	outputs=[image_output],
	fn=image_inference,
	cache_examples=True
	)

	# Only the image button has a handler. The video button is registered with
	# None as its function: video_inference is still a stub and is not wired in.
	image_button.click(image_inference, inputs=[source_img, driver_img], outputs=image_output)
	video_button.click(None, inputs=[source_video, driver_image_for_vid], outputs=video_output)

	# Start the Gradio app.
	demo.launch()