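# JoyHallo stage-2 config: audio-driven talking-head video generation built on
# Stable Diffusion v1.5 with motion and audio modules. Typically loaded via
# OmegaConf (e.g. OmegaConf.load on this file) by the repo's scripts.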
data:
  train_bs: 4
  val_bs: 1
  train_width: 512
  train_height: 512
  fps: 25
  sample_rate: 16000
  n_motion_frames: 2
  n_sample_frames: 16
  audio_margin: 2
  train_meta_paths:
    - "./data/inference.json"
wav2vec_config:
  audio_type: "vocals" # audio | vocals
  model_scale: "base" # base | large
  features: "all" # last | avg | all
  model_path: ./pretrained_models/chinese-wav2vec2-base
audio_separator:
  model_path: ./pretrained_models/audio_separator/Kim_Vocal_2.onnx
face_expand_ratio: 1.2
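
# Optimization: gradient accumulation, memory-efficient attention, LR schedule,
# and (8-bit) Adam hyperparameters.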
solver:
  gradient_accumulation_steps: 1
  mixed_precision: "no"
  enable_xformers_memory_efficient_attention: True
  gradient_checkpointing: True
  max_train_steps: 30000
  max_grad_norm: 1.0
  # lr
  learning_rate: 1e-5
  scale_lr: False
  lr_warmup_steps: 1
  lr_scheduler: "constant"

  # optimizer
  use_8bit_adam: True
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_weight_decay: 1.0e-2
  adam_epsilon: 1.0e-8
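
# Validation is run every validation_steps training steps.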
val:
  validation_steps: 1000
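
# Diffusion noise schedule (diffusers-style scheduler arguments).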
noise_scheduler_kwargs:
  num_train_timesteps: 1000
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: "linear"
  steps_offset: 1
  clip_sample: false
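
# Denoising UNet extensions: temporal motion modules plus audio cross-attention
# modules stacked into the listed UNet block groups.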
unet_additional_kwargs:
  use_inflated_groupnorm: true
  unet_use_cross_frame_attention: false
  unet_use_temporal_attention: false
  use_motion_module: true
  use_audio_module: true
  motion_module_resolutions:
    - 1
    - 2
    - 4
    - 8
  motion_module_mid_block: true
  motion_module_decoder_only: false
  motion_module_type: Vanilla
  motion_module_kwargs:
    num_attention_heads: 8
    num_transformer_block: 1
    attention_block_types:
      - Temporal_Self
      - Temporal_Self
    temporal_position_encoding: true
    temporal_position_encoding_max_len: 32
    temporal_attention_dim_div: 1
  audio_attention_dim: 768
  stack_enable_blocks_name:
    - "up"
    - "down"
    - "mid"
  stack_enable_blocks_depth: [0, 1, 2, 3]
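
# Only these module groups are trained in this stage; the rest stay frozen.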
trainable_para:
  - audio_modules
  - motion_modules
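
# Pretrained backbone weight paths.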
base_model_path: "./pretrained_models/stable-diffusion-v1-5"
vae_model_path: "./pretrained_models/sd-vae-ft-mse"
face_analysis_model_path: "./pretrained_models/face_analysis"
mm_path: "./pretrained_models/motion_module/mm_sd_v15_v2.ckpt"
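
# Precision, condition dropout, and noise settings. The uncond_* ratios are
# presumably classifier-free-guidance dropout probabilities; stage-1 weights
# are loaded from stage1_ckpt_dir.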
weight_dtype: "fp16" # [fp16, fp32]
uncond_img_ratio: 0.05
uncond_audio_ratio: 0.05
uncond_ia_ratio: 0.05
start_ratio: 0.05
noise_offset: 0.05
snr_gamma: 5.0
enable_zero_snr: True
stage1_ckpt_dir: "./exp_output/stage1/"
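
# Diffusion sampling settings used when generating validation/inference clips.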
single_inference_times: 10
inference_steps: 40
cfg_scale: 3.5
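
# Run bookkeeping: seed, checkpoint resume/interval, experiment name, outputs.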
seed: 42
resume_from_checkpoint: "latest"
checkpointing_steps: 500
exp_name: "joyhallo"
output_dir: "./opts"
audio_ckpt_dir: "./pretrained_models/joyhallo/net.pth"
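
# Inference inputs: each reference image is driven by the audio file at the
# same list position.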
ref_img_path:
  - "examples/reference_images/1.jpg"
  - "examples/reference_images/2.jpg"
  - "examples/reference_images/3.jpg"
  - "examples/reference_images/4.jpg"
  - "examples/reference_images/5.jpg"
  - "examples/reference_images/6.jpg"
  - "examples/reference_images/7.jpg"
audio_path:
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"