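# Inference configuration for JoyHallo (audio-driven talking-head generation).
# A minimal loading sketch, assuming an OmegaConf-based loader as in Hallo-style
# projects (the config file path is an assumption):
#   from omegaconf import OmegaConf
#   cfg = OmegaConf.load("inference.yaml")
#   print(cfg.data.n_sample_frames)  # -> 16

# Data settings: batch sizes, frame geometry, and audio/video sampling.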
data:
  train_bs: 4 # training batch size
  val_bs: 1 # validation batch size
  train_width: 512 # frame width in pixels
  train_height: 512 # frame height in pixels
  fps: 25 # video frame rate
  sample_rate: 16000 # audio sample rate (16 kHz, as expected by wav2vec2)
  n_motion_frames: 2 # preceding frames fed back as motion context
  n_sample_frames: 16 # frames generated per clip
  audio_margin: 2 # extra audio frames of context on each side of a frame
  train_meta_paths:
    - "./data/inference.json"

wav2vec_config:
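  # wav2vec2 feature extractor for the driving audio (Chinese wav2vec2-base checkpoint).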
  audio_type: "vocals" # options: audio | vocals
  model_scale: "base" # options: base | large
  features: "all" # options: last | avg | all
  model_path: ./pretrained_models/chinese-wav2vec2-base
audio_separator:
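  # Vocal-separation model used to isolate the voice track from the input audio.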
  model_path: ./pretrained_models/audio_separator/Kim_Vocal_2.onnx
face_expand_ratio: 1.2 # enlarge the detected face crop by this factor

solver:
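  # Optimizer and training-loop settings (used when this config drives training).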
  gradient_accumulation_steps: 1
  mixed_precision: "no"
  enable_xformers_memory_efficient_attention: true
  gradient_checkpointing: true
  max_train_steps: 30000
  max_grad_norm: 1.0
  # lr
  learning_rate: 1.0e-5 # decimal point keeps strict YAML 1.1 parsers from reading this as a string
  scale_lr: false
  lr_warmup_steps: 1
  lr_scheduler: "constant"

  # optimizer
  use_8bit_adam: true
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_weight_decay: 1.0e-2
  adam_epsilon: 1.0e-8

val:
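  # Run validation every `validation_steps` training steps.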
  validation_steps: 1000

noise_scheduler_kwargs:
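  # Diffusion noise scheduler settings. A construction sketch, assuming the
  # diffusers library (these keys match its scheduler signature) and continuing
  # the loading sketch at the top of this file:
  #   from diffusers import DDIMScheduler
  #   sched = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))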
  num_train_timesteps: 1000
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: "linear"
  steps_offset: 1
  clip_sample: false

unet_additional_kwargs:
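  # Extra UNet options: AnimateDiff-style temporal motion modules plus
  # audio cross-attention modules that drive the face from speech features.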
  use_inflated_groupnorm: true
  unet_use_cross_frame_attention: false
  unet_use_temporal_attention: false
  use_motion_module: true
  use_audio_module: true
  motion_module_resolutions:
    - 1
    - 2
    - 4
    - 8
  motion_module_mid_block: true
  motion_module_decoder_only: false
  motion_module_type: Vanilla
  motion_module_kwargs:
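    # Temporal self-attention blocks; positional encoding supports clips up to 32 frames.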
    num_attention_heads: 8
    num_transformer_block: 1
    attention_block_types:
      - Temporal_Self
      - Temporal_Self
    temporal_position_encoding: true
    temporal_position_encoding_max_len: 32
    temporal_attention_dim_div: 1
  audio_attention_dim: 768 # matches the wav2vec2-base hidden size
  # UNet stages in which the audio attention blocks are enabled, and their depths.
  stack_enable_blocks_name:
    - "up"
    - "down"
    - "mid"
  stack_enable_blocks_depth: [0, 1, 2, 3]

trainable_para:
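  # Only these module groups are trained; the rest of the network stays frozen.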
  - audio_modules
  - motion_modules

# Pretrained component paths: SD 1.5 backbone, VAE, face-analysis models,
# and the AnimateDiff motion-module checkpoint.
base_model_path: "./pretrained_models/stable-diffusion-v1-5"
vae_model_path: "./pretrained_models/sd-vae-ft-mse"
face_analysis_model_path: "./pretrained_models/face_analysis"
mm_path: "./pretrained_models/motion_module/mm_sd_v15_v2.ckpt"

weight_dtype: "fp16" # options: fp16 | fp32
# Condition-dropout ratios for classifier-free guidance training.
uncond_img_ratio: 0.05 # drop the reference-image condition
uncond_audio_ratio: 0.05 # drop the audio condition
uncond_ia_ratio: 0.05 # drop image and audio together
start_ratio: 0.05
noise_offset: 0.05 # noise-offset trick applied to the training noise
snr_gamma: 5.0 # Min-SNR loss weighting
enable_zero_snr: true # rescale betas for zero terminal SNR
stage1_ckpt_dir: "./exp_output/stage1/"

single_inference_times: 10 # number of generations per inference run
inference_steps: 40 # denoising steps
cfg_scale: 3.5 # classifier-free guidance scale

seed: 42
resume_from_checkpoint: "latest" # "latest" resumes from the newest checkpoint
checkpointing_steps: 500 # save a checkpoint every 500 steps

exp_name: "joyhallo"
output_dir: "./opts"

audio_ckpt_dir: "./pretrained_models/joyhallo/net.pth" # trained JoyHallo weights to load

# Reference images and driving audio clips; entries are presumably paired by index.
ref_img_path:
  - "examples/reference_images/1.jpg"
  - "examples/reference_images/2.jpg"
  - "examples/reference_images/3.jpg"
  - "examples/reference_images/4.jpg"
  - "examples/reference_images/5.jpg"
  - "examples/reference_images/6.jpg"
  - "examples/reference_images/7.jpg"
  
audio_path:
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"