shisheng7 committed
Commit 098012a
Parent: 949ee8e

Create config.json

Files changed (1)
  1. config.json +132 -0
config.json ADDED
@@ -0,0 +1,132 @@
+ data:
+   train_bs: 4
+   val_bs: 1
+   train_width: 512
+   train_height: 512
+   fps: 25
+   sample_rate: 16000
+   n_motion_frames: 2
+   n_sample_frames: 16
+   audio_margin: 2
+   train_meta_paths:
+     - "./data/inference.json"
+
+ wav2vec_config:
+   audio_type: "vocals" # audio vocals
+   model_scale: "base" # base large
+   features: "all" # last avg all
+   model_path: ./pretrained_models/chinese-wav2vec2-base
+ audio_separator:
+   model_path: ./pretrained_models/audio_separator/Kim_Vocal_2.onnx
+ face_expand_ratio: 1.2
+
+ solver:
+   gradient_accumulation_steps: 1
+   mixed_precision: "no"
+   enable_xformers_memory_efficient_attention: True
+   gradient_checkpointing: True
+   max_train_steps: 30000
+   max_grad_norm: 1.0
+   # lr
+   learning_rate: 1e-5
+   scale_lr: False
+   lr_warmup_steps: 1
+   lr_scheduler: "constant"
+
+   # optimizer
+   use_8bit_adam: True
+   adam_beta1: 0.9
+   adam_beta2: 0.999
+   adam_weight_decay: 1.0e-2
+   adam_epsilon: 1.0e-8
+
+ val:
+   validation_steps: 1000
+
+ noise_scheduler_kwargs:
+   num_train_timesteps: 1000
+   beta_start: 0.00085
+   beta_end: 0.012
+   beta_schedule: "linear"
+   steps_offset: 1
+   clip_sample: false
+
+ unet_additional_kwargs:
+   use_inflated_groupnorm: true
+   unet_use_cross_frame_attention: false
+   unet_use_temporal_attention: false
+   use_motion_module: true
+   use_audio_module: true
+   motion_module_resolutions:
+     - 1
+     - 2
+     - 4
+     - 8
+   motion_module_mid_block: true
+   motion_module_decoder_only: false
+   motion_module_type: Vanilla
+   motion_module_kwargs:
+     num_attention_heads: 8
+     num_transformer_block: 1
+     attention_block_types:
+       - Temporal_Self
+       - Temporal_Self
+     temporal_position_encoding: true
+     temporal_position_encoding_max_len: 32
+     temporal_attention_dim_div: 1
+   audio_attention_dim: 768
+   stack_enable_blocks_name:
+     - "up"
+     - "down"
+     - "mid"
+   stack_enable_blocks_depth: [0,1,2,3]
+
+ trainable_para:
+   - audio_modules
+   - motion_modules
+
+ base_model_path: "./pretrained_models/stable-diffusion-v1-5"
+ vae_model_path: "./pretrained_models/sd-vae-ft-mse"
+ face_analysis_model_path: "./pretrained_models/face_analysis"
+ mm_path: "./pretrained_models/motion_module/mm_sd_v15_v2.ckpt"
+
+ weight_dtype: "fp16" # [fp16, fp32]
+ uncond_img_ratio: 0.05
+ uncond_audio_ratio: 0.05
+ uncond_ia_ratio: 0.05
+ start_ratio: 0.05
+ noise_offset: 0.05
+ snr_gamma: 5.0
+ enable_zero_snr: True
+ stage1_ckpt_dir: "./exp_output/stage1/"
+
+ single_inference_times: 10
+ inference_steps: 40
+ cfg_scale: 3.5
+
+ seed: 42
+ resume_from_checkpoint: "latest"
+ checkpointing_steps: 500
+
+ exp_name: "joyhallo"
+ output_dir: "./opts"
+
+ audio_ckpt_dir: "./pretrained_models/joyhallo/net.pth"
+
+ ref_img_path:
+   - "examples/reference_images/1.jpg"
+   - "examples/reference_images/2.jpg"
+   - "examples/reference_images/3.jpg"
+   - "examples/reference_images/4.jpg"
+   - "examples/reference_images/5.jpg"
+   - "examples/reference_images/6.jpg"
+   - "examples/reference_images/7.jpg"
+
+ audio_path:
+   - "examples/driving_audios/0.wav"
+   - "examples/driving_audios/0.wav"
+   - "examples/driving_audios/0.wav"
+   - "examples/driving_audios/0.wav"
+   - "examples/driving_audios/0.wav"
+   - "examples/driving_audios/0.wav"
+   - "examples/driving_audios/0.wav"