Diffusers
AudioLDM2Pipeline
anhnct committed on
Commit
df84f99
1 Parent(s): e16f423

Upload 27 files

Browse files
feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length_s": 10,
3
+ "feature_extractor_type": "ClapFeatureExtractor",
4
+ "feature_size": 64,
5
+ "fft_window_size": 1024,
6
+ "frequency_max": 14000,
7
+ "frequency_min": 50,
8
+ "hop_length": 480,
9
+ "max_length_s": 10,
10
+ "n_fft": 1024,
11
+ "nb_frequency_bins": 513,
12
+ "nb_max_frames": 1000,
13
+ "nb_max_samples": 480000,
14
+ "padding": "repeatpad",
15
+ "padding_side": "right",
16
+ "padding_value": 0.0,
17
+ "processor_class": "ClapProcessor",
18
+ "return_attention_mask": false,
19
+ "sampling_rate": 48000,
20
+ "top_db": null,
21
+ "truncation": "rand_trunc"
22
+ }
language_model/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2Model"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 50256,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "max_new_tokens": 8,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "text-generation": {
31
+ "do_sample": true,
32
+ "max_length": 50
33
+ }
34
+ },
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.34.0",
37
+ "use_cache": true,
38
+ "vocab_size": 50257
39
+ }
language_model/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e172b4b624b7419cf1f2c60b35837109eaa2e2378a89f0eddbc2a112a9fb9620
3
+ size 497803738
model_index.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AudioLDM2Pipeline",
3
+ "_diffusers_version": "0.22.0.dev0",
4
+ "feature_extractor": [
5
+ "transformers",
6
+ "ClapFeatureExtractor"
7
+ ],
8
+ "language_model": [
9
+ "transformers",
10
+ "GPT2Model"
11
+ ],
12
+ "projection_model": [
13
+ "audioldm2",
14
+ "AudioLDM2ProjectionModel"
15
+ ],
16
+ "scheduler": [
17
+ "diffusers",
18
+ "DDIMScheduler"
19
+ ],
20
+ "text_encoder": [
21
+ "transformers",
22
+ "ClapModel"
23
+ ],
24
+ "text_encoder_2": [
25
+ "transformers",
26
+ "VitsModel"
27
+ ],
28
+ "tokenizer": [
29
+ "transformers",
30
+ "RobertaTokenizerFast"
31
+ ],
32
+ "tokenizer_2": [
33
+ "transformers",
34
+ "VitsTokenizer"
35
+ ],
36
+ "unet": [
37
+ "audioldm2",
38
+ "AudioLDM2UNet2DConditionModel"
39
+ ],
40
+ "vae": [
41
+ "diffusers",
42
+ "AutoencoderKL"
43
+ ],
44
+ "vocoder": [
45
+ "transformers",
46
+ "SpeechT5HifiGan"
47
+ ]
48
+ }
projection_model/config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AudioLDM2ProjectionModel",
3
+ "_diffusers_version": "0.22.0.dev0",
4
+ "langauge_model_dim": 768,
5
+ "max_seq_length": 310,
6
+ "text_encoder_1_dim": 192,
7
+ "text_encoder_dim": 512
8
+ }
projection_model/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d045022a64d6688e657cc761a0cb98b3c2df9df7873fc25d860e2306652465ee
3
+ size 2422924
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "DDIMScheduler",
3
+ "_diffusers_version": "0.22.0.dev0",
4
+ "beta_end": 0.0195,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.0015,
7
+ "clip_sample": false,
8
+ "clip_sample_range": 1.0,
9
+ "dynamic_thresholding_ratio": 0.995,
10
+ "num_train_timesteps": 1000,
11
+ "prediction_type": "epsilon",
12
+ "rescale_betas_zero_snr": false,
13
+ "sample_max_value": 1.0,
14
+ "set_alpha_to_one": false,
15
+ "steps_offset": 1,
16
+ "thresholding": false,
17
+ "timestep_spacing": "leading",
18
+ "trained_betas": null
19
+ }
text_encoder/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ClapModel"
4
+ ],
5
+ "audio_config": {
6
+ "depths": [
7
+ 2,
8
+ 2,
9
+ 12,
10
+ 2
11
+ ],
12
+ "fusion_num_hidden_layers": 2,
13
+ "hidden_size": 1024,
14
+ "model_type": "clap_audio_model",
15
+ "patch_embeds_hidden_size": 128,
16
+ "projection_hidden_size": 768
17
+ },
18
+ "hidden_size": 768,
19
+ "initializer_factor": 1.0,
20
+ "logit_scale_init_value": 14.285714285714285,
21
+ "model_type": "clap",
22
+ "num_hidden_layers": 16,
23
+ "projection_dim": 512,
24
+ "projection_hidden_act": "relu",
25
+ "text_config": {
26
+ "classifier_dropout": null,
27
+ "fusion_hidden_size": 768,
28
+ "fusion_num_hidden_layers": 2,
29
+ "initializer_range": 0.02,
30
+ "model_type": "clap_text_model",
31
+ "projection_hidden_size": 768
32
+ },
33
+ "torch_dtype": "float32",
34
+ "transformers_version": "4.34.0"
35
+ }
text_encoder/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5273abc4ce184eac0e026002b27b1e197037e2a107184a71f6ebb5afb4090d6a
3
+ size 776445110
text_encoder_2/config.json ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "architectures": [
4
+ "VitsModel"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "depth_separable_channels": 2,
8
+ "depth_separable_num_layers": 3,
9
+ "duration_predictor_dropout": 0.5,
10
+ "duration_predictor_filter_channels": 256,
11
+ "duration_predictor_flow_bins": 10,
12
+ "duration_predictor_kernel_size": 3,
13
+ "duration_predictor_num_flows": 4,
14
+ "duration_predictor_tail_bound": 5.0,
15
+ "ffn_dim": 768,
16
+ "ffn_kernel_size": 3,
17
+ "flow_size": 192,
18
+ "hidden_act": "relu",
19
+ "hidden_dropout": 0.1,
20
+ "hidden_size": 192,
21
+ "initializer_range": 0.02,
22
+ "layer_norm_eps": 1e-05,
23
+ "layerdrop": 0.1,
24
+ "leaky_relu_slope": 0.1,
25
+ "model_type": "vits",
26
+ "noise_scale": 0.667,
27
+ "noise_scale_duration": 0.8,
28
+ "num_attention_heads": 2,
29
+ "num_hidden_layers": 6,
30
+ "num_speakers": 1,
31
+ "posterior_encoder_num_wavenet_layers": 16,
32
+ "prior_encoder_num_flows": 4,
33
+ "prior_encoder_num_wavenet_layers": 4,
34
+ "resblock_dilation_sizes": [
35
+ [
36
+ 1,
37
+ 3,
38
+ 5
39
+ ],
40
+ [
41
+ 1,
42
+ 3,
43
+ 5
44
+ ],
45
+ [
46
+ 1,
47
+ 3,
48
+ 5
49
+ ]
50
+ ],
51
+ "resblock_kernel_sizes": [
52
+ 3,
53
+ 7,
54
+ 11
55
+ ],
56
+ "sampling_rate": 16000,
57
+ "speaker_embedding_size": 0,
58
+ "speaking_rate": 1.0,
59
+ "spectrogram_bins": 513,
60
+ "torch_dtype": "float32",
61
+ "transformers_version": "4.34.0",
62
+ "upsample_initial_channel": 512,
63
+ "upsample_kernel_sizes": [
64
+ 16,
65
+ 16,
66
+ 4,
67
+ 4
68
+ ],
69
+ "upsample_rates": [
70
+ 8,
71
+ 8,
72
+ 2,
73
+ 2
74
+ ],
75
+ "use_bias": true,
76
+ "use_stochastic_duration_prediction": true,
77
+ "vocab_size": 183,
78
+ "wavenet_dilation_rate": 1,
79
+ "wavenet_dropout": 0.0,
80
+ "wavenet_kernel_size": 5,
81
+ "window_size": 4
82
+ }
text_encoder_2/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c0825d996bb5e20ee89af0a022ba9b43af0afba2e033dac8abe83b3b45172b7
3
+ size 145503282
tokenizer/added_tokens.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "</s>": 2,
3
+ "<mask>": 50264,
4
+ "<pad>": 1,
5
+ "<s>": 0,
6
+ "<unk>": 3
7
+ }
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": "<mask>",
6
+ "pad_token": "<pad>",
7
+ "sep_token": "</s>",
8
+ "unk_token": "<unk>"
9
+ }
tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "additional_special_tokens": [],
46
+ "bos_token": "<s>",
47
+ "clean_up_tokenization_spaces": true,
48
+ "cls_token": "<s>",
49
+ "eos_token": "</s>",
50
+ "errors": "replace",
51
+ "mask_token": "<mask>",
52
+ "max_length": null,
53
+ "model_max_length": 512,
54
+ "pad_to_multiple_of": null,
55
+ "pad_token": "<pad>",
56
+ "pad_token_type_id": 0,
57
+ "padding_side": "right",
58
+ "processor_class": "ClapProcessor",
59
+ "sep_token": "</s>",
60
+ "tokenizer_class": "RobertaTokenizer",
61
+ "trim_offsets": true,
62
+ "unk_token": "<unk>"
63
+ }
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_2/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "_": 0
3
+ }
tokenizer_2/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "pad_token": "_"
3
+ }
tokenizer_2/tokenizer_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_blank": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "_",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "additional_special_tokens": [],
14
+ "clean_up_tokenization_spaces": true,
15
+ "is_uroman": false,
16
+ "language": "eng",
17
+ "model_max_length": 310,
18
+ "normalize": true,
19
+ "pad_token": "_",
20
+ "phonemize": true,
21
+ "tokenizer_class": "VitsTokenizer",
22
+ "tokenizer_file": null
23
+ }
tokenizer_2/vocab.json ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_": 0,
3
+ ";": 1,
4
+ ":": 2,
5
+ ",": 3,
6
+ ".": 4,
7
+ "!": 5,
8
+ "?": 6,
9
+ "¡": 7,
10
+ "¿": 8,
11
+ "—": 9,
12
+ "…": 10,
13
+ "\"": 11,
14
+ "«": 12,
15
+ "»": 13,
16
+ "“": 14,
17
+ "”": 15,
18
+ " ": 16,
19
+ "A": 17,
20
+ "B": 18,
21
+ "C": 19,
22
+ "D": 20,
23
+ "E": 21,
24
+ "F": 22,
25
+ "G": 23,
26
+ "H": 24,
27
+ "I": 25,
28
+ "J": 26,
29
+ "K": 27,
30
+ "L": 28,
31
+ "M": 29,
32
+ "N": 30,
33
+ "O": 31,
34
+ "P": 32,
35
+ "Q": 33,
36
+ "R": 34,
37
+ "S": 35,
38
+ "T": 36,
39
+ "U": 37,
40
+ "V": 38,
41
+ "W": 39,
42
+ "X": 40,
43
+ "Y": 41,
44
+ "Z": 42,
45
+ "a": 43,
46
+ "b": 44,
47
+ "c": 45,
48
+ "d": 46,
49
+ "e": 47,
50
+ "f": 48,
51
+ "g": 49,
52
+ "h": 50,
53
+ "i": 51,
54
+ "j": 52,
55
+ "k": 53,
56
+ "l": 54,
57
+ "m": 55,
58
+ "n": 56,
59
+ "o": 57,
60
+ "p": 58,
61
+ "q": 59,
62
+ "r": 60,
63
+ "s": 61,
64
+ "t": 62,
65
+ "u": 63,
66
+ "v": 64,
67
+ "w": 65,
68
+ "x": 66,
69
+ "y": 67,
70
+ "z": 68,
71
+ "ɑ": 69,
72
+ "ɐ": 70,
73
+ "ɒ": 71,
74
+ "æ": 72,
75
+ "ɓ": 73,
76
+ "ʙ": 74,
77
+ "β": 75,
78
+ "ɔ": 76,
79
+ "ɕ": 77,
80
+ "ç": 78,
81
+ "ɗ": 79,
82
+ "ɖ": 80,
83
+ "ð": 81,
84
+ "ʤ": 82,
85
+ "ə": 83,
86
+ "ɘ": 84,
87
+ "ɚ": 85,
88
+ "ɛ": 86,
89
+ "ɜ": 87,
90
+ "ɝ": 88,
91
+ "ɞ": 89,
92
+ "ɟ": 90,
93
+ "ʄ": 91,
94
+ "ɡ": 92,
95
+ "ɠ": 93,
96
+ "ɢ": 94,
97
+ "ʛ": 95,
98
+ "ɦ": 96,
99
+ "ɧ": 97,
100
+ "ħ": 98,
101
+ "ɥ": 99,
102
+ "ʜ": 100,
103
+ "ɨ": 101,
104
+ "ɪ": 102,
105
+ "ʝ": 103,
106
+ "ɭ": 104,
107
+ "ɬ": 105,
108
+ "ɫ": 106,
109
+ "ɮ": 107,
110
+ "ʟ": 108,
111
+ "ɱ": 109,
112
+ "ɯ": 110,
113
+ "ɰ": 111,
114
+ "ŋ": 112,
115
+ "ɳ": 113,
116
+ "ɲ": 114,
117
+ "ɴ": 115,
118
+ "ø": 116,
119
+ "ɵ": 117,
120
+ "ɸ": 118,
121
+ "θ": 119,
122
+ "œ": 120,
123
+ "ɶ": 121,
124
+ "ʘ": 122,
125
+ "ɹ": 123,
126
+ "ɺ": 124,
127
+ "ɾ": 125,
128
+ "ɻ": 126,
129
+ "ʀ": 127,
130
+ "ʁ": 128,
131
+ "ɽ": 129,
132
+ "ʂ": 130,
133
+ "ʃ": 131,
134
+ "ʈ": 132,
135
+ "ʧ": 133,
136
+ "ʉ": 134,
137
+ "ʊ": 135,
138
+ "ʋ": 136,
139
+ "ⱱ": 137,
140
+ "ʌ": 138,
141
+ "ɣ": 139,
142
+ "ɤ": 140,
143
+ "ʍ": 141,
144
+ "χ": 142,
145
+ "ʎ": 143,
146
+ "ʏ": 144,
147
+ "ʑ": 145,
148
+ "ʐ": 146,
149
+ "ʒ": 147,
150
+ "ʔ": 148,
151
+ "ʡ": 149,
152
+ "ʕ": 150,
153
+ "ʢ": 151,
154
+ "ǀ": 152,
155
+ "ǁ": 153,
156
+ "ǂ": 154,
157
+ "ǃ": 155,
158
+ "ˈ": 156,
159
+ "ˌ": 157,
160
+ "ː": 158,
161
+ "ˑ": 159,
162
+ "ʼ": 160,
163
+ "ʴ": 161,
164
+ "ʰ": 162,
165
+ "ʱ": 163,
166
+ "ʲ": 164,
167
+ "ʷ": 165,
168
+ "ˠ": 166,
169
+ "ˤ": 167,
170
+ "˞": 168,
171
+ "↓": 169,
172
+ "↑": 170,
173
+ "→": 171,
174
+ "↗": 172,
175
+ "↘": 173,
176
+ "'": 176,
177
+ "̩": 175,
178
+ "ᵻ": 177,
179
+ "♪": 178,
180
+ "☎": 179,
181
+ "☒": 180,
182
+ "☝": 181,
183
+ "⚠": 182
184
+ }
unet/config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AudioLDM2UNet2DConditionModel",
3
+ "_diffusers_version": "0.22.0.dev0",
4
+ "act_fn": "silu",
5
+ "attention_head_dim": 8,
6
+ "block_out_channels": [
7
+ 128,
8
+ 256,
9
+ 384,
10
+ 640
11
+ ],
12
+ "class_embed_type": null,
13
+ "class_embeddings_concat": false,
14
+ "conv_in_kernel": 3,
15
+ "conv_out_kernel": 3,
16
+ "cross_attention_dim": [
17
+ [
18
+ null,
19
+ 768
20
+ ],
21
+ [
22
+ null,
23
+ 768
24
+ ],
25
+ [
26
+ null,
27
+ 768
28
+ ],
29
+ [
30
+ null,
31
+ 768
32
+ ]
33
+ ],
34
+ "down_block_types": [
35
+ "DownBlock2D",
36
+ "CrossAttnDownBlock2D",
37
+ "CrossAttnDownBlock2D",
38
+ "CrossAttnDownBlock2D"
39
+ ],
40
+ "downsample_padding": 1,
41
+ "flip_sin_to_cos": true,
42
+ "freq_shift": 0,
43
+ "in_channels": 8,
44
+ "layers_per_block": 2,
45
+ "mid_block_scale_factor": 1,
46
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
47
+ "norm_eps": 1e-05,
48
+ "norm_num_groups": 32,
49
+ "num_attention_heads": null,
50
+ "num_class_embeds": null,
51
+ "only_cross_attention": false,
52
+ "out_channels": 8,
53
+ "projection_class_embeddings_input_dim": null,
54
+ "resnet_time_scale_shift": "default",
55
+ "sample_size": 262,
56
+ "time_cond_proj_dim": null,
57
+ "time_embedding_act_fn": null,
58
+ "time_embedding_dim": null,
59
+ "time_embedding_type": "positional",
60
+ "timestep_post_act": null,
61
+ "transformer_layers_per_block": 1,
62
+ "up_block_types": [
63
+ "CrossAttnUpBlock2D",
64
+ "CrossAttnUpBlock2D",
65
+ "CrossAttnUpBlock2D",
66
+ "UpBlock2D"
67
+ ],
68
+ "upcast_attention": false,
69
+ "use_linear_projection": false
70
+ }
unet/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6c8adb14c35ce3550290c1d26202401033f2badbe6234a3404894501f27f27e
3
+ size 1048171066
vae/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.22.0.dev0",
4
+ "act_fn": "silu",
5
+ "block_out_channels": [
6
+ 128,
7
+ 256,
8
+ 512
9
+ ],
10
+ "down_block_types": [
11
+ "DownEncoderBlock2D",
12
+ "DownEncoderBlock2D",
13
+ "DownEncoderBlock2D"
14
+ ],
15
+ "force_upcast": true,
16
+ "in_channels": 1,
17
+ "latent_channels": 8,
18
+ "layers_per_block": 2,
19
+ "norm_num_groups": 32,
20
+ "out_channels": 1,
21
+ "sample_size": 1048,
22
+ "scaling_factor": 0.41837412118911743,
23
+ "up_block_types": [
24
+ "UpDecoderBlock2D",
25
+ "UpDecoderBlock2D",
26
+ "UpDecoderBlock2D"
27
+ ]
28
+ }
vae/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c4201e7ebec40dc21ed45d99077db20fd5020dbe5886269e3b83108cd2c4e16
3
+ size 221587226
vocoder/config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SpeechT5HifiGan"
4
+ ],
5
+ "initializer_range": 0.01,
6
+ "leaky_relu_slope": 0.1,
7
+ "model_in_dim": 64,
8
+ "model_type": "hifigan",
9
+ "normalize_before": false,
10
+ "resblock_dilation_sizes": [
11
+ [
12
+ 1,
13
+ 3,
14
+ 5
15
+ ],
16
+ [
17
+ 1,
18
+ 3,
19
+ 5
20
+ ],
21
+ [
22
+ 1,
23
+ 3,
24
+ 5
25
+ ]
26
+ ],
27
+ "resblock_kernel_sizes": [
28
+ 3,
29
+ 7,
30
+ 11
31
+ ],
32
+ "sampling_rate": 16000,
33
+ "torch_dtype": "float32",
34
+ "transformers_version": "4.34.0",
35
+ "upsample_initial_channel": 1024,
36
+ "upsample_kernel_sizes": [
37
+ 16,
38
+ 16,
39
+ 8,
40
+ 4,
41
+ 4
42
+ ],
43
+ "upsample_rates": [
44
+ 5,
45
+ 4,
46
+ 2,
47
+ 2,
48
+ 2
49
+ ]
50
+ }
vocoder/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bfa544480458b5175cb0664a2bc1cc615c2b83c19c71d2c8c3002bba3c93bde
3
+ size 221120794