shizhediao2 committed
Commit d787118
1 Parent(s): 73efe29

Upload config.json with huggingface_hub

Files changed (1)
  1. config.json +233 -0
config.json ADDED
@@ -0,0 +1,233 @@
+{
+  "architectures": [
+    "JambaForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "attn_hidden_size": -1,
+  "attn_implementation": "flex",
+  "attn_layer_offset": 100,
+  "attn_layer_period": 1,
+  "attn_reuse_every_i_layer": -1,
+  "bos_token_id": 1,
+  "calc_logits_for_entire_prompt": false,
+  "compact_gating": false,
+  "compute_attn_mat": false,
+  "conv_dim": {
+    "0": 3200,
+    "1": 3200,
+    "2": 3200,
+    "3": 3200,
+    "4": 3200,
+    "5": 3200,
+    "6": 3200,
+    "7": 3200,
+    "8": 3200,
+    "9": 3200,
+    "10": 3200,
+    "11": 3200,
+    "12": 3200,
+    "13": 3200,
+    "14": 3200,
+    "15": 3200,
+    "16": 3200,
+    "17": 3200,
+    "18": 3200,
+    "19": 3200,
+    "20": 3200,
+    "21": 3200,
+    "22": 3200,
+    "23": 3200,
+    "24": 3200,
+    "25": 3200,
+    "26": 3200,
+    "27": 3200,
+    "28": 3200,
+    "29": 3200,
+    "30": 3200,
+    "31": 3200
+  },
+  "dense_public_ffn_structure": false,
+  "double_v_dim": false,
+  "enable_mod": false,
+  "eos_token_id": 2,
+  "expert_layer_offset": 1000,
+  "expert_layer_period": 1000,
+  "ffn_reuse_every_i_layer": -1,
+  "ffn_sharing_config": null,
+  "fully_parallel_jamba": false,
+  "fused_multihead_config": {
+    "conv_attn": null,
+    "diverse_head_merge_op": "mean_wo_gate",
+    "expand_v": true,
+    "qkv_mode": "orig",
+    "share_qk": false
+  },
+  "global_attn_idx": [
+    0,
+    15,
+    31
+  ],
+  "gradient_checkpoint_layer": null,
+  "hidden_act": "silu",
+  "hidden_size": 1600,
+  "hybrid_block_indices": [],
+  "hybrid_decoder_layer": "mamba",
+  "initializer_range": 0.02,
+  "intermediate_size": 5504,
+  "kq_head_dim": -1,
+  "kq_norm": "none",
+  "kv_reuse_every_i_layer": -1,
+  "kv_reuse_group": [
+    [
+      1,
+      2
+    ],
+    [
+      3,
+      4
+    ],
+    [
+      5,
+      6
+    ],
+    [
+      7,
+      8
+    ],
+    [
+      9,
+      10
+    ],
+    [
+      11,
+      12
+    ],
+    [
+      13,
+      14
+    ],
+    [
+      16,
+      17,
+      18
+    ],
+    [
+      19,
+      20
+    ],
+    [
+      21,
+      22
+    ],
+    [
+      23,
+      24
+    ],
+    [
+      25,
+      26
+    ],
+    [
+      27,
+      28
+    ],
+    [
+      29,
+      30
+    ]
+  ],
+  "kv_weight_reuse": false,
+  "layer_type": [
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h",
+    "h"
+  ],
+  "local_expand_ratio": 1,
+  "local_global_dual_branch": false,
+  "local_global_dual_branch_merge_op": "mean",
+  "lookback_mode": "",
+  "macro_arch": "",
+  "mamba2_headdim": 64,
+  "mamba_conv_bias": true,
+  "mamba_d_conv": 4,
+  "mamba_d_state": 16,
+  "mamba_dt_rank": 100,
+  "mamba_expand": 2,
+  "mamba_inner_layernorms": true,
+  "mamba_latent_size": null,
+  "mamba_multihead_config": null,
+  "mamba_proj_bias": false,
+  "mamba_reuse_every_i_layer": -1,
+  "max_position_embeddings": 2176,
+  "memory_tokens_interspersed_every": 0,
+  "mlp_hidden_act": "silu",
+  "mod_topk": 2,
+  "model_type": "jamba",
+  "moe_config": null,
+  "num_attention_heads": 25,
+  "num_attn_per_ffn": 3,
+  "num_experts": 1,
+  "num_experts_per_tok": 1,
+  "num_ffn": 1,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 5,
+  "num_mamba": 1,
+  "num_memory_tokens": 128,
+  "output_router_logits": false,
+  "pad_token_id": 0,
+  "public_ffn_structure": false,
+  "reduce_attn_ratio": 0.5,
+  "reduce_method": "mean",
+  "rms_norm_eps": 1e-06,
+  "rope": true,
+  "rope_theta": 10000.0,
+  "rope_type": null,
+  "router_aux_loss_coef": 0.001,
+  "save_input_output": false,
+  "seq_length": 2048,
+  "sequential_jamba": false,
+  "share_kv": false,
+  "shared_module_attn": "",
+  "shared_module_mamba": "",
+  "sliding_window": 1024,
+  "sliding_window_size": null,
+  "swa_full_head": false,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.44.1",
+  "use_cache": false,
+  "use_mamba2": false,
+  "use_mamba_kernels": true,
+  "v_head_dim": 128,
+  "visual_attn": false,
+  "vocab_size": 32001
+}
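
For a quick sanity check of the uploaded file, the values above can be cross-checked with nothing beyond the Python standard library. The sketch below is illustrative, not part of the commit: it assumes the file has been downloaded locally as config.json, and the comments on kv_reuse_group reflect a plain reading of the field name rather than documented behavior.

import json

# Load the config from this commit (assumes a local copy at ./config.json).
with open("config.json") as f:
    cfg = json.load(f)

print(cfg["model_type"], cfg["architectures"])  # jamba ['JambaForCausalLM']
print("layers:", cfg["num_hidden_layers"])      # 32, all tagged "h" in layer_type

# Per-head width implied by the attention settings: 1600 / 25 = 64.
print("head_dim:", cfg["hidden_size"] // cfg["num_attention_heads"])

# Grouped-query attention: 25 query heads over 5 KV heads.
print("q heads per kv head:", cfg["num_attention_heads"] // cfg["num_key_value_heads"])

# Every conv_dim entry is 3200, i.e. mamba_expand * hidden_size (2 * 1600).
assert all(v == cfg["mamba_expand"] * cfg["hidden_size"] for v in cfg["conv_dim"].values())

# kv_reuse_group presumably lists layers that share a KV cache; note that
# layers 0, 15, and 31 (the global_attn_idx entries) appear in no group.
grouped = {layer for group in cfg["kv_reuse_group"] for layer in group}
print(sorted(set(range(cfg["num_hidden_layers"])) - grouped))  # [0, 15, 31]

With a transformers build that includes this Jamba variant, the same fields would surface as attributes on the object returned by AutoConfig.from_pretrained; the non-standard keys here (attn_reuse_every_i_layer, fused_multihead_config, and so on) suggest the config targets a custom fork rather than stock transformers 4.44.1.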