pglo committed
Commit: ded606b
Parent: 5507f28

Upload folder using huggingface_hub
config.json CHANGED
@@ -1,16 +1,15 @@
 {
-  "_name_or_path": "Zyphra/Zamba2-2.7B",
+  "_name_or_path": "Zyphra/Zamba2-2.7B-instruct",
   "add_bias_linear": false,
   "architectures": [
     "Zamba2ForCausalLM"
   ],
   "attention_dropout": 0.0,
-  "bos_token_id": 32001,
+  "bos_token_id": 1,
   "conv_dimension": 4,
-  "eos_token_id": 32002,
+  "eos_token_id": 2,
   "expansion_factor": 2,
   "ffn_hidden_size": 10240,
-  "ft_lora": false,
   "gated_linear_unit": true,
   "hidden_size": 2560,
   "initializer_range": 0.02,
@@ -72,6 +71,7 @@
     "m"
   ],
   "lora_rank": 128,
+  "lora_rank_mamba": 128,
   "mamba_headdim": 64,
   "max_position_embeddings": 4096,
   "model_type": "zamba2",
@@ -81,18 +81,18 @@
   "num_logits_to_keep": 1,
   "num_mem_blocks": 2,
   "num_query_groups": 32,
-  "pad_token_id": 32002,
+  "pad_token_id": 0,
   "rms_norm_eps": 1e-05,
   "rope_theta": 10000,
   "se_shared_attention_lora": false,
   "sliding_window": null,
   "state_size": 64,
-  "torch_dtype": "bfloat16",
+  "torch_dtype": "float32",
   "transformers_version": "4.43.0.dev0",
   "use_cache": false,
   "use_mamba_kernels": true,
   "use_mem_rope": false,
   "use_shared_attention_lora": false,
   "use_shared_block_lora": true,
-  "vocab_size": 32003
+  "vocab_size": 32000
 }
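This retargets the special-token ids from the appended slots (32001/32002) onto the base SentencePiece slots (bos 1, eos 2, pad 0), shrinks the vocab back to 32000, drops the ft_lora flag, and adds lora_rank_mamba. A minimal sketch to confirm the ids after the update, assuming a transformers build that registers the zamba2 architecture (the config pins 4.43.0.dev0):

from transformers import AutoConfig

config = AutoConfig.from_pretrained("Zyphra/Zamba2-2.7B-instruct")

assert config.bos_token_id == 1    # was 32001; <|im_start|> now reuses id 1
assert config.eos_token_id == 2    # was 32002; <|im_end|> now reuses id 2
assert config.pad_token_id == 0    # was 32002; [PAD] now reuses id 0
assert config.vocab_size == 32000  # was 32003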
generation_config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "_from_model_config": true,
-  "bos_token_id": 32001,
-  "eos_token_id": 32002,
-  "pad_token_id": 32002,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
   "transformers_version": "4.43.0.dev0"
 }
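Because "_from_model_config" is true, these generation defaults simply mirror the ids set in config.json; a quick consistency check under the same assumptions as above:

from transformers import GenerationConfig

gen = GenerationConfig.from_pretrained("Zyphra/Zamba2-2.7B-instruct")
assert (gen.bos_token_id, gen.eos_token_id, gen.pad_token_id) == (1, 2, 0)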
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:87a0ced325f07efcbd99c9e3320137b4bcca78fb52c7652a97813109bf0570e4
-size 4994068408
+oid sha256:02bfb07b081a97e16ba1222a980a4cb7bf5071981da538b0ed8247566773d870
+size 4998338056
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:abe294ae533c70a2adb36f3145f97700bb68318f4775934f31a8bd9c93356964
-size 383523184
+oid sha256:feb5b364c9438d060ab89931bd62c7065e32495d8cd2c5aff6bea5baf5d1a724
+size 543078152
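Both shards were re-exported; shard 2 grows by ~160 MB, consistent with the mamba_layers.48-50 tensors that move into it in the index diff below. To verify a local download against these LFS pointers, a small stdlib-only sketch (paths assume the shards sit in the current directory):

import hashlib

def lfs_sha256(path, chunk=1 << 20):
    # Stream the file so multi-GB shards never need to fit in memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):
            h.update(block)
    return h.hexdigest()

# Expected digests come from the updated pointers above.
assert lfs_sha256("model-00001-of-00002.safetensors").startswith("02bfb07b")
assert lfs_sha256("model-00002-of-00002.safetensors").startswith("feb5b364")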
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
 {
   "metadata": {
-    "total_size": 5377528640
+    "total_size": 5541353280
   },
   "weight_map": {
     "model.blocks.0.feed_forward.linear_fc1.weight": "model-00001-of-00002.safetensors",
@@ -453,7 +453,7 @@
     "model.mamba_layers.47.mamba.in_proj.0.weight": "model-00001-of-00002.safetensors",
     "model.mamba_layers.47.mamba.norm.weight": "model-00001-of-00002.safetensors",
     "model.mamba_layers.47.mamba.out_proj.weight": "model-00001-of-00002.safetensors",
-    "model.mamba_layers.48.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.mamba_layers.48.input_layernorm.weight": "model-00002-of-00002.safetensors",
     "model.mamba_layers.48.mamba.A_log": "model-00001-of-00002.safetensors",
     "model.mamba_layers.48.mamba.D": "model-00001-of-00002.safetensors",
     "model.mamba_layers.48.mamba.conv1d.bias": "model-00001-of-00002.safetensors",
@@ -461,16 +461,16 @@
     "model.mamba_layers.48.mamba.dt_bias": "model-00001-of-00002.safetensors",
     "model.mamba_layers.48.mamba.in_proj.0.weight": "model-00001-of-00002.safetensors",
     "model.mamba_layers.48.mamba.norm.weight": "model-00001-of-00002.safetensors",
-    "model.mamba_layers.48.mamba.out_proj.weight": "model-00001-of-00002.safetensors",
-    "model.mamba_layers.49.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.mamba_layers.49.mamba.A_log": "model-00001-of-00002.safetensors",
-    "model.mamba_layers.49.mamba.D": "model-00001-of-00002.safetensors",
-    "model.mamba_layers.49.mamba.conv1d.bias": "model-00001-of-00002.safetensors",
-    "model.mamba_layers.49.mamba.conv1d.weight": "model-00001-of-00002.safetensors",
-    "model.mamba_layers.49.mamba.dt_bias": "model-00001-of-00002.safetensors",
-    "model.mamba_layers.49.mamba.in_proj.0.weight": "model-00001-of-00002.safetensors",
-    "model.mamba_layers.49.mamba.norm.weight": "model-00001-of-00002.safetensors",
-    "model.mamba_layers.49.mamba.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.mamba_layers.48.mamba.out_proj.weight": "model-00002-of-00002.safetensors",
+    "model.mamba_layers.49.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.mamba_layers.49.mamba.A_log": "model-00002-of-00002.safetensors",
+    "model.mamba_layers.49.mamba.D": "model-00002-of-00002.safetensors",
+    "model.mamba_layers.49.mamba.conv1d.bias": "model-00002-of-00002.safetensors",
+    "model.mamba_layers.49.mamba.conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.mamba_layers.49.mamba.dt_bias": "model-00002-of-00002.safetensors",
+    "model.mamba_layers.49.mamba.in_proj.0.weight": "model-00002-of-00002.safetensors",
+    "model.mamba_layers.49.mamba.norm.weight": "model-00002-of-00002.safetensors",
+    "model.mamba_layers.49.mamba.out_proj.weight": "model-00002-of-00002.safetensors",
     "model.mamba_layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
     "model.mamba_layers.5.mamba.A_log": "model-00001-of-00002.safetensors",
     "model.mamba_layers.5.mamba.D": "model-00001-of-00002.safetensors",
@@ -481,13 +481,13 @@
     "model.mamba_layers.5.mamba.norm.weight": "model-00001-of-00002.safetensors",
     "model.mamba_layers.5.mamba.out_proj.weight": "model-00001-of-00002.safetensors",
     "model.mamba_layers.50.input_layernorm.weight": "model-00002-of-00002.safetensors",
-    "model.mamba_layers.50.mamba.A_log": "model-00001-of-00002.safetensors",
-    "model.mamba_layers.50.mamba.D": "model-00001-of-00002.safetensors",
-    "model.mamba_layers.50.mamba.conv1d.bias": "model-00001-of-00002.safetensors",
-    "model.mamba_layers.50.mamba.conv1d.weight": "model-00001-of-00002.safetensors",
-    "model.mamba_layers.50.mamba.dt_bias": "model-00001-of-00002.safetensors",
-    "model.mamba_layers.50.mamba.in_proj.0.weight": "model-00001-of-00002.safetensors",
-    "model.mamba_layers.50.mamba.norm.weight": "model-00001-of-00002.safetensors",
+    "model.mamba_layers.50.mamba.A_log": "model-00002-of-00002.safetensors",
+    "model.mamba_layers.50.mamba.D": "model-00002-of-00002.safetensors",
+    "model.mamba_layers.50.mamba.conv1d.bias": "model-00002-of-00002.safetensors",
+    "model.mamba_layers.50.mamba.conv1d.weight": "model-00002-of-00002.safetensors",
+    "model.mamba_layers.50.mamba.dt_bias": "model-00002-of-00002.safetensors",
+    "model.mamba_layers.50.mamba.in_proj.0.weight": "model-00002-of-00002.safetensors",
+    "model.mamba_layers.50.mamba.norm.weight": "model-00002-of-00002.safetensors",
     "model.mamba_layers.50.mamba.out_proj.weight": "model-00002-of-00002.safetensors",
     "model.mamba_layers.51.input_layernorm.weight": "model-00002-of-00002.safetensors",
     "model.mamba_layers.51.mamba.A_log": "model-00002-of-00002.safetensors",
special_tokens_map.json CHANGED
@@ -1,23 +1,29 @@
 {
   "additional_special_tokens": [
-    {
-      "content": "<|im_start|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false
-    },
-    {
-      "content": "<|im_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false
-    }
+    "<|im_start|>",
+    "<|im_end|>"
   ],
-  "bos_token": "<s>",
-  "eos_token": "</s>",
-  "pad_token": "[PAD]",
+  "bos_token": {
+    "content": "<|im_start|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "unk_token": {
     "content": "<unk>",
     "lstrip": false,
tokenizer.json CHANGED
@@ -1,42 +1,10 @@
 {
   "version": "1.0",
-  "truncation": {
-    "direction": "Right",
-    "max_length": 4096,
-    "strategy": "LongestFirst",
-    "stride": 0
-  },
+  "truncation": null,
   "padding": null,
   "added_tokens": [
     {
       "id": 0,
-      "content": "<unk>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 1,
-      "content": "<s>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 2,
-      "content": "</s>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 32000,
       "content": "[PAD]",
       "single_word": false,
       "lstrip": false,
@@ -45,7 +13,7 @@
       "special": true
     },
     {
-      "id": 32001,
+      "id": 1,
       "content": "<|im_start|>",
       "single_word": false,
       "lstrip": false,
@@ -54,7 +22,7 @@
       "special": true
     },
     {
-      "id": 32002,
+      "id": 2,
       "content": "<|im_end|>",
       "single_word": false,
       "lstrip": false,
@@ -85,7 +53,7 @@
     "single": [
       {
         "SpecialToken": {
-          "id": "<s>",
+          "id": "<|im_start|>",
           "type_id": 0
         }
       },
@@ -99,7 +67,7 @@
     "pair": [
       {
         "SpecialToken": {
-          "id": "<s>",
+          "id": "<|im_start|>",
           "type_id": 0
         }
       },
@@ -111,7 +79,7 @@
       },
       {
         "SpecialToken": {
-          "id": "<s>",
+          "id": "<|im_start|>",
           "type_id": 1
         }
       },
@@ -123,13 +91,13 @@
       }
     ],
     "special_tokens": {
-      "<s>": {
-        "id": "<s>",
+      "<|im_start|>": {
+        "id": "<|im_start|>",
         "ids": [
           1
         ],
         "tokens": [
-          "<s>"
+          "<|im_start|>"
         ]
       }
     }
@@ -168,9 +136,9 @@
     "byte_fallback": true,
     "ignore_merges": false,
     "vocab": {
-      "<unk>": 0,
-      "<s>": 1,
-      "</s>": 2,
+      "[PAD]": 0,
+      "<|im_start|>": 1,
+      "<|im_end|>": 2,
       "<0x00>": 3,
       "<0x01>": 4,
       "<0x02>": 5,
tokenizer_config.json CHANGED
@@ -4,30 +4,6 @@
   "add_prefix_space": null,
   "added_tokens_decoder": {
     "0": {
-      "content": "<unk>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "1": {
-      "content": "<s>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "2": {
-      "content": "</s>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "32000": {
       "content": "[PAD]",
       "lstrip": false,
       "normalized": false,
@@ -35,7 +11,7 @@
       "single_word": false,
       "special": true
     },
-    "32001": {
+    "1": {
       "content": "<|im_start|>",
       "lstrip": false,
       "normalized": false,
@@ -43,7 +19,7 @@
       "single_word": false,
       "special": true
     },
-    "32002": {
+    "2": {
       "content": "<|im_end|>",
       "lstrip": false,
       "normalized": false,
@@ -56,16 +32,20 @@
     "<|im_start|>",
     "<|im_end|>"
   ],
-  "bos_token": "<s>",
+  "bos_token": "<|im_start|>",
   "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
   "clean_up_tokenization_spaces": false,
-  "eos_token": "</s>",
+  "eos_token": "<|im_end|>",
   "legacy": true,
+  "max_length": 4096,
   "model_max_length": 4096,
   "pad_token": "[PAD]",
   "sp_model_kwargs": {},
   "spaces_between_special_tokens": false,
+  "stride": 0,
   "tokenizer_class": "LlamaTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
   "unk_token": "<unk>",
   "use_default_system_prompt": false
 }
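With bos/eos now matching the markers the (unchanged) chat_template emits, generation can terminate cleanly on <|im_end|>. A usage sketch under the same assumptions as above:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Zyphra/Zamba2-2.7B-instruct")
messages = [{"role": "user", "content": "Hello!"}]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant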