mtasic85 committed
Commit c62a845
1 parent: fd468b1

new tokenizer 38400

merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
scripts/model.yaml CHANGED
@@ -5,13 +5,13 @@ model_name: "tiny-llama-1.1b"
 # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
 # ``model_config``. (type: Optional[Config], default: null)
 model_config:
-  padded_vocab_size: 32768
-  vocab_size: 32768
+  padded_vocab_size: 38400
+  vocab_size: 38400
   block_size: 131072
-  n_layer: 10
+  n_layer: 5
   n_head: 32
   head_size: null
-  n_embd: 320
+  n_embd: 1024
   n_query_groups: 8
   rotary_percentage: 1.0
   parallel_residual: false
@@ -19,7 +19,7 @@ model_config:
   norm_class_name: "RMSNorm"
   norm_eps: 1e-05
   mlp_class_name: "LLaMAMLP"
-  intermediate_size: 1120
+  intermediate_size: 3584
   rope_base: 1000000
 
 # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
@@ -52,7 +52,7 @@ data:
 # Training-related arguments. See ``litgpt.args.TrainArgs`` for details
 train:
   # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
-  save_interval: 1000
+  save_interval: 100
 
   # Number of iterations between logging calls (type: int, default: 1)
   log_interval: 1
@@ -61,7 +61,7 @@ train:
   global_batch_size: 512
 
   # Number of samples per data-parallel rank (type: int, default: 4)
-  micro_batch_size: 4
+  micro_batch_size: 8
 
   # Number of iterations with learning rate warmup active (type: int, default: 2000)
   lr_warmup_steps: 2000
@@ -71,13 +71,13 @@ train:
 
   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
   # max_tokens: 3000000000000
-  max_tokens: 8628998688 # 351072 * 8193 * 3
+  max_tokens: ??? # ? * 8193 * 3
 
   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
   max_steps:
 
   # Limits the length of samples. Off by default (type: Optional[int], default: null)
-  max_seq_length: 8192
+  max_seq_length: 4096
 
   # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
   tie_embeddings:
@@ -86,7 +86,7 @@ train:
   max_norm: 1.0
 
   # (type: float, default: 4e-05)
-  min_lr: 4e-05
+  min_lr: 1e-4
 
 # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
 eval:
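
For scale, here is a back-of-the-envelope parameter count for the updated model_config (a sketch under stated assumptions: head_size is taken as n_embd / n_head because the config leaves it null, and the LM head is counted untied because tie_embeddings is unset):

# Rough parameter count for the new config (assumptions noted inline).
vocab = 38400              # padded_vocab_size / vocab_size
d = 1024                   # n_embd
n_layer = 5
n_head = 32
n_kv = 8                   # n_query_groups
head_size = d // n_head    # 32; the config has head_size: null
ffn = 3584                 # intermediate_size

attn = d * d + 2 * d * (n_kv * head_size) + d * d   # Q, K/V (GQA), O projections
mlp = 3 * d * ffn                                   # LLaMAMLP: gate, up, down
per_layer = attn + mlp + 2 * d                      # plus two RMSNorm weights
total = vocab * d + n_layer * per_layer + d + vocab * d   # embeddings, blocks, final norm, untied head
print(f'~{total / 1e6:.0f}M parameters')            # ~147M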
scripts/prepare_pretrain_dataset.py CHANGED
@@ -7,11 +7,43 @@ from functools import partial
 
 
 def batch_iterator(name=None):
+    # text
+    if name in (None, 'saillab/taco-datasets'):
+        dataset = (
+            load_dataset(name, data_dir=data_dir, split='train')
+            for data_dir in [
+                'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
+                'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
+            ]
+        )
+
+        for d in dataset:
+            for row in d:
+                yield row['instruction'] + '\n' + row['input'] + '\n' + row['output']
+
+        del dataset
+        gc.collect()
+
     # text
     if name in (None, 'xu-song/cc100-samples'):
         dataset = (
             load_dataset(name, lang, split='train')
-            for lang in ['am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br', 'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu', 'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt', 'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw', 'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl', 'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom', 'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur', 'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo', 'zh-Hans', 'zh-Hant', 'zu']
+            for lang in [
+                'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
+                'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
+                'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl',
+                'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu',
+                'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km',
+                'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt',
+                'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw',
+                'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
+                'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl',
+                'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom',
+                'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur',
+                'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
+                'zh-Hans', 'zh-Hant', 'zu',
+            ]
         )
 
         for d in dataset:
@@ -21,19 +53,48 @@ def batch_iterator(name=None):
         del dataset
         gc.collect()
 
+    # text
+    if name in (None, 'ontocord/fineweb-permissive-multilingual-2m'):
+        dataset = load_dataset(name, split='train')
+
+        for row in dataset:
+            yield row['text']
+
+        del dataset
+        gc.collect()
+
+    # text
+    if name in (None, 'nampdn-ai/tiny-textbooks'):
+        for split in ['train', 'test']:
+            dataset = load_dataset(name, split=split)
+
+            for row in dataset:
+                yield row['textbook']
+
+            del dataset
+            gc.collect()
+
     # code
     if name in (None, 'bigcode/the-stack-smol-xs'):
         dataset = (
             load_dataset(name, lang, split='train', trust_remote_code=True)
             for lang in [
-                'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly', 'augeas', 'awk', 'batchfile', 'bison', 'bluespec', 'c',
-                'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp', 'css', 'cuda', 'dart', 'dockerfile', 'elixir',
-                'elm', 'emacs-lisp', 'erlang', 'f-sharp', 'fortran', 'glsl', 'go', 'groovy', 'haskell', 'html', 'idris', 'isabelle', 'java',
-                'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean', 'literate-agda', 'literate-coffeescript', 'literate-haskell',
-                'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab', 'ocaml', 'pascal', 'perl', 'php', 'powershell', 'prolog',
-                'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext', 'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
-                'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan', 'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
-                'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt', 'yacc', 'zig'
+                'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
+                'augeas', 'awk', 'batchfile', 'bison', 'bluespec', 'c',
+                'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
+                'css', 'cuda', 'dart', 'dockerfile', 'elixir',
+                'elm', 'emacs-lisp', 'erlang', 'f-sharp', 'fortran', 'glsl', 'go',
+                'groovy', 'haskell', 'html', 'idris', 'isabelle', 'java',
+                'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
+                'literate-agda', 'literate-coffeescript', 'literate-haskell',
+                'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
+                'ocaml', 'pascal', 'perl', 'php', 'powershell', 'prolog',
+                'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
+                'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
+                'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
+                'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
+                'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt',
+                'yacc', 'zig',
             ]
         )
 
@@ -44,17 +105,17 @@ def batch_iterator(name=None):
         del dataset
         gc.collect()
 
-    # text
-    if name in (None, 'nampdn-ai/tiny-textbooks'):
+    # code
+    if name in (None, 'nampdn-ai/tiny-codes'):
         dataset = load_dataset(name, split='train')
 
         for row in dataset:
-            yield row['textbook']
+            yield row['prompt'] + '\n' + row['response']
 
         del dataset
         gc.collect()
 
-    # code
+    # text + code
     if name in (None, 'm-a-p/CodeFeedback-Filtered-Instruction'):
         dataset = load_dataset(name, split='train')
 
@@ -64,12 +125,12 @@ def batch_iterator(name=None):
         del dataset
         gc.collect()
 
-    # code
-    if name in (None, 'nampdn-ai/tiny-codes'):
+    # math
+    if name in (None, 'gair-prox/open-web-math-pro'):
        dataset = load_dataset(name, split='train')
 
         for row in dataset:
-            yield row['prompt'] + '\n' + row['response']
+            yield row['text']
 
         del dataset
         gc.collect()
@@ -114,29 +175,6 @@ def batch_iterator(name=None):
         del dataset
         gc.collect()
 
-    # instructions
-    alpaca_datasets_names = [
-        'saillab/alpaca-english-cleaned',
-        'saillab/alpaca-serbian-cleaned',
-        'saillab/alpaca-croatian-cleaned',
-        'saillab/alpaca-bosnian-cleaned',
-        'saillab/alpaca-macedonian-cleaned',
-        'saillab/alpaca-slovenian-cleaned',
-    ]
-
-    if name in (None, *alpaca_datasets_names):
-        for split in ['train', 'test']:
-            dataset = load_dataset(name, split=split)
-
-            for row in dataset:
-                if row['input'] in (None, '', 'nan'):
-                    yield row['instruction'] + '\n' + row['output']
-                else:
-                    yield row['instruction'] + '\n' + row['input'] + '\n' + row['output']
-
-            del dataset
-            gc.collect()
-
 
 def tokenize_fn(dataset_name, tokenizer=None):
     for text in batch_iterator(dataset_name):
@@ -145,21 +183,18 @@ def tokenize_fn(dataset_name, tokenizer=None):
 
 
 datasets_names = [
+    'saillab/taco-datasets',
     'xu-song/cc100-samples',
-    'bigcode/the-stack-smol-xs',
+    'ontocord/fineweb-permissive-multilingual-2m',
     'nampdn-ai/tiny-textbooks',
-    'm-a-p/CodeFeedback-Filtered-Instruction',
+    'bigcode/the-stack-smol-xs',
     'nampdn-ai/tiny-codes',
+    'm-a-p/CodeFeedback-Filtered-Instruction',
+    'gair-prox/open-web-math-pro',
     'ajibawa-2023/Maths-College',
     'microsoft/orca-math-word-problems-200k',
     'datatab/orca_math_world_problem_200k_serbian',
     'badrex/llm-emoji-dataset',
-    'saillab/alpaca-english-cleaned',
-    'saillab/alpaca-serbian-cleaned',
-    'saillab/alpaca-croatian-cleaned',
-    'saillab/alpaca-bosnian-cleaned',
-    'saillab/alpaca-macedonian-cleaned',
-    'saillab/alpaca-slovenian-cleaned',
 ]
 
 outputs = optimize(
@@ -167,6 +202,6 @@ outputs = optimize(
     inputs=datasets_names,
     output_dir='../data/',
     # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
-    chunk_size=(8193 * 2003),
+    chunk_size=(4097 * 4006),
     num_workers=16,
 )
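
The chunk_size change tracks the new sequence length: one packed sample holds max_seq_length + 1 = 4097 tokens (the inputs plus the one-token-shifted targets), and 4006 samples per chunk keeps a chunk near the 64MB the comment mentions. A quick sanity check, assuming 4-byte token ids on disk:

# Sanity-check the new chunk_size (a sketch; assumes 4-byte token ids).
max_seq_length = 4096
sample_len = max_seq_length + 1      # 4097: inputs plus shifted next-token targets
samples_per_chunk = 4006
chunk_tokens = sample_len * samples_per_chunk
print(chunk_tokens)                  # 16412582 tokens per chunk
print(f'{chunk_tokens * 4 / 2**20:.1f} MiB')   # ~62.6 MiB, i.e. "roughly 64MB"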
scripts/train_tokenizer.py CHANGED
@@ -110,11 +110,11 @@ def batch_iterator():
     gc.collect()
 
     # math
-    dataset = load_dataset('microsoft/orca-math-word-problems-200k', split='train')
+    dataset = load_dataset('gair-prox/open-web-math-pro', split='train')
 
     for row in dataset:
-        yield row['question'] + '\n' + row['answer']
+        yield row['text']
 
     del dataset
     gc.collect()
 
@@ -127,6 +127,15 @@ def batch_iterator():
     del dataset
     gc.collect()
 
+    # math
+    dataset = load_dataset('microsoft/orca-math-word-problems-200k', split='train')
+
+    for row in dataset:
+        yield row['question'] + '\n' + row['answer']
+
+    del dataset
+    gc.collect()
+
     # emoji
     dataset = load_dataset('badrex/llm-emoji-dataset', split='train')
 
@@ -206,7 +215,7 @@ special_tokens = [
 for i in range(2, 25):
     special_tokens.append(' ' * i)
 
-for i in range(128 - len(special_tokens)):
+for i in range(64 - len(special_tokens)):
     special_tokens.append(f'<|reserved_{i}|>')
 
 # emoji
@@ -235,7 +244,7 @@ tokenizer.post_processor = TemplateProcessing(
 tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
 
 trainer = BpeTrainer(
-    vocab_size=131072, # 2 ** 17, 128k
+    vocab_size=38400, # 32768 chars + 5034 emojis
     min_frequency=2,
     special_tokens=special_tokens,
     initial_alphabet=emoji_chars + programming_languages + code_keywords,
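
The trainer change is the core of the commit: the BPE vocabulary drops from 131072 to 38400, and the reserved-token fill shrinks from 128 to 64 slots. Below is a minimal, self-contained sketch of how these pieces fit together; the special_tokens, initial_alphabet, and batch_iterator here are stand-ins for the ones this script builds, and the ByteLevel pre-tokenizer line is an assumption (that part of the script is outside this hunk):

# Minimal sketch of the tokenizer-training flow (stand-ins noted inline).
from tokenizers import Tokenizer, decoders, models, pre_tokenizers
from tokenizers.trainers import BpeTrainer

# Stand-ins for the lists this script builds earlier (illustrative only).
special_tokens = ['<unk>', '<s>', '</s>'] + [' ' * i for i in range(2, 25)]
initial_alphabet = ['😀', '🚀']   # stands in for emoji_chars + code characters

def batch_iterator():
    # The real iterator streams the HF datasets listed in this script.
    yield from ['hello world', 'def f(x):\n    return x']

tokenizer = Tokenizer(models.BPE(unk_token='<unk>'))
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)   # assumed setup
tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)

trainer = BpeTrainer(
    vocab_size=38400,               # down from 131072 in this commit
    min_frequency=2,
    special_tokens=special_tokens,
    initial_alphabet=initial_alphabet,
)
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
tokenizer.save('tokenizer.json')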
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -639,390 +639,6 @@
       "rstrip": false,
       "single_word": false,
       "special": true
-    },
-    "80": {
-      "content": "<|reserved_0|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "81": {
-      "content": "<|reserved_1|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
⋮ (entries "82" through "126", i.e. "<|reserved_2|>" through "<|reserved_46|>", removed in the same pattern)
-    "127": {
-      "content": "<|reserved_47|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
     }
   },
   "bos_token": "<s>",
vocab.json CHANGED
The diff for this file is too large to render. See raw diff