mtasic85 committed
Commit 99a750a
1 Parent(s): 8c6d2ba

train tokenizer

merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
scripts/TRAIN.md CHANGED
@@ -14,3 +14,9 @@ pip install -U -r requirements.in
 ```bash
 python -B train_tokenizer.py
 ```
+
+## Model
+
+```bash
+python -B train_model.py
+```
scripts/requirements.in CHANGED
@@ -2,4 +2,5 @@ tqdm
 datasets
 jinja2
 transformers
-jsonlines
+jsonlines
+litgpt[all]
scripts/train_model.py CHANGED
@@ -1,34 +1,15 @@
 import gc
-import sys
-
 from datasets import load_dataset, Dataset
-from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
-from transformers import AutoConfig
-from transformers import DataCollatorForLanguageModeling
-
-import torch
-from torch.utils.data import DataLoader
-# import torch.multiprocessing as mp
-
-
-# x = input('Are you sure? [y/N] ')
-#
-# if x not in ('y', 'Y', 'yes'):
-#     sys.exit(0)
 
 
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-# mp.set_start_method('spawn', force=True)
-
-
-def _batch_iterator():
+def batch_iterator():
     ## code
     # dataset = load_dataset('bigcode/programming-languages-keywords', split='train')
-
+    #
     # for row in dataset:
     #     for n in row['keywords']:
     #         yield n
-
+    #
     # del dataset
     # gc.collect()
 
@@ -53,7 +34,6 @@ def _batch_iterator():
 
     del dataset
    gc.collect()
-    return
 
    # text
    dataset = load_dataset('nampdn-ai/tiny-textbooks', split='train')
@@ -186,108 +166,4 @@ def _batch_iterator():
        yield f'{row["character"]}\n{row["unicode"]}\n{row["short description"]}\n{row["tags"]}\n{row["LLM description"]}'
 
    del dataset
-    gc.collect()
-
-
-def batch_iterator():
-    for text in _batch_iterator():
-        row = {'text': text}
-        yield row
-
-tokenizer = AutoTokenizer.from_pretrained('../')
-
-dataset = Dataset.from_generator(batch_iterator)
-print(dataset)
-
-
-def tokenize_function(examples):
-    outputs = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=32 * 1024)
-    outputs['labels'] = outputs['input_ids'].copy()
-    return outputs
-
-
-tokenized_datasets = dataset.map(tokenize_function, batched=True)
-tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.01)
-
-config = AutoConfig.from_pretrained('mistralai/Mistral-7B-Instruct-v0.3')
-config.bos_token_id = tokenizer.bos_token_id
-config.eos_token_id = tokenizer.eos_token_id
-config.unk_token_id = tokenizer.unk_token_id
-config.pad_token_id = tokenizer.pad_token_id
-config.hidden_size = 512
-config.intermediate_size = int(512 * 3.5) # 1792
-config.max_position_embeddings = 32 * 1024 # 32768
-config.num_attention_heads = 12
-config.num_hidden_layers = 10
-config.num_key_value_heads = 4
-config.rope_theta = 1_000_000.0
-config.sliding_window = 4096
-config.torch_dtype = torch.bfloat16
-config.use_cache = False
-print(config)
-
-model = AutoModelForCausalLM.from_config(config)
-model = model.to(torch.bfloat16)
-model = torch.compile(model)
-model.to(device)
-print(model)
-
-training_args = TrainingArguments(
-    output_dir='./results',
-    num_train_epochs=3,
-    per_device_train_batch_size=1, # Adjust based on your GPU memory
-    per_device_eval_batch_size=1,
-    optim='adamw_bnb_8bit',
-    gradient_accumulation_steps=8,
-    gradient_checkpointing=True,
-    warmup_steps=500,
-    weight_decay=0.01,
-    logging_dir='./logs',
-    logging_steps=10,
-    fp16=False,
-    bf16=True,
-    torch_compile=True,
-)
-print(training_args)
-
-data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
-print(data_collator)
-
-def collate_fn(examples):
-    texts = [ex['text'] for ex in examples]
-    batch = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=32*1024, return_token_type_ids=False)
-    batch = {k: v.to(device) for k, v in batch.items()} # Move tensors to GPU
-    batch['labels'] = batch['input_ids'].clone()
-    return batch
-
-train_dataloader = DataLoader(
-    tokenized_datasets["train"],
-    shuffle=True,
-    collate_fn=collate_fn,
-    batch_size=training_args.per_device_train_batch_size,
-    pin_memory=True,
-    # num_workers=4
-)
-
-eval_dataloader = DataLoader(
-    tokenized_datasets["test"],
-    collate_fn=collate_fn,
-    batch_size=training_args.per_device_eval_batch_size,
-    pin_memory=True,
-    # num_workers=4
-)
-
-trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=tokenized_datasets['train'],
-    eval_dataset=tokenized_datasets['test'],
-    tokenizer=tokenizer,
-    data_collator=data_collator,
-)
-
-trainer.get_train_dataloader = lambda: train_dataloader
-trainer.get_eval_dataloader = lambda: eval_dataloader
-
-print(trainer)
-trainer.train()
+    gc.collect()
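After this change, scripts/train_model.py keeps only the data side: batch_iterator() yields raw text strings, the Trainer-based loop is gone, TRAIN.md points at `python -B train_model.py` as a separate step, and requirements.in gains litgpt[all]. A rough sketch, not part of the commit, of how the removed Dataset.from_generator(...) step could be reproduced against the renamed generator; importing train_model as a module is an assumption about how the trimmed script is meant to be used:

```python
# Sketch only: wrap the remaining batch_iterator() back into a datasets.Dataset,
# mirroring the Dataset.from_generator(...) call removed by this commit.
from datasets import Dataset

from train_model import batch_iterator  # assumption: the script is importable as a module


def rows():
    # batch_iterator() yields plain strings, so wrap each one into a record
    for text in batch_iterator():
        yield {'text': text}


dataset = Dataset.from_generator(rows)
print(dataset)
```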
scripts/train_tokenizer.py CHANGED
@@ -4,13 +4,13 @@ import string
 
 from datasets import load_dataset
 from transformers import PreTrainedTokenizerFast
-from tokenizers import Tokenizer, normalizers, decoders, pre_tokenizers
+from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors, decoders
 from tokenizers.models import BPE
 from tokenizers.trainers import BpeTrainer
 from tokenizers.processors import TemplateProcessing
 
 
-x = input('Are you sure?')
+x = input('Are you sure? [y/N] ')
 
 if x not in ('y', 'Y', 'yes'):
     sys.exit(0)
@@ -183,7 +183,7 @@ def batch_iterator():
     # gc.collect()
 
 
-bpe = BPE(unk_token='<unk>', fuse_unk=True, byte_fallback=True)
+bpe = BPE(unk_token='<unk>', fuse_unk=False, byte_fallback=False)
 tokenizer = Tokenizer(bpe)
 
 special_tokens = [
@@ -204,11 +204,12 @@ special_tokens = [
     'tool',
 ]
 
+for i in range(2, 25):
+    special_tokens.append(' ' * i)
+
 for i in range(64 - len(special_tokens)):
     special_tokens.append(f'<|reserved_{i}|>')
 
-# tokenizer.add_special_tokens(special_tokens)
-
 # ascii
 ascii_chars = list(string.ascii_letters + string.ascii_lowercase + string.ascii_uppercase + string.digits + string.punctuation)
 
@@ -222,17 +223,9 @@ dataset = load_dataset('bigcode/programming-languages-keywords', split='train')
 code_keywords = [n for row in dataset for n in row['keywords']]
 del dataset
 
-tokenizer.normalizer = normalizers.Sequence([
-    normalizers.Prepend("▁"),
-    normalizers.Replace(" ", "▁"),
-])
+tokenizer.normalizer = normalizers.NFC()
 
-tokenizer.decoder = decoders.Sequence([
-    decoders.Replace("▁", " "), # Replace ▁ back to space
-    decoders.ByteFallback(),
-    decoders.Fuse(),
-    decoders.Strip(' ', 1, 0),
-])
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
 
 tokenizer.post_processor = TemplateProcessing(
     single='$A:0', # $A represents the token, :0 specifies the type ID for single sequences
@@ -240,12 +233,15 @@ tokenizer.post_processor = TemplateProcessing(
     special_tokens=[],
 )
 
+tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
+
 trainer = BpeTrainer(
-    vocab_size=32064,
-    min_frequency=2,
-    max_token_length=8,
+    vocab_size=32000,
+    # min_frequency=2,
     special_tokens=special_tokens,
     initial_alphabet=ascii_chars + emoji_chars + code_keywords,
+    # continuing_subword_prefix=None,
+    # end_of_word_suffix=None,
 )
 
 tokenizer.train_from_iterator(batch_iterator(), trainer)
@@ -269,8 +265,8 @@ fast_tokenizer = PreTrainedTokenizerFast(
     unk_token='<unk>',
     pad_token='</s>',
     clean_up_tokenization_spaces=False,
-    spaces_between_special_tokens=False,
-    use_default_system_prompt=False,
+    # spaces_between_special_tokens=False,
+    # use_default_system_prompt=False,
 )
 
 fast_tokenizer.save_pretrained('../')
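The retrained tokenizer switches to an NFC normalizer with byte-level pre-tokenization and decoding, a 32000-entry vocabulary, and dedicated run-of-spaces special tokens. A minimal sketch, not part of the commit, for sanity-checking the saved files; the '../' path mirrors the save_pretrained('../') call above, and the printed ids depend on the trained vocabulary:

```python
# Sketch only: load the tokenizer written by scripts/train_tokenizer.py and
# check the byte-level round trip plus the run-of-spaces special tokens.
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained('../')  # same path as save_pretrained('../')

text = 'def main():\n    return 42'
ids = tokenizer.encode(text)
print(ids)
print(tokenizer.decode(ids))  # the ByteLevel decoder should reproduce the input text

# a four-space indent should resolve to the dedicated '    ' added token
print(tokenizer.convert_tokens_to_ids(' ' * 4))
```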
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -121,7 +121,7 @@
       "special": true
     },
     "15": {
-      "content": "<|reserved_0|>",
+      "content": "  ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -129,7 +129,7 @@
       "special": true
     },
     "16": {
-      "content": "<|reserved_1|>",
+      "content": "   ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -137,7 +137,7 @@
       "special": true
     },
     "17": {
-      "content": "<|reserved_2|>",
+      "content": "    ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -145,7 +145,7 @@
       "special": true
     },
     "18": {
-      "content": "<|reserved_3|>",
+      "content": "     ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -153,7 +153,7 @@
       "special": true
     },
     "19": {
-      "content": "<|reserved_4|>",
+      "content": "      ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -161,7 +161,7 @@
       "special": true
     },
     "20": {
-      "content": "<|reserved_5|>",
+      "content": "       ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -169,7 +169,7 @@
       "special": true
     },
     "21": {
-      "content": "<|reserved_6|>",
+      "content": "        ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -177,7 +177,7 @@
       "special": true
     },
     "22": {
-      "content": "<|reserved_7|>",
+      "content": "         ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -185,7 +185,7 @@
       "special": true
     },
     "23": {
-      "content": "<|reserved_8|>",
+      "content": "          ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -193,7 +193,7 @@
       "special": true
     },
     "24": {
-      "content": "<|reserved_9|>",
+      "content": "           ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -201,7 +201,7 @@
       "special": true
     },
     "25": {
-      "content": "<|reserved_10|>",
+      "content": "            ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -209,7 +209,7 @@
       "special": true
     },
     "26": {
-      "content": "<|reserved_11|>",
+      "content": "             ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -217,7 +217,7 @@
       "special": true
     },
     "27": {
-      "content": "<|reserved_12|>",
+      "content": "              ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -225,7 +225,7 @@
       "special": true
     },
     "28": {
-      "content": "<|reserved_13|>",
+      "content": "               ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -233,7 +233,7 @@
       "special": true
     },
     "29": {
-      "content": "<|reserved_14|>",
+      "content": "                ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -241,7 +241,7 @@
       "special": true
     },
     "30": {
-      "content": "<|reserved_15|>",
+      "content": "                 ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -249,7 +249,7 @@
       "special": true
     },
     "31": {
-      "content": "<|reserved_16|>",
+      "content": "                  ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -257,7 +257,7 @@
       "special": true
     },
     "32": {
-      "content": "<|reserved_17|>",
+      "content": "                   ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -265,7 +265,7 @@
       "special": true
     },
     "33": {
-      "content": "<|reserved_18|>",
+      "content": "                    ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -273,7 +273,7 @@
       "special": true
     },
     "34": {
-      "content": "<|reserved_19|>",
+      "content": "                     ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -281,7 +281,7 @@
       "special": true
     },
     "35": {
-      "content": "<|reserved_20|>",
+      "content": "                      ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -289,7 +289,7 @@
       "special": true
     },
     "36": {
-      "content": "<|reserved_21|>",
+      "content": "                       ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -297,7 +297,7 @@
       "special": true
     },
     "37": {
-      "content": "<|reserved_22|>",
+      "content": "                        ",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -305,7 +305,7 @@
       "special": true
     },
     "38": {
-      "content": "<|reserved_23|>",
+      "content": "<|reserved_0|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -313,7 +313,7 @@
       "special": true
     },
     "39": {
-      "content": "<|reserved_24|>",
+      "content": "<|reserved_1|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -321,7 +321,7 @@
       "special": true
     },
     "40": {
-      "content": "<|reserved_25|>",
+      "content": "<|reserved_2|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -329,7 +329,7 @@
       "special": true
     },
     "41": {
-      "content": "<|reserved_26|>",
+      "content": "<|reserved_3|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -337,7 +337,7 @@
       "special": true
     },
     "42": {
-      "content": "<|reserved_27|>",
+      "content": "<|reserved_4|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -345,7 +345,7 @@
       "special": true
     },
     "43": {
-      "content": "<|reserved_28|>",
+      "content": "<|reserved_5|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -353,7 +353,7 @@
       "special": true
     },
     "44": {
-      "content": "<|reserved_29|>",
+      "content": "<|reserved_6|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -361,7 +361,7 @@
       "special": true
     },
     "45": {
-      "content": "<|reserved_30|>",
+      "content": "<|reserved_7|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -369,7 +369,7 @@
       "special": true
     },
     "46": {
-      "content": "<|reserved_31|>",
+      "content": "<|reserved_8|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -377,7 +377,7 @@
       "special": true
     },
     "47": {
-      "content": "<|reserved_32|>",
+      "content": "<|reserved_9|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -385,7 +385,7 @@
       "special": true
     },
     "48": {
-      "content": "<|reserved_33|>",
+      "content": "<|reserved_10|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -393,7 +393,7 @@
       "special": true
     },
     "49": {
-      "content": "<|reserved_34|>",
+      "content": "<|reserved_11|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -401,7 +401,7 @@
       "special": true
     },
     "50": {
-      "content": "<|reserved_35|>",
+      "content": "<|reserved_12|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -409,7 +409,7 @@
       "special": true
     },
     "51": {
-      "content": "<|reserved_36|>",
+      "content": "<|reserved_13|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -417,7 +417,7 @@
       "special": true
     },
     "52": {
-      "content": "<|reserved_37|>",
+      "content": "<|reserved_14|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -425,7 +425,7 @@
       "special": true
     },
     "53": {
-      "content": "<|reserved_38|>",
+      "content": "<|reserved_15|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -433,7 +433,7 @@
       "special": true
     },
     "54": {
-      "content": "<|reserved_39|>",
+      "content": "<|reserved_16|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -441,7 +441,7 @@
       "special": true
     },
     "55": {
-      "content": "<|reserved_40|>",
+      "content": "<|reserved_17|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -449,7 +449,7 @@
       "special": true
     },
     "56": {
-      "content": "<|reserved_41|>",
+      "content": "<|reserved_18|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -457,7 +457,7 @@
       "special": true
     },
     "57": {
-      "content": "<|reserved_42|>",
+      "content": "<|reserved_19|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -465,7 +465,7 @@
       "special": true
     },
     "58": {
-      "content": "<|reserved_43|>",
+      "content": "<|reserved_20|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -473,7 +473,7 @@
       "special": true
     },
     "59": {
-      "content": "<|reserved_44|>",
+      "content": "<|reserved_21|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -481,7 +481,7 @@
       "special": true
     },
     "60": {
-      "content": "<|reserved_45|>",
+      "content": "<|reserved_22|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -489,7 +489,7 @@
       "special": true
     },
     "61": {
-      "content": "<|reserved_46|>",
+      "content": "<|reserved_23|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -497,7 +497,7 @@
       "special": true
     },
     "62": {
-      "content": "<|reserved_47|>",
+      "content": "<|reserved_24|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -505,7 +505,7 @@
       "special": true
     },
     "63": {
-      "content": "<|reserved_48|>",
+      "content": "<|reserved_25|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -519,8 +519,6 @@
   "eos_token": "<|im_end|>",
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "</s>",
-  "spaces_between_special_tokens": false,
   "tokenizer_class": "PreTrainedTokenizerFast",
-  "unk_token": "<unk>",
-  "use_default_system_prompt": false
+  "unk_token": "<unk>"
 }
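The renumbering above follows from the two special-token loops in scripts/train_tokenizer.py: 23 run-of-spaces tokens (2 to 24 spaces) now occupy ids 15-37, pushing the remaining <|reserved_*|> tokens to ids 38-63. A small sketch of that layout, with the ids read off this diff rather than taken from running the script:

```python
# Sketch only: the added-token layout implied by this tokenizer_config.json diff.
whitespace_tokens = {13 + i: ' ' * i for i in range(2, 25)}         # ids 15..37
reserved_tokens = {38 + n: f'<|reserved_{n}|>' for n in range(26)}  # ids 38..63

assert min(whitespace_tokens) == 15 and max(whitespace_tokens) == 37
assert reserved_tokens[38] == '<|reserved_0|>'
assert reserved_tokens[63] == '<|reserved_25|>'
```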
vocab.json CHANGED
The diff for this file is too large to render. See raw diff