mtasic85 committed on
Commit
b1a1586
1 Parent(s): 75050d9
Files changed (5) hide show
  1. merges.txt +0 -0
  2. scripts/train_tokenizer.py +70 -41
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +569 -57
  5. vocab.json +0 -0
merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
scripts/train_tokenizer.py CHANGED
@@ -151,36 +151,36 @@ def batch_iterator():
151
  del dataset
152
  gc.collect()
153
 
154
- ## instruction
155
- # dataset = load_dataset('arcee-ai/agent-data', split='train')
156
- #
157
- # for row in dataset['conversations']:
158
- # yield '\n'.join(n['value'] for n in row)
159
- #
160
- # del dataset
161
- # gc.collect()
162
 
163
- ## instruction
164
- # dataset = (
165
- # load_dataset('cognitivecomputations/SystemChat-2.0', data_files='SystemChat_filtered.jsonl', split='train'),
166
- # load_dataset('cognitivecomputations/SystemChat-2.0', data_files='SystemChat_multilingual.jsonl', split='train'),
167
- # )
168
- #
169
- # for d in dataset:
170
- # for row in d['messages']:
171
- # yield '\n'.join(n['content'] for n in row)
172
- #
173
- # del dataset
174
- # gc.collect()
175
 
176
- ## emoji
177
- # dataset = load_dataset('badrex/llm-emoji-dataset', split='train')
178
- #
179
- # for row in dataset:
180
- # yield f'{row["character"]}\n{row["unicode"]}\n{row["short description"]}\n{row["tags"]}\n{row["LLM description"]}'
181
- #
182
- # del dataset
183
- # gc.collect()
184
 
185
 
186
  bpe = BPE(unk_token='<unk>', fuse_unk=True, byte_fallback=True)
@@ -192,40 +192,69 @@ special_tokens = [
192
  '</s>',
193
  '<|im_start|>',
194
  '<|im_end|>',
 
 
 
 
 
195
  '<tools>',
196
  '</tools>',
197
  '<tool_call>',
198
  '</tool_call>',
199
  '<tool_response>',
200
  '</tool_response>',
201
- 'system',
202
- 'user',
203
- 'assistant',
204
- 'tool',
205
-
206
  '"arguments"',
 
 
207
  '<arguments>',
 
208
  '<argument>',
 
209
  '<argument-name>',
 
 
 
 
 
210
  '<parameter>',
 
211
  '<parameter-name>',
212
- '<value>',
213
- '<argument-value>',
 
214
  '<parameter-value>',
215
- '"name"',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  '<function>',
 
217
  '<function-name>',
 
 
 
 
 
218
  ]
219
 
220
  for i in range(2, 25):
221
  special_tokens.append(' ' * i)
222
 
223
- for i in range(64 - len(special_tokens)):
224
  special_tokens.append(f'<|reserved_{i}|>')
225
 
226
- ## ascii
227
- # ascii_chars = list(string.ascii_letters + string.ascii_lowercase + string.ascii_uppercase + string.digits + string.punctuation)
228
-
229
  # emoji
230
  dataset = load_dataset('badrex/llm-emoji-dataset', split='train')
231
  emoji_chars = [row['character'] for row in dataset if len(row['character']) == 1]
@@ -252,7 +281,7 @@ tokenizer.post_processor = TemplateProcessing(
252
  tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
253
 
254
  trainer = BpeTrainer(
255
- vocab_size=32000,
256
  min_frequency=2,
257
  special_tokens=special_tokens,
258
  initial_alphabet=emoji_chars + programming_languages + code_keywords,
 
151
  del dataset
152
  gc.collect()
153
 
154
+ # instruction
155
+ dataset = load_dataset('arcee-ai/agent-data', split='train')
156
+
157
+ for row in dataset['conversations']:
158
+ yield '\n'.join(n['value'] for n in row)
159
+
160
+ del dataset
161
+ gc.collect()
162
 
163
+ # instruction
164
+ dataset = (
165
+ load_dataset('cognitivecomputations/SystemChat-2.0', data_files='SystemChat_filtered.jsonl', split='train'),
166
+ load_dataset('cognitivecomputations/SystemChat-2.0', data_files='SystemChat_multilingual.jsonl', split='train'),
167
+ )
168
+
169
+ for d in dataset:
170
+ for row in d['messages']:
171
+ yield '\n'.join(n['content'] for n in row)
172
+
173
+ del dataset
174
+ gc.collect()
175
 
176
+ # emoji
177
+ dataset = load_dataset('badrex/llm-emoji-dataset', split='train')
178
+
179
+ for row in dataset:
180
+ yield f'{row["character"]}\n{row["unicode"]}\n{row["short description"]}\n{row["tags"]}\n{row["LLM description"]}'
181
+
182
+ del dataset
183
+ gc.collect()
184
 
185
 
186
  bpe = BPE(unk_token='<unk>', fuse_unk=True, byte_fallback=True)
 
192
  '</s>',
193
  '<|im_start|>',
194
  '<|im_end|>',
195
+ 'system',
196
+ 'user',
197
+ 'assistant',
198
+ 'tool',
199
+
200
  '<tools>',
201
  '</tools>',
202
  '<tool_call>',
203
  '</tool_call>',
204
  '<tool_response>',
205
  '</tool_response>',
206
+
 
 
 
 
207
  '"arguments"',
208
+ '"name"',
209
+
210
  '<arguments>',
211
+ '</arguments>',
212
  '<argument>',
213
+ '</argument>',
214
  '<argument-name>',
215
+ '</argument-name>',
216
+ '<argument-type>',
217
+ '</argument-type>',
218
+ '<argument-value>',
219
+ '</argument-value>',
220
  '<parameter>',
221
+ '</parameter>',
222
  '<parameter-name>',
223
+ '</parameter-name>',
224
+ '<parameter-type>',
225
+ '</parameter-type>',
226
  '<parameter-value>',
227
+ '</parameter-value>',
228
+ '<field>',
229
+ '</field>',
230
+ '<field-name>',
231
+ '</field-name>',
232
+ '<field-type>',
233
+ '</field-type>',
234
+ '<field-value>',
235
+ '</field-value>',
236
+ '<name>',
237
+ '</name>',
238
+ '<type>',
239
+ '</type>',
240
+ '<value>',
241
+ '</value>',
242
  '<function>',
243
+ '</function>',
244
  '<function-name>',
245
+ '</function-name>',
246
+ '<function-type>',
247
+ '</function-type>',
248
+ '<function-value>',
249
+ '</function-value>',
250
  ]
251
 
252
  for i in range(2, 25):
253
  special_tokens.append(' ' * i)
254
 
255
+ for i in range(128 - len(special_tokens)):
256
  special_tokens.append(f'<|reserved_{i}|>')
257
 
 
 
 
258
  # emoji
259
  dataset = load_dataset('badrex/llm-emoji-dataset', split='train')
260
  emoji_chars = [row['character'] for row in dataset if len(row['character']) == 1]
 
281
  tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
282
 
283
  trainer = BpeTrainer(
284
+ vocab_size=32768, # 2 ** 15
285
  min_frequency=2,
286
  special_tokens=special_tokens,
287
  initial_alphabet=emoji_chars + programming_languages + code_keywords,
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -41,7 +41,7 @@
41
  "special": true
42
  },
43
  "5": {
44
- "content": "<tools>",
45
  "lstrip": false,
46
  "normalized": false,
47
  "rstrip": false,
@@ -49,7 +49,7 @@
49
  "special": true
50
  },
51
  "6": {
52
- "content": "</tools>",
53
  "lstrip": false,
54
  "normalized": false,
55
  "rstrip": false,
@@ -57,7 +57,7 @@
57
  "special": true
58
  },
59
  "7": {
60
- "content": "<tool_call>",
61
  "lstrip": false,
62
  "normalized": false,
63
  "rstrip": false,
@@ -65,7 +65,7 @@
65
  "special": true
66
  },
67
  "8": {
68
- "content": "</tool_call>",
69
  "lstrip": false,
70
  "normalized": false,
71
  "rstrip": false,
@@ -73,7 +73,7 @@
73
  "special": true
74
  },
75
  "9": {
76
- "content": "<tool_response>",
77
  "lstrip": false,
78
  "normalized": false,
79
  "rstrip": false,
@@ -81,7 +81,7 @@
81
  "special": true
82
  },
83
  "10": {
84
- "content": "</tool_response>",
85
  "lstrip": false,
86
  "normalized": false,
87
  "rstrip": false,
@@ -89,7 +89,7 @@
89
  "special": true
90
  },
91
  "11": {
92
- "content": "system",
93
  "lstrip": false,
94
  "normalized": false,
95
  "rstrip": false,
@@ -97,7 +97,7 @@
97
  "special": true
98
  },
99
  "12": {
100
- "content": "user",
101
  "lstrip": false,
102
  "normalized": false,
103
  "rstrip": false,
@@ -105,7 +105,7 @@
105
  "special": true
106
  },
107
  "13": {
108
- "content": "assistant",
109
  "lstrip": false,
110
  "normalized": false,
111
  "rstrip": false,
@@ -113,7 +113,7 @@
113
  "special": true
114
  },
115
  "14": {
116
- "content": "tool",
117
  "lstrip": false,
118
  "normalized": false,
119
  "rstrip": false,
@@ -129,7 +129,7 @@
129
  "special": true
130
  },
131
  "16": {
132
- "content": "<arguments>",
133
  "lstrip": false,
134
  "normalized": false,
135
  "rstrip": false,
@@ -137,7 +137,7 @@
137
  "special": true
138
  },
139
  "17": {
140
- "content": "<argument>",
141
  "lstrip": false,
142
  "normalized": false,
143
  "rstrip": false,
@@ -145,7 +145,7 @@
145
  "special": true
146
  },
147
  "18": {
148
- "content": "<argument-name>",
149
  "lstrip": false,
150
  "normalized": false,
151
  "rstrip": false,
@@ -153,7 +153,7 @@
153
  "special": true
154
  },
155
  "19": {
156
- "content": "<parameter>",
157
  "lstrip": false,
158
  "normalized": false,
159
  "rstrip": false,
@@ -161,7 +161,7 @@
161
  "special": true
162
  },
163
  "20": {
164
- "content": "<parameter-name>",
165
  "lstrip": false,
166
  "normalized": false,
167
  "rstrip": false,
@@ -169,7 +169,7 @@
169
  "special": true
170
  },
171
  "21": {
172
- "content": "<value>",
173
  "lstrip": false,
174
  "normalized": false,
175
  "rstrip": false,
@@ -177,7 +177,7 @@
177
  "special": true
178
  },
179
  "22": {
180
- "content": "<argument-value>",
181
  "lstrip": false,
182
  "normalized": false,
183
  "rstrip": false,
@@ -185,7 +185,7 @@
185
  "special": true
186
  },
187
  "23": {
188
- "content": "<parameter-value>",
189
  "lstrip": false,
190
  "normalized": false,
191
  "rstrip": false,
@@ -193,7 +193,7 @@
193
  "special": true
194
  },
195
  "24": {
196
- "content": "\"name\"",
197
  "lstrip": false,
198
  "normalized": false,
199
  "rstrip": false,
@@ -201,7 +201,7 @@
201
  "special": true
202
  },
203
  "25": {
204
- "content": "<function>",
205
  "lstrip": false,
206
  "normalized": false,
207
  "rstrip": false,
@@ -209,7 +209,7 @@
209
  "special": true
210
  },
211
  "26": {
212
- "content": "<function-name>",
213
  "lstrip": false,
214
  "normalized": false,
215
  "rstrip": false,
@@ -217,7 +217,7 @@
217
  "special": true
218
  },
219
  "27": {
220
- "content": " ",
221
  "lstrip": false,
222
  "normalized": false,
223
  "rstrip": false,
@@ -225,7 +225,7 @@
225
  "special": true
226
  },
227
  "28": {
228
- "content": " ",
229
  "lstrip": false,
230
  "normalized": false,
231
  "rstrip": false,
@@ -233,7 +233,7 @@
233
  "special": true
234
  },
235
  "29": {
236
- "content": " ",
237
  "lstrip": false,
238
  "normalized": false,
239
  "rstrip": false,
@@ -241,7 +241,7 @@
241
  "special": true
242
  },
243
  "30": {
244
- "content": " ",
245
  "lstrip": false,
246
  "normalized": false,
247
  "rstrip": false,
@@ -249,7 +249,7 @@
249
  "special": true
250
  },
251
  "31": {
252
- "content": " ",
253
  "lstrip": false,
254
  "normalized": false,
255
  "rstrip": false,
@@ -257,7 +257,7 @@
257
  "special": true
258
  },
259
  "32": {
260
- "content": " ",
261
  "lstrip": false,
262
  "normalized": false,
263
  "rstrip": false,
@@ -265,7 +265,7 @@
265
  "special": true
266
  },
267
  "33": {
268
- "content": " ",
269
  "lstrip": false,
270
  "normalized": false,
271
  "rstrip": false,
@@ -273,7 +273,7 @@
273
  "special": true
274
  },
275
  "34": {
276
- "content": " ",
277
  "lstrip": false,
278
  "normalized": false,
279
  "rstrip": false,
@@ -281,7 +281,7 @@
281
  "special": true
282
  },
283
  "35": {
284
- "content": " ",
285
  "lstrip": false,
286
  "normalized": false,
287
  "rstrip": false,
@@ -289,7 +289,7 @@
289
  "special": true
290
  },
291
  "36": {
292
- "content": " ",
293
  "lstrip": false,
294
  "normalized": false,
295
  "rstrip": false,
@@ -297,7 +297,7 @@
297
  "special": true
298
  },
299
  "37": {
300
- "content": " ",
301
  "lstrip": false,
302
  "normalized": false,
303
  "rstrip": false,
@@ -305,7 +305,7 @@
305
  "special": true
306
  },
307
  "38": {
308
- "content": " ",
309
  "lstrip": false,
310
  "normalized": false,
311
  "rstrip": false,
@@ -313,7 +313,7 @@
313
  "special": true
314
  },
315
  "39": {
316
- "content": " ",
317
  "lstrip": false,
318
  "normalized": false,
319
  "rstrip": false,
@@ -321,7 +321,7 @@
321
  "special": true
322
  },
323
  "40": {
324
- "content": " ",
325
  "lstrip": false,
326
  "normalized": false,
327
  "rstrip": false,
@@ -329,7 +329,7 @@
329
  "special": true
330
  },
331
  "41": {
332
- "content": " ",
333
  "lstrip": false,
334
  "normalized": false,
335
  "rstrip": false,
@@ -337,7 +337,7 @@
337
  "special": true
338
  },
339
  "42": {
340
- "content": " ",
341
  "lstrip": false,
342
  "normalized": false,
343
  "rstrip": false,
@@ -345,7 +345,7 @@
345
  "special": true
346
  },
347
  "43": {
348
- "content": " ",
349
  "lstrip": false,
350
  "normalized": false,
351
  "rstrip": false,
@@ -353,7 +353,7 @@
353
  "special": true
354
  },
355
  "44": {
356
- "content": " ",
357
  "lstrip": false,
358
  "normalized": false,
359
  "rstrip": false,
@@ -361,7 +361,7 @@
361
  "special": true
362
  },
363
  "45": {
364
- "content": " ",
365
  "lstrip": false,
366
  "normalized": false,
367
  "rstrip": false,
@@ -369,7 +369,7 @@
369
  "special": true
370
  },
371
  "46": {
372
- "content": " ",
373
  "lstrip": false,
374
  "normalized": false,
375
  "rstrip": false,
@@ -377,7 +377,7 @@
377
  "special": true
378
  },
379
  "47": {
380
- "content": " ",
381
  "lstrip": false,
382
  "normalized": false,
383
  "rstrip": false,
@@ -385,7 +385,7 @@
385
  "special": true
386
  },
387
  "48": {
388
- "content": " ",
389
  "lstrip": false,
390
  "normalized": false,
391
  "rstrip": false,
@@ -393,7 +393,7 @@
393
  "special": true
394
  },
395
  "49": {
396
- "content": " ",
397
  "lstrip": false,
398
  "normalized": false,
399
  "rstrip": false,
@@ -401,7 +401,7 @@
401
  "special": true
402
  },
403
  "50": {
404
- "content": "<|reserved_0|>",
405
  "lstrip": false,
406
  "normalized": false,
407
  "rstrip": false,
@@ -409,7 +409,7 @@
409
  "special": true
410
  },
411
  "51": {
412
- "content": "<|reserved_1|>",
413
  "lstrip": false,
414
  "normalized": false,
415
  "rstrip": false,
@@ -417,7 +417,7 @@
417
  "special": true
418
  },
419
  "52": {
420
- "content": "<|reserved_2|>",
421
  "lstrip": false,
422
  "normalized": false,
423
  "rstrip": false,
@@ -425,7 +425,7 @@
425
  "special": true
426
  },
427
  "53": {
428
- "content": "<|reserved_3|>",
429
  "lstrip": false,
430
  "normalized": false,
431
  "rstrip": false,
@@ -433,7 +433,7 @@
433
  "special": true
434
  },
435
  "54": {
436
- "content": "<|reserved_4|>",
437
  "lstrip": false,
438
  "normalized": false,
439
  "rstrip": false,
@@ -441,7 +441,7 @@
441
  "special": true
442
  },
443
  "55": {
444
- "content": "<|reserved_5|>",
445
  "lstrip": false,
446
  "normalized": false,
447
  "rstrip": false,
@@ -449,7 +449,7 @@
449
  "special": true
450
  },
451
  "56": {
452
- "content": "<|reserved_6|>",
453
  "lstrip": false,
454
  "normalized": false,
455
  "rstrip": false,
@@ -457,7 +457,7 @@
457
  "special": true
458
  },
459
  "57": {
460
- "content": "<|reserved_7|>",
461
  "lstrip": false,
462
  "normalized": false,
463
  "rstrip": false,
@@ -465,7 +465,7 @@
465
  "special": true
466
  },
467
  "58": {
468
- "content": "<|reserved_8|>",
469
  "lstrip": false,
470
  "normalized": false,
471
  "rstrip": false,
@@ -473,7 +473,7 @@
473
  "special": true
474
  },
475
  "59": {
476
- "content": "<|reserved_9|>",
477
  "lstrip": false,
478
  "normalized": false,
479
  "rstrip": false,
@@ -481,7 +481,7 @@
481
  "special": true
482
  },
483
  "60": {
484
- "content": "<|reserved_10|>",
485
  "lstrip": false,
486
  "normalized": false,
487
  "rstrip": false,
@@ -489,7 +489,7 @@
489
  "special": true
490
  },
491
  "61": {
492
- "content": "<|reserved_11|>",
493
  "lstrip": false,
494
  "normalized": false,
495
  "rstrip": false,
@@ -497,7 +497,7 @@
497
  "special": true
498
  },
499
  "62": {
500
- "content": "<|reserved_12|>",
501
  "lstrip": false,
502
  "normalized": false,
503
  "rstrip": false,
@@ -505,12 +505,524 @@
505
  "special": true
506
  },
507
  "63": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
  "content": "<|reserved_13|>",
509
  "lstrip": false,
510
  "normalized": false,
511
  "rstrip": false,
512
  "single_word": false,
513
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
514
  }
515
  },
516
  "bos_token": "<s>",
 
41
  "special": true
42
  },
43
  "5": {
44
+ "content": "system",
45
  "lstrip": false,
46
  "normalized": false,
47
  "rstrip": false,
 
49
  "special": true
50
  },
51
  "6": {
52
+ "content": "user",
53
  "lstrip": false,
54
  "normalized": false,
55
  "rstrip": false,
 
57
  "special": true
58
  },
59
  "7": {
60
+ "content": "assistant",
61
  "lstrip": false,
62
  "normalized": false,
63
  "rstrip": false,
 
65
  "special": true
66
  },
67
  "8": {
68
+ "content": "tool",
69
  "lstrip": false,
70
  "normalized": false,
71
  "rstrip": false,
 
73
  "special": true
74
  },
75
  "9": {
76
+ "content": "<tools>",
77
  "lstrip": false,
78
  "normalized": false,
79
  "rstrip": false,
 
81
  "special": true
82
  },
83
  "10": {
84
+ "content": "</tools>",
85
  "lstrip": false,
86
  "normalized": false,
87
  "rstrip": false,
 
89
  "special": true
90
  },
91
  "11": {
92
+ "content": "<tool_call>",
93
  "lstrip": false,
94
  "normalized": false,
95
  "rstrip": false,
 
97
  "special": true
98
  },
99
  "12": {
100
+ "content": "</tool_call>",
101
  "lstrip": false,
102
  "normalized": false,
103
  "rstrip": false,
 
105
  "special": true
106
  },
107
  "13": {
108
+ "content": "<tool_response>",
109
  "lstrip": false,
110
  "normalized": false,
111
  "rstrip": false,
 
113
  "special": true
114
  },
115
  "14": {
116
+ "content": "</tool_response>",
117
  "lstrip": false,
118
  "normalized": false,
119
  "rstrip": false,
 
129
  "special": true
130
  },
131
  "16": {
132
+ "content": "\"name\"",
133
  "lstrip": false,
134
  "normalized": false,
135
  "rstrip": false,
 
137
  "special": true
138
  },
139
  "17": {
140
+ "content": "<arguments>",
141
  "lstrip": false,
142
  "normalized": false,
143
  "rstrip": false,
 
145
  "special": true
146
  },
147
  "18": {
148
+ "content": "</arguments>",
149
  "lstrip": false,
150
  "normalized": false,
151
  "rstrip": false,
 
153
  "special": true
154
  },
155
  "19": {
156
+ "content": "<argument>",
157
  "lstrip": false,
158
  "normalized": false,
159
  "rstrip": false,
 
161
  "special": true
162
  },
163
  "20": {
164
+ "content": "</argument>",
165
  "lstrip": false,
166
  "normalized": false,
167
  "rstrip": false,
 
169
  "special": true
170
  },
171
  "21": {
172
+ "content": "<argument-name>",
173
  "lstrip": false,
174
  "normalized": false,
175
  "rstrip": false,
 
177
  "special": true
178
  },
179
  "22": {
180
+ "content": "</argument-name>",
181
  "lstrip": false,
182
  "normalized": false,
183
  "rstrip": false,
 
185
  "special": true
186
  },
187
  "23": {
188
+ "content": "<argument-type>",
189
  "lstrip": false,
190
  "normalized": false,
191
  "rstrip": false,
 
193
  "special": true
194
  },
195
  "24": {
196
+ "content": "</argument-type>",
197
  "lstrip": false,
198
  "normalized": false,
199
  "rstrip": false,
 
201
  "special": true
202
  },
203
  "25": {
204
+ "content": "<argument-value>",
205
  "lstrip": false,
206
  "normalized": false,
207
  "rstrip": false,
 
209
  "special": true
210
  },
211
  "26": {
212
+ "content": "</argument-value>",
213
  "lstrip": false,
214
  "normalized": false,
215
  "rstrip": false,
 
217
  "special": true
218
  },
219
  "27": {
220
+ "content": "<parameter>",
221
  "lstrip": false,
222
  "normalized": false,
223
  "rstrip": false,
 
225
  "special": true
226
  },
227
  "28": {
228
+ "content": "</parameter>",
229
  "lstrip": false,
230
  "normalized": false,
231
  "rstrip": false,
 
233
  "special": true
234
  },
235
  "29": {
236
+ "content": "<parameter-name>",
237
  "lstrip": false,
238
  "normalized": false,
239
  "rstrip": false,
 
241
  "special": true
242
  },
243
  "30": {
244
+ "content": "</parameter-name>",
245
  "lstrip": false,
246
  "normalized": false,
247
  "rstrip": false,
 
249
  "special": true
250
  },
251
  "31": {
252
+ "content": "<parameter-type>",
253
  "lstrip": false,
254
  "normalized": false,
255
  "rstrip": false,
 
257
  "special": true
258
  },
259
  "32": {
260
+ "content": "</parameter-type>",
261
  "lstrip": false,
262
  "normalized": false,
263
  "rstrip": false,
 
265
  "special": true
266
  },
267
  "33": {
268
+ "content": "<parameter-value>",
269
  "lstrip": false,
270
  "normalized": false,
271
  "rstrip": false,
 
273
  "special": true
274
  },
275
  "34": {
276
+ "content": "</parameter-value>",
277
  "lstrip": false,
278
  "normalized": false,
279
  "rstrip": false,
 
281
  "special": true
282
  },
283
  "35": {
284
+ "content": "<field>",
285
  "lstrip": false,
286
  "normalized": false,
287
  "rstrip": false,
 
289
  "special": true
290
  },
291
  "36": {
292
+ "content": "</field>",
293
  "lstrip": false,
294
  "normalized": false,
295
  "rstrip": false,
 
297
  "special": true
298
  },
299
  "37": {
300
+ "content": "<field-name>",
301
  "lstrip": false,
302
  "normalized": false,
303
  "rstrip": false,
 
305
  "special": true
306
  },
307
  "38": {
308
+ "content": "</field-name>",
309
  "lstrip": false,
310
  "normalized": false,
311
  "rstrip": false,
 
313
  "special": true
314
  },
315
  "39": {
316
+ "content": "<field-type>",
317
  "lstrip": false,
318
  "normalized": false,
319
  "rstrip": false,
 
321
  "special": true
322
  },
323
  "40": {
324
+ "content": "</field-type>",
325
  "lstrip": false,
326
  "normalized": false,
327
  "rstrip": false,
 
329
  "special": true
330
  },
331
  "41": {
332
+ "content": "<field-value>",
333
  "lstrip": false,
334
  "normalized": false,
335
  "rstrip": false,
 
337
  "special": true
338
  },
339
  "42": {
340
+ "content": "</field-value>",
341
  "lstrip": false,
342
  "normalized": false,
343
  "rstrip": false,
 
345
  "special": true
346
  },
347
  "43": {
348
+ "content": "<name>",
349
  "lstrip": false,
350
  "normalized": false,
351
  "rstrip": false,
 
353
  "special": true
354
  },
355
  "44": {
356
+ "content": "</name>",
357
  "lstrip": false,
358
  "normalized": false,
359
  "rstrip": false,
 
361
  "special": true
362
  },
363
  "45": {
364
+ "content": "<type>",
365
  "lstrip": false,
366
  "normalized": false,
367
  "rstrip": false,
 
369
  "special": true
370
  },
371
  "46": {
372
+ "content": "</type>",
373
  "lstrip": false,
374
  "normalized": false,
375
  "rstrip": false,
 
377
  "special": true
378
  },
379
  "47": {
380
+ "content": "<value>",
381
  "lstrip": false,
382
  "normalized": false,
383
  "rstrip": false,
 
385
  "special": true
386
  },
387
  "48": {
388
+ "content": "</value>",
389
  "lstrip": false,
390
  "normalized": false,
391
  "rstrip": false,
 
393
  "special": true
394
  },
395
  "49": {
396
+ "content": "<function>",
397
  "lstrip": false,
398
  "normalized": false,
399
  "rstrip": false,
 
401
  "special": true
402
  },
403
  "50": {
404
+ "content": "</function>",
405
  "lstrip": false,
406
  "normalized": false,
407
  "rstrip": false,
 
409
  "special": true
410
  },
411
  "51": {
412
+ "content": "<function-name>",
413
  "lstrip": false,
414
  "normalized": false,
415
  "rstrip": false,
 
417
  "special": true
418
  },
419
  "52": {
420
+ "content": "</function-name>",
421
  "lstrip": false,
422
  "normalized": false,
423
  "rstrip": false,
 
425
  "special": true
426
  },
427
  "53": {
428
+ "content": "<function-type>",
429
  "lstrip": false,
430
  "normalized": false,
431
  "rstrip": false,
 
433
  "special": true
434
  },
435
  "54": {
436
+ "content": "</function-type>",
437
  "lstrip": false,
438
  "normalized": false,
439
  "rstrip": false,
 
441
  "special": true
442
  },
443
  "55": {
444
+ "content": "<function-value>",
445
  "lstrip": false,
446
  "normalized": false,
447
  "rstrip": false,
 
449
  "special": true
450
  },
451
  "56": {
452
+ "content": "</function-value>",
453
  "lstrip": false,
454
  "normalized": false,
455
  "rstrip": false,
 
457
  "special": true
458
  },
459
  "57": {
460
+ "content": " ",
461
  "lstrip": false,
462
  "normalized": false,
463
  "rstrip": false,
 
465
  "special": true
466
  },
467
  "58": {
468
+ "content": " ",
469
  "lstrip": false,
470
  "normalized": false,
471
  "rstrip": false,
 
473
  "special": true
474
  },
475
  "59": {
476
+ "content": " ",
477
  "lstrip": false,
478
  "normalized": false,
479
  "rstrip": false,
 
481
  "special": true
482
  },
483
  "60": {
484
+ "content": " ",
485
  "lstrip": false,
486
  "normalized": false,
487
  "rstrip": false,
 
489
  "special": true
490
  },
491
  "61": {
492
+ "content": " ",
493
  "lstrip": false,
494
  "normalized": false,
495
  "rstrip": false,
 
497
  "special": true
498
  },
499
  "62": {
500
+ "content": " ",
501
  "lstrip": false,
502
  "normalized": false,
503
  "rstrip": false,
 
505
  "special": true
506
  },
507
  "63": {
508
+ "content": " ",
509
+ "lstrip": false,
510
+ "normalized": false,
511
+ "rstrip": false,
512
+ "single_word": false,
513
+ "special": true
514
+ },
515
+ "64": {
516
+ "content": " ",
517
+ "lstrip": false,
518
+ "normalized": false,
519
+ "rstrip": false,
520
+ "single_word": false,
521
+ "special": true
522
+ },
523
+ "65": {
524
+ "content": " ",
525
+ "lstrip": false,
526
+ "normalized": false,
527
+ "rstrip": false,
528
+ "single_word": false,
529
+ "special": true
530
+ },
531
+ "66": {
532
+ "content": " ",
533
+ "lstrip": false,
534
+ "normalized": false,
535
+ "rstrip": false,
536
+ "single_word": false,
537
+ "special": true
538
+ },
539
+ "67": {
540
+ "content": " ",
541
+ "lstrip": false,
542
+ "normalized": false,
543
+ "rstrip": false,
544
+ "single_word": false,
545
+ "special": true
546
+ },
547
+ "68": {
548
+ "content": " ",
549
+ "lstrip": false,
550
+ "normalized": false,
551
+ "rstrip": false,
552
+ "single_word": false,
553
+ "special": true
554
+ },
555
+ "69": {
556
+ "content": " ",
557
+ "lstrip": false,
558
+ "normalized": false,
559
+ "rstrip": false,
560
+ "single_word": false,
561
+ "special": true
562
+ },
563
+ "70": {
564
+ "content": " ",
565
+ "lstrip": false,
566
+ "normalized": false,
567
+ "rstrip": false,
568
+ "single_word": false,
569
+ "special": true
570
+ },
571
+ "71": {
572
+ "content": " ",
573
+ "lstrip": false,
574
+ "normalized": false,
575
+ "rstrip": false,
576
+ "single_word": false,
577
+ "special": true
578
+ },
579
+ "72": {
580
+ "content": " ",
581
+ "lstrip": false,
582
+ "normalized": false,
583
+ "rstrip": false,
584
+ "single_word": false,
585
+ "special": true
586
+ },
587
+ "73": {
588
+ "content": " ",
589
+ "lstrip": false,
590
+ "normalized": false,
591
+ "rstrip": false,
592
+ "single_word": false,
593
+ "special": true
594
+ },
595
+ "74": {
596
+ "content": " ",
597
+ "lstrip": false,
598
+ "normalized": false,
599
+ "rstrip": false,
600
+ "single_word": false,
601
+ "special": true
602
+ },
603
+ "75": {
604
+ "content": " ",
605
+ "lstrip": false,
606
+ "normalized": false,
607
+ "rstrip": false,
608
+ "single_word": false,
609
+ "special": true
610
+ },
611
+ "76": {
612
+ "content": " ",
613
+ "lstrip": false,
614
+ "normalized": false,
615
+ "rstrip": false,
616
+ "single_word": false,
617
+ "special": true
618
+ },
619
+ "77": {
620
+ "content": " ",
621
+ "lstrip": false,
622
+ "normalized": false,
623
+ "rstrip": false,
624
+ "single_word": false,
625
+ "special": true
626
+ },
627
+ "78": {
628
+ "content": " ",
629
+ "lstrip": false,
630
+ "normalized": false,
631
+ "rstrip": false,
632
+ "single_word": false,
633
+ "special": true
634
+ },
635
+ "79": {
636
+ "content": " ",
637
+ "lstrip": false,
638
+ "normalized": false,
639
+ "rstrip": false,
640
+ "single_word": false,
641
+ "special": true
642
+ },
643
+ "80": {
644
+ "content": "<|reserved_0|>",
645
+ "lstrip": false,
646
+ "normalized": false,
647
+ "rstrip": false,
648
+ "single_word": false,
649
+ "special": true
650
+ },
651
+ "81": {
652
+ "content": "<|reserved_1|>",
653
+ "lstrip": false,
654
+ "normalized": false,
655
+ "rstrip": false,
656
+ "single_word": false,
657
+ "special": true
658
+ },
659
+ "82": {
660
+ "content": "<|reserved_2|>",
661
+ "lstrip": false,
662
+ "normalized": false,
663
+ "rstrip": false,
664
+ "single_word": false,
665
+ "special": true
666
+ },
667
+ "83": {
668
+ "content": "<|reserved_3|>",
669
+ "lstrip": false,
670
+ "normalized": false,
671
+ "rstrip": false,
672
+ "single_word": false,
673
+ "special": true
674
+ },
675
+ "84": {
676
+ "content": "<|reserved_4|>",
677
+ "lstrip": false,
678
+ "normalized": false,
679
+ "rstrip": false,
680
+ "single_word": false,
681
+ "special": true
682
+ },
683
+ "85": {
684
+ "content": "<|reserved_5|>",
685
+ "lstrip": false,
686
+ "normalized": false,
687
+ "rstrip": false,
688
+ "single_word": false,
689
+ "special": true
690
+ },
691
+ "86": {
692
+ "content": "<|reserved_6|>",
693
+ "lstrip": false,
694
+ "normalized": false,
695
+ "rstrip": false,
696
+ "single_word": false,
697
+ "special": true
698
+ },
699
+ "87": {
700
+ "content": "<|reserved_7|>",
701
+ "lstrip": false,
702
+ "normalized": false,
703
+ "rstrip": false,
704
+ "single_word": false,
705
+ "special": true
706
+ },
707
+ "88": {
708
+ "content": "<|reserved_8|>",
709
+ "lstrip": false,
710
+ "normalized": false,
711
+ "rstrip": false,
712
+ "single_word": false,
713
+ "special": true
714
+ },
715
+ "89": {
716
+ "content": "<|reserved_9|>",
717
+ "lstrip": false,
718
+ "normalized": false,
719
+ "rstrip": false,
720
+ "single_word": false,
721
+ "special": true
722
+ },
723
+ "90": {
724
+ "content": "<|reserved_10|>",
725
+ "lstrip": false,
726
+ "normalized": false,
727
+ "rstrip": false,
728
+ "single_word": false,
729
+ "special": true
730
+ },
731
+ "91": {
732
+ "content": "<|reserved_11|>",
733
+ "lstrip": false,
734
+ "normalized": false,
735
+ "rstrip": false,
736
+ "single_word": false,
737
+ "special": true
738
+ },
739
+ "92": {
740
+ "content": "<|reserved_12|>",
741
+ "lstrip": false,
742
+ "normalized": false,
743
+ "rstrip": false,
744
+ "single_word": false,
745
+ "special": true
746
+ },
747
+ "93": {
748
  "content": "<|reserved_13|>",
749
  "lstrip": false,
750
  "normalized": false,
751
  "rstrip": false,
752
  "single_word": false,
753
  "special": true
754
+ },
755
+ "94": {
756
+ "content": "<|reserved_14|>",
757
+ "lstrip": false,
758
+ "normalized": false,
759
+ "rstrip": false,
760
+ "single_word": false,
761
+ "special": true
762
+ },
763
+ "95": {
764
+ "content": "<|reserved_15|>",
765
+ "lstrip": false,
766
+ "normalized": false,
767
+ "rstrip": false,
768
+ "single_word": false,
769
+ "special": true
770
+ },
771
+ "96": {
772
+ "content": "<|reserved_16|>",
773
+ "lstrip": false,
774
+ "normalized": false,
775
+ "rstrip": false,
776
+ "single_word": false,
777
+ "special": true
778
+ },
779
+ "97": {
780
+ "content": "<|reserved_17|>",
781
+ "lstrip": false,
782
+ "normalized": false,
783
+ "rstrip": false,
784
+ "single_word": false,
785
+ "special": true
786
+ },
787
+ "98": {
788
+ "content": "<|reserved_18|>",
789
+ "lstrip": false,
790
+ "normalized": false,
791
+ "rstrip": false,
792
+ "single_word": false,
793
+ "special": true
794
+ },
795
+ "99": {
796
+ "content": "<|reserved_19|>",
797
+ "lstrip": false,
798
+ "normalized": false,
799
+ "rstrip": false,
800
+ "single_word": false,
801
+ "special": true
802
+ },
803
+ "100": {
804
+ "content": "<|reserved_20|>",
805
+ "lstrip": false,
806
+ "normalized": false,
807
+ "rstrip": false,
808
+ "single_word": false,
809
+ "special": true
810
+ },
811
+ "101": {
812
+ "content": "<|reserved_21|>",
813
+ "lstrip": false,
814
+ "normalized": false,
815
+ "rstrip": false,
816
+ "single_word": false,
817
+ "special": true
818
+ },
819
+ "102": {
820
+ "content": "<|reserved_22|>",
821
+ "lstrip": false,
822
+ "normalized": false,
823
+ "rstrip": false,
824
+ "single_word": false,
825
+ "special": true
826
+ },
827
+ "103": {
828
+ "content": "<|reserved_23|>",
829
+ "lstrip": false,
830
+ "normalized": false,
831
+ "rstrip": false,
832
+ "single_word": false,
833
+ "special": true
834
+ },
835
+ "104": {
836
+ "content": "<|reserved_24|>",
837
+ "lstrip": false,
838
+ "normalized": false,
839
+ "rstrip": false,
840
+ "single_word": false,
841
+ "special": true
842
+ },
843
+ "105": {
844
+ "content": "<|reserved_25|>",
845
+ "lstrip": false,
846
+ "normalized": false,
847
+ "rstrip": false,
848
+ "single_word": false,
849
+ "special": true
850
+ },
851
+ "106": {
852
+ "content": "<|reserved_26|>",
853
+ "lstrip": false,
854
+ "normalized": false,
855
+ "rstrip": false,
856
+ "single_word": false,
857
+ "special": true
858
+ },
859
+ "107": {
860
+ "content": "<|reserved_27|>",
861
+ "lstrip": false,
862
+ "normalized": false,
863
+ "rstrip": false,
864
+ "single_word": false,
865
+ "special": true
866
+ },
867
+ "108": {
868
+ "content": "<|reserved_28|>",
869
+ "lstrip": false,
870
+ "normalized": false,
871
+ "rstrip": false,
872
+ "single_word": false,
873
+ "special": true
874
+ },
875
+ "109": {
876
+ "content": "<|reserved_29|>",
877
+ "lstrip": false,
878
+ "normalized": false,
879
+ "rstrip": false,
880
+ "single_word": false,
881
+ "special": true
882
+ },
883
+ "110": {
884
+ "content": "<|reserved_30|>",
885
+ "lstrip": false,
886
+ "normalized": false,
887
+ "rstrip": false,
888
+ "single_word": false,
889
+ "special": true
890
+ },
891
+ "111": {
892
+ "content": "<|reserved_31|>",
893
+ "lstrip": false,
894
+ "normalized": false,
895
+ "rstrip": false,
896
+ "single_word": false,
897
+ "special": true
898
+ },
899
+ "112": {
900
+ "content": "<|reserved_32|>",
901
+ "lstrip": false,
902
+ "normalized": false,
903
+ "rstrip": false,
904
+ "single_word": false,
905
+ "special": true
906
+ },
907
+ "113": {
908
+ "content": "<|reserved_33|>",
909
+ "lstrip": false,
910
+ "normalized": false,
911
+ "rstrip": false,
912
+ "single_word": false,
913
+ "special": true
914
+ },
915
+ "114": {
916
+ "content": "<|reserved_34|>",
917
+ "lstrip": false,
918
+ "normalized": false,
919
+ "rstrip": false,
920
+ "single_word": false,
921
+ "special": true
922
+ },
923
+ "115": {
924
+ "content": "<|reserved_35|>",
925
+ "lstrip": false,
926
+ "normalized": false,
927
+ "rstrip": false,
928
+ "single_word": false,
929
+ "special": true
930
+ },
931
+ "116": {
932
+ "content": "<|reserved_36|>",
933
+ "lstrip": false,
934
+ "normalized": false,
935
+ "rstrip": false,
936
+ "single_word": false,
937
+ "special": true
938
+ },
939
+ "117": {
940
+ "content": "<|reserved_37|>",
941
+ "lstrip": false,
942
+ "normalized": false,
943
+ "rstrip": false,
944
+ "single_word": false,
945
+ "special": true
946
+ },
947
+ "118": {
948
+ "content": "<|reserved_38|>",
949
+ "lstrip": false,
950
+ "normalized": false,
951
+ "rstrip": false,
952
+ "single_word": false,
953
+ "special": true
954
+ },
955
+ "119": {
956
+ "content": "<|reserved_39|>",
957
+ "lstrip": false,
958
+ "normalized": false,
959
+ "rstrip": false,
960
+ "single_word": false,
961
+ "special": true
962
+ },
963
+ "120": {
964
+ "content": "<|reserved_40|>",
965
+ "lstrip": false,
966
+ "normalized": false,
967
+ "rstrip": false,
968
+ "single_word": false,
969
+ "special": true
970
+ },
971
+ "121": {
972
+ "content": "<|reserved_41|>",
973
+ "lstrip": false,
974
+ "normalized": false,
975
+ "rstrip": false,
976
+ "single_word": false,
977
+ "special": true
978
+ },
979
+ "122": {
980
+ "content": "<|reserved_42|>",
981
+ "lstrip": false,
982
+ "normalized": false,
983
+ "rstrip": false,
984
+ "single_word": false,
985
+ "special": true
986
+ },
987
+ "123": {
988
+ "content": "<|reserved_43|>",
989
+ "lstrip": false,
990
+ "normalized": false,
991
+ "rstrip": false,
992
+ "single_word": false,
993
+ "special": true
994
+ },
995
+ "124": {
996
+ "content": "<|reserved_44|>",
997
+ "lstrip": false,
998
+ "normalized": false,
999
+ "rstrip": false,
1000
+ "single_word": false,
1001
+ "special": true
1002
+ },
1003
+ "125": {
1004
+ "content": "<|reserved_45|>",
1005
+ "lstrip": false,
1006
+ "normalized": false,
1007
+ "rstrip": false,
1008
+ "single_word": false,
1009
+ "special": true
1010
+ },
1011
+ "126": {
1012
+ "content": "<|reserved_46|>",
1013
+ "lstrip": false,
1014
+ "normalized": false,
1015
+ "rstrip": false,
1016
+ "single_word": false,
1017
+ "special": true
1018
+ },
1019
+ "127": {
1020
+ "content": "<|reserved_47|>",
1021
+ "lstrip": false,
1022
+ "normalized": false,
1023
+ "rstrip": false,
1024
+ "single_word": false,
1025
+ "special": true
1026
  }
1027
  },
1028
  "bos_token": "<s>",
vocab.json CHANGED
The diff for this file is too large to render. See raw diff