fix autotokenizer
Browse files
- tokenizer.json +20 -9
- tokenizer_config.json +5 -5
tokenizer.json
CHANGED
@@ -37,7 +37,7 @@
|
|
37 |
"lstrip": false,
|
38 |
"rstrip": false,
|
39 |
"normalized": false,
|
40 |
-
"special": false
|
41 |
},
|
42 |
{
|
43 |
"id": 4,
|
@@ -46,7 +46,7 @@
|
|
46 |
"lstrip": false,
|
47 |
"rstrip": false,
|
48 |
"normalized": false,
|
49 |
-
"special": false
|
50 |
},
|
51 |
{
|
52 |
"id": 5,
|
@@ -55,7 +55,7 @@
|
|
55 |
"lstrip": false,
|
56 |
"rstrip": false,
|
57 |
"normalized": false,
|
58 |
-
"special": false
|
59 |
},
|
60 |
{
|
61 |
"id": 6,
|
@@ -64,15 +64,26 @@
|
|
64 |
"lstrip": false,
|
65 |
"rstrip": false,
|
66 |
"normalized": false,
|
67 |
-
"special": false
|
68 |
}
|
69 |
],
|
70 |
-
"normalizer": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
"pre_tokenizer": {
|
72 |
-
"type": "
|
73 |
-
"
|
74 |
-
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
76 |
},
|
77 |
"post_processor": {
|
78 |
"type": "TemplateProcessing",
|
|
|
37 |
"lstrip": false,
|
38 |
"rstrip": false,
|
39 |
"normalized": false,
|
40 |
+
"special": true
|
41 |
},
|
42 |
{
|
43 |
"id": 4,
|
|
|
46 |
"lstrip": false,
|
47 |
"rstrip": false,
|
48 |
"normalized": false,
|
49 |
+
"special": true
|
50 |
},
|
51 |
{
|
52 |
"id": 5,
|
|
|
55 |
"lstrip": false,
|
56 |
"rstrip": false,
|
57 |
"normalized": false,
|
58 |
+
"special": true
|
59 |
},
|
60 |
{
|
61 |
"id": 6,
|
|
|
64 |
"lstrip": false,
|
65 |
"rstrip": false,
|
66 |
"normalized": false,
|
67 |
+
"special": true
|
68 |
}
|
69 |
],
|
70 |
+
"normalizer": {
|
71 |
+
"type": "Replace",
|
72 |
+
"pattern": {
|
73 |
+
"Regex": " {2,}"
|
74 |
+
},
|
75 |
+
"content": "▁"
|
76 |
+
},
|
77 |
"pre_tokenizer": {
|
78 |
+
"type": "Sequence",
|
79 |
+
"pretokenizers": [
|
80 |
+
{
|
81 |
+
"type": "Metaspace",
|
82 |
+
"replacement": "▁",
|
83 |
+
"prepend_scheme": "first",
|
84 |
+
"split": false
|
85 |
+
}
|
86 |
+
]
|
87 |
},
|
88 |
"post_processor": {
|
89 |
"type": "TemplateProcessing",
|
tokenizer_config.json
CHANGED
@@ -33,7 +33,7 @@
|
|
33 |
"normalized": false,
|
34 |
"rstrip": false,
|
35 |
"single_word": false,
|
36 |
-
"special": false
|
37 |
},
|
38 |
"4": {
|
39 |
"content": "[/INST]",
|
@@ -41,7 +41,7 @@
|
|
41 |
"normalized": false,
|
42 |
"rstrip": false,
|
43 |
"single_word": false,
|
44 |
-
"special": false
|
45 |
},
|
46 |
"5": {
|
47 |
"content": "<<SYS>>",
|
@@ -49,7 +49,7 @@
|
|
49 |
"normalized": false,
|
50 |
"rstrip": false,
|
51 |
"single_word": false,
|
52 |
-
"special": false
|
53 |
},
|
54 |
"6": {
|
55 |
"content": "<</SYS>>",
|
@@ -57,7 +57,7 @@
|
|
57 |
"normalized": false,
|
58 |
"rstrip": false,
|
59 |
"single_word": false,
|
60 |
-
"special": false
|
61 |
}
|
62 |
},
|
63 |
"bos_token": "<s>",
|
@@ -69,7 +69,7 @@
|
|
69 |
"pad_token": null,
|
70 |
"sp_model_kwargs": {},
|
71 |
"spaces_between_special_tokens": false,
|
72 |
-
"tokenizer_class": "
|
73 |
"unk_token": "<unk>",
|
74 |
"use_default_system_prompt": false
|
75 |
}
|
|
|
33 |
"normalized": false,
|
34 |
"rstrip": false,
|
35 |
"single_word": false,
|
36 |
+
"special": true
|
37 |
},
|
38 |
"4": {
|
39 |
"content": "[/INST]",
|
|
|
41 |
"normalized": false,
|
42 |
"rstrip": false,
|
43 |
"single_word": false,
|
44 |
+
"special": true
|
45 |
},
|
46 |
"5": {
|
47 |
"content": "<<SYS>>",
|
|
|
49 |
"normalized": false,
|
50 |
"rstrip": false,
|
51 |
"single_word": false,
|
52 |
+
"special": true
|
53 |
},
|
54 |
"6": {
|
55 |
"content": "<</SYS>>",
|
|
|
57 |
"normalized": false,
|
58 |
"rstrip": false,
|
59 |
"single_word": false,
|
60 |
+
"special": true
|
61 |
}
|
62 |
},
|
63 |
"bos_token": "<s>",
|
|
|
69 |
"pad_token": null,
|
70 |
"sp_model_kwargs": {},
|
71 |
"spaces_between_special_tokens": false,
|
72 |
+
"tokenizer_class": "PreTrainedTokenizerFast",
|
73 |
"unk_token": "<unk>",
|
74 |
"use_default_system_prompt": false
|
75 |
}
|