Upload folder using huggingface_hub
Browse files
- tokenizer/special_tokens_map.json +8 -0
- tokenizer/tokenizer_config.json +63 -0
- tokenizer/vocab.json +1 -0
tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "bos_token": "\u0002",
+  "cls_token": "\u0011",
+  "eos_token": "\u0003",
+  "pad_token": "\u0000",
+  "sep_token": "\u001d",
+  "unk_token": "\u001a"
+}
tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,63 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "\u0000",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "\u0002",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "\u0003",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "\u0011",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "26": {
+      "content": "\u001a",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "29": {
+      "content": "\u001d",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "\u0002",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "\u0011",
+  "eos_token": "\u0003",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "\u0000",
+  "sep_token": "\u001d",
+  "split_special_tokens": true,
+  "tokenizer_class": "ByteTokenizer",
+  "unk_token": "\u001a",
+  "vocab_size": 256
+}
tokenizer/vocab.json
ADDED
@@ -0,0 +1 @@
+{"\u0000": 0, "\u0001": 1, "\u0002": 2, "\u0003": 3, "\u0004": 4, "\u0005": 5, "\u0006": 6, "\u0007": 7, "\b": 8, "\t": 9, "\n": 10, "\u000b": 11, "\f": 12, "\r": 13, "\u000e": 14, "\u000f": 15, "\u0010": 16, "\u0011": 17, "\u0012": 18, "\u0013": 19, "\u0014": 20, "\u0015": 21, "\u0016": 22, "\u0017": 23, "\u0018": 24, "\u0019": 25, "\u001a": 26, "\u001b": 27, "\u001c": 28, "\u001d": 29, "\u001e": 30, "\u001f": 31, " ": 32, "!": 33, "\"": 34, "#": 35, "$": 36, "%": 37, "&": 38, "'": 39, "(": 40, ")": 41, "*": 42, "+": 43, ",": 44, "-": 45, ".": 46, "/": 47, "0": 48, "1": 49, "2": 50, "3": 51, "4": 52, "5": 53, "6": 54, "7": 55, "8": 56, "9": 57, ":": 58, ";": 59, "<": 60, "=": 61, ">": 62, "?": 63, "@": 64, "A": 65, "B": 66, "C": 67, "D": 68, "E": 69, "F": 70, "G": 71, "H": 72, "I": 73, "J": 74, "K": 75, "L": 76, "M": 77, "N": 78, "O": 79, "P": 80, "Q": 81, "R": 82, "S": 83, "T": 84, "U": 85, "V": 86, "W": 87, "X": 88, "Y": 89, "Z": 90, "[": 91, "\\": 92, "]": 93, "^": 94, "_": 95, "`": 96, "a": 97, "b": 98, "c": 99, "d": 100, "e": 101, "f": 102, "g": 103, "h": 104, "i": 105, "j": 106, "k": 107, "l": 108, "m": 109, "n": 110, "o": 111, "p": 112, "q": 113, "r": 114, "s": 115, "t": 116, "u": 117, "v": 118, "w": 119, "x": 120, "y": 121, "z": 122, "{": 123, "|": 124, "}": 125, "~": 126, "\u007f": 127, "\u0080": 128, "\u0081": 129, "\u0082": 130, "\u0083": 131, "\u0084": 132, "\u0085": 133, "\u0086": 134, "\u0087": 135, "\u0088": 136, "\u0089": 137, "\u008a": 138, "\u008b": 139, "\u008c": 140, "\u008d": 141, "\u008e": 142, "\u008f": 143, "\u0090": 144, "\u0091": 145, "\u0092": 146, "\u0093": 147, "\u0094": 148, "\u0095": 149, "\u0096": 150, "\u0097": 151, "\u0098": 152, "\u0099": 153, "\u009a": 154, "\u009b": 155, "\u009c": 156, "\u009d": 157, "\u009e": 158, "\u009f": 159, "\u00a0": 160, "\u00a1": 161, "\u00a2": 162, "\u00a3": 163, "\u00a4": 164, "\u00a5": 165, "\u00a6": 166, "\u00a7": 167, "\u00a8": 168, "\u00a9": 169, "\u00aa": 170, "\u00ab": 171, "\u00ac": 172, "\u00ad": 173, "\u00ae": 174, "\u00af": 175, "\u00b0": 176, "\u00b1": 177, "\u00b2": 178, "\u00b3": 179, "\u00b4": 180, "\u00b5": 181, "\u00b6": 182, "\u00b7": 183, "\u00b8": 184, "\u00b9": 185, "\u00ba": 186, "\u00bb": 187, "\u00bc": 188, "\u00bd": 189, "\u00be": 190, "\u00bf": 191, "\u00c0": 192, "\u00c1": 193, "\u00c2": 194, "\u00c3": 195, "\u00c4": 196, "\u00c5": 197, "\u00c6": 198, "\u00c7": 199, "\u00c8": 200, "\u00c9": 201, "\u00ca": 202, "\u00cb": 203, "\u00cc": 204, "\u00cd": 205, "\u00ce": 206, "\u00cf": 207, "\u00d0": 208, "\u00d1": 209, "\u00d2": 210, "\u00d3": 211, "\u00d4": 212, "\u00d5": 213, "\u00d6": 214, "\u00d7": 215, "\u00d8": 216, "\u00d9": 217, "\u00da": 218, "\u00db": 219, "\u00dc": 220, "\u00dd": 221, "\u00de": 222, "\u00df": 223, "\u00e0": 224, "\u00e1": 225, "\u00e2": 226, "\u00e3": 227, "\u00e4": 228, "\u00e5": 229, "\u00e6": 230, "\u00e7": 231, "\u00e8": 232, "\u00e9": 233, "\u00ea": 234, "\u00eb": 235, "\u00ec": 236, "\u00ed": 237, "\u00ee": 238, "\u00ef": 239, "\u00f0": 240, "\u00f1": 241, "\u00f2": 242, "\u00f3": 243, "\u00f4": 244, "\u00f5": 245, "\u00f6": 246, "\u00f7": 247, "\u00f8": 248, "\u00f9": 249, "\u00fa": 250, "\u00fb": 251, "\u00fc": 252, "\u00fd": 253, "\u00fe": 254, "\u00ff": 255}