vshulev commited on
Commit
1b0c5d3
1 Parent(s): af353b0

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +5 -0
  2. tokenizer.json +305 -0
  3. tokenizer_config.json +34 -0
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "<CLS>",
3
+ "mask_token": "<MASK>",
4
+ "unk_token": "<UNK>"
5
+ }
tokenizer.json ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<MASK>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<CLS>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<UNK>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ }
33
+ ],
34
+ "normalizer": null,
35
+ "pre_tokenizer": {
36
+ "type": "Whitespace"
37
+ },
38
+ "post_processor": null,
39
+ "decoder": null,
40
+ "model": {
41
+ "type": "WordLevel",
42
+ "vocab": {
43
+ "<MASK>": 0,
44
+ "<CLS>": 1,
45
+ "<UNK>": 2,
46
+ "AAAA": 3,
47
+ "AAAC": 4,
48
+ "AAAG": 5,
49
+ "AAAT": 6,
50
+ "AACA": 7,
51
+ "AACC": 8,
52
+ "AACG": 9,
53
+ "AACT": 10,
54
+ "AAGA": 11,
55
+ "AAGC": 12,
56
+ "AAGG": 13,
57
+ "AAGT": 14,
58
+ "AATA": 15,
59
+ "AATC": 16,
60
+ "AATG": 17,
61
+ "AATT": 18,
62
+ "ACAA": 19,
63
+ "ACAC": 20,
64
+ "ACAG": 21,
65
+ "ACAT": 22,
66
+ "ACCA": 23,
67
+ "ACCC": 24,
68
+ "ACCG": 25,
69
+ "ACCT": 26,
70
+ "ACGA": 27,
71
+ "ACGC": 28,
72
+ "ACGG": 29,
73
+ "ACGT": 30,
74
+ "ACTA": 31,
75
+ "ACTC": 32,
76
+ "ACTG": 33,
77
+ "ACTT": 34,
78
+ "AGAA": 35,
79
+ "AGAC": 36,
80
+ "AGAG": 37,
81
+ "AGAT": 38,
82
+ "AGCA": 39,
83
+ "AGCC": 40,
84
+ "AGCG": 41,
85
+ "AGCT": 42,
86
+ "AGGA": 43,
87
+ "AGGC": 44,
88
+ "AGGG": 45,
89
+ "AGGT": 46,
90
+ "AGTA": 47,
91
+ "AGTC": 48,
92
+ "AGTG": 49,
93
+ "AGTT": 50,
94
+ "ATAA": 51,
95
+ "ATAC": 52,
96
+ "ATAG": 53,
97
+ "ATAT": 54,
98
+ "ATCA": 55,
99
+ "ATCC": 56,
100
+ "ATCG": 57,
101
+ "ATCT": 58,
102
+ "ATGA": 59,
103
+ "ATGC": 60,
104
+ "ATGG": 61,
105
+ "ATGT": 62,
106
+ "ATTA": 63,
107
+ "ATTC": 64,
108
+ "ATTG": 65,
109
+ "ATTT": 66,
110
+ "CAAA": 67,
111
+ "CAAC": 68,
112
+ "CAAG": 69,
113
+ "CAAT": 70,
114
+ "CACA": 71,
115
+ "CACC": 72,
116
+ "CACG": 73,
117
+ "CACT": 74,
118
+ "CAGA": 75,
119
+ "CAGC": 76,
120
+ "CAGG": 77,
121
+ "CAGT": 78,
122
+ "CATA": 79,
123
+ "CATC": 80,
124
+ "CATG": 81,
125
+ "CATT": 82,
126
+ "CCAA": 83,
127
+ "CCAC": 84,
128
+ "CCAG": 85,
129
+ "CCAT": 86,
130
+ "CCCA": 87,
131
+ "CCCC": 88,
132
+ "CCCG": 89,
133
+ "CCCT": 90,
134
+ "CCGA": 91,
135
+ "CCGC": 92,
136
+ "CCGG": 93,
137
+ "CCGT": 94,
138
+ "CCTA": 95,
139
+ "CCTC": 96,
140
+ "CCTG": 97,
141
+ "CCTT": 98,
142
+ "CGAA": 99,
143
+ "CGAC": 100,
144
+ "CGAG": 101,
145
+ "CGAT": 102,
146
+ "CGCA": 103,
147
+ "CGCC": 104,
148
+ "CGCG": 105,
149
+ "CGCT": 106,
150
+ "CGGA": 107,
151
+ "CGGC": 108,
152
+ "CGGG": 109,
153
+ "CGGT": 110,
154
+ "CGTA": 111,
155
+ "CGTC": 112,
156
+ "CGTG": 113,
157
+ "CGTT": 114,
158
+ "CTAA": 115,
159
+ "CTAC": 116,
160
+ "CTAG": 117,
161
+ "CTAT": 118,
162
+ "CTCA": 119,
163
+ "CTCC": 120,
164
+ "CTCG": 121,
165
+ "CTCT": 122,
166
+ "CTGA": 123,
167
+ "CTGC": 124,
168
+ "CTGG": 125,
169
+ "CTGT": 126,
170
+ "CTTA": 127,
171
+ "CTTC": 128,
172
+ "CTTG": 129,
173
+ "CTTT": 130,
174
+ "GAAA": 131,
175
+ "GAAC": 132,
176
+ "GAAG": 133,
177
+ "GAAT": 134,
178
+ "GACA": 135,
179
+ "GACC": 136,
180
+ "GACG": 137,
181
+ "GACT": 138,
182
+ "GAGA": 139,
183
+ "GAGC": 140,
184
+ "GAGG": 141,
185
+ "GAGT": 142,
186
+ "GATA": 143,
187
+ "GATC": 144,
188
+ "GATG": 145,
189
+ "GATT": 146,
190
+ "GCAA": 147,
191
+ "GCAC": 148,
192
+ "GCAG": 149,
193
+ "GCAT": 150,
194
+ "GCCA": 151,
195
+ "GCCC": 152,
196
+ "GCCG": 153,
197
+ "GCCT": 154,
198
+ "GCGA": 155,
199
+ "GCGC": 156,
200
+ "GCGG": 157,
201
+ "GCGT": 158,
202
+ "GCTA": 159,
203
+ "GCTC": 160,
204
+ "GCTG": 161,
205
+ "GCTT": 162,
206
+ "GGAA": 163,
207
+ "GGAC": 164,
208
+ "GGAG": 165,
209
+ "GGAT": 166,
210
+ "GGCA": 167,
211
+ "GGCC": 168,
212
+ "GGCG": 169,
213
+ "GGCT": 170,
214
+ "GGGA": 171,
215
+ "GGGC": 172,
216
+ "GGGG": 173,
217
+ "GGGT": 174,
218
+ "GGTA": 175,
219
+ "GGTC": 176,
220
+ "GGTG": 177,
221
+ "GGTT": 178,
222
+ "GTAA": 179,
223
+ "GTAC": 180,
224
+ "GTAG": 181,
225
+ "GTAT": 182,
226
+ "GTCA": 183,
227
+ "GTCC": 184,
228
+ "GTCG": 185,
229
+ "GTCT": 186,
230
+ "GTGA": 187,
231
+ "GTGC": 188,
232
+ "GTGG": 189,
233
+ "GTGT": 190,
234
+ "GTTA": 191,
235
+ "GTTC": 192,
236
+ "GTTG": 193,
237
+ "GTTT": 194,
238
+ "TAAA": 195,
239
+ "TAAC": 196,
240
+ "TAAG": 197,
241
+ "TAAT": 198,
242
+ "TACA": 199,
243
+ "TACC": 200,
244
+ "TACG": 201,
245
+ "TACT": 202,
246
+ "TAGA": 203,
247
+ "TAGC": 204,
248
+ "TAGG": 205,
249
+ "TAGT": 206,
250
+ "TATA": 207,
251
+ "TATC": 208,
252
+ "TATG": 209,
253
+ "TATT": 210,
254
+ "TCAA": 211,
255
+ "TCAC": 212,
256
+ "TCAG": 213,
257
+ "TCAT": 214,
258
+ "TCCA": 215,
259
+ "TCCC": 216,
260
+ "TCCG": 217,
261
+ "TCCT": 218,
262
+ "TCGA": 219,
263
+ "TCGC": 220,
264
+ "TCGG": 221,
265
+ "TCGT": 222,
266
+ "TCTA": 223,
267
+ "TCTC": 224,
268
+ "TCTG": 225,
269
+ "TCTT": 226,
270
+ "TGAA": 227,
271
+ "TGAC": 228,
272
+ "TGAG": 229,
273
+ "TGAT": 230,
274
+ "TGCA": 231,
275
+ "TGCC": 232,
276
+ "TGCG": 233,
277
+ "TGCT": 234,
278
+ "TGGA": 235,
279
+ "TGGC": 236,
280
+ "TGGG": 237,
281
+ "TGGT": 238,
282
+ "TGTA": 239,
283
+ "TGTC": 240,
284
+ "TGTG": 241,
285
+ "TGTT": 242,
286
+ "TTAA": 243,
287
+ "TTAC": 244,
288
+ "TTAG": 245,
289
+ "TTAT": 246,
290
+ "TTCA": 247,
291
+ "TTCC": 248,
292
+ "TTCG": 249,
293
+ "TTCT": 250,
294
+ "TTGA": 251,
295
+ "TTGC": 252,
296
+ "TTGG": 253,
297
+ "TTGT": 254,
298
+ "TTTA": 255,
299
+ "TTTC": 256,
300
+ "TTTG": 257,
301
+ "TTTT": 258
302
+ },
303
+ "unk_token": "<UNK>"
304
+ }
305
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<MASK>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<CLS>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<UNK>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": true,
29
+ "cls_token": "<CLS>",
30
+ "mask_token": "<MASK>",
31
+ "model_max_length": 1000000000000000019884624838656,
32
+ "tokenizer_class": "PreTrainedTokenizerFast",
33
+ "unk_token": "<UNK>"
34
+ }