BarcodeBERT-Entire-BOLD / tokenizer.json
vshulev's picture
Upload tokenizer
1b0c5d3 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<MASK>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<CLS>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<UNK>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": null,
"decoder": null,
"model": {
"type": "WordLevel",
"vocab": {
"<MASK>": 0,
"<CLS>": 1,
"<UNK>": 2,
"AAAA": 3,
"AAAC": 4,
"AAAG": 5,
"AAAT": 6,
"AACA": 7,
"AACC": 8,
"AACG": 9,
"AACT": 10,
"AAGA": 11,
"AAGC": 12,
"AAGG": 13,
"AAGT": 14,
"AATA": 15,
"AATC": 16,
"AATG": 17,
"AATT": 18,
"ACAA": 19,
"ACAC": 20,
"ACAG": 21,
"ACAT": 22,
"ACCA": 23,
"ACCC": 24,
"ACCG": 25,
"ACCT": 26,
"ACGA": 27,
"ACGC": 28,
"ACGG": 29,
"ACGT": 30,
"ACTA": 31,
"ACTC": 32,
"ACTG": 33,
"ACTT": 34,
"AGAA": 35,
"AGAC": 36,
"AGAG": 37,
"AGAT": 38,
"AGCA": 39,
"AGCC": 40,
"AGCG": 41,
"AGCT": 42,
"AGGA": 43,
"AGGC": 44,
"AGGG": 45,
"AGGT": 46,
"AGTA": 47,
"AGTC": 48,
"AGTG": 49,
"AGTT": 50,
"ATAA": 51,
"ATAC": 52,
"ATAG": 53,
"ATAT": 54,
"ATCA": 55,
"ATCC": 56,
"ATCG": 57,
"ATCT": 58,
"ATGA": 59,
"ATGC": 60,
"ATGG": 61,
"ATGT": 62,
"ATTA": 63,
"ATTC": 64,
"ATTG": 65,
"ATTT": 66,
"CAAA": 67,
"CAAC": 68,
"CAAG": 69,
"CAAT": 70,
"CACA": 71,
"CACC": 72,
"CACG": 73,
"CACT": 74,
"CAGA": 75,
"CAGC": 76,
"CAGG": 77,
"CAGT": 78,
"CATA": 79,
"CATC": 80,
"CATG": 81,
"CATT": 82,
"CCAA": 83,
"CCAC": 84,
"CCAG": 85,
"CCAT": 86,
"CCCA": 87,
"CCCC": 88,
"CCCG": 89,
"CCCT": 90,
"CCGA": 91,
"CCGC": 92,
"CCGG": 93,
"CCGT": 94,
"CCTA": 95,
"CCTC": 96,
"CCTG": 97,
"CCTT": 98,
"CGAA": 99,
"CGAC": 100,
"CGAG": 101,
"CGAT": 102,
"CGCA": 103,
"CGCC": 104,
"CGCG": 105,
"CGCT": 106,
"CGGA": 107,
"CGGC": 108,
"CGGG": 109,
"CGGT": 110,
"CGTA": 111,
"CGTC": 112,
"CGTG": 113,
"CGTT": 114,
"CTAA": 115,
"CTAC": 116,
"CTAG": 117,
"CTAT": 118,
"CTCA": 119,
"CTCC": 120,
"CTCG": 121,
"CTCT": 122,
"CTGA": 123,
"CTGC": 124,
"CTGG": 125,
"CTGT": 126,
"CTTA": 127,
"CTTC": 128,
"CTTG": 129,
"CTTT": 130,
"GAAA": 131,
"GAAC": 132,
"GAAG": 133,
"GAAT": 134,
"GACA": 135,
"GACC": 136,
"GACG": 137,
"GACT": 138,
"GAGA": 139,
"GAGC": 140,
"GAGG": 141,
"GAGT": 142,
"GATA": 143,
"GATC": 144,
"GATG": 145,
"GATT": 146,
"GCAA": 147,
"GCAC": 148,
"GCAG": 149,
"GCAT": 150,
"GCCA": 151,
"GCCC": 152,
"GCCG": 153,
"GCCT": 154,
"GCGA": 155,
"GCGC": 156,
"GCGG": 157,
"GCGT": 158,
"GCTA": 159,
"GCTC": 160,
"GCTG": 161,
"GCTT": 162,
"GGAA": 163,
"GGAC": 164,
"GGAG": 165,
"GGAT": 166,
"GGCA": 167,
"GGCC": 168,
"GGCG": 169,
"GGCT": 170,
"GGGA": 171,
"GGGC": 172,
"GGGG": 173,
"GGGT": 174,
"GGTA": 175,
"GGTC": 176,
"GGTG": 177,
"GGTT": 178,
"GTAA": 179,
"GTAC": 180,
"GTAG": 181,
"GTAT": 182,
"GTCA": 183,
"GTCC": 184,
"GTCG": 185,
"GTCT": 186,
"GTGA": 187,
"GTGC": 188,
"GTGG": 189,
"GTGT": 190,
"GTTA": 191,
"GTTC": 192,
"GTTG": 193,
"GTTT": 194,
"TAAA": 195,
"TAAC": 196,
"TAAG": 197,
"TAAT": 198,
"TACA": 199,
"TACC": 200,
"TACG": 201,
"TACT": 202,
"TAGA": 203,
"TAGC": 204,
"TAGG": 205,
"TAGT": 206,
"TATA": 207,
"TATC": 208,
"TATG": 209,
"TATT": 210,
"TCAA": 211,
"TCAC": 212,
"TCAG": 213,
"TCAT": 214,
"TCCA": 215,
"TCCC": 216,
"TCCG": 217,
"TCCT": 218,
"TCGA": 219,
"TCGC": 220,
"TCGG": 221,
"TCGT": 222,
"TCTA": 223,
"TCTC": 224,
"TCTG": 225,
"TCTT": 226,
"TGAA": 227,
"TGAC": 228,
"TGAG": 229,
"TGAT": 230,
"TGCA": 231,
"TGCC": 232,
"TGCG": 233,
"TGCT": 234,
"TGGA": 235,
"TGGC": 236,
"TGGG": 237,
"TGGT": 238,
"TGTA": 239,
"TGTC": 240,
"TGTG": 241,
"TGTT": 242,
"TTAA": 243,
"TTAC": 244,
"TTAG": 245,
"TTAT": 246,
"TTCA": 247,
"TTCC": 248,
"TTCG": 249,
"TTCT": 250,
"TTGA": 251,
"TTGC": 252,
"TTGG": 253,
"TTGT": 254,
"TTTA": 255,
"TTTC": 256,
"TTTG": 257,
"TTTT": 258
},
"unk_token": "<UNK>"
}
}