brief
A mixed BPE tokenizer for English text and DNA sequences.
Basic usage
from transformers import GPT2Tokenizer

# Load the pretrained mixed English/DNA BPE tokenizer by its model identifier.
tokenizer = GPT2Tokenizer.from_pretrained("dnagpt/dna_eng_bpe")

# Sample prompt: an English instruction followed by a raw DNA sequence.
# Named `text` rather than `input` so the builtin input() is not shadowed.
text = "Determine the following dna sequence is promoter or terminator TCTTTCTCTTCTGTATCATTCTACTTCTATGACTGCTCCTTCTCGAGTAAAACAGAATGTGTCTCAGGATTACTTTAAAACAAGACAAAGTATAGAGTTAAAATACATTTT"

# Split the prompt into BPE sub-word token strings (not token ids).
token_list = tokenizer.tokenize(text)
print(" ".join(token_list))
# Example output (truncated):
# D eter mine Ġ the Ġ follow ing Ġ d na Ġ sequenc e Ġ is Ġ promo ter Ġ or Ġ termin ator Ġ Ġ TCTTTC TCTTC TGTATC ATTC TACTTC TATG ACTGC T...