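"""Tokenizer compression check: encode a test corpus, verify that
decode(encode(line)) round-trips exactly, and report total character
and token counts."""
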
from transformers import AutoTokenizer

# Load the custom tokenizer from a local checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(
    '/apdcephfs/share_1502809/shaneshu/tokenizer_exp/other_tokenizer_vocab/hy',
    local_files_only=True,
    trust_remote_code=True,
)

# Read the evaluation corpus, one sample per line; the context manager
# closes the file handle promptly.
with open('/apdcephfs/share_1502809/shaneshu/tokenizer_exp/data/test.txt', 'r', encoding='utf-8') as f:
    test_data = [line.strip() for line in f]

# Running totals: characters in the raw text vs. tokens after encoding.
num_orig_len = 0
num_token_len = 0

# Encode every sample, accumulate counts, and verify the round trip.
for line in test_data:
    # Assumption: add_special_tokens=False keeps BOS/EOS markers out of the
    # token count and the decode round trip; drop it if this tokenizer does
    # not insert special tokens on encode.
    token_ids = tokenizer.encode(line, add_special_tokens=False)
    num_orig_len += len(line)
    num_token_len += len(token_ids)
    decoded = tokenizer.decode(token_ids)
    assert decoded == line, f"encode & decode not consistent: {line} vs {decoded}"

print(f"original length: {num_orig_len}")
print(f"token length: {num_token_len}")