from tokenizers import ByteLevelBPETokenizer
from transformers import AutoTokenizer
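
# Hedged sketch: the ByteLevelBPETokenizer import above suggests the files loaded
# by AutoTokenizer below were produced by a byte-level BPE training step such as
# the helper here. The corpus path, vocab size, and special tokens are assumptions
# for illustration only; they are not taken from this script.
def train_byte_level_bpe(corpus_files, out_dir="./", vocab_size=32000):
    bpe_tokenizer = ByteLevelBPETokenizer()
    bpe_tokenizer.train(
        files=corpus_files,          # e.g. ["corpus.txt"] (assumed)
        vocab_size=vocab_size,       # assumed vocabulary size
        min_frequency=2,
        special_tokens=["<s>", "</s>", "<unk>", "<pad>"],
    )
    # save_model writes vocab.json and merges.txt into out_dir
    bpe_tokenizer.save_model(out_dir)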

# Load the tokenizer from the current directory. use_fast=False selects the slow
# (Python) tokenizer class, and trust_remote_code allows custom tokenizer code
# shipped alongside the vocabulary files to be executed.
auto_tokenizer = AutoTokenizer.from_pretrained("./", use_fast=False, trust_remote_code=True)
text = "Hello, world!" |
|
encoded = auto_tokenizer.encode(text)
decoded = auto_tokenizer.decode(encoded)

print("Encoded:", encoded)
print("Decoded:", decoded)
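
# Illustrative extra (not part of the original flow): calling the tokenizer
# directly returns a dict-like BatchEncoding containing input_ids and, for most
# tokenizers, an attention_mask, which is the form models consume.
batch = auto_tokenizer(text)
print("BatchEncoding:", batch)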

# A simple multi-turn conversation in the chat-messages format.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I'm good, thank you! How can I help you today?"},
    {"role": "user", "content": "Nothing"},
]

print("messages:", messages)

# apply_chat_template renders the conversation with the tokenizer's chat template
# and, by default, tokenizes the result into input ids.
ids = auto_tokenizer.apply_chat_template(messages)
print(f"input_ids:\t{ids}")

text = auto_tokenizer.decode(ids)
print(f"input_text:\t[{text}]")