add modified albert tokenizer
Browse files
- README.md +16 -0
- tokenizer.json +0 -0
- tokenizer_config.json +3 -0
README.md
ADDED
@@ -0,0 +1,16 @@
The purpose of this repo is to show the usefulness of saving the normalization operation used during tokenizer training: without it, the same checkpoint handles accented and upper-case text very differently.

```python
from transformers import AutoTokenizer

text = "This is a text with àccënts and CAPITAL LETTERS"

tokenizer = AutoTokenizer.from_pretrained("albert-large-v2")
print(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))
# ['[CLS]', '▁this', '▁is', '▁a', '▁text', '▁with', '▁accent', 's', '▁and', '▁capital', '▁letters', '[SEP]']

tokenizer = AutoTokenizer.from_pretrained("huggingface-course/albert-tokenizer-without-normalizer")
print(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))
# ['[CLS]', '▁', '<unk>', 'his', '▁is', '▁a', '▁text', '▁with', '▁', '<unk>', 'cc', '<unk>', 'nts', '▁and', '▁', '<unk>', '▁', '<unk>', '[SEP]']
```
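
As a quick check of where the difference comes from, the normalizer of each loaded tokenizer can be inspected directly. This is a minimal sketch, assuming both checkpoints load as fast tokenizers (so that `backend_tokenizer` is available); the `None` noted for this repo's tokenizer is an expectation based on the behaviour above, not an output copied from the repo.

```python
from transformers import AutoTokenizer

# Fast tokenizers wrap a `tokenizers.Tokenizer`, whose `normalizer` attribute
# holds the normalization pipeline applied before pre-tokenization.
with_norm = AutoTokenizer.from_pretrained("albert-large-v2")
print(with_norm.backend_tokenizer.normalizer)     # the saved normalization pipeline

without_norm = AutoTokenizer.from_pretrained("huggingface-course/albert-tokenizer-without-normalizer")
print(without_norm.backend_tokenizer.normalizer)  # expected to be None: no normalizer was saved
```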
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
{
  "tokenizer_class": "AlbertTokenizer"
}
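
Since tokenizer_config.json only pins the tokenizer class, nothing in this repo restores the missing normalization. If the original behaviour is needed, a normalizer can be re-attached to the loaded fast tokenizer by hand. This is a hedged sketch: the `Sequence` below (NFKD, strip accents, lowercase) only approximates ALBERT's original preprocessing and is not part of this repo.

```python
from transformers import AutoTokenizer
from tokenizers import normalizers

tokenizer = AutoTokenizer.from_pretrained("huggingface-course/albert-tokenizer-without-normalizer")

# Approximation of the preprocessing albert-large-v2 was trained with
# (accent removal and lowercasing), re-attached to the backend tokenizer.
tokenizer.backend_tokenizer.normalizer = normalizers.Sequence([
    normalizers.NFKD(),
    normalizers.StripAccents(),
    normalizers.Lowercase(),
])

text = "This is a text with àccënts and CAPITAL LETTERS"
print(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))
# With a normalizer back in place, the output should again resemble the
# albert-large-v2 tokenization shown in the README above.
```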