"""Tokenization classes for ChineseCharTokenizer."""

import json
import os
import re
import shutil
from typing import Optional, Tuple, Union

import numpy as np

from transformers import BertTokenizer

# Coarse mapping from Unicode blocks to placeholder tokens. The ranges are
# inclusive and tile the codespace contiguously; get_unicode_ranges() below
# relies on that to locate a codepoint's block by binary search.
unicode_map = [
    {'token': '[U_LAT]', 'range': (0x0000, 0x007F), 'meaning': 'Basic Latin'},
    {'token': '[U_LAT]', 'range': (0x0080, 0x00FF), 'meaning': 'C1 Controls and Latin-1 Supplement'},
    {'token': '[U_LAT]', 'range': (0x0100, 0x017F), 'meaning': 'Latin Extended-A'},
    {'token': '[U_LAT]', 'range': (0x0180, 0x024F), 'meaning': 'Latin Extended-B'},
    {'token': '[U_PHO]', 'range': (0x0250, 0x02AF), 'meaning': 'IPA Extensions'},
    {'token': '[U_PHO]', 'range': (0x02B0, 0x02FF), 'meaning': 'Spacing Modifier Letters'},
    {'token': '[U_PHO]', 'range': (0x0300, 0x036F), 'meaning': 'Combining Diacritical Marks'},
    {'token': '[U_GRE]', 'range': (0x0370, 0x03FF), 'meaning': 'Greek/Coptic'},
    {'token': '[U_RUS]', 'range': (0x0400, 0x04FF), 'meaning': 'Cyrillic'},
    {'token': '[U_RUS]', 'range': (0x0500, 0x052F), 'meaning': 'Cyrillic Supplement'},
    {'token': '[U_LAN]', 'range': (0x0530, 0x058F), 'meaning': 'Armenian'},
    {'token': '[U_LAN]', 'range': (0x0590, 0x05FF), 'meaning': 'Hebrew'},
    {'token': '[U_ARA]', 'range': (0x0600, 0x06FF), 'meaning': 'Arabic'},
    {'token': '[U_LAN]', 'range': (0x0700, 0x074F), 'meaning': 'Syriac'},
    {'token': '[U_ARA]', 'range': (0x0750, 0x077F), 'meaning': 'Undefined -> Arabic'},
    {'token': '[U_LAN]', 'range': (0x0780, 0x07BF), 'meaning': 'Thaana'},
    {'token': '[U_ARA]', 'range': (0x07C0, 0x08FF), 'meaning': 'Undefined -> Arabic'},
    {'token': '[U_LAN]', 'range': (0x0900, 0x097F), 'meaning': 'Devanagari'},
    {'token': '[U_LAN]', 'range': (0x0980, 0x09FF), 'meaning': 'Bengali/Assamese'},
    {'token': '[U_LAN]', 'range': (0x0A00, 0x0A7F), 'meaning': 'Gurmukhi'},
    {'token': '[U_LAN]', 'range': (0x0A80, 0x0AFF), 'meaning': 'Gujarati'},
    {'token': '[U_LAN]', 'range': (0x0B00, 0x0B7F), 'meaning': 'Oriya'},
    {'token': '[U_LAN]', 'range': (0x0B80, 0x0BFF), 'meaning': 'Tamil'},
    {'token': '[U_LAN]', 'range': (0x0C00, 0x0C7F), 'meaning': 'Telugu'},
    {'token': '[U_LAN]', 'range': (0x0C80, 0x0CFF), 'meaning': 'Kannada'},
    {'token': '[U_LAN]', 'range': (0x0D00, 0x0DFF), 'meaning': 'Malayalam'},
    {'token': '[U_LAN]', 'range': (0x0E00, 0x0E7F), 'meaning': 'Thai'},
    {'token': '[U_LAN]', 'range': (0x0E80, 0x0EFF), 'meaning': 'Lao'},
    {'token': '[U_LAN]', 'range': (0x0F00, 0x0FFF), 'meaning': 'Tibetan'},
    {'token': '[U_LAN]', 'range': (0x1000, 0x109F), 'meaning': 'Myanmar'},
    {'token': '[U_LAN]', 'range': (0x10A0, 0x10FF), 'meaning': 'Georgian'},
    {'token': '[U_KOR]', 'range': (0x1100, 0x11FF), 'meaning': 'Hangul Jamo'},
    {'token': '[U_LAN]', 'range': (0x1200, 0x137F), 'meaning': 'Ethiopic'},
    {'token': '[U_LAN]', 'range': (0x1380, 0x139F), 'meaning': 'Undefined -> Ethiopic'},
    {'token': '[U_LAN]', 'range': (0x13A0, 0x13FF), 'meaning': 'Cherokee'},
    {'token': '[U_LAN]', 'range': (0x1400, 0x167F), 'meaning': 'Unified Canadian Aboriginal Syllabics'},
    {'token': '[U_LAN]', 'range': (0x1680, 0x169F), 'meaning': 'Ogham'},
    {'token': '[U_LAN]', 'range': (0x16A0, 0x16FF), 'meaning': 'Runic'},
    {'token': '[U_LAN]', 'range': (0x1700, 0x171F), 'meaning': 'Tagalog'},
    {'token': '[U_LAN]', 'range': (0x1720, 0x173F), 'meaning': 'Hanunoo'},
    {'token': '[U_LAN]', 'range': (0x1740, 0x175F), 'meaning': 'Buhid'},
    {'token': '[U_LAN]', 'range': (0x1760, 0x177F), 'meaning': 'Tagbanwa'},
    {'token': '[U_LAN]', 'range': (0x1780, 0x17FF), 'meaning': 'Khmer'},
    {'token': '[U_LAN]', 'range': (0x1800, 0x18AF), 'meaning': 'Mongolian'},
    {'token': '[U_LAN]', 'range': (0x18B0, 0x18FF), 'meaning': 'Undefined -> Unified Canadian Aboriginal Syllabics'},
    {'token': '[U_LAN]', 'range': (0x1900, 0x194F), 'meaning': 'Limbu'},
    {'token': '[U_LAN]', 'range': (0x1950, 0x197F), 'meaning': 'Tai Le'},
    {'token': '[U_LAN]', 'range': (0x1980, 0x19DF), 'meaning': 'Undefined -> New Tai Lue'},
    {'token': '[U_LAN]', 'range': (0x19E0, 0x19FF), 'meaning': 'Khmer Symbols'},
    {'token': '[U_LAN]', 'range': (0x1A00, 0x1CFF), 'meaning': 'Undefined -> Ol Chiki'},
    {'token': '[U_PHO]', 'range': (0x1D00, 0x1D7F), 'meaning': 'Phonetic Extensions'},
    {'token': '[U_PHO]', 'range': (0x1D80, 0x1DFF), 'meaning': 'Undefined -> Phonetic Extensions Supplement'},
    {'token': '[U_LAT]', 'range': (0x1E00, 0x1EFF), 'meaning': 'Latin Extended Additional'},
    {'token': '[U_GRE]', 'range': (0x1F00, 0x1FFF), 'meaning': 'Greek Extended'},
    {'token': '[U_SYM]', 'range': (0x2000, 0x206F), 'meaning': 'General Punctuation'},
    {'token': '[U_COM]', 'range': (0x2070, 0x209F), 'meaning': 'Superscripts and Subscripts'},
    {'token': '¤', 'range': (0x20A0, 0x20CF), 'meaning': 'Currency Symbols'},
    {'token': '[U_COM]', 'range': (0x20D0, 0x20FF), 'meaning': 'Combining Diacritical Marks for Symbols'},
    {'token': '[U_SYM]', 'range': (0x2100, 0x214F), 'meaning': 'Letterlike Symbols'},
    {'token': '[U_NUM]', 'range': (0x2150, 0x218F), 'meaning': 'Number Forms'},
    {'token': '[U_SYM]', 'range': (0x2190, 0x21FF), 'meaning': 'Arrows'},
    {'token': '[U_MAT]', 'range': (0x2200, 0x22FF), 'meaning': 'Mathematical Operators'},
    {'token': '[U_SYM]', 'range': (0x2300, 0x23FF), 'meaning': 'Miscellaneous Technical'},
    {'token': '[U_SYM]', 'range': (0x2400, 0x243F), 'meaning': 'Control Pictures'},
    {'token': '[U_SYM]', 'range': (0x2440, 0x245F), 'meaning': 'Optical Character Recognition'},
    {'token': '[U_NUM]', 'range': (0x2460, 0x24FF), 'meaning': 'Enclosed Alphanumerics'},
    {'token': '[U_SYM]', 'range': (0x2500, 0x257F), 'meaning': 'Box Drawing'},
    {'token': '[U_SYM]', 'range': (0x2580, 0x259F), 'meaning': 'Block Elements'},
    {'token': '[U_SYM]', 'range': (0x25A0, 0x25FF), 'meaning': 'Geometric Shapes'},
    {'token': '[U_SYM]', 'range': (0x2600, 0x26FF), 'meaning': 'Miscellaneous Symbols'},
    {'token': '[U_SYM]', 'range': (0x2700, 0x27BF), 'meaning': 'Dingbats'},
    {'token': '[U_MAT]', 'range': (0x27C0, 0x27EF), 'meaning': 'Miscellaneous Mathematical Symbols-A'},
    {'token': '[U_SYM]', 'range': (0x27F0, 0x27FF), 'meaning': 'Supplemental Arrows-A'},
    {'token': '[U_LAN]', 'range': (0x2800, 0x28FF), 'meaning': 'Braille Patterns'},
    {'token': '[U_SYM]', 'range': (0x2900, 0x297F), 'meaning': 'Supplemental Arrows-B'},
    {'token': '[U_MAT]', 'range': (0x2980, 0x29FF), 'meaning': 'Miscellaneous Mathematical Symbols-B'},
    {'token': '[U_MAT]', 'range': (0x2A00, 0x2AFF), 'meaning': 'Supplemental Mathematical Operators'},
    {'token': '[U_SYM]', 'range': (0x2B00, 0x2BFF), 'meaning': 'Miscellaneous Symbols and Arrows'},
    {'token': '[U_LAN]', 'range': (0x2C00, 0x2E7F), 'meaning': 'Undefined -> Coptic'},
    {'token': '[U_RAD]', 'range': (0x2E80, 0x2EFF), 'meaning': 'CJK Radicals Supplement'},
    {'token': '[U_RAD]', 'range': (0x2F00, 0x2FDF), 'meaning': 'Kangxi Radicals'},
    {'token': '[U_SYM]', 'range': (0x2FE0, 0x2FEF), 'meaning': 'Undefined -> Symbol'},
    {'token': '[U_SYM]', 'range': (0x2FF0, 0x2FFF), 'meaning': 'Ideographic Description Characters'},
    {'token': '[U_PUN]', 'range': (0x3000, 0x303F), 'meaning': 'CJK Symbols and Punctuation'},
    {'token': '[U_JAP]', 'range': (0x3040, 0x309F), 'meaning': 'Hiragana'},
    {'token': '[U_JAP]', 'range': (0x30A0, 0x30FF), 'meaning': 'Katakana'},
    {'token': '[U_PHO]', 'range': (0x3100, 0x312F), 'meaning': 'Bopomofo'},
    {'token': '[U_KOR]', 'range': (0x3130, 0x318F), 'meaning': 'Hangul Compatibility Jamo'},
    {'token': '[U_JAP]', 'range': (0x3190, 0x319F), 'meaning': 'Kanbun (Kunten)'},
    {'token': '[U_PHO]', 'range': (0x31A0, 0x31BF), 'meaning': 'Bopomofo Extended'},
    {'token': '[U_RAD]', 'range': (0x31C0, 0x31EF), 'meaning': 'Undefined -> CJK Strokes'},
    {'token': '[U_JAP]', 'range': (0x31F0, 0x31FF), 'meaning': 'Katakana Phonetic Extensions'},
    {'token': '[U_NUM]', 'range': (0x3200, 0x32FF), 'meaning': 'Enclosed CJK Letters and Months'},
    {'token': '[U_SYM]', 'range': (0x3300, 0x33FF), 'meaning': 'CJK Compatibility'},
    {'token': '[U_CHI]', 'range': (0x3400, 0x4DBF), 'meaning': 'CJK Unified Ideographs Extension A'},
    {'token': '[U_SYM]', 'range': (0x4DC0, 0x4DFF), 'meaning': 'Yijing Hexagram Symbols'},
    {'token': '[U_CHI]', 'range': (0x4E00, 0x9FAF), 'meaning': 'CJK Unified Ideographs'},
    {'token': '[U_CHI]', 'range': (0x9FB0, 0x9FFF), 'meaning': 'Undefined -> CJK Unified Ideographs'},
    {'token': '[U_LAN]', 'range': (0xA000, 0xA48F), 'meaning': 'Yi Syllables'},
    {'token': '[U_LAN]', 'range': (0xA490, 0xA4CF), 'meaning': 'Yi Radicals'},
    {'token': '[U_LAN]', 'range': (0xA4D0, 0xABFF), 'meaning': 'Undefined -> Cherokee'},
    {'token': '[U_KOR]', 'range': (0xAC00, 0xD7AF), 'meaning': 'Hangul Syllables'},
    {'token': '[U_KOR]', 'range': (0xD7B0, 0xD7FF), 'meaning': 'Undefined -> Hangul Jamo Extended-B'},
    {'token': '[UNK]', 'range': (0xD800, 0xDBFF), 'meaning': 'High Surrogate Area'},
    {'token': '[UNK]', 'range': (0xDC00, 0xDFFF), 'meaning': 'Low Surrogate Area'},
    {'token': '[UNK]', 'range': (0xE000, 0xF8FF), 'meaning': 'Private Use Area'},
    {'token': '[U_CHI]', 'range': (0xF900, 0xFAFF), 'meaning': 'CJK Compatibility Ideographs'},
    {'token': '[U_LAT]', 'range': (0xFB00, 0xFB4F), 'meaning': 'Alphabetic Presentation Forms'},
    {'token': '[U_ARA]', 'range': (0xFB50, 0xFDFF), 'meaning': 'Arabic Presentation Forms-A'},
    {'token': '[U_SYM]', 'range': (0xFE00, 0xFE0F), 'meaning': 'Variation Selectors'},
    {'token': '[U_PUN]', 'range': (0xFE10, 0xFE1F), 'meaning': 'Undefined -> Vertical Forms'},
    {'token': '[U_COM]', 'range': (0xFE20, 0xFE2F), 'meaning': 'Combining Half Marks'},
    {'token': '[U_PUN]', 'range': (0xFE30, 0xFE4F), 'meaning': 'CJK Compatibility Forms'},
    {'token': '[U_PUN]', 'range': (0xFE50, 0xFE6F), 'meaning': 'Small Form Variants'},
    {'token': '[U_ARA]', 'range': (0xFE70, 0xFEFF), 'meaning': 'Arabic Presentation Forms-B'},
    {'token': '[U_LAT]', 'range': (0xFF00, 0xFFEF), 'meaning': 'Halfwidth and Fullwidth Forms'},
    {'token': '[U_SYM]', 'range': (0xFFF0, 0xFFFF), 'meaning': 'Specials'},
    {'token': '[U_LAN]', 'range': (0x10000, 0x1007F), 'meaning': 'Linear B Syllabary'},
    {'token': '[U_LAN]', 'range': (0x10080, 0x100FF), 'meaning': 'Linear B Ideograms'},
    {'token': '[U_LAN]', 'range': (0x10100, 0x1013F), 'meaning': 'Aegean Numbers'},
    {'token': '[U_LAN]', 'range': (0x10140, 0x102FF), 'meaning': 'Undefined -> Carian'},
    {'token': '[U_LAN]', 'range': (0x10300, 0x1032F), 'meaning': 'Old Italic'},
    {'token': '[U_LAN]', 'range': (0x10330, 0x1034F), 'meaning': 'Gothic'},
    {'token': '[UNK]', 'range': (0x10350, 0x1037F), 'meaning': 'Undefined'},
    {'token': '[U_LAN]', 'range': (0x10380, 0x1039F), 'meaning': 'Ugaritic'},
    {'token': '[U_LAN]', 'range': (0x103A0, 0x103FF), 'meaning': 'Undefined -> Old Persian'},
    {'token': '[U_LAN]', 'range': (0x10400, 0x1044F), 'meaning': 'Deseret'},
    {'token': '[U_PHO]', 'range': (0x10450, 0x1047F), 'meaning': 'Shavian'},
    {'token': '[U_LAN]', 'range': (0x10480, 0x104AF), 'meaning': 'Osmanya'},
    {'token': '[U_LAN]', 'range': (0x104B0, 0x107FF), 'meaning': 'Undefined -> Osage'},
    {'token': '[U_LAN]', 'range': (0x10800, 0x1083F), 'meaning': 'Cypriot Syllabary'},
    {'token': '[U_LAN]', 'range': (0x10840, 0x1CFFF), 'meaning': 'Undefined -> Cuneiform, Chakma, Kharoshthi...'},
    {'token': '[U_LAN]', 'range': (0x1D000, 0x1D0FF), 'meaning': 'Byzantine Musical Symbols'},
    {'token': '[U_SYM]', 'range': (0x1D100, 0x1D1FF), 'meaning': 'Musical Symbols'},
    {'token': '[UNK]', 'range': (0x1D200, 0x1D2FF), 'meaning': 'Undefined'},
    {'token': '[U_SYM]', 'range': (0x1D300, 0x1D35F), 'meaning': 'Tai Xuan Jing Symbols'},
    {'token': '[UNK]', 'range': (0x1D360, 0x1D3FF), 'meaning': 'Undefined'},
    {'token': '[U_MAT]', 'range': (0x1D400, 0x1D7FF), 'meaning': 'Mathematical Alphanumeric Symbols'},
    {'token': '[U_LAN]', 'range': (0x1D800, 0x1F003), 'meaning': 'Undefined -> Adlam'},
    {'token': '[U_EMO]', 'range': (0x1F004, 0x1FAF8), 'meaning': 'Undefined -> Emoji'},
    {'token': '[U_SYM]', 'range': (0x1FAF9, 0x1FFFF), 'meaning': 'Undefined -> Symbols for Legacy Computing'},
    {'token': '[U_CHI]', 'range': (0x20000, 0x2A6DF), 'meaning': 'CJK Unified Ideographs Extension B'},
    {'token': '[U_CHI]', 'range': (0x2A6E0, 0x2F7FF), 'meaning': 'Undefined -> CJK Unified Ideographs Extension F...'},
    {'token': '[U_CHI]', 'range': (0x2F800, 0x2FA1F), 'meaning': 'CJK Compatibility Ideographs Supplement'},
    {'token': '[UNK]', 'range': (0x2FA20, 0x2FAAF), 'meaning': 'Undefined'},
    {'token': '[UNK]', 'range': (0x2FAB0, 0x2FFFF), 'meaning': 'Unused'},
    {'token': '[U_CHI]', 'range': (0x30000, 0x3134F), 'meaning': 'Unused -> CJK Unified Ideographs Extension G (unassigned)'},
    {'token': '[UNK]', 'range': (0x31350, 0xDFFFF), 'meaning': 'Unused'},
    {'token': '[U_SYM]', 'range': (0xE0000, 0xE007F), 'meaning': 'Tags'},
    {'token': '[UNK]', 'range': (0xE0080, 0xE00FF), 'meaning': 'Unused'},
    {'token': '[U_SYM]', 'range': (0xE0100, 0xE01EF), 'meaning': 'Variation Selectors Supplement'},
    {'token': '[UNK]', 'range': (0xE01F0, 0xEFFFF), 'meaning': 'Unused'},
    {'token': '[UNK]', 'range': (0xF0000, 0xFFFFD), 'meaning': 'Supplementary Private Use Area-A'},
    {'token': '[UNK]', 'range': (0xFFFFE, 0xFFFFF), 'meaning': 'Unused'},
    {'token': '[UNK]', 'range': (0x100000, 0x10FFFD), 'meaning': 'Supplementary Private Use Area-B'},
]

def get_unicode_ranges():
    """Return the inclusive right bounds of `unicode_map` as a sorted array.

    Also asserts that the blocks are contiguous (each one starts right after
    the previous one ends), so `np.searchsorted` over the right bounds yields
    the index of the block containing a given codepoint.
    """
    left_bounds = [m['range'][0] for m in unicode_map]
    right_bounds = [m['range'][1] for m in unicode_map]
    for right, left in zip(right_bounds[:-1], left_bounds[1:]):
        assert right + 1 == left
    return np.array(right_bounds)


def _is_chinese_char(cp):
    """Checks whether CP is the codepoint of a CJK character."""
    # Covers the CJK Unified Ideographs blocks (base block plus Extensions
    # A through E) and the two CJK Compatibility Ideographs blocks.
    if (
        (cp >= 0x4E00 and cp <= 0x9FFF)
        or (cp >= 0x3400 and cp <= 0x4DBF)
        or (cp >= 0x20000 and cp <= 0x2A6DF)
        or (cp >= 0x2A700 and cp <= 0x2B73F)
        or (cp >= 0x2B740 and cp <= 0x2B81F)
        or (cp >= 0x2B820 and cp <= 0x2CEAF)
        or (cp >= 0xF900 and cp <= 0xFAFF)
        or (cp >= 0x2F800 and cp <= 0x2FA1F)
    ):
        return True
    return False
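
# A minimal illustration of how the pieces above combine. This helper is not
# part of the original API -- it is a hypothetical sketch showing how a raw
# codepoint is mapped to its block token via binary search over the right
# bounds returned by get_unicode_ranges().
def _codepoint_to_block_token(cp, ranges=None):
    """Hypothetical example: map a codepoint to its `unicode_map` token."""
    if ranges is None:
        ranges = get_unicode_ranges()
    if cp > 0x10FFFD:  # beyond the last mapped range
        return '[UNK]'
    # searchsorted returns the first index i with ranges[i] >= cp, which is
    # exactly the block containing cp because the blocks are contiguous.
    return unicode_map[int(np.searchsorted(ranges, cp))]['token']

# For example, _codepoint_to_block_token(ord('Я')) == '[U_RUS]' and
# _codepoint_to_block_token(0x1F600) == '[U_EMO]'.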

def show_unicode(start=0x1F004, end=0x1FAF8):
    """Print every character in the inclusive codepoint range [start, end].

    Debugging helper; the defaults cover the emoji range of `unicode_map`.
    """
    for i in range(start, end + 1):  # +1 because `end` is inclusive
        print(chr(i), end=' ')
    print()


def load_json(file):
    """Load and return a JSON object from `file`."""
    with open(file, 'r', encoding='utf-8') as f:
        return json.load(f)

class ChineseCharTokenizer(BertTokenizer):
    vocab_files_names = {"vocab_file": "vocab.txt", "mapping_file": "replace.json"}

    def __init__(self, vocab_file, *args, **kwargs):
        super().__init__(vocab_file, *args, **kwargs)
        self.unicode_ranges = get_unicode_ranges()
        # Special tokens of the form "[...]", excluding BERT's "[unusedNN]" slots.
        self.enclosed_tokens = {
            token for token in self.vocab
            if token.startswith('[') and token.endswith(']') and 'unused' not in token
        }
        # Bucket the enclosed tokens by length (5, 6 and 7 characters) so that
        # _tokenize can match them greedily with O(1) lookups.
        self.enclosed_tokens_by_len = [
            {token for token in self.enclosed_tokens if len(token) == n}
            for n in (5, 6, 7)
        ]
        self.dir = os.path.dirname(vocab_file)
        self.replace_map = load_json(os.path.join(self.dir, 'replace.json'))
    def convert_token_to_representative(self, token: str) -> str:
        """Map a raw token to a token that exists in the vocabulary."""
        # Apply explicit replacements first (e.g. normalized variant forms).
        token = self.replace_map.get(token, token)
        if token in self.vocab:
            return token
        # Anything not in the vocab must be a single character at this point.
        assert len(token) == 1, token
        if re.match(r'\s', token):
            return ' '
        v = ord(token)
        if _is_chinese_char(v):
            return '[U_CHI]'
        elif v <= 0x10FFFD:
            # Binary search over the contiguous block bounds to find the
            # placeholder token of the Unicode block containing `v`.
            i = np.searchsorted(self.unicode_ranges, v)
            return unicode_map[i]['token']
        else:
            return '[UNK]'
    def _tokenize(self, text):
        """Greedily match enclosed special tokens, then fall back per character."""
        split_tokens = []
        i = 0
        while i < len(text):
            if text[i:i + 5] in self.enclosed_tokens_by_len[0]:
                split_tokens.append(text[i:i + 5])
                i += 5
            elif text[i:i + 6] in self.enclosed_tokens_by_len[1]:
                # Length-6 tokens such as '[MASK]'.
                split_tokens.append(text[i:i + 6])
                i += 6
            elif text[i:i + 7] in self.enclosed_tokens_by_len[2]:
                split_tokens.append(text[i:i + 7])
                i += 7
            else:
                split_tokens.append(self.convert_token_to_representative(text[i]))
                i += 1
        return split_tokens
    def _convert_token_to_id(self, token):
        # Reduce the token to its in-vocabulary representative before lookup.
        return self.vocab.get(self.convert_token_to_representative(token), self.vocab.get(self.unk_token))

    def convert_tokens_to_string(self, tokens):
        # Tokens are single characters (or special tokens), so detokenization
        # is plain concatenation.
        return ''.join(tokens)

    def save_pretrained(self, save_directory: Union[str, os.PathLike], legacy_format: Optional[bool] = None, filename_prefix: Optional[str] = None, push_to_hub: bool = False, **kwargs) -> Tuple[str]:
        ret = super().save_pretrained(save_directory, legacy_format=legacy_format, filename_prefix=filename_prefix, push_to_hub=push_to_hub, **kwargs)
        # Ship the replacement map and this module alongside the vocabulary so
        # the tokenizer can be reloaded from the saved directory.
        shutil.copyfile(os.path.join(self.dir, 'replace.json'), os.path.join(save_directory, 'replace.json'))
        shutil.copyfile(os.path.join(self.dir, 'cctokenizer.py'), os.path.join(save_directory, 'cctokenizer.py'))
        return ret
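
# A minimal usage sketch, assuming a local directory containing the vocab.txt
# and replace.json files this tokenizer expects (the path is hypothetical).
if __name__ == "__main__":
    tokenizer = ChineseCharTokenizer.from_pretrained("./chinese-char-tokenizer")
    # Out-of-vocabulary scripts collapse to their block placeholder tokens,
    # e.g. Cyrillic characters -> '[U_RUS]'.
    print(tokenizer.tokenize("我爱Python! Привет"))
    print(tokenizer("我爱Python!")["input_ids"])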