xu-song committed on
Commit
428b731
1 Parent(s): 751936e
.gitignore CHANGED
@@ -13,4 +13,5 @@ dist/
 downloads/
 eggs/
 .eggs/
-.idea/
+.idea/
+gradio_cached_examples
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: The Tokenizer Playground
+title: Tokenizer Arena
 emoji: ⚡
 colorFrom: red
 colorTo: gray
@@ -10,3 +10,10 @@ pinned: false
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+## ss
+
+
+## ss
+
app.py CHANGED
@@ -9,7 +9,10 @@ plots
 table
 
 ## related demo
-http://text-processing.com/demo/tokenize/
+- [](http://text-processing.com/demo/tokenize/)
+- [gpt-tokenizer](https://gpt-tokenizer.dev/)
+- [llama-tokenizer-js](https://belladoreai.github.io/llama-tokenizer-js/example-demo/build/)
+- [](https://huggingface.co/spaces/Xenova/the-tokenizer-playground)
 
 ## Visualization
 
@@ -28,15 +31,28 @@ css = """
 .space-show {white-space: pre-wrap;}
 .cell-wrap {white-space: pre-wrap;}
 .category-legend {display: none !important}
+.statistics textarea {min-width: min(50px,100%) !important; font-size: 20px !important; font-weight: 600 !important; text-align: center !important; border: none !important;}
+.statistics label {text-align: center !important;}
 """
 
-example_text = """中文测试:华为智能音箱发布:华为Sound X。維基百科由非營利組織──維基媒體基金會負責維持
-标点测试:,。!?;
-空格测试: 2个空格 8个空格
-数字测试:(10086 + 98) = 100184"""
+example_text = """Replace this text in the input field to see how tokenization works
+华为智能音箱发布:华为Sound X"""
 
+# llama  chatglm_6b  gpt_nexo_20b  baichuan  baichuan_7b
+examples = [
+    ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"],  # chatglm has blank_n tokens
+    ["标点测试:,。!?;", "baichuan_7b", "llama"],
+    ["符号测试:🦙", "baichuan_7b", "llama"],
+    ["中文测试:🦙", "baichuan_7b", "llama"],
+    ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
+]
 
-def tokenize(text, tokenizer_type):
+
+
+def tokenize(text, tokenizer_type, color_num=5):
+    """
+    TODO: cache tokenizer
+    """
     print(text, tokenizer_type)
     pos_tokens = []
     tokenizer = load_tokener(tokenizer_type)
@@ -46,12 +62,17 @@ def tokenize(text, tokenizer_type):
 
     for idx, token_id in enumerate(encoding):
         decode_text = tokenizer.decode([token_id])  # special tokens all decode to �, i.e. "\ufffd"
-        pos_tokens.extend([(decode_text, str(idx % 3))])
+        pos_tokens.extend([(decode_text, str(idx % color_num))])
 
         # token "Byte":  # presumably the UTF-8 encoding?
         token = tokenizer.convert_ids_to_tokens([token_id])[0]
         if isinstance(token, bytes):
-            token_str = token.decode("utf-8")
+            try:
+                token_str = token.decode("utf-8")
+            except:
+                token_str = token.decode("utf-8", errors="ignore")
+                print("decode_error", token, token_str)
+
             token_bytes = token
             json_dumps = json.dumps(token_str)
         elif isinstance(token, str):
@@ -61,9 +82,11 @@ def tokenize(text, tokenizer_type):
         else:
             return
 
+
+
         table.append(
             {"TokenID": token_id,
              "Token": token_str,  # UTF-8 decoded string; why are some shown as <0xE7>, and what does that mean? e.g. llama
              "Text": decode_text,  #
              # "Bytes": token_bytes,  # bytes get decoded to a string in the gradio frontend, e.g. b'\xe4\xb8\xad' is still shown as "中", hence str(token_bytes)
              "Bytes": str(token_bytes),
@@ -73,74 +96,148 @@ def tokenize(text, tokenizer_type):
 
     table_df = pd.DataFrame(table)
     print(table)
-    print(table_df)
+    # print(table_df)
+
+    return pos_tokens, table_df, len(encoding)
+
 
-    return pos_tokens, table_df
+def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
+    pos_tokens_1, table_df_1, token_size_1 = tokenize(text, tokenizer_type_1)
+    pos_tokens_2, table_df_2, token_size_2 = tokenize(text, tokenizer_type_2)
+    return pos_tokens_1, table_df_1, token_size_1, pos_tokens_2, table_df_2, token_size_2
 
 
+def get_vocab_size(tokenizer_type):
+    tokenizer = load_tokener(tokenizer_type)
+    return tokenizer.vocab_size
+
 def test_coding():
     bytes1 = b'\xe4\xb8\xad'
     print(bytes1)  # b'\xe4\xb8\xad'
 
 
 with gr.Blocks(css=css) as demo:
-    gr.HTML("""<h1 align="center">Tokenizer Arena</h1>""")
+    gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
     # links: https://www.coderstool.com/utf8-encoding-decoding
+    # Function: enter text and tokenize it
+    # Tokenizers: several common tokenizers are included
+    # Background: make tokenization easy, inspect token granularity, compare tokenizers
     #
+    # Byte: indicates the tokenization
 
 
+    gr.Markdown("## Input Text")
     user_input = gr.Textbox(
         value=example_text,
-        lines=5
+        label="Input Text",
+        lines=5,
+        show_label=False,
     )  # placeholder="Enter sentence here..."
 
     # submitBtn = gr.Button("生成回复", variant="primary")
 
+    gr.Markdown("## Tokenization")
+
+    with gr.Row():
+        with gr.Column(scale=6):
+            with gr.Group():
+                tokenizer_type_1 = gr.Dropdown(
+                    all_tokenizers,
+                    value="llama",
+                    label="Tokenizer 1",
+                )
+            with gr.Group():
+                """
+                <div class="stat"><div class="stat-value">69</div><div class="stat-label">Characters</div></div>
+                """
+                with gr.Row():
+                    stats_vocab_size_1 = gr.TextArea(
+                        label="VocabSize",
+                        lines=1,
+                        elem_classes="statistics"
+                    )
+                    stats_token_size_1 = gr.TextArea(
+                        label="Tokens",
+                        lines=1,
+                        elem_classes="statistics"
+                    )
+                    stats_3 = gr.TextArea(
+                        label="Compress Rate",
+                        lines=1,
+                        elem_classes="statistics"
+                    )
+        # https://www.onlinewebfonts.com/icon/418591
+        gr.Image("images/VS.svg", scale=1, show_label=False, show_download_button=False, container=False)  # height=10,
+        with gr.Column(scale=6):
+            with gr.Group():
+                tokenizer_type_2 = gr.Dropdown(
+                    all_tokenizers,
+                    value="baichuan_7b",
+                    label="Tokenizer 2",
+                )
+            with gr.Group():
+                with gr.Row():
+                    stats_vocab_size_2 = gr.TextArea(
+                        label="VocabSize",
+                        lines=1,
+                        elem_classes="statistics"
+                    )
+                    stats_token_size_2 = gr.TextArea(
+                        label="Tokens",
+                        lines=1,
+                        elem_classes="statistics"
+                    )
+                    stats_6 = gr.TextArea(
+                        label="Compress Rate",
+                        lines=1,
+                        elem_classes="statistics"
+                    )
+
+
+
     # TODO: plots, tables, compression rate
-    # llama  chatglm  gpt_nexo_20b  baichuan  baichuan_7b
     with gr.Row():
         with gr.Column():
-            tokenizer_type_1 = gr.Dropdown(
-                all_tokenizers, value="llama", label="tokenizer"
-            )
-            token_counter_1 = None  # counter
             output_text_1 = gr.Highlightedtext(
-                label="Tokenization",
+                label="Tokens 1",
                 show_legend=True,
                 elem_classes="space-show"
             )
-
-            output_table_1 = gr.Dataframe(
-                headers=["TokenID", "Byte", "Text"],
-                datatype=["str", "str", "str"],
-                # elem_classes="space-show",  # applying this css to the whole Dataframe has no effect, so cell-wrap is modified directly
-            )
-
         with gr.Column():
-            tokenizer_type_2 = gr.Dropdown(
-                all_tokenizers, value="baichuan_7b", label="tokenizer"
-            )
-            token_counter_2 = None  # counter
             output_text_2 = gr.Highlightedtext(
-                label="Tokenization",
+                label="Tokens 2",
                 show_legend=True,
                 elem_classes="space-show"
             )
 
-            output_table_2 = gr.Dataframe(
-                headers=["TokenID", "Token", "Text"],
-                datatype=["str", "str", "str"],
-            )
+    with gr.Row():
+        output_table_1 = gr.Dataframe(
+            headers=["TokenID", "Byte", "Text"],
+            datatype=["str", "str", "str"],
+            # elem_classes="space-show",  # applying this css to the whole Dataframe has no effect, so cell-wrap is modified directly
+        )
+        output_table_2 = gr.Dataframe(
+            headers=["TokenID", "Token", "Text"],
+            datatype=["str", "str", "str"],
+        )
+
+    tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1, stats_token_size_1])
+    tokenizer_type_1.change(get_vocab_size, [tokenizer_type_1], [stats_vocab_size_1])
+
+    user_input.change(tokenize_pair,
+                      [user_input, tokenizer_type_1, tokenizer_type_2],
+                      [output_text_1, output_table_1, stats_token_size_1, output_text_2, output_table_2, stats_token_size_2])
 
-    user_input.change(tokenize,
-                      [user_input, tokenizer_type_1],
-                      [output_text_1, output_table_1])
-    tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1])
+    tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2], [output_text_2, output_table_2, stats_token_size_2])
+    tokenizer_type_2.change(get_vocab_size, [tokenizer_type_2], [stats_vocab_size_2])
 
-    user_input.change(tokenize,
-                      [user_input, tokenizer_type_2],
-                      [output_text_2, output_table_2])
-    tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2], [output_text_2, output_table_2])
+    gr.Examples(
+        examples,
+        [user_input, tokenizer_type_1, tokenizer_type_2],
+        [output_text_1, output_table_1, stats_token_size_1, output_text_2, output_table_2, stats_token_size_2],
+        tokenize_pair,
+        cache_examples=True,
+    )
 
     # submitBtn.click(tokenize, [user_input, tokenizer_type], outputs,
     #                 show_progress=True)
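Note on the try/except added around `token.decode("utf-8")` in `tokenize()`: byte-level BPE pieces are not guaranteed to align with UTF-8 character boundaries, so a single token's bytes can be an incomplete sequence. A minimal illustration in plain Python (not part of the commit; the byte values are only an example):

```python
# First two bytes of "中" (b'\xe4\xb8\xad') — an incomplete UTF-8 sequence,
# which is exactly what a byte-level BPE token may legitimately contain.
token = b"\xe4\xb8"

try:
    token.decode("utf-8")                           # raises UnicodeDecodeError
except UnicodeDecodeError:
    print(token.decode("utf-8", errors="ignore"))   # ''   -> the fallback used in tokenize()
    print(token.decode("utf-8", errors="replace"))  # '�'  -> the "\ufffd" mentioned in the decode comment
```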
app_v1.py ADDED
@@ -0,0 +1,196 @@
+# coding=utf-8
+# author: xusong
+# time: 2022/8/23 16:06
+
+"""
+
+plots
+
+table
+
+## related demo
+- [](http://text-processing.com/demo/tokenize/)
+- [gpt-tokenizer](https://gpt-tokenizer.dev/)
+- [llama-tokenizer-js](https://belladoreai.github.io/llama-tokenizer-js/example-demo/build/)
+- [](https://huggingface.co/spaces/Xenova/the-tokenizer-playground)
+
+## Visualization
+
+[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
+"""
+
+import json
+import pandas as pd
+import gradio as gr
+
+from vocab import all_tokenizers, load_tokener
+
+# show whitespace: https://blog.csdn.net/liuxiao723846/article/details/118994673
+# hide the legend:
+css = """
+.space-show {white-space: pre-wrap;}
+.cell-wrap {white-space: pre-wrap;}
+.category-legend {display: none !important}
+"""
+
+example_text = """Replace this text in the input field to see how tokenization works
+中文测试:华为智能音箱发布:华为Sound X。維基百科由非營利組織──維基媒體基金會負責維持
+数字测试:(10086 + 98) = 100184"""
+
+# llama  chatglm_6b  gpt_nexo_20b  baichuan  baichuan_7b
+examples = [
+    # ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"],  # chatglm has blank_n tokens
+    ["标点测试:,。!?;", "baichuan_7b", "llama"],
+    ["标点测试:🦙", "baichuan_7b", "llama"],
+]
+
+
+def tokenize(text, tokenizer_type, color_num=5):
+    print(text, tokenizer_type)
+    pos_tokens = []
+    tokenizer = load_tokener(tokenizer_type)
+    encoding = tokenizer.encode(text)
+
+    table = []
+
+    for idx, token_id in enumerate(encoding):
+        decode_text = tokenizer.decode([token_id])  # special tokens all decode to �, i.e. "\ufffd"
+        pos_tokens.extend([(decode_text, str(idx % color_num))])
+
+        # token "Byte":  # presumably the UTF-8 encoding?
+        token = tokenizer.convert_ids_to_tokens([token_id])[0]
+        if isinstance(token, bytes):
+            try:
+                token_str = token.decode("utf-8")
+            except:
+                token_str = token.decode("utf-8", errors="ignore")
+                print("decode_error", token, token_str)
+
+            token_bytes = token
+            json_dumps = json.dumps(token_str)
+        elif isinstance(token, str):
+            token_str = token
+            token_bytes = bytes(token_str, "utf-8")
+            json_dumps = json.dumps(token_str)
+        else:
+            return
+
+        table.append(
+            {"TokenID": token_id,
+             "Token": token_str,  # UTF-8 decoded string; why are some shown as <0xE7>, and what does that mean? e.g. llama
+             "Text": decode_text,  #
+             # "Bytes": token_bytes,  # bytes get decoded to a string in the gradio frontend, e.g. b'\xe4\xb8\xad' is still shown as "中", hence str(token_bytes)
+             "Bytes": str(token_bytes),
+             # "Unicode": json_dumps  # unicode; ASCII is shown as-is, non-ASCII is shown as a unicode escape
+             }
+        )
+
+    table_df = pd.DataFrame(table)
+    print(table)
+    # print(table_df)
+
+    return pos_tokens, table_df
+
+
+def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
+    pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1)
+    pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2)
+    return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
+
+
+def test_coding():
+    bytes1 = b'\xe4\xb8\xad'
+    print(bytes1)  # b'\xe4\xb8\xad'
+
+
+with gr.Blocks(css=css) as demo:
+    gr.HTML("""<h1 align="center">The Tokenizer Arena</h1>""")
+    # links: https://www.coderstool.com/utf8-encoding-decoding
+    #
+
+
+    gr.Markdown("## Input Text")
+    user_input = gr.Textbox(
+        value=example_text,
+        label="Input Text",
+        lines=5
+    )  # placeholder="Enter sentence here..."
+
+    # submitBtn = gr.Button("生成回复", variant="primary")
+
+    gr.Markdown("## Tokenization")
+
+    # with gr.Row():
+
+
+    # TODO: plots, tables, compression rate
+    with gr.Row():
+        with gr.Column():
+            tokenizer_type_1 = gr.Dropdown(
+                all_tokenizers,
+                value="llama",
+                label="Tokenizer 1",
+            )
+            token_counter_1 = None  # counter
+            output_text_1 = gr.Highlightedtext(
+                label="Tokens 1",
+                show_legend=True,
+                elem_classes="space-show"
+            )
+
+        with gr.Column():
+            tokenizer_type_2 = gr.Dropdown(
+                all_tokenizers,
+                value="baichuan_7b",
+                label="Tokenizer 2"
+            )
+            token_counter_2 = None  # counter
+            output_text_2 = gr.Highlightedtext(
+                label="Tokens 2",
+                show_legend=True,
+                elem_classes="space-show"
+            )
+
+    with gr.Row():
+        output_table_1 = gr.Dataframe(
+            headers=["TokenID", "Byte", "Text"],
+            datatype=["str", "str", "str"],
+            # elem_classes="space-show",  # applying this css to the whole Dataframe has no effect, so cell-wrap is modified directly
+        )
+        output_table_2 = gr.Dataframe(
+            headers=["TokenID", "Token", "Text"],
+            datatype=["str", "str", "str"],
+        )
+
+    user_input.change(tokenize,
+                      [user_input, tokenizer_type_1],
+                      [output_text_1, output_table_1])
+    tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1])
+
+    user_input.change(tokenize,
+                      [user_input, tokenizer_type_2],
+                      [output_text_2, output_table_2])
+
+    tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2], [output_text_2, output_table_2])
+
+    gr.Examples(
+        examples,
+        [user_input, tokenizer_type_1, tokenizer_type_2],
+        [output_text_1, output_table_1, output_text_2, output_table_2],
+        tokenize_pair,
+        cache_examples=True,
+    )
+
+    # submitBtn.click(tokenize, [user_input, tokenizer_type], outputs,
+    #                 show_progress=True)
+
+    # examples=[
+    #     ["What a beautiful morning for a walk!"],
+    #     ["It was the best of times, it was the worst of times."],
+    #     ["多个空格 It ss was the best of times, it was the worst of times."],
+    # ]
+
+if __name__ == "__main__":
+    demo.launch()
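Both versions register the examples with `cache_examples=True`. Spelled out with keyword arguments (a readability sketch assuming the Gradio 3.x signature, where the positional order is `examples, inputs, outputs, fn`), the `gr.Examples` call in the new `app.py` reads as below; the cached outputs are written to a local `gradio_cached_examples/` directory, which is presumably why that folder was added to `.gitignore` in this commit.

```python
# Keyword-argument form of the positional gr.Examples(...) call in app.py
# (sketch only, assuming the Gradio 3.x signature):
gr.Examples(
    examples=examples,                            # rows of [text, tokenizer_1, tokenizer_2]
    inputs=[user_input, tokenizer_type_1, tokenizer_type_2],
    outputs=[output_text_1, output_table_1, stats_token_size_1,
             output_text_2, output_table_2, stats_token_size_2],
    fn=tokenize_pair,                             # run once per example row to build the cache
    cache_examples=True,                          # results land in ./gradio_cached_examples/
)
```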
images/VS.svg ADDED
tokenizer.py ADDED
File without changes
vocab/__init__.py CHANGED
@@ -1,7 +1,18 @@
-import transformers
 import importlib
 from enum import Enum, auto
 
+
+"""
+Interface:
+    -
+
+    tokenizer.parent = ""
+    tokenizer.type = TokenizerType.ByteBPE.name
+    tokenizer.implementation = TokenizerImpl.SentencePiece.name  # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
+    tokenizer.comments = "split all numbers into individual digits, " \
+                         "and fallback to bytes to decompose unknown UTF-8 characters"
+"""
+
 Animal = Enum('Animal', 'ANT BEE CAT DOG')
 
 uniq_tokenizers = [
@@ -29,7 +40,7 @@ all_tokenizers = [
     #
     # ##### GLM family
     # "glm_chinese",
-    "chatglm",
+    "chatglm_6b",
     #
     # #### llama / alpaca family
     "llama",  # single-character Chinese tokens: 700, multi-character Chinese tokens: 0
vocab/baichuan_7b/__init__.py CHANGED
@@ -6,3 +6,6 @@ tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan-7B", trust_remo
 
 # byte-bpe sentencepiece
 tokenizer.type = TokenizerType.ByteBPE
+
+tokenizer.comments = "使用 SentencePiece 中的 Byte-Pair Encoding (BPE) 作为分词算法"
+
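The hunk header shows this wrapper is built on `AutoTokenizer.from_pretrained("baichuan-inc/Baichuan-7B", trust_remote_code=True)`. A quick way to look at the byte-fallback pieces that the `ByteBPE` annotation (and the `<0xE7>`-style tokens questioned in `app.py`) refers to — sketch only, it downloads the tokenizer and trusts its remote code:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan-7B", trust_remote_code=True)
ids = tokenizer.encode("华为Sound X 🦙")
# SentencePiece BPE pieces; characters missing from the vocabulary
# are decomposed into <0x..> byte tokens by the byte fallback.
print(tokenizer.convert_ids_to_tokens(ids))
```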
vocab/{chatglm → chatglm_6b}/README.md RENAMED
File without changes
vocab/{chatglm → chatglm_6b}/__init__.py RENAMED
File without changes
vocab/{chatglm → chatglm_6b}/chatglm.vocab RENAMED
File without changes
vocab/{chatglm → chatglm_6b}/test_chatglm.py RENAMED
File without changes
vocab/{chatglm → chatglm_6b}/tokenizer/config.json RENAMED
File without changes
vocab/{chatglm → chatglm_6b}/tokenizer/ice_text.model RENAMED
File without changes
vocab/{chatglm → chatglm_6b}/tokenizer/tokenization_chatglm.py RENAMED
File without changes
vocab/{chatglm → chatglm_6b}/tokenizer/tokenizer_config.json RENAMED
File without changes
vocab/gpt_35_turbo/__init__.py CHANGED
@@ -16,7 +16,11 @@ def decode(self, tokens, errors="replace"):
     decode_str = "null"
     return decode_str
 
+def convert_ids_to_tokens(self, tokens):
+    return tokenizer.decode_tokens_bytes(tokens)
+
 
 Encoding.decode = decode
+Encoding.convert_ids_to_tokens = convert_ids_to_tokens
 
 
vocab/gpt_35_turbo/test2.py CHANGED
@@ -22,6 +22,10 @@ print(decoding_bytes)
 # print(token, token_str, json.dumps(token_str))
 
 
+tokenizer.decode_tokens_bytes([10])
+tokenizer.decode_single_token_bytes(10)
+tokenizer.decode_bytes([10])
+
 f_out = open("vocab.jsonl", "w")
 # 100255
 for i in range(tokenizer.n_vocab):
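The three calls added to `test2.py` exercise tiktoken's byte-level accessors; they are also what the new `convert_ids_to_tokens` patch in `vocab/gpt_35_turbo/__init__.py` delegates to, so the tiktoken `Encoding` object can be driven through the same interface that `tokenize()` in `app.py` expects from a Hugging Face tokenizer. A short sketch of what they return (token ids and boundaries depend on the vocabulary):

```python
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")    # vocabulary used by gpt-3.5-turbo
ids = enc.encode("华为Sound X")

print(enc.decode_tokens_bytes(ids))           # list of bytes objects, one per token;
                                              # a multi-byte character may be split across entries
print(enc.decode_single_token_bytes(ids[0]))  # raw bytes of a single token id
print(enc.decode_bytes(ids))                  # all bytes concatenated == "华为Sound X".encode("utf-8")
```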
vocab/{bert_kplug → kplug}/README.md RENAMED
File without changes
vocab/kplug/__init__.py ADDED
File without changes
vocab/{bert_kplug → kplug}/bpe_oov.py RENAMED
File without changes
vocab/{bert_kplug → kplug}/bpe_oov2.py RENAMED
File without changes
vocab/{bert_kplug → kplug}/jd_vocab.py RENAMED
File without changes
vocab/{bert_kplug → kplug}/langconv.py RENAMED
File without changes
vocab/{bert_kplug → kplug}/test_langconv.py RENAMED
File without changes
vocab/{bert_kplug → kplug}/vocab.jd.txt RENAMED
File without changes
vocab/{bert_kplug → kplug}/vocab.jd.txt.v2 RENAMED
File without changes
vocab/{bert_kplug → kplug}/zh_wiki.py RENAMED
File without changes