Spaces: Sleeping
Add more percentiles and BAAI/bge preset tokenizers
Browse files
app.py
CHANGED
@@ -12,6 +12,8 @@ from transformers import AutoTokenizer
|
|
12 |
|
13 |
tokenizers = {
|
14 |
"bert": "google-bert/bert-base-uncased",
|
|
|
|
|
15 |
"blenderbot": "facebook/blenderbot-3B",
|
16 |
"bloom": "bigscience/bloom-560m",
|
17 |
"bloomz": "bigscience/bloomz-7b1",
|
@@ -65,12 +67,12 @@ def count(model_id, dataset_id, config, split, column, add_special_tokens=True):
|
|
65 |
# not 100% accurate but good enough
|
66 |
words = re.findall(pattern, item[column])
|
67 |
wordcounter.append(len(words))
|
68 |
-
|
69 |
-
df = pd.DataFrame(tokencounter).describe().T
|
70 |
df.insert(0, "type", "tokens")
|
71 |
-
dfc = pd.DataFrame(charcounter).describe().T
|
72 |
dfc.insert(0, "type", "chars")
|
73 |
-
dfw = pd.DataFrame(wordcounter).describe().T
|
74 |
dfw.insert(0, "type", "words")
|
75 |
df.loc[-1] = dfw.values[0]
|
76 |
df.index = df.index + 1 # shifting index
|
@@ -105,6 +107,7 @@ demo = gr.Interface(
|
|
105 |
["tiiuae/falcon-7b", "imdb", "", "test", "text"],
|
106 |
["tiiuae/falcon-7b", "wikitext", "wikitext-2-v1", "validation", "text"],
|
107 |
["tiiuae/falcon-7b", "zeroshot/twitter-financial-news-sentiment", "", "validation", "text"],
|
|
|
108 |
],
|
109 |
cache_examples=True,
|
110 |
)
|
|
|
12 |
|
13 |
tokenizers = {
|
14 |
"bert": "google-bert/bert-base-uncased",
|
15 |
+
"bge-en": "BAAI/bge-base-en-v1.5",
|
16 |
+
"bge-zh": "BAAI/bge-base-zh-v1.5",
|
17 |
"blenderbot": "facebook/blenderbot-3B",
|
18 |
"bloom": "bigscience/bloom-560m",
|
19 |
"bloomz": "bigscience/bloomz-7b1",
|
|
|
67 |
# not 100% accurate but good enough
|
68 |
words = re.findall(pattern, item[column])
|
69 |
wordcounter.append(len(words))
|
70 |
+
percentiles = [0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
|
71 |
+
df = pd.DataFrame(tokencounter).describe(percentiles=percentiles).T
|
72 |
df.insert(0, "type", "tokens")
|
73 |
+
dfc = pd.DataFrame(charcounter).describe(percentiles=percentiles).T
|
74 |
dfc.insert(0, "type", "chars")
|
75 |
+
dfw = pd.DataFrame(wordcounter).describe(percentiles=percentiles).T
|
76 |
dfw.insert(0, "type", "words")
|
77 |
df.loc[-1] = dfw.values[0]
|
78 |
df.index = df.index + 1 # shifting index
|
|
|
107 |
["tiiuae/falcon-7b", "imdb", "", "test", "text"],
|
108 |
["tiiuae/falcon-7b", "wikitext", "wikitext-2-v1", "validation", "text"],
|
109 |
["tiiuae/falcon-7b", "zeroshot/twitter-financial-news-sentiment", "", "validation", "text"],
|
110 |
+
["BAAI/bge-base-en-v1.5", "PolyAI/banking77", "", "test", "text"],
|
111 |
],
|
112 |
cache_examples=True,
|
113 |
)
|