eduagarcia committed · commit 91c6e89 · 1 parent: c8b2c09

change 'proprietary' models to 'external' models and added new models
proprietary_models_results.json → external_models_results.json RENAMED

@@ -6,6 +6,7 @@
     "date": "2024-04-12",
     "status": "full",
     "main_language": "Portuguese",
+    "model_type": "proprietary",
     "result_metrics": {
       "enem_challenge": 0.7172848145556333,
       "bluex": 0.5549374130737135,
@@ -27,6 +28,7 @@
     "date": "2024-04-13",
     "status": "full",
     "main_language": "Portuguese",
+    "model_type": "proprietary",
     "result_metrics": {
       "enem_challenge": 0.8180545836249126,
       "bluex": 0.717663421418637,
@@ -48,6 +50,7 @@
     "date": "2024-03-08",
     "status": "full",
     "main_language": "English",
+    "model_type": "proprietary",
     "result_metrics": {
       "enem_challenge": 0.7214835549335199,
       "bluex": 0.6244784422809457,
@@ -69,6 +72,7 @@
     "date": "2024-04-13",
     "status": "full",
     "main_language": "English",
+    "model_type": "proprietary",
     "result_metrics": {
       "enem_challenge": 0.7718684394681595,
       "bluex": 0.6662030598052852,
@@ -90,6 +94,7 @@
     "date": "2024-03-08",
     "status": "full",
     "main_language": "English",
+    "model_type": "proprietary",
     "result_metrics": {
       "enem_challenge": 0.7130860741777467,
       "bluex": 0.5869262865090403,
@@ -111,6 +116,7 @@
     "date": "2024-04-15",
     "status": "full",
     "main_language": "English",
+    "model_type": "proprietary",
     "result_metrics": {
       "enem_challenge": 0.8509447165850245,
       "bluex": 0.7719054242002782,
@@ -132,6 +138,7 @@
     "date": "2024-05-18",
     "status": "full",
     "main_language": "English",
+    "model_type": "proprietary",
     "result_metrics": {
       "enem_challenge": 0.7844646606018194,
       "bluex": 0.6954102920723226,
@@ -153,6 +160,7 @@
     "date": "2024-05-18",
     "status": "full",
     "main_language": "English",
+    "model_type": "proprietary",
     "result_metrics": {
       "enem_challenge": 0.8264520643806857,
       "bluex": 0.7482614742698191,
@@ -166,5 +174,72 @@
     },
     "result_metrics_average": 0.7914657682594597,
     "result_metrics_npm": 0.6834036936130392
+  },
+  {
+    "model": "gemini-1.5-flash",
+    "name": "Gemini 1.5 Flash",
+    "link": "https://cloud.google.com/vertex-ai",
+    "date": "2024-08-09",
+    "status": "full",
+    "main_language": "English",
+    "model_type": "proprietary",
+    "result_metrics": {
+      "enem_challenge": 0.8306508047585724,
+      "bluex": 0.7579972183588317,
+      "oab_exams": 0.6446469248291572,
+      "assin2_sts": 0.838806085610371,
+      "assin2_rte": 0.9366169973822607,
+      "faquad_nli": 0.7963910785668922,
+      "hatebr_offensive": 0.9092078461170015,
+      "portuguese_hate_speech": 0.6932563987219857,
+      "tweetsentbr": 0.7312948963367732
+    },
+    "result_metrics_average": 0.7932075834090939,
+    "result_metrics_npm": 0.6855338135928848
+  },
+  {
+    "model": "gpt-4o-mini-2024-07-18",
+    "name": "GPT 4o Mini (2024-07-18)",
+    "link": "https://www.openai.com/",
+    "date": "2024-07-25",
+    "status": "full",
+    "main_language": "English",
+    "model_type": "proprietary",
+    "result_metrics": {
+      "enem_challenge": 0.7669699090272918,
+      "bluex": 0.6842837273991655,
+      "oab_exams": 0.6013667425968109,
+      "assin2_sts": 0.7259038954527597,
+      "assin2_rte": 0.942809846745341,
+      "faquad_nli": 0.819807735300693,
+      "hatebr_offensive": 0.8682357029532165,
+      "portuguese_hate_speech": 0.7501413502853012,
+      "tweetsentbr": 0.7509303825869922
+    },
+    "result_metrics_average": 0.7678276991497301,
+    "result_metrics_npm": 0.6595966999910003
+  },
+  {
+    "model": "nemotron-4-340b-instruct",
+    "name": "nvidia/Nemotron-4-340B-Instruct (Nvidia API)",
+    "link": "https://build.nvidia.com/nvidia/nemotron-4-340b-instruct",
+    "date": "2024-06-30",
+    "status": "full",
+    "main_language": "English",
+    "model_type": "chat",
+    "params": 340.0,
+    "result_metrics": {
+      "enem_challenge": 0.6648005598320503,
+      "bluex": 0.6578581363004172,
+      "oab_exams": 0.7020501138952164,
+      "assin2_sts": 0.7857731021403329,
+      "assin2_rte": 0.9489354458928496,
+      "faquad_nli": 0.8194444444444444,
+      "hatebr_offensive": 0.8641580001234928,
+      "portuguese_hate_speech": 0.7761835184102864,
+      "tweetsentbr": 0.780880021326841
+    },
+    "result_metrics_average": 0.7777870380406591,
+    "result_metrics_npm": 0.6740728488043128
   }
 ]
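For the three entries added above, the stored "result_metrics_average" matches the plain mean of the nine task scores (e.g. Gemini 1.5 Flash: mean ≈ 0.79320758). The snippet below is an illustrative sanity check only, not part of this commit; it assumes the renamed external_models_results.json is in the working directory.

    import json

    # Illustrative check (not part of the Space's code): recompute the mean of the
    # nine task scores and compare it with the stored "result_metrics_average".
    with open("external_models_results.json", "r", encoding="utf8") as f:
        models = json.load(f)

    for entry in models:
        scores = entry["result_metrics"]
        recomputed = sum(scores.values()) / len(scores)
        print(f"{entry['model']}: stored={entry['result_metrics_average']:.6f} "
              f"recomputed={recomputed:.6f}")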
src/display/utils.py CHANGED

@@ -166,24 +166,30 @@ human_baseline_row[AutoEvalColumn.npm.name] = round(sum(npm) / len(npm), 2)
 if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
     human_baseline_row["🤗 Leaderboard Average"] = None
 
-#Proprietary models
-proprietary_rows = []
-if os.path.exists('proprietary_models_results.json'):
-    with open('proprietary_models_results.json', 'r', encoding='utf8') as f:
+#External models
+external_rows = []
+if os.path.exists('external_models_results.json'):
+    with open('external_models_results.json', 'r', encoding='utf8') as f:
         all_models = json.load(f)
         for model_data in all_models:
             model_row = deepcopy(baseline_row)
             model_row[AutoEvalColumn.model.name] = f'<a target="_blank" href="{model_data["link"]}" style="color: var(--text-color); text-decoration: underline;text-decoration-style: dotted;">{model_data["name"]} [{model_data["date"]}]</a>'
             model_row[AutoEvalColumn.dummy.name] = model_data['model']
-            model_row[AutoEvalColumn.license.name] = "Proprietary"
             for task in Tasks:
                 model_row[task.value.col_name] = round(model_data['result_metrics'][task.value.benchmark]*100, 2)
             model_row[AutoEvalColumn.average.name] = round(model_data['result_metrics_average']*100, 2)
             model_row[AutoEvalColumn.npm.name] = round(model_data['result_metrics_npm']*100, 2)
-
-
+
+            model_type = ModelType.from_str(model_data['model_type'])
+            model_row[AutoEvalColumn.model_type.name] = model_type.name
+            model_row[AutoEvalColumn.model_type_symbol.name] = model_type.symbol
+            if model_type == ModelType.proprietary:
+                model_row[AutoEvalColumn.license.name] = "Proprietary"
+            if 'params' in model_data:
+                model_row[AutoEvalColumn.params.name] = model_data['params']
+
             model_row[AutoEvalColumn.main_language.name] = model_data['main_language']
-            proprietary_rows.append(model_row)
+            external_rows.append(model_row)
 
 @dataclass
 class ModelDetails:
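The new loop relies on ModelType.from_str, ModelType.name, and ModelType.symbol to turn the JSON's "model_type" string ('proprietary' or 'chat') into the type label and icon shown on the leaderboard. The enum below is a hypothetical stand-in, not the Space's actual ModelType; it only sketches the lookup the diff assumes.

    from enum import Enum

    class ModelTypeSketch(Enum):
        # Hypothetical stand-in for src/display/utils.ModelType; real symbols may differ.
        proprietary = "🔒"
        chat = "💬"
        unknown = "?"

        @property
        def symbol(self) -> str:
            return self.value

        @classmethod
        def from_str(cls, type_str: str) -> "ModelTypeSketch":
            # Unknown strings fall back to a neutral member instead of raising.
            return cls.__members__.get(type_str, cls.unknown)

    row_type = ModelTypeSketch.from_str("chat")
    print(row_type.name, row_type.symbol)   # -> chat 💬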
src/populate.py CHANGED

@@ -5,7 +5,7 @@ import copy
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_requests_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row, proprietary_rows
+from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row, external_rows
 from src.leaderboard.filter_models import filter_models_flags
 from src.leaderboard.read_evals import get_raw_eval_results
 
@@ -14,8 +14,8 @@ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str,
     raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
     all_data_json = [v.to_dict() for v in raw_data]
     all_data_json.append(baseline_row)
-    for proprietary_row in proprietary_rows:
-        all_data_json.append(proprietary_row)
+    for external_row in external_rows:
+        all_data_json.append(external_row)
     filter_models_flags(all_data_json)
 
     df = pd.DataFrame.from_records(all_data_json)
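Functionally, the added loop just merges the pre-built external rows into the same list of record dicts that feeds the DataFrame; all_data_json.extend(external_rows) would have the same effect. A toy illustration (the external entries mirror the rounded averages added above; the other rows are placeholders):

    import pandas as pd

    # Toy rows standing in for evaluated models, the baseline, and external models.
    evaluated_rows = [{"model": "some-open-model", "average": 71.20}]
    baseline_row = {"model": "baseline", "average": 50.00}
    external_rows = [{"model": "gemini-1.5-flash", "average": 79.32},
                     {"model": "gpt-4o-mini-2024-07-18", "average": 76.78}]

    all_data_json = list(evaluated_rows)
    all_data_json.append(baseline_row)
    all_data_json.extend(external_rows)   # equivalent to the appended for-loop
    df = pd.DataFrame.from_records(all_data_json)
    print(df.sort_values("average", ascending=False))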