Update space
Browse files- app.py +14 -5
- src/populate.py +23 -9
app.py
CHANGED
@@ -151,6 +151,15 @@ with demo:
|
|
151 |
'</p>'
|
152 |
)
|
153 |
gr.HTML(INTRODUCTION_TEXT)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
|
155 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
156 |
|
@@ -256,7 +265,7 @@ with demo:
|
|
256 |
AutoEvalColumn.rank_math_geometry.name,
|
257 |
AutoEvalColumn.rank_math_probability.name,
|
258 |
],
|
259 |
-
rank_col=['sort_by_rank', 1, 4],
|
260 |
)
|
261 |
)
|
262 |
|
@@ -277,7 +286,7 @@ with demo:
|
|
277 |
# AutoEvalColumn.rank_math_geometry.name,
|
278 |
# AutoEvalColumn.rank_math_probability.name,
|
279 |
],
|
280 |
-
rank_col=['sort_by_score', 1, 4],
|
281 |
)
|
282 |
)
|
283 |
|
@@ -389,7 +398,7 @@ with demo:
|
|
389 |
AutoEvalColumn.rank_reason_logical.name,
|
390 |
AutoEvalColumn.rank_reason_social.name,
|
391 |
],
|
392 |
-
rank_col=['sort_by_rank', 1, 3],
|
393 |
)
|
394 |
)
|
395 |
|
@@ -406,7 +415,7 @@ with demo:
|
|
406 |
AutoEvalColumn.score_reason_logical.name,
|
407 |
AutoEvalColumn.score_reason_social.name,
|
408 |
],
|
409 |
-
rank_col=['sort_by_score', 1, 3],
|
410 |
)
|
411 |
)
|
412 |
|
@@ -488,7 +497,7 @@ with demo:
|
|
488 |
|
489 |
AutoEvalColumn.rank_chemistry.name,
|
490 |
],
|
491 |
-
rank_col=['sort_by_rank', 4, 5],
|
492 |
)
|
493 |
)
|
494 |
|
|
|
151 |
'</p>'
|
152 |
)
|
153 |
gr.HTML(INTRODUCTION_TEXT)
|
154 |
+
|
155 |
+
'''
|
156 |
+
TEXT = (
|
157 |
+
'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
|
158 |
+
''
|
159 |
+
'</p>'
|
160 |
+
)
|
161 |
+
gr.HTML(TEXT)
|
162 |
+
'''
|
163 |
|
164 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
165 |
|
|
|
265 |
AutoEvalColumn.rank_math_geometry.name,
|
266 |
AutoEvalColumn.rank_math_probability.name,
|
267 |
],
|
268 |
+
rank_col=['sort_by_rank', 1, 4, 'Math'],
|
269 |
)
|
270 |
)
|
271 |
|
|
|
286 |
# AutoEvalColumn.rank_math_geometry.name,
|
287 |
# AutoEvalColumn.rank_math_probability.name,
|
288 |
],
|
289 |
+
rank_col=['sort_by_score', 1, 4, 'Math'],
|
290 |
)
|
291 |
)
|
292 |
|
|
|
398 |
AutoEvalColumn.rank_reason_logical.name,
|
399 |
AutoEvalColumn.rank_reason_social.name,
|
400 |
],
|
401 |
+
rank_col=['sort_by_rank', 1, 3, 'Reasoning'],
|
402 |
)
|
403 |
)
|
404 |
|
|
|
415 |
AutoEvalColumn.score_reason_logical.name,
|
416 |
AutoEvalColumn.score_reason_social.name,
|
417 |
],
|
418 |
+
rank_col=['sort_by_score', 1, 3, 'Reasoning'],
|
419 |
)
|
420 |
)
|
421 |
|
|
|
497 |
|
498 |
AutoEvalColumn.rank_chemistry.name,
|
499 |
],
|
500 |
+
rank_col=['sort_by_rank', 4, 5, 'Science'],
|
501 |
)
|
502 |
)
|
503 |
|
src/populate.py
CHANGED
@@ -42,11 +42,17 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
|
|
42 |
start_idx = rank_col[1]
|
43 |
end_idx = rank_col[2]
|
44 |
avg_scores = df.iloc[:, start_idx:end_idx].mean(axis=1)
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
-
df[
|
48 |
-
df = df.sort_values(by=[
|
49 |
-
df[
|
50 |
|
51 |
# df = df.drop(columns=benchmark_cols[offset_idx:])
|
52 |
# print(benchmark_cols)
|
@@ -68,11 +74,18 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
|
|
68 |
start_idx = rank_col[1]
|
69 |
end_idx = rank_col[2]
|
70 |
avg_rank = df.iloc[:, start_idx:end_idx].mean(axis=1)
|
71 |
-
df.insert(1, "Average Rank", avg_rank)
|
72 |
|
73 |
-
|
74 |
-
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
# we'll skip NaN, instead of deleting the whole row
|
78 |
df = df.fillna('--')
|
@@ -80,7 +93,8 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
|
|
80 |
rank = np.arange(1, len(df)+1)
|
81 |
df.insert(0, 'Rank', rank)
|
82 |
|
83 |
-
|
|
|
84 |
|
85 |
|
86 |
|
|
|
42 |
start_idx = rank_col[1]
|
43 |
end_idx = rank_col[2]
|
44 |
avg_scores = df.iloc[:, start_idx:end_idx].mean(axis=1)
|
45 |
+
if len(rank_col) == 4:
|
46 |
+
avg_col_name = f"Overall ({rank_col[3]})"
|
47 |
+
else:
|
48 |
+
# avg_col_name = "Average Score"
|
49 |
+
avg_col_name = 'Overall'
|
50 |
+
|
51 |
+
df.insert(1, avg_col_name, avg_scores)
|
52 |
|
53 |
+
df[avg_col_name] = avg_scores.round(decimals=4)
|
54 |
+
df = df.sort_values(by=[avg_col_name], ascending=False)
|
55 |
+
df[avg_col_name] = df[avg_col_name].map('{:.2f}'.format)
|
56 |
|
57 |
# df = df.drop(columns=benchmark_cols[offset_idx:])
|
58 |
# print(benchmark_cols)
|
|
|
74 |
start_idx = rank_col[1]
|
75 |
end_idx = rank_col[2]
|
76 |
avg_rank = df.iloc[:, start_idx:end_idx].mean(axis=1)
|
|
|
77 |
|
78 |
+
if len(rank_col) == 4:
|
79 |
+
avg_col_name = f"Overall ({rank_col[3]})"
|
80 |
+
else:
|
81 |
+
# avg_col_name = "Average Rank"
|
82 |
+
avg_col_name = 'Overall'
|
83 |
+
|
84 |
+
df.insert(1, avg_col_name, avg_rank)
|
85 |
+
|
86 |
+
df[avg_col_name] = avg_rank.round(decimals=4)
|
87 |
+
df = df.sort_values(by=[avg_col_name], ascending=True)
|
88 |
+
df[avg_col_name] = df[avg_col_name].map('{:.2f}'.format)
|
89 |
|
90 |
# we'll skip NaN, instead of deleting the whole row
|
91 |
df = df.fillna('--')
|
|
|
93 |
rank = np.arange(1, len(df)+1)
|
94 |
df.insert(0, 'Rank', rank)
|
95 |
|
96 |
+
# print(benchmark_cols)
|
97 |
+
# df.style.background_gradient(cmap='coolwarm', subset=benchmark_cols)
|
98 |
|
99 |
|
100 |
|