Lisa Dunlap committed
Commit: 13ecd9b • Parent(s): 0ba05dc
moved delta to new column, updated ranking

app.py CHANGED
```diff
@@ -26,6 +26,8 @@ def make_default_md(arena_df, elo_results):
 
 LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals.
 We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system. Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)!
+
+Code to recreate leaderboard tables and plots in this [notebook]({notebook_url}) and more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
 """
     return leaderboard_md
 
```
```diff
@@ -213,32 +215,57 @@ def get_full_table(arena_df, model_table_df):
 
 def create_ranking_str(ranking, ranking_difference):
     if ranking_difference > 0:
-        return f"{int(ranking)} (\u2191{int(ranking_difference)})"
+        # return f"{int(ranking)} (\u2191{int(ranking_difference)})"
+        return f"{int(ranking)} \u2191"
     elif ranking_difference < 0:
-        return f"{int(ranking)} (\u2193{int(-ranking_difference)})"
+        # return f"{int(ranking)} (\u2193{int(-ranking_difference)})"
+        return f"{int(ranking)} \u2193"
     else:
         return f"{int(ranking)}"
 
+def recompute_final_ranking(arena_df):
+    # compute ranking based on CI
+    ranking = {}
+    for i, model_a in enumerate(arena_df.index):
+        ranking[model_a] = 1
+        for j, model_b in enumerate(arena_df.index):
+            if i == j:
+                continue
+            if arena_df.loc[model_b]["rating_q025"] > arena_df.loc[model_a]["rating_q975"]:
+                ranking[model_a] += 1
+    return list(ranking.values())
+
 def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
-
-    arena_df = arena_df.sort_values(by=["final_ranking"], ascending=True)
+    arena_df = arena_df.sort_values(by=["rating"], ascending=False)
     arena_df = arena_df[arena_df["num_battles"] > 2000]
+    arena_df["final_ranking"] = recompute_final_ranking(arena_df)
+    arena_df = arena_df.sort_values(by=["final_ranking"], ascending=True)
 
     # arena_df["final_ranking"] = range(1, len(arena_df) + 1)
     # sort by rating
     if arena_subset_df is not None:
         # filter out models not in the arena_df
         arena_subset_df = arena_subset_df[arena_subset_df.index.isin(arena_df.index)]
-
-        arena_subset_df = arena_subset_df.sort_values(by=["final_ranking"], ascending=True)
+        arena_subset_df = arena_subset_df.sort_values(by=["rating"], ascending=False)
+        # arena_subset_df = arena_subset_df.sort_values(by=["final_ranking"], ascending=True)
+        # arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 500]
+        arena_subset_df["final_ranking"] = recompute_final_ranking(arena_subset_df)
+        # keep only the models in the subset in arena_df and recompute final_ranking
+        arena_df = arena_df[arena_df.index.isin(arena_subset_df.index)]
+        # recompute final ranking
+        arena_df["final_ranking"] = recompute_final_ranking(arena_df)
 
         # assign ranking by the order
-
+        arena_subset_df["final_ranking_no_tie"] = range(1, len(arena_subset_df) + 1)
+        arena_df["final_ranking_no_tie"] = range(1, len(arena_df) + 1)
         # join arena_df and arena_subset_df on index
-        arena_df = arena_subset_df.join(arena_df["rating"], rsuffix="_global", how="inner")
-        arena_df['ranking_difference'] = arena_df['rating_global'] - arena_df['rating']
+        arena_df = arena_subset_df.join(arena_df["final_ranking_no_tie"], rsuffix="_global", how="inner")
+        # arena_df = arena_subset_df.join(arena_df["rating"], rsuffix="_global", how="inner")
+        arena_df["ranking_difference"] = arena_df["final_ranking_no_tie_global"] - arena_df["final_ranking_no_tie"]
+        # arena_df['ranking_difference'] = arena_df['rating_global'] - arena_df['rating']
+        arena_df = arena_df.sort_values(by=["final_ranking"], ascending=True)
         arena_df["final_ranking"] = arena_df.apply(lambda x: create_ranking_str(x["final_ranking"], x["ranking_difference"]), axis=1)
-
+
     values = []
     for i in range(len(arena_df)):
         row = []
```
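The key behavioral change in this hunk: ranks are no longer a plain 1..N ordering but are derived from the bootstrap confidence intervals, so statistically indistinguishable models share a rank. A minimal sketch of the rule with made-up CI bounds (`rating_q025`/`rating_q975` are the column names the diff expects in the elo results):

```python
import pandas as pd

# made-up bootstrap CI bounds; the real ones come from the elo results file
toy = pd.DataFrame(
    {"rating_q025": [1240, 1235, 1180], "rating_q975": [1260, 1258, 1200]},
    index=["model-a", "model-b", "model-c"],
)

def recompute_final_ranking(arena_df):
    # rank = 1 + number of models whose lower CI bound clears this model's upper bound
    ranking = {}
    for i, model_a in enumerate(arena_df.index):
        ranking[model_a] = 1
        for j, model_b in enumerate(arena_df.index):
            if i == j:
                continue
            if arena_df.loc[model_b]["rating_q025"] > arena_df.loc[model_a]["rating_q975"]:
                ranking[model_a] += 1
    return list(ranking.values())

print(recompute_final_ranking(toy))  # [1, 1, 3]: a and b overlap and tie; c trails both
```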
```diff
@@ -247,10 +274,11 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
         model_name = model_table_df[model_table_df["key"] == model_key]["Model"].values[
             0
         ]
-
         # rank
         ranking = arena_df.iloc[i].get("final_ranking") or i+1
         row.append(ranking)
+        if arena_subset_df is not None:
+            row.append(arena_df.iloc[i].get("ranking_difference") or 0)
         # model display name
         row.append(model_name)
         # elo rating
```
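This is the "moved delta to new column" part of the commit: the signed rank movement is appended as its own cell instead of being baked into the rank string. The delta itself comes from tie-free positions: both the full table and the category subset get a `final_ranking_no_tie` column (1..N by rating), the two are joined on the model index, and global minus category becomes `ranking_difference`, which `create_ranking_str` renders as a bare ↑ or ↓. A small sketch of that join with invented ranks:

```python
import pandas as pd

# invented no-tie positions: overall table vs. one category
overall = pd.DataFrame({"final_ranking_no_tie": [1, 2, 3]}, index=["a", "b", "c"])
subset = pd.DataFrame({"final_ranking_no_tie": [2, 1, 3]}, index=["a", "b", "c"])

# same join shape as the diff: the overall rank arrives with an "_global" suffix
merged = subset.join(overall["final_ranking_no_tie"], rsuffix="_global", how="inner")
merged["ranking_difference"] = (
    merged["final_ranking_no_tie_global"] - merged["final_ranking_no_tie"]
)
print(merged["ranking_difference"].to_dict())
# {'a': -1, 'b': 1, 'c': 0}: b rose in this category (up arrow), a fell (down arrow)
```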
```diff
@@ -272,7 +300,6 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
         row.append(
             model_table_df[model_table_df["key"] == model_key]["License"].values[0]
         )
-
         cutoff_date = model_table_df[model_table_df["key"] == model_key]["Knowledge cutoff date"].values[0]
         if cutoff_date == "-":
             row.append("Unknown")
```
```diff
@@ -421,13 +448,85 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
         else:
             pass
 
+    def update_leaderboard_df(arena_table_vals):
+        elo_datarame = pd.DataFrame(arena_table_vals, columns=["Rank", "Δ", "🤖 Model", "⭐ Arena Elo", "📊 95% CI", "🗳️ Votes", "Organization", "License", "Knowledge Cutoff"])
+
+        # goal: color the rows based on the rank with styler
+        def highlight_max(s):
+            # all items in S which contain up arrow should be green, down arrow should be red, otherwise black
+            return ["color: green; font-weight: bold" if "\u2191" in v else "color: red; font-weight: bold" if "\u2193" in v else "" for v in s]
+
+        def highlight_rank_max(s):
+            return ["color: green; font-weight: bold" if v > 0 else "color: red; font-weight: bold" if v < 0 else "" for v in s]
+
+        return elo_datarame.style.apply(highlight_max, subset=["Rank"]).apply(highlight_rank_max, subset=["Δ"])
+
     def update_leaderboard_and_plots(category):
         arena_subset_df = arena_dfs[category]
+        arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 500]
         elo_subset_results = category_elo_results[category]
         arena_df = arena_dfs["Total"]
-        arena_values = get_arena_table(arena_df, model_table_df, arena_subset_df)
+        arena_values = get_arena_table(arena_df, model_table_df, arena_subset_df = arena_subset_df if category != "Total" else None)
         if category != "Total":
             arena_values = update_leaderboard_df(arena_values)
+            arena_values = gr.Dataframe(
+                headers=[
+                    "Rank",
+                    "Δ",
+                    "🤖 Model",
+                    "⭐ Arena Elo",
+                    "📊 95% CI",
+                    "🗳️ Votes",
+                    "Organization",
+                    "License",
+                    "Knowledge Cutoff",
+                ],
+                datatype=[
+                    "str",
+                    "number",
+                    "markdown",
+                    "number",
+                    "str",
+                    "number",
+                    "str",
+                    "str",
+                    "str",
+                ],
+                value=arena_values,
+                elem_id="arena_leaderboard_dataframe",
+                height=700,
+                column_widths=[50, 50, 190, 110, 100, 90, 160, 150, 140],
+                wrap=True,
+            )
+        else:
+            arena_values = gr.Dataframe(
+                headers=[
+                    "Rank",
+                    "🤖 Model",
+                    "⭐ Arena Elo",
+                    "📊 95% CI",
+                    "🗳️ Votes",
+                    "Organization",
+                    "License",
+                    "Knowledge Cutoff",
+                ],
+                datatype=[
+                    "str",
+                    "markdown",
+                    "number",
+                    "str",
+                    "number",
+                    "str",
+                    "str",
+                    "str",
+                ],
+                value=arena_values,
+                elem_id="arena_leaderboard_dataframe",
+                height=700,
+                column_widths=[70, 190, 110, 100, 90, 160, 150, 140],
+                wrap=True,
+            )
+
         p1 = elo_subset_results["win_fraction_heatmap"]
         p2 = elo_subset_results["battle_count_heatmap"]
         p3 = elo_subset_results["bootstrap_elo_rating"]
```
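The rewritten `update_leaderboard_df` now bolds as well as colors, and adds a second styling function for the new numeric Δ column, keyed on sign rather than on arrow characters. A self-contained sketch of the same pandas Styler pattern (toy rows; `to_html` is only there to show it runs headlessly):

```python
import pandas as pd

df = pd.DataFrame({"Rank": ["1", "2 \u2191", "3 \u2193"], "\u0394": [0, 1, -1]})

def highlight_rank(s):
    # string column: the arrow character decides the color
    return ["color: green; font-weight: bold" if "\u2191" in v
            else "color: red; font-weight: bold" if "\u2193" in v
            else "" for v in s]

def highlight_delta(s):
    # numeric column: the sign decides the color
    return ["color: green; font-weight: bold" if v > 0
            else "color: red; font-weight: bold" if v < 0
            else "" for v in s]

styled = df.style.apply(highlight_rank, subset=["Rank"]).apply(highlight_delta, subset=["\u0394"])
print(styled.to_html()[:60])  # per-cell CSS that gr.Dataframe can render
```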
```diff
@@ -436,18 +535,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
         """
         leaderboard_md = make_category_arena_leaderboard_md(arena_df, arena_subset_df, name=category)
         return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md
-
-    def update_leaderboard_df(arena_table_vals):
-        elo_datarame = pd.DataFrame(arena_table_vals, columns=["Rank", "Model", "Arena Elo", "95% CI", "Votes", "Organization", "License", "Knowledge Cutoff"])
-
-        # goal: color the rows based on the rank with styler
-        def highlight_max(s):
-            # all items in S which contain up arrow should be green, down arrow should be red, otherwise black
-            return ["color: green" if "\u2191" in v else "color: red" if "\u2193" in v else "" for v in s]
-
-        styled_df = elo_datarame.style.apply(highlight_max, subset=["Rank"])
-
-        return styled_df
 
     category_dropdown.change(update_leaderboard_and_plots, inputs=[category_dropdown], outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, category_deets])
 
```
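Since the column set changes between the "Total" and category views, the handler returns a freshly constructed `gr.Dataframe` so headers, datatypes, and widths update together when the dropdown fires. A minimal wiring sketch of that pattern (hypothetical component names; assumes Gradio 4.x semantics, where returning a component instance from an event handler updates the output in place):

```python
import gradio as gr
import pandas as pd

def show_category(category):
    # toy values; the real app passes the Styler built by update_leaderboard_df
    data = pd.DataFrame({"Rank": ["1", "2 \u2191"], "\u0394": [0, 1]})
    # returning a new gr.Dataframe swaps the table's value and layout together
    return gr.Dataframe(value=data, wrap=True)

with gr.Blocks() as demo:
    category_dropdown = gr.Dropdown(["Total", "Coding"], value="Total")
    table = gr.Dataframe()
    category_dropdown.change(show_category, inputs=[category_dropdown], outputs=[table])

# demo.launch()  # uncomment to serve locally
```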