Lisa Dunlap committed
Commit 13ecd9b • Parent: 0ba05dc

moved delta to new column, updated ranking

Files changed (1):
  1. app.py (+112 -25)
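The "updated ranking" in this commit replaces plain rating-order ranks with ranks derived from the 95% confidence intervals: a model is only ranked below another when that other model's lower bound (`rating_q025`) lies above its upper bound (`rating_q975`), so models with overlapping intervals tie. Below is a minimal sketch of that rule as introduced by `recompute_final_ranking` in the diff; the model names and interval bounds are made-up illustration data, not real Arena results.

```python
import pandas as pd

def recompute_final_ranking(arena_df):
    # A model's rank is 1 + the number of models whose lower CI bound
    # (rating_q025) lies strictly above its upper CI bound (rating_q975),
    # so models with overlapping intervals share a rank.
    ranking = {}
    for i, model_a in enumerate(arena_df.index):
        ranking[model_a] = 1
        for j, model_b in enumerate(arena_df.index):
            if i == j:
                continue
            if arena_df.loc[model_b]["rating_q025"] > arena_df.loc[model_a]["rating_q975"]:
                ranking[model_a] += 1
    return list(ranking.values())

# Made-up 95% CI bounds for three hypothetical models (not real Arena data).
toy = pd.DataFrame(
    {"rating_q025": [1245, 1230, 1150], "rating_q975": [1260, 1250, 1170]},
    index=["model-a", "model-b", "model-c"],
)
print(recompute_final_ranking(toy))  # [1, 1, 3]: a and b overlap and tie; c trails both
```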
app.py CHANGED

@@ -26,6 +26,8 @@ def make_default_md(arena_df, elo_results):
 
 LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals.
 We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system. Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)!
+
+Code to recreate leaderboard tables and plots in this [notebook]({notebook_url}) and more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
 """
     return leaderboard_md
 
@@ -213,32 +215,57 @@ def get_full_table(arena_df, model_table_df):
 
 def create_ranking_str(ranking, ranking_difference):
     if ranking_difference > 0:
-        return f"{int(ranking)} (\u2191{int(ranking_difference)})"
+        # return f"{int(ranking)} (\u2191{int(ranking_difference)})"
+        return f"{int(ranking)} \u2191"
     elif ranking_difference < 0:
-        return f"{int(ranking)} (\u2193{int(-ranking_difference)})"
+        # return f"{int(ranking)} (\u2193{int(-ranking_difference)})"
+        return f"{int(ranking)} \u2193"
     else:
         return f"{int(ranking)}"
 
+def recompute_final_ranking(arena_df):
+    # compute ranking based on CI
+    ranking = {}
+    for i, model_a in enumerate(arena_df.index):
+        ranking[model_a] = 1
+        for j, model_b in enumerate(arena_df.index):
+            if i == j:
+                continue
+            if arena_df.loc[model_b]["rating_q025"] > arena_df.loc[model_a]["rating_q975"]:
+                ranking[model_a] += 1
+    return list(ranking.values())
+
 def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
-    # arena_df = arena_df.sort_values(by=["rating"], ascending=False)
-    arena_df = arena_df.sort_values(by=["final_ranking"], ascending=True)
+    arena_df = arena_df.sort_values(by=["rating"], ascending=False)
     arena_df = arena_df[arena_df["num_battles"] > 2000]
+    arena_df["final_ranking"] = recompute_final_ranking(arena_df)
+    arena_df = arena_df.sort_values(by=["final_ranking"], ascending=True)
 
     # arena_df["final_ranking"] = range(1, len(arena_df) + 1)
     # sort by rating
     if arena_subset_df is not None:
         # filter out models not in the arena_df
         arena_subset_df = arena_subset_df[arena_subset_df.index.isin(arena_df.index)]
-        # arena_subset_df = arena_subset_df.sort_values(by=["rating"], ascending=False)
-        arena_subset_df = arena_subset_df.sort_values(by=["final_ranking"], ascending=True)
+        arena_subset_df = arena_subset_df.sort_values(by=["rating"], ascending=False)
+        # arena_subset_df = arena_subset_df.sort_values(by=["final_ranking"], ascending=True)
+        # arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 500]
+        arena_subset_df["final_ranking"] = recompute_final_ranking(arena_subset_df)
+        # keep only the models in the subset in arena_df and recompute final_ranking
+        arena_df = arena_df[arena_df.index.isin(arena_subset_df.index)]
+        # recompute final ranking
+        arena_df["final_ranking"] = recompute_final_ranking(arena_df)
 
         # assign ranking by the order
-        # arena_subset_df["final_ranking"] = range(1, len(arena_subset_df) + 1)
+        arena_subset_df["final_ranking_no_tie"] = range(1, len(arena_subset_df) + 1)
+        arena_df["final_ranking_no_tie"] = range(1, len(arena_df) + 1)
         # join arena_df and arena_subset_df on index
-        arena_df = arena_subset_df.join(arena_df["final_ranking"], rsuffix="_global", how="inner")
-        arena_df['ranking_difference'] = arena_df['final_ranking_global'] - arena_df['final_ranking']
+        arena_df = arena_subset_df.join(arena_df["final_ranking_no_tie"], rsuffix="_global", how="inner")
+        # arena_df = arena_subset_df.join(arena_df["rating"], rsuffix="_global", how="inner")
+        arena_df["ranking_difference"] = arena_df["final_ranking_no_tie_global"] - arena_df["final_ranking_no_tie"]
+        # arena_df['ranking_difference'] = arena_df['rating_global'] - arena_df['rating']
+        arena_df = arena_df.sort_values(by=["final_ranking"], ascending=True)
         arena_df["final_ranking"] = arena_df.apply(lambda x: create_ranking_str(x["final_ranking"], x["ranking_difference"]), axis=1)
-
+
     values = []
     for i in range(len(arena_df)):
         row = []
@@ -247,10 +274,11 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
         model_name = model_table_df[model_table_df["key"] == model_key]["Model"].values[
             0
         ]
-
         # rank
         ranking = arena_df.iloc[i].get("final_ranking") or i+1
         row.append(ranking)
+        if arena_subset_df is not None:
+            row.append(arena_df.iloc[i].get("ranking_difference") or 0)
         # model display name
         row.append(model_name)
         # elo rating
@@ -272,7 +300,6 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
         row.append(
             model_table_df[model_table_df["key"] == model_key]["License"].values[0]
         )
-
        cutoff_date = model_table_df[model_table_df["key"] == model_key]["Knowledge cutoff date"].values[0]
         if cutoff_date == "-":
             row.append("Unknown")
@@ -421,13 +448,85 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
     else:
         pass
 
+    def update_leaderboard_df(arena_table_vals):
+        elo_datarame = pd.DataFrame(arena_table_vals, columns=["Rank", "Δ", "🤖 Model", "⭐ Arena Elo", "📊 95% CI", "🗳️ Votes", "Organization", "License", "Knowledge Cutoff"])
+
+        # goal: color the rows based on the rank with styler
+        def highlight_max(s):
+            # all items in S which contain up arrow should be green, down arrow should be red, otherwise black
+            return ["color: green; font-weight: bold" if "\u2191" in v else "color: red; font-weight: bold" if "\u2193" in v else "" for v in s]
+
+        def highlight_rank_max(s):
+            return ["color: green; font-weight: bold" if v > 0 else "color: red; font-weight: bold" if v < 0 else "" for v in s]
+
+        return elo_datarame.style.apply(highlight_max, subset=["Rank"]).apply(highlight_rank_max, subset=["Δ"])
+
     def update_leaderboard_and_plots(category):
         arena_subset_df = arena_dfs[category]
+        arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 500]
         elo_subset_results = category_elo_results[category]
         arena_df = arena_dfs["Total"]
-        arena_values = get_arena_table(arena_df, model_table_df, arena_subset_df)
+        arena_values = get_arena_table(arena_df, model_table_df, arena_subset_df=arena_subset_df if category != "Total" else None)
         if category != "Total":
             arena_values = update_leaderboard_df(arena_values)
+            arena_values = gr.Dataframe(
+                headers=[
+                    "Rank",
+                    "Δ",
+                    "🤖 Model",
+                    "⭐ Arena Elo",
+                    "📊 95% CI",
+                    "🗳️ Votes",
+                    "Organization",
+                    "License",
+                    "Knowledge Cutoff",
+                ],
+                datatype=[
+                    "str",
+                    "number",
+                    "markdown",
+                    "number",
+                    "str",
+                    "number",
+                    "str",
+                    "str",
+                    "str",
+                ],
+                value=arena_values,
+                elem_id="arena_leaderboard_dataframe",
+                height=700,
+                column_widths=[50, 50, 190, 110, 100, 90, 160, 150, 140],
+                wrap=True,
+            )
+        else:
+            arena_values = gr.Dataframe(
+                headers=[
+                    "Rank",
+                    "🤖 Model",
+                    "⭐ Arena Elo",
+                    "📊 95% CI",
+                    "🗳️ Votes",
+                    "Organization",
+                    "License",
+                    "Knowledge Cutoff",
+                ],
+                datatype=[
+                    "str",
+                    "markdown",
+                    "number",
+                    "str",
+                    "number",
+                    "str",
+                    "str",
+                    "str",
+                ],
+                value=arena_values,
+                elem_id="arena_leaderboard_dataframe",
+                height=700,
+                column_widths=[70, 190, 110, 100, 90, 160, 150, 140],
+                wrap=True,
+            )
+
         p1 = elo_subset_results["win_fraction_heatmap"]
         p2 = elo_subset_results["battle_count_heatmap"]
         p3 = elo_subset_results["bootstrap_elo_rating"]
@@ -436,18 +535,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
         """
         leaderboard_md = make_category_arena_leaderboard_md(arena_df, arena_subset_df, name=category)
         return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md
-
-    def update_leaderboard_df(arena_table_vals):
-        elo_datarame = pd.DataFrame(arena_table_vals, columns=["Rank", "Model", "Arena Elo", "95% CI", "Votes", "Organization", "License", "Knowledge Cutoff"])
-
-        # goal: color the rows based on the rank with styler
-        def highlight_max(s):
-            # all items in S which contain up arrow should be green, down arrow should be red, otherwise black
-            return ["color: green" if "\u2191" in v else "color: red" if "\u2193" in v else "" for v in s]
-
-        styled_df = elo_datarame.style.apply(highlight_max, subset=["Rank"])
-
-        return styled_df
 
     category_dropdown.change(update_leaderboard_and_plots, inputs=[category_dropdown], outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, category_deets])
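For the "moved delta to new column" part: `get_arena_table` now appends the numeric `ranking_difference` (full-arena rank minus category rank, both without ties) as its own Δ column right after the rank string, and `update_leaderboard_df` colors it with a pandas Styler before it is passed to `gr.Dataframe` as `value=`. The sketch below mirrors that styling step (the `highlight_rank_max` idea from the diff above); the rows are fabricated examples in the same column order, not real Arena data.

```python
import pandas as pd

# Fabricated example rows in the order get_arena_table emits for a category view:
# rank string, numeric rank delta, then the usual leaderboard columns.
rows = [
    ["1 \u2191", 2, "model-a", 1250, "+5/-4", 30000, "Org A", "MIT", "2023/9"],
    ["2", 0, "model-b", 1235, "+6/-5", 28000, "Org B", "Apache-2.0", "2023/4"],
    ["3 \u2193", -1, "model-c", 1190, "+7/-6", 15000, "Org C", "Proprietary", "Unknown"],
]
df = pd.DataFrame(
    rows,
    columns=["Rank", "Δ", "🤖 Model", "⭐ Arena Elo", "📊 95% CI", "🗳️ Votes", "Organization", "License", "Knowledge Cutoff"],
)

def highlight_delta(s):
    # promoted models (positive Δ) in green, demoted ones (negative Δ) in red
    return [
        "color: green; font-weight: bold" if v > 0
        else "color: red; font-weight: bold" if v < 0
        else ""
        for v in s
    ]

styled = df.style.apply(highlight_delta, subset=["Δ"])
# In the diff above, a Styler like this is what update_leaderboard_df returns
# and what ends up as gr.Dataframe(value=...); here we just render it to HTML.
print(styled.to_html()[:200])
```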