Lisa Dunlap committed on
Commit
df2a130
•
1 Parent(s): a2fadac

moved buttons back to tab

Browse files
Files changed (1) hide show
  1. app.py +63 -63
app.py CHANGED
@@ -31,42 +31,44 @@ We've collected over **500,000** human preference votes to rank LLMs with the El
31
  return leaderboard_md
32
 
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  # def make_arena_leaderboard_md(arena_df, arena_chinese_df, arena_long_df, arena_english_df):
 
35
  # total_votes = sum(arena_df["num_battles"]) // 2
36
- # total_models = len(arena_df)
37
- # total_code_votes = sum(arena_chinese_df["num_battles"]) // 2
38
- # total_code_models = len(arena_chinese_df)
39
  # total_long_votes = sum(arena_long_df["num_battles"]) // 2
40
- # total_long_models = len(arena_long_df)
41
  # total_english_votes = sum(arena_english_df["num_battles"]) // 2
42
- # total_english_models = len(arena_english_df)
43
 
 
44
  # leaderboard_md = f"""
45
- # Total #models: **{total_models}**. Total #votes: **{total_votes}**. Total code #votes: **{total_code_votes}**. Last updated: March 29, 2024.
 
 
 
 
46
 
47
- # Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! Find more analysis in the [notebook]({notebook_url}).
48
  # """
49
- # return leaderboard_md
50
-
51
- def make_arena_leaderboard_md(arena_df, arena_chinese_df, arena_long_df, arena_english_df):
52
- # Calculate totals for each arena
53
- total_votes = sum(arena_df["num_battles"]) // 2
54
- total_chinese_votes = sum(arena_chinese_df["num_battles"]) // 2
55
- total_long_votes = sum(arena_long_df["num_battles"]) // 2
56
- total_english_votes = sum(arena_english_df["num_battles"]) // 2
57
-
58
- # Constructing the markdown table
59
- leaderboard_md = f"""
60
- Last updated: March 29, 2024.
61
- | | **Total** | English | Chinese | Long Context |
62
- | :-------------- | :-----------------------: | :-----------------------: | :-----------------------: | :-----------------------: |
63
- | # Votes | **{"{:,}".format(total_votes)}** | {"{:,}".format(total_english_votes)} | {"{:,}".format(total_chinese_votes)} | {"{:,}".format(total_long_votes)} |
64
- | # Models | **{len(arena_df)}** | {len(arena_english_df)}| {len(arena_chinese_df)} | {len(arena_long_df)} |
65
 
66
- Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! Find more analysis in the [notebook]({notebook_url}).
67
- """
68
-
69
- return leaderboard_md
70
 
71
 
72
 
@@ -231,9 +233,9 @@ def get_full_table(arena_df, model_table_df):
231
 
232
  def create_ranking_str(ranking, ranking_difference):
233
  if ranking_difference > 0:
234
- return f"{int(ranking)} (\u2191 {int(ranking_difference)})"
235
  elif ranking_difference < 0:
236
- return f"{int(ranking)} (\u2193 {int(-ranking_difference)})"
237
  else:
238
  return f"{int(ranking)}"
239
 
@@ -293,12 +295,17 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
293
  print(f"{model_key} - {e}")
294
  return values
295
 
296
- def get_plots(elo_subset_results):
 
297
  p1 = elo_subset_results["win_fraction_heatmap"]
298
  p2 = elo_subset_results["battle_count_heatmap"]
299
  p3 = elo_subset_results["bootstrap_elo_rating"]
300
  p4 = elo_subset_results["average_win_rate_bar"]
301
- return p1, p2, p3, p4
 
 
 
 
302
 
303
 
304
  def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
@@ -325,6 +332,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
325
  default_md = make_default_md(arena_df, elo_results)
326
 
327
  md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
 
328
  if leaderboard_table_file:
329
  data = load_leaderboard_table_csv(leaderboard_table_file)
330
  model_table_df = pd.DataFrame(data)
@@ -333,17 +341,21 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
333
  # arena table
334
  arena_table_vals = get_arena_table(arena_df, model_table_df)
335
  with gr.Tab("Arena Elo", id=0):
336
- md = make_arena_leaderboard_md(arena_df, arena_chinese_df, arena_long_df, arena_english_df)
337
- gr.Markdown(md, elem_id="leaderboard_markdown")
338
  with gr.Row():
339
  overall_rating = gr.Button("Overall")
340
- update_overall_rating_df = lambda _: get_arena_table(arena_df, model_table_df)
 
341
  english_rating = gr.Button("English")
342
- update_english_rating_df = lambda _: get_arena_table(arena_df, model_table_df, arena_english_df)
 
343
  chinese_rating = gr.Button("Chinese")
344
- update_chinese_rating_df = lambda _: get_arena_table(arena_df, model_table_df, arena_chinese_df)
 
345
  long_context_rating = gr.Button("Long Context")
346
- update_long_context_rating_df = lambda _: get_arena_table(arena_df, model_table_df, arena_long_df)
 
347
  elo_display_df = gr.Dataframe(
348
  headers=[
349
  "Rank",
@@ -368,14 +380,14 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
368
  value=arena_table_vals,
369
  elem_id="arena_leaderboard_dataframe",
370
  height=700,
371
- column_widths=[70, 190, 120, 100, 90, 140, 150, 140],
372
  wrap=True,
373
  )
374
  # Setup the button click action
375
- overall_rating.click(fn=update_overall_rating_df, inputs=overall_rating, outputs=elo_display_df)
376
- english_rating.click(fn=update_english_rating_df, inputs=english_rating, outputs=elo_display_df)
377
- chinese_rating.click(fn=update_chinese_rating_df, inputs=chinese_rating ,outputs=elo_display_df)
378
- long_context_rating.click(fn=update_long_context_rating_df, inputs=long_context_rating, outputs=elo_display_df)
379
 
380
  with gr.Tab("Full Leaderboard", id=1):
381
  md = make_full_leaderboard_md(elo_results)
@@ -418,22 +430,12 @@ See Figure 3 below for visualization of the confidence intervals.
418
  leader_component_values[:] = [default_md, p1, p2, p3, p4]
419
 
420
  if show_plot:
421
- gr.Markdown(
422
  f"""## More Statistics for Chatbot Arena\n
423
- Below are figures for more statistics. The code for generating them is also included in this [notebook]({notebook_url}).
424
- You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
425
  """,
426
  elem_id="leaderboard_markdown"
427
  )
428
- with gr.Row():
429
- overall_plots = gr.Button("Overall")
430
- update_overall_plots = lambda _: get_plots(elo_results)
431
- english_plots = gr.Button("English")
432
- update_english_plot = lambda _: get_plots(elo_english_results)
433
- chinese_plots = gr.Button("Chinese")
434
- update_chinese_plot = lambda _: get_plots(elo_chinese_results)
435
- long_context_plots = gr.Button("Long Context")
436
- update_long_context_plot = lambda _: get_plots(elo_long_results)
437
  with gr.Row():
438
  with gr.Column():
439
  gr.Markdown(
@@ -457,10 +459,10 @@ You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12
457
  )
458
  plot_4 = gr.Plot(p4, show_label=False)
459
 
460
- overall_plots.click(fn=update_overall_plots, inputs=overall_plots, outputs=[plot_1, plot_2, plot_3, plot_4])
461
- english_plots.click(fn=update_english_plot, inputs=english_plots, outputs=[plot_1, plot_2, plot_3, plot_4])
462
- chinese_plots.click(fn=update_chinese_plot, inputs=chinese_plots, outputs=[plot_1, plot_2, plot_3, plot_4])
463
- long_context_plots.click(fn=update_long_context_plot, inputs=long_context_plots, outputs=[plot_1, plot_2, plot_3, plot_4])
464
 
465
  gr.Markdown(acknowledgment_md)
466
 
@@ -494,7 +496,7 @@ block_css = """
494
 
495
  #arena_leaderboard_dataframe td {
496
  line-height: 0.15em;
497
- font-size: 20px;
498
  }
499
  #arena_leaderboard_dataframe th {
500
  font-size: 20px;
@@ -503,7 +505,7 @@ block_css = """
503
 
504
  #full_leaderboard_dataframe td {
505
  line-height: 0.15em;
506
- font-size: 20px;
507
  }
508
  #full_leaderboard_dataframe th {
509
  font-size: 20px;
@@ -538,12 +540,10 @@ We thank [Kaggle](https://www.kaggle.com/), [MBZUAI](https://mbzuai.ac.ae/), [a1
538
 
539
  def build_demo(elo_results_file, leaderboard_table_file):
540
  text_size = gr.themes.sizes.text_lg
541
- print("text_size", text_size)
542
 
543
  with gr.Blocks(
544
  title="Chatbot Arena Leaderboard",
545
- # theme=gr.themes.Soft(text_size=text_size),
546
- # theme='reilnuud/polite',
547
  theme = gr.themes.Base.load("theme.json"),
548
  css=block_css,
549
  ) as demo:
 
31
  return leaderboard_md
32
 
33
 
34
def make_arena_leaderboard_md(arena_df, arena_subset_df=None, name="Overall"):
    """Build the markdown header shown above the Arena Elo leaderboard table.

    Args:
        arena_df: Table of all arena results. It is indexed with
            ``"num_battles"`` and measured with ``len()``.
        arena_subset_df: Optional table for a single input category, read the
            same way as ``arena_df``. When ``None``, no per-category counts
            are added to the header.
        name: Category label used in the per-category counts and in the
            "currently viewing" sentence.

    Returns:
        A markdown string with the model/vote totals, the optional category
        totals, the contribution links and the category hint.
    """
    # The battle counts are summed over every model row and then halved.
    # Presumably each battle is counted once per participant -- TODO confirm.
    total_votes = sum(arena_df["num_battles"]) // 2
    total_models = len(arena_df)
    space = "&nbsp;&nbsp;&nbsp;"

    vote_str = ""
    if arena_subset_df is not None:
        total_subset_votes = sum(arena_subset_df["num_battles"]) // 2
        total_subset_models = len(arena_subset_df)
        vote_str = (
            f"{space} {name} #models: **{total_subset_models}**."
            f"{space} {name} #votes: **{total_subset_votes:,}**."
        )

    # The template lines stay at column 0 on purpose: leading indentation
    # would become part of the emitted markdown text.
    leaderboard_md = f"""
Total #models: **{total_models}**.{space} Total #votes: **{total_votes:,}**.{vote_str}{space} Last updated: March 29, 2024.

Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! You can find code to recreate these tables and plots in this [notebook]({notebook_url}).

**NEW!** Click the buttons below to view the ELO leaderboard and stats for different input categories. You are currently viewing **{name}** inputs.
"""
    return leaderboard_md
52
+
53
  # def make_arena_leaderboard_md(arena_df, arena_chinese_df, arena_long_df, arena_english_df):
54
+ # # Calculate totals for each arena
55
  # total_votes = sum(arena_df["num_battles"]) // 2
56
+ # total_chinese_votes = sum(arena_chinese_df["num_battles"]) // 2
 
 
57
  # total_long_votes = sum(arena_long_df["num_battles"]) // 2
 
58
  # total_english_votes = sum(arena_english_df["num_battles"]) // 2
 
59
 
60
+ # # Constructing the markdown table
61
  # leaderboard_md = f"""
62
+ # Last updated: March 29, 2024.
63
+ # | | **Total** | English | Chinese | Long Context |
64
+ # | :-------------- | :-----------------------: | :-----------------------: | :-----------------------: | :-----------------------: |
65
+ # | # Votes | **{"{:,}".format(total_votes)}** | {"{:,}".format(total_english_votes)} | {"{:,}".format(total_chinese_votes)} | {"{:,}".format(total_long_votes)} |
66
+ # | # Models | **{len(arena_df)}** | {len(arena_english_df)}| {len(arena_chinese_df)} | {len(arena_long_df)} |
67
 
68
+ # Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! You can find code to recreate these tables and plots in this [notebook]({notebook_url}).
69
  # """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
+ # return leaderboard_md
 
 
 
72
 
73
 
74
 
 
233
 
234
def create_ranking_str(ranking, ranking_difference):
    """Format a rank, annotated with its difference when that is non-zero.

    Args:
        ranking: Rank to display; truncated to an integer with ``int()``.
        ranking_difference: Signed rank difference. A positive value is
            rendered with an up arrow, a negative value with a down arrow
            (using its magnitude), and anything else leaves the rank bare.

    Returns:
        The rank as a string, e.g. ``"3"``, or the rank followed by an arrow
        and the integer difference in parentheses.
    """
    rank_text = f"{int(ranking)}"
    if ranking_difference > 0:
        return f"{rank_text} (\u2191{int(ranking_difference)})"
    if ranking_difference < 0:
        return f"{rank_text} (\u2193{int(-ranking_difference)})"
    # Neither comparison held (e.g. a difference of zero): show the rank only.
    return rank_text
241
 
 
295
  print(f"{model_key} - {e}")
296
  return values
297
 
298
def update_leaderboard_and_plots(button, arena_df, model_table_df, arena_subset_df, elo_subset_results):
    """Compute every value refreshed when a category button is clicked.

    Args:
        button: Value supplied by the click event; it is interpolated into
            the statistics heading and passed on as the category name.
            Presumably the clicked button's label -- confirm against the
            ``.click(...)`` wiring.
        arena_df: Table of all arena results, forwarded to
            ``get_arena_table`` and ``make_arena_leaderboard_md``.
        model_table_df: Model metadata table, forwarded to
            ``get_arena_table``.
        arena_subset_df: Category-specific results table, or ``None`` for the
            overall view.
        elo_subset_results: Mapping that must contain the four plot entries
            ``"win_fraction_heatmap"``, ``"battle_count_heatmap"``,
            ``"bootstrap_elo_rating"`` and ``"average_win_rate_bar"``.

    Returns:
        A 7-tuple ``(arena_values, p1, p2, p3, p4, more_stats_md,
        leaderboard_md)``. The order matches the ``outputs=[...]`` lists of
        the button click handlers in this file.

    Raises:
        KeyError: If ``elo_subset_results`` lacks one of the plot entries.
    """
    arena_values = get_arena_table(arena_df, model_table_df, arena_subset_df)

    plot_keys = (
        "win_fraction_heatmap",
        "battle_count_heatmap",
        "bootstrap_elo_rating",
        "average_win_rate_bar",
    )
    p1, p2, p3, p4 = (elo_subset_results[key] for key in plot_keys)

    # The template lines stay at column 0 so no indentation leaks into the
    # emitted markdown.
    more_stats_md = f"""## More Statistics for Chatbot Arena ({button})\n
You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
"""
    leaderboard_md = make_arena_leaderboard_md(arena_df, arena_subset_df, name=button)
    return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md
309
 
310
 
311
  def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
 
332
  default_md = make_default_md(arena_df, elo_results)
333
 
334
  md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
335
+ # md = make_arena_leaderboard_md(arena_df, arena_chinese_df, arena_long_df, arena_english_df)
336
  if leaderboard_table_file:
337
  data = load_leaderboard_table_csv(leaderboard_table_file)
338
  model_table_df = pd.DataFrame(data)
 
341
  # arena table
342
  arena_table_vals = get_arena_table(arena_df, model_table_df)
343
  with gr.Tab("Arena Elo", id=0):
344
+ md = make_arena_leaderboard_md(arena_df)
345
+ leaderboard_markdown = gr.Markdown(md, elem_id="leaderboard_markdown")
346
  with gr.Row():
347
  overall_rating = gr.Button("Overall")
348
+ # update_overall_rating_df = lambda _: get_arena_table(arena_df, model_table_df)
349
+ update_overall_rating_df = lambda x: update_leaderboard_and_plots(x, arena_df, model_table_df, None, elo_results)
350
  english_rating = gr.Button("English")
351
+ update_english_rating_df = lambda x: update_leaderboard_and_plots(x, arena_df, model_table_df, arena_english_df, elo_english_results)
352
+ # update_english_rating_df = lambda _: get_arena_table(arena_df, model_table_df, arena_english_df)
353
  chinese_rating = gr.Button("Chinese")
354
+ update_chinese_rating_df = lambda x: update_leaderboard_and_plots(x, arena_df, model_table_df, arena_chinese_df, elo_chinese_results)
355
+ # update_chinese_rating_df = lambda _: get_arena_table(arena_df, model_table_df, arena_chinese_df)
356
  long_context_rating = gr.Button("Long Context")
357
+ update_long_context_rating_df = lambda x: update_leaderboard_and_plots(x, arena_df, model_table_df, arena_long_df, elo_long_results)
358
+ # update_long_context_rating_df = lambda _: get_arena_table(arena_df, model_table_df, arena_long_df)
359
  elo_display_df = gr.Dataframe(
360
  headers=[
361
  "Rank",
 
380
  value=arena_table_vals,
381
  elem_id="arena_leaderboard_dataframe",
382
  height=700,
383
+ column_widths=[70, 190, 110, 100, 90, 160, 150, 140],
384
  wrap=True,
385
  )
386
  # Setup the button click action
387
+ # overall_rating.click(fn=update_overall_rating_df, inputs=overall_rating, outputs=elo_display_df)
388
+ # english_rating.click(fn=update_english_rating_df, inputs=english_rating, outputs=elo_display_df)
389
+ # chinese_rating.click(fn=update_chinese_rating_df, inputs=chinese_rating ,outputs=elo_display_df)
390
+ # long_context_rating.click(fn=update_long_context_rating_df, inputs=long_context_rating, outputs=elo_display_df)
391
 
392
  with gr.Tab("Full Leaderboard", id=1):
393
  md = make_full_leaderboard_md(elo_results)
 
430
  leader_component_values[:] = [default_md, p1, p2, p3, p4]
431
 
432
  if show_plot:
433
+ more_stats_md = gr.Markdown(
434
  f"""## More Statistics for Chatbot Arena\n
435
+ You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
 
436
  """,
437
  elem_id="leaderboard_markdown"
438
  )
 
 
 
 
 
 
 
 
 
439
  with gr.Row():
440
  with gr.Column():
441
  gr.Markdown(
 
459
  )
460
  plot_4 = gr.Plot(p4, show_label=False)
461
 
462
+ overall_rating.click(fn=update_overall_rating_df, inputs=overall_rating, outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, leaderboard_markdown])
463
+ english_rating.click(fn=update_english_rating_df, inputs=english_rating, outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, leaderboard_markdown])
464
+ chinese_rating.click(fn=update_chinese_rating_df, inputs=chinese_rating ,outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, leaderboard_markdown])
465
+ long_context_rating.click(fn=update_long_context_rating_df, inputs=long_context_rating, outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, leaderboard_markdown])
466
 
467
  gr.Markdown(acknowledgment_md)
468
 
 
496
 
497
  #arena_leaderboard_dataframe td {
498
  line-height: 0.15em;
499
+ font-size: 18px;
500
  }
501
  #arena_leaderboard_dataframe th {
502
  font-size: 20px;
 
505
 
506
  #full_leaderboard_dataframe td {
507
  line-height: 0.15em;
508
+ font-size: 18px;
509
  }
510
  #full_leaderboard_dataframe th {
511
  font-size: 20px;
 
540
 
541
  def build_demo(elo_results_file, leaderboard_table_file):
542
  text_size = gr.themes.sizes.text_lg
 
543
 
544
  with gr.Blocks(
545
  title="Chatbot Arena Leaderboard",
546
+ # theme=gr.themes.Base(text_size=text_size),
 
547
  theme = gr.themes.Base.load("theme.json"),
548
  css=block_css,
549
  ) as demo: