sheonhan committed
Commit
0227006
1 Parent(s): 9cea2a5

Add GPT-4 & human eval tab

Files changed (6)
  1. .gitignore +4 -0
  2. app.py +255 -99
  3. content.py +26 -1
  4. elo_utils.py +175 -0
  5. utils.py +4 -20
  6. visualizations.py +137 -0
.gitignore CHANGED
@@ -4,3 +4,7 @@ __pycache__/
  .env
  .ipynb_checkpoints
  *ipynb
+
+ gpt_4_evals/
+ human_evals/
+ model_counts.html
app.py CHANGED
@@ -1,20 +1,24 @@
1
- import os
2
  import json
 
3
  from datetime import datetime, timezone
4
 
5
- import numpy as np
6
  import gradio as gr
 
7
  import pandas as pd
8
-
9
  from apscheduler.schedulers.background import BackgroundScheduler
10
- from content import *
11
- from huggingface_hub import Repository, HfApi
12
  from transformers import AutoConfig
13
  from utils import get_eval_results_dicts, make_clickable_model
14
 
15
  # clone / pull the lmeh eval data
16
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
17
  LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
18
  IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", None))
19
 
20
  api = HfApi()
@@ -56,6 +60,27 @@ if H4_TOKEN:
56
  requested_models_dir = "./evals/eval_requests"
57
  requested_models = get_all_requested_models(requested_models_dir)
58
59
 
60
  # parse the results
61
  BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
@@ -100,6 +125,16 @@ BENCHMARK_COLS = [
100
  "TruthfulQA (0-shot) ⬆️",
101
  ]
102
103
 
104
  def has_no_nan_values(df, columns):
105
  return df[columns].notna().all(axis=1)
@@ -213,6 +248,42 @@ def get_evaluation_queue_df():
213
  return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
214
 
215
216
  original_df = get_leaderboard_df()
217
  leaderboard_df = original_df.copy()
218
  (
@@ -220,6 +291,14 @@ leaderboard_df = original_df.copy()
220
  running_eval_queue_df,
221
  pending_eval_queue_df,
222
  ) = get_evaluation_queue_df()
223
 
224
 
225
  def is_model_on_hub(model_name, revision) -> bool:
@@ -359,12 +438,11 @@ custom_css = """
359
  }
360
 
361
  /* Hides the final column */
362
- table td:last-child,
363
- table th:last-child {
364
  display: none;
365
  }
366
 
367
-
368
  /* Limit the width of the first column so that names don't expand too much */
369
  table td:first-child,
370
  table th:first-child {
@@ -373,13 +451,30 @@ table th:first-child {
373
  white-space: nowrap;
374
  }
375
376
  """
377
 
378
 
379
  demo = gr.Blocks(css=custom_css)
380
  with demo:
381
  gr.HTML(TITLE)
382
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
383
 
384
  with gr.Row():
385
  with gr.Column():
@@ -393,97 +488,158 @@ with demo:
393
  with gr.Accordion("✨ CHANGELOG", open=False):
394
  changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")
395
 
396
- with gr.Box(elem_id="search-bar-table-box"):
397
- search_bar = gr.Textbox(
398
- placeholder="🔍 Search your model and press ENTER...",
399
- show_label=False,
400
- elem_id="search-bar",
401
- )
402
-
403
- leaderboard_table = gr.components.Dataframe(
404
- value=leaderboard_df,
405
- headers=COLS,
406
- datatype=TYPES,
407
- max_rows=5,
408
- elem_id="leaderboard-table",
409
- )
410
-
411
- # Dummy leaderboard for handling the case when the user uses backspace key
412
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
413
- value=original_df, headers=COLS, datatype=TYPES, max_rows=5, visible=False
414
- )
415
-
416
- search_bar.submit(
417
- search_table,
418
- [hidden_leaderboard_table_for_search, search_bar],
419
- leaderboard_table,
420
- )
421
-
422
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
423
-
424
- with gr.Accordion("✅ Finished Evaluations", open=False):
425
- finished_eval_table = gr.components.Dataframe(
426
- value=finished_eval_queue_df,
427
- headers=EVAL_COLS,
428
- datatype=EVAL_TYPES,
429
- max_rows=5,
430
- )
431
- with gr.Accordion("🔄 Running Evaluation Queue", open=False):
432
- running_eval_table = gr.components.Dataframe(
433
- value=running_eval_queue_df,
434
- headers=EVAL_COLS,
435
- datatype=EVAL_TYPES,
436
- max_rows=5,
437
- )
438
-
439
- with gr.Accordion("⏳ Pending Evaluation Queue", open=False):
440
- pending_eval_table = gr.components.Dataframe(
441
- value=pending_eval_queue_df,
442
- headers=EVAL_COLS,
443
- datatype=EVAL_TYPES,
444
- max_rows=5,
445
- )
446
-
447
- refresh_button = gr.Button("Refresh")
448
- refresh_button.click(
449
- refresh,
450
- inputs=[],
451
- outputs=[
452
- leaderboard_table,
453
- finished_eval_table,
454
- running_eval_table,
455
- pending_eval_table,
456
- ],
457
- )
458
-
459
- with gr.Accordion("Submit a new model for evaluation"):
460
- with gr.Row():
461
- with gr.Column():
462
- model_name_textbox = gr.Textbox(label="Model name")
463
- revision_name_textbox = gr.Textbox(label="revision", placeholder="main")
464
-
465
  with gr.Column():
466
- is_8bit_toggle = gr.Checkbox(
467
- False, label="8 bit eval", visible=not IS_PUBLIC
468
- )
469
- private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
470
- is_delta_weight = gr.Checkbox(False, label="Delta weights")
471
- base_model_name_textbox = gr.Textbox(label="base model (for delta)")
472
-
473
- submit_button = gr.Button("Submit Eval")
474
- submission_result = gr.Markdown()
475
- submit_button.click(
476
- add_new_eval,
477
- [
478
- model_name_textbox,
479
- base_model_name_textbox,
480
- revision_name_textbox,
481
- is_8bit_toggle,
482
- private,
483
- is_delta_weight,
484
- ],
485
- submission_result,
486
- )
 
 
 
487
 
488
  scheduler = BackgroundScheduler()
489
  scheduler.add_job(restart_space, "interval", seconds=3600)
 
 
1
  import json
2
+ import os
3
  from datetime import datetime, timezone
4
 
5
+
6
  import gradio as gr
7
+ import numpy as np
8
  import pandas as pd
 
9
  from apscheduler.schedulers.background import BackgroundScheduler
10
+ from huggingface_hub import HfApi, Repository
 
11
  from transformers import AutoConfig
12
+
13
+ from content import *
14
+ from elo_utils import get_elo_plots, get_elo_results_dicts
15
  from utils import get_eval_results_dicts, make_clickable_model
16
 
17
  # clone / pull the lmeh eval data
18
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
19
  LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
20
+ HUMAN_EVAL_REPO = "HuggingFaceH4/scale-human-eval"
21
+ GPT_4_EVAL_REPO = "HuggingFaceH4/open_llm_leaderboard_oai_evals"
22
  IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", None))
23
 
24
  api = HfApi()
 
60
  requested_models_dir = "./evals/eval_requests"
61
  requested_models = get_all_requested_models(requested_models_dir)
62
 
63
+ human_eval_repo = None
64
+ if H4_TOKEN and not os.path.isdir("./human_evals"):
65
+ print("Pulling human evaluation repo")
66
+ human_eval_repo = Repository(
67
+ local_dir="./human_evals/",
68
+ clone_from=HUMAN_EVAL_REPO,
69
+ use_auth_token=H4_TOKEN,
70
+ repo_type="dataset",
71
+ )
72
+ human_eval_repo.git_pull()
73
+
74
+ gpt_4_eval_repo = None
75
+ if H4_TOKEN and not os.path.isdir("./gpt_4_evals"):
76
+ print("Pulling GPT-4 evaluation repo")
77
+ gpt_4_eval_repo = Repository(
78
+ local_dir="./gpt_4_evals/",
79
+ clone_from=GPT_4_EVAL_REPO,
80
+ use_auth_token=H4_TOKEN,
81
+ repo_type="dataset",
82
+ )
83
+ gpt_4_eval_repo.git_pull()
84
 
85
  # parse the results
86
  BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
 
125
  "TruthfulQA (0-shot) ⬆️",
126
  ]
127
 
128
+ ELO_COLS = [
129
+ "Model",
130
+ "GPT-4 (all)",
131
+ "Human (all)",
132
+ "Human (instruct)",
133
+ "Human (code-instruct)",
134
+ ]
135
+ ELO_TYPES = ["markdown", "number", "number", "number", "number"]
136
+ ELO_SORT_COL = "GPT-4 (all)"
137
+
138
 
139
  def has_no_nan_values(df, columns):
140
  return df[columns].notna().all(axis=1)
 
248
  return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
249
 
250
 
251
+ def get_elo_leaderboard(df_instruct, df_code_instruct, tie_allowed=False):
252
+ if human_eval_repo:
253
+ print("Pulling human_eval_repo changes")
254
+ human_eval_repo.git_pull()
255
+
256
+ all_data = get_elo_results_dicts(df_instruct, df_code_instruct, tie_allowed)
257
+ dataframe = pd.DataFrame.from_records(all_data)
258
+ dataframe = dataframe.sort_values(by=ELO_SORT_COL, ascending=False)
259
+ dataframe = dataframe[ELO_COLS]
260
+ return dataframe
261
+
262
+
263
+ def get_elo_elements():
264
+ df_instruct = pd.read_json("human_evals/without_code.json")
265
+ df_code_instruct = pd.read_json("human_evals/with_code.json")
266
+
267
+ elo_leaderboard = get_elo_leaderboard(
268
+ df_instruct, df_code_instruct, tie_allowed=False
269
+ )
270
+ elo_leaderboard_with_tie_allowed = get_elo_leaderboard(
271
+ df_instruct, df_code_instruct, tie_allowed=True
272
+ )
273
+ plot_1, plot_2, plot_3, plot_4 = get_elo_plots(
274
+ df_instruct, df_code_instruct, tie_allowed=False
275
+ )
276
+
277
+ return (
278
+ elo_leaderboard,
279
+ elo_leaderboard_with_tie_allowed,
280
+ plot_1,
281
+ plot_2,
282
+ plot_3,
283
+ plot_4,
284
+ )
285
+
286
+
287
  original_df = get_leaderboard_df()
288
  leaderboard_df = original_df.copy()
289
  (
 
291
  running_eval_queue_df,
292
  pending_eval_queue_df,
293
  ) = get_evaluation_queue_df()
294
+ (
295
+ elo_leaderboard,
296
+ elo_leaderboard_with_tie_allowed,
297
+ plot_1,
298
+ plot_2,
299
+ plot_3,
300
+ plot_4,
301
+ ) = get_elo_elements()
302
 
303
 
304
  def is_model_on_hub(model_name, revision) -> bool:
 
438
  }
439
 
440
  /* Hides the final column */
441
+ #llm-benchmark-tab-table table td:last-child,
442
+ #llm-benchmark-tab-table table th:last-child {
443
  display: none;
444
  }
445
 
 
446
  /* Limit the width of the first column so that names don't expand too much */
447
  table td:first-child,
448
  table th:first-child {
 
451
  white-space: nowrap;
452
  }
453
 
454
+ .tab-buttons button {
455
+ font-size: 16px;
456
+ }
457
+
458
+ #scale-logo {
459
+ border-style: none !important;
460
+ box-shadow: none;
461
+ display: block;
462
+ margin-left: auto;
463
+ margin-right: auto;
464
+ max-width: 600px;
465
+ }
466
+
467
+ #scale-logo .download {
468
+ display: none;
469
+ }
470
  """
471
 
472
 
473
  demo = gr.Blocks(css=custom_css)
474
  with demo:
475
  gr.HTML(TITLE)
476
+ with gr.Row():
477
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
478
 
479
  with gr.Row():
480
  with gr.Column():
 
488
  with gr.Accordion("✨ CHANGELOG", open=False):
489
  changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")
490
 
491
+ with gr.Tabs(elem_classes="tab-buttons"):
492
+ with gr.TabItem("📊 LLM Benchmarks", elem_id="llm-benchmark-tab-table"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
  with gr.Column():
494
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
495
+ with gr.Box(elem_id="search-bar-table-box"):
496
+ search_bar = gr.Textbox(
497
+ placeholder="🔍 Search your model and press ENTER...",
498
+ show_label=False,
499
+ elem_id="search-bar",
500
+ )
501
+
502
+ leaderboard_table = gr.components.Dataframe(
503
+ value=leaderboard_df,
504
+ headers=COLS,
505
+ datatype=TYPES,
506
+ max_rows=5,
507
+ elem_id="leaderboard-table",
508
+ )
509
+
510
+ # Dummy leaderboard for handling the case when the user uses backspace key
511
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
512
+ value=original_df,
513
+ headers=COLS,
514
+ datatype=TYPES,
515
+ max_rows=5,
516
+ visible=False,
517
+ )
518
+
519
+ search_bar.submit(
520
+ search_table,
521
+ [hidden_leaderboard_table_for_search, search_bar],
522
+ leaderboard_table,
523
+ )
524
+
525
+ with gr.Row():
526
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
527
+
528
+ with gr.Accordion("✅ Finished Evaluations", open=False):
529
+ with gr.Row():
530
+ finished_eval_table = gr.components.Dataframe(
531
+ value=finished_eval_queue_df,
532
+ headers=EVAL_COLS,
533
+ datatype=EVAL_TYPES,
534
+ max_rows=5,
535
+ )
536
+ with gr.Accordion("🔄 Running Evaluation Queue", open=False):
537
+ with gr.Row():
538
+ running_eval_table = gr.components.Dataframe(
539
+ value=running_eval_queue_df,
540
+ headers=EVAL_COLS,
541
+ datatype=EVAL_TYPES,
542
+ max_rows=5,
543
+ )
544
+
545
+ with gr.Accordion("⏳ Pending Evaluation Queue", open=False):
546
+ with gr.Row():
547
+ pending_eval_table = gr.components.Dataframe(
548
+ value=pending_eval_queue_df,
549
+ headers=EVAL_COLS,
550
+ datatype=EVAL_TYPES,
551
+ max_rows=5,
552
+ )
553
+
554
+ with gr.Row():
555
+ refresh_button = gr.Button("Refresh")
556
+ refresh_button.click(
557
+ refresh,
558
+ inputs=[],
559
+ outputs=[
560
+ leaderboard_table,
561
+ finished_eval_table,
562
+ running_eval_table,
563
+ pending_eval_table,
564
+ ],
565
+ )
566
+ with gr.Accordion("Submit a new model for evaluation"):
567
+ with gr.Row():
568
+ with gr.Column():
569
+ model_name_textbox = gr.Textbox(label="Model name")
570
+ revision_name_textbox = gr.Textbox(
571
+ label="revision", placeholder="main"
572
+ )
573
+
574
+ with gr.Column():
575
+ is_8bit_toggle = gr.Checkbox(
576
+ False, label="8 bit eval", visible=not IS_PUBLIC
577
+ )
578
+ private = gr.Checkbox(
579
+ False, label="Private", visible=not IS_PUBLIC
580
+ )
581
+ is_delta_weight = gr.Checkbox(False, label="Delta weights")
582
+ base_model_name_textbox = gr.Textbox(
583
+ label="base model (for delta)"
584
+ )
585
+
586
+ submit_button = gr.Button("Submit Eval")
587
+ submission_result = gr.Markdown()
588
+ submit_button.click(
589
+ add_new_eval,
590
+ [
591
+ model_name_textbox,
592
+ base_model_name_textbox,
593
+ revision_name_textbox,
594
+ is_8bit_toggle,
595
+ private,
596
+ is_delta_weight,
597
+ ],
598
+ submission_result,
599
+ )
600
+ with gr.TabItem(
601
+ "🧑‍⚖️ Human & GPT-4 Evaluations 🤖", elem_id="human-gpt-tab-table"
602
+ ):
603
+ with gr.Row():
604
+ with gr.Column(scale=2):
605
+ gr.Markdown(HUMAN_GPT_EVAL_TEXT, elem_classes="markdown-text")
606
+ with gr.Column(scale=1):
607
+ gr.Image(
608
+ "scale-hf-logo.png", elem_id="scale-logo", show_label=False
609
+ )
610
+ gr.Markdown("## No tie")
611
+ elo_leaderboard_table = gr.components.Dataframe(
612
+ value=elo_leaderboard,
613
+ headers=ELO_COLS,
614
+ datatype=ELO_TYPES,
615
+ max_rows=5,
616
+ )
617
+
618
+ gr.Markdown("## Tie allowed*")
619
+ elo_leaderboard_table_with_tie_allowed = gr.components.Dataframe(
620
+ value=elo_leaderboard_with_tie_allowed,
621
+ headers=ELO_COLS,
622
+ datatype=ELO_TYPES,
623
+ max_rows=5,
624
+ )
625
+
626
+ gr.Markdown("\* Results when the scores of 4 and 5 were treated as ties.", elem_classes="markdown-text")
627
+ # with gr.Box():
628
+ # visualization_title = gr.HTML(VISUALIZATION_TITLE)
629
+ # with gr.Row():
630
+ # with gr.Column():
631
+ # gr.Markdown(f"#### Figure 1: {PLOT_1_TITLE}")
632
+ # plot_1 = gr.Plot(plot_1, show_label=False)
633
+ # with gr.Column():
634
+ # gr.Markdown(f"#### Figure 2: {PLOT_2_TITLE}")
635
+ # plot_2 = gr.Plot(plot_2, show_label=False)
636
+ # with gr.Row():
637
+ # with gr.Column():
638
+ # gr.Markdown(f"#### Figure 3: {PLOT_3_TITLE}")
639
+ # plot_3 = gr.Plot(plot_3, show_label=False)
640
+ # with gr.Column():
641
+ # gr.Markdown(f"#### Figure 4: {PLOT_4_TITLE}")
642
+ # plot_4 = gr.Plot(plot_4, show_label=False)
643
 
644
  scheduler = BackgroundScheduler()
645
  scheduler.add_job(restart_space, "interval", seconds=3600)
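A note on the repository-sync code added above: the Repository objects are only constructed when ./human_evals or ./gpt_4_evals is missing, so after a Space restart with the checkouts already on disk, human_eval_repo and gpt_4_eval_repo stay None and the git_pull() inside get_elo_leaderboard() is skipped. A minimal sketch of how the clone-or-attach pattern could be factored out — the helper name ensure_dataset_repo is ours, not part of the commit, and it uses only the Repository/git_pull calls already present in the diff:

import os
from typing import Optional

from huggingface_hub import Repository


def ensure_dataset_repo(local_dir: str, repo_id: str, token: Optional[str]) -> Optional[Repository]:
    """Clone a dataset repo on first run, otherwise attach to the existing checkout, then pull."""
    if not token:
        return None
    if os.path.isdir(local_dir):
        # Attach to the existing clone so later git_pull() calls still work after a restart.
        repo = Repository(local_dir=local_dir, use_auth_token=token, repo_type="dataset")
    else:
        print(f"Cloning {repo_id} into {local_dir}")
        repo = Repository(
            local_dir=local_dir,
            clone_from=repo_id,
            use_auth_token=token,
            repo_type="dataset",
        )
    repo.git_pull()
    return repo


# Hypothetical usage mirroring the commit:
# human_eval_repo = ensure_dataset_repo("./human_evals/", HUMAN_EVAL_REPO, H4_TOKEN)
# gpt_4_eval_repo = ensure_dataset_repo("./gpt_4_evals/", GPT_4_EVAL_REPO, H4_TOKEN)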
content.py CHANGED
@@ -1,4 +1,7 @@
1
  CHANGELOG_TEXT = f"""
2
  ## [2023-06-05]
3
  - Increase concurrent thread count to 40
4
  - Search models on ENTER
@@ -47,7 +50,11 @@ INTRODUCTION_TEXT = f"""
47
 
48
  🤗 A key advantage of this leaderboard is that anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as LLaMa.
49
 
50
- 📈 We evaluate models on 4 key benchmarks from the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks:
51
  - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
52
  - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
53
  - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
@@ -56,6 +63,15 @@ INTRODUCTION_TEXT = f"""
56
  We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
57
  """
58
  EVALUATION_QUEUE_TEXT = f"""
60
  # Evaluation Queue for the 🤗 Open LLM Leaderboard, these models will be automatically evaluated on the 🤗 cluster
61
  """
@@ -128,3 +144,12 @@ CITATION_BUTTON_TEXT = r"""@misc{open-llm-leaderboard,
128
  primaryClass={cs.CL}
129
  }"""
130
 
 
 
 
 
 
 
 
 
 
 
1
  CHANGELOG_TEXT = f"""
2
+ ## [2023-06-12]
3
+ - Add Human & GPT-4 Evaluations
4
+
5
  ## [2023-06-05]
6
  - Increase concurrent thread count to 40
7
  - Search models on ENTER
 
50
 
51
  🤗 A key advantage of this leaderboard is that anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as LLaMa.
52
 
53
+ 📈 In the **first tab (LLM Benchmarks)**, we evaluate models on 4 key benchmarks from the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks. In the **second tab (Human & GPT-4 Evaluations)**, humans and GPT-4 compare completions from a set of popular open-source large language models (LLMs) on a secret set of instruction prompts.
54
+ """
55
+
56
+ LLM_BENCHMARKS_TEXT = f"""
57
+ Evaluation is performed against 4 popular benchmarks:
58
  - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
59
  - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
60
  - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
 
63
  We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
64
  """
65
 
66
+ HUMAN_GPT_EVAL_TEXT = f"""
67
+ Evaluation is performed by having humans and GPT-4 compare completions from a set of popular open-source large language models (LLMs) on a secret set of instruction prompts. The prompts cover tasks such as brainstorming, creative generation, commonsense reasoning, open question answering, summarization, and code generation. Each comparison is scored on a 1-8 Likert scale by either a human labeler or GPT-4, and the rater is required to choose a preference each time. Using these preferences, we create bootstrapped Elo rankings.
68
+
69
+ We collaborated with **Scale AI** to generate the completions using a professional data labeling workforce on their platform, [following the labeling instructions found here](https://docs.google.com/document/d/1c5-96Lj-UH4lzKjLvJ_MRQaVMjtoEXTYA4dvoAYVCHc/edit?usp=sharing). To understand the evaluation of popular models, we also had GPT-4 label the completions using this prompt.
70
+
71
+ For more information on the calibration and initiation of these measurements, please refer to the [announcement blog post](https://huggingface.co/blog/llm-leaderboard). We would like to express our gratitude to **LMSYS** for providing a [useful notebook](https://colab.research.google.com/drive/1lAQ9cKVErXI1rEYq7hTKNaCQ5Q8TzrI5?usp=sharing) for computing Elo estimates and plots.
72
+ """
73
+
74
+
75
  EVALUATION_QUEUE_TEXT = f"""
76
  # Evaluation Queue for the 🤗 Open LLM Leaderboard, these models will be automatically evaluated on the 🤗 cluster
77
  """
 
144
  primaryClass={cs.CL}
145
  }"""
146
 
147
+ VISUALIZATION_TITLE = """<h1 align="center" id="space-title">📊 Visualizations</h1>"""
148
+
149
+ PLOT_1_TITLE = "Fraction of Model A Wins for All Non-tied A vs. B Comparisons"
150
+
151
+ PLOT_2_TITLE = "Comparison Count of Each Combination of Models (not allowing ties)"
152
+
153
+ PLOT_3_TITLE = "Elo Estimates with error bars (ties allowed)"
154
+
155
+ PLOT_4_TITLE = "Number of Comparisons per Model (not allowing ties)"
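The HUMAN_GPT_EVAL_TEXT block above describes raters scoring each pair of completions on a 1-8 Likert scale, and the app's second leaderboard notes that scores of 4 and 5 are treated as ties. A minimal sketch of that score-to-outcome mapping, mirroring the logic added in elo_utils.py below (the function name score_to_outcome is illustrative, not part of the commit):

def score_to_outcome(rating: int, tie_allowed: bool) -> str:
    """Map a 1-8 preference score to a pairwise battle outcome.

    Low scores favour model A, high scores favour model B; with
    tie_allowed=True the near-even scores 4 and 5 count as ties.
    """
    if tie_allowed:
        if rating < 4:
            return "model_a"
        if rating > 5:
            return "model_b"
        return "tie"
    return "model_a" if rating < 5 else "model_b"


# score_to_outcome(2, tie_allowed=True)  -> "model_a"
# score_to_outcome(5, tie_allowed=True)  -> "tie"
# score_to_outcome(5, tie_allowed=False) -> "model_b"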
elo_utils.py ADDED
@@ -0,0 +1,175 @@
1
+ from collections import defaultdict
2
+ from dataclasses import dataclass
3
+ from typing import Dict, List
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from datasets import load_dataset
8
+
9
+ from content import PLOT_1_TITLE, PLOT_2_TITLE, PLOT_3_TITLE, PLOT_4_TITLE
10
+ from utils import make_clickable_model
11
+ from visualizations import (get_bootstrap_result, switch_model_a_b,
12
+ visualize_battle_count, visualize_bootstrap_scores,
13
+ visualize_pairwise_win_fraction,
14
+ visualize_rating_count)
15
+
16
+
17
+ @dataclass
18
+ class EloEvalResult:
19
+ model: str
20
+ gpt_4_all: int
21
+ human_all: int
22
+ human_instruct: int
23
+ human_code_instruct: int
24
+ tie_allowed: bool
25
+
26
+ def to_dict(self):
27
+ base_model = f"{self.model}"
28
+ data_dict = {}
29
+ data_dict["Model"] = make_clickable_model(base_model)
30
+ data_dict["GPT-4 (all)"] = self.gpt_4_all
31
+ data_dict["Human (all)"] = self.human_all
32
+ data_dict["Human (instruct)"] = self.human_instruct
33
+ data_dict["Human (code-instruct)"] = self.human_code_instruct
34
+
35
+ return data_dict
36
+
37
+
38
+ def create_eval_df(df, tie_allowed):
39
+ responses = []
40
+ for _, row in df.iterrows():
41
+ if row["status"] == "canceled":
42
+ continue
43
+
44
+ rating = row["response"]["annotations"]["Preference"]
45
+ if rating == "NaN":
46
+ continue
47
+
48
+ scores = row["response"]["responses"]
49
+ if any(s["Preference"] == "" for s in scores):
50
+ continue
51
+
52
+ response = {
53
+ "id": row["task_id"],
54
+ "prompt": row["params"]["templateVariables"]["prompt"],
55
+ "model_a": row["params"]["templateVariables"]["modela"],
56
+ "model_b": row["params"]["templateVariables"]["modelb"],
57
+ "response_a": row["params"]["templateVariables"]["response1"],
58
+ "response_b": row["params"]["templateVariables"]["response2"],
59
+ "rating": int(rating),
60
+ "ratings": [np.array([s["Preference"] for s in scores], dtype=np.int32)],
61
+ }
62
+
63
+ if tie_allowed:
64
+ response["win"] = "model_a" if response["rating"] < 4 else "model_b" if response["rating"] > 5 else "tie"
65
+ else:
66
+ response["win"] = "model_a" if response["rating"] < 5 else "model_b"
67
+
68
+ responses.append(response)
69
+
70
+ return pd.DataFrame(responses)
71
+
72
+
73
+ def create_eval_df_for_gpt(df, tie_allowed):
74
+ responses = []
75
+ for _, row in df.iterrows():
76
+ response = {
77
+ "id": row["review_id"],
78
+ "prompt": row["question"],
79
+ "model_a": row["model1"],
80
+ "model_b": row["model2"],
81
+ "response_a": row["answer1"],
82
+ "response_b": row["answer2"],
83
+ "rating": row["score"][0],
84
+ }
85
+
86
+ if tie_allowed:
87
+ response["win"] = "model_a" if response["rating"] < 4 else "model_b" if response["rating"] > 5 else "tie"
88
+ else:
89
+ response["win"] = "model_a" if response["rating"] < 5 else "model_b"
90
+
91
+ responses.append(response)
92
+
93
+ return pd.DataFrame(responses)
94
+
95
+
96
+ # Compute the Elo rating for each model
97
+ def compute_elo(df, k=32, scale=400, base=10, initial_rating=1000):
98
+ rating = defaultdict(lambda: initial_rating)
99
+
100
+ for _, model_a, model_b, win in df[["model_a", "model_b", "win"]].itertuples():
101
+ ra = rating[model_a]
102
+ rb = rating[model_b]
103
+ ea = 1 / (1 + base ** ((rb - ra) / scale))
104
+ eb = 1 / (1 + base ** ((ra - rb) / scale))
105
+ if win == "model_a":
106
+ sa = 1
107
+ elif win == "model_b":
108
+ sa = 0
109
+ elif win == "tie" or win == "tie (bothbad)":
110
+ sa = 0.5
111
+ else:
112
+ raise Exception(f"unexpected vote {win}")
113
+ rating[model_a] += k * (sa - ea)
114
+ rating[model_b] += k * (1 - sa - eb)
115
+
116
+ return rating
117
+
118
+
119
+ def convert_rating_from_float_to_int(df):
120
+ return {model: int(rating) for model, rating in compute_elo(df).items()}
121
+
122
+
123
+ def get_elo_results(df_instruct, df_code_instruct, tie_allowed):
124
+ df_all = pd.concat([df_instruct, df_code_instruct])
125
+
126
+ df_gpt_4 = load_dataset(
127
+ "gpt_4_evals/data/", split="train", revision="e007baaf6e505731c08a0bc1a833a1f8f8cb8846"
128
+ ).to_pandas()
129
+
130
+ dfs = [df_instruct, df_code_instruct, df_all]
131
+ elo_ratings = [convert_rating_from_float_to_int(create_eval_df(df, tie_allowed=tie_allowed)) for df in dfs]
132
+
133
+ gpt_4_elo_ratings = convert_rating_from_float_to_int(create_eval_df_for_gpt(df_gpt_4, tie_allowed=tie_allowed))
134
+ elo_ratings.append(gpt_4_elo_ratings)
135
+
136
+ results = [
137
+ EloEvalResult(
138
+ model=model_name,
139
+ gpt_4_all=elo_ratings[3][model_name],
140
+ human_all=elo_ratings[2][model_name],
141
+ human_instruct=elo_ratings[0][model_name],
142
+ human_code_instruct=elo_ratings[1][model_name],
143
+ tie_allowed=tie_allowed,
144
+ )
145
+ for model_name in elo_ratings[0].keys()
146
+ ]
147
+
148
+ return results
149
+
150
+
151
+ def get_elo_results_dicts(df_instruct, df_code_instruct, tie_allowed) -> List[Dict]:
152
+ eval_results = get_elo_results(df_instruct, df_code_instruct, tie_allowed)
153
+ return [r.to_dict() for r in eval_results]
154
+
155
+
156
+ def get_elo_plots(df_instruct, df_code_instruct, tie_allowed):
157
+ df_instruct = create_eval_df(df_instruct, tie_allowed=tie_allowed)
158
+ df_code_instruct = create_eval_df(df_code_instruct, tie_allowed=tie_allowed)
159
+ df_all = pd.concat([df_instruct, df_code_instruct])
160
+ game = df_all[["model_a", "model_b", "win"]]
161
+
162
+ game_switch = switch_model_a_b(game)
163
+ plot_1 = visualize_pairwise_win_fraction(game_switch, PLOT_1_TITLE)
164
+
165
+ plot_2 = visualize_battle_count(game_switch, PLOT_2_TITLE)
166
+
167
+ BOOTSTRAP_ROUNDS = 1000
168
+ if "bootstrap_elo_lu" not in globals():
169
+ bootstrap_elo_lu = get_bootstrap_result(game_switch, compute_elo, BOOTSTRAP_ROUNDS)
170
+
171
+ plot_3 = visualize_bootstrap_scores(bootstrap_elo_lu, PLOT_3_TITLE)
172
+
173
+ plot_4 = visualize_rating_count(game, PLOT_4_TITLE)
174
+
175
+ return plot_1, plot_2, plot_3, plot_4
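compute_elo above is a standard sequential Elo update: the expected score for model A is 1 / (1 + base ** ((rb - ra) / scale)), and each battle moves both ratings by at most k points. A small worked example on toy battles (the model names are made up, and importing compute_elo from elo_utils.py is an assumption about how the module is used; this is not part of the commit):

import pandas as pd

from elo_utils import compute_elo  # assumes elo_utils.py and its dependencies are importable

battles = pd.DataFrame(
    [
        {"model_a": "model-x", "model_b": "model-y", "win": "model_a"},
        {"model_a": "model-y", "model_b": "model-x", "win": "model_a"},
        {"model_a": "model-x", "model_b": "model-y", "win": "tie"},
    ]
)

ratings = compute_elo(battles)
# Both models start at 1000. The first battle has expected score 0.5, so with k=32
# model-x moves to 1016 and model-y to 984; the later battles partially undo that.
print({model: round(rating, 1) for model, rating in ratings.items()})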
utils.py CHANGED
@@ -1,21 +1,11 @@
1
- import os
2
- import shutil
3
- import numpy as np
4
- import gradio as gr
5
- from huggingface_hub import Repository, HfApi
6
- from transformers import AutoConfig, AutoModel
7
- import json
8
- from apscheduler.schedulers.background import BackgroundScheduler
9
- import pandas as pd
10
- import datetime
11
  import glob
 
12
  from dataclasses import dataclass
13
- from typing import List, Tuple, Dict
14
 
15
- # clone / pull the lmeh eval data
16
- H4_TOKEN = os.environ.get("H4_TOKEN", None)
17
- LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
18
 
 
19
  METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
20
  BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
21
  BENCH_TO_NAME = {
@@ -71,13 +61,11 @@ class EvalResult:
71
  data_dict["eval_name"] = self.eval_name
72
  data_dict["8bit"] = self.is_8bit
73
  data_dict["Model"] = make_clickable_model(base_model)
74
- # dummy column to implement search bar (hidden by custom CSS)
75
  data_dict["model_name_for_query"] = base_model
76
  data_dict["Revision"] = self.revision
77
  data_dict["Average ⬆️"] = round(
78
  sum([v for k, v in self.results.items()]) / 4.0, 1
79
  )
80
- # data_dict["# params"] = get_n_params(base_model)
81
 
82
  for benchmark in BENCHMARKS:
83
  if not benchmark in self.results.keys():
@@ -151,7 +139,3 @@ def get_eval_results_dicts(is_public=True) -> List[Dict]:
151
  eval_results = get_eval_results(is_public)
152
 
153
  return [e.to_dict() for e in eval_results]
154
-
155
-
156
- eval_results_dict = get_eval_results_dicts()
157
- # print(eval_results_dict)

1
  import glob
2
+ import json
3
  from dataclasses import dataclass
4
+ from typing import Dict, List, Tuple
5
 
6
+ import numpy as np
 
 
7
 
8
+ # clone / pull the lmeh eval data
9
  METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
10
  BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
11
  BENCH_TO_NAME = {
 
61
  data_dict["eval_name"] = self.eval_name
62
  data_dict["8bit"] = self.is_8bit
63
  data_dict["Model"] = make_clickable_model(base_model)
 
64
  data_dict["model_name_for_query"] = base_model
65
  data_dict["Revision"] = self.revision
66
  data_dict["Average ⬆️"] = round(
67
  sum([v for k, v in self.results.items()]) / 4.0, 1
68
  )
 
69
 
70
  for benchmark in BENCHMARKS:
71
  if not benchmark in self.results.keys():
 
139
  eval_results = get_eval_results(is_public)
140
 
141
  return [e.to_dict() for e in eval_results]

visualizations.py ADDED
@@ -0,0 +1,137 @@
1
+ import math
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ import plotly.express as px
6
+
7
+
8
+ # 1
9
+ def compute_pairwise_win_fraction(battles):
10
+ # Times each model wins as Model A
11
+ a_win_ptbl = pd.pivot_table(
12
+ battles[battles["win"] == "model_a"],
13
+ index="model_a",
14
+ columns="model_b",
15
+ aggfunc="size",
16
+ fill_value=0,
17
+ )
18
+
19
+ # Table counting times each model wins as Model B
20
+ b_win_ptbl = pd.pivot_table(
21
+ battles[battles["win"] == "model_b"],
22
+ index="model_a",
23
+ columns="model_b",
24
+ aggfunc="size",
25
+ fill_value=0,
26
+ )
27
+
28
+ # Table counting number of A-B pairs
29
+ num_battles_ptbl = pd.pivot_table(battles, index="model_a", columns="model_b", aggfunc="size", fill_value=0)
30
+
31
+ # Computing the proportion of wins for each model as A and as B
32
+ # against all other models
33
+ row_beats_col_freq = (a_win_ptbl + b_win_ptbl.T) / (num_battles_ptbl + num_battles_ptbl.T)
34
+
35
+ # Arrange ordering according to proportion of wins
36
+ prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
37
+ model_names = list(prop_wins.keys())
38
+ row_beats_col = row_beats_col_freq.loc[model_names, model_names]
39
+ return row_beats_col
40
+
41
+
42
+ def visualize_pairwise_win_fraction(battles, title):
43
+ row_beats_col = compute_pairwise_win_fraction(battles)
44
+ fig = px.imshow(row_beats_col, color_continuous_scale="RdBu", text_auto=".2f", title=title)
45
+ fig.update_layout(
46
+ xaxis_title="Model B",
47
+ yaxis_title="Model A",
48
+ xaxis_side="top",
49
+ title_y=0.07,
50
+ title_x=0.5,
51
+ )
52
+ fig.update_traces(hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Fraction of A Wins: %{z}<extra></extra>")
53
+ return fig
54
+
55
+
56
+ # 2
57
+ def switch_model_a_b(df):
58
+ df_switch = df.copy()
59
+ # switch with probability 0.5
60
+ for i, row in df.iterrows():
61
+ if np.random.rand() < 0.5:
62
+ df_switch.at[i, "model_a"] = row["model_b"]
63
+ df_switch.at[i, "model_b"] = row["model_a"]
64
+ if row["win"] == "model_a":
65
+ df_switch.at[i, "win"] = "model_b"
66
+ elif row["win"] == "model_b":
67
+ df_switch.at[i, "win"] = "model_a"
68
+ return df_switch
69
+
70
+
71
+ def visualize_battle_count(battles, title):
72
+ ptbl = pd.pivot_table(battles, index="model_a", columns="model_b", aggfunc="size", fill_value=0)
73
+ battle_counts = ptbl + ptbl.T
74
+ ordering = battle_counts.sum().sort_values(ascending=False).index
75
+ fig = px.imshow(battle_counts.loc[ordering, ordering], title=title, text_auto=True, width=600)
76
+ fig.update_layout(
77
+ xaxis_title="Model B",
78
+ yaxis_title="Model A",
79
+ xaxis_side="top",
80
+ title_y=0.07,
81
+ title_x=0.5,
82
+ )
83
+ fig.update_traces(hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Count: %{z}<extra></extra>")
84
+ return fig
85
+
86
+
87
+ # 3
88
+ def get_bootstrap_result(battles, func_compute_elo, num_round):
89
+ rows = [func_compute_elo(battles.sample(frac=1.0, replace=True)) for _ in range(num_round)]
90
+ df = pd.DataFrame(rows)
91
+ return df[df.median().sort_values(ascending=False).index]
92
+
93
+
94
+ def visualize_bootstrap_scores(df, title):
95
+ bars = (
96
+ pd.DataFrame(
97
+ dict(
98
+ lower=df.quantile(0.025),
99
+ rating=df.quantile(0.5),
100
+ upper=df.quantile(0.975),
101
+ )
102
+ )
103
+ .reset_index(names="model")
104
+ .sort_values("rating", ascending=False)
105
+ )
106
+ bars["error_y"] = bars["upper"] - bars["rating"]
107
+ bars["error_y_minus"] = bars["rating"] - bars["lower"]
108
+ bars["rating_rounded"] = np.round(bars["rating"], 2)
109
+ fig = px.scatter(
110
+ bars,
111
+ x="model",
112
+ y="rating",
113
+ error_y="error_y",
114
+ error_y_minus="error_y_minus",
115
+ text="rating_rounded",
116
+ title=title,
117
+ )
118
+ fig.update_layout(xaxis_title="Model", yaxis_title="Rating")
119
+ return fig
120
+
121
+
122
+ # 4
123
+ def visualize_rating_count(df, title):
124
+ df_all_value_counts = pd.concat([df["model_a"], df["model_b"]]).value_counts()
125
+ fig = px.bar(df_all_value_counts, title=title, text_auto=True)
126
+
127
+ min_y = df_all_value_counts.min()
128
+ max_y = df_all_value_counts.max()
129
+
130
+ y_end = math.ceil(max_y / 100) * 100
131
+ y_begin = math.floor(min_y / 100) * 100
132
+
133
+ fig.update_layout(xaxis_title="model", yaxis_title="Rating Count", showlegend=False)
134
+ fig.update_yaxes(range=[y_begin, y_end])
135
+ # save the plot for the blog:
136
+ fig.write_html("model_counts.html", full_html=False, include_plotlyjs="cdn")
137
+ return fig
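Putting the pieces together: get_elo_plots in elo_utils.py feeds these helpers a battles frame with model_a, model_b and win columns. A rough end-to-end sketch on synthetic battles (illustrative only — the random data and model names are not real evaluation results, and it assumes elo_utils.py and visualizations.py are importable):

import numpy as np
import pandas as pd

from elo_utils import compute_elo
from visualizations import (
    get_bootstrap_result,
    switch_model_a_b,
    visualize_bootstrap_scores,
    visualize_pairwise_win_fraction,
)

# Build a toy battles frame with the three columns the helpers expect.
rng = np.random.default_rng(0)
models = ["model-x", "model-y", "model-z"]
rows = []
for _ in range(200):
    a, b = rng.choice(models, size=2, replace=False)
    rows.append({"model_a": a, "model_b": b, "win": rng.choice(["model_a", "model_b", "tie"])})
battles = pd.DataFrame(rows)

# Randomize the A/B position so position bias does not leak into the plots.
battles = switch_model_a_b(battles)

win_fraction_fig = visualize_pairwise_win_fraction(battles, "Toy pairwise win fraction")
bootstrap = get_bootstrap_result(battles, compute_elo, num_round=100)
elo_fig = visualize_bootstrap_scores(bootstrap, "Toy bootstrapped Elo estimates")
elo_fig.show()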