Linker1907 committed
Commit d16cee2
1 Parent(s): e868f35

Using the new backend

README.md CHANGED
@@ -8,6 +8,7 @@ sdk_version: 3.27.0
 app_file: app.py
 pinned: true
 license: apache-2.0
+duplicated_from: HuggingFaceH4/open_llm_leaderboard
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -15,26 +15,40 @@ from src.assets.text_content import *
 from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
 from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
 from src.assets.css_html_js import custom_css, get_window_url_params
-from src.utils_display import AutoEvalColumn, EvalQueueColumn, EloEvalColumn, fields, styled_error, styled_warning, styled_message
-from src.init import load_all_info_from_hub
+from src.utils_display import AutoEvalColumn, EvalQueueColumn, fields, styled_error, styled_warning, styled_message
+from src.init import get_all_requested_models, load_all_info_from_hub

 # clone / pull the lmeh eval data
 H4_TOKEN = os.environ.get("H4_TOKEN", None)
-LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
+
+QUEUE_REPO = "open-llm-leaderboard/requests"
+RESULTS_REPO = "open-llm-leaderboard/results"
+
+PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
+PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
+
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
 ADD_PLOTS = False

-EVAL_REQUESTS_PATH = "auto_evals/eval_requests"
+EVAL_REQUESTS_PATH = "eval-queue"
+EVAL_RESULTS_PATH = "eval-results"

-api = HfApi()
+EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
+EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"

+api = HfApi()

 def restart_space():
     api.restart_space(
         repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
     )

-auto_eval_repo, requested_models = load_all_info_from_hub(LMEH_REPO)
+eval_queue, requested_models, eval_results = load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH)
+
+if not IS_PUBLIC:
+    eval_queue_private, requested_models_private, eval_results_private = load_all_info_from_hub(PRIVATE_QUEUE_REPO, PRIVATE_RESULTS_REPO, EVAL_REQUESTS_PATH_PRIVATE, EVAL_RESULTS_PATH_PRIVATE)
+else:
+    eval_queue_private, eval_results_private = None, None

 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
@@ -60,9 +74,12 @@ def has_nan_values(df, columns):


 def get_leaderboard_df():
-    if auto_eval_repo:
+    if eval_results:
         print("Pulling evaluation results for the leaderboard.")
-        auto_eval_repo.git_pull()
+        eval_results.git_pull()
+    if eval_results_private:
+        print("Pulling evaluation results for the leaderboard.")
+        eval_results_private.git_pull()

     all_data = get_eval_results_dicts(IS_PUBLIC)

@@ -84,9 +101,12 @@ def get_leaderboard_df():

 def get_evaluation_queue_df():
     # todo @saylortwift: replace the repo by the one you created for the eval queue
-    if auto_eval_repo:
+    if eval_queue:
+        print("Pulling changes for the evaluation queue.")
+        eval_queue.git_pull()
+    if eval_queue_private:
         print("Pulling changes for the evaluation queue.")
-        auto_eval_repo.git_pull()
+        eval_queue_private.git_pull()

     entries = [
         entry
@@ -106,7 +126,7 @@ def get_evaluation_queue_df():
             data["revision"] = data.get("revision", "main")

             all_evals.append(data)
-        else:
+        elif ".md" not in entry:
             # this is a folder
             sub_entries = [
                 e
@@ -124,10 +144,10 @@

     pending_list = [e for e in all_evals if e["status"] == "PENDING"]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"] == "FINISHED"]
-    df_pending = pd.DataFrame.from_records(pending_list)
-    df_running = pd.DataFrame.from_records(running_list)
-    df_finished = pd.DataFrame.from_records(finished_list)
+    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]
+    df_pending = pd.DataFrame.from_records(pending_list, columns=EVAL_COLS)
+    df_running = pd.DataFrame.from_records(running_list, columns=EVAL_COLS)
+    df_finished = pd.DataFrame.from_records(finished_list, columns=EVAL_COLS)
     return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]


@@ -149,7 +169,7 @@ def is_model_on_hub(model_name, revision) -> bool:
         return False, "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard."

     except Exception as e:
-        print("Could not get the model config from the hub.: \n", e)
+        print(f"Could not get the model config from the hub.: {e}")
         return False, "was not found on hub!"


@@ -200,7 +220,7 @@ def add_new_eval(
     out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"

     # Check for duplicate submission
-    if out_path.split("eval_requests/")[1].lower() in requested_models:
+    if out_path.split("eval-queue/")[1].lower() in requested_models:
         return styled_warning("This model has been already submitted.")

     with open(out_path, "w") as f:
@@ -208,13 +228,17 @@

     api.upload_file(
         path_or_fileobj=out_path,
-        path_in_repo=out_path,
-        repo_id=LMEH_REPO,
+        path_in_repo=out_path.split("eval-queue/")[1],
+        repo_id=QUEUE_REPO,
         token=H4_TOKEN,
         repo_type="dataset",
+        commit_message=f"Add {model} to eval queue",
     )

-    return styled_message("Your request has been submitted to the evaluation queue!")
+    # remove the local file
+    os.remove(out_path)
+
+    return styled_message("Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list.")


 def refresh():
@@ -310,13 +334,6 @@ with demo:
             )
         with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-            with gr.Accordion("📙 Citation", open=False):
-                citation_button = gr.Textbox(
-                    value=CITATION_BUTTON_TEXT,
-                    label=CITATION_BUTTON_LABEL,
-                    elem_id="citation-button",
-                ).style(show_copy_button=True)
-

     with gr.Column():
         with gr.Row():
@@ -396,6 +413,14 @@ with demo:
                 submission_result,
             )

+    with gr.Row():
+        with gr.Accordion("📙 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=CITATION_BUTTON_TEXT,
+                label=CITATION_BUTTON_LABEL,
+                elem_id="citation-button",
+            ).style(show_copy_button=True)
+
     dummy = gr.Textbox(visible=False)
     demo.load(
         change_tab,
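
Note on the new `columns=EVAL_COLS` arguments in `get_evaluation_queue_df` above: they keep the three queue frames well-formed even when a status bucket is empty. A minimal sketch of why that matters (the `EVAL_COLS` names below are illustrative, not the app's actual column set, which comes from `EvalQueueColumn`):

```python
import pandas as pd

# Hypothetical column set for the evaluation queue tables.
EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]

records = []  # e.g. no RUNNING evals at the moment

# Without an explicit column list, an empty record set yields a frame with no
# columns, so a later df[EVAL_COLS] selection raises a KeyError.
df_bare = pd.DataFrame.from_records(records)
print(df_bare.columns.tolist())  # []

# Passing columns=EVAL_COLS keeps the schema stable even for an empty bucket.
df_typed = pd.DataFrame.from_records(records, columns=EVAL_COLS)
print(df_typed[EVAL_COLS].shape)  # (0, 6)
```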
src/assets/text_content.py CHANGED
@@ -61,7 +61,7 @@ INTRODUCTION_TEXT = f"""

 🤗 Anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as LLaMa.

-Other cool benchmarks for LLMs are developped at HuggingFace, go check them out: 🙋🤖 [human and GPT4 evals](https://huggingface.co/spaces/HuggingFaceH4/human_eval_llm_leaderboard), 🖥️ [performance bencmarks](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)
+Other cool benchmarks for LLMs are developped at HuggingFace, go check them out: 🙋🤖 [human and GPT4 evals](https://huggingface.co/spaces/HuggingFaceH4/human_eval_llm_leaderboard), 🖥️ [performance benchmarks](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)
 """

 LLM_BENCHMARKS_TEXT = f"""
@@ -78,6 +78,29 @@ With the plethora of large language models (LLMs) and chatbots being released we
 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.


+# Some good practices before submitting a model
+
+## 1) Make sure you can load your model and tokenizer using AutoClasses:
+```python
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+config = AutoConfig.from_pretrained("your model name", revision=revision)
+model = AutoModel.from_pretrained("your model name", revision=revision)
+tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+```
+If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+
+Note: make sure your model is public!
+Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
+
+## 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
+It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of weights of your model to the `Extended Viewer`!
+
+## 3) Make sure your model has an open license!
+This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
+
+## 4) Fill up your model card
+When we add extra information about models to the leaderboard, it will be automatically taken from the model card
+
 # Reproduction
 To reproduce our results, here is the commands you can run, using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/e47e01beea79cfe87421e2dac49e64d499c240b4) of the Eleuther AI Harness:
 `python main.py --model=hf-causal --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
@@ -87,10 +110,17 @@ The total batch size we get for models which fit on one A100 node is 16 (8 GPUs
 *You can expect results to vary slightly for different batch sizes because of padding.*

 The tasks and few shots parameters are:
-- ARC: 25-shot, *arc-challenge*
-- HellaSwag: 10-shot, *hellaswag*
-- TruthfulQA: 0-shot, *truthfulqa-mc* (mc2 score)
-- MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions*
+- ARC: 25-shot, *arc-challenge* (`acc_norm`)
+- HellaSwag: 10-shot, *hellaswag* (`acc_norm`)
+- TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
+- MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (`acc` of `all`)
+
+# In case of model failure
+If your model is displayed in the `FAILED` category, its execution stopped.
+Make sure you have followed the above steps first.
+If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+
+
 """

 EVALUATION_QUEUE_TEXT = f"""
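
Regarding good practice 2) added above (safetensors weights): a submitter can check this from the Hub before submitting. This is a hedged sketch using `huggingface_hub.list_repo_files`; the helper name is ours and is not part of the leaderboard code:

```python
from huggingface_hub import list_repo_files

def ships_safetensors(repo_id: str, revision: str = "main") -> bool:
    """Rough pre-submission check: does the repo contain any .safetensors file?"""
    files = list_repo_files(repo_id, revision=revision)
    return any(f.endswith(".safetensors") for f in files)

# Example: True if the repo ships safetensors weights at the given revision.
print(ships_safetensors("gpt2"))
```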
src/auto_leaderboard/load_results.py CHANGED
@@ -7,14 +7,13 @@ from typing import Dict, List, Tuple
 from src.utils_display import AutoEvalColumn, make_clickable_model
 import numpy as np

-# clone / pull the lmeh eval data
-METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
-BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
+METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
+BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
 BENCH_TO_NAME = {
-    "arc_challenge": AutoEvalColumn.arc.name,
+    "arc:challenge": AutoEvalColumn.arc.name,
     "hellaswag": AutoEvalColumn.hellaswag.name,
-    "hendrycks": AutoEvalColumn.mmlu.name,
-    "truthfulqa_mc": AutoEvalColumn.truthfulqa.name,
+    "hendrycksTest": AutoEvalColumn.mmlu.name,
+    "truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
 }


@@ -24,8 +23,8 @@ class EvalResult:
     org: str
     model: str
     revision: str
-    is_8bit: bool
     results: dict
+    is_8bit: bool = False

     def to_dict(self):
         if self.org is not None:
@@ -44,7 +43,7 @@ class EvalResult:
         )

         for benchmark in BENCHMARKS:
-            if not benchmark in self.results.keys():
+            if benchmark not in self.results.keys():
                 self.results[benchmark] = None

         for k, v in BENCH_TO_NAME.items():
@@ -53,57 +52,61 @@ class EvalResult:
         return data_dict


-def parse_eval_result(json_filepath: str) -> Tuple[str, dict]:
+def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
     with open(json_filepath) as fp:
         data = json.load(fp)

-    path_split = json_filepath.split("/")
-    org = None
-    model = path_split[-4]
-    is_8bit = path_split[-2] == "8bit"
-    revision = path_split[-3]
-    if len(path_split) == 7:
-        # handles gpt2 type models that don't have an org
-        result_key = f"{model}_{revision}_{is_8bit}"
+    config = data["config"]
+    model = config.get("model_name", None)
+    if model is None:
+        model = config.get("model_args", None)
+
+    model_sha = config.get("model_sha", "")
+    eval_sha = config.get("lighteval_sha", "")
+    model_split = model.split("/", 1)
+
+    model = model_split[-1]
+
+    if len(model_split) == 1:
+        org = None
+        model = model_split[0]
+        result_key = f"{model}_{model_sha}_{eval_sha}"
     else:
-        org = path_split[-5]
-        result_key = f"{org}_{model}_{revision}_{is_8bit}"
+        org = model_split[0]
+        model = model_split[1]
+        result_key = f"{org}_{model}_{model_sha}_{eval_sha}"

-    eval_result = None
+    eval_results = []
     for benchmark, metric in zip(BENCHMARKS, METRICS):
-        if benchmark in json_filepath:
-            accs = np.array([v[metric] for v in data["results"].values()])
-            mean_acc = round(np.mean(accs) * 100.0, 1)
-            eval_result = EvalResult(
-                result_key, org, model, revision, is_8bit, {benchmark: mean_acc}
-            )
+        accs = np.array([v[metric] for k, v in data["results"].items() if benchmark in k])
+        if accs.size == 0:
+            continue
+        mean_acc = round(np.mean(accs) * 100.0, 1)
+        eval_results.append(EvalResult(
+            result_key, org, model, model_sha, {benchmark: mean_acc}
+        ))

-    return result_key, eval_result
+    return result_key, eval_results


 def get_eval_results(is_public) -> List[EvalResult]:
     json_filepaths = glob.glob(
-        "auto_evals/eval_results/public/**/16bit/*.json", recursive=True
+        "eval-results/**/results*.json", recursive=True
     )
     if not is_public:
         json_filepaths += glob.glob(
-            "auto_evals/eval_results/private/**/*.json", recursive=True
-        )
-        json_filepaths += glob.glob(
-            "auto_evals/eval_results/private/**/*.json", recursive=True
+            "private-eval-results/**/results*.json", recursive=True
         )
-    # include the 8bit evals of public models
-    json_filepaths += glob.glob(
-        "auto_evals/eval_results/public/**/8bit/*.json", recursive=True
-    )
+
     eval_results = {}

     for json_filepath in json_filepaths:
-        result_key, eval_result = parse_eval_result(json_filepath)
-        if result_key in eval_results.keys():
-            eval_results[result_key].results.update(eval_result.results)
-        else:
-            eval_results[result_key] = eval_result
+        result_key, results = parse_eval_result(json_filepath)
+        for eval_result in results:
+            if result_key in eval_results.keys():
+                eval_results[result_key].results.update(eval_result.results)
+            else:
+                eval_results[result_key] = eval_result

     eval_results = [v for v in eval_results.values()]

src/init.py CHANGED
@@ -13,26 +13,37 @@ def get_all_requested_models(requested_models_dir):
         if current_depth == depth:
             file_names.extend([os.path.join(root, file) for file in files])

-    return set([file_name.lower().split("eval_requests/")[1] for file_name in file_names])
+    return set([file_name.lower().split("eval-queue/")[1] for file_name in file_names])

-def load_all_info_from_hub(LMEH_REPO):
-    auto_eval_repo = None
+def load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, QUEUE_PATH, RESULTS_PATH):
+    eval_queue_repo = None
+    eval_results_repo = None
     requested_models = None
+
     if H4_TOKEN:
         print("Pulling evaluation requests and results.")

-        auto_eval_repo = Repository(
-            local_dir="./auto_evals/",
-            clone_from=LMEH_REPO,
+        eval_queue_repo = Repository(
+            local_dir=QUEUE_PATH,
+            clone_from=QUEUE_REPO,
+            use_auth_token=H4_TOKEN,
+            repo_type="dataset",
+        )
+        eval_queue_repo.git_pull()
+
+        eval_results_repo = Repository(
+            local_dir=RESULTS_PATH,
+            clone_from=RESULTS_REPO,
             use_auth_token=H4_TOKEN,
             repo_type="dataset",
         )
-        auto_eval_repo.git_pull()
+        eval_results_repo.git_pull()

-        requested_models_dir = "./auto_evals/eval_requests"
-        requested_models = get_all_requested_models(requested_models_dir)
+        requested_models = get_all_requested_models("eval-queue")
+    else:
+        print("No HuggingFace token provided. Skipping evaluation requests and results.")

-    return auto_eval_repo, requested_models
+    return eval_queue_repo, requested_models, eval_results_repo


 #def load_results(model, benchmark, metric):
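
For context, a hedged usage sketch mirroring how `app.py` now calls this loader (the repo names and local paths come straight from the diff above; this is not additional code in the commit):

```python
from src.init import load_all_info_from_hub

# Clone/pull the public queue and results dataset repos.
eval_queue, requested_models, eval_results = load_all_info_from_hub(
    "open-llm-leaderboard/requests",
    "open-llm-leaderboard/results",
    "eval-queue",
    "eval-results",
)

# With no H4_TOKEN in the environment the function returns (None, None, None),
# which is why app.py guards every later git_pull() call.
if eval_results is not None:
    eval_results.git_pull()
```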
src/utils_display.py CHANGED
@@ -15,17 +15,17 @@ def fields(raw_class):
 @dataclass(frozen=True)
 class AutoEvalColumn:  # Auto evals column
     model = ColumnContent("Model", "markdown", True)
-    revision = ColumnContent("Revision", "str", True, True)
+    average = ColumnContent("Average ⬆️", "number", True)
+    arc = ColumnContent("ARC ⬆️", "number", True)
+    hellaswag = ColumnContent("HellaSwag ⬆️", "number", True)
+    mmlu = ColumnContent("MMLU ⬆️", "number", True)
+    truthfulqa = ColumnContent("TruthfulQA (MC) ⬆️", "number", True)
     model_type = ColumnContent("Type", "bool", False)
     is_8bit = ColumnContent("8bit", "bool", False, True)
     license = ColumnContent("Hub License", "str", False)
     params = ColumnContent("#Params (B)", "number", False)
     likes = ColumnContent("Hub ❤️", "number", False)
-    average = ColumnContent("Average ⬆️", "number", True)
-    arc = ColumnContent("ARC (25-s) ⬆️", "number", True)
-    hellaswag = ColumnContent("HellaSwag (10-s) ⬆️", "number", True)
-    mmlu = ColumnContent("MMLU (5-s) ⬆️", "number", True)
-    truthfulqa = ColumnContent("TruthfulQA (MC) (0-s) ⬆️", "number", True)
+    revision = ColumnContent("Model sha", "str", False, False)
     dummy = ColumnContent("model_name_for_query", "str", True)  # dummy col to implement search bar (hidden by custom CSS)

 @dataclass(frozen=True)
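
These class attributes feed the `COLS`/`TYPES` lists built in `app.py` via the `fields` helper. A rough, self-contained reconstruction of that mechanism, for readers following the column reordering above (the `ColumnContent` signature and `fields` body here are assumptions, not the real definitions in `src/utils_display.py`):

```python
from dataclasses import dataclass

# Assumed stand-in for ColumnContent: (name, type, displayed_by_default, hidden).
@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool = True
    hidden: bool = False

@dataclass(frozen=True)
class DemoColumns:  # trimmed-down analogue of AutoEvalColumn
    model = ColumnContent("Model", "markdown", True)
    average = ColumnContent("Average ⬆️", "number", True)
    revision = ColumnContent("Model sha", "str", False, False)

def fields(raw_class):
    # Collect the class-level ColumnContent attributes, in definition order.
    return [v for v in raw_class.__dict__.values() if isinstance(v, ColumnContent)]

COLS = [c.name for c in fields(DemoColumns) if not c.hidden]
TYPES = [c.type for c in fields(DemoColumns) if not c.hidden]
print(COLS)   # ['Model', 'Average ⬆️', 'Model sha']
print(TYPES)  # ['markdown', 'number', 'str']
```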