Spaces:
Runtime error
Runtime error
Linker1907
committed on
Commit
•
d16cee2
1
Parent(s):
e868f35
Using the new backend
Browse files- README.md +1 -0
- app.py +52 -27
- src/assets/text_content.py +35 -5
- src/auto_leaderboard/load_results.py +44 -41
- src/init.py +21 -10
- src/utils_display.py +6 -6
README.md
CHANGED
@@ -8,6 +8,7 @@ sdk_version: 3.27.0
|
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
license: apache-2.0
|
|
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
license: apache-2.0
|
11 |
+
duplicated_from: HuggingFaceH4/open_llm_leaderboard
|
12 |
---
|
13 |
|
14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
@@ -15,26 +15,40 @@ from src.assets.text_content import *
|
|
15 |
from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
|
16 |
from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
|
17 |
from src.assets.css_html_js import custom_css, get_window_url_params
|
18 |
-
from src.utils_display import AutoEvalColumn, EvalQueueColumn,
|
19 |
-
from src.init import load_all_info_from_hub
|
20 |
|
21 |
# clone / pull the lmeh eval data
|
22 |
H4_TOKEN = os.environ.get("H4_TOKEN", None)
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
|
25 |
ADD_PLOTS = False
|
26 |
|
27 |
-
EVAL_REQUESTS_PATH = "
|
|
|
28 |
|
29 |
-
|
|
|
30 |
|
|
|
31 |
|
32 |
def restart_space():
|
33 |
api.restart_space(
|
34 |
repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
|
35 |
)
|
36 |
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
40 |
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
|
@@ -60,9 +74,12 @@ def has_nan_values(df, columns):
|
|
60 |
|
61 |
|
62 |
def get_leaderboard_df():
|
63 |
-
if
|
64 |
print("Pulling evaluation results for the leaderboard.")
|
65 |
-
|
|
|
|
|
|
|
66 |
|
67 |
all_data = get_eval_results_dicts(IS_PUBLIC)
|
68 |
|
@@ -84,9 +101,12 @@ def get_leaderboard_df():
|
|
84 |
|
85 |
def get_evaluation_queue_df():
|
86 |
# todo @saylortwift: replace the repo by the one you created for the eval queue
|
87 |
-
if
|
|
|
|
|
|
|
88 |
print("Pulling changes for the evaluation queue.")
|
89 |
-
|
90 |
|
91 |
entries = [
|
92 |
entry
|
@@ -106,7 +126,7 @@ def get_evaluation_queue_df():
|
|
106 |
data["revision"] = data.get("revision", "main")
|
107 |
|
108 |
all_evals.append(data)
|
109 |
-
|
110 |
# this is a folder
|
111 |
sub_entries = [
|
112 |
e
|
@@ -124,10 +144,10 @@ def get_evaluation_queue_df():
|
|
124 |
|
125 |
pending_list = [e for e in all_evals if e["status"] == "PENDING"]
|
126 |
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
|
127 |
-
finished_list = [e for e in all_evals if e["status"]
|
128 |
-
df_pending = pd.DataFrame.from_records(pending_list)
|
129 |
-
df_running = pd.DataFrame.from_records(running_list)
|
130 |
-
df_finished = pd.DataFrame.from_records(finished_list)
|
131 |
return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
|
132 |
|
133 |
|
@@ -149,7 +169,7 @@ def is_model_on_hub(model_name, revision) -> bool:
|
|
149 |
return False, "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard."
|
150 |
|
151 |
except Exception as e:
|
152 |
-
print("Could not get the model config from the hub.:
|
153 |
return False, "was not found on hub!"
|
154 |
|
155 |
|
@@ -200,7 +220,7 @@ def add_new_eval(
|
|
200 |
out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
|
201 |
|
202 |
# Check for duplicate submission
|
203 |
-
if out_path.split("
|
204 |
return styled_warning("This model has been already submitted.")
|
205 |
|
206 |
with open(out_path, "w") as f:
|
@@ -208,13 +228,17 @@ def add_new_eval(
|
|
208 |
|
209 |
api.upload_file(
|
210 |
path_or_fileobj=out_path,
|
211 |
-
path_in_repo=out_path,
|
212 |
-
repo_id=
|
213 |
token=H4_TOKEN,
|
214 |
repo_type="dataset",
|
|
|
215 |
)
|
216 |
|
217 |
-
|
|
|
|
|
|
|
218 |
|
219 |
|
220 |
def refresh():
|
@@ -310,13 +334,6 @@ with demo:
|
|
310 |
)
|
311 |
with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
|
312 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
313 |
-
with gr.Accordion("📙 Citation", open=False):
|
314 |
-
citation_button = gr.Textbox(
|
315 |
-
value=CITATION_BUTTON_TEXT,
|
316 |
-
label=CITATION_BUTTON_LABEL,
|
317 |
-
elem_id="citation-button",
|
318 |
-
).style(show_copy_button=True)
|
319 |
-
|
320 |
|
321 |
with gr.Column():
|
322 |
with gr.Row():
|
@@ -396,6 +413,14 @@ with demo:
|
|
396 |
submission_result,
|
397 |
)
|
398 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
399 |
dummy = gr.Textbox(visible=False)
|
400 |
demo.load(
|
401 |
change_tab,
|
|
|
15 |
from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
|
16 |
from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
|
17 |
from src.assets.css_html_js import custom_css, get_window_url_params
|
18 |
+
from src.utils_display import AutoEvalColumn, EvalQueueColumn, fields, styled_error, styled_warning, styled_message
|
19 |
+
from src.init import get_all_requested_models, load_all_info_from_hub
|
20 |
|
21 |
# clone / pull the lmeh eval data
|
22 |
H4_TOKEN = os.environ.get("H4_TOKEN", None)
|
23 |
+
|
24 |
+
QUEUE_REPO = "open-llm-leaderboard/requests"
|
25 |
+
RESULTS_REPO = "open-llm-leaderboard/results"
|
26 |
+
|
27 |
+
PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
|
28 |
+
PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
|
29 |
+
|
30 |
IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
|
31 |
ADD_PLOTS = False
|
32 |
|
33 |
+
EVAL_REQUESTS_PATH = "eval-queue"
|
34 |
+
EVAL_RESULTS_PATH = "eval-results"
|
35 |
|
36 |
+
EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
|
37 |
+
EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
|
38 |
|
39 |
+
api = HfApi()
|
40 |
|
41 |
def restart_space():
|
42 |
api.restart_space(
|
43 |
repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
|
44 |
)
|
45 |
|
46 |
+
eval_queue, requested_models, eval_results = load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH)
|
47 |
+
|
48 |
+
if not IS_PUBLIC:
|
49 |
+
eval_queue_private, requested_models_private, eval_results_private = load_all_info_from_hub(PRIVATE_QUEUE_REPO, PRIVATE_RESULTS_REPO, EVAL_REQUESTS_PATH_PRIVATE, EVAL_RESULTS_PATH_PRIVATE)
|
50 |
+
else:
|
51 |
+
eval_queue_private, eval_results_private = None, None
|
52 |
|
53 |
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
54 |
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
|
|
|
74 |
|
75 |
|
76 |
def get_leaderboard_df():
|
77 |
+
if eval_results:
|
78 |
print("Pulling evaluation results for the leaderboard.")
|
79 |
+
eval_results.git_pull()
|
80 |
+
if eval_results_private:
|
81 |
+
print("Pulling evaluation results for the leaderboard.")
|
82 |
+
eval_results_private.git_pull()
|
83 |
|
84 |
all_data = get_eval_results_dicts(IS_PUBLIC)
|
85 |
|
|
|
101 |
|
102 |
def get_evaluation_queue_df():
|
103 |
# todo @saylortwift: replace the repo by the one you created for the eval queue
|
104 |
+
if eval_queue:
|
105 |
+
print("Pulling changes for the evaluation queue.")
|
106 |
+
eval_queue.git_pull()
|
107 |
+
if eval_queue_private:
|
108 |
print("Pulling changes for the evaluation queue.")
|
109 |
+
eval_queue_private.git_pull()
|
110 |
|
111 |
entries = [
|
112 |
entry
|
|
|
126 |
data["revision"] = data.get("revision", "main")
|
127 |
|
128 |
all_evals.append(data)
|
129 |
+
elif ".md" not in entry:
|
130 |
# this is a folder
|
131 |
sub_entries = [
|
132 |
e
|
|
|
144 |
|
145 |
pending_list = [e for e in all_evals if e["status"] == "PENDING"]
|
146 |
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
|
147 |
+
finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]
|
148 |
+
df_pending = pd.DataFrame.from_records(pending_list, columns=EVAL_COLS)
|
149 |
+
df_running = pd.DataFrame.from_records(running_list, columns=EVAL_COLS)
|
150 |
+
df_finished = pd.DataFrame.from_records(finished_list, columns=EVAL_COLS)
|
151 |
return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
|
152 |
|
153 |
|
|
|
169 |
return False, "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard."
|
170 |
|
171 |
except Exception as e:
|
172 |
+
print(f"Could not get the model config from the hub.: {e}")
|
173 |
return False, "was not found on hub!"
|
174 |
|
175 |
|
|
|
220 |
out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
|
221 |
|
222 |
# Check for duplicate submission
|
223 |
+
if out_path.split("eval-queue/")[1].lower() in requested_models:
|
224 |
return styled_warning("This model has been already submitted.")
|
225 |
|
226 |
with open(out_path, "w") as f:
|
|
|
228 |
|
229 |
api.upload_file(
|
230 |
path_or_fileobj=out_path,
|
231 |
+
path_in_repo=out_path.split("eval-queue/")[1],
|
232 |
+
repo_id=QUEUE_REPO,
|
233 |
token=H4_TOKEN,
|
234 |
repo_type="dataset",
|
235 |
+
commit_message=f"Add {model} to eval queue",
|
236 |
)
|
237 |
|
238 |
+
# remove the local file
|
239 |
+
os.remove(out_path)
|
240 |
+
|
241 |
+
return styled_message("Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list.")
|
242 |
|
243 |
|
244 |
def refresh():
|
|
|
334 |
)
|
335 |
with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
|
336 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
337 |
|
338 |
with gr.Column():
|
339 |
with gr.Row():
|
|
|
413 |
submission_result,
|
414 |
)
|
415 |
|
416 |
+
with gr.Row():
|
417 |
+
with gr.Accordion("📙 Citation", open=False):
|
418 |
+
citation_button = gr.Textbox(
|
419 |
+
value=CITATION_BUTTON_TEXT,
|
420 |
+
label=CITATION_BUTTON_LABEL,
|
421 |
+
elem_id="citation-button",
|
422 |
+
).style(show_copy_button=True)
|
423 |
+
|
424 |
dummy = gr.Textbox(visible=False)
|
425 |
demo.load(
|
426 |
change_tab,
|
src/assets/text_content.py
CHANGED
@@ -61,7 +61,7 @@ INTRODUCTION_TEXT = f"""
|
|
61 |
|
62 |
🤗 Anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as LLaMa.
|
63 |
|
64 |
-
Other cool benchmarks for LLMs are developped at HuggingFace, go check them out: 🙋🤖 [human and GPT4 evals](https://huggingface.co/spaces/HuggingFaceH4/human_eval_llm_leaderboard), 🖥️ [performance
|
65 |
"""
|
66 |
|
67 |
LLM_BENCHMARKS_TEXT = f"""
|
@@ -78,6 +78,29 @@ With the plethora of large language models (LLMs) and chatbots being released we
|
|
78 |
We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
|
79 |
|
80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
# Reproduction
|
82 |
To reproduce our results, here are the commands you can run, using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/e47e01beea79cfe87421e2dac49e64d499c240b4) of the Eleuther AI Harness:
|
83 |
`python main.py --model=hf-causal --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
|
@@ -87,10 +110,17 @@ The total batch size we get for models which fit on one A100 node is 16 (8 GPUs
|
|
87 |
*You can expect results to vary slightly for different batch sizes because of padding.*
|
88 |
|
89 |
The tasks and few shots parameters are:
|
90 |
-
- ARC: 25-shot, *arc-challenge*
|
91 |
-
- HellaSwag: 10-shot, *hellaswag*
|
92 |
-
- TruthfulQA: 0-shot, *truthfulqa-mc* (mc2
|
93 |
-
- MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
"""
|
95 |
|
96 |
EVALUATION_QUEUE_TEXT = f"""
|
|
|
61 |
|
62 |
🤗 Anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as LLaMa.
|
63 |
|
64 |
+
Other cool benchmarks for LLMs are developed at HuggingFace, go check them out: 🙋🤖 [human and GPT4 evals](https://huggingface.co/spaces/HuggingFaceH4/human_eval_llm_leaderboard), 🖥️ [performance benchmarks](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)
|
65 |
"""
|
66 |
|
67 |
LLM_BENCHMARKS_TEXT = f"""
|
|
|
78 |
We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
|
79 |
|
80 |
|
81 |
+
# Some good practices before submitting a model
|
82 |
+
|
83 |
+
## 1) Make sure you can load your model and tokenizer using AutoClasses:
|
84 |
+
```python
|
85 |
+
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
86 |
+
config = AutoConfig.from_pretrained("your model name", revision=revision)
|
87 |
+
model = AutoModel.from_pretrained("your model name", revision=revision)
|
88 |
+
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
|
89 |
+
```
|
90 |
+
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
|
91 |
+
|
92 |
+
Note: make sure your model is public!
|
93 |
+
Note: if your model needs `trust_remote_code=True`, we do not support this option yet, but we are working on adding it, stay posted!
|
94 |
+
|
95 |
+
## 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
|
96 |
+
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of weights of your model to the `Extended Viewer`!
|
97 |
+
|
98 |
+
## 3) Make sure your model has an open license!
|
99 |
+
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
|
100 |
+
|
101 |
+
## 4) Fill out your model card
|
102 |
+
When we add extra information about models to the leaderboard, it will be automatically taken from the model card
|
103 |
+
|
104 |
# Reproduction
|
105 |
To reproduce our results, here are the commands you can run, using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/e47e01beea79cfe87421e2dac49e64d499c240b4) of the Eleuther AI Harness:
|
106 |
`python main.py --model=hf-causal --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
|
|
|
110 |
*You can expect results to vary slightly for different batch sizes because of padding.*
|
111 |
|
112 |
The tasks and few shots parameters are:
|
113 |
+
- ARC: 25-shot, *arc-challenge* (`acc_norm`)
|
114 |
+
- HellaSwag: 10-shot, *hellaswag* (`acc_norm`)
|
115 |
+
- TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
|
116 |
+
- MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (`acc` of `all`)
|
117 |
+
|
118 |
+
# In case of model failure
|
119 |
+
If your model is displayed in the `FAILED` category, its execution stopped.
|
120 |
+
Make sure you have followed the above steps first.
|
121 |
+
If everything is done, check that you can launch the Eleuther AI Harness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
122 |
+
|
123 |
+
|
124 |
"""
|
125 |
|
126 |
EVALUATION_QUEUE_TEXT = f"""
|
src/auto_leaderboard/load_results.py
CHANGED
@@ -7,14 +7,13 @@ from typing import Dict, List, Tuple
|
|
7 |
from src.utils_display import AutoEvalColumn, make_clickable_model
|
8 |
import numpy as np
|
9 |
|
10 |
-
|
11 |
-
|
12 |
-
BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
|
13 |
BENCH_TO_NAME = {
|
14 |
-
"
|
15 |
"hellaswag": AutoEvalColumn.hellaswag.name,
|
16 |
-
"
|
17 |
-
"
|
18 |
}
|
19 |
|
20 |
|
@@ -24,8 +23,8 @@ class EvalResult:
|
|
24 |
org: str
|
25 |
model: str
|
26 |
revision: str
|
27 |
-
is_8bit: bool
|
28 |
results: dict
|
|
|
29 |
|
30 |
def to_dict(self):
|
31 |
if self.org is not None:
|
@@ -44,7 +43,7 @@ class EvalResult:
|
|
44 |
)
|
45 |
|
46 |
for benchmark in BENCHMARKS:
|
47 |
-
if not
|
48 |
self.results[benchmark] = None
|
49 |
|
50 |
for k, v in BENCH_TO_NAME.items():
|
@@ -53,57 +52,61 @@ class EvalResult:
|
|
53 |
return data_dict
|
54 |
|
55 |
|
56 |
-
def parse_eval_result(json_filepath: str) -> Tuple[str, dict]:
|
57 |
with open(json_filepath) as fp:
|
58 |
data = json.load(fp)
|
59 |
|
60 |
-
|
61 |
-
|
62 |
-
model
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
else:
|
69 |
-
org =
|
70 |
-
|
|
|
71 |
|
72 |
-
|
73 |
for benchmark, metric in zip(BENCHMARKS, METRICS):
|
74 |
-
if benchmark in
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
|
|
80 |
|
81 |
-
return result_key,
|
82 |
|
83 |
|
84 |
def get_eval_results(is_public) -> List[EvalResult]:
|
85 |
json_filepaths = glob.glob(
|
86 |
-
"
|
87 |
)
|
88 |
if not is_public:
|
89 |
json_filepaths += glob.glob(
|
90 |
-
"
|
91 |
-
)
|
92 |
-
json_filepaths += glob.glob(
|
93 |
-
"auto_evals/eval_results/private/**/*.json", recursive=True
|
94 |
)
|
95 |
-
|
96 |
-
json_filepaths += glob.glob(
|
97 |
-
"auto_evals/eval_results/public/**/8bit/*.json", recursive=True
|
98 |
-
)
|
99 |
eval_results = {}
|
100 |
|
101 |
for json_filepath in json_filepaths:
|
102 |
-
result_key,
|
103 |
-
|
104 |
-
eval_results
|
105 |
-
|
106 |
-
|
|
|
107 |
|
108 |
eval_results = [v for v in eval_results.values()]
|
109 |
|
|
|
7 |
from src.utils_display import AutoEvalColumn, make_clickable_model
|
8 |
import numpy as np
|
9 |
|
10 |
+
METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
|
11 |
+
BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
|
|
|
12 |
BENCH_TO_NAME = {
|
13 |
+
"arc:challenge": AutoEvalColumn.arc.name,
|
14 |
"hellaswag": AutoEvalColumn.hellaswag.name,
|
15 |
+
"hendrycksTest": AutoEvalColumn.mmlu.name,
|
16 |
+
"truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
|
17 |
}
|
18 |
|
19 |
|
|
|
23 |
org: str
|
24 |
model: str
|
25 |
revision: str
|
|
|
26 |
results: dict
|
27 |
+
is_8bit: bool = False
|
28 |
|
29 |
def to_dict(self):
|
30 |
if self.org is not None:
|
|
|
43 |
)
|
44 |
|
45 |
for benchmark in BENCHMARKS:
|
46 |
+
if benchmark not in self.results.keys():
|
47 |
self.results[benchmark] = None
|
48 |
|
49 |
for k, v in BENCH_TO_NAME.items():
|
|
|
52 |
return data_dict
|
53 |
|
54 |
|
55 |
+
def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
|
56 |
with open(json_filepath) as fp:
|
57 |
data = json.load(fp)
|
58 |
|
59 |
+
config = data["config"]
|
60 |
+
model = config.get("model_name", None)
|
61 |
+
if model is None:
|
62 |
+
model = config.get("model_args", None)
|
63 |
+
|
64 |
+
model_sha = config.get("model_sha", "")
|
65 |
+
eval_sha = config.get("lighteval_sha", "")
|
66 |
+
model_split = model.split("/", 1)
|
67 |
+
|
68 |
+
model = model_split[-1]
|
69 |
+
|
70 |
+
if len(model_split) == 1:
|
71 |
+
org = None
|
72 |
+
model = model_split[0]
|
73 |
+
result_key = f"{model}_{model_sha}_{eval_sha}"
|
74 |
else:
|
75 |
+
org = model_split[0]
|
76 |
+
model = model_split[1]
|
77 |
+
result_key = f"{org}_{model}_{model_sha}_{eval_sha}"
|
78 |
|
79 |
+
eval_results = []
|
80 |
for benchmark, metric in zip(BENCHMARKS, METRICS):
|
81 |
+
accs = np.array([v[metric] for k, v in data["results"].items() if benchmark in k])
|
82 |
+
if accs.size == 0:
|
83 |
+
continue
|
84 |
+
mean_acc = round(np.mean(accs) * 100.0, 1)
|
85 |
+
eval_results.append(EvalResult(
|
86 |
+
result_key, org, model, model_sha, {benchmark: mean_acc}
|
87 |
+
))
|
88 |
|
89 |
+
return result_key, eval_results
|
90 |
|
91 |
|
92 |
def get_eval_results(is_public) -> List[EvalResult]:
|
93 |
json_filepaths = glob.glob(
|
94 |
+
"eval-results/**/results*.json", recursive=True
|
95 |
)
|
96 |
if not is_public:
|
97 |
json_filepaths += glob.glob(
|
98 |
+
"private-eval-results/**/results*.json", recursive=True
|
|
|
|
|
|
|
99 |
)
|
100 |
+
|
|
|
|
|
|
|
101 |
eval_results = {}
|
102 |
|
103 |
for json_filepath in json_filepaths:
|
104 |
+
result_key, results = parse_eval_result(json_filepath)
|
105 |
+
for eval_result in results:
|
106 |
+
if result_key in eval_results.keys():
|
107 |
+
eval_results[result_key].results.update(eval_result.results)
|
108 |
+
else:
|
109 |
+
eval_results[result_key] = eval_result
|
110 |
|
111 |
eval_results = [v for v in eval_results.values()]
|
112 |
|
src/init.py
CHANGED
@@ -13,26 +13,37 @@ def get_all_requested_models(requested_models_dir):
|
|
13 |
if current_depth == depth:
|
14 |
file_names.extend([os.path.join(root, file) for file in files])
|
15 |
|
16 |
-
return set([file_name.lower().split("
|
17 |
|
18 |
-
def load_all_info_from_hub(
|
19 |
-
|
|
|
20 |
requested_models = None
|
|
|
21 |
if H4_TOKEN:
|
22 |
print("Pulling evaluation requests and results.")
|
23 |
|
24 |
-
|
25 |
-
local_dir=
|
26 |
-
clone_from=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
use_auth_token=H4_TOKEN,
|
28 |
repo_type="dataset",
|
29 |
)
|
30 |
-
|
31 |
|
32 |
-
|
33 |
-
|
|
|
34 |
|
35 |
-
return
|
36 |
|
37 |
|
38 |
#def load_results(model, benchmark, metric):
|
|
|
13 |
if current_depth == depth:
|
14 |
file_names.extend([os.path.join(root, file) for file in files])
|
15 |
|
16 |
+
return set([file_name.lower().split("eval-queue/")[1] for file_name in file_names])
|
17 |
|
18 |
+
def load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, QUEUE_PATH, RESULTS_PATH):
|
19 |
+
eval_queue_repo = None
|
20 |
+
eval_results_repo = None
|
21 |
requested_models = None
|
22 |
+
|
23 |
if H4_TOKEN:
|
24 |
print("Pulling evaluation requests and results.")
|
25 |
|
26 |
+
eval_queue_repo = Repository(
|
27 |
+
local_dir=QUEUE_PATH,
|
28 |
+
clone_from=QUEUE_REPO,
|
29 |
+
use_auth_token=H4_TOKEN,
|
30 |
+
repo_type="dataset",
|
31 |
+
)
|
32 |
+
eval_queue_repo.git_pull()
|
33 |
+
|
34 |
+
eval_results_repo = Repository(
|
35 |
+
local_dir=RESULTS_PATH,
|
36 |
+
clone_from=RESULTS_REPO,
|
37 |
use_auth_token=H4_TOKEN,
|
38 |
repo_type="dataset",
|
39 |
)
|
40 |
+
eval_results_repo.git_pull()
|
41 |
|
42 |
+
requested_models = get_all_requested_models("eval-queue")
|
43 |
+
else:
|
44 |
+
print("No HuggingFace token provided. Skipping evaluation requests and results.")
|
45 |
|
46 |
+
return eval_queue_repo, requested_models, eval_results_repo
|
47 |
|
48 |
|
49 |
#def load_results(model, benchmark, metric):
|
src/utils_display.py
CHANGED
@@ -15,17 +15,17 @@ def fields(raw_class):
|
|
15 |
@dataclass(frozen=True)
|
16 |
class AutoEvalColumn: # Auto evals column
|
17 |
model = ColumnContent("Model", "markdown", True)
|
18 |
-
|
|
|
|
|
|
|
|
|
19 |
model_type = ColumnContent("Type", "bool", False)
|
20 |
is_8bit = ColumnContent("8bit", "bool", False, True)
|
21 |
license = ColumnContent("Hub License", "str", False)
|
22 |
params = ColumnContent("#Params (B)", "number", False)
|
23 |
likes = ColumnContent("Hub ❤️", "number", False)
|
24 |
-
|
25 |
-
arc = ColumnContent("ARC (25-s) ⬆️", "number", True)
|
26 |
-
hellaswag = ColumnContent("HellaSwag (10-s) ⬆️", "number", True)
|
27 |
-
mmlu = ColumnContent("MMLU (5-s) ⬆️", "number", True)
|
28 |
-
truthfulqa = ColumnContent("TruthfulQA (MC) (0-s) ⬆️", "number", True)
|
29 |
dummy = ColumnContent("model_name_for_query", "str", True) # dummy col to implement search bar (hidden by custom CSS)
|
30 |
|
31 |
@dataclass(frozen=True)
|
|
|
15 |
@dataclass(frozen=True)
|
16 |
class AutoEvalColumn: # Auto evals column
|
17 |
model = ColumnContent("Model", "markdown", True)
|
18 |
+
average = ColumnContent("Average ⬆️", "number", True)
|
19 |
+
arc = ColumnContent("ARC ⬆️", "number", True)
|
20 |
+
hellaswag = ColumnContent("HellaSwag ⬆️", "number", True)
|
21 |
+
mmlu = ColumnContent("MMLU ⬆️", "number", True)
|
22 |
+
truthfulqa = ColumnContent("TruthfulQA (MC) ⬆️", "number", True)
|
23 |
model_type = ColumnContent("Type", "bool", False)
|
24 |
is_8bit = ColumnContent("8bit", "bool", False, True)
|
25 |
license = ColumnContent("Hub License", "str", False)
|
26 |
params = ColumnContent("#Params (B)", "number", False)
|
27 |
likes = ColumnContent("Hub ❤️", "number", False)
|
28 |
+
revision = ColumnContent("Model sha", "str", False, False)
|
|
|
|
|
|
|
|
|
29 |
dummy = ColumnContent("model_name_for_query", "str", True) # dummy col to implement search bar (hidden by custom CSS)
|
30 |
|
31 |
@dataclass(frozen=True)
|