bugfix
- app.py +2 -2
- src/about.py +8 -3
- src/leaderboard/read_evals.py +16 -7
- src/populate.py +34 -34
app.py CHANGED
@@ -27,7 +27,7 @@ from src.display.utils import (
     Precision,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.populate import get_leaderboard_df
 from src.submission.submit import add_new_eval
 
 
@@ -49,7 +49,7 @@ def restart_space():
 # restart_space()
 
 try:
-    print(EVAL_RESULTS_PATH)
+    print("Saving results locally at:", EVAL_RESULTS_PATH)
     snapshot_download(
         repo_id=RESULTS_REPO,
         local_dir=EVAL_RESULTS_PATH,
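For context, the try block touched above guards the initial sync of the results dataset into the local cache. Below is a minimal, self-contained sketch of that pattern; the repo id, local path, token handling, and the repo_type/fallback behaviour are placeholders for illustration, not this Space's exact app.py.

# Sketch of the guarded results sync. RESULTS_REPO, EVAL_RESULTS_PATH and TOKEN
# are placeholders standing in for the values imported from src.envs.
import os

from huggingface_hub import snapshot_download

RESULTS_REPO = "my-org/leaderboard-results"      # placeholder dataset repo id
EVAL_RESULTS_PATH = "./eval-results"             # placeholder local directory
TOKEN = os.environ.get("HF_TOKEN")               # placeholder token source


def sync_results() -> None:
    print("Saving results locally at:", EVAL_RESULTS_PATH)
    try:
        snapshot_download(
            repo_id=RESULTS_REPO,
            local_dir=EVAL_RESULTS_PATH,
            repo_type="dataset",                 # assumption: results live in a dataset repo
            token=TOKEN,
        )
    except Exception as err:
        # The real app restarts the Space on failure; logging keeps the sketch standalone.
        print("Could not download results snapshot:", err)


if __name__ == "__main__":
    sync_results()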
src/about.py CHANGED
@@ -15,18 +15,23 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("arc_challenge_ita", "acc_norm,none", "ARC-C")
     task1 = Task("ami_2020_aggressiveness", "f1,none", "AMI 2020 Agg")
     task2 = Task("ami_2020_misogyny", "f1,none", "AMI 2020 Miso")
-
+    task0 = Task("arc_challenge_ita", "acc_norm,none", "ARC-C")
     task4 = Task("belebele_ita", "acc_norm,none", "Belebele")
+    task3 = Task("gente_rephrasing", "acc,none", "GeNTE Neutralizing")
+    task12 = Task("haspeede2_hs", "f1,none", "HaSpeeDe2 HS")
+    task13 = Task("haspeede2_stereo", "f1,none", "HaSpeeDe2 Stereo")
     task5 = Task("hatecheck_ita", "f1,none", "HateCheck")
     task6 = Task("honest_ita", "acc,none", "HONEST", higher_is_better=False)
+    task14 = Task("ironita_irony", "f1,none", "IronITA Irony")
+    task15 = Task("ironita_sarcasm", "f1,none", "IronITA Sarcasm")
     task7 = Task("itacola", "mcc,none", "ItaCoLA", scale_by_100=False)
     task8 = Task("news_sum", "bertscore,none", "News Sum")
+    task16 = Task("sentipolc", "f1,none", "SENTIPOLC")
     task9 = Task("squad_it", "squad_f1,get-answer", "SQuAD it")
     task10 = Task("truthfulqa_mc2_ita", "acc,none", "TruthfulQA")
-    task11 = Task("xcopa_it", "acc,none", "
+    task11 = Task("xcopa_it", "acc,none", "XCOPA")
 
 
 NUM_FEWSHOT = 0 # Change with your few shot
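For readers unfamiliar with the template, each Tasks member above wraps a small Task dataclass defined earlier in src/about.py. The sketch below is inferred from the constructor calls visible in this diff (task key, metric key, display name, plus the optional higher_is_better and scale_by_100 flags); the field names and defaults are assumptions, not a verbatim copy of the file.

# Hypothetical reconstruction of the Task container used by the Tasks enum.
# Field names and defaults are assumptions inferred from the calls in the diff.
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str                   # task key in the results JSON, e.g. "arc_challenge_ita"
    metric: str                      # metric key in the results JSON, e.g. "acc_norm,none"
    col_name: str                    # column shown in the leaderboard, e.g. "ARC-C"
    higher_is_better: bool = True    # HONEST passes False: lower scores are better there
    scale_by_100: bool = True        # ItaCoLA passes False: its MCC is reported unscaled


class Tasks(Enum):
    # A few entries from the diff, enough to exercise both optional flags.
    task1 = Task("ami_2020_aggressiveness", "f1,none", "AMI 2020 Agg")
    task6 = Task("honest_ita", "acc,none", "HONEST", higher_is_better=False)
    task7 = Task("itacola", "mcc,none", "ItaCoLA", scale_by_100=False)


for t in Tasks:
    print(f"{t.value.col_name}: metric={t.value.metric}, higher_is_better={t.value.higher_is_better}")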
src/leaderboard/read_evals.py CHANGED
@@ -11,6 +11,8 @@ from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, DisclosedType
 from src.submission.check_validity import is_model_on_hub
 
+import pdb
+
 
 @dataclass
 class EvalResult:
@@ -80,6 +82,8 @@ class EvalResult:
                 architecture = ";".join(architectures)
 
         # Extract results available in this file (some results are split in several files)
+
+        # pdb.set_trace()
         results = {}
         for task in Tasks:
             task = task.value
@@ -102,6 +106,8 @@ class EvalResult:
 
             results[task.benchmark] = mean_acc
 
+        # pdb.set_trace()
+
         return self(
             eval_name=result_key,
             full_model=full_model,
@@ -204,7 +210,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
+
+        # eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
@@ -213,12 +220,14 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
         else:
             eval_results[eval_name] = eval_result
 
-    results = []
-    for v in eval_results.values():
+    results_for_table = list()
+    for k, v in eval_results.items():
         try:
             v.to_dict() # we test if the dict version is complete
-            results.append(v)
-        except KeyError: # not all eval values present
-            continue
+            results_for_table.append(v)
+        except RuntimeError as e: # not all eval values present
+            print(f"Issue with results of: ", k)
+            raise e
+            # continue
 
-    return results
+    return results_for_table
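The last hunk replaces the template's silent skip of incomplete results with a loud failure: an eval whose dict conversion fails now prints the offending key and re-raises. A self-contained sketch of that collect-or-raise pattern follows; DummyResult is a hypothetical stand-in for EvalResult, not the real class.

# Sketch of the stricter aggregation loop introduced above. DummyResult is a
# stand-in for EvalResult; only the collect-or-raise flow matters here.
class DummyResult:
    def __init__(self, name: str, scores: dict):
        self.name = name
        self.scores = scores

    def to_dict(self) -> dict:
        if not self.scores:
            # Mirrors an EvalResult whose to_dict() fails because metrics are missing.
            raise RuntimeError("not all eval values present")
        return {"eval_name": self.name, **self.scores}


eval_results = {
    "model-a": DummyResult("model-a", {"ARC-C": 41.2}),
    "model-b": DummyResult("model-b", {}),   # incomplete on purpose
}

results_for_table = list()
try:
    for k, v in eval_results.items():
        try:
            v.to_dict()                      # test if the dict version is complete
            results_for_table.append(v)
        except RuntimeError as e:            # not all eval values present
            print("Issue with results of:", k)
            raise e
except RuntimeError:
    print("Aborted: fix or remove the incomplete result before building the table.")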
src/populate.py CHANGED
@@ -22,37 +22,37 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     return raw_data, df
 
 
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues requestes"""
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
+# def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
+#     """Creates the different dataframes for the evaluation queues requestes"""
+#     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
+#     all_evals = []
+
+#     for entry in entries:
+#         if ".json" in entry:
+#             file_path = os.path.join(save_path, entry)
+#             with open(file_path) as fp:
+#                 data = json.load(fp)
+
+#             data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+#             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+
+#             all_evals.append(data)
+#         elif ".md" not in entry:
+#             # this is a folder
+#             sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
+#             for sub_entry in sub_entries:
+#                 file_path = os.path.join(save_path, entry, sub_entry)
+#                 with open(file_path) as fp:
+#                     data = json.load(fp)
+
+#                 data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+#                 data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+#                 all_evals.append(data)
+
+#     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
+#     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+#     finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
+#     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+#     df_running = pd.DataFrame.from_records(running_list, columns=cols)
+#     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
+#     return df_finished[cols], df_running[cols], df_pending[cols]