eduagarcia committed
Commit 8aaf0e7 • 1 Parent(s): 0cc3edb

Add env variable SHOW_INCOMPLETE_EVALS and order evaluation queue by priority

app.py CHANGED
@@ -30,7 +30,19 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import API, EVAL_REQUESTS_PATH, DYNAMIC_INFO_REPO, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
+from src.envs import (
+    API,
+    EVAL_REQUESTS_PATH,
+    DYNAMIC_INFO_REPO,
+    DYNAMIC_INFO_FILE_PATH,
+    DYNAMIC_INFO_PATH,
+    EVAL_RESULTS_PATH,
+    H4_TOKEN, IS_PUBLIC,
+    QUEUE_REPO,
+    REPO_ID,
+    RESULTS_REPO,
+    SHOW_INCOMPLETE_EVALS
+)
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 from src.scripts.update_all_request_files import update_dynamic_files
@@ -81,7 +93,8 @@ def init_space():
         requests_path=EVAL_REQUESTS_PATH,
         dynamic_path=DYNAMIC_INFO_FILE_PATH,
         cols=COLS,
-        benchmark_cols=BENCHMARK_COLS
+        benchmark_cols=BENCHMARK_COLS,
+        show_incomplete=SHOW_INCOMPLETE_EVALS
     )
     update_collections(original_df.copy())
     leaderboard_df = original_df.copy()
@@ -93,7 +106,7 @@ def init_space():
         running_eval_queue_df,
         pending_eval_queue_df,
         failed_eval_queue_df
-    ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+    ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS, show_incomplete=SHOW_INCOMPLETE_EVALS)
 
     return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, failed_eval_queue_df
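For readers skimming the diff: nothing else in `init_space` changes; the new `show_incomplete` keyword arguments simply thread the single `SHOW_INCOMPLETE_EVALS` flag from `src/envs.py` into both dataframe builders. A minimal standalone sketch of that wiring (the builder functions below are hypothetical stand-ins, not the Space's real `get_leaderboard_df` / `get_evaluation_queue_df`):

```python
import os

# Hypothetical simplification: the real flag is parsed by the Space's own
# get_config/str2bool helpers in src/envs.py.
SHOW_INCOMPLETE_EVALS = os.environ.get("SHOW_INCOMPLETE_EVALS", "false").lower() in ("1", "true", "yes")

def build_leaderboard(show_incomplete: bool) -> list[str]:
    # stand-in for get_leaderboard_df: keep rows missing benchmarks only if the flag is on
    rows = [("model-a", True), ("model-b", False)]  # (name, has_all_benchmark_scores)
    return [name for name, complete in rows if complete or show_incomplete]

def build_queue(show_incomplete: bool) -> list[str]:
    # stand-in for get_evaluation_queue_df: widen the "finished" bucket when the flag is on
    finished = {"FINISHED", "PENDING_NEW_EVAL"} if show_incomplete else {"FINISHED"}
    return [s for s in ["FINISHED", "PENDING_NEW_EVAL", "RUNNING"] if s in finished]

def init_space():
    # same shape as the diff above: one flag, forwarded to both builders
    return build_leaderboard(SHOW_INCOMPLETE_EVALS), build_queue(SHOW_INCOMPLETE_EVALS)

print(init_space())
```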
src/display/changelog.py CHANGED
@@ -2,7 +2,7 @@ CHANGELOG_TEXT = f"""
 # Changes made to the leaderboard
 
 ### [1.1.0] - 2024-02-16
-Removed the Sparrow POR benchmark from the leaderboard because of low quality annotations
+Removed the Sparrow POR benchmark from the leaderboard because of low quality annotations
 Added HateBR Offensive, PT Hate Speech and tweetSentBR benchmarks to the leaderboard, started new evaluation queue for these benchmarks
 
 ### [1.0.0] - 2024-02-01
src/envs.py CHANGED
@@ -65,4 +65,6 @@ GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS = str2bool(get_config("GET_ORIGINAL_HF_
 ORIGINAL_HF_LEADERBOARD_RESULTS_REPO = get_config("ORIGINAL_HF_LEADERBOARD_RESULTS_REPO", "open-llm-leaderboard/results")
 ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, 'original_results')
 
+SHOW_INCOMPLETE_EVALS = str2bool(get_config("SHOW_INCOMPLETE_EVALS", False))
+
 API = HfApi(token=H4_TOKEN)
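The new setting reuses the same `str2bool(get_config(...))` pattern as the existing flags in `src/envs.py`. Roughly, that parsing amounts to the following sketch (simplified, hypothetical implementations; the repository's real helpers also consult the task config, so details may differ):

```python
import os

def get_config(name: str, default):
    # simplified: only the environment is checked here; the real helper can
    # also fall back to values from tasks_config/*.yaml
    return os.environ.get(name, default)

def str2bool(value) -> bool:
    # accept the usual truthy spellings; anything else is False
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() in ("1", "true", "yes", "y", "on")

SHOW_INCOMPLETE_EVALS = str2bool(get_config("SHOW_INCOMPLETE_EVALS", False))
print(SHOW_INCOMPLETE_EVALS)  # False unless the variable is set to a truthy value
```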
src/leaderboard/read_evals.py CHANGED
@@ -12,7 +12,7 @@ from huggingface_hub import ModelCard
 
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, ORIGINAL_TASKS
-from src.envs import GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS
+from src.envs import GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS, SHOW_INCOMPLETE_EVALS
 
 @dataclass
 class EvalResult:
@@ -216,7 +216,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
             if (
-                req_content["status"] in ["FINISHED", "PENDING_NEW_EVAL"]
+                req_content["status"] in ["FINISHED", "PENDING_NEW_EVAL" if SHOW_INCOMPLETE_EVALS else "FINISHED"]
                 and req_content["precision"] == precision.split(".")[-1]
             ):
                 request_file = tmp_request_file
@@ -262,7 +262,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
     results = []
     for v in eval_results.values():
         try:
-            if v.status in ["FINISHED", "PENDING_NEW_EVAL"] and not v.hidden:
+            if v.status in ["FINISHED", "PENDING_NEW_EVAL" if SHOW_INCOMPLETE_EVALS else "FINISHED"] and not v.hidden:
                 v.to_dict() # we test if the dict version is complete
                 results.append(v)
         except KeyError as e: # not all eval values present
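The status filter uses a conditional expression inside the list literal: with `SHOW_INCOMPLETE_EVALS` off, the list degenerates to `["FINISHED", "FINISHED"]`, so the membership test only admits finished evals; with it on, `PENDING_NEW_EVAL` requests pass as well. A tiny standalone check of that behaviour:

```python
def allowed_statuses(show_incomplete: bool) -> list[str]:
    # same expression as in the diff above, with the flag as a parameter
    return ["FINISHED", "PENDING_NEW_EVAL" if show_incomplete else "FINISHED"]

assert "PENDING_NEW_EVAL" in allowed_statuses(True)
assert "PENDING_NEW_EVAL" not in allowed_statuses(False)
assert "FINISHED" in allowed_statuses(False)
```

An equivalent and arguably more explicit spelling would be `["FINISHED"] + (["PENDING_NEW_EVAL"] if show_incomplete else [])`, which avoids the duplicate entry.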
src/populate.py CHANGED
@@ -1,5 +1,6 @@
 import json
 import os
+import copy
 
 import pandas as pd
 
@@ -9,7 +10,7 @@ from src.leaderboard.filter_models import filter_models_flags
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list, show_incomplete=False) -> pd.DataFrame:
     raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
     all_data_json = [v.to_dict() for v in raw_data]
     all_data_json.append(baseline_row)
@@ -21,11 +22,12 @@ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str,
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
-    #df = df[has_no_nan_values(df, benchmark_cols)]
+    if not show_incomplete:
+        df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df
 
 
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
+def get_evaluation_queue_df(save_path: str, cols: list, show_incomplete=False) -> list[pd.DataFrame]:
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
     all_evals = []
 
@@ -51,12 +53,23 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
             all_evals.append(data)
 
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
+    cols_pending = copy.deepcopy(cols)
+    cols_pending.append('source')
+    cols_pending.append('submitted_time')
+
+    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN", "PENDING_NEW_EVAL"]]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
+    finished_list = [e for e in all_evals if e["status"] in ["FINISHED", "PENDING_NEW_EVAL" if show_incomplete else "FINISHED"]]
     failed_list = [e for e in all_evals if e["status"] == "FAILED"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+    df_pending = pd.DataFrame.from_records(pending_list, columns=cols_pending)
     df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
     df_failed = pd.DataFrame.from_records(failed_list, columns=cols)
+
+    df_pending['source_priority'] = df_pending["source"].apply(lambda x: {"manual": 0, "leaderboard": 1, "script": 2}.get(x, 3))
+    df_pending['status_priority'] = df_pending["status"].apply(lambda x: {"PENDING": 2, "RERUN": 0, "PENDING_NEW_EVAL": 1}.get(x, 3))
+
+    df_pending = df_pending.sort_values(['source_priority', 'status_priority', 'submitted_time'])
+    df_pending = df_pending.drop(['source_priority', 'status_priority', 'submitted_time', 'source'], axis=1)
+
     return df_finished[cols], df_running[cols], df_pending[cols], df_failed[cols]
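This is the second half of the commit: the pending queue is now sorted by submission source (manual before leaderboard before script), then by status (RERUN before PENDING_NEW_EVAL before PENDING), then by submission time, with the helper columns dropped before display. A self-contained pandas example of that ordering, using made-up rows but the same priority maps as the diff:

```python
import pandas as pd

pending = pd.DataFrame([
    {"model": "m1", "status": "PENDING",          "source": "script",      "submitted_time": "2024-02-10T10:00:00Z"},
    {"model": "m2", "status": "RERUN",            "source": "leaderboard", "submitted_time": "2024-02-12T09:00:00Z"},
    {"model": "m3", "status": "PENDING",          "source": "manual",      "submitted_time": "2024-02-11T08:00:00Z"},
    {"model": "m4", "status": "PENDING_NEW_EVAL", "source": "manual",      "submitted_time": "2024-02-09T07:00:00Z"},
])

# Same priority maps as the commit; unknown values sort last (priority 3).
pending["source_priority"] = pending["source"].apply(lambda x: {"manual": 0, "leaderboard": 1, "script": 2}.get(x, 3))
pending["status_priority"] = pending["status"].apply(lambda x: {"PENDING": 2, "RERUN": 0, "PENDING_NEW_EVAL": 1}.get(x, 3))

pending = pending.sort_values(["source_priority", "status_priority", "submitted_time"])
pending = pending.drop(["source_priority", "status_priority", "submitted_time", "source"], axis=1)

print(pending["model"].tolist())  # ['m4', 'm3', 'm2', 'm1']
```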
tasks_config/pt_config.yaml CHANGED
@@ -10,6 +10,7 @@ config:
   LEADERBOARD_NAME: "Open PT-LLM Leaderboard"
   GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS: true
   TRUST_REMOTE_CODE: true
+  SHOW_INCOMPLETE_EVALS: false
 readme:
   general_description: |
     📝 The 🚀 Open PT LLM Leaderboard aims to provide a benchmark for the evaluation of