eduagarcia committed
Commit 71ecfbb
1 Parent(s): 88c4c5f

Feature: FIELD with original HF Leaderboard ranking

.gitignore CHANGED
@@ -7,6 +7,7 @@ __pycache__/
 run_dot_env.sh
 hub/
 modules/
+original_results/
 
 eval-queue/
 eval-results/

src/display/utils.py CHANGED
@@ -2,6 +2,7 @@ from dataclasses import dataclass, make_dataclass
 from enum import Enum
 from typing import List
 import pandas as pd
+from src.envs import GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -112,7 +113,8 @@ auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool",
 auto_eval_column_dict.append(["eval_time", ColumnContent, ColumnContent("Evaluation Time (s)", "number", False)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("Model Name", "str", False, dummy=True)])
-
+if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+    auto_eval_column_dict.append(["original_benchmark_average", ColumnContent, ColumnContent("🤗 Leaderboard Average", "number", False)])
 
 
 # We use make dataclass to dynamically fill the scores from Tasks
@@ -160,6 +162,8 @@ for task in Tasks:
     if task.value.baseline is not None:
        baseline_list.append(task.value.baseline)
 baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
+if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+    baseline_row["original_benchmark_average"] = None
 
 # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
 # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
@@ -201,6 +205,8 @@ for task in Tasks:
     if task.value.human_baseline is not None:
        baseline_list.append(task.value.human_baseline)
 human_baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
+if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+    human_baseline_row["original_benchmark_average"] = None
 
 @dataclass
 class ModelDetails:
@@ -278,3 +284,13 @@ NUMERIC_INTERVALS = {
     "~60": pd.Interval(45, 70, closed="right"),
     "70+": pd.Interval(70, 10000, closed="right"),
 }
+
+# Original HF Leaderboard tasks and metrics
+ORIGINAL_TASKS = [
+    ("arc:challenge", "acc_norm"),
+    ("hellaswag", "acc_norm"),
+    ("hendrycksTest", "acc"),
+    ("truthfulqa:mc", "mc2"),
+    ("winogrande", "acc"),
+    ("gsm8k", "acc")
+]
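
Note: the new ORIGINAL_TASKS list pairs each harness benchmark with the metric the original Open LLM Leaderboard reports for it (ARC-Challenge, HellaSwag, MMLU via hendrycksTest, TruthfulQA-MC2, Winogrande, GSM8K). A minimal sketch of how such (benchmark, metric) pairs are consumed when averaging a harness-style results dict; the fake_results values below are made up for illustration:

ORIGINAL_TASKS = [
    ("arc:challenge", "acc_norm"),
    ("hellaswag", "acc_norm"),
    ("hendrycksTest", "acc"),
    ("truthfulqa:mc", "mc2"),
    ("winogrande", "acc"),
    ("gsm8k", "acc"),
]

# Keys follow the "harness|<benchmark>|<n_shot>" pattern seen in the result files;
# the scores here are illustrative only.
fake_results = {
    "harness|arc:challenge|25": {"acc_norm": 0.61},
    "harness|hellaswag|10": {"acc_norm": 0.83},
    "harness|hendrycksTest-abstract_algebra|5": {"acc": 0.30},
    "harness|hendrycksTest-anatomy|5": {"acc": 0.55},
}

for benchmark, metric in ORIGINAL_TASKS:
    scores = [v.get(metric) for k, v in fake_results.items() if benchmark in k]
    scores = [s for s in scores if s is not None]
    if scores:  # per-subject hendrycksTest entries share the same prefix and get averaged
        print(benchmark, round(sum(scores) / len(scores) * 100.0, 2))
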
src/envs.py CHANGED
@@ -38,4 +38,9 @@ HAS_HIGHER_RATE_LIMIT = os.environ.get("HAS_HIGHER_RATE_LIMIT", "TheBloke").spli
 
 TRUST_REMOTE_CODE = bool(os.getenv("TRUST_REMOTE_CODE", False))
 
+#Set if you want to get an extra field with the average eval results from the HF leaderboard
+GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS = bool(os.getenv("GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS", False))
+ORIGINAL_HF_LEADERBOARD_RESULTS_REPO = os.getenv("ORIGINAL_HF_LEADERBOARD_RESULTS_REPO", "open-llm-leaderboard/results")
+ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, 'original_results')
+
 API = HfApi(token=H4_TOKEN)
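
Note on the flag parsing, which follows the same pattern as the existing TRUST_REMOTE_CODE line: bool() applied to os.getenv() treats any non-empty string as enabled, so the feature only stays off when the variable is unset or empty. A small sketch with illustrative values:

import os

os.environ["GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS"] = "1"
print(bool(os.getenv("GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS", False)))   # True

# Caveat: any non-empty string is truthy, so "0" or "False" would also enable it.
os.environ["GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS"] = "False"
print(bool(os.getenv("GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS", False)))   # True

del os.environ["GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS"]
print(bool(os.getenv("GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS", False)))   # False (unset)
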
src/leaderboard/read_evals.py CHANGED
@@ -10,8 +10,8 @@ import numpy as np
 from huggingface_hub import ModelCard
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
-
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, ORIGINAL_TASKS
+from src.envs import GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS
 
 @dataclass
 class EvalResult:
@@ -37,9 +37,10 @@ class EvalResult:
     tags: list = None
     json_filename: str = None
     eval_time: float = 0.0
+    original_benchmark_average: float = None
 
     @classmethod
-    def init_from_json_file(self, json_filepath):
+    def init_from_json_file(self, json_filepath, is_original=False):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
@@ -68,12 +69,15 @@ class EvalResult:
 
         # Extract results available in this file (some results are split in several files)
         results = {}
-        for task in Tasks:
-            task = task.value
-            """
+        tasks = [(task.value.benchmark, task.value.metric) for task in Tasks]
+        if is_original:
+            tasks = ORIGINAL_TASKS
+        for task in tasks:
+            benchmark, metric = task
+
             # We skip old mmlu entries
             wrong_mmlu_version = False
-            if task.benchmark == "hendrycksTest":
+            if benchmark == "hendrycksTest":
                 for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
                     if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
                         wrong_mmlu_version = True
@@ -82,19 +86,19 @@ class EvalResult:
                 continue
 
             # Some truthfulQA values are NaNs
-            if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
-                if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
-                    results[task.benchmark] = 0.0
+            if benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
+                if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][metric])):
+                    results[benchmark] = 0.0
                     continue
-            """
+
             # We average all scores of a given metric (mostly for mmlu)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
+            accs = np.array([v.get(metric, None) for k, v in data["results"].items() if benchmark in k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
 
             mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
+            results[benchmark] = mean_acc
 
         return self(
             eval_name=result_key,
@@ -131,8 +135,13 @@ class EvalResult:
         self.still_on_hub = file_dict["still_on_hub"]
         self.flagged = any("flagged" in tag for tag in file_dict["tags"])
         self.tags = file_dict["tags"]
+        if 'original_llm_scores' in file_dict:
+            if len(file_dict['original_llm_scores']) > 0:
+                if self.precision.value.name in file_dict['original_llm_scores']:
+                    self.original_benchmark_average = file_dict['original_llm_scores'][self.precision.value.name]
+                else:
+                    self.original_benchmark_average = max(list(file_dict['original_llm_scores'].values()))
 
-
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
@@ -160,6 +169,9 @@ class EvalResult:
         for task in Tasks:
             data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
+        if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+            data_dict[AutoEvalColumn.original_benchmark_average.name] = self.original_benchmark_average
+
         return data_dict
 
 
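Note: the block above that reads file_dict prefers the original-leaderboard average recorded for this result's own precision, and otherwise falls back to the highest score available across precisions. A minimal sketch of that lookup, using made-up scores and assumed precision names:

original_llm_scores = {"bfloat16": 64.2, "float16": 63.8}   # illustrative values
precision = "float16"

original_benchmark_average = None
if len(original_llm_scores) > 0:
    if precision in original_llm_scores:
        original_benchmark_average = original_llm_scores[precision]      # exact precision match
    else:
        original_benchmark_average = max(original_llm_scores.values())   # best available otherwise

print(original_benchmark_average)   # 63.8
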
src/scripts/update_all_request_files.py CHANGED
@@ -1,13 +1,26 @@
 from huggingface_hub import ModelFilter, snapshot_download
 from huggingface_hub import ModelCard
 
+import os
 import json
 import time
+from collections import defaultdict
 
 from src.submission.check_validity import is_model_on_hub, check_model_card, get_model_tags
-from src.envs import DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH, DYNAMIC_INFO_FILE_PATH, API, H4_TOKEN
-
-def update_models(file_path, models):
+from src.leaderboard.read_evals import EvalResult
+from src.envs import (
+    DYNAMIC_INFO_REPO,
+    DYNAMIC_INFO_PATH,
+    DYNAMIC_INFO_FILE_PATH,
+    API,
+    H4_TOKEN,
+    ORIGINAL_HF_LEADERBOARD_RESULTS_REPO,
+    ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH,
+    GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS
+)
+from src.display.utils import ORIGINAL_TASKS
+
+def update_models(file_path, models, original_leaderboard_files=None):
     """
     Search through all JSON files in the specified root folder and its subfolders,
     and update the likes key in JSON dict from value of input dict
@@ -20,6 +33,7 @@ def update_models(file_path, models):
             data['likes'] = 0
             data['downloads'] = 0
             data['created_at'] = ""
+            data['original_llm_scores'] = {}
             continue
 
         model_cfg = models[model_id]
@@ -28,6 +42,7 @@ def update_models(file_path, models):
         data['created_at'] = str(model_cfg.created_at)
         #data['params'] = get_model_size(model_cfg, data['precision'])
         data['license'] = model_cfg.card_data.license if model_cfg.card_data is not None else ""
+        data['original_llm_scores'] = {}
 
         # Is the model still on the hub?
         model_name = model_id
@@ -44,6 +59,23 @@ def update_models(file_path, models):
         status, _, model_card = check_model_card(model_id)
         tags = get_model_tags(model_card, model_id)
 
+
+        if original_leaderboard_files is not None and model_id in original_leaderboard_files:
+            eval_results = {}
+            for filepath in original_leaderboard_files[model_id]:
+                eval_result = EvalResult.init_from_json_file(filepath, is_original=True)
+                # Store results of same eval together
+                eval_name = eval_result.eval_name
+                if eval_name in eval_results.keys():
+                    eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+                else:
+                    eval_results[eval_name] = eval_result
+            for eval_result in eval_results.values():
+                precision = eval_result.precision.value.name
+                if len(eval_result.results) < len(ORIGINAL_TASKS):
+                    continue
+                data['original_llm_scores'][precision] = sum([v for v in eval_result.results.values() if v is not None]) / len(ORIGINAL_TASKS)
+
         data["tags"] = tags
 
         with open(file_path, 'w') as f:
@@ -68,11 +100,34 @@ def update_dynamic_files():
     ))
     id_to_model = {model.id : model for model in models}
 
+    id_to_leaderboard_files = defaultdict(list)
+    if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+        try:
+            print("UPDATE_DYNAMIC: Downloading Original HF Leaderboard results snapshot")
+            snapshot_download(
+                repo_id=ORIGINAL_HF_LEADERBOARD_RESULTS_REPO, local_dir=ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+            )
+            #original_leaderboard_files = [] #API.list_repo_files(ORIGINAL_HF_LEADERBOARD_RESULTS_REPO, repo_type='dataset')
+            for dirpath,_,filenames in os.walk(ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH):
+                for f in filenames:
+                    if not (f.startswith('results_') and f.endswith('.json')):
+                        continue
+
+                    filepath = os.path.join(dirpath[len(ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH)+1:], f)
+                    model_id = filepath[:filepath.find('/results_')]
+                    id_to_leaderboard_files[model_id].append(os.path.join(dirpath, f))
+
+            for model_id in id_to_leaderboard_files:
+                id_to_leaderboard_files[model_id].sort()
+        except Exception as e:
+            print(f"UPDATE_DYNAMIC: Could not download original results from : {e}")
+            id_to_leaderboard_files = None
+
     print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")
 
     start = time.time()
 
-    update_models(DYNAMIC_INFO_FILE_PATH, id_to_model)
+    update_models(DYNAMIC_INFO_FILE_PATH, id_to_model, id_to_leaderboard_files)
 
     print(f"UPDATE_DYNAMIC: updated in {time.time() - start:.2f} seconds")
 
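
Note: with the flag enabled, update_models stores one averaged score per precision in the dynamic info file, and only for evaluations that cover all six ORIGINAL_TASKS; src/leaderboard/read_evals.py then picks these up through file_dict['original_llm_scores']. An illustrative, made-up entry shape, assuming precision names such as 'bfloat16' and 'float16':

example_entry = {
    "likes": 123,
    "downloads": 4567,
    "created_at": "2024-01-01T00:00:00+00:00",
    "license": "apache-2.0",
    "still_on_hub": True,
    "tags": [],
    "original_llm_scores": {
        "bfloat16": 64.2,   # mean of the six ORIGINAL_TASKS metrics, in percent
        "float16": 63.8,
    },
}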