open_pt_llm_leaderboard

Running on CPU Upgrade

App Files Files Community

eduagarcia committed on Jan 24

Commit

71ecfbb

•

1 Parent(s): 88c4c5f

Feature: FIELD with original HF Leaderboard ranking

Browse files

Files changed (5) hide show

.gitignore +1 -0
src/display/utils.py +17 -1
src/envs.py +5 -0
src/leaderboard/read_evals.py +26 -14
src/scripts/update_all_request_files.py +59 -4

.gitignore CHANGED Viewed

@@ -7,6 +7,7 @@ __pycache__/
 run_dot_env.sh
 hub/
 modules/
 eval-queue/
 eval-results/

 run_dot_env.sh
 hub/
 modules/
+original_results/
 eval-queue/
 eval-results/

src/display/utils.py CHANGED Viewed

@@ -2,6 +2,7 @@ from dataclasses import dataclass, make_dataclass
 from enum import Enum
 from typing import List
 import pandas as pd
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -112,7 +113,8 @@ auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool",
 auto_eval_column_dict.append(["eval_time", ColumnContent, ColumnContent("Evaluation Time (s)", "number", False)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("Model Name", "str", False, dummy=True)])
 # We use make dataclass to dynamically fill the scores from Tasks
@@ -160,6 +162,8 @@ for task in Tasks:
     if task.value.baseline is not None:
         baseline_list.append(task.value.baseline)
 baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
 # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
 # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
@@ -201,6 +205,8 @@ for task in Tasks:
     if task.value.human_baseline is not None:
         baseline_list.append(task.value.human_baseline)
 human_baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
 @dataclass
 class ModelDetails:
@@ -278,3 +284,13 @@ NUMERIC_INTERVALS = {
     "~60": pd.Interval(45, 70, closed="right"),
     "70+": pd.Interval(70, 10000, closed="right"),
 }

 from enum import Enum
 from typing import List
 import pandas as pd
+from src.envs import GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 auto_eval_column_dict.append(["eval_time", ColumnContent, ColumnContent("Evaluation Time (s)", "number", False)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("Model Name", "str", False, dummy=True)])
+if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+    auto_eval_column_dict.append(["original_benchmark_average", ColumnContent, ColumnContent("🤗 Leaderboard Average", "number", False)])
 # We use make dataclass to dynamically fill the scores from Tasks
     if task.value.baseline is not None:
         baseline_list.append(task.value.baseline)
 baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
+if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+    baseline_row["original_benchmark_average"] = None
 # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
 # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
     if task.value.human_baseline is not None:
         baseline_list.append(task.value.human_baseline)
 human_baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
+if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+    human_baseline_row["original_benchmark_average"] = None
 @dataclass
 class ModelDetails:
     "~60": pd.Interval(45, 70, closed="right"),
     "70+": pd.Interval(70, 10000, closed="right"),
 }
+# Original HF Leaderboard tasks and metrics
+ORIGINAL_TASKS = [
+    ("arc:challenge", "acc_norm"),
+    ("hellaswag", "acc_norm"),
+    ("hendrycksTest", "acc"),
+    ("truthfulqa:mc", "mc2"),
+    ("winogrande", "acc"),
+    ("gsm8k", "acc")
+]

src/envs.py CHANGED Viewed

@@ -38,4 +38,9 @@ HAS_HIGHER_RATE_LIMIT = os.environ.get("HAS_HIGHER_RATE_LIMIT", "TheBloke").spli
 TRUST_REMOTE_CODE = bool(os.getenv("TRUST_REMOTE_CODE", False))
 API = HfApi(token=H4_TOKEN)

 TRUST_REMOTE_CODE = bool(os.getenv("TRUST_REMOTE_CODE", False))
+# Set to any non-empty value to get an extra field with the average eval results from the original HF leaderboard
+GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS = bool(os.getenv("GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS", False))
+ORIGINAL_HF_LEADERBOARD_RESULTS_REPO = os.getenv("ORIGINAL_HF_LEADERBOARD_RESULTS_REPO", "open-llm-leaderboard/results")
+ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, 'original_results')
 API = HfApi(token=H4_TOKEN)

src/leaderboard/read_evals.py CHANGED Viewed

@@ -10,8 +10,8 @@ import numpy as np
 from huggingface_hub import ModelCard
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 @dataclass
 class EvalResult:
@@ -37,9 +37,10 @@ class EvalResult:
     tags: list = None
     json_filename: str = None
     eval_time: float = 0.0
     @classmethod
-    def init_from_json_file(self, json_filepath):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
@@ -68,12 +69,15 @@ class EvalResult:
         # Extract results available in this file (some results are split in several files)
         results = {}
-        for task in Tasks:
-            task = task.value
-            """
             # We skip old mmlu entries
             wrong_mmlu_version = False
-            if task.benchmark == "hendrycksTest":
                 for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
                     if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
                         wrong_mmlu_version = True
@@ -82,19 +86,19 @@ class EvalResult:
                 continue
             # Some truthfulQA values are NaNs
-            if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
-                if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
-                    results[task.benchmark] = 0.0
                     continue
-            """
             # We average all scores of a given metric (mostly for mmlu)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
             mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
         return self(
             eval_name=result_key,
@@ -131,8 +135,13 @@ class EvalResult:
         self.still_on_hub = file_dict["still_on_hub"]
         self.flagged = any("flagged" in tag for tag in file_dict["tags"])
         self.tags = file_dict["tags"]
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
@@ -160,6 +169,9 @@ class EvalResult:
         for task in Tasks:
             data_dict[task.value.col_name] = self.results[task.value.benchmark]
         return data_dict

 from huggingface_hub import ModelCard
 from src.display.formatting import make_clickable_model
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, ORIGINAL_TASKS
+from src.envs import GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS
 @dataclass
 class EvalResult:
     tags: list = None
     json_filename: str = None
     eval_time: float = 0.0
+    original_benchmark_average: float = None
     @classmethod
+    def init_from_json_file(self, json_filepath, is_original=False):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
         # Extract results available in this file (some results are split in several files)
         results = {}
+        tasks = [(task.value.benchmark, task.value.metric) for task in Tasks]
+        if is_original:
+            tasks = ORIGINAL_TASKS
+        for task in tasks:
+            benchmark, metric = task
             # We skip old mmlu entries
             wrong_mmlu_version = False
+            if benchmark == "hendrycksTest":
                 for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
                     if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
                         wrong_mmlu_version = True
                 continue
             # Some truthfulQA values are NaNs
+            if benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
+                if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][metric])):
+                    results[benchmark] = 0.0
                     continue
             # We average all scores of a given metric (mostly for mmlu)
+            accs = np.array([v.get(metric, None) for k, v in data["results"].items() if benchmark in k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
             mean_acc = np.mean(accs) * 100.0
+            results[benchmark] = mean_acc
         return self(
             eval_name=result_key,
         self.still_on_hub = file_dict["still_on_hub"]
         self.flagged = any("flagged" in tag for tag in file_dict["tags"])
         self.tags = file_dict["tags"]
+        if 'original_llm_scores' in file_dict:
+            if len(file_dict['original_llm_scores']) > 0:
+                if self.precision.value.name in file_dict['original_llm_scores']:
+                    self.original_benchmark_average = file_dict['original_llm_scores'][self.precision.value.name]
+                else:
+                    self.original_benchmark_average = max(list(file_dict['original_llm_scores'].values()))
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         for task in Tasks:
             data_dict[task.value.col_name] = self.results[task.value.benchmark]
+        if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+            data_dict[AutoEvalColumn.original_benchmark_average.name] = self.original_benchmark_average
         return data_dict

src/scripts/update_all_request_files.py CHANGED Viewed

@@ -1,13 +1,26 @@
 from huggingface_hub import ModelFilter, snapshot_download
 from huggingface_hub import ModelCard
 import json
 import time
 from src.submission.check_validity import is_model_on_hub, check_model_card, get_model_tags
-from src.envs import DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH, DYNAMIC_INFO_FILE_PATH, API, H4_TOKEN
-def update_models(file_path, models):
     """
     Search through all JSON files in the specified root folder and its subfolders,
     and update the likes key in JSON dict from value of input dict
@@ -20,6 +33,7 @@ def update_models(file_path, models):
                 data['likes'] = 0
                 data['downloads'] = 0
                 data['created_at'] = ""
                 continue
             model_cfg = models[model_id]
@@ -28,6 +42,7 @@ def update_models(file_path, models):
             data['created_at'] = str(model_cfg.created_at)
             #data['params'] = get_model_size(model_cfg, data['precision'])
             data['license'] = model_cfg.card_data.license if model_cfg.card_data is not None else ""
             # Is the model still on the hub?
             model_name = model_id
@@ -44,6 +59,23 @@ def update_models(file_path, models):
                 status, _, model_card = check_model_card(model_id)
                 tags = get_model_tags(model_card, model_id)
             data["tags"] = tags
     with open(file_path, 'w') as f:
@@ -68,11 +100,34 @@ def update_dynamic_files():
     ))
     id_to_model = {model.id : model for model in models}
     print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")
     start = time.time()
-    update_models(DYNAMIC_INFO_FILE_PATH, id_to_model)
     print(f"UPDATE_DYNAMIC: updated in {time.time() - start:.2f} seconds")

 from huggingface_hub import ModelFilter, snapshot_download
 from huggingface_hub import ModelCard
+import os
 import json
 import time
+from collections import defaultdict
 from src.submission.check_validity import is_model_on_hub, check_model_card, get_model_tags
+from src.leaderboard.read_evals import EvalResult
+from src.envs import (
+    DYNAMIC_INFO_REPO,
+    DYNAMIC_INFO_PATH,
+    DYNAMIC_INFO_FILE_PATH,
+    API,
+    H4_TOKEN,
+    ORIGINAL_HF_LEADERBOARD_RESULTS_REPO,
+    ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH,
+    GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS
+)
+from src.display.utils import ORIGINAL_TASKS
+def update_models(file_path, models, original_leaderboard_files=None):
     """
     Search through all JSON files in the specified root folder and its subfolders,
     and update the likes key in JSON dict from value of input dict
                 data['likes'] = 0
                 data['downloads'] = 0
                 data['created_at'] = ""
+                data['original_llm_scores'] = {}
                 continue
             model_cfg = models[model_id]
             data['created_at'] = str(model_cfg.created_at)
             #data['params'] = get_model_size(model_cfg, data['precision'])
             data['license'] = model_cfg.card_data.license if model_cfg.card_data is not None else ""
+            data['original_llm_scores'] = {}
             # Is the model still on the hub?
             model_name = model_id
                 status, _, model_card = check_model_card(model_id)
                 tags = get_model_tags(model_card, model_id)
+            if original_leaderboard_files is not None and model_id in original_leaderboard_files:
+                eval_results = {}
+                for filepath in original_leaderboard_files[model_id]:
+                    eval_result = EvalResult.init_from_json_file(filepath, is_original=True)
+                    # Store results of same eval together
+                    eval_name = eval_result.eval_name
+                    if eval_name in eval_results.keys():
+                        eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+                    else:
+                        eval_results[eval_name] = eval_result
+                for eval_result in eval_results.values():
+                    precision = eval_result.precision.value.name
+                    if len(eval_result.results) < len(ORIGINAL_TASKS):
+                        continue
+                    data['original_llm_scores'][precision] = sum([v for v in eval_result.results.values() if v is not None]) / len(ORIGINAL_TASKS)
             data["tags"] = tags
     with open(file_path, 'w') as f:
     ))
     id_to_model = {model.id : model for model in models}
+    id_to_leaderboard_files = defaultdict(list)
+    if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+        try:
+            print("UPDATE_DYNAMIC: Downloading Original HF Leaderboard results snapshot")
+            snapshot_download(
+                repo_id=ORIGINAL_HF_LEADERBOARD_RESULTS_REPO, local_dir=ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+            )
+            #original_leaderboard_files = [] #API.list_repo_files(ORIGINAL_HF_LEADERBOARD_RESULTS_REPO, repo_type='dataset')
+            for dirpath,_,filenames in os.walk(ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH):
+                for f in filenames:
+                    if not (f.startswith('results_') and f.endswith('.json')):
+                        continue
+                    filepath = os.path.join(dirpath[len(ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH)+1:], f)
+                    model_id = filepath[:filepath.find('/results_')]
+                    id_to_leaderboard_files[model_id].append(os.path.join(dirpath, f))
+            for model_id in id_to_leaderboard_files:
+                id_to_leaderboard_files[model_id].sort()
+        except Exception as e:
+            print(f"UPDATE_DYNAMIC: Could not download original results from : {e}")
+            id_to_leaderboard_files = None
     print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")
     start = time.time()
+    update_models(DYNAMIC_INFO_FILE_PATH, id_to_model, id_to_leaderboard_files)
     print(f"UPDATE_DYNAMIC: updated in {time.time() - start:.2f} seconds")