eduagarcia committed · Commit 8aaf0e7 · 1 parent: 0cc3edb

Add env variable SHOW_INCOMPLETE_EVALS and order evaluation queue by priority

Files changed:
- app.py (+16, -3)
- src/display/changelog.py (+1, -1)
- src/envs.py (+2, -0)
- src/leaderboard/read_evals.py (+3, -3)
- src/populate.py (+19, -6)
- tasks_config/pt_config.yaml (+1, -0)
app.py CHANGED

@@ -30,7 +30,19 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import
+from src.envs import (
+    API,
+    EVAL_REQUESTS_PATH,
+    DYNAMIC_INFO_REPO,
+    DYNAMIC_INFO_FILE_PATH,
+    DYNAMIC_INFO_PATH,
+    EVAL_RESULTS_PATH,
+    H4_TOKEN, IS_PUBLIC,
+    QUEUE_REPO,
+    REPO_ID,
+    RESULTS_REPO,
+    SHOW_INCOMPLETE_EVALS
+)
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 from src.scripts.update_all_request_files import update_dynamic_files

@@ -81,7 +93,8 @@ def init_space():
         requests_path=EVAL_REQUESTS_PATH,
         dynamic_path=DYNAMIC_INFO_FILE_PATH,
         cols=COLS,
-        benchmark_cols=BENCHMARK_COLS
+        benchmark_cols=BENCHMARK_COLS,
+        show_incomplete=SHOW_INCOMPLETE_EVALS
     )
     update_collections(original_df.copy())
     leaderboard_df = original_df.copy()

@@ -93,7 +106,7 @@ def init_space():
         running_eval_queue_df,
         pending_eval_queue_df,
         failed_eval_queue_df
-    ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+    ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS, show_incomplete=SHOW_INCOMPLETE_EVALS)

     return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, failed_eval_queue_df
src/display/changelog.py CHANGED

@@ -2,7 +2,7 @@ CHANGELOG_TEXT = f"""
 # Changes made to the leaderboard

 ### [1.1.0] - 2024-02-16
-Removed the Sparrow POR benchmark from the leaderboard because of low quality annotations
+Removed the Sparrow POR benchmark from the leaderboard because of low quality annotations
 Added HateBR Offensive, PT Hate Speech and tweetSentBR benchmarks to the leaderboard, started new evaluation queue for these benchmarks

 ### [1.0.0] - 2024-02-01
src/envs.py CHANGED

@@ -65,4 +65,6 @@ GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS = str2bool(get_config("GET_ORIGINAL_HF_
 ORIGINAL_HF_LEADERBOARD_RESULTS_REPO = get_config("ORIGINAL_HF_LEADERBOARD_RESULTS_REPO", "open-llm-leaderboard/results")
 ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, 'original_results')

+SHOW_INCOMPLETE_EVALS = str2bool(get_config("SHOW_INCOMPLETE_EVALS", False))
+
 API = HfApi(token=H4_TOKEN)
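For context, str2bool and get_config are existing helpers in src/envs.py that this commit does not touch, so their exact behavior is not shown here. The snippet below is only a minimal sketch, assuming the usual pattern (look up a config value, fall back to a default, coerce truthy strings to a bool), to illustrate how the new flag would be resolved:

import os

# Hypothetical stand-ins for the repo's get_config/str2bool helpers; the real
# implementations live elsewhere in src/envs.py and may differ.
def get_config(name, default):
    # assumption: config values can arrive via environment variables
    return os.environ.get(name, default)

def str2bool(value):
    # accept real booleans as well as strings like "true"/"1"/"yes"
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() in ("true", "1", "yes", "y")

SHOW_INCOMPLETE_EVALS = str2bool(get_config("SHOW_INCOMPLETE_EVALS", False))
print(SHOW_INCOMPLETE_EVALS)  # False unless the variable is set to a truthy value

With the default of False, the leaderboard keeps its previous behavior unless the Space explicitly enables incomplete evals.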
src/leaderboard/read_evals.py CHANGED

@@ -12,7 +12,7 @@ from huggingface_hub import ModelCard

 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, ORIGINAL_TASKS
-from src.envs import GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS
+from src.envs import GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS, SHOW_INCOMPLETE_EVALS

 @dataclass
 class EvalResult:

@@ -216,7 +216,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
             if (
-                req_content["status"] in ["FINISHED", "PENDING_NEW_EVAL"]
+                req_content["status"] in ["FINISHED", "PENDING_NEW_EVAL" if SHOW_INCOMPLETE_EVALS else "FINISHED"]
                 and req_content["precision"] == precision.split(".")[-1]
             ):
                 request_file = tmp_request_file

@@ -262,7 +262,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
     results = []
     for v in eval_results.values():
        try:
-            if v.status in ["FINISHED", "PENDING_NEW_EVAL"] and not v.hidden:
+            if v.status in ["FINISHED", "PENDING_NEW_EVAL" if SHOW_INCOMPLETE_EVALS else "FINISHED"] and not v.hidden:
                v.to_dict() # we test if the dict version is complete
                results.append(v)
        except KeyError as e:  # not all eval values present
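The status check relies on a conditional expression inside the list literal rather than two separate branches. A small standalone example (not taken from the repo) of how that membership test behaves for both flag values:

# When the flag is off, the list collapses to ["FINISHED", "FINISHED"], so only
# finished evals pass; when it is on, "PENDING_NEW_EVAL" is accepted as well.
for show_incomplete in (False, True):
    allowed = ["FINISHED", "PENDING_NEW_EVAL" if show_incomplete else "FINISHED"]
    print(show_incomplete, "PENDING_NEW_EVAL" in allowed)
# prints: False False, then True True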
src/populate.py CHANGED

@@ -1,5 +1,6 @@
 import json
 import os
+import copy

 import pandas as pd


@@ -9,7 +10,7 @@ from src.leaderboard.filter_models import filter_models_flags
 from src.leaderboard.read_evals import get_raw_eval_results


-def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list, show_incomplete=False) -> pd.DataFrame:
     raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
     all_data_json = [v.to_dict() for v in raw_data]
     all_data_json.append(baseline_row)

@@ -21,11 +22,12 @@ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str,
     df = df[cols].round(decimals=2)

     # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+    if not show_incomplete:
+        df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df


-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
+def get_evaluation_queue_df(save_path: str, cols: list, show_incomplete=False) -> list[pd.DataFrame]:
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
     all_evals = []


@@ -51,12 +53,23 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
         data[EvalQueueColumn.revision.name] = data.get("revision", "main")
         all_evals.append(data)

-
+    cols_pending = copy.deepcopy(cols)
+    cols_pending.append('source')
+    cols_pending.append('submitted_time')
+
+    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN", "PENDING_NEW_EVAL"]]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"]
+    finished_list = [e for e in all_evals if e["status"] in ["FINISHED", "PENDING_NEW_EVAL" if show_incomplete else "FINISHED"]]
     failed_list = [e for e in all_evals if e["status"] == "FAILED"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+    df_pending = pd.DataFrame.from_records(pending_list, columns=cols_pending)
     df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
     df_failed = pd.DataFrame.from_records(failed_list, columns=cols)
+
+    df_pending['source_priority'] = df_pending["source"].apply(lambda x: {"manual": 0, "leaderboard": 1, "script": 2}.get(x, 3))
+    df_pending['status_priority'] = df_pending["status"].apply(lambda x: {"PENDING": 2, "RERUN": 0, "PENDING_NEW_EVAL": 1}.get(x, 3))
+
+    df_pending = df_pending.sort_values(['source_priority', 'status_priority', 'submitted_time'])
+    df_pending = df_pending.drop(['source_priority', 'status_priority', 'submitted_time', 'source'], axis=1)
+
     return df_finished[cols], df_running[cols], df_pending[cols], df_failed[cols]
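The new ordering can be read in isolation: pending requests are ranked first by where they came from (manual before leaderboard before script), then by status (RERUN before PENDING_NEW_EVAL before PENDING), then by submission time. A self-contained sketch with made-up rows (the values below are illustrative, not real queue entries):

import pandas as pd

pending = pd.DataFrame([
    {"model": "a", "status": "PENDING",          "source": "script",      "submitted_time": "2024-02-10"},
    {"model": "b", "status": "RERUN",            "source": "leaderboard", "submitted_time": "2024-02-12"},
    {"model": "c", "status": "PENDING_NEW_EVAL", "source": "manual",      "submitted_time": "2024-02-11"},
    {"model": "d", "status": "PENDING",          "source": "manual",      "submitted_time": "2024-02-09"},
])

# Lower priority value sorts earlier; unknown sources/statuses fall back to 3 and sort last.
pending["source_priority"] = pending["source"].apply(lambda x: {"manual": 0, "leaderboard": 1, "script": 2}.get(x, 3))
pending["status_priority"] = pending["status"].apply(lambda x: {"PENDING": 2, "RERUN": 0, "PENDING_NEW_EVAL": 1}.get(x, 3))

pending = pending.sort_values(["source_priority", "status_priority", "submitted_time"])
print(pending["model"].tolist())  # ['c', 'd', 'b', 'a']

In the actual function the helper columns ('source', 'submitted_time' and the two *_priority columns) are dropped after sorting, so the displayed pending queue keeps the original column layout while inheriting the new order.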
tasks_config/pt_config.yaml CHANGED

@@ -10,6 +10,7 @@ config:
   LEADERBOARD_NAME: "Open PT-LLM Leaderboard"
   GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS: true
   TRUST_REMOTE_CODE: true
+  SHOW_INCOMPLETE_EVALS: false
 readme:
   general_description: |
     The Open PT LLM Leaderboard aims to provide a benchmark for the evaluation of