Spaces: Running on CPU Upgrade
Minseok Bae committed
Commit 58b9de9 · Parent(s): d7b7dc6

Integrated backend pipelines - error occurs during model submission. (Debugging needed).
Files changed:
- .gitignore +2 -0
- app.py +53 -73
- main_backend.py +72 -0
- requirements.txt +2 -1
- scripts/create_request_file.py +8 -6
- src/backend/evaluate_model.py +81 -23
- src/backend/manage_requests.py +17 -23
- src/backend/model_operations.py +151 -23
- src/backend/run_eval_suite.py +37 -24
- src/backend/sort_queue.py +1 -2
- src/backend/util.py +51 -19
- src/display/about.py +18 -15
- src/display/css_html_js.py +1 -1
- src/display/utils.py +23 -12
- src/envs.py +6 -3
- src/leaderboard/read_evals.py +66 -74
- src/populate.py +10 -10
- src/submission/check_validity.py +2 -4
- src/submission/submit.py +21 -25
- tests/test_evaluate_model.py +87 -0
- tests/test_evaluator.py +59 -0
- tests/test_main_backend.py +54 -0
- tests/test_summary_generator.py +68 -0
.gitignore
CHANGED
@@ -11,5 +11,7 @@ human_evals/
 eval-queue/
 eval-results/
 auto_evals/
+eval-queue-bk/
+eval-results-bk/
 
 src/assets/model_counts.html
app.py
CHANGED
@@ -3,60 +3,40 @@ import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 
-from src.display.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
+import src.display.about as about
 from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    NUMERIC_INTERVALS,
-    TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
-)
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
+import src.display.utils as utils
+import src.envs as envs
+import src.populate as populate
+import src.submission.submit as submit
 
 
 def restart_space():
-    API.restart_space(repo_id=REPO_ID, token=TOKEN)
+    envs.API.restart_space(repo_id=envs.REPO_ID, token=envs.TOKEN)
 
 try:
-    print(EVAL_REQUESTS_PATH)
+    print(envs.EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+        repo_id=envs.QUEUE_REPO, local_dir=envs.EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
     )
 except Exception:
     restart_space()
 try:
-    print(EVAL_RESULTS_PATH)
+    print(envs.EVAL_RESULTS_PATH)
     snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+        repo_id=envs.RESULTS_REPO, local_dir=envs.EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
     )
 except Exception:
     restart_space()
 
-
-raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+raw_data, original_df = populate.get_leaderboard_df(envs.EVAL_RESULTS_PATH, envs.EVAL_REQUESTS_PATH, utils.COLS, utils.BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
 
 (
     finished_eval_queue_df,
     running_eval_queue_df,
     pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+) = populate.get_evaluation_queue_df(envs.EVAL_REQUESTS_PATH, utils.EVAL_COLS)
 
 
 # Searching and filtering
@@ -76,17 +56,17 @@ def update_table(
 
 
 def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
-    return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
+    return df[(df[utils.AutoEvalColumn.dummy.name].str.contains(query, case=False))]
 
 
 def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     always_here_cols = [
-        AutoEvalColumn.model_type_symbol.name,
-        AutoEvalColumn.model.name,
+        utils.AutoEvalColumn.model_type_symbol.name,
+        utils.AutoEvalColumn.model.name,
     ]
     # We use COLS to maintain sorting
     filtered_df = df[
-        always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
+        always_here_cols + [c for c in utils.COLS if c in df.columns and c in columns] + [utils.AutoEvalColumn.dummy.name]
     ]
     return filtered_df
 
@@ -104,7 +84,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
     if len(final_df) > 0:
         filtered_df = pd.concat(final_df)
         filtered_df = filtered_df.drop_duplicates(
-            subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
+            subset=[utils.AutoEvalColumn.model.name, utils.AutoEvalColumn.precision.name, utils.AutoEvalColumn.revision.name]
         )
 
     return filtered_df
@@ -117,14 +97,14 @@ def filter_models(
     if show_deleted:
         filtered_df = df
     else:  # Show only still on the hub models
-        filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
+        filtered_df = df[df[utils.AutoEvalColumn.still_on_hub.name] == True]
 
     type_emoji = [t[0] for t in type_query]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
+    filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
+    filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
 
-    numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
-    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
+    numeric_interval = pd.IntervalIndex(sorted([utils.NUMERIC_INTERVALS[s] for s in size_query]))
+    params_column = pd.to_numeric(df[utils.AutoEvalColumn.params.name], errors="coerce")
     mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
     filtered_df = filtered_df.loc[mask]
 
@@ -133,8 +113,8 @@ def filter_models(
 
 demo = gr.Blocks(css=custom_css)
 with demo:
-    gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    gr.HTML(about.TITLE)
+    gr.Markdown(about.INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
@@ -150,12 +130,12 @@ with demo:
                     shown_columns = gr.CheckboxGroup(
                         choices=[
                             c.name
-                            for c in fields(AutoEvalColumn)
+                            for c in utils.fields(utils.AutoEvalColumn)
                             if not c.hidden and not c.never_hidden and not c.dummy
                         ],
                         value=[
                             c.name
-                            for c in fields(AutoEvalColumn)
+                            for c in utils.fields(utils.AutoEvalColumn)
                             if c.displayed_by_default and not c.hidden and not c.never_hidden
                         ],
                         label="Select columns to show",
@@ -170,34 +150,34 @@ with demo:
                     #with gr.Box(elem_id="box-filter"):
                     filter_columns_type = gr.CheckboxGroup(
                         label="Model types",
-                        choices=[t.to_str() for t in ModelType],
-                        value=[t.to_str() for t in ModelType],
+                        choices=[t.to_str() for t in utils.ModelType],
+                        value=[t.to_str() for t in utils.ModelType],
                         interactive=True,
                         elem_id="filter-columns-type",
                     )
                     filter_columns_precision = gr.CheckboxGroup(
                         label="Precision",
-                        choices=[i.value.name for i in Precision],
-                        value=[i.value.name for i in Precision],
+                        choices=[i.value.name for i in utils.Precision],
+                        value=[i.value.name for i in utils.Precision],
                         interactive=True,
                         elem_id="filter-columns-precision",
                     )
                     filter_columns_size = gr.CheckboxGroup(
                         label="Model sizes (in billions of parameters)",
-                        choices=list(NUMERIC_INTERVALS.keys()),
-                        value=list(NUMERIC_INTERVALS.keys()),
+                        choices=list(utils.NUMERIC_INTERVALS.keys()),
+                        value=list(utils.NUMERIC_INTERVALS.keys()),
                         interactive=True,
                         elem_id="filter-columns-size",
                     )
 
             leaderboard_table = gr.components.Dataframe(
                 value=leaderboard_df[
-                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+                    [c.name for c in utils.fields(utils.AutoEvalColumn) if c.never_hidden]
                     + shown_columns.value
-                    + [AutoEvalColumn.dummy.name]
+                    + [utils.AutoEvalColumn.dummy.name]
                 ],
-                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=TYPES,
+                headers=[c.name for c in utils.fields(utils.AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                datatype=utils.TYPES,
                 elem_id="leaderboard-table",
                 interactive=False,
                 visible=True,
@@ -206,9 +186,9 @@ with demo:
 
             # Dummy leaderboard for handling the case when the user uses backspace key
             hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=original_df[COLS],
-                headers=COLS,
-                datatype=TYPES,
+                value=original_df[utils.COLS],
+                headers=utils.COLS,
+                datatype=utils.TYPES,
                 visible=False,
             )
             search_bar.submit(
@@ -241,12 +221,12 @@ with demo:
             )
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+            gr.Markdown(about.LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
                 with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+                    gr.Markdown(about.EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
                 with gr.Column():
                     with gr.Accordion(
@@ -256,8 +236,8 @@ with demo:
                         with gr.Row():
                             finished_eval_table = gr.components.Dataframe(
                                 value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
+                                headers=utils.EVAL_COLS,
+                                datatype=utils.EVAL_TYPES,
                                 row_count=5,
                             )
                     with gr.Accordion(
@@ -267,8 +247,8 @@ with demo:
                         with gr.Row():
                             running_eval_table = gr.components.Dataframe(
                                 value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
+                                headers=utils.EVAL_COLS,
+                                datatype=utils.EVAL_TYPES,
                                 row_count=5,
                             )
 
@@ -279,8 +259,8 @@ with demo:
                         with gr.Row():
                             pending_eval_table = gr.components.Dataframe(
                                 value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
+                                headers=utils.EVAL_COLS,
+                                datatype=utils.EVAL_TYPES,
                                 row_count=5,
                             )
             with gr.Row():
@@ -291,7 +271,7 @@ with demo:
                     model_name_textbox = gr.Textbox(label="Model name")
                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                     model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                        choices=[t.to_str(" : ") for t in utils.ModelType if t != utils.ModelType.Unknown],
                         label="Model type",
                         multiselect=False,
                         value=None,
@@ -300,14 +280,14 @@ with demo:
 
                 with gr.Column():
                     precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                        choices=[i.value.name for i in utils.Precision if i != utils.Precision.Unknown],
                         label="Precision",
                         multiselect=False,
                         value="float16",
                         interactive=True,
                     )
                     weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
+                        choices=[i.value.name for i in utils.WeightType],
                         label="Weights type",
                         multiselect=False,
                         value="Original",
@@ -318,7 +298,7 @@ with demo:
             submit_button = gr.Button("Submit Eval")
             submission_result = gr.Markdown()
             submit_button.click(
-                add_new_eval,
+                submit.add_new_eval,
                 [
                     model_name_textbox,
                     base_model_name_textbox,
@@ -333,8 +313,8 @@ with demo:
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
+                value=about.CITATION_BUTTON_TEXT,
+                label=about.CITATION_BUTTON_LABEL,
                 lines=20,
                 elem_id="citation-button",
                 show_copy_button=True,
main_backend.py
ADDED
@@ -0,0 +1,72 @@
+import logging
+import pprint
+
+from huggingface_hub import snapshot_download
+
+import src.backend.run_eval_suite as run_eval_suite
+import src.backend.manage_requests as manage_requests
+import src.backend.sort_queue as sort_queue
+import src.envs as envs
+
+logging.basicConfig(level=logging.ERROR)
+pp = pprint.PrettyPrinter(width=80)
+
+PENDING_STATUS = "PENDING"
+RUNNING_STATUS = "RUNNING"
+FINISHED_STATUS = "FINISHED"
+FAILED_STATUS = "FAILED"
+
+snapshot_download(repo_id=envs.RESULTS_REPO, revision="main",
+                  local_dir=envs.EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+snapshot_download(repo_id=envs.QUEUE_REPO, revision="main",
+                  local_dir=envs.EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+
+
+def run_auto_eval():
+    current_pending_status = [PENDING_STATUS]
+
+    manage_requests.check_completed_evals(
+        api=envs.API,
+        checked_status=RUNNING_STATUS,
+        completed_status=FINISHED_STATUS,
+        failed_status=FAILED_STATUS,
+        hf_repo=envs.QUEUE_REPO,
+        local_dir=envs.EVAL_REQUESTS_PATH_BACKEND,
+        hf_repo_results=envs.RESULTS_REPO,
+        local_dir_results=envs.EVAL_RESULTS_PATH_BACKEND
+    )
+
+    eval_requests = manage_requests.get_eval_requests(job_status=current_pending_status,
+                                                      hf_repo=envs.QUEUE_REPO,
+                                                      local_dir=envs.EVAL_REQUESTS_PATH_BACKEND)
+    eval_requests = sort_queue.sort_models_by_priority(api=envs.API, models=eval_requests)
+
+    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
+
+    if len(eval_requests) == 0:
+        print("No eval requests found. Exiting.")
+        return
+
+    eval_request = eval_requests[0]
+    pp.pprint(eval_request)
+
+    manage_requests.set_eval_request(
+        api=envs.API,
+        eval_request=eval_request,
+        new_status=RUNNING_STATUS,
+        hf_repo=envs.QUEUE_REPO,
+        local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
+    )
+
+    run_eval_suite.run_evaluation(
+        eval_request=eval_request,
+        local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
+        results_repo=envs.RESULTS_REPO,
+        batch_size=1,
+        device=envs.DEVICE,
+        no_cache=True,
+    )
+
+
+if __name__ == "__main__":
+    run_auto_eval()
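Note on running the new backend: main_backend.py processes at most one PENDING request per invocation (run_auto_eval() returns after dispatching a single evaluation), so it has to be re-run to drain the queue. Below is a minimal sketch of driving it on a schedule with APScheduler, which app.py already depends on; the 30-minute interval and the direct import of main_backend are illustrative assumptions, not part of this commit.

# Sketch only: re-run the backend worker on a fixed interval (interval is an assumption).
from apscheduler.schedulers.blocking import BlockingScheduler

# Importing main_backend also runs its module-level snapshot_download calls.
from main_backend import run_auto_eval

scheduler = BlockingScheduler()
scheduler.add_job(run_auto_eval, "interval", minutes=30)  # re-check the eval queue every 30 minutes
scheduler.start()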
requirements.txt
CHANGED
@@ -12,4 +12,5 @@ python-dateutil==2.8.2
 requests==2.28.2
 tqdm==4.65.0
 transformers==4.35.2
-tokenizers>=0.15.0
+tokenizers>=0.15.0
+sentence-transformers==2.2.2
scripts/create_request_file.py
CHANGED
@@ -7,10 +7,9 @@ from datetime import datetime, timezone
 import click
 from colorama import Fore
 from huggingface_hub import HfApi, snapshot_download
-from util import QUEUE_REPO, EVAL_REQUESTS_PATH
 
-
-
+from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH
+
 
 precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
 model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
@@ -25,7 +24,8 @@ def get_model_size(model_info, precision: str):
     try:
         size_match = re.search(size_pattern, model_info.modelId.lower())
         model_size = size_match.group(0)
-        model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
+        model_size = round(float(model_size[:-1]) if model_size[-1] == "b"
+                           else float(model_size[:-1]) / 1e3, 3)
     except AttributeError:
         return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
 
@@ -37,13 +37,15 @@ def get_model_size(model_info, precision: str):
 def main():
     api = HfApi()
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset")
+    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH,
+                      repo_type="dataset")
 
     model_name = click.prompt("Enter model name")
     revision = click.prompt("Enter revision", default="main")
     precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
     model_type = click.prompt("Enter model type", type=click.Choice(model_types))
-    weight_type = click.prompt("Enter weight type", default="Original", type=click.Choice(weight_types))
+    weight_type = click.prompt("Enter weight type", default="Original",
+                               type=click.Choice(weight_types))
     base_model = click.prompt("Enter base model", default="")
     status = click.prompt("Enter status", default="FINISHED")
src/backend/evaluate_model.py
CHANGED
@@ -1,37 +1,95 @@
+import logging
 import pandas as pd
 
+import src.envs as envs
+
+from src.backend.model_operations import SummaryGenerator, EvaluationModel
+import src.backend.util as util
+
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(levelname)s - %(message)s')
+
 
 class Evaluator:
+    """A class to evaluate summaries generated by a language model.
+
+    Attributes:
+        model (str): The name or path of the model.
+        revision (str): The model revision.
+        precision (str): The precision setting of the model.
+        num_fewshot (int): Number of few-shot examples to use.
+        batch_size (int): Batch size for processing.
+        device (str): The device to run the model on.
+        no_cache (bool): Flag to disable caching.
+        limit (int): Limit on the number of items to process.
+        write_out (bool): Whether to write results to a file.
+        output_base_path (str): Base path for output files.
+        summary_generator (SummaryGenerator): Instance for generating summaries.
+        eval_model (EvaluationModel): Instance for evaluating summaries.
+    """
+    def __init__(self, model, revision, precision, batch_size,
+                 device, no_cache, limit, write_out=True,
+                 output_base_path='logs'):
+        """Initializes the Evaluator with the given model and settings.
+
+        Args:
+            model (str): The name or path of the model.
+            revision (str): The model revision.
+            precision (str): The precision setting of the model.
+            num_fewshot (int): Number of few-shot examples to use.
+            batch_size (int): Batch size for processing.
+            device (str): The device to run the model on.
+            no_cache (bool): Flag to disable caching.
+            limit (int): Limit on the number of items to process.
+            write_out (bool): Whether to write results to a file.
+            output_base_path (str): Base path for output files.
+        """
         self.model = model
         self.revision = revision
         self.precision = precision
-        self.num_fewshot = num_fewshot
         self.batch_size = batch_size
         self.device = device
         self.no_cache = no_cache
         self.limit = limit
         self.write_out = write_out
         self.output_base_path = output_base_path
+        try:
+            self.summary_generator = SummaryGenerator(model, revision)
+            self.eval_model = EvaluationModel(envs.HEM_PATH)
+        except Exception as e:
+            logging.error(f"Error initializing Evaluator: {e}")
+            raise
 
     def evaluate(self):
+        """
+        Performs the evaluation process by generating summaries
+        and computing metrics.
+
+        Returns:
+            dict: A dictionary containing evaluation results.
+        """
+        try:
+            df = pd.read_csv(envs.SOURCE_PATH)
+            generated_summaries_df = self.summary_generator.generate_summaries(df)
+
+            avg_summary_len = self.summary_generator.avg_length
+            answer_rate = self.summary_generator.answer_rate
+            error_rate = self.summary_generator.error_rate
+
+            hallucination_scores = self.eval_model.evaluate_hallucination(
+                generated_summaries_df)
+            accuracy = self.eval_model.compute_accuracy()
+            hallucination_rate = self.eval_model.hallucination_rate
+
+            results = util.format_results(model_name=self.model, revision=self.revision,
+                                          precision=self.precision, accuracy=accuracy,
+                                          hallucination_rate=hallucination_rate, answer_rate=answer_rate,
+                                          avg_summary_len=avg_summary_len, error_rate=error_rate)
+
+            return results
+        except FileNotFoundError:
+            logging.error(f"File not found: {envs.SOURCE_PATH}")
+            raise
+        except Exception as e:
+            logging.error(f"Error during evaluation: {e}")
+            raise
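The new Evaluator wraps SummaryGenerator and EvaluationModel behind a single evaluate() call that reads the source CSV at envs.SOURCE_PATH, generates summaries, scores them, and returns the dictionary built by util.format_results. A minimal sketch of the intended call pattern, mirroring how run_eval_suite.run_evaluation constructs it further down; the model id and argument values here are illustrative only, not taken from this commit.

# Sketch: exercise the new Evaluator directly (argument values are illustrative).
from src.backend.evaluate_model import Evaluator

evaluator = Evaluator(model="some-org/some-model",  # hypothetical model id
                      revision="main", precision="float16",
                      batch_size=1, device="cpu", no_cache=True, limit=None)
results = evaluator.evaluate()  # generate summaries, score hallucination, format results
print(results["results"]["hallucination_rate"])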
src/backend/manage_requests.py
CHANGED
@@ -1,10 +1,10 @@
+import os
 import glob
 import json
 from dataclasses import dataclass
 from typing import Optional
 
 from huggingface_hub import HfApi, snapshot_download
-from src.envs import TOKEN
 
 @dataclass
 class EvalRequest:
@@ -22,42 +22,34 @@ class EvalRequest:
     likes: Optional[int] = 0
     params: Optional[int] = None
     license: Optional[str] = ""
+
     def get_model_args(self):
         model_args = f"pretrained={self.model},revision={self.revision}"
 
         if self.precision in ["float16", "bfloat16"]:
             model_args += f",dtype={self.precision}"
-        # Quantized models need some added config, the install of bits and bytes, etc
-        #elif self.precision == "8bit":
-        #    model_args += ",load_in_8bit=True"
-        #elif self.precision == "4bit":
-        #    model_args += ",load_in_4bit=True"
-        #elif self.precision == "GPTQ":
-        #    A GPTQ model does not need dtype to be specified,
-        #    it will be inferred from the config
-            pass
         else:
-            raise
-
+            raise ValueError(f"Unknown precision {self.precision}.")
+
         return model_args
 
 
-def set_eval_request(api: HfApi, eval_request: EvalRequest,
+def set_eval_request(api: HfApi, eval_request: EvalRequest, new_status: str,
+                     hf_repo: str, local_dir: str):
+    """Updates a given eval request with its new status on the hub (running, completed, failed,)"""
     json_filepath = eval_request.json_filepath
 
     with open(json_filepath) as fp:
         data = json.load(fp)
 
-    data["status"] =
+    data["status"] = new_status
 
     with open(json_filepath, "w") as f:
         f.write(json.dumps(data))
 
     api.upload_file(
         path_or_fileobj=json_filepath,
-        path_in_repo=
+        path_in_repo=os.path.relpath(json_filepath, start=local_dir),
         repo_id=hf_repo,
         repo_type="dataset",
     )
@@ -69,9 +61,10 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
     likes.
 
     Returns:
+        list[EvalRequest]: a list of model info dicts.
     """
-    snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir,
+    snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir,
+                      repo_type="dataset", max_workers=60)
     json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
 
     eval_requests = []
@@ -97,7 +90,8 @@ def check_completed_evals(
     local_dir_results: str,
 ):
     """Checks if the currently running evals are completed, if yes, update their status on the hub."""
-    snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results,
+    snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results,
+                      repo_type="dataset", max_workers=60)
 
     running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
 
@@ -107,10 +101,10 @@ def check_completed_evals(
         print(f"Checking {model}")
 
         output_path = model
-
-
+        output_files = f"{local_dir_results}/{output_path}/results*.json"
+        output_files_exists = len(glob.glob(output_files)) > 0
 
-        if
+        if output_files_exists:
             print(
                 f"EXISTS output file exists for {model} setting it to {completed_status}"
             )
src/backend/model_operations.py
CHANGED
@@ -1,96 +1,224 @@
+import logging
+
 import numpy as np
 import pandas as pd
+import spacy
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from sentence_transformers import CrossEncoder
 
+import src.backend.util as util
+
+# Set up basic configuration for logging
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(levelname)s - %(message)s')
+
+# Load spacy model for word tokenization
+nlp = spacy.load("en_core_web_sm")
+
 
 def load_evaluation_model(model_path):
+    """Load the evaluation model from the given path
+
+    Args:
+        model_path (str): Path to the evaluation model
+
+    Returns:
+        CrossEncoder: The evaluation model
+    """
     model = CrossEncoder(model_path)
-    model.save_pretrained('.checkpoints/{model_path}')
     return model
 
+
+class ModelLoadingException(Exception):
+    """Exception raised for errors in loading a model.
+
+    Attributes:
+        model_id (str): The model identifier.
+        revision (str): The model revision.
+    """
+
+    def __init__(self, model_id, revision, messages="Error initializing model"):
+        self.model_id = model_id
+        self.revision = revision
+        super().__init__(f"{messages} id={model_id} revision={revision}")
+
+
 class SummaryGenerator:
+    """A class to generate summaries using a causal language model.
+
+    Attributes:
+        tokenizer (AutoTokenizer): Tokenizer for the model.
+        model (AutoModelForCausalLM): The causal language model.
+        summaries_df (DataFrame): DataFrame to store generated summaries.
+        revision (str): Model revision.
+        avg_length (float): Average length of summaries.
+        answer_rate (float): Rate of non-empty summaries.
+    """
+
     def __init__(self, model_id, revision):
+        """
+        Initializes the SummaryGenerator with a model.
+
+        Args:
+            model_id (str): Identifier for the model.
+            revision (str): Revision of the model.
+        """
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(model_id, revision)
+            self.model = AutoModelForCausalLM.from_pretrained(model_id, revision)
+        except Exception as e:
+            logging.error(f"Error initializing model with id {model_id} and revision {revision}: {e}")
+            raise ModelLoadingException(model_id, revision) from e
         self.summaries_df = pd.DataFrame()
         self.revision = revision
         self.avg_length = None
         self.answer_rate = None
+        self.error_rate = None
 
     def generate_summaries(self, df):
+        """Generate summaries for a given DataFrame of source docs.
+
+        Args:
+            df (DataFrame): DataFrame containing source docs.
+
+        Returns:
+            summaries_df (DataFrame): Generated summaries by the model.
+        """
         source, summary, dataset = [], [], []
 
+        error_count = 0
         for index, row in df.iterrows():
             _source = row['text']
             _dataset = row['dataset']
 
-            prompt = generate_prompt(_source)
-            inputs = self.tokenizer(prompt, return_tensors='pt', max_length=1024,
+            prompt = util.generate_prompt(_source)
+            inputs = self.tokenizer(prompt, return_tensors='pt', max_length=1024,
+                                    revision=self.revision)
             try:
-                outputs = self.model.generate(**inputs, max_new_tokens=1024, do_sample=False,
+                outputs = self.model.generate(**inputs, max_new_tokens=1024, do_sample=False,
+                                              temperature=0.0, revision=self.revision)
+                response = self.tokenizer.decode(outputs[0], skip_special_tokens=True,
+                                                 revision=self.revision)
             except Exception as e:
                 print(f"Error at index {index}: {e}")
                 response = ""
+                error_count += 1
+
             summary.append(response)
             source.append(_source)
             dataset.append(_dataset)
 
-        self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
+        self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
+                                         columns=["source", "summary", "dataset"])
         self._compute_avg_length()
         self._compute_answer_rate()
+        self._compute_error_rate(error_count)
 
         return self.summaries_df
 
     def _compute_avg_length(self):
+        """
+        Compute the average length of non-empty summaries using SpaCy.
+        """
         total_words = 0
         count = 0
 
         for summary in self.summaries_df['summary']:
             if summary != "":
+                doc = nlp(summary)
+                words = [token.text for token in doc if token.is_alpha]
                 total_words += len(words)
                 count += 1
 
         self.avg_length = 0 if count == 0 else total_words / count
 
     def _compute_answer_rate(self):
+        """
+        Compute the rate of non-empty summaries.
+        """
+        non_empty_count = sum(1 for summary in self.summaries_df['summary'] if summary)
         total_rows = len(self.summaries_df)
 
         self.answer_rate = 0 if total_rows == 0 else non_empty_count / total_rows
 
+    def _compute_error_rate(self, count):
+        """
+        Compute the error rate of summaries.
+        """
+        total_rows = len(self.summaries_df)
+
+        self.error_rate = 0 if total_rows == 0 else count / total_rows
+
+
 class EvaluationModel:
+    """A class to evaluate generated summaries.
+
+    Attributes:
+        model (CrossEncoder): The evaluation model.
+        scores (list): List of evaluation scores.
+        accuracy (float): Accuracy of the summaries.
+        hallucination_rate (float): Rate of hallucination in summaries.
+    """
+
     def __init__(self, model_path):
+        """
+        Initializes the EvaluationModel with a CrossEncoder model.
+
+        Args:
+            model_path (str): Path to the CrossEncoder model.
+        """
         self.model = load_evaluation_model(model_path)
         self.scores = []
         self.accuracy = None
         self.hallucination_rate = None
 
     def evaluate_hallucination(self, summaries_df):
+        """
+        Evaluate the hallucination rate in summaries. This method updates the 'scores' attribute
+        of the instance with the computed scores.
+
+        Args:
+            summaries_df (DataFrame): DataFrame containing source docs and summaries.
+
+        Returns:
+            list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
+        """
+        source_docs = np.array(summaries_df['source'])
+        generated_summaries = np.array(summaries_df['summary'])
+        try:
+            scores = self.model.predict(source_docs, generated_summaries)
+            self.scores = scores
+            return self.scores
+        except Exception as e:
+            logging.error(f"Error evaluating hallucination: {e}")
+            raise
 
-        return self.scores
-
+    def compute_accuracy(self, threshold=0.5):
+        """
+        Compute the accuracy of the evaluated summaries based on the previously calculated scores.
+        This method relies on the 'scores' attribute being populated, typically via the
+        'evaluate_hallucination' method.
+
+        Returns:
+            float: Accuracy percentage. Also updates the 'accuracy' and 'hallucination_rate'
+            attributes of the instance.
+
+        Raises:
+            ValueError: If scores have not been calculated prior to calling this method.
+        """
         if not self.scores:
+            error_msg = "Scores not calculated. Call evaluate_hallucination() first."
+            logging.error(error_msg)
+            raise ValueError(error_msg)
 
         # Use threshold of 0.5 to compute accuracy
-        num_above_threshold = sum(score >=
+        num_above_threshold = sum(score >= threshold for score in self.scores)
         num_total = len(self.scores)
 
-        if num_total
+        if not num_total:
             raise ValueError("No scores available to compute accuracy.")
 
         self.accuracy = (num_above_threshold / num_total) * 100
         self.hallucination_rate = 100 - self.accuracy
 
-        return self.accuracy
+        return self.accuracy
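One place that may be worth checking for the submission-time error noted in the commit message: in sentence-transformers, CrossEncoder.predict takes a single list of (premise, hypothesis) pairs rather than two parallel arrays, and revision is a from_pretrained argument rather than a tokenizer()/generate() argument. The sketch below shows only the pairing convention; it is not code from this commit, and the example texts are made up.

import pandas as pd
from sentence_transformers import CrossEncoder

# Tiny illustrative frame in the shape SummaryGenerator produces.
summaries_df = pd.DataFrame({
    "source": ["The cat sat on the mat."],
    "summary": ["A cat was sitting on a mat."],
})

model = CrossEncoder("vectara/hallucination_evaluation_model")
pairs = list(zip(summaries_df["source"], summaries_df["summary"]))  # list of (source, summary) pairs
scores = model.predict(pairs)  # one consistency score per pair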
src/backend/run_eval_suite.py
CHANGED
@@ -3,43 +3,56 @@ import os
 import logging
 from datetime import datetime
 
-from evaluate_model import Evaluator
-
-from src.envs import RESULTS_REPO, API
+import src.envs as envs
 from src.backend.manage_requests import EvalRequest
+from src.backend.evaluate_model import Evaluator
 
+# Configure logging
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(levelname)s - %(message)s')
 logging.getLogger("openai").setLevel(logging.WARNING)
 
-def run_evaluation(eval_request: EvalRequest, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
-    if limit:
-        print(
-            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
-        )
 
+def run_evaluation(eval_request: EvalRequest, batch_size, device,
+                   local_dir: str, results_repo: str, no_cache=True, limit=None):
+    """
+    Run the evaluation for a given model and upload the results.
+
+    Args:
+        eval_request (EvalRequest): The evaluation request object containing model details.
+        num_fewshot (int): Number of few-shot examples.
+        batch_size (int): Batch size for processing.
+        device (str): The device to run the evaluation on.
+        local_dir (str): Local directory path for saving results.
+        results_repo (str): Repository ID where results will be uploaded.
+        no_cache (bool): Whether to disable caching.
+        limit (int, optional): Limit on the number of items to process. Use with caution.
+
+    Returns:
+        dict: A dictionary containing evaluation results.
+    """
+    if limit:
+        logging.warning("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
 
+    try:
+        evaluator = Evaluator(eval_request.model, eval_request.revision, eval_request.precision,
+                              batch_size, device, no_cache, limit, write_out=True,
+                              output_base_path='logs')
+        results = evaluator.evaluate()
+    except Exception as e:
+        logging.error(f"Error during evaluation: {e}")
+        raise
 
     dumped = json.dumps(results, indent=2)
+    logging.info(dumped)
 
-    output_path = os.path.join(local_dir, *eval_request.model.split("/"),
+    output_path = os.path.join(local_dir, *eval_request.model.split("/"),
+                               f"results_{datetime.now()}.json")
     os.makedirs(os.path.dirname(output_path), exist_ok=True)
     with open(output_path, "w") as f:
         f.write(dumped)
 
-    API.upload_file(
+    envs.API.upload_file(
         path_or_fileobj=output_path,
         path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
         repo_id=results_repo,
src/backend/sort_queue.py
CHANGED
@@ -1,4 +1,3 @@
-import re
 from dataclasses import dataclass
 
 from huggingface_hub import HfApi
@@ -25,4 +24,4 @@ def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
     return sorted(eval_requests, key=lambda x: x.params, reverse=False)
 
 def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
+    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
src/backend/util.py
CHANGED
@@ -1,18 +1,41 @@
-import pandas as pd
-
-def load_dataframe(data_path):
-    df = pd.read_csv(data_path)
-    return df
-
-def format_results(hallucination_scores, model_name, revision, precision, accuracy, hallucination_rate, answer_rate, avg_summary_len):
-    # Define the structure of the results (JSON)
+def generate_prompt(source_passage: str) -> str:
+    """
+    Generates a prompt for a chatbot to summarize a given passage.
+
+    Args:
+        source_passage (str): The passage to be summarized.
+
+    Returns:
+        str: A formatted prompt string for the chatbot.
+    """
+    if not source_passage:
+        raise ValueError("Source passage is empty.")
+
+    return f"""You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided.
+    You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described:'
+    Passage:\n {source_passage}
+    """
+
+
+def format_results(model_name: str, revision: str, precision: str, accuracy: float,
+                   hallucination_rate: float, answer_rate: float, avg_summary_len: float,
+                   error_rate: float) -> dict:
+    """
+    Formats the evaluation results into a structured dictionary.
+
+    Args:
+        model_name (str): The name of the evaluated model.
+        revision (str): The revision hash of the model.
+        precision (str): The precision with which the evaluation was run.
+        accuracy (float): The accuracy score from the evaluation.
+        hallucination_rate (float): The hallucination rate from the evaluation.
+        answer_rate (float): The answer rate from the evaluation.
+        avg_summary_len (float): The average summary length from the evaluation.
+        error_rate (float): The rate at which errors occurred during summary generation.
+
+    Returns:
+        dict: A dictionary containing the structured evaluation results.
+    """
     results = {
         "config": {
             "model_dtype": precision,  # Precision with which you ran the evaluation
@@ -20,13 +43,22 @@ def format_results(hallucination_scores, model_name, revision, precision, accuracy, hallucination_rate, answer_rate, avg_summary_len):
             "model_sha": revision  # Hash of the model
         },
         "results": {
+            "accuracy": {
+                "accuracy": accuracy
+            },
+            "hallucination_rate": {
+                "hallucination_rate": hallucination_rate
+            },
+            "answer_rate": {
+                "answer_rate": answer_rate
+            },
+            "average_summary_length": {
+                "average_summary_length": avg_summary_len
+            },
+            "error_rate": {
+                "error_rate": error_rate
+            }
         }
     }
-
+
+    return results
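For reference, the dictionary returned by the new format_results (and serialized to a results_<timestamp>.json file by run_eval_suite.run_evaluation) has the shape sketched below; the numeric values are made up and the model fields are placeholders, not taken from any real run.

# Illustrative only: shape of the dict returned by format_results (values are invented).
example_results = {
    "config": {
        "model_dtype": "float16",
        "model_name": "some-org/some-model",  # placeholder
        "model_sha": "main",
    },
    "results": {
        "accuracy": {"accuracy": 97.3},
        "hallucination_rate": {"hallucination_rate": 2.7},
        "answer_rate": {"answer_rate": 1.0},
        "average_summary_length": {"average_summary_length": 74.2},
        "error_rate": {"error_rate": 0.0},
    },
}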
src/display/about.py
CHANGED
@@ -1,20 +1,23 @@
 from dataclasses import dataclass
 from enum import Enum
 
+@dataclass
+class Task:
+    benchmark: str
+    metric: str
+    col_name: str
+
+
+class Tasks(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    accuracy = Task("accuracy", "accuracy", "Accuracy")
+    hallucination_rate = Task("hallucination_rate",
+                              "hallucination_rate", "Hallucination Rate")
+    answer_rate = Task("answer_rate", "answer_rate", "Answer Rate")
+    average_summary_length = Task("average_summary_length",
+                                  "average_summary_length", "Average Summary Length")
+    error_rate = Task("error_rate", "error_rate", "Error Rate")
+
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">Hughes Hallucination Evaluation Model leaderboard</h1>"""
 
@@ -24,7 +27,7 @@ This Leaderboard evaluates how much easy LLM hallucinates in factual summarizati
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT =
+LLM_BENCHMARKS_TEXT = """
 ## How it works
 
 ## Reproducibility
src/display/css_html_js.py
CHANGED
@@ -33,7 +33,7 @@ custom_css = """
     background: none;
     border: none;
 }
-
+
 #search-bar {
     padding: 0px;
 }
src/display/utils.py
CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-
+from src.display.about import Tasks
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -24,16 +24,27 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent,
+auto_eval_column_dict.append(["model_type_symbol", ColumnContent,
+                              ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent,
+                              ColumnContent("Model", "markdown", True, never_hidden=True)])
+for task in Tasks:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# # Accuracy
+# auto_eval_column_dict.append(["accuracy", ColumnContent,
+#                               ColumnContent("Accuracy", "number", True)])
+# # Hallucination Rate
+# auto_eval_column_dict.append(["hallucination_rate", ColumnContent,
+#                               ColumnContent("Hallucination Rate", "number", True)])
+# # Answer Rate
+# auto_eval_column_dict.append(["answer_rate", ColumnContent,
+#                               ColumnContent("Answer Rate", "number", True)])
+# # Average Summary Length
+# auto_eval_column_dict.append(["average_summary_length", ColumnContent,
+#                               ColumnContent("Average Summary Length", "number", True)])
+# # Error Rate
+# auto_eval_column_dict.append(["error_rate", ColumnContent,
+#                               ColumnContent("Error Rate", "number", True)])
 
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
@@ -126,7 +137,7 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [
+BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
src/envs.py
CHANGED
@@ -2,7 +2,7 @@ import os
 
 from huggingface_hub import HfApi
 
-# replace this with our token
+# replace this with our token
 TOKEN = os.environ.get("HF_TOKEN", None)
 
 OWNER = "vectara"
@@ -15,8 +15,11 @@ CACHE_PATH=os.getenv("HF_HOME", ".")
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
+DEVICE = "cpu"
 API = HfApi(token=TOKEN)
 
-SOURCE_PATH = "/datasets/
-HEM_PATH = 'vectara/hallucination_evaluation_model'
+SOURCE_PATH = "src/datasets/leaderboard_dataset.csv"
+HEM_PATH = 'vectara/hallucination_evaluation_model'
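Note: with the backend now owning its own cache directories, a rough sketch of how they could be kept in sync with the Hub datasets; this mirrors the snapshot_download calls already used in app.py and is an assumption about the backend flow, not code from this commit:

    from huggingface_hub import snapshot_download

    import src.envs as envs

    # Backend-side copies of the queue and results datasets, kept separate from the UI caches.
    snapshot_download(repo_id=envs.QUEUE_REPO, local_dir=envs.EVAL_REQUESTS_PATH_BACKEND,
                      repo_type="dataset", etag_timeout=30)
    snapshot_download(repo_id=envs.RESULTS_REPO, local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
                      repo_type="dataset", etag_timeout=30)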
src/leaderboard/read_evals.py
CHANGED
@@ -1,33 +1,32 @@
 import glob
 import json
-import math
 import os
 from dataclasses import dataclass
 
-import dateutil
 import numpy as np
+import dateutil
 
+import src.display.formatting as formatting
+import src.display.utils as utils
+import src.submission.check_validity as check_validity
 
 
 @dataclass
 class EvalResult:
-    eval_name: str
-    full_model: str
-    org: str
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
     model: str
-    revision: str
+    revision: str  # commit hash, "" if main
     results: dict
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown
-    weight_type: WeightType = WeightType.Original
-    architecture: str = "Unknown"
+    precision: utils.Precision = utils.Precision.Unknown
+    model_type: utils.ModelType = utils.ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: utils.WeightType = utils.WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = ""
+    date: str = ""  # submission date of request file
     still_on_hub: bool = False
 
     @classmethod
@@ -39,42 +38,38 @@ class EvalResult:
         config = data.get("config")
 
         # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
+        precision = utils.Precision.from_str(config.get("model_dtype"))
 
         # Get model and org
+        full_model = config.get("model_name", config.get("model_args", None))
+        org, model = full_model.split("/", 1) if "/" in full_model else (None, full_model)
 
-        if
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
+        if org:
             result_key = f"{org}_{model}_{precision.value.name}"
+        else:
+            result_key = f"{model}_{precision.value.name}"
 
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True,
-        if model_config
+        still_on_hub, _, model_config = check_validity.is_model_on_hub(
+            full_model, config.get("model_sha", "main"), trust_remote_code=True,
+            test_tokenizer=False)
+
+        if model_config:
+            architecture = ";".join(getattr(model_config, "architectures", ["?"]))
+        else:
+            architecture = "?"
 
         # Extract results available in this file (some results are split in several files)
+        results = {}
+        for task in utils.Tasks:
+            task = task.value
+
+            # We average all scores of a given metric (not all metrics are present in all files)
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            if accs.size == 0 or any([acc is None for acc in accs]):
+                continue
+
+            mean_acc = np.mean(accs) * 100.0
+            results[task.benchmark] = mean_acc
 
         return self(
             eval_name=result_key,
@@ -82,7 +77,7 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
+            precision=precision,
            revision= config.get("model_sha", ""),
            still_on_hub=still_on_hub,
            architecture=architecture
@@ -90,47 +85,44 @@ class EvalResult:
 
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model,
+        request_file = get_request_file_for_model(requests_path, self.full_model,
+                                                  self.precision.value.name)
 
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
+            self.model_type = utils.ModelType.from_str(request.get("model_type", ""))
+            self.weight_type = utils.WeightType[request.get("weight_type", "Original")]
             self.license = request.get("license", "?")
             self.likes = request.get("likes", 0)
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
-        except
+        except FileNotFoundError:
             print(f"Could not find request file for {self.org}/{self.model}")
+        except json.JSONDecodeError:
+            print(f"Error decoding JSON in request file for {self.org}/{self.model}")
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        accuracy = self.results.get("Accuracy", None)
+
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.dummy.name: self.full_model,
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            utils.AutoEvalColumn.precision.name: self.precision.value.name,
+            utils.AutoEvalColumn.model_type.name: self.model_type.value.name,
+            utils.AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+            utils.AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            utils.AutoEvalColumn.architecture.name: self.architecture,
+            utils.AutoEvalColumn.model.name: formatting.make_clickable_model(self.full_model),
+            utils.AutoEvalColumn.dummy.name: self.full_model,
+            utils.AutoEvalColumn.revision.name: self.revision,
+            utils.AutoEvalColumn.license.name: self.license,
+            utils.AutoEvalColumn.likes.name: self.likes,
+            utils.AutoEvalColumn.params.name: self.num_params,
+            utils.AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
-        data_dict["Answer Rate"] = self.results.get("Answer Rate", None)
-        data_dict["Average Summary Length"] = self.results.get("Average Summary Length", None)
+
+        for task in utils.Tasks:
+            data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
         return data_dict
 
@@ -163,7 +155,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
 
    for root, _, files in os.walk(results_path):
        # We should only have json files in model results
-        if
+        if not files or any([not f.endswith(".json") for f in files]):
            continue
 
        # Sort the files by date
@@ -172,8 +164,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
        except dateutil.parser._parser.ParserError:
            files = [files[-1]]
 
-        for file in files
-            model_result_filepaths.append(os.path.join(root, file))
+        model_result_filepaths.extend([os.path.join(root, file) for file in files])
 
    eval_results = {}
    for model_result_filepath in model_result_filepaths:
@@ -184,7 +175,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
        # Store results of same eval together
        eval_name = eval_result.eval_name
        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in
+            eval_results[eval_name].results.update({k: v for k, v in
+                                                    eval_result.results.items() if v is not None})
        else:
            eval_results[eval_name] = eval_result
 
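Note: for orientation, the smallest results payload that init_from_json_file above can parse. Only the keys are grounded in the code (they follow format_results in src/backend/util.py); the names and numbers are invented for illustration:

    example_result = {
        "config": {
            "model_dtype": "float16",
            "model_name": "demo-org/demo-model",  # split into org and model
            "model_sha": "0123abc"                # revision hash
        },
        "results": {
            "accuracy": {"accuracy": 0.97},
            "hallucination_rate": {"hallucination_rate": 0.03}
        }
    }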
src/populate.py
CHANGED
@@ -3,21 +3,21 @@ import os
 
 import pandas as pd
 
+import src.display.formatting as formatting
+import src.display.utils as utils
+import src.leaderboard.read_evals as read_evals
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    raw_data = get_raw_eval_results(results_path, requests_path)
+    raw_data = read_evals.get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.accuracy.name], ascending=False)
+    df = df.sort_values(by=[utils.AutoEvalColumn.accuracy.name], ascending=False)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+    df = df[formatting.has_no_nan_values(df, benchmark_cols)]
     return raw_data, df
 
 
@@ -31,8 +31,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             with open(file_path) as fp:
                 data = json.load(fp)
 
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+            data[utils.EvalQueueColumn.model.name] = formatting.make_clickable_model(data["model"])
+            data[utils.EvalQueueColumn.revision.name] = data.get("revision", "main")
 
             all_evals.append(data)
         elif ".md" not in entry:
@@ -43,8 +43,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                 with open(file_path) as fp:
                     data = json.load(fp)
 
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+                data[utils.EvalQueueColumn.model.name] = formatting.make_clickable_model(data["model"])
+                data[utils.EvalQueueColumn.revision.name] = data.get("revision", "main")
                 all_evals.append(data)
 
     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
src/submission/check_validity.py
CHANGED
@@ -1,8 +1,6 @@
 import json
 import os
-import re
 from collections import defaultdict
-from datetime import datetime, timedelta, timezone
 
 import huggingface_hub
 from huggingface_hub import ModelCard
@@ -37,11 +35,11 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
     try:
         config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
         if test_tokenizer:
-            tokenizer_config = get_tokenizer_config(model_name)
+            tokenizer_config = get_tokenizer_config(model_name)
             if tokenizer_config is not None:
                 tokenizer_class_candidate = tokenizer_config.get("tokenizer_class", None)
             else:
-                tokenizer_class_candidate = config.tokenizer_class
+                tokenizer_class_candidate = config.tokenizer_class
 
 
             tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
src/submission/submit.py
CHANGED
@@ -2,14 +2,10 @@ import json
 import os
 from datetime import datetime, timezone
 
-    check_model_card,
-    get_model_size,
-    is_model_on_hub,
-)
+import src.display.formatting as formatting
+import src.envs as envs
+import src.submission.check_validity as check_validity
+
 
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
@@ -25,7 +21,7 @@ def add_new_eval(
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
     if not REQUESTED_MODELS:
-        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
+        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = check_validity.already_submitted_models(envs.EVAL_REQUESTS_PATH)
 
     user_name = ""
     model_path = model
@@ -37,7 +33,7 @@ def add_new_eval(
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 
     if model_type is None or model_type == "":
-        return styled_error("Please select a model type.")
+        return formatting.styled_error("Please select a model type.")
 
     # Does the model actually exist?
     if revision == "":
@@ -45,32 +41,32 @@ def add_new_eval(
 
     # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
+        base_model_on_hub, error, _ = check_validity.is_model_on_hub(model_name=base_model, revision=revision, token=envs.TOKEN, test_tokenizer=True)
         if not base_model_on_hub:
-            return styled_error(f'Base model "{base_model}" {error}')
+            return formatting.styled_error(f'Base model "{base_model}" {error}')
 
     if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
+        model_on_hub, error, _ = check_validity.is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
         if not model_on_hub:
-            return styled_error(f'Model "{model}" {error}')
+            return formatting.styled_error(f'Model "{model}" {error}')
 
     # Is the model info correctly filled?
     try:
-        model_info = API.model_info(repo_id=model, revision=revision)
+        model_info = envs.API.model_info(repo_id=model, revision=revision)
     except Exception:
-        return styled_error("Could not get your model information. Please fill it up properly.")
+        return formatting.styled_error("Could not get your model information. Please fill it up properly.")
 
-    model_size = get_model_size(model_info=model_info, precision=precision)
+    model_size = check_validity.get_model_size(model_info=model_info, precision=precision)
 
     # Were the model card and license filled?
     try:
         license = model_info.cardData["license"]
     except Exception:
-        return styled_error("Please select a license for your model")
+        return formatting.styled_error("Please select a license for your model")
 
-    modelcard_OK, error_msg = check_model_card(model)
+    modelcard_OK, error_msg = check_validity.check_model_card(model)
     if not modelcard_OK:
-        return styled_error(error_msg)
+        return formatting.styled_error(error_msg)
 
     # Seems good, creating the eval
     print("Adding new eval")
@@ -91,11 +87,11 @@ def add_new_eval(
 
     # Check for duplicate submission
     if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
-        return styled_warning("This model has been already submitted.")
+        return formatting.styled_warning("This model has been already submitted.")
 
     print("Creating eval file")
 
-    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
+    OUT_DIR = f"{envs.EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
     out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
 
@@ -103,10 +99,10 @@ def add_new_eval(
         f.write(json.dumps(eval_entry))
 
     print("Uploading eval file")
-    API.upload_file(
+    envs.API.upload_file(
         path_or_fileobj=out_path,
         path_in_repo=out_path.split("eval-queue/")[1],
-        repo_id=QUEUE_REPO,
+        repo_id=envs.QUEUE_REPO,
         repo_type="dataset",
         commit_message=f"Add {model} to eval queue",
     )
@@ -114,6 +110,6 @@ def add_new_eval(
     # Remove the local file
     os.remove(out_path)
 
-    return styled_message(
+    return formatting.styled_message(
         "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
     )
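Note: a sketch of the request file this function writes and that manage_requests/update_with_request_file later read back. The eval_entry fields are not shown in this hunk, so the values below (and any field not referenced elsewhere in the diff) are assumptions:

    # Hypothetical request-file content; the path pattern comes from out_path above:
    #   {EVAL_REQUESTS_PATH}/{user_name}/{model_path}_eval_request_False_{precision}_{weight_type}.json
    eval_entry_example = {
        "model": "demo-org/demo-model",
        "base_model": "",
        "revision": "main",
        "precision": "float16",
        "weight_type": "Original",
        "model_type": "pretrained",
        "status": "PENDING",
        "submitted_time": "2024-01-01T00:00:00Z",
        "license": "apache-2.0",
        "likes": 0,
        "params": 7,
        "private": False,
    }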
tests/test_evaluate_model.py
ADDED
@@ -0,0 +1,87 @@
+import unittest
+from unittest.mock import patch
+
+import pandas as pd
+
+import src.backend.evaluate_model as evaluate_model
+import src.envs as envs
+
+
+class TestEvaluator(unittest.TestCase):
+
+    def setUp(self):
+        self.model_name = 'test_model'
+        self.revision = 'test_revision'
+        self.precision = 'test_precision'
+        self.batch_size = 10
+        self.device = 'test_device'
+        self.no_cache = False
+        self.limit = 10
+
+    @patch('src.backend.evaluate_model.SummaryGenerator')
+    @patch('src.backend.evaluate_model.EvaluationModel')
+    def test_evaluator_initialization(self, mock_eval_model, mock_summary_generator):
+        evaluator = evaluate_model.Evaluator(self.model_name, self.revision,
+                                             self.precision, self.batch_size,
+                                             self.device, self.no_cache, self.limit)
+
+        mock_summary_generator.assert_called_once_with(self.model_name, self.revision)
+        mock_eval_model.assert_called_once_with(envs.HEM_PATH)
+        self.assertEqual(evaluator.model, self.model_name)
+
+    @patch('src.backend.evaluate_model.EvaluationModel')
+    @patch('src.backend.evaluate_model.SummaryGenerator')
+    def test_evaluator_initialization_error(self, mock_summary_generator, mock_eval_model):
+        mock_eval_model.side_effect = Exception('test_exception')
+        with self.assertRaises(Exception):
+            evaluate_model.Evaluator(self.model_name, self.revision,
+                                     self.precision, self.batch_size,
+                                     self.device, self.no_cache, self.limit)
+
+    @patch('src.backend.evaluate_model.SummaryGenerator')
+    @patch('src.backend.evaluate_model.EvaluationModel')
+    @patch('src.backend.evaluate_model.pd.read_csv')
+    @patch('src.backend.util.format_results')
+    def test_evaluate_method(self, mock_format_results, mock_read_csv, mock_eval_model,
+                             mock_summary_generator):
+        evaluator = evaluate_model.Evaluator(self.model_name, self.revision,
+                                             self.precision, self.batch_size,
+                                             self.device, self.no_cache, self.limit)
+
+        # Mock setup
+        mock_format_results.return_value = {'test': 'result'}
+        mock_read_csv.return_value = pd.DataFrame({'column1': ['data1', 'data2']})
+        mock_summary_generator.return_value.generate_summaries.return_value = pd.DataFrame({'column1': ['summary1', 'summary2']})
+        mock_summary_generator.return_value.avg_length = 100
+        mock_summary_generator.return_value.answer_rate = 1.0
+        mock_summary_generator.return_value.error_rate = 0.0
+        mock_eval_model.return_value.compute_accuracy.return_value = 1.0
+        mock_eval_model.return_value.hallucination_rate = 0.0
+        mock_eval_model.return_value.evaluate_hallucination.return_value = [0.5]
+
+        # Method call and assertions
+        results = evaluator.evaluate()
+        mock_format_results.assert_called_once_with(model_name=self.model_name,
+                                                    revision=self.revision,
+                                                    precision=self.precision,
+                                                    accuracy=1.0, hallucination_rate=0.0,
+                                                    answer_rate=1.0, avg_summary_len=100,
+                                                    error_rate=0.0)
+        mock_read_csv.assert_called_once_with(envs.SOURCE_PATH)
+
+    @patch('src.backend.evaluate_model.SummaryGenerator')
+    @patch('src.backend.evaluate_model.EvaluationModel')
+    @patch('src.backend.evaluate_model.pd.read_csv')
+    def test_evaluate_with_file_not_found(self, mock_read_csv, mock_eval_model,
+                                          mock_summary_generator):
+        mock_read_csv.side_effect = FileNotFoundError('test_exception')
+        evaluator = evaluate_model.Evaluator(self.model_name, self.revision,
+                                             self.precision, self.batch_size,
+                                             self.device, self.no_cache, self.limit)
+
+        with self.assertRaises(FileNotFoundError):
+            evaluator.evaluate()
+
+
+if __name__ == '__main__':
+    unittest.main()
tests/test_evaluator.py
ADDED
@@ -0,0 +1,59 @@
+import unittest
+from unittest.mock import patch
+
+import pandas as pd
+
+import src.backend.model_operations as model_operations
+
+
+class TestEvaluator(unittest.TestCase):
+
+    def setUp(self):
+        self.model_path = "test_model"
+
+    @patch("src.backend.model_operations.load_evaluation_model")
+    def test_init(self, mock_load_evaluation_model):
+        model_operations.EvaluationModel(self.model_path)
+        mock_load_evaluation_model.assert_called_once_with(self.model_path)
+
+    @patch("src.backend.model_operations.load_evaluation_model")
+    def test_evaluate_hallucination(self, mock_load_evaluation_model):
+        model = model_operations.EvaluationModel(self.model_path)
+        df = pd.DataFrame({'source': ['source1', 'source2'], 'summary': ['summary1', 'summary2']})
+
+        mock_load_evaluation_model.return_value.predict.return_value = [0.8, 0.2]
+
+        scores = model.evaluate_hallucination(df)
+        self.assertEqual(scores, [0.8, 0.2])
+
+    @patch("src.backend.model_operations.load_evaluation_model")
+    def test_evaluate_hallucination_exception(self, mock_load_evaluation_model):
+        model = model_operations.EvaluationModel(self.model_path)
+        df = pd.DataFrame({'source': ['source1', 'source2'], 'summary': ['summary1', 'summary2']})
+
+        mock_load_evaluation_model.return_value.predict.side_effect = Exception("Test exception")
+
+        with self.assertRaises(Exception):
+            scores = model.evaluate_hallucination(df)
+
+    @patch("src.backend.model_operations.load_evaluation_model")
+    def test_compute_accuracy(self, mock_load_evaluation_model):
+        model = model_operations.EvaluationModel(self.model_path)
+        model.scores = [0.8, 0.2]
+
+        accuracy = model.compute_accuracy()
+        expected_accuracy = 50.0
+        self.assertEqual(accuracy, expected_accuracy)
+
+
+class TestLoadEvaluationModel(unittest.TestCase):
+
+    @patch("src.backend.model_operations.CrossEncoder")
+    def test_load_evaluation_model(self, mock_cross_encoder):
+        model_path = 'test_model_path'
+        model_operations.load_evaluation_model(model_path)
+        mock_cross_encoder.assert_called_once_with(model_path)
+
+
+if __name__ == '__main__':
+    unittest.main()
tests/test_main_backend.py
ADDED
@@ -0,0 +1,54 @@
+import unittest
+from unittest.mock import patch
+
+import main_backend
+import src.backend.manage_requests as manage_requests
+
+
+class TestMainBackend(unittest.TestCase):
+
+    @patch('src.backend.manage_requests.check_completed_evals')
+    @patch('src.backend.manage_requests.get_eval_requests')
+    @patch('src.backend.sort_queue.sort_models_by_priority')
+    @patch('src.backend.manage_requests.set_eval_request')
+    @patch('src.backend.run_eval_suite.run_evaluation')
+    def test_run_auto_eval_with_pending_requests(self, mock_run_evaluation, mock_set_eval_request,
+                                                 mock_sort_models_by_priority, mock_get_eval_requests,
+                                                 mock_check_completed_evals):
+        mock_sort_models_by_priority.return_value = [manage_requests.EvalRequest(
+            model="test_model",
+            private=True,
+            status="PENDING",
+            json_filepath="test_filepath",
+            weight_type="test_weight_type",
+            precision="test_precision",
+            base_model="test_base_model",
+            revision="test_revision",
+        )]
+
+        main_backend.run_auto_eval()
+
+        # Assertions
+        mock_check_completed_evals.assert_called()
+        mock_get_eval_requests.assert_called()
+        mock_sort_models_by_priority.assert_called()
+        mock_set_eval_request.assert_called()
+        mock_run_evaluation.assert_called()
+
+    @patch('builtins.print')
+    @patch('src.backend.manage_requests.check_completed_evals')
+    @patch('src.backend.manage_requests.get_eval_requests')
+    def test_run_auto_eval_with_no_pending_requests(self, mock_get_eval_requests,
+                                                    mock_check_completed_evals, mock_print):
+        mock_get_eval_requests.return_value = []
+
+        main_backend.run_auto_eval()
+
+        # Assertions
+        mock_check_completed_evals.assert_called()
+        mock_get_eval_requests.assert_called()
+        mock_print.assert_any_call("No eval requests found. Exiting.")
+
+
+if __name__ == "__main__":
+    unittest.main()
tests/test_summary_generator.py
ADDED
@@ -0,0 +1,68 @@
+import unittest
+from unittest.mock import patch
+
+import pandas as pd
+
+import src.backend.evaluate_model as evaluate_model
+
+
+class TestSummaryGenerator(unittest.TestCase):
+
+    def setUp(self):
+        self.model_id = "test_model"
+        self.revision = "test_revision"
+
+    @patch("src.backend.model_operations.AutoTokenizer")
+    @patch("src.backend.model_operations.AutoModelForCausalLM")
+    def test_init(self, mock_model, mock_tokenizer):
+        evaluate_model.SummaryGenerator(self.model_id, self.revision)
+        mock_tokenizer.from_pretrained.assert_called_once_with(self.model_id,
+                                                               self.revision)
+        mock_model.from_pretrained.assert_called_once_with(self.model_id,
+                                                           self.revision)
+
+    @patch("src.backend.model_operations.nlp")
+    @patch("src.backend.model_operations.AutoTokenizer")
+    @patch("src.backend.model_operations.AutoModelForCausalLM")
+    def test_generate_summaries(self, mock_model, mock_tokenizer, mock_nlp):
+        df = pd.DataFrame({'text': ['text1', 'text2'],
+                           'dataset': ['dataset1', 'dataset2']})
+
+        generator = evaluate_model.SummaryGenerator(self.model_id, self.revision)
+        generator.generate_summaries(df)
+
+        self.assertEqual(len(generator.summaries_df), len(df))
+
+    @patch("src.backend.model_operations.AutoTokenizer")
+    @patch("src.backend.model_operations.AutoModelForCausalLM")
+    def test_compute_avg_length(self, mock_model, mock_tokenizer):
+        generator = evaluate_model.SummaryGenerator(self.model_id, self.revision)
+        test_df = pd.DataFrame({'source': ['text'], 'summary': ['This is a test.'],
+                                'dataset': ['dataset']})
+        generator.summaries_df = test_df
+        generator._compute_avg_length()
+        self.assertEqual(generator.avg_length, 4)
+
+    @patch("src.backend.model_operations.AutoTokenizer")
+    @patch("src.backend.model_operations.AutoModelForCausalLM")
+    def test_compute_answer_rate(self, mock_model, mock_tokenizer):
+        generator = evaluate_model.SummaryGenerator(self.model_id, self.revision)
+        test_df = pd.DataFrame({'source': ['text'], 'summary': ['This is a test.'],
+                                'dataset': ['dataset']})
+        generator.summaries_df = test_df
+        generator._compute_answer_rate()
+        self.assertEqual(generator.answer_rate, 1)
+
+    @patch("src.backend.model_operations.AutoTokenizer")
+    @patch("src.backend.model_operations.AutoModelForCausalLM")
+    def test_error_rate(self, mock_model, mock_tokenizer):
+        generator = evaluate_model.SummaryGenerator(self.model_id, self.revision)
+        test_df = pd.DataFrame({'source': ['text'], 'summary': ['This is a test.'],
+                                'dataset': ['dataset']})
+        generator.summaries_df = test_df
+        generator._compute_error_rate(0)
+        self.assertEqual(generator.error_rate, 0)
+
+
+if __name__ == "__main__":
+    unittest.main()
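Note: assuming the tests are run from the repository root so that the src package imports resolve, standard unittest discovery picks up all four new files:

    import unittest

    suite = unittest.defaultTestLoader.discover("tests", pattern="test_*.py")
    unittest.TextTestRunner(verbosity=2).run(suite)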