eval_harness_v043_updates

#10 · opened by meg (HF staff)
app.py CHANGED
@@ -8,8 +8,8 @@ configure_root_logger()
 from functools import partial
 
 import gradio as gr
-from main_backend_lighteval import run_auto_eval
-# from main_backend_harness import run_auto_eval
+# from main_backend_lighteval import run_auto_eval
+from main_backend_harness import run_auto_eval
 from src.display.log_visualizer import log_file_to_html_string
 from src.display.css_html_js import dark_mode_gradio_js
 from src.envs import REFRESH_RATE, REPO_ID, QUEUE_REPO, RESULTS_REPO
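Note: because both backend modules expose a run_auto_eval entry point with the same call shape, switching engines is an import flip and nothing downstream in app.py changes. A minimal sketch of that pattern, purely illustrative (the BACKEND environment toggle below is not part of this PR or the repo):

import os

# Hypothetical startup toggle: choose the eval backend without editing the source.
# Only illustrates why keeping the run_auto_eval name identical across backends
# makes the swap in this PR a two-line diff.
if os.getenv("BACKEND", "harness") == "harness":
    from main_backend_harness import run_auto_eval
else:
    from main_backend_lighteval import run_auto_eval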
main_backend_harness.py CHANGED
@@ -70,9 +70,8 @@ def run_auto_eval():
            num_fewshot=NUM_FEWSHOT,
            local_dir=EVAL_RESULTS_PATH_BACKEND,
            results_repo=RESULTS_REPO,
-           batch_size=1,
+           batch_size="auto",
            device=DEVICE,
-           no_cache=True,
            limit=LIMIT
        )
 
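With lm_eval 0.4.x the harness accepts batch_size="auto" and probes the largest batch that fits on the selected device, so the hard-coded batch_size=1 goes away; no_cache is also no longer forwarded, since 0.4.x's simple_evaluate does not take that argument. A rough sketch of how the call site reads after this change, with placeholder values wherever the diff does not show them (the task list constant here is hypothetical):

run_evaluation(
    eval_request=eval_request,           # pending request popped from the queue
    task_names=TASKS_HARNESS,            # hypothetical constant; the real task list comes from the Space config
    num_fewshot=NUM_FEWSHOT,
    local_dir=EVAL_RESULTS_PATH_BACKEND,
    results_repo=RESULTS_REPO,
    batch_size="auto",                   # let lm_eval pick the largest batch that fits on DEVICE
    device=DEVICE,
    limit=LIMIT,                         # None in production; a small int only while debugging
)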
requirements.txt CHANGED
@@ -5,12 +5,12 @@ huggingface-hub>=0.18.0
 python-dateutil==2.8.2
 requests==2.28.2
 tqdm==4.65.0
-accelerate==0.24.1
+accelerate>=0.26.0
 sentencepiece
 
 # Evaluation suites
 lighteval
-lm_eval
+lm_eval==0.4.3
 
 # Log Visualizer
 BeautifulSoup4==4.12.2
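The new pins match the harness migration below: lm_eval is fixed to 0.4.3 so the TaskManager import exists, and accelerate is raised to >=0.26.0, presumably to satisfy lm_eval 0.4.3's own minimum. An optional sanity check for the built Space, illustrative only:

from importlib.metadata import version

# Confirm the pins above actually resolved in the running environment.
print("lm_eval:", version("lm_eval"))         # expected: 0.4.3
print("accelerate:", version("accelerate"))   # expected: >= 0.26.0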
src/backend/run_eval_suite_harness.py CHANGED
@@ -4,26 +4,29 @@ import logging
 from datetime import datetime
 
 from lm_eval import tasks, evaluator, utils
+from lm_eval.tasks import TaskManager
 
 from src.envs import RESULTS_REPO, API
 from src.backend.manage_requests import EvalRequest
 from src.logging import setup_logger
 
+from typing import Union
+
 logging.getLogger("openai").setLevel(logging.WARNING)
 logger = setup_logger(__name__)
 
-def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int, batch_size: int, device: str, local_dir: str, results_repo: str, no_cache: bool =True, limit: int =None):
+def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int, batch_size: Union[int, str], device: str, local_dir: str, results_repo: str, no_cache: bool =True, limit: int =None):
     """Runs one evaluation for the current evaluation request file, then pushes the results to the hub.
 
     Args:
         eval_request (EvalRequest): Input evaluation request file representation
         task_names (list): Tasks to launch
         num_fewshot (int): Number of few shots to use
-        batch_size (int): Selected batch size
-        device (str): "cpu" or "gpu:0", depending on what you assigned to the space
+        batch_size (int or str): Selected batch size or 'auto'
+        device (str): "cpu" or "cuda:0", depending on what you assigned to the space
         local_dir (str): Where to save the results locally
         results_repo (str): To which repository to upload the results
-        no_cache (bool, optional): Whether to use a cache or not.
+        no_cache (bool, optional): Whether to use a cache or not
         limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
 
     Returns:
@@ -34,21 +37,21 @@ def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int
         "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
     )
 
-    task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
+    task_manager = TaskManager()
+    all_tasks = task_manager.all_tasks
+    task_names = utils.pattern_match(task_names, all_tasks)
 
     logger.info(f"Selected Tasks: {task_names}")
 
     results = evaluator.simple_evaluate(
-        model="hf-causal-experimental", # "hf-causal"
+        model="hf",
         model_args=eval_request.get_model_args(),
         tasks=task_names,
         num_fewshot=num_fewshot,
         batch_size=batch_size,
         device=device,
-        no_cache=no_cache,
         limit=limit,
-        write_out=True,
-        output_base_path="logs"
+        write_out=True # Whether to write out an example document and model input, for checking task integrity
     )
 
     results["config"]["model_dtype"] = eval_request.precision
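Taken together, this file tracks the lm_eval 0.4.x API: the tasks.ALL_TASKS registry is replaced by a TaskManager instance, the model type string "hf-causal-experimental" becomes "hf", and no_cache / output_base_path are dropped because 0.4.x's simple_evaluate no longer accepts them. A standalone sketch of the same call pattern outside the Space (model name, task patterns, and model_args are examples, not what the leaderboard actually runs):

from lm_eval import evaluator, utils
from lm_eval.tasks import TaskManager

task_manager = TaskManager()                      # builds the 0.4.x task registry
requested = ["hellaswag", "arc_*"]                # wildcard patterns, expanded against the registry
task_names = utils.pattern_match(requested, task_manager.all_tasks)

results = evaluator.simple_evaluate(
    model="hf",                                   # replaces "hf-causal-experimental"
    model_args="pretrained=gpt2,dtype=float16",   # example; the Space uses EvalRequest.get_model_args()
    tasks=task_names,
    num_fewshot=0,
    batch_size="auto",
    device="cuda:0",
    limit=10,                                     # debugging only, as the WARNING above says
    write_out=True,
)
print(results["results"])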