eval_harness_v043_updates

#10 · opened by meg (HF staff)
app.py CHANGED
@@ -8,8 +8,8 @@ configure_root_logger()
 from functools import partial
 
 import gradio as gr
-from main_backend_lighteval import run_auto_eval
-# from main_backend_harness import run_auto_eval
+# from main_backend_lighteval import run_auto_eval
+from main_backend_harness import run_auto_eval
 from src.display.log_visualizer import log_file_to_html_string
 from src.display.css_html_js import dark_mode_gradio_js
 from src.envs import REFRESH_RATE, REPO_ID, QUEUE_REPO, RESULTS_REPO
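Note: because both backend modules expose a run_auto_eval entry point with the same call shape, switching engines is an import flip and nothing downstream in app.py changes. A minimal sketch of that pattern, purely illustrative (the BACKEND environment toggle below is not part of this PR or the repo):

import os

# Hypothetical startup toggle: choose the eval backend without editing the source.
# Only illustrates why keeping the run_auto_eval name identical across backends
# makes the swap in this PR a two-line diff.
if os.getenv("BACKEND", "harness") == "harness":
    from main_backend_harness import run_auto_eval
else:
    from main_backend_lighteval import run_auto_eval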
main_backend_harness.py CHANGED
@@ -70,9 +70,8 @@ def run_auto_eval():
            num_fewshot=NUM_FEWSHOT,
            local_dir=EVAL_RESULTS_PATH_BACKEND,
            results_repo=RESULTS_REPO,
-           batch_size=1,
+           batch_size="auto",
            device=DEVICE,
-           no_cache=True,
            limit=LIMIT
        )
 
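With lm_eval 0.4.x the harness accepts batch_size="auto" and probes the largest batch that fits on the selected device, so the hard-coded batch_size=1 goes away; no_cache is also no longer forwarded, since 0.4.x's simple_evaluate does not take that argument. A rough sketch of how the call site reads after this change, with placeholder values wherever the diff does not show them (the task list constant here is hypothetical):

run_evaluation(
    eval_request=eval_request,           # pending request popped from the queue
    task_names=TASKS_HARNESS,            # hypothetical constant; the real task list comes from the Space config
    num_fewshot=NUM_FEWSHOT,
    local_dir=EVAL_RESULTS_PATH_BACKEND,
    results_repo=RESULTS_REPO,
    batch_size="auto",                   # let lm_eval pick the largest batch that fits on DEVICE
    device=DEVICE,
    limit=LIMIT,                         # None in production; a small int only while debugging
)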
requirements.txt CHANGED
@@ -5,12 +5,12 @@ huggingface-hub>=0.18.0
 python-dateutil==2.8.2
 requests==2.28.2
 tqdm==4.65.0
-accelerate==0.24.1
+accelerate>=0.26.0
 sentencepiece
 
 # Evaluation suites
 lighteval
-lm_eval
+lm_eval==0.4.3
 
 # Log Visualizer
 BeautifulSoup4==4.12.2
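The new pins match the harness migration below: lm_eval is fixed to 0.4.3 so the TaskManager import exists, and accelerate is raised to >=0.26.0, presumably to satisfy lm_eval 0.4.3's own minimum. An optional sanity check for the built Space, illustrative only:

from importlib.metadata import version

# Confirm the pins above actually resolved in the running environment.
print("lm_eval:", version("lm_eval"))         # expected: 0.4.3
print("accelerate:", version("accelerate"))   # expected: >= 0.26.0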
src/backend/run_eval_suite_harness.py CHANGED
@@ -4,26 +4,29 @@ import logging
 from datetime import datetime
 
 from lm_eval import tasks, evaluator, utils
+from lm_eval.tasks import TaskManager
 
 from src.envs import RESULTS_REPO, API
 from src.backend.manage_requests import EvalRequest
 from src.logging import setup_logger
 
+from typing import Union
+
 logging.getLogger("openai").setLevel(logging.WARNING)
 logger = setup_logger(__name__)
 
-def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int, batch_size: int, device: str, local_dir: str, results_repo: str, no_cache: bool =True, limit: int =None):
+def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int, batch_size: Union[int, str], device: str, local_dir: str, results_repo: str, no_cache: bool =True, limit: int =None):
     """Runs one evaluation for the current evaluation request file, then pushes the results to the hub.
 
     Args:
         eval_request (EvalRequest): Input evaluation request file representation
         task_names (list): Tasks to launch
         num_fewshot (int): Number of few shots to use
-        batch_size (int): Selected batch size
-        device (str): "cpu" or "gpu:0", depending on what you assigned to the space
+        batch_size (int or str): Selected batch size or 'auto'
+        device (str): "cpu" or "cuda:0", depending on what you assigned to the space
         local_dir (str): Where to save the results locally
         results_repo (str): To which repository to upload the results
-        no_cache (bool, optional): Whether to use a cache or not.
+        no_cache (bool, optional): Whether to use a cache or not
         limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
 
     Returns:
@@ -34,21 +37,21 @@ def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int
         "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
     )
 
-    task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
+    task_manager = TaskManager()
+    all_tasks = task_manager.all_tasks
+    task_names = utils.pattern_match(task_names, all_tasks)
 
     logger.info(f"Selected Tasks: {task_names}")
 
     results = evaluator.simple_evaluate(
-        model="hf-causal-experimental", # "hf-causal"
+        model="hf",
         model_args=eval_request.get_model_args(),
         tasks=task_names,
         num_fewshot=num_fewshot,
         batch_size=batch_size,
         device=device,
-        no_cache=no_cache,
         limit=limit,
-        write_out=True,
-        output_base_path="logs"
+        write_out=True # Whether to write out an example document and model input, for checking task integrity
     )
 
     results["config"]["model_dtype"] = eval_request.precision
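Taken together, this file tracks the lm_eval 0.4.x API: the tasks.ALL_TASKS registry is replaced by a TaskManager instance, the model type string "hf-causal-experimental" becomes "hf", and no_cache / output_base_path are dropped because 0.4.x's simple_evaluate no longer accepts them. A standalone sketch of the same call pattern outside the Space (model name, task patterns, and model_args are examples, not what the leaderboard actually runs):

from lm_eval import evaluator, utils
from lm_eval.tasks import TaskManager

task_manager = TaskManager()                      # builds the 0.4.x task registry
requested = ["hellaswag", "arc_*"]                # wildcard patterns, expanded against the registry
task_names = utils.pattern_match(requested, task_manager.all_tasks)

results = evaluator.simple_evaluate(
    model="hf",                                   # replaces "hf-causal-experimental"
    model_args="pretrained=gpt2,dtype=float16",   # example; the Space uses EvalRequest.get_model_args()
    tasks=task_names,
    num_fewshot=0,
    batch_size="auto",
    device="cuda:0",
    limit=10,                                     # debugging only, as the WARNING above says
    write_out=True,
)
print(results["results"])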