Minseok Bae committed on
Commit
58b9de9
1 Parent(s): d7b7dc6

Integrated backend pipelines; an error currently occurs during model submission (debugging needed).

.gitignore CHANGED
@@ -11,5 +11,7 @@ human_evals/
 eval-queue/
 eval-results/
 auto_evals/
+eval-queue-bk/
+eval-results-bk/
 
 src/assets/model_counts.html
app.py CHANGED
@@ -3,60 +3,40 @@ import pandas as pd
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
  from huggingface_hub import snapshot_download
5
 
6
- from src.display.about import (
7
- CITATION_BUTTON_LABEL,
8
- CITATION_BUTTON_TEXT,
9
- EVALUATION_QUEUE_TEXT,
10
- INTRODUCTION_TEXT,
11
- LLM_BENCHMARKS_TEXT,
12
- TITLE,
13
- )
14
  from src.display.css_html_js import custom_css
15
- from src.display.utils import (
16
- BENCHMARK_COLS,
17
- COLS,
18
- EVAL_COLS,
19
- EVAL_TYPES,
20
- NUMERIC_INTERVALS,
21
- TYPES,
22
- AutoEvalColumn,
23
- ModelType,
24
- fields,
25
- WeightType,
26
- Precision
27
- )
28
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO
29
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
30
- from src.submission.submit import add_new_eval
31
 
32
 
33
  def restart_space():
34
- API.restart_space(repo_id=REPO_ID, token=TOKEN)
35
 
36
  try:
37
- print(EVAL_REQUESTS_PATH)
38
  snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
40
  )
41
  except Exception:
42
  restart_space()
43
  try:
44
- print(EVAL_RESULTS_PATH)
45
  snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
47
  )
48
  except Exception:
49
  restart_space()
50
 
51
-
52
- raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
  leaderboard_df = original_df.copy()
54
 
55
  (
56
  finished_eval_queue_df,
57
  running_eval_queue_df,
58
  pending_eval_queue_df,
59
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
60
 
61
 
62
  # Searching and filtering
@@ -76,17 +56,17 @@ def update_table(
76
 
77
 
78
  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
79
- return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
80
 
81
 
82
  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
83
  always_here_cols = [
84
- AutoEvalColumn.model_type_symbol.name,
85
- AutoEvalColumn.model.name,
86
  ]
87
  # We use COLS to maintain sorting
88
  filtered_df = df[
89
- always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
90
  ]
91
  return filtered_df
92
 
@@ -104,7 +84,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
104
  if len(final_df) > 0:
105
  filtered_df = pd.concat(final_df)
106
  filtered_df = filtered_df.drop_duplicates(
107
- subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
108
  )
109
 
110
  return filtered_df
@@ -117,14 +97,14 @@ def filter_models(
117
  if show_deleted:
118
  filtered_df = df
119
  else: # Show only still on the hub models
120
- filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
121
 
122
  type_emoji = [t[0] for t in type_query]
123
- filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
124
- filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
125
 
126
- numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
127
- params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
128
  mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
129
  filtered_df = filtered_df.loc[mask]
130
 
@@ -133,8 +113,8 @@ def filter_models(
133
 
134
  demo = gr.Blocks(css=custom_css)
135
  with demo:
136
- gr.HTML(TITLE)
137
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
138
 
139
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
140
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
@@ -150,12 +130,12 @@ with demo:
150
  shown_columns = gr.CheckboxGroup(
151
  choices=[
152
  c.name
153
- for c in fields(AutoEvalColumn)
154
  if not c.hidden and not c.never_hidden and not c.dummy
155
  ],
156
  value=[
157
  c.name
158
- for c in fields(AutoEvalColumn)
159
  if c.displayed_by_default and not c.hidden and not c.never_hidden
160
  ],
161
  label="Select columns to show",
@@ -170,34 +150,34 @@ with demo:
170
  #with gr.Box(elem_id="box-filter"):
171
  filter_columns_type = gr.CheckboxGroup(
172
  label="Model types",
173
- choices=[t.to_str() for t in ModelType],
174
- value=[t.to_str() for t in ModelType],
175
  interactive=True,
176
  elem_id="filter-columns-type",
177
  )
178
  filter_columns_precision = gr.CheckboxGroup(
179
  label="Precision",
180
- choices=[i.value.name for i in Precision],
181
- value=[i.value.name for i in Precision],
182
  interactive=True,
183
  elem_id="filter-columns-precision",
184
  )
185
  filter_columns_size = gr.CheckboxGroup(
186
  label="Model sizes (in billions of parameters)",
187
- choices=list(NUMERIC_INTERVALS.keys()),
188
- value=list(NUMERIC_INTERVALS.keys()),
189
  interactive=True,
190
  elem_id="filter-columns-size",
191
  )
192
 
193
  leaderboard_table = gr.components.Dataframe(
194
  value=leaderboard_df[
195
- [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
196
  + shown_columns.value
197
- + [AutoEvalColumn.dummy.name]
198
  ],
199
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
200
- datatype=TYPES,
201
  elem_id="leaderboard-table",
202
  interactive=False,
203
  visible=True,
@@ -206,9 +186,9 @@ with demo:
206
 
207
  # Dummy leaderboard for handling the case when the user uses backspace key
208
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
209
- value=original_df[COLS],
210
- headers=COLS,
211
- datatype=TYPES,
212
  visible=False,
213
  )
214
  search_bar.submit(
@@ -241,12 +221,12 @@ with demo:
241
  )
242
 
243
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
244
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
245
 
246
  with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
247
  with gr.Column():
248
  with gr.Row():
249
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
250
 
251
  with gr.Column():
252
  with gr.Accordion(
@@ -256,8 +236,8 @@ with demo:
256
  with gr.Row():
257
  finished_eval_table = gr.components.Dataframe(
258
  value=finished_eval_queue_df,
259
- headers=EVAL_COLS,
260
- datatype=EVAL_TYPES,
261
  row_count=5,
262
  )
263
  with gr.Accordion(
@@ -267,8 +247,8 @@ with demo:
267
  with gr.Row():
268
  running_eval_table = gr.components.Dataframe(
269
  value=running_eval_queue_df,
270
- headers=EVAL_COLS,
271
- datatype=EVAL_TYPES,
272
  row_count=5,
273
  )
274
 
@@ -279,8 +259,8 @@ with demo:
279
  with gr.Row():
280
  pending_eval_table = gr.components.Dataframe(
281
  value=pending_eval_queue_df,
282
- headers=EVAL_COLS,
283
- datatype=EVAL_TYPES,
284
  row_count=5,
285
  )
286
  with gr.Row():
@@ -291,7 +271,7 @@ with demo:
291
  model_name_textbox = gr.Textbox(label="Model name")
292
  revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
293
  model_type = gr.Dropdown(
294
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
295
  label="Model type",
296
  multiselect=False,
297
  value=None,
@@ -300,14 +280,14 @@ with demo:
300
 
301
  with gr.Column():
302
  precision = gr.Dropdown(
303
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
304
  label="Precision",
305
  multiselect=False,
306
  value="float16",
307
  interactive=True,
308
  )
309
  weight_type = gr.Dropdown(
310
- choices=[i.value.name for i in WeightType],
311
  label="Weights type",
312
  multiselect=False,
313
  value="Original",
@@ -318,7 +298,7 @@ with demo:
318
  submit_button = gr.Button("Submit Eval")
319
  submission_result = gr.Markdown()
320
  submit_button.click(
321
- add_new_eval,
322
  [
323
  model_name_textbox,
324
  base_model_name_textbox,
@@ -333,8 +313,8 @@ with demo:
333
  with gr.Row():
334
  with gr.Accordion("📙 Citation", open=False):
335
  citation_button = gr.Textbox(
336
- value=CITATION_BUTTON_TEXT,
337
- label=CITATION_BUTTON_LABEL,
338
  lines=20,
339
  elem_id="citation-button",
340
  show_copy_button=True,
 
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
  from huggingface_hub import snapshot_download
5
 
6
+ import src.display.about as about
7
  from src.display.css_html_js import custom_css
8
+ import src.display.utils as utils
9
+ import src.envs as envs
10
+ import src.populate as populate
11
+ import src.submission.submit as submit
12
 
13
 
14
  def restart_space():
15
+ envs.API.restart_space(repo_id=envs.REPO_ID, token=envs.TOKEN)
16
 
17
  try:
18
+ print(envs.EVAL_REQUESTS_PATH)
19
  snapshot_download(
20
+ repo_id=envs.QUEUE_REPO, local_dir=envs.EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
21
  )
22
  except Exception:
23
  restart_space()
24
  try:
25
+ print(envs.EVAL_RESULTS_PATH)
26
  snapshot_download(
27
+ repo_id=envs.RESULTS_REPO, local_dir=envs.EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
28
  )
29
  except Exception:
30
  restart_space()
31
 
32
+ raw_data, original_df = populate.get_leaderboard_df(envs.EVAL_RESULTS_PATH, envs.EVAL_REQUESTS_PATH, utils.COLS, utils.BENCHMARK_COLS)
 
33
  leaderboard_df = original_df.copy()
34
 
35
  (
36
  finished_eval_queue_df,
37
  running_eval_queue_df,
38
  pending_eval_queue_df,
39
+ ) = populate.get_evaluation_queue_df(envs.EVAL_REQUESTS_PATH, utils.EVAL_COLS)
40
 
41
 
42
  # Searching and filtering
 
56
 
57
 
58
  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
59
+ return df[(df[utils.AutoEvalColumn.dummy.name].str.contains(query, case=False))]
60
 
61
 
62
  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
63
  always_here_cols = [
64
+ utils.AutoEvalColumn.model_type_symbol.name,
65
+ utils.AutoEvalColumn.model.name,
66
  ]
67
  # We use COLS to maintain sorting
68
  filtered_df = df[
69
+ always_here_cols + [c for c in utils.COLS if c in df.columns and c in columns] + [utils.AutoEvalColumn.dummy.name]
70
  ]
71
  return filtered_df
72
 
 
84
  if len(final_df) > 0:
85
  filtered_df = pd.concat(final_df)
86
  filtered_df = filtered_df.drop_duplicates(
87
+ subset=[utils.AutoEvalColumn.model.name, utils.AutoEvalColumn.precision.name, utils.AutoEvalColumn.revision.name]
88
  )
89
 
90
  return filtered_df
 
97
  if show_deleted:
98
  filtered_df = df
99
  else: # Show only still on the hub models
100
+ filtered_df = df[df[utils.AutoEvalColumn.still_on_hub.name] == True]
101
 
102
  type_emoji = [t[0] for t in type_query]
103
+ filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
104
+ filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
105
 
106
+ numeric_interval = pd.IntervalIndex(sorted([utils.NUMERIC_INTERVALS[s] for s in size_query]))
107
+ params_column = pd.to_numeric(df[utils.AutoEvalColumn.params.name], errors="coerce")
108
  mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
109
  filtered_df = filtered_df.loc[mask]
110
 
 
113
 
114
  demo = gr.Blocks(css=custom_css)
115
  with demo:
116
+ gr.HTML(about.TITLE)
117
+ gr.Markdown(about.INTRODUCTION_TEXT, elem_classes="markdown-text")
118
 
119
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
120
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
 
130
  shown_columns = gr.CheckboxGroup(
131
  choices=[
132
  c.name
133
+ for c in utils.fields(utils.AutoEvalColumn)
134
  if not c.hidden and not c.never_hidden and not c.dummy
135
  ],
136
  value=[
137
  c.name
138
+ for c in utils.fields(utils.AutoEvalColumn)
139
  if c.displayed_by_default and not c.hidden and not c.never_hidden
140
  ],
141
  label="Select columns to show",
 
150
  #with gr.Box(elem_id="box-filter"):
151
  filter_columns_type = gr.CheckboxGroup(
152
  label="Model types",
153
+ choices=[t.to_str() for t in utils.ModelType],
154
+ value=[t.to_str() for t in utils.ModelType],
155
  interactive=True,
156
  elem_id="filter-columns-type",
157
  )
158
  filter_columns_precision = gr.CheckboxGroup(
159
  label="Precision",
160
+ choices=[i.value.name for i in utils.Precision],
161
+ value=[i.value.name for i in utils.Precision],
162
  interactive=True,
163
  elem_id="filter-columns-precision",
164
  )
165
  filter_columns_size = gr.CheckboxGroup(
166
  label="Model sizes (in billions of parameters)",
167
+ choices=list(utils.NUMERIC_INTERVALS.keys()),
168
+ value=list(utils.NUMERIC_INTERVALS.keys()),
169
  interactive=True,
170
  elem_id="filter-columns-size",
171
  )
172
 
173
  leaderboard_table = gr.components.Dataframe(
174
  value=leaderboard_df[
175
+ [c.name for c in utils.fields(utils.AutoEvalColumn) if c.never_hidden]
176
  + shown_columns.value
177
+ + [utils.AutoEvalColumn.dummy.name]
178
  ],
179
+ headers=[c.name for c in utils.fields(utils.AutoEvalColumn) if c.never_hidden] + shown_columns.value,
180
+ datatype=utils.TYPES,
181
  elem_id="leaderboard-table",
182
  interactive=False,
183
  visible=True,
 
186
 
187
  # Dummy leaderboard for handling the case when the user uses backspace key
188
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
189
+ value=original_df[utils.COLS],
190
+ headers=utils.COLS,
191
+ datatype=utils.TYPES,
192
  visible=False,
193
  )
194
  search_bar.submit(
 
221
  )
222
 
223
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
224
+ gr.Markdown(about.LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
225
 
226
  with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
227
  with gr.Column():
228
  with gr.Row():
229
+ gr.Markdown(about.EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
230
 
231
  with gr.Column():
232
  with gr.Accordion(
 
236
  with gr.Row():
237
  finished_eval_table = gr.components.Dataframe(
238
  value=finished_eval_queue_df,
239
+ headers=utils.EVAL_COLS,
240
+ datatype=utils.EVAL_TYPES,
241
  row_count=5,
242
  )
243
  with gr.Accordion(
 
247
  with gr.Row():
248
  running_eval_table = gr.components.Dataframe(
249
  value=running_eval_queue_df,
250
+ headers=utils.EVAL_COLS,
251
+ datatype=utils.EVAL_TYPES,
252
  row_count=5,
253
  )
254
 
 
259
  with gr.Row():
260
  pending_eval_table = gr.components.Dataframe(
261
  value=pending_eval_queue_df,
262
+ headers=utils.EVAL_COLS,
263
+ datatype=utils.EVAL_TYPES,
264
  row_count=5,
265
  )
266
  with gr.Row():
 
271
  model_name_textbox = gr.Textbox(label="Model name")
272
  revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
273
  model_type = gr.Dropdown(
274
+ choices=[t.to_str(" : ") for t in utils.ModelType if t != utils.ModelType.Unknown],
275
  label="Model type",
276
  multiselect=False,
277
  value=None,
 
280
 
281
  with gr.Column():
282
  precision = gr.Dropdown(
283
+ choices=[i.value.name for i in utils.Precision if i != utils.Precision.Unknown],
284
  label="Precision",
285
  multiselect=False,
286
  value="float16",
287
  interactive=True,
288
  )
289
  weight_type = gr.Dropdown(
290
+ choices=[i.value.name for i in utils.WeightType],
291
  label="Weights type",
292
  multiselect=False,
293
  value="Original",
 
298
  submit_button = gr.Button("Submit Eval")
299
  submission_result = gr.Markdown()
300
  submit_button.click(
301
+ submit.add_new_eval,
302
  [
303
  model_name_textbox,
304
  base_model_name_textbox,
 
313
  with gr.Row():
314
  with gr.Accordion("📙 Citation", open=False):
315
  citation_button = gr.Textbox(
316
+ value=about.CITATION_BUTTON_TEXT,
317
+ label=about.CITATION_BUTTON_LABEL,
318
  lines=20,
319
  elem_id="citation-button",
320
  show_copy_button=True,
main_backend.py ADDED
@@ -0,0 +1,72 @@
+import logging
+import pprint
+
+from huggingface_hub import snapshot_download
+
+import src.backend.run_eval_suite as run_eval_suite
+import src.backend.manage_requests as manage_requests
+import src.backend.sort_queue as sort_queue
+import src.envs as envs
+
+logging.basicConfig(level=logging.ERROR)
+pp = pprint.PrettyPrinter(width=80)
+
+PENDING_STATUS = "PENDING"
+RUNNING_STATUS = "RUNNING"
+FINISHED_STATUS = "FINISHED"
+FAILED_STATUS = "FAILED"
+
+snapshot_download(repo_id=envs.RESULTS_REPO, revision="main",
+                  local_dir=envs.EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+snapshot_download(repo_id=envs.QUEUE_REPO, revision="main",
+                  local_dir=envs.EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+
+
+def run_auto_eval():
+    current_pending_status = [PENDING_STATUS]
+
+    manage_requests.check_completed_evals(
+        api=envs.API,
+        checked_status=RUNNING_STATUS,
+        completed_status=FINISHED_STATUS,
+        failed_status=FAILED_STATUS,
+        hf_repo=envs.QUEUE_REPO,
+        local_dir=envs.EVAL_REQUESTS_PATH_BACKEND,
+        hf_repo_results=envs.RESULTS_REPO,
+        local_dir_results=envs.EVAL_RESULTS_PATH_BACKEND
+    )
+
+    eval_requests = manage_requests.get_eval_requests(job_status=current_pending_status,
+                                                      hf_repo=envs.QUEUE_REPO,
+                                                      local_dir=envs.EVAL_REQUESTS_PATH_BACKEND)
+    eval_requests = sort_queue.sort_models_by_priority(api=envs.API, models=eval_requests)
+
+    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
+
+    if len(eval_requests) == 0:
+        print("No eval requests found. Exiting.")
+        return
+
+    eval_request = eval_requests[0]
+    pp.pprint(eval_request)
+
+    manage_requests.set_eval_request(
+        api=envs.API,
+        eval_request=eval_request,
+        new_status=RUNNING_STATUS,
+        hf_repo=envs.QUEUE_REPO,
+        local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
+    )
+
+    run_eval_suite.run_evaluation(
+        eval_request=eval_request,
+        local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
+        results_repo=envs.RESULTS_REPO,
+        batch_size=1,
+        device=envs.DEVICE,
+        no_cache=True,
+    )
+
+
+if __name__ == "__main__":
+    run_auto_eval()
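Reviewer note: main_backend.py runs run_auto_eval() once per invocation. A minimal sketch of how it could be driven on a schedule, assuming APScheduler (already used in app.py) is acceptable on the backend side; the 30-minute interval and this driver script are illustrative assumptions, not part of this commit.

# Sketch only (not part of this commit): periodic driver for the backend queue.
# Assumes APScheduler; the interval is an arbitrary choice.
from apscheduler.schedulers.blocking import BlockingScheduler

from main_backend import run_auto_eval  # importing also triggers its snapshot_download calls

scheduler = BlockingScheduler()
scheduler.add_job(run_auto_eval, "interval", minutes=30)  # pick a cadence that fits the queue
scheduler.start()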
requirements.txt CHANGED
@@ -12,4 +12,5 @@ python-dateutil==2.8.2
 requests==2.28.2
 tqdm==4.65.0
 transformers==4.35.2
-tokenizers>=0.15.0
+tokenizers>=0.15.0
+sentence-transformers==2.2.2
scripts/create_request_file.py CHANGED
@@ -7,10 +7,9 @@ from datetime import datetime, timezone
 import click
 from colorama import Fore
 from huggingface_hub import HfApi, snapshot_download
-from util import QUEUE_REPO, EVAL_REQUESTS_PATH
 
-# EVAL_REQUESTS_PATH = "eval-queue"
-# QUEUE_REPO = "open-llm-leaderboard/requests"
+from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH
+
 
 precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
 model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
@@ -25,7 +24,8 @@ def get_model_size(model_info, precision: str):
     try:
         size_match = re.search(size_pattern, model_info.modelId.lower())
         model_size = size_match.group(0)
-        model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
+        model_size = round(float(model_size[:-1]) if model_size[-1] == "b"
+                           else float(model_size[:-1]) / 1e3, 3)
     except AttributeError:
         return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
 
@@ -37,13 +37,15 @@ def get_model_size(model_info, precision: str):
 def main():
     api = HfApi()
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset")
+    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH,
+                      repo_type="dataset")
 
     model_name = click.prompt("Enter model name")
     revision = click.prompt("Enter revision", default="main")
     precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
     model_type = click.prompt("Enter model type", type=click.Choice(model_types))
-    weight_type = click.prompt("Enter weight type", default="Original", type=click.Choice(weight_types))
+    weight_type = click.prompt("Enter weight type", default="Original",
+                               type=click.Choice(weight_types))
     base_model = click.prompt("Enter base model", default="")
     status = click.prompt("Enter status", default="FINISHED")
 
src/backend/evaluate_model.py CHANGED
@@ -1,37 +1,95 @@
+import logging
 import pandas as pd
 
-from leaderboard.src.backend.model_operations import SummaryGenerator, EvaluationModel
-from envs import HEM_PATH, SOURCE_PATH
-from leaderboard.src.backend.util import load_dataframe, format_results
+import src.envs as envs
+
+from src.backend.model_operations import SummaryGenerator, EvaluationModel
+import src.backend.util as util
+
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(levelname)s - %(message)s')
+
 
 class Evaluator:
-    def __init__(self, model, revision, precision, num_fewshot, batch_size, device, no_cache, limit, write_out=True, output_base_path='logs'):
+    """A class to evaluate summaries generated by a language model.
+
+    Attributes:
+        model (str): The name or path of the model.
+        revision (str): The model revision.
+        precision (str): The precision setting of the model.
+        num_fewshot (int): Number of few-shot examples to use.
+        batch_size (int): Batch size for processing.
+        device (str): The device to run the model on.
+        no_cache (bool): Flag to disable caching.
+        limit (int): Limit on the number of items to process.
+        write_out (bool): Whether to write results to a file.
+        output_base_path (str): Base path for output files.
+        summary_generator (SummaryGenerator): Instance for generating summaries.
+        eval_model (EvaluationModel): Instance for evaluating summaries.
+    """
+    def __init__(self, model, revision, precision, batch_size,
+                 device, no_cache, limit, write_out=True,
+                 output_base_path='logs'):
+        """Initializes the Evaluator with the given model and settings.
+
+        Args:
+            model (str): The name or path of the model.
+            revision (str): The model revision.
+            precision (str): The precision setting of the model.
+            num_fewshot (int): Number of few-shot examples to use.
+            batch_size (int): Batch size for processing.
+            device (str): The device to run the model on.
+            no_cache (bool): Flag to disable caching.
+            limit (int): Limit on the number of items to process.
+            write_out (bool): Whether to write results to a file.
+            output_base_path (str): Base path for output files.
+        """
         self.model = model
         self.revision = revision
         self.precision = precision
-        self.num_fewshot = num_fewshot
         self.batch_size = batch_size
         self.device = device
         self.no_cache = no_cache
         self.limit = limit
         self.write_out = write_out
        self.output_base_path = output_base_path
-        self.summary_generator = SummaryGenerator(model, revision)
-        self.eval_model = EvaluationModel(HEM_PATH)
-
+        try:
+            self.summary_generator = SummaryGenerator(model, revision)
+            self.eval_model = EvaluationModel(envs.HEM_PATH)
+        except Exception as e:
+            logging.error(f"Error initializing Evaluator: {e}")
+            raise
+
     def evaluate(self):
-        df = load_dataframe(SOURCE_PATH)
-        generated_summaries_df = self.summary_generator.generate_summaries(df)
-
-        avg_summary_len = self.summary_generator.avg_length
-        answer_rate = self.summary_generator.answer_rate
-
-        hallucination_scores = self.eval_model.evaluate_hallucination(generated_summaries_df)
-
-        accuracy = self.eval_model.compute_accuracy
-        hallucination_rate = self.eval_model.hallucination_rate
-
-        results = format_results(hallucination_scores, self.model, self.revision, self.precision, accuracy, hallucination_rate, answer_rate, avg_summary_len)
-
-        return results
-
+        """
+        Performs the evaluation process by generating summaries
+        and computing metrics.
+
+        Returns:
+            dict: A dictionary containing evaluation results.
+        """
+        try:
+            df = pd.read_csv(envs.SOURCE_PATH)
+            generated_summaries_df = self.summary_generator.generate_summaries(df)
+
+            avg_summary_len = self.summary_generator.avg_length
+            answer_rate = self.summary_generator.answer_rate
+            error_rate = self.summary_generator.error_rate
+
+            hallucination_scores = self.eval_model.evaluate_hallucination(
+                generated_summaries_df)
+            accuracy = self.eval_model.compute_accuracy()
+            hallucination_rate = self.eval_model.hallucination_rate
+
+            results = util.format_results(model_name=self.model, revision=self.revision,
+                                          precision=self.precision, accuracy=accuracy,
+                                          hallucination_rate=hallucination_rate, answer_rate=answer_rate,
+                                          avg_summary_len=avg_summary_len, error_rate=error_rate)
+
+            return results
+        except FileNotFoundError:
+            logging.error(f"File not found: {envs.SOURCE_PATH}")
+            raise
+        except Exception as e:
+            logging.error(f"Error during evaluation: {e}")
+            raise
src/backend/manage_requests.py CHANGED
@@ -1,10 +1,10 @@
+import os
 import glob
 import json
 from dataclasses import dataclass
 from typing import Optional
 
 from huggingface_hub import HfApi, snapshot_download
-from src.envs import TOKEN
 
 @dataclass
 class EvalRequest:
@@ -22,42 +22,34 @@ class EvalRequest:
     likes: Optional[int] = 0
     params: Optional[int] = None
     license: Optional[str] = ""
-
+
     def get_model_args(self):
         model_args = f"pretrained={self.model},revision={self.revision}"
 
         if self.precision in ["float16", "bfloat16"]:
             model_args += f",dtype={self.precision}"
-        # Quantized models need some added config, the install of bits and bytes, etc
-        #elif self.precision == "8bit":
-        #    model_args += ",load_in_8bit=True"
-        #elif self.precision == "4bit":
-        #    model_args += ",load_in_4bit=True"
-        #elif self.precision == "GPTQ":
-            # A GPTQ model does not need dtype to be specified,
-            # it will be inferred from the config
-            pass
         else:
-            raise Exception(f"Unknown precision {self.precision}.")
-
+            raise ValueError(f"Unknown precision {self.precision}.")
+
         return model_args
 
 
-def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
-    """Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
+def set_eval_request(api: HfApi, eval_request: EvalRequest, new_status: str,
+                     hf_repo: str, local_dir: str):
+    """Updates a given eval request with its new status on the hub (running, completed, failed,)"""
     json_filepath = eval_request.json_filepath
 
     with open(json_filepath) as fp:
         data = json.load(fp)
 
-    data["status"] = set_to_status
+    data["status"] = new_status
 
     with open(json_filepath, "w") as f:
         f.write(json.dumps(data))
 
     api.upload_file(
         path_or_fileobj=json_filepath,
-        path_in_repo=json_filepath.replace(local_dir, ""),
+        path_in_repo=os.path.relpath(json_filepath, start=local_dir),
         repo_id=hf_repo,
         repo_type="dataset",
     )
@@ -69,9 +61,10 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[Ev
     likes.
 
     Returns:
-        `list[EvalRequest]`: a list of model info dicts.
+        list[EvalRequest]: a list of model info dicts.
     """
-    snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60)
+    snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir,
+                      repo_type="dataset", max_workers=60)
     json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
 
     eval_requests = []
@@ -97,7 +90,8 @@ def check_completed_evals(
     local_dir_results: str,
 ):
     """Checks if the currently running evals are completed, if yes, update their status on the hub."""
-    snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60)
+    snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results,
+                      repo_type="dataset", max_workers=60)
 
     running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
 
@@ -107,10 +101,10 @@
         print(f"Checking {model}")
 
         output_path = model
-        output_file = f"{local_dir_results}/{output_path}/results*.json"
-        output_file_exists = len(glob.glob(output_file)) > 0
+        output_files = f"{local_dir_results}/{output_path}/results*.json"
+        output_files_exists = len(glob.glob(output_files)) > 0
 
-        if output_file_exists:
+        if output_files_exists:
            print(
                f"EXISTS output file exists for {model} setting it to {completed_status}"
            )
src/backend/model_operations.py CHANGED
@@ -1,96 +1,224 @@
 
 
1
  import numpy as np
2
  import pandas as pd
 
3
  from transformers import AutoModelForCausalLM, AutoTokenizer
4
  from sentence_transformers import CrossEncoder
5
 
6
- from leaderboard.src.backend.util import generate_prompt
7
 
8
  def load_evaluation_model(model_path):
9
  model = CrossEncoder(model_path)
10
- model.save_pretrained('.checkpoints/{model_path}')
11
  return model
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  class SummaryGenerator:
 
 
 
 
 
 
 
 
 
 
 
14
  def __init__(self, model_id, revision):
15
- self.tokenizer = AutoTokenizer.from_pretrained(model_id, revision)
16
- self.model = AutoModelForCausalLM.from_pretrained(model_id, revision)
 
 
 
 
 
 
 
 
 
 
 
17
  self.summaries_df = pd.DataFrame()
18
  self.revision = revision
19
  self.avg_length = None
20
  self.answer_rate = None
 
21
 
22
  def generate_summaries(self, df):
  source, summary, dataset = [], [], []
24
 
 
25
  for index, row in df.iterrows():
26
  _source = row['text']
27
  _dataset = row['dataset']
28
 
29
- prompt = generate_prompt(_source)
30
- inputs = self.tokenizer(prompt, return_tensors='pt', max_length=1024, revision=self.revision)
 
31
  try:
32
- outputs = self.model.generate(**inputs, max_new_tokens=1024, do_sample=False, temperature=0.0, revision=self.revision)
33
- response = self.tokenizer.decode(outputs[0], skip_special_tokens=True, revision=self.revision)
 
 
34
  except Exception as e:
35
  print(f"Error at index {index}: {e}")
36
  response = ""
37
-
 
38
  summary.append(response)
39
  source.append(_source)
40
  dataset.append(_dataset)
41
 
42
- self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)), columns=["source", "summary", "dataset"])
 
43
  self._compute_avg_length()
44
  self._compute_answer_rate()
 
45
 
46
  return self.summaries_df
47
 
48
  def _compute_avg_length(self):
 
 
 
49
  total_words = 0
50
  count = 0
51
 
52
  for summary in self.summaries_df['summary']:
53
  if summary != "":
54
- words = summary.split()
 
55
  total_words += len(words)
56
  count += 1
57
 
58
  self.avg_length = 0 if count == 0 else total_words / count
59
 
60
  def _compute_answer_rate(self):
61
- non_empty_count = sum(1 for summary in self.summaries_df['summary'] if summary != "")
 
 
 
62
  total_rows = len(self.summaries_df)
63
 
64
  self.answer_rate = 0 if total_rows == 0 else non_empty_count / total_rows
65
 
 
 
 
 
 
 
 
 
 
66
  class EvaluationModel:
 
 
 
 
 
 
 
 
 
67
  def __init__(self, model_path):
 
 
 
 
 
 
68
  self.model = load_evaluation_model(model_path)
69
  self.scores = []
70
  self.accuracy = None
71
  self.hallucination_rate = None
72
 
73
  def evaluate_hallucination(self, summaries_df):
74
- # Convert to NumPy arrays for efficient processing
75
- source_docs = np.array(summaries_df['source'])
76
- generated_summaries = np.array(summaries_df['summary'])
77
 
78
- scores = self.model.predict(source_docs, generated_summaries)
79
- self.scores = scores
80
- return self.scores
81
 
82
- def compute_accuracy(self):
83
  if not self.scores:
84
- raise ValueError("Scores not calculated. Call evaluate_hallucination() first.")
 
 
85
 
86
  # Use threshold of 0.5 to compute accuracy
87
- num_above_threshold = sum(score >= 0.5 for score in self.scores)
88
  num_total = len(self.scores)
89
 
90
- if num_total == 0:
91
  raise ValueError("No scores available to compute accuracy.")
92
 
93
  self.accuracy = (num_above_threshold / num_total) * 100
94
  self.hallucination_rate = 100 - self.accuracy
95
 
96
- return self.accuracy
 
 
1
+ import logging
2
+
3
  import numpy as np
4
  import pandas as pd
5
+ import spacy
6
  from transformers import AutoModelForCausalLM, AutoTokenizer
7
  from sentence_transformers import CrossEncoder
8
 
9
+ import src.backend.util as util
10
+
11
+ # Set up basic configuration for logging
12
+ logging.basicConfig(level=logging.INFO,
13
+ format='%(asctime)s - %(levelname)s - %(message)s')
14
+
15
+ # Load spacy model for word tokenization
16
+ nlp = spacy.load("en_core_web_sm")
17
+
18
 
19
  def load_evaluation_model(model_path):
20
+ """Load the evaluation model from the given path
21
+
22
+ Args:
23
+ model_path (str): Path to the evaluation model
24
+
25
+ Returns:
26
+ CrossEncoder: The evaluation model
27
+ """
28
  model = CrossEncoder(model_path)
 
29
  return model
30
 
31
+
32
+ class ModelLoadingException(Exception):
33
+ """Exception raised for errors in loading a model.
34
+
35
+ Attributes:
36
+ model_id (str): The model identifier.
37
+ revision (str): The model revision.
38
+ """
39
+
40
+ def __init__(self, model_id, revision, messages="Error initializing model"):
41
+ self.model_id = model_id
42
+ self.revision = revision
43
+ super().__init__(f"{messages} id={model_id} revision={revision}")
44
+
45
  class SummaryGenerator:
46
+ """A class to generate summaries using a causal language model.
47
+
48
+ Attributes:
49
+ tokenizer (AutoTokenizer): Tokenizer for the model.
50
+ model (AutoModelForCausalLM): The causal language model.
51
+ summaries_df (DataFrame): DataFrame to store generated summaries.
52
+ revision (str): Model revision.
53
+ avg_length (float): Average length of summaries.
54
+ answer_rate (float): Rate of non-empty summaries.
55
+ """
56
+
57
  def __init__(self, model_id, revision):
58
+ """
59
+ Initializes the SummaryGenerator with a model.
60
+
61
+ Args:
62
+ model_id (str): Identifier for the model.
63
+ revision (str): Revision of the model.
64
+ """
65
+ try:
66
+ self.tokenizer = AutoTokenizer.from_pretrained(model_id, revision)
67
+ self.model = AutoModelForCausalLM.from_pretrained(model_id, revision)
68
+ except Exception as e:
69
+ logging.error(f"Error initializing model with id {model_id} and revision {revision}: {e}")
70
+ raise ModelLoadingException(model_id, revision) from e
71
  self.summaries_df = pd.DataFrame()
72
  self.revision = revision
73
  self.avg_length = None
74
  self.answer_rate = None
75
+ self.error_rate = None
76
 
77
  def generate_summaries(self, df):
78
+ """Generate summaries for a given DataFrame of source docs.
79
+
80
+ Args:
81
+ df (DataFrame): DataFrame containing source docs.
82
+
83
+ Returns:
84
+ summaries_df (DataFrame): Generated summaries by the model.
85
+ """
86
  source, summary, dataset = [], [], []
87
 
88
+ error_count = 0
89
  for index, row in df.iterrows():
90
  _source = row['text']
91
  _dataset = row['dataset']
92
 
93
+ prompt = util.generate_prompt(_source)
94
+ inputs = self.tokenizer(prompt, return_tensors='pt', max_length=1024,
95
+ revision=self.revision)
96
  try:
97
+ outputs = self.model.generate(**inputs, max_new_tokens=1024, do_sample=False,
98
+ temperature=0.0, revision=self.revision)
99
+ response = self.tokenizer.decode(outputs[0], skip_special_tokens=True,
100
+ revision=self.revision)
101
  except Exception as e:
102
  print(f"Error at index {index}: {e}")
103
  response = ""
104
+ error_count += 1
105
+
106
  summary.append(response)
107
  source.append(_source)
108
  dataset.append(_dataset)
109
 
110
+ self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
111
+ columns=["source", "summary", "dataset"])
112
  self._compute_avg_length()
113
  self._compute_answer_rate()
114
+ self._compute_error_rate(error_count)
115
 
116
  return self.summaries_df
117
 
118
  def _compute_avg_length(self):
119
+ """
120
+ Compute the average length of non-empty summaries using SpaCy.
121
+ """
122
  total_words = 0
123
  count = 0
124
 
125
  for summary in self.summaries_df['summary']:
126
  if summary != "":
127
+ doc = nlp(summary)
128
+ words = [token.text for token in doc if token.is_alpha]
129
  total_words += len(words)
130
  count += 1
131
 
132
  self.avg_length = 0 if count == 0 else total_words / count
133
 
134
  def _compute_answer_rate(self):
135
+ """
136
+ Compute the rate of non-empty summaries.
137
+ """
138
+ non_empty_count = sum(1 for summary in self.summaries_df['summary'] if summary)
139
  total_rows = len(self.summaries_df)
140
 
141
  self.answer_rate = 0 if total_rows == 0 else non_empty_count / total_rows
142
 
143
+ def _compute_error_rate(self, count):
144
+ """
145
+ Compute the error rate of summaries.
146
+ """
147
+ total_rows = len(self.summaries_df)
148
+
149
+ self.error_rate = 0 if total_rows == 0 else count / total_rows
150
+
151
+
152
  class EvaluationModel:
153
+ """A class to evaluate generated summaries.
154
+
155
+ Attributes:
156
+ model (CrossEncoder): The evaluation model.
157
+ scores (list): List of evaluation scores.
158
+ accuracy (float): Accuracy of the summaries.
159
+ hallucination_rate (float): Rate of hallucination in summaries.
160
+ """
161
+
162
  def __init__(self, model_path):
163
+ """
164
+ Initializes the EvaluationModel with a CrossEncoder model.
165
+
166
+ Args:
167
+ model_path (str): Path to the CrossEncoder model.
168
+ """
169
  self.model = load_evaluation_model(model_path)
170
  self.scores = []
171
  self.accuracy = None
172
  self.hallucination_rate = None
173
 
174
  def evaluate_hallucination(self, summaries_df):
175
+ """
176
+ Evaluate the hallucination rate in summaries. This method updates the 'scores' attribute
177
+ of the instance with the computed scores.
178
 
179
+ Args:
180
+ summaries_df (DataFrame): DataFrame containing source docs and summaries.
 
181
 
182
+ Returns:
183
+ list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
184
+ """
185
+ source_docs = np.array(summaries_df['source'])
186
+ generated_summaries = np.array(summaries_df['summary'])
187
+ try:
188
+ scores = self.model.predict(source_docs, generated_summaries)
189
+ self.scores = scores
190
+ return self.scores
191
+ except Exception as e:
192
+ logging.error(f"Error evaluating hallucination: {e}")
193
+ raise
194
+
195
+ def compute_accuracy(self, threshold=0.5):
196
+ """
197
+ Compute the accuracy of the evaluated summaries based on the previously calculated scores.
198
+ This method relies on the 'scores' attribute being populated, typically via the
199
+ 'evaluate_hallucination' method.
200
+
201
+ Returns:
202
+ float: Accuracy percentage. Also updates the 'accuracy' and 'hallucination_rate'
203
+ attributes of the instance.
204
+
205
+ Raises:
206
+ ValueError: If scores have not been calculated prior to calling this method.
207
+ """
208
  if not self.scores:
209
+ error_msg = "Scores not calculated. Call evaluate_hallucination() first."
210
+ logging.error(error_msg)
211
+ raise ValueError(error_msg)
212
 
213
  # Use threshold of 0.5 to compute accuracy
214
+ num_above_threshold = sum(score >= threshold for score in self.scores)
215
  num_total = len(self.scores)
216
 
217
+ if not num_total:
218
  raise ValueError("No scores available to compute accuracy.")
219
 
220
  self.accuracy = (num_above_threshold / num_total) * 100
221
  self.hallucination_rate = 100 - self.accuracy
222
 
223
+ return self.accuracy
224
+
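Reviewer note: a minimal usage sketch of the two classes introduced above, mirroring how Evaluator.evaluate wires them together. The model id and the one-row DataFrame are illustrative only; the column names ('text', 'dataset') follow generate_summaries.

# Sketch only (illustrative model id and data): how SummaryGenerator and EvaluationModel fit together.
import pandas as pd

import src.envs as envs
from src.backend.model_operations import SummaryGenerator, EvaluationModel

df = pd.DataFrame({"text": ["Some source passage to summarize."], "dataset": ["demo"]})

generator = SummaryGenerator("mistralai/Mistral-7B-Instruct-v0.1", "main")  # any causal LM id
summaries_df = generator.generate_summaries(df)

evaluator = EvaluationModel(envs.HEM_PATH)               # CrossEncoder hallucination model
scores = evaluator.evaluate_hallucination(summaries_df)  # populates evaluator.scores
accuracy = evaluator.compute_accuracy()                  # uses the default 0.5 threshold
print(accuracy, evaluator.hallucination_rate, generator.answer_rate)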
src/backend/run_eval_suite.py CHANGED
@@ -3,43 +3,56 @@ import os
 import logging
 from datetime import datetime
 
-# from lm_eval import tasks, evaluator, utils
-from evaluate_model import Evaluator
-
-from src.envs import RESULTS_REPO, API
+import src.envs as envs
 from src.backend.manage_requests import EvalRequest
+from src.backend.evaluate_model import Evaluator
 
-from util import load_dataframe, format_results
-
+# Configure logging
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(levelname)s - %(message)s')
 logging.getLogger("openai").setLevel(logging.WARNING)
 
-def run_evaluation(eval_request: EvalRequest, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
-    if limit:
-        print(
-            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
-        )
+
+def run_evaluation(eval_request: EvalRequest, batch_size, device,
+                   local_dir: str, results_repo: str, no_cache=True, limit=None):
+    """
+    Run the evaluation for a given model and upload the results.
+
+    Args:
+        eval_request (EvalRequest): The evaluation request object containing model details.
+        num_fewshot (int): Number of few-shot examples.
+        batch_size (int): Batch size for processing.
+        device (str): The device to run the evaluation on.
+        local_dir (str): Local directory path for saving results.
+        results_repo (str): Repository ID where results will be uploaded.
+        no_cache (bool): Whether to disable caching.
+        limit (int, optional): Limit on the number of items to process. Use with caution.
+
+    Returns:
+        dict: A dictionary containing evaluation results.
+    """
+    if limit:
+        logging.warning("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
 
-    # task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
-
-    # print(f"Selected Tasks: {task_names}")
-    evaluator = Evaluator(eval_request.model, eval_request.revision, eval_request.precision, num_fewshot, batch_size, device, no_cache, limit, write_out=True, output_base_path='logs')
-    results = evaluator.evaluate()
-
-    # results["config"]["model_dtype"] = eval_request.precision
-    # results["config"]["model_name"] = eval_request.model
-    # results["config"]["model_sha"] = eval_request.revision
+    try:
+        evaluator = Evaluator(eval_request.model, eval_request.revision, eval_request.precision,
+                              batch_size, device, no_cache, limit, write_out=True,
+                              output_base_path='logs')
+        results = evaluator.evaluate()
+    except Exception as e:
+        logging.error(f"Error during evaluation: {e}")
+        raise
 
     dumped = json.dumps(results, indent=2)
-    print(dumped)
+    logging.info(dumped)
 
-    output_path = os.path.join(local_dir, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
+    output_path = os.path.join(local_dir, *eval_request.model.split("/"),
+                               f"results_{datetime.now()}.json")
     os.makedirs(os.path.dirname(output_path), exist_ok=True)
     with open(output_path, "w") as f:
         f.write(dumped)
 
-    print(evaluator.make_table(results))
-
-    API.upload_file(
+    envs.API.upload_file(
         path_or_fileobj=output_path,
         path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
         repo_id=results_repo,
src/backend/sort_queue.py CHANGED
@@ -1,4 +1,3 @@
-import re
 from dataclasses import dataclass
 
 from huggingface_hub import HfApi
@@ -25,4 +24,4 @@ def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
     return sorted(eval_requests, key=lambda x: x.params, reverse=False)
 
 def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
+    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
src/backend/util.py CHANGED
@@ -1,18 +1,41 @@
-import pandas as pd
+def generate_prompt(source_passage: str) -> str:
+    """
+    Generates a prompt for a chatbot to summarize a given passage.
+
+    Args:
+        source_passage (str): The passage to be summarized.
+
+    Returns:
+        str: A formatted prompt string for the chatbot.
+    """
+    if not source_passage:
+        raise ValueError("Source passage is empty.")
+
+    return f"""You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided.
+    You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described:'
+    Passage:\n {source_passage}
+    """
 
-def load_dataframe(data_path):
-    df = pd.read_csv(data_path)
-    return df
 
-def generate_prompt(source_passage):
-    return f"""You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided.
-    You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described.'
-    Passage:
-    {source_passage}
+def format_results(model_name: str, revision: str, precision: str, accuracy: float,
+                   hallucination_rate: float, answer_rate: float, avg_summary_len: float,
+                   error_rate: float) -> dict:
+    """
+    Formats the evaluation results into a structured dictionary.
+
+    Args:
+        model_name (str): The name of the evaluated model.
+        revision (str): The revision hash of the model.
+        precision (str): The precision with which the evaluation was run.
+        accuracy (float): The accuracy score from the evaluation.
+        hallucination_rate (float): The hallucination rate from the evaluation.
+        answer_rate (float): The answer rate from the evaluation.
+        avg_summary_len (float): The average summary length from the evaluation.
+        error_rate (float): The rate at which errors occurred during summary generation.
+
+    Returns:
+        dict: A dictionary containing the structured evaluation results.
     """
-
-def format_results(hallucination_scores, model_name, revision, precision, accuracy, hallucination_rate, answer_rate, avg_summary_len):
-    # Define the structure of the results (JSON)
     results = {
         "config": {
             "model_dtype": precision,  # Precision with which you ran the evaluation
@@ -20,13 +43,22 @@ def format_results(hallucination_scores, model_name, revision, precision, accura
             "model_sha": revision  # Hash of the model
         },
        "results": {
-            "hallucination_eval": {
-                "HEM Scores": hallucination_scores,
-                "Accuracy": accuracy,
-                "Hallucination Rate": hallucination_rate,
-                "Answer Rate": answer_rate,
-                "Average Summary Length": avg_summary_len,
+            "accuracy": {
+                "accuracy": accuracy
+            },
+            "hallucination_rate": {
+                "hallucination_rate": hallucination_rate
+            },
+            "answer_rate": {
+                "answer_rate": answer_rate
+            },
+            "average_summary_length": {
+                "average_summary_length": avg_summary_len
+            },
+            "error_rate": {
+                "error_rate": error_rate
            }
        }
    }
-    return results
+
+    return results
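Reviewer note: for clarity, a sketch of the dictionary shape produced by the new format_results (values are made up; the elided middle of the config block is omitted).

# Sketch only (made-up values): the structure returned by format_results above.
import src.backend.util as util

results = util.format_results(model_name="org/model", revision="abc123", precision="float16",
                              accuracy=97.0, hallucination_rate=3.0, answer_rate=0.99,
                              avg_summary_len=72.4, error_rate=0.0)
# results["config"]["model_dtype"] == "float16"; results["config"]["model_sha"] == "abc123"
# results["results"]["accuracy"] == {"accuracy": 97.0}
# results["results"]["hallucination_rate"] == {"hallucination_rate": 3.0}
# results["results"]["answer_rate"] == {"answer_rate": 0.99}
# results["results"]["average_summary_length"] == {"average_summary_length": 72.4}
# results["results"]["error_rate"] == {"error_rate": 0.0}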
src/display/about.py CHANGED
@@ -1,20 +1,23 @@
 from dataclasses import dataclass
 from enum import Enum
 
-# @dataclass
-# class Task:
-#     benchmark: str
-#     metric: str
-#     col_name: str
-
-
-# # Init: to update with your specific keys
-# class Tasks(Enum):
-#     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-#     task0 = Task("task_name1", "metric_name", "First task")
-#     task1 = Task("task_name2", "metric_name", "Second task")
-
-
+@dataclass
+class Task:
+    benchmark: str
+    metric: str
+    col_name: str
+
+
+class Tasks(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    accuracy = Task("accuracy", "accuracy", "Accuracy")
+    hallucination_rate = Task("hallucination_rate",
+                              "hallucination_rate", "Hallucination Rate")
+    answer_rate = Task("answer_rate", "answer_rate", "Answer Rate")
+    average_summary_length = Task("average_summary_length",
+                                  "average_summary_length", "Average Summary Length")
+    error_rate = Task("error_rate", "error_rate", "Error Rate")
+
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">Hughes Hallucination Evaluation Model leaderboard</h1>"""
 
@@ -24,7 +27,7 @@ This Leaderboard evaluates how much easy LLM hallucinates in factual summarizati
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = f"""
+LLM_BENCHMARKS_TEXT = """
 ## How it works
 
 ## Reproducibility
src/display/css_html_js.py CHANGED
@@ -33,7 +33,7 @@ custom_css = """
     background: none;
     border: none;
 }
-
+
 #search-bar {
     padding: 0px;
 }
src/display/utils.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-# from src.display.about import Tasks
+from src.display.about import Tasks
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -24,16 +24,27 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-# Accuracy
-auto_eval_column_dict.append(["accuracy", ColumnContent, ColumnContent("Accuracy ⬆️", "number", True)])
-# Hallucination Rate
-auto_eval_column_dict.append(["hallucination_rate", ColumnContent, ColumnContent("Hallucination Rate ⬇️", "number", True)])
-# Answer Rate
-auto_eval_column_dict.append(["answer_rate", ColumnContent, ColumnContent("Answer Rate ⬆️", "number", True)])
-# Average Summary Length
-auto_eval_column_dict.append(["average_summary_length", ColumnContent, ColumnContent("Average Summary Length", "number", True)])
+auto_eval_column_dict.append(["model_type_symbol", ColumnContent,
+                              ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent,
+                              ColumnContent("Model", "markdown", True, never_hidden=True)])
+for task in Tasks:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# # Accuracy
+# auto_eval_column_dict.append(["accuracy", ColumnContent,
+#                               ColumnContent("Accuracy", "number", True)])
+# # Hallucination Rate
+# auto_eval_column_dict.append(["hallucination_rate", ColumnContent,
+#                               ColumnContent("Hallucination Rate", "number", True)])
+# # Answer Rate
+# auto_eval_column_dict.append(["answer_rate", ColumnContent,
+#                               ColumnContent("Answer Rate", "number", True)])
+# # Average Summary Length
+# auto_eval_column_dict.append(["average_summary_length", ColumnContent,
+#                               ColumnContent("Average Summary Length", "number", True)])
+# # Error Rate
+# auto_eval_column_dict.append(["error_rate", ColumnContent,
+#                               ColumnContent("Error Rate", "number", True)])
 
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
@@ -126,7 +137,7 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = ["Accuracy", "Hallucination Rate", "Answer Rate", "Average Summary Length"]
+BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
src/envs.py CHANGED
@@ -2,7 +2,7 @@ import os
2
 
3
  from huggingface_hub import HfApi
4
 
5
- # replace this with our token
6
  TOKEN = os.environ.get("HF_TOKEN", None)
7
 
8
  OWNER = "vectara"
@@ -15,8 +15,11 @@ CACHE_PATH=os.getenv("HF_HOME", ".")
15
  # Local caches
16
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
17
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 
 
18
 
 
19
  API = HfApi(token=TOKEN)
20
 
21
- SOURCE_PATH = "/datasets/leaderboard_summaries.csv"
22
- HEM_PATH = 'vectara/hallucination_evaluation_model'
 
2
 
3
  from huggingface_hub import HfApi
4
 
5
+ # replace this with our token
6
  TOKEN = os.environ.get("HF_TOKEN", None)
7
 
8
  OWNER = "vectara"
 
15
  # Local caches
16
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
17
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
18
+ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
19
+ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
20
 
21
+ DEVICE = "cpu"
22
  API = HfApi(token=TOKEN)
23
 
24
+ SOURCE_PATH = "src/datasets/leaderboard_dataset.csv"
25
+ HEM_PATH = 'vectara/hallucination_evaluation_model'
src/leaderboard/read_evals.py CHANGED
@@ -1,33 +1,32 @@
1
  import glob
2
  import json
3
- import math
4
  import os
5
  from dataclasses import dataclass
6
 
7
- import dateutil
8
  import numpy as np
 
9
 
10
- from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, ModelType, Precision, WeightType
12
- from src.submission.check_validity import is_model_on_hub
13
 
14
 
15
  @dataclass
16
  class EvalResult:
17
- eval_name: str # org_model_precision (uid)
18
- full_model: str # org/model (path on hub)
19
- org: str
20
  model: str
21
- revision: str # commit hash, "" if main
22
  results: dict
23
- precision: Precision = Precision.Unknown
24
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
25
- weight_type: WeightType = WeightType.Original # Original or Adapter
26
- architecture: str = "Unknown"
27
  license: str = "?"
28
  likes: int = 0
29
  num_params: int = 0
30
- date: str = "" # submission date of request file
31
  still_on_hub: bool = False
32
 
33
  @classmethod
@@ -39,42 +38,38 @@ class EvalResult:
39
  config = data.get("config")
40
 
41
  # Precision
42
- precision = Precision.from_str(config.get("model_dtype"))
43
 
44
  # Get model and org
45
- org_and_model = config.get("model_name", config.get("model_args", None))
46
- org_and_model = org_and_model.split("/", 1)
47
 
48
- if len(org_and_model) == 1:
49
- org = None
50
- model = org_and_model[0]
51
- result_key = f"{model}_{precision.value.name}"
52
- else:
53
- org = org_and_model[0]
54
- model = org_and_model[1]
55
  result_key = f"{org}_{model}_{precision.value.name}"
56
- full_model = "/".join(org_and_model)
 
57
 
58
- still_on_hub, _, model_config = is_model_on_hub(
59
- full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
60
- )
61
- architecture = "?"
62
- if model_config is not None:
63
- architectures = getattr(model_config, "architectures", None)
64
- if architectures:
65
- architecture = ";".join(architectures)
66
 
67
  # Extract results available in this file (some results are split in several files)
68
- hallucination_eval = data["results"].get("hallucination_eval", {})
69
-
70
- # Extract metrics from hallucination eval
71
- results = {
72
- "HEM Scores": hallucination_eval.get("HEM Scores", None),
73
- "Accuracy": hallucination_eval.get("Accuracy", None),
74
- "Hallucination Rate": hallucination_eval.get("Hallucination Rate", None),
75
- "Answer Rate": hallucination_eval.get("Answer Rate", None),
76
- "Average Summary Length": hallucination_eval.get("Average Summary Length", None),
77
- }
 
78
 
79
  return self(
80
  eval_name=result_key,
@@ -82,7 +77,7 @@ class EvalResult:
82
  org=org,
83
  model=model,
84
  results=results,
85
- precision=precision,
86
  revision= config.get("model_sha", ""),
87
  still_on_hub=still_on_hub,
88
  architecture=architecture
@@ -90,47 +85,44 @@ class EvalResult:
90
 
91
  def update_with_request_file(self, requests_path):
92
  """Finds the relevant request file for the current model and updates info with it"""
93
- request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
 
94
 
95
  try:
96
  with open(request_file, "r") as f:
97
  request = json.load(f)
98
- self.model_type = ModelType.from_str(request.get("model_type", ""))
99
- self.weight_type = WeightType[request.get("weight_type", "Original")]
100
  self.license = request.get("license", "?")
101
  self.likes = request.get("likes", 0)
102
  self.num_params = request.get("params", 0)
103
  self.date = request.get("submitted_time", "")
104
- except Exception:
105
  print(f"Could not find request file for {self.org}/{self.model}")
 
 
106
 
107
  def to_dict(self):
108
  """Converts the Eval Result to a dict compatible with our dataframe display"""
109
- # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
110
- accuracy = self.results.get("Accuracy", None)
111
-
112
  data_dict = {
113
  "eval_name": self.eval_name, # not a column, just a save name,
114
- AutoEvalColumn.precision.name: self.precision.value.name,
115
- AutoEvalColumn.model_type.name: self.model_type.value.name,
116
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
117
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
118
- AutoEvalColumn.architecture.name: self.architecture,
119
- AutoEvalColumn.model.name: make_clickable_model(self.full_model),
120
- AutoEvalColumn.dummy.name: self.full_model,
121
- AutoEvalColumn.revision.name: self.revision,
122
- # AutoEvalColumn.average.name: average,
123
- AutoEvalColumn.accuracy.name: accuracy,
124
- AutoEvalColumn.license.name: self.license,
125
- AutoEvalColumn.likes.name: self.likes,
126
- AutoEvalColumn.params.name: self.num_params,
127
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
128
  }
129
- # for task in Tasks:
130
- # data_dict[task.value.col_name] = self.results[task.value.benchmark]
131
- data_dict["Hallucination Rate"] = self.results.get("Hallucination Rate", None)
132
- data_dict["Answer Rate"] = self.results.get("Answer Rate", None)
133
- data_dict["Average Summary Length"] = self.results.get("Average Summary Length", None)
134
 
135
  return data_dict
136
 
@@ -163,7 +155,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
163
 
164
  for root, _, files in os.walk(results_path):
165
  # We should only have json files in model results
166
- if len(files) == 0 or any([not f.endswith(".json") for f in files]):
167
  continue
168
 
169
  # Sort the files by date
@@ -172,8 +164,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
172
  except dateutil.parser._parser.ParserError:
173
  files = [files[-1]]
174
 
175
- for file in files:
176
- model_result_filepaths.append(os.path.join(root, file))
177
 
178
  eval_results = {}
179
  for model_result_filepath in model_result_filepaths:
@@ -184,7 +175,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
184
  # Store results of same eval together
185
  eval_name = eval_result.eval_name
186
  if eval_name in eval_results.keys():
187
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
 
188
  else:
189
  eval_results[eval_name] = eval_result
190
 
 
1
  import glob
2
  import json
 
3
  import os
4
  from dataclasses import dataclass
5
 
 
6
  import numpy as np
7
+ import dateutil
8
 
9
+ import src.display.formatting as formatting
10
+ import src.display.utils as utils
11
+ import src.submission.check_validity as check_validity
12
 
13
 
14
  @dataclass
15
  class EvalResult:
16
+ eval_name: str # org_model_precision (uid)
17
+ full_model: str # org/model (path on hub)
18
+ org: str
19
  model: str
20
+ revision: str # commit hash, "" if main
21
  results: dict
22
+ precision: utils.Precision = utils.Precision.Unknown
23
+ model_type: utils.ModelType = utils.ModelType.Unknown # Pretrained, fine tuned, ...
24
+ weight_type: utils.WeightType = utils.WeightType.Original # Original or Adapter
25
+ architecture: str = "Unknown"
26
  license: str = "?"
27
  likes: int = 0
28
  num_params: int = 0
29
+ date: str = "" # submission date of request file
30
  still_on_hub: bool = False
31
 
32
  @classmethod
 
38
  config = data.get("config")
39
 
40
  # Precision
41
+ precision = utils.Precision.from_str(config.get("model_dtype"))
42
 
43
  # Get model and org
44
+ full_model = config.get("model_name", config.get("model_args", None))
45
+ org, model = full_model.split("/", 1) if "/" in full_model else (None, full_model)
46
 
47
+ if org:
 
 
 
 
 
 
48
  result_key = f"{org}_{model}_{precision.value.name}"
49
+ else:
50
+ result_key = f"{model}_{precision.value.name}"
51
 
52
+ still_on_hub, _, model_config = check_validity.is_model_on_hub(
53
+ full_model, config.get("model_sha", "main"), trust_remote_code=True,
54
+ test_tokenizer=False)
55
+
56
+ if model_config:
57
+ architecture = ";".join(getattr(model_config, "architectures", ["?"]))
58
+ else:
59
+ architecture = "?"
60
 
61
  # Extract results available in this file (some results are split in several files)
62
+ results = {}
63
+ for task in utils.Tasks:
64
+ task = task.value
65
+
66
+ # We average all scores of a given metric (not all metrics are present in all files)
67
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
68
+ if accs.size == 0 or any([acc is None for acc in accs]):
69
+ continue
70
+
71
+ mean_acc = np.mean(accs) * 100.0
72
+ results[task.benchmark] = mean_acc
73
 
74
  return self(
75
  eval_name=result_key,
 
77
  org=org,
78
  model=model,
79
  results=results,
80
+ precision=precision,
81
  revision= config.get("model_sha", ""),
82
  still_on_hub=still_on_hub,
83
  architecture=architecture
 
85
 
86
  def update_with_request_file(self, requests_path):
87
  """Finds the relevant request file for the current model and updates info with it"""
88
+ request_file = get_request_file_for_model(requests_path, self.full_model,
89
+ self.precision.value.name)
90
 
91
  try:
92
  with open(request_file, "r") as f:
93
  request = json.load(f)
94
+ self.model_type = utils.ModelType.from_str(request.get("model_type", ""))
95
+ self.weight_type = utils.WeightType[request.get("weight_type", "Original")]
96
  self.license = request.get("license", "?")
97
  self.likes = request.get("likes", 0)
98
  self.num_params = request.get("params", 0)
99
  self.date = request.get("submitted_time", "")
100
+ except FileNotFoundError:
101
  print(f"Could not find request file for {self.org}/{self.model}")
102
+ except json.JSONDecodeError:
103
+ print(f"Error decoding JSON in request file for {self.org}/{self.model}")
104
 
105
  def to_dict(self):
106
  """Converts the Eval Result to a dict compatible with our dataframe display"""
107
+
 
 
108
  data_dict = {
109
  "eval_name": self.eval_name, # not a column, just a save name,
110
+ utils.AutoEvalColumn.precision.name: self.precision.value.name,
111
+ utils.AutoEvalColumn.model_type.name: self.model_type.value.name,
112
+ utils.AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
113
+ utils.AutoEvalColumn.weight_type.name: self.weight_type.value.name,
114
+ utils.AutoEvalColumn.architecture.name: self.architecture,
115
+ utils.AutoEvalColumn.model.name: formatting.make_clickable_model(self.full_model),
116
+ utils.AutoEvalColumn.dummy.name: self.full_model,
117
+ utils.AutoEvalColumn.revision.name: self.revision,
118
+ utils.AutoEvalColumn.license.name: self.license,
119
+ utils.AutoEvalColumn.likes.name: self.likes,
120
+ utils.AutoEvalColumn.params.name: self.num_params,
121
+ utils.AutoEvalColumn.still_on_hub.name: self.still_on_hub,
 
 
122
  }
123
+
124
+ for task in utils.Tasks:
125
+ data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
 
126
 
127
  return data_dict
128
 
 
155
 
156
  for root, _, files in os.walk(results_path):
157
  # We should only have json files in model results
158
+ if not files or any([not f.endswith(".json") for f in files]):
159
  continue
160
 
161
  # Sort the files by date
 
164
  except dateutil.parser._parser.ParserError:
165
  files = [files[-1]]
166
 
167
+ model_result_filepaths.extend([os.path.join(root, file) for file in files])
 
168
 
169
  eval_results = {}
170
  for model_result_filepath in model_result_filepaths:
 
175
  # Store results of same eval together
176
  eval_name = eval_result.eval_name
177
  if eval_name in eval_results.keys():
178
+ eval_results[eval_name].results.update({k: v for k, v in
179
+ eval_result.results.items() if v is not None})
180
  else:
181
  eval_results[eval_name] = eval_result
182
 
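The metric-extraction loop added to init_from_json_file assumes each results file is shaped as {benchmark_key: {metric_key: value}} and stores every score as a percentage, skipping metrics that are absent so that a later file can supply them (which is what the results.update(...) at the end of get_raw_eval_results relies on). A self-contained worked example under that assumption; the layout and numbers are illustrative, not taken from the repo:

import numpy as np
from collections import namedtuple

# Stand-ins for the repo's Task/Tasks (illustrative only).
Task = namedtuple("Task", ["benchmark", "metric", "col_name"])
TASKS = [
    Task("accuracy", "accuracy", "Accuracy"),
    Task("hallucination_rate", "hallucination_rate", "Hallucination Rate"),
]

# Assumed results-file layout: {benchmark_key: {metric_key: value}}.
data = {"results": {
    "accuracy": {"accuracy": 0.75},
    "hallucination_rate": {"hallucination_rate": 0.25},
}}

results = {}
for task in TASKS:
    accs = np.array([v.get(task.metric, None)
                     for k, v in data["results"].items() if task.benchmark == k])
    if accs.size == 0 or any(acc is None for acc in accs):
        continue  # metric missing in this file; another results file may provide it
    results[task.benchmark] = float(np.mean(accs) * 100.0)

print(results)  # {'accuracy': 75.0, 'hallucination_rate': 25.0}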
src/populate.py CHANGED
@@ -3,21 +3,21 @@ import os
3
 
4
  import pandas as pd
5
 
6
- from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
- from src.leaderboard.read_evals import get_raw_eval_results
9
 
10
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
- raw_data = get_raw_eval_results(results_path, requests_path)
13
  all_data_json = [v.to_dict() for v in raw_data]
14
 
15
  df = pd.DataFrame.from_records(all_data_json)
16
- df = df.sort_values(by=[AutoEvalColumn.accuracy.name], ascending=False)
17
  df = df[cols].round(decimals=2)
18
 
19
  # filter out if any of the benchmarks have not been produced
20
- df = df[has_no_nan_values(df, benchmark_cols)]
21
  return raw_data, df
22
 
23
 
@@ -31,8 +31,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
31
  with open(file_path) as fp:
32
  data = json.load(fp)
33
 
34
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
35
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
36
 
37
  all_evals.append(data)
38
  elif ".md" not in entry:
@@ -43,8 +43,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
43
  with open(file_path) as fp:
44
  data = json.load(fp)
45
 
46
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
47
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
48
  all_evals.append(data)
49
 
50
  pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
 
3
 
4
  import pandas as pd
5
 
6
+ import src.display.formatting as formatting
7
+ import src.display.utils as utils
8
+ import src.leaderboard.read_evals as read_evals
9
 
10
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
+ raw_data = read_evals.get_raw_eval_results(results_path, requests_path)
13
  all_data_json = [v.to_dict() for v in raw_data]
14
 
15
  df = pd.DataFrame.from_records(all_data_json)
16
+ df = df.sort_values(by=[utils.AutoEvalColumn.accuracy.name], ascending=False)
17
  df = df[cols].round(decimals=2)
18
 
19
  # filter out if any of the benchmarks have not been produced
20
+ df = df[formatting.has_no_nan_values(df, benchmark_cols)]
21
  return raw_data, df
22
 
23
 
 
31
  with open(file_path) as fp:
32
  data = json.load(fp)
33
 
34
+ data[utils.EvalQueueColumn.model.name] = formatting.make_clickable_model(data["model"])
35
+ data[utils.EvalQueueColumn.revision.name] = data.get("revision", "main")
36
 
37
  all_evals.append(data)
38
  elif ".md" not in entry:
 
43
  with open(file_path) as fp:
44
  data = json.load(fp)
45
 
46
+ data[utils.EvalQueueColumn.model.name] = formatting.make_clickable_model(data["model"])
47
+ data[utils.EvalQueueColumn.revision.name] = data.get("revision", "main")
48
  all_evals.append(data)
49
 
50
  pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
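One behaviour of get_leaderboard_df that is easy to miss: after sorting by accuracy and rounding, any model missing even one benchmark column is dropped from the leaderboard entirely. formatting.has_no_nan_values lives in src/display/formatting.py and is not part of this diff, so the pandas sketch below assumes it simply keeps rows whose benchmark columns are all populated:

import pandas as pd

benchmark_cols = ["Accuracy", "Hallucination Rate"]
df = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "Accuracy": [92.1, 88.4],
    "Hallucination Rate": [7.9, None],  # model-b never produced this metric
})

# Assumed behaviour of formatting.has_no_nan_values(df, benchmark_cols):
mask = df[benchmark_cols].notna().all(axis=1)
print(df[mask])  # only model-a survives the filter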
src/submission/check_validity.py CHANGED
@@ -1,8 +1,6 @@
1
  import json
2
  import os
3
- import re
4
  from collections import defaultdict
5
- from datetime import datetime, timedelta, timezone
6
 
7
  import huggingface_hub
8
  from huggingface_hub import ModelCard
@@ -37,11 +35,11 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
37
  try:
38
  config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
39
  if test_tokenizer:
40
- tokenizer_config = get_tokenizer_config(model_name)
41
  if tokenizer_config is not None:
42
  tokenizer_class_candidate = tokenizer_config.get("tokenizer_class", None)
43
  else:
44
- tokenizer_class_candidate = config.tokenizer_class
45
 
46
 
47
  tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
 
1
  import json
2
  import os
 
3
  from collections import defaultdict
 
4
 
5
  import huggingface_hub
6
  from huggingface_hub import ModelCard
 
35
  try:
36
  config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
37
  if test_tokenizer:
38
+ tokenizer_config = get_tokenizer_config(model_name)
39
  if tokenizer_config is not None:
40
  tokenizer_class_candidate = tokenizer_config.get("tokenizer_class", None)
41
  else:
42
+ tokenizer_class_candidate = config.tokenizer_class
43
 
44
 
45
  tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
src/submission/submit.py CHANGED
@@ -2,14 +2,10 @@ import json
2
  import os
3
  from datetime import datetime, timezone
4
 
5
- from src.display.formatting import styled_error, styled_message, styled_warning
6
- from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
- from src.submission.check_validity import (
8
- already_submitted_models,
9
- check_model_card,
10
- get_model_size,
11
- is_model_on_hub,
12
- )
13
 
14
  REQUESTED_MODELS = None
15
  USERS_TO_SUBMISSION_DATES = None
@@ -25,7 +21,7 @@ def add_new_eval(
25
  global REQUESTED_MODELS
26
  global USERS_TO_SUBMISSION_DATES
27
  if not REQUESTED_MODELS:
28
- REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
 
30
  user_name = ""
31
  model_path = model
@@ -37,7 +33,7 @@ def add_new_eval(
37
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
 
39
  if model_type is None or model_type == "":
40
- return styled_error("Please select a model type.")
41
 
42
  # Does the model actually exist?
43
  if revision == "":
@@ -45,32 +41,32 @@ def add_new_eval(
45
 
46
  # Is the model on the hub?
47
  if weight_type in ["Delta", "Adapter"]:
48
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
  if not base_model_on_hub:
50
- return styled_error(f'Base model "{base_model}" {error}')
51
 
52
  if not weight_type == "Adapter":
53
- model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
54
  if not model_on_hub:
55
- return styled_error(f'Model "{model}" {error}')
56
 
57
  # Is the model info correctly filled?
58
  try:
59
- model_info = API.model_info(repo_id=model, revision=revision)
60
  except Exception:
61
- return styled_error("Could not get your model information. Please fill it up properly.")
62
 
63
- model_size = get_model_size(model_info=model_info, precision=precision)
64
 
65
  # Were the model card and license filled?
66
  try:
67
  license = model_info.cardData["license"]
68
  except Exception:
69
- return styled_error("Please select a license for your model")
70
 
71
- modelcard_OK, error_msg = check_model_card(model)
72
  if not modelcard_OK:
73
- return styled_error(error_msg)
74
 
75
  # Seems good, creating the eval
76
  print("Adding new eval")
@@ -91,11 +87,11 @@ def add_new_eval(
91
 
92
  # Check for duplicate submission
93
  if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
94
- return styled_warning("This model has been already submitted.")
95
 
96
  print("Creating eval file")
97
 
98
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
  os.makedirs(OUT_DIR, exist_ok=True)
100
  out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
 
@@ -103,10 +99,10 @@ def add_new_eval(
103
  f.write(json.dumps(eval_entry))
104
 
105
  print("Uploading eval file")
106
- API.upload_file(
107
  path_or_fileobj=out_path,
108
  path_in_repo=out_path.split("eval-queue/")[1],
109
- repo_id=QUEUE_REPO,
110
  repo_type="dataset",
111
  commit_message=f"Add {model} to eval queue",
112
  )
@@ -114,6 +110,6 @@ def add_new_eval(
114
  # Remove the local file
115
  os.remove(out_path)
116
 
117
- return styled_message(
118
  "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
119
  )
 
2
  import os
3
  from datetime import datetime, timezone
4
 
5
+ import src.display.formatting as formatting
6
+ import src.envs as envs
7
+ import src.submission.check_validity as check_validity
8
+
 
 
 
 
9
 
10
  REQUESTED_MODELS = None
11
  USERS_TO_SUBMISSION_DATES = None
 
21
  global REQUESTED_MODELS
22
  global USERS_TO_SUBMISSION_DATES
23
  if not REQUESTED_MODELS:
24
+ REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = check_validity.already_submitted_models(envs.EVAL_REQUESTS_PATH)
25
 
26
  user_name = ""
27
  model_path = model
 
33
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
34
 
35
  if model_type is None or model_type == "":
36
+ return formatting.styled_error("Please select a model type.")
37
 
38
  # Does the model actually exist?
39
  if revision == "":
 
41
 
42
  # Is the model on the hub?
43
  if weight_type in ["Delta", "Adapter"]:
44
+ base_model_on_hub, error, _ = check_validity.is_model_on_hub(model_name=base_model, revision=revision, token=envs.TOKEN, test_tokenizer=True)
45
  if not base_model_on_hub:
46
+ return formatting.styled_error(f'Base model "{base_model}" {error}')
47
 
48
  if not weight_type == "Adapter":
49
+ model_on_hub, error, _ = check_validity.is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
50
  if not model_on_hub:
51
+ return formatting.styled_error(f'Model "{model}" {error}')
52
 
53
  # Is the model info correctly filled?
54
  try:
55
+ model_info = envs.API.model_info(repo_id=model, revision=revision)
56
  except Exception:
57
+ return formatting.styled_error("Could not get your model information. Please fill it up properly.")
58
 
59
+ model_size = check_validity.get_model_size(model_info=model_info, precision=precision)
60
 
61
  # Were the model card and license filled?
62
  try:
63
  license = model_info.cardData["license"]
64
  except Exception:
65
+ return formatting.styled_error("Please select a license for your model")
66
 
67
+ modelcard_OK, error_msg = check_validity.check_model_card(model)
68
  if not modelcard_OK:
69
+ return formatting.styled_error(error_msg)
70
 
71
  # Seems good, creating the eval
72
  print("Adding new eval")
 
87
 
88
  # Check for duplicate submission
89
  if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
90
+ return formatting.styled_warning("This model has been already submitted.")
91
 
92
  print("Creating eval file")
93
 
94
+ OUT_DIR = f"{envs.EVAL_REQUESTS_PATH}/{user_name}"
95
  os.makedirs(OUT_DIR, exist_ok=True)
96
  out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
97
 
 
99
  f.write(json.dumps(eval_entry))
100
 
101
  print("Uploading eval file")
102
+ envs.API.upload_file(
103
  path_or_fileobj=out_path,
104
  path_in_repo=out_path.split("eval-queue/")[1],
105
+ repo_id=envs.QUEUE_REPO,
106
  repo_type="dataset",
107
  commit_message=f"Add {model} to eval queue",
108
  )
 
110
  # Remove the local file
111
  os.remove(out_path)
112
 
113
+ return formatting.styled_message(
114
  "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
115
  )
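A small detail in the upload step above: path_in_repo is derived by splitting the local path on "eval-queue/", which only works because envs.EVAL_REQUESTS_PATH (see src/envs.py in this same commit) ends in eval-queue; the request file then lands in QUEUE_REPO under <user_name>/<file>.json. An illustration with a made-up path:

# Illustrative path only; the real prefix comes from envs.EVAL_REQUESTS_PATH.
out_path = "./eval-queue/someuser/somemodel_eval_request_False_float16_Original.json"
path_in_repo = out_path.split("eval-queue/")[1]
print(path_in_repo)  # someuser/somemodel_eval_request_False_float16_Original.json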
tests/test_evaluate_model.py ADDED
@@ -0,0 +1,87 @@
1
+ import unittest
2
+ from unittest.mock import patch
3
+
4
+ import pandas as pd
5
+
6
+ import src.backend.evaluate_model as evaluate_model
7
+ import src.envs as envs
8
+
9
+
10
+ class TestEvaluator(unittest.TestCase):
11
+
12
+ def setUp(self):
13
+ self.model_name = 'test_model'
14
+ self.revision = 'test_revision'
15
+ self.precision = 'test_precision'
16
+ self.batch_size = 10
17
+ self.device = 'test_device'
18
+ self.no_cache = False
19
+ self.limit = 10
20
+
21
+ @patch('src.backend.evaluate_model.SummaryGenerator')
22
+ @patch('src.backend.evaluate_model.EvaluationModel')
23
+ def test_evaluator_initialization(self, mock_eval_model, mock_summary_generator):
24
+ evaluator = evaluate_model.Evaluator(self.model_name, self.revision,
25
+ self.precision, self.batch_size,
26
+ self.device, self.no_cache, self.limit)
27
+
28
+ mock_summary_generator.assert_called_once_with(self.model_name, self.revision)
29
+ mock_eval_model.assert_called_once_with(envs.HEM_PATH)
30
+ self.assertEqual(evaluator.model, self.model_name)
31
+
32
+ @patch('src.backend.evaluate_model.EvaluationModel')
33
+ @patch('src.backend.evaluate_model.SummaryGenerator')
34
+ def test_evaluator_initialization_error(self, mock_summary_generator, mock_eval_model):
35
+ mock_eval_model.side_effect = Exception('test_exception')
36
+ with self.assertRaises(Exception):
37
+ evaluate_model.Evaluator(self.model_name, self.revision,
38
+ self.precision, self.batch_size,
39
+ self.device, self.no_cache, self.limit)
40
+
41
+ @patch('src.backend.evaluate_model.SummaryGenerator')
42
+ @patch('src.backend.evaluate_model.EvaluationModel')
43
+ @patch('src.backend.evaluate_model.pd.read_csv')
44
+ @patch('src.backend.util.format_results')
45
+ def test_evaluate_method(self, mock_format_results, mock_read_csv, mock_eval_model,
46
+ mock_summary_generator):
47
+ evaluator = evaluate_model.Evaluator(self.model_name, self.revision,
48
+ self.precision, self.batch_size,
49
+ self.device, self.no_cache, self.limit)
50
+
51
+ # Mock setup
52
+ mock_format_results.return_value = {'test': 'result'}
53
+ mock_read_csv.return_value = pd.DataFrame({'column1': ['data1', 'data2']})
54
+ mock_summary_generator.return_value.generate_summaries.return_value = pd.DataFrame({'column1': ['summary1', 'summary2']})
55
+ mock_summary_generator.return_value.avg_length = 100
56
+ mock_summary_generator.return_value.answer_rate = 1.0
57
+ mock_summary_generator.return_value.error_rate = 0.0
58
+ mock_eval_model.return_value.compute_accuracy.return_value = 1.0
59
+ mock_eval_model.return_value.hallucination_rate = 0.0
60
+ mock_eval_model.return_value.evaluate_hallucination.return_value = [0.5]
61
+
62
+ # Method call and assertions
63
+ results = evaluator.evaluate()
64
+ mock_format_results.assert_called_once_with(model_name=self.model_name,
65
+ revision=self.revision,
66
+ precision=self.precision,
67
+ accuracy=1.0, hallucination_rate=0.0,
68
+ answer_rate=1.0, avg_summary_len=100,
69
+ error_rate=0.0)
70
+ mock_read_csv.assert_called_once_with(envs.SOURCE_PATH)
71
+
72
+ @patch('src.backend.evaluate_model.SummaryGenerator')
73
+ @patch('src.backend.evaluate_model.EvaluationModel')
74
+ @patch('src.backend.evaluate_model.pd.read_csv')
75
+ def test_evaluate_with_file_not_found(self, mock_read_csv, mock_eval_model,
76
+ mock_summary_generator):
77
+ mock_read_csv.side_effect = FileNotFoundError('test_exception')
78
+ evaluator = evaluate_model.Evaluator(self.model_name, self.revision,
79
+ self.precision, self.batch_size,
80
+ self.device, self.no_cache, self.limit)
81
+
82
+ with self.assertRaises(FileNotFoundError):
83
+ evaluator.evaluate()
84
+
85
+
86
+ if __name__ == '__main__':
87
+ unittest.main()
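src/backend/evaluate_model.py itself is not included in this commit's diff; what the mocks above pin down is the interface it has to expose -- the constructor signature, the two collaborators it builds, and the keyword arguments it forwards to src.backend.util.format_results. A rough sketch of that inferred contract (not the actual implementation):

# Inferred from the tests above; NOT the repo's implementation.
import pandas as pd

import src.envs as envs
import src.backend.util as util
# Assumption: both collaborators live in src.backend.model_operations,
# as the other test modules in this commit suggest.
from src.backend.model_operations import SummaryGenerator, EvaluationModel


class Evaluator:
    def __init__(self, model, revision, precision, batch_size, device, no_cache, limit):
        self.model = model
        self.revision = revision
        self.precision = precision
        self.summary_generator = SummaryGenerator(model, revision)  # patched in the tests
        self.eval_model = EvaluationModel(envs.HEM_PATH)             # patched in the tests

    def evaluate(self):
        source_df = pd.read_csv(envs.SOURCE_PATH)  # FileNotFoundError propagates, as the last test expects
        summaries = self.summary_generator.generate_summaries(source_df)
        self.eval_model.evaluate_hallucination(summaries)
        return util.format_results(
            model_name=self.model, revision=self.revision, precision=self.precision,
            accuracy=self.eval_model.compute_accuracy(),
            hallucination_rate=self.eval_model.hallucination_rate,
            answer_rate=self.summary_generator.answer_rate,
            avg_summary_len=self.summary_generator.avg_length,
            error_rate=self.summary_generator.error_rate,
        )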
tests/test_evaluator.py ADDED
@@ -0,0 +1,59 @@
1
+ import unittest
2
+ from unittest.mock import patch
3
+
4
+ import pandas as pd
5
+
6
+ import src.backend.model_operations as model_operations
7
+
8
+
9
+ class TestEvaluator(unittest.TestCase):
10
+
11
+ def setUp(self):
12
+ self.model_path = "test_model"
13
+
14
+ @patch("src.backend.model_operations.load_evaluation_model")
15
+ def test_init(self, mock_load_evaluation_model):
16
+ model_operations.EvaluationModel(self.model_path)
17
+ mock_load_evaluation_model.assert_called_once_with(self.model_path)
18
+
19
+ @patch("src.backend.model_operations.load_evaluation_model")
20
+ def test_evaluate_hallucination(self, mock_load_evaluation_model):
21
+ model = model_operations.EvaluationModel(self.model_path)
22
+ df = pd.DataFrame({'source': ['source1', 'source2'], 'summary': ['summary1', 'summary2']})
23
+
24
+ mock_load_evaluation_model.return_value.predict.return_value = [0.8, 0.2]
25
+
26
+ scores = model.evaluate_hallucination(df)
27
+ self.assertEqual(scores, [0.8, 0.2])
28
+
29
+ @patch("src.backend.model_operations.load_evaluation_model")
30
+ def test_evaluate_hallucination_exception(self, mock_load_evaluation_model):
31
+ model = model_operations.EvaluationModel(self.model_path)
32
+ df = pd.DataFrame({'source': ['source1', 'source2'], 'summary': ['summary1', 'summary2']})
33
+
34
+ mock_load_evaluation_model.return_value.predict.side_effect = Exception("Test exception")
35
+
36
+ with self.assertRaises(Exception):
37
+ scores = model.evaluate_hallucination(df)
38
+
39
+ @patch("src.backend.model_operations.load_evaluation_model")
40
+ def test_compute_accuracy(self, mock_load_evaluation_model):
41
+ model = model_operations.EvaluationModel(self.model_path)
42
+ model.scores = [0.8, 0.2]
43
+
44
+ accuracy = model.compute_accuracy()
45
+ expected_accuracy = 50.0
46
+ self.assertEqual(accuracy, expected_accuracy)
47
+
48
+
49
+ class TestLoadEvaluationModel(unittest.TestCase):
50
+
51
+ @patch("src.backend.model_operations.CrossEncoder")
52
+ def test_load_evaluation_model(self, mock_cross_encoder):
53
+ model_path = 'test_model_path'
54
+ model_operations.load_evaluation_model(model_path)
55
+ mock_cross_encoder.assert_called_once_with(model_path)
56
+
57
+
58
+ if __name__ == '__main__':
59
+ unittest.main()
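The compute_accuracy expectation above -- scores [0.8, 0.2] mapping to an accuracy of 50.0 -- is consistent with a 0.5 decision threshold on the HHEM score, under which exactly one of the two summaries counts as factually consistent. A one-line sketch of that assumed rule:

scores = [0.8, 0.2]
accuracy = 100.0 * sum(score >= 0.5 for score in scores) / len(scores)  # -> 50.0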
tests/test_main_backend.py ADDED
@@ -0,0 +1,54 @@
1
+ import unittest
2
+ from unittest.mock import patch
3
+
4
+ import main_backend
5
+ import src.backend.manage_requests as manage_requests
6
+
7
+
8
+ class TestMainBackend(unittest.TestCase):
9
+
10
+ @patch('src.backend.manage_requests.check_completed_evals')
11
+ @patch('src.backend.manage_requests.get_eval_requests')
12
+ @patch('src.backend.sort_queue.sort_models_by_priority')
13
+ @patch('src.backend.manage_requests.set_eval_request')
14
+ @patch('src.backend.run_eval_suite.run_evaluation')
15
+ def test_run_auto_eval_with_pending_requests(self, mock_run_evaluation, mock_set_eval_request,
16
+ mock_sort_models_by_priority, mock_get_eval_requests,
17
+ mock_check_completed_evals):
18
+ mock_sort_models_by_priority.return_value = [manage_requests.EvalRequest(
19
+ model="test_model",
20
+ private=True,
21
+ status="PENDING",
22
+ json_filepath="test_filepath",
23
+ weight_type="test_weight_type",
24
+ precision="test_precision",
25
+ base_model="test_base_model",
26
+ revision="test_revision",
27
+ )]
28
+
29
+ main_backend.run_auto_eval()
30
+
31
+ # Assertions
32
+ mock_check_completed_evals.assert_called()
33
+ mock_get_eval_requests.assert_called()
34
+ mock_sort_models_by_priority.assert_called()
35
+ mock_set_eval_request.assert_called()
36
+ mock_run_evaluation.assert_called()
37
+
38
+ @patch('builtins.print')
39
+ @patch('src.backend.manage_requests.check_completed_evals')
40
+ @patch('src.backend.manage_requests.get_eval_requests')
41
+ def test_run_auto_eval_with_no_pending_requests(self, mock_get_eval_requests,
42
+ mock_check_completed_evals, mock_print):
43
+ mock_get_eval_requests.return_value = []
44
+
45
+ main_backend.run_auto_eval()
46
+
47
+ # Assertions
48
+ mock_check_completed_evals.assert_called()
49
+ mock_get_eval_requests.assert_called()
50
+ mock_print.assert_any_call("No eval requests found. Exiting.")
51
+
52
+
53
+ if __name__ == "__main__":
54
+ unittest.main()
tests/test_summary_generator.py ADDED
@@ -0,0 +1,68 @@
1
+ import unittest
2
+ from unittest.mock import patch
3
+
4
+ import pandas as pd
5
+
6
+ import src.backend.evaluate_model as evaluate_model
7
+
8
+
9
+ class TestSummaryGenerator(unittest.TestCase):
10
+
11
+ def setUp(self):
12
+ self.model_id = "test_model"
13
+ self.revision = "test_revision"
14
+
15
+ @patch("src.backend.model_operations.AutoTokenizer")
16
+ @patch("src.backend.model_operations.AutoModelForCausalLM")
17
+ def test_init(self, mock_model, mock_tokenizer):
18
+ evaluate_model.SummaryGenerator(self.model_id, self.revision)
19
+ mock_tokenizer.from_pretrained.assert_called_once_with(self.model_id,
20
+ self.revision)
21
+ mock_model.from_pretrained.assert_called_once_with(self.model_id,
22
+ self.revision)
23
+
24
+ @patch("src.backend.model_operations.nlp")
25
+ @patch("src.backend.model_operations.AutoTokenizer")
26
+ @patch("src.backend.model_operations.AutoModelForCausalLM")
27
+ def test_generate_summaries(self, mock_model, mock_tokenizer, mock_nlp):
28
+ df = pd.DataFrame({'text': ['text1', 'text2'],
29
+ 'dataset': ['dataset1', 'dataset2']})
30
+
31
+ generator = evaluate_model.SummaryGenerator(self.model_id, self.revision)
32
+ generator.generate_summaries(df)
33
+
34
+ self.assertEqual(len(generator.summaries_df), len(df))
35
+
36
+ @patch("src.backend.model_operations.AutoTokenizer")
37
+ @patch("src.backend.model_operations.AutoModelForCausalLM")
38
+ def test_compute_avg_length(self, mock_model, mock_tokenizer):
39
+ generator = evaluate_model.SummaryGenerator(self.model_id, self.revision)
40
+ test_df = pd.DataFrame({'source': ['text'], 'summary': ['This is a test.'],
41
+ 'dataset': ['dataset']})
42
+ generator.summaries_df = test_df
43
+ generator._compute_avg_length()
44
+ self.assertEqual(generator.avg_length, 4)
45
+
46
+ @patch("src.backend.model_operations.AutoTokenizer")
47
+ @patch("src.backend.model_operations.AutoModelForCausalLM")
48
+ def test_compute_answer_rate(self, mock_model, mock_tokenizer):
49
+ generator = evaluate_model.SummaryGenerator(self.model_id, self.revision)
50
+ test_df = pd.DataFrame({'source': ['text'], 'summary': ['This is a test.'],
51
+ 'dataset': ['dataset']})
52
+ generator.summaries_df = test_df
53
+ generator._compute_answer_rate()
54
+ self.assertEqual(generator.answer_rate, 1)
55
+
56
+ @patch("src.backend.model_operations.AutoTokenizer")
57
+ @patch("src.backend.model_operations.AutoModelForCausalLM")
58
+ def test_error_rate(self, mock_model, mock_tokenizer):
59
+ generator = evaluate_model.SummaryGenerator(self.model_id, self.revision)
60
+ test_df = pd.DataFrame({'source': ['text'], 'summary': ['This is a test.'],
61
+ 'dataset': ['dataset']})
62
+ generator.summaries_df = test_df
63
+ generator._compute_error_rate(0)
64
+ self.assertEqual(generator.error_rate, 0)
65
+
66
+
67
+ if __name__ == "__main__":
68
+ unittest.main()
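Similarly, the _compute_avg_length expectation above -- the single summary "This is a test." yielding an average length of 4 -- suggests the length metric is a word count in which the trailing period does not add a token. A sketch of a computation consistent with that number (the real helper may instead use the mocked spaCy nlp pipeline):

summary = "This is a test."
avg_length = len(summary.split())  # -> 4, matching the test above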