Madhavan Iyengar committed
Commit ab76bab
1 Parent(s): 465d7de

add submission capability

Files changed (2):
  1. app.py +38 -43
  2. src/submission/evaluate.py +111 -0
app.py CHANGED
@@ -1,8 +1,11 @@
 import subprocess
 import gradio as gr
+import zipfile
+import os
+import shutil
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
+from huggingface_hub import snapshot_download, Repository, HfFolder
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -26,13 +29,41 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
+from src.submission.evaluate import calculate_metrics
+
+def handle_new_eval_submission(model_name, model_zip, model_link=None):
+    extraction_path = EVAL_RESULTS_PATH_BACKEND
+
+    if not os.path.exists(extraction_path):
+        os.makedirs(extraction_path)
+
+    # define path for the zip file to be extracted to
+    extraction_path = os.path.join(extraction_path, model_name)
+
+    if model_zip is not None:
+        with zipfile.ZipFile(model_zip, 'r') as zip_ref:
+            zip_ref.extractall(extraction_path)
+        print("File unzipped successfully to:", extraction_path)
+
+    # Evaluate the model's performance
+    calculate_metrics(extraction_path, model_name)
+
+    # upload to results repo
+    API.upload_file(
+        path_or_fileobj=os.path.join(os.getcwd(), EVAL_RESULTS_PATH, '3d-pope', model_name, 'results.json'),
+        path_in_repo=os.path.join('3d-pope', model_name, 'results.json'),
+        repo_id=RESULTS_REPO,
+        repo_type="dataset",
+    )
+
+    restart_space()
+
+    return "Submission received and results are being processed. Please check the leaderboard for updates."
+
 
-def handle_new_eval_submission(model_name, model_zip, model_link):
-    # This is a placeholder for the actual submission logic
-    return "We are not accepting submissions at this time, please check back soon!"
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
@@ -241,49 +272,13 @@ with demo:
         with gr.Column():
             with gr.Row():
                 gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-            # with gr.Column():
-            #     with gr.Accordion(
-            #         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-            #         open=False,
-            #     ):
-            #         with gr.Row():
-            #             finished_eval_table = gr.components.Dataframe(
-            #                 value=finished_eval_queue_df,
-            #                 headers=EVAL_COLS,
-            #                 datatype=EVAL_TYPES,
-            #                 row_count=5,
-            #             )
-            #     with gr.Accordion(
-            #         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-            #         open=False,
-            #     ):
-            #         with gr.Row():
-            #             running_eval_table = gr.components.Dataframe(
-            #                 value=running_eval_queue_df,
-            #                 headers=EVAL_COLS,
-            #                 datatype=EVAL_TYPES,
-            #                 row_count=5,
-            #             )
-
-            #     with gr.Accordion(
-            #         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-            #         open=False,
-            #     ):
-            #         with gr.Row():
-            #             pending_eval_table = gr.components.Dataframe(
-            #                 value=pending_eval_queue_df,
-            #                 headers=EVAL_COLS,
-            #                 datatype=EVAL_TYPES,
-            #                 row_count=5,
-            #             )
             with gr.Row():
                 gr.Markdown("# 📋 Submit your results here!", elem_classes="markdown-text")
 
             with gr.Row():
                 model_name_textbox = gr.Textbox(label="Model name")
                 model_zip_file = gr.File(label="Upload model prediction result ZIP file")
-                model_link_textbox = gr.Textbox(label="Link to model page")
+                # model_link_textbox = gr.Textbox(label="Link to model page")
             with gr.Row():
                 gr.Column()
                 with gr.Column(scale=2):
@@ -292,7 +287,7 @@ with demo:
 
                     submit_button.click(
                         handle_new_eval_submission,
-                        [model_name_textbox, model_zip_file, model_link_textbox],
+                        [model_name_textbox, model_zip_file],
                         submission_result
                     )
                 gr.Column()
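
For orientation, here is a minimal sketch of how a submitter might package predictions for the new upload form. The archive layout and JSON schema are not stated in this commit; they are assumptions inferred from src/submission/evaluate.py below (top-level files named random_*.json / popular_*.json / adversarial_*.json whose first value is a list of entries carrying predicted_answer and ground_truth_answer), and the file name predictions.zip plus the sample answers are hypothetical.

# Sketch only (not part of this commit): build a ZIP in the layout the submission
# handler appears to expect. File names and schema are inferred assumptions.
import json
import zipfile

sample_entries = [
    {"predicted_answer": "Yes, the object is present.", "ground_truth_answer": "yes"},
    {"predicted_answer": "No.", "ground_truth_answer": "no"},
]

with zipfile.ZipFile("predictions.zip", "w") as zf:
    for split in ("random", "popular", "adversarial"):
        # aggregate_metrics reads the first value of each JSON object as the entry list,
        # and scans only the top level of the extracted folder, so the files sit at the root.
        zf.writestr(f"{split}_results.json", json.dumps({"my-model": sample_entries}, indent=2))
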
src/submission/evaluate.py ADDED
@@ -0,0 +1,111 @@
+import json
+
+
+import os
+import json
+from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+import re
+
+from src.envs import EVAL_RESULTS_PATH
+
+def parse_first_word(answer):
+    # Extract the first word and check if it's 'yes' or 'no'
+    first_word = re.split(r'[\s,\.]', answer.lower())[0]
+    if first_word.startswith('yes'):
+        return 'yes'
+    elif first_word.startswith('no'):
+        return 'no'
+    else:
+        return None
+
+def compute_metrics(true_labels, predicted_labels):
+    # Filtering out invalid answers
+    valid_indices = [i for i, label in enumerate(predicted_labels) if label in ['yes', 'no']]
+    filtered_true_labels = [true_labels[i] for i in valid_indices]
+    filtered_predicted_labels = [predicted_labels[i] for i in valid_indices]
+
+    # Calculating metrics
+    accuracy = accuracy_score(filtered_true_labels, filtered_predicted_labels)
+    precision, recall, f1_score, _ = precision_recall_fscore_support(
+        filtered_true_labels, filtered_predicted_labels, average='binary', pos_label='yes')
+
+    yes_ratio = filtered_predicted_labels.count('yes') / len(filtered_predicted_labels) if filtered_predicted_labels else 0
+
+    return {
+        "Accuracy": accuracy,
+        "Precision": precision,
+        "Recall": recall,
+        "F1 Score": f1_score,
+        "Yes Ratio": yes_ratio
+    }
+
+def aggregate_metrics(directory_path):
+    metrics_data = {"random": {"true": [], "pred": [], "invalid": []},
+                    "popular": {"true": [], "pred": [], "invalid": []},
+                    "adversarial": {"true": [], "pred": [], "invalid": []}}
+
+    # Process each file in the directory
+    for filename in os.listdir(directory_path):
+        if filename.endswith(".json"):
+            file_path = os.path.join(directory_path, filename)
+            with open(file_path, 'r') as f:
+                data = json.load(f)
+
+            question_type = filename.split('_')[0]
+            if question_type in metrics_data:
+                for entry in data[next(iter(data))]:
+                    first_word = parse_first_word(entry['predicted_answer'])
+                    if first_word is None:
+                        metrics_data[question_type]["invalid"].append(entry['predicted_answer'])
+                    metrics_data[question_type]["true"].append(entry['ground_truth_answer'].lower())
+                    metrics_data[question_type]["pred"].append(first_word if first_word else entry['predicted_answer'].lower())
+
+    results = {}
+    for q_type, data in metrics_data.items():
+        result = compute_metrics(data["true"], data["pred"])
+        result["Non-Binary Responses Count"] = len(data["invalid"])
+        result["Non-Binary Responses"] = data["invalid"]
+        results[q_type] = result
+
+    return results
+
+def transform_format(data, model_name):
+    # Define the new format's base structure
+    transformed_data = {
+        "config": {
+            "model_name": model_name
+        },
+        "results": {}
+    }
+
+    # Mapping of old keys to new keys
+    key_mapping = {
+        "Accuracy": "accuracy",
+        "Precision": "precision",
+        "Recall": "recall",
+        "F1 Score": "f1_score",
+        "Yes Ratio": "yes_percentage"
+    }
+
+    # Iterate over each item in the original data
+    for model_type, metrics in data.items():
+        for old_key, new_suffix in key_mapping.items():
+            # Format the new key according to the required format 2 style
+            new_key = f"{model_type}_{new_suffix}"
+            # Assign the corresponding value to the new key in the results dictionary
+            transformed_data["results"][new_key] = {
+                new_key: round(metrics[old_key], 4) if isinstance(metrics[old_key], float) else metrics[old_key]
+            }
+
+    return transformed_data
+
+def calculate_metrics(json_output_directory, model_name):
+    final_metrics = aggregate_metrics(json_output_directory)
+    transformed_metrics = transform_format(final_metrics, model_name)
+    # write to a file
+    results_path = os.path.join(EVAL_RESULTS_PATH, '3d-pope', model_name)
+    if not os.path.exists(results_path):
+        os.makedirs(results_path)
+    with open(os.path.join(results_path, 'results.json'), 'w') as f:
+        json.dump(transformed_metrics, f, indent=4)
+    print(json.dumps(final_metrics, indent=4))
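
As a rough usage note, calculate_metrics can also be driven directly on a folder of already-extracted prediction files, which is what handle_new_eval_submission does after unzipping. The following is a sketch under assumptions: the directory path and model name are hypothetical, src.envs must be importable, and EVAL_RESULTS_PATH must point at a writable location.

# Sketch only: direct invocation outside the Gradio handler (hypothetical paths).
from src.submission.evaluate import calculate_metrics

calculate_metrics("/tmp/extracted_predictions", "my-model")

# The results.json written under EVAL_RESULTS_PATH/3d-pope/my-model/ follows the
# shape produced by transform_format above, roughly:
# {
#   "config": {"model_name": "my-model"},
#   "results": {
#     "random_accuracy": {"random_accuracy": ...},
#     "random_precision": {"random_precision": ...},
#     ...
#     "adversarial_yes_percentage": {"adversarial_yes_percentage": ...}
#   }
# }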