Madhavan Iyengar committed
Commit ab76bab
1 Parent(s): 465d7de

add submission capability

Files changed (2):
  1. app.py +38 -43
  2. src/submission/evaluate.py +111 -0
app.py CHANGED
@@ -1,8 +1,11 @@
 import subprocess
 import gradio as gr
+import zipfile
+import os
+import shutil
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
+from huggingface_hub import snapshot_download, Repository, HfFolder
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -26,13 +29,41 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
+from src.submission.evaluate import calculate_metrics
+
+def handle_new_eval_submission(model_name, model_zip, model_link=None):
+    extraction_path = EVAL_RESULTS_PATH_BACKEND
+
+    if not os.path.exists(extraction_path):
+        os.makedirs(extraction_path)
+
+    # define path for the zip file to be extracted to
+    extraction_path = os.path.join(extraction_path, model_name)
+
+    if model_zip is not None:
+        with zipfile.ZipFile(model_zip, 'r') as zip_ref:
+            zip_ref.extractall(extraction_path)
+        print("File unzipped successfully to:", extraction_path)
+
+    # Evaluate the model's performance
+    calculate_metrics(extraction_path, model_name)
+
+    # upload to results repo
+    API.upload_file(
+        path_or_fileobj=os.path.join(os.getcwd(), EVAL_RESULTS_PATH, '3d-pope', model_name, 'results.json'),
+        path_in_repo=os.path.join('3d-pope', model_name, 'results.json'),
+        repo_id=RESULTS_REPO,
+        repo_type="dataset",
+    )
+
+    restart_space()
+
+    return "Submission received and results are being processed. Please check the leaderboard for updates."
+
 
-def handle_new_eval_submission(model_name, model_zip, model_link):
-    # This is a placeholder for the actual submission logic
-    return "We are not accepting submissions at this time, please check back soon!"
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
@@ -241,49 +272,13 @@ with demo:
         with gr.Column():
             with gr.Row():
                 gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-            # with gr.Column():
-            #     with gr.Accordion(
-            #         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-            #         open=False,
-            #     ):
-            #         with gr.Row():
-            #             finished_eval_table = gr.components.Dataframe(
-            #                 value=finished_eval_queue_df,
-            #                 headers=EVAL_COLS,
-            #                 datatype=EVAL_TYPES,
-            #                 row_count=5,
-            #             )
-            #     with gr.Accordion(
-            #         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-            #         open=False,
-            #     ):
-            #         with gr.Row():
-            #             running_eval_table = gr.components.Dataframe(
-            #                 value=running_eval_queue_df,
-            #                 headers=EVAL_COLS,
-            #                 datatype=EVAL_TYPES,
-            #                 row_count=5,
-            #             )
-
-            #     with gr.Accordion(
-            #         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-            #         open=False,
-            #     ):
-            #         with gr.Row():
-            #             pending_eval_table = gr.components.Dataframe(
-            #                 value=pending_eval_queue_df,
-            #                 headers=EVAL_COLS,
-            #                 datatype=EVAL_TYPES,
-            #                 row_count=5,
-            #             )
             with gr.Row():
                 gr.Markdown("# 📋 Submit your results here!", elem_classes="markdown-text")
 
             with gr.Row():
                 model_name_textbox = gr.Textbox(label="Model name")
                 model_zip_file = gr.File(label="Upload model prediction result ZIP file")
-                model_link_textbox = gr.Textbox(label="Link to model page")
+                # model_link_textbox = gr.Textbox(label="Link to model page")
             with gr.Row():
                 gr.Column()
                 with gr.Column(scale=2):
@@ -292,7 +287,7 @@ with demo:
 
                     submit_button.click(
                         handle_new_eval_submission,
-                        [model_name_textbox, model_zip_file, model_link_textbox],
+                        [model_name_textbox, model_zip_file],
                         submission_result
                     )
                 gr.Column()
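
For orientation, here is a minimal sketch of how a submitter might package predictions for the new upload form. The archive layout and JSON schema are not stated in this commit; they are assumptions inferred from src/submission/evaluate.py below (top-level files named random_*.json / popular_*.json / adversarial_*.json whose first value is a list of entries carrying predicted_answer and ground_truth_answer), and the file name predictions.zip plus the sample answers are hypothetical.

# Sketch only (not part of this commit): build a ZIP in the layout the submission
# handler appears to expect. File names and schema are inferred assumptions.
import json
import zipfile

sample_entries = [
    {"predicted_answer": "Yes, the object is present.", "ground_truth_answer": "yes"},
    {"predicted_answer": "No.", "ground_truth_answer": "no"},
]

with zipfile.ZipFile("predictions.zip", "w") as zf:
    for split in ("random", "popular", "adversarial"):
        # aggregate_metrics reads the first value of each JSON object as the entry list,
        # and scans only the top level of the extracted folder, so the files sit at the root.
        zf.writestr(f"{split}_results.json", json.dumps({"my-model": sample_entries}, indent=2))
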
src/submission/evaluate.py ADDED
@@ -0,0 +1,111 @@
+import json
+
+
+import os
+import json
+from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+import re
+
+from src.envs import EVAL_RESULTS_PATH
+
+def parse_first_word(answer):
+    # Extract the first word and check if it's 'yes' or 'no'
+    first_word = re.split(r'[\s,\.]', answer.lower())[0]
+    if first_word.startswith('yes'):
+        return 'yes'
+    elif first_word.startswith('no'):
+        return 'no'
+    else:
+        return None
+
+def compute_metrics(true_labels, predicted_labels):
+    # Filtering out invalid answers
+    valid_indices = [i for i, label in enumerate(predicted_labels) if label in ['yes', 'no']]
+    filtered_true_labels = [true_labels[i] for i in valid_indices]
+    filtered_predicted_labels = [predicted_labels[i] for i in valid_indices]
+
+    # Calculating metrics
+    accuracy = accuracy_score(filtered_true_labels, filtered_predicted_labels)
+    precision, recall, f1_score, _ = precision_recall_fscore_support(
+        filtered_true_labels, filtered_predicted_labels, average='binary', pos_label='yes')
+
+    yes_ratio = filtered_predicted_labels.count('yes') / len(filtered_predicted_labels) if filtered_predicted_labels else 0
+
+    return {
+        "Accuracy": accuracy,
+        "Precision": precision,
+        "Recall": recall,
+        "F1 Score": f1_score,
+        "Yes Ratio": yes_ratio
+    }
+
+def aggregate_metrics(directory_path):
+    metrics_data = {"random": {"true": [], "pred": [], "invalid": []},
+                    "popular": {"true": [], "pred": [], "invalid": []},
+                    "adversarial": {"true": [], "pred": [], "invalid": []}}
+
+    # Process each file in the directory
+    for filename in os.listdir(directory_path):
+        if filename.endswith(".json"):
+            file_path = os.path.join(directory_path, filename)
+            with open(file_path, 'r') as f:
+                data = json.load(f)
+
+            question_type = filename.split('_')[0]
+            if question_type in metrics_data:
+                for entry in data[next(iter(data))]:
+                    first_word = parse_first_word(entry['predicted_answer'])
+                    if first_word is None:
+                        metrics_data[question_type]["invalid"].append(entry['predicted_answer'])
+                    metrics_data[question_type]["true"].append(entry['ground_truth_answer'].lower())
+                    metrics_data[question_type]["pred"].append(first_word if first_word else entry['predicted_answer'].lower())
+
+    results = {}
+    for q_type, data in metrics_data.items():
+        result = compute_metrics(data["true"], data["pred"])
+        result["Non-Binary Responses Count"] = len(data["invalid"])
+        result["Non-Binary Responses"] = data["invalid"]
+        results[q_type] = result
+
+    return results
+
+def transform_format(data, model_name):
+    # Define the new format's base structure
+    transformed_data = {
+        "config": {
+            "model_name": model_name
+        },
+        "results": {}
+    }
+
+    # Mapping of old keys to new keys
+    key_mapping = {
+        "Accuracy": "accuracy",
+        "Precision": "precision",
+        "Recall": "recall",
+        "F1 Score": "f1_score",
+        "Yes Ratio": "yes_percentage"
+    }
+
+    # Iterate over each item in the original data
+    for model_type, metrics in data.items():
+        for old_key, new_suffix in key_mapping.items():
+            # Format the new key according to the required format 2 style
+            new_key = f"{model_type}_{new_suffix}"
+            # Assign the corresponding value to the new key in the results dictionary
+            transformed_data["results"][new_key] = {
+                new_key: round(metrics[old_key], 4) if isinstance(metrics[old_key], float) else metrics[old_key]
+            }
+
+    return transformed_data
+
+def calculate_metrics(json_output_directory, model_name):
+    final_metrics = aggregate_metrics(json_output_directory)
+    transformed_metrics = transform_format(final_metrics, model_name)
+    # write to a file
+    results_path = os.path.join(EVAL_RESULTS_PATH, '3d-pope', model_name)
+    if not os.path.exists(results_path):
+        os.makedirs(results_path)
+    with open(os.path.join(results_path, 'results.json'), 'w') as f:
+        json.dump(transformed_metrics, f, indent=4)
+    print(json.dumps(final_metrics, indent=4))
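
As a rough usage note, calculate_metrics can also be driven directly on a folder of already-extracted prediction files, which is what handle_new_eval_submission does after unzipping. The following is a sketch under assumptions: the directory path and model name are hypothetical, src.envs must be importable, and EVAL_RESULTS_PATH must point at a writable location.

# Sketch only: direct invocation outside the Gradio handler (hypothetical paths).
from src.submission.evaluate import calculate_metrics

calculate_metrics("/tmp/extracted_predictions", "my-model")

# The results.json written under EVAL_RESULTS_PATH/3d-pope/my-model/ follows the
# shape produced by transform_format above, roughly:
# {
#   "config": {"model_name": "my-model"},
#   "results": {
#     "random_accuracy": {"random_accuracy": ...},
#     "random_precision": {"random_precision": ...},
#     ...
#     "adversarial_yes_percentage": {"adversarial_yes_percentage": ...}
#   }
# }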