Spaces:

arshy
/

leaderboard-gradio

Runtime error

App Files Files Community

arshy committed on Apr 8

Commit

5348813

•

1 Parent(s): 1bea577

updates

Browse files

Files changed (2) hide show

app.py +12 -5
tabs/run_benchmark.py +4 -4

app.py CHANGED Viewed

@@ -17,14 +17,14 @@ from tabs.run_benchmark import run_benchmark_main
 demo = gr.Blocks()
-def run_benchmark_gradio(tool_name, model_name, openai_api_key, anthropic_api_key):
     """Run the benchmark using inputs."""
     if tool_name is None:
         return "Please enter the name of your tool."
     if openai_api_key is None and anthropic_api_key is None:
         return "Please enter either OpenAI or Anthropic API key."
-    result = run_benchmark_main(tool_name, model_name, openai_api_key, anthropic_api_key)
     if result == 'completed':
         # get the results file in the results directory
         fns = glob('results/*.csv')
@@ -106,8 +106,8 @@ with demo:
                         "claude-prediction-offline",
                         "claude-prediction-online",
                         'prediction-request-rag',
-                        # "prediction-with-research-conservative",
-                        # "prediction-with-research-bold",
                         "prediction-request-reasoning-claude",
                         "prediction-request-rag-claude",
                         "prediction-url-cot-claude",
@@ -122,6 +122,13 @@ with demo:
             with gr.Row():
                 openai_api_key = gr.Textbox(label="OpenAI API Key", placeholder="Enter your OpenAI API key here", type="password")
                 anthropic_api_key = gr.Textbox(label="Anthropic API Key", placeholder="Enter your Anthropic API key here", type="password")
             with gr.Row():
                 run_button = gr.Button("Run Benchmark")
             with gr.Row():
@@ -132,7 +139,7 @@ with demo:
                     summary = gr.Dataframe()
             run_button.click(run_benchmark_gradio,
-                            inputs=[tool_name, model_name, openai_api_key, anthropic_api_key],
                             outputs=[result, summary])
 demo.queue(default_concurrency_limit=40).launch()

 demo = gr.Blocks()
+def run_benchmark_gradio(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key):
     """Run the benchmark using inputs."""
     if tool_name is None:
         return "Please enter the name of your tool."
     if openai_api_key is None and anthropic_api_key is None:
         return "Please enter either OpenAI or Anthropic API key."
+    result = run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key)
     if result == 'completed':
         # get the results file in the results directory
         fns = glob('results/*.csv')
                         "claude-prediction-offline",
                         "claude-prediction-online",
                         'prediction-request-rag',
+                        "prediction-with-research-conservative",
+                        "prediction-with-research-bold",
                         "prediction-request-reasoning-claude",
                         "prediction-request-rag-claude",
                         "prediction-url-cot-claude",
             with gr.Row():
                 openai_api_key = gr.Textbox(label="OpenAI API Key", placeholder="Enter your OpenAI API key here", type="password")
                 anthropic_api_key = gr.Textbox(label="Anthropic API Key", placeholder="Enter your Anthropic API key here", type="password")
+            with gr.Row():
+                num_questions = gr.Slider(
+                                    minimum=1,
+                                    maximum=340,
+                                    value=10,
+                                    label="Number of questions to run the benchmark on",
+                                )
             with gr.Row():
                 run_button = gr.Button("Run Benchmark")
             with gr.Row():
                     summary = gr.Dataframe()
             run_button.click(run_benchmark_gradio,
+                            inputs=[tool_name, model_name, num_questions, openai_api_key, anthropic_api_key],
                             outputs=[result, summary])
 demo.queue(default_concurrency_limit=40).launch()

tabs/run_benchmark.py CHANGED Viewed

@@ -2,16 +2,16 @@ import os
 from benchmark.run_benchmark import run_benchmark
-def run_benchmark_main(tool_name, model_name, openai_api_key, anthropic_api_key):
     """Run the benchmark using the provided function and API key."""
     # Empyt the results directory
     os.system("rm -rf results/*")
-    print(f"Running benchmark with the following parameters: {tool_name}, {model_name}, {openai_api_key}, {anthropic_api_key}")
     # Set the benchmark parameters
     kwargs = {}
-    kwargs["num_questions"] = 2
     kwargs["tools"] = [tool_name]
     if model_name:
         kwargs["model"] = model_name

 from benchmark.run_benchmark import run_benchmark
+def run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key):
     """Run the benchmark using the provided function and API key."""
     # Empyt the results directory
     os.system("rm -rf results/*")
     # Set the benchmark parameters
     kwargs = {}
+    if not num_questions:
+        num_questions = 10
+    kwargs["num_questions"] = num_questions
     kwargs["tools"] = [tool_name]
     if model_name:
         kwargs["model"] = model_name