Terry Zhuo committed on
Commit
777def4
1 Parent(s): 6e2a72a

use docker and add execution

Browse files
Files changed (4) hide show
  1. Dockerfile +5 -0
  2. README.md +1 -2
  3. app.py +65 -4
  4. src/execute.py +194 -0
Dockerfile ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ FROM bigcodebench/bigcodebench-gradio:latest
2
+ COPY . /app
3
+ EXPOSE 7860
4
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
5
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -3,8 +3,7 @@ title: BigCodeBench Leaderboard
3
  emoji: 🥇
4
  colorFrom: green
5
  colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 4.36.1
8
  app_file: app.py
9
  disable_embedding: true
10
  pinned: false
 
3
  emoji: 🥇
4
  colorFrom: green
5
  colorTo: indigo
6
+ sdk: docker
 
7
  app_file: app.py
8
  disable_embedding: true
9
  pinned: false
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import os
2
  import logging
3
  import time
4
- import schedule
5
  import datetime
6
  import gradio as gr
7
  from threading import Thread
@@ -49,6 +48,7 @@ from src.envs import (
49
  HF_HOME,
50
  )
51
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
 
52
  from src.tools.plots import plot_elo_mle, plot_solve_rate
53
  # from src.voting.vote_system import VoteManager, run_scheduler
54
 
@@ -482,6 +482,7 @@ with main_block as demo:
482
  ], instruct_map)
483
  with gr.TabItem("📝 About", id=3):
484
  gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
 
485
  with gr.TabItem("🔎 Data Viewer", id="viewer"):
486
  search_input = gr.Textbox(label="Search by keyword")
487
  count_output = gr.Number(label="Number of filtered items")
@@ -508,7 +509,67 @@ with main_block as demo:
508
  inputs=[search_input, index_slider, show_test],
509
  outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
510
  )
511
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
512
  with gr.TabItem("🚀 Request", id=4):
513
  gr.Markdown(SUBMISSION_TEXT_3)
514
 
@@ -583,5 +644,5 @@ def update_leaderboard(payload: WebhookPayload) -> None:
583
  webhooks_server.launch()
584
 
585
  scheduler = BackgroundScheduler()
586
- scheduler.add_job(restart_space, "interval", hours=5) # restarted every 3h as backup in case automatic updates are not working
587
- scheduler.start()
 
1
  import os
2
  import logging
3
  import time
 
4
  import datetime
5
  import gradio as gr
6
  from threading import Thread
 
48
  HF_HOME,
49
  )
50
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
51
+ from src.execute import generate_command, is_running, default_command, stream_logs, find_result_file
52
  from src.tools.plots import plot_elo_mle, plot_solve_rate
53
  # from src.voting.vote_system import VoteManager, run_scheduler
54
 
 
482
  ], instruct_map)
483
  with gr.TabItem("📝 About", id=3):
484
  gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
485
+
486
  with gr.TabItem("🔎 Data Viewer", id="viewer"):
487
  search_input = gr.Textbox(label="Search by keyword")
488
  count_output = gr.Number(label="Number of filtered items")
 
509
  inputs=[search_input, index_slider, show_test],
510
  outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
511
  )
512
+
513
+ with gr.TabItem("🛠️ Code Execution (Beta)", id=5):
514
+ gr.Markdown("## Upload your sanitized JSONL file to evaluate")
515
+
516
+ with gr.Row():
517
+ jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
518
+ split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
519
+ subset = gr.Dropdown(choices=["hard (gt pass rate: 100%)", "full (gt pass rate: 99.6%)"], label="Subset", value="hard")
520
+
521
+ with gr.Row():
522
+ parallel = gr.Number(label="Parallel (optional)", precision=0)
523
+ min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
524
+ max_as_limit = gr.Number(label="Max AS Limit", value=25*1024, precision=0)
525
+
526
+ with gr.Row():
527
+ max_data_limit = gr.Number(label="Max Data Limit", value=25*1024, precision=0)
528
+ max_stack_limit = gr.Number(label="Max Stack Limit", value=10, precision=0)
529
+ check_gt_only = gr.Checkbox(label="Check GT Only", value=False, visible=False)
530
+ no_gt = gr.Checkbox(label="No GT", value=False, visible=False)
531
+
532
+ command_output = gr.Textbox(label="Command", value=default_command, interactive=False)
533
+ with gr.Row():
534
+ submit_btn = gr.Button("Run Evaluation")
535
+ download_btn = gr.DownloadButton(label="Download Result")
536
+ log_output = gr.Textbox(label="Execution Logs", lines=20)
537
+
538
+ input_components = [
539
+ jsonl_file, split, subset, parallel,
540
+ min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
541
+ check_gt_only, no_gt
542
+ ]
543
+
544
+ for component in input_components:
545
+ component.change(generate_command, inputs=input_components, outputs=command_output)
546
+
547
+
548
def start_evaluation(command, jsonl_file, subset, split):
    """Drive one evaluation run, streaming (log_output, download_btn) updates.

    Fix: the handler is wired to exactly two outputs
    (``outputs=[log_output, download_btn]``), but the original yielded
    3-tuples, and its trailing ``return`` inside a generator is discarded
    by Gradio, so the completion updates never reached the UI. Every yield
    is now a 2-tuple, and the final status is yielded instead of returned.
    """
    # Result files are named "<subset>_eval_results.json" unless the full
    # subset is used, which has no prefix.
    extra = subset + "_" if subset != "full" else ""
    if jsonl_file is not None:
        result_path = os.path.basename(jsonl_file.name).replace(
            ".jsonl", f"_{extra}eval_results.json"
        )
    else:
        result_path = None

    # Stream cumulative logs; keep the download button pointed at the
    # expected result path while the run is in flight.
    for log in stream_logs(command, jsonl_file):
        if jsonl_file is not None:
            yield log, gr.update(value=result_path, label=result_path)
        else:
            yield log, gr.update()

    result_file = find_result_file()
    if result_file:
        yield (
            gr.update(label="Evaluation completed. Result file found."),
            gr.update(value=result_file),
        )
    else:
        yield (
            gr.update(label="Evaluation completed. No result file found."),
            gr.update(value=result_path),
        )
569
+ submit_btn.click(start_evaluation,
570
+ inputs=[command_output, jsonl_file, subset, split],
571
+ outputs=[log_output, download_btn])
572
+
573
  with gr.TabItem("🚀 Request", id=4):
574
  gr.Markdown(SUBMISSION_TEXT_3)
575
 
 
644
  webhooks_server.launch()
645
 
646
  scheduler = BackgroundScheduler()
647
+ scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h as backup in case automatic updates are not working
648
+ scheduler.start()
src/execute.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import subprocess
3
+ import sys
4
+ import os
5
+ import threading
6
+ import time
7
+ import uuid
8
+ import glob
9
+ import shutil
10
+ from pathlib import Path
11
+
12
# Base CLI entry point for the evaluator; shown in the UI before any input.
default_command = "bigcodebench.evaluate"
# Module-level guard: True while an evaluation subprocess is in flight.
is_running = False

def generate_command(
    jsonl_file, split, subset, parallel,
    min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
    check_gt_only, no_gt
):
    """Build the ``bigcodebench.evaluate`` command line from the UI inputs.

    Returns the full invocation as a single space-joined string, suitable
    for display in the read-only command textbox and later execution.
    """
    args = [default_command]

    if jsonl_file is not None:
        # Stage the uploaded samples file in the working directory so the
        # evaluator can reference it by bare filename.
        staged_name = os.path.basename(jsonl_file.name)
        shutil.copy(jsonl_file.name, staged_name)
        args += ["--samples", staged_name]

    args += ["--split", split, "--subset", subset]

    # None / 0 both mean "use the evaluator's default parallelism".
    if parallel:
        args += ["--parallel", str(int(parallel))]

    args += [
        "--min-time-limit", str(min_time_limit),
        "--max-as-limit", str(int(max_as_limit)),
        "--max-data-limit", str(int(max_data_limit)),
        "--max-stack-limit", str(int(max_stack_limit)),
    ]

    if check_gt_only:
        args.append("--check-gt-only")
    if no_gt:
        args.append("--no-gt")

    return " ".join(args)
47
+
48
+
49
def cleanup_previous_files(jsonl_file):
    """Delete leftover files from earlier runs in the working directory.

    Keeps the app's own files (Dockerfile, app.py, README.md), the
    ``__pycache__`` directory, and — when provided — the freshly uploaded
    samples file.

    Fix: ``glob("*")`` also yields directories, and ``os.remove`` raises on
    them; the original logged a spurious error for every directory on every
    run. Only regular files are unlinked now.

    Parameters
    ----------
    jsonl_file : file-like or None
        Uploaded JSONL whose staged local copy must survive cleanup.
    """
    keep = {"Dockerfile", "app.py", "README.md", "__pycache__"}
    if jsonl_file is not None:
        keep.add(os.path.basename(jsonl_file.name))
    for entry in glob.glob("*"):
        if entry in keep or not os.path.isfile(entry):
            continue
        try:
            os.remove(entry)
        except OSError as e:
            # Best-effort cleanup: report and keep going.
            print(f"Error during cleanup of {entry}: {e}")
60
+
61
def find_result_file():
    """Return the most recently modified ``*.json`` in the CWD, or ``None``."""
    candidates = glob.glob("*.json")
    if not candidates:
        return None
    # Newest by filesystem modification time wins.
    return max(candidates, key=os.path.getmtime)
66
+
67
def run_bigcodebench(command):
    """Execute the evaluation *command*, yielding its combined stdout/stderr.

    A module-level ``is_running`` flag serializes executions; a watchdog
    timer terminates the subprocess if it runs longer than 12 minutes.

    Fixes vs. the original:
    - the timer callback contained a ``yield``, which made it a generator
      function — ``threading.Timer`` called it, got a generator object, and
      never advanced it, so the timeout never actually killed the process
      (and its ``is_running = False`` was a dead local assignment);
    - ``process.wait()`` is called before reading ``returncode``, which can
      otherwise still be ``None`` after stdout reaches EOF.
    """
    global is_running
    if is_running:
        yield "A command is already running. Please wait for it to finish.\n"
        return
    is_running = True

    try:
        yield f"Executing command: {command}\n"

        # NOTE: naive whitespace split — assumes no quoted/spaced arguments.
        process = subprocess.Popen(
            command.split(),
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        )

        timed_out = threading.Event()

        def kill_process():
            # Runs on the timer thread: terminate the child if still alive.
            if process.poll() is None:
                timed_out.set()
                process.terminate()

        # Watchdog: terminate the evaluation after 12 minutes.
        timer = threading.Timer(720, kill_process)
        timer.start()

        for line in process.stdout:
            yield line

        # Reap the child so returncode is populated before we inspect it.
        process.wait()
        timer.cancel()

        if timed_out.is_set():
            yield "Process terminated after 12 minutes timeout.\n"
        if process.returncode != 0:
            yield f"Error: Command exited with status {process.returncode}\n"

        yield "Evaluation completed.\n"

        result_file = find_result_file()
        if result_file:
            yield f"Result file found: {result_file}\n"
        else:
            yield "No result file found.\n"
    finally:
        # Always release the lock, even if the caller abandons the generator.
        is_running = False
108
+
109
def stream_logs(command, jsonl_file=None):
    """Run *command* and yield the cumulative log text after each new line.

    Refuses to start while another evaluation is in flight; otherwise
    clears artifacts from previous runs before executing.
    """
    global is_running

    if is_running:
        yield "A command is already running. Please wait for it to finish.\n"
        return

    # Remove artifacts from any earlier evaluation before starting.
    cleanup_previous_files(jsonl_file)
    yield "Cleaned up previous files.\n"

    accumulated = []
    for chunk in run_bigcodebench(command):
        accumulated.append(chunk)
        # Yield the whole transcript so the UI textbox shows full history.
        yield "".join(accumulated)
123
+
124
+ # with gr.Blocks() as demo:
125
+ # gr.Markdown("# BigCodeBench Evaluator")
126
+
127
+ # with gr.Row():
128
+ # jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
129
+ # split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
130
+ # subset = gr.Dropdown(choices=["hard", "full"], label="Subset", value="hard")
131
+
132
+ # with gr.Row():
133
+ # parallel = gr.Number(label="Parallel (optional)", precision=0)
134
+ # min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
135
+ # max_as_limit = gr.Number(label="Max AS Limit", value=25*1024, precision=0)
136
+
137
+ # with gr.Row():
138
+ # max_data_limit = gr.Number(label="Max Data Limit", value=25*1024, precision=0)
139
+ # max_stack_limit = gr.Number(label="Max Stack Limit", value=10, precision=0)
140
+ # check_gt_only = gr.Checkbox(label="Check GT Only")
141
+ # no_gt = gr.Checkbox(label="No GT")
142
+
143
+ # command_output = gr.Textbox(label="Command", value=default_command, interactive=False)
144
+ # with gr.Row():
145
+ # submit_btn = gr.Button("Run Evaluation")
146
+ # download_btn = gr.DownloadButton(label="Download Result")
147
+ # log_output = gr.Textbox(label="Execution Logs", lines=20)
148
+
149
+ # input_components = [
150
+ # jsonl_file, split, subset, parallel,
151
+ # min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
152
+ # check_gt_only, no_gt
153
+ # ]
154
+
155
+ # for component in input_components:
156
+ # component.change(generate_command, inputs=input_components, outputs=command_output)
157
+
158
+
159
+ # def start_evaluation(command, jsonl_file, subset, split):
160
+ # extra = subset + "_" if subset != "full" else ""
161
+ # if jsonl_file is not None:
162
+ # result_path = os.path.basename(jsonl_file.name).replace(".jsonl", f"_{extra}eval_results.json")
163
+ # else:
164
+ # result_path = None
165
+
166
+ # for log in stream_logs(command, jsonl_file):
167
+ # if jsonl_file is not None:
168
+ # yield log, gr.update(value=result_path, label=result_path), gr.update()
169
+ # else:
170
+ # yield log, gr.update(), gr.update()
171
+ # result_file = find_result_file()
172
+ # if result_file:
173
+ # return gr.update(label="Evaluation completed. Result file found."), gr.update(value=result_file)
174
+ # # gr.Button(visible=False)#,
175
+ # # gr.DownloadButton(label="Download Result", value=result_file, visible=True))
176
+ # else:
177
+ # return gr.update(label="Evaluation completed. No result file found."), gr.update(value=result_path)
178
+ # # gr.Button("Run Evaluation", visible=True),
179
+ # # gr.DownloadButton(visible=False))
180
+ # submit_btn.click(start_evaluation,
181
+ # inputs=[command_output, jsonl_file, subset, split],
182
+ # outputs=[log_output, download_btn])
183
+
184
+ # REPO_ID = "bigcode/bigcodebench-evaluator"
185
+ # HF_TOKEN = os.environ.get("HF_TOKEN", None)
186
+ # API = HfApi(token=HF_TOKEN)
187
+
188
+ # def restart_space():
189
+ # API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
190
+
191
+ # demo.queue(max_size=300).launch(share=True, server_name="0.0.0.0", server_port=7860)
192
+ # scheduler = BackgroundScheduler()
193
+ # scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h as backup in case automatic updates are not working
194
+ # scheduler.start()