import json
import re
import pandas as pd
from tqdm import tqdm
from utils import (
sample_from_two_lists,
get_dfs_info,
get_tool,
filter_code,
read_jsonl,
filter_cot,
timeout,
TimeoutException,
execute_with_timeout,
load_json,
save_json,
)
from table_qa_execution_eval.sft_prompt import (
prompt_with_format_list,
prompt_with_instruction_list,
)
from inference import (
generate_outputs,
load_model,
load_tokenizer_and_template,
get_infer_kwargs,
)
import os
import argparse
import shutil
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, TimeoutError
from joblib import Parallel, delayed
CODE_PREFIX = """import matplotlib.pyplot as plt
from mplfonts import use_font
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
# Fixing Chinese font issues
use_font("Noto Serif CJK SC")
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False\n"""
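# Note: CODE_PREFIX is prepended to every generated snippet before it is executed by
# tool.run() below, so generated code can assume pandas/numpy/plotting imports and a
# CJK-capable font are already set up.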
def format_inputs(test_datas: list[dict], args) -> list[list[dict]]:
    """Format inputs into the message structure required for inference."""
    # Assemble the samples that need inference into chat-style messages.
format_message_datas = []
for idx, test_dt in enumerate(test_datas):
if args.slim:
messages = test_dt["message"]
else:
instruction = test_dt["instruction"]
table_info = test_dt["table_info"]
df_info_simple_str = test_dt["df_info_simple_str"]
instruction = instruction.replace(table_info, df_info_simple_str)
messages = [{"role": "user", "content": instruction}]
format_message_datas.append(messages)
return format_message_datas
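# Illustrative shape of one entry returned by format_inputs (assumed example, not real data):
#   [{"role": "user", "content": "<instruction with table_info replaced by df_info_simple_str>"}]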
def eval_outputs_parallel(
    llm_output: str,
    test_data: dict,
    args,
) -> dict:
df_paths = test_data["table_paths"]
df_names = test_data["df_names"]
query = test_data["query"]
table_paths = test_data["table_paths"]
df = [pd.read_csv(path, low_memory=False) for path in df_paths]
if args.slim:
# tool = get_tool(df, df_names)
tool = get_tool(df)
instruction = test_data["message"]
else:
tool = get_tool(df, df_names)
instruction = test_data["instruction"]
table_info = test_data["table_info"]
df_info_simple_str = test_data["df_info_simple_str"]
instruction = instruction.replace(table_info, df_info_simple_str)
code, _ = filter_code(llm_output)
# cot = filter_cot(llm_output)
eval_result_sample = {}
    # Code that times out is treated as faulty. tool.run() may print extra content
    # during execution, which does not affect the result.
try:
        # If the generated code is empty (no code could be parsed), treat it as the LLM
        # failing to understand the observation or instruction, and report a Code Error.
        if not code:
            observe = "Code Error: output empty code.."
        elif 'df.explode("Candidate")' in code:
            raise ValueError("df.explode error")
else:
            with timeout(15):  # set a 15-second timeout for code execution
pure_code = CODE_PREFIX + code
# print("pure code:", pure_code)
observe = tool.run(pure_code) # 需要监控超时的代码块
# observe = execute_with_timeout(pure_code, 15, tool)
if isinstance(observe, pd.DataFrame):
observe = observe.head().to_markdown(index=False)
else:
observe = str(observe)
except TimeoutException as e:
observe = f"Timeout Error: code running time exceed 15s.."
except SystemExit as e:
observe = f"SystemExit Error: {str(e)}"
except Exception as e:
observe = f"Unexpected Error: {str(e)}"
eval_result_sample["code"] = code
eval_result_sample["llm_output"] = llm_output
eval_result_sample["observe"] = observe
eval_result_sample["flag"] = execution_eval(observe)
eval_result_sample["query"] = query
eval_result_sample["table_paths"] = table_paths
eval_result_sample["instruction"] = instruction
return eval_result_sample
def execution_eval(observe: str) -> bool:
"""
Test whether the code generated by eval_llm can be executed.
:param output: output code of llm generation
:return: True or False
"""
# 只要执行结果中不出现error 或者 exception, 就认为代码可执行
pattern = re.compile(r"error|exception", re.IGNORECASE)
try:
res = not pattern.search(observe)
    except Exception:
res = True
return res
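# Illustrative behaviour of the check above (assumed examples, not taken from the test set):
#   execution_eval("| name | count |\n| Foo | 3 |")    -> True
#   execution_eval("Unexpected Error: KeyError('x')")  -> False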
def main(args):
eval_dataset_path = args.eval_dataset_path
eval_results_save_path = args.eval_results_save_path
model_path = args.model_path
max_model_len = args.max_model_len
template = args.template
gpus_num = args.gpus_num
model_kwargs = get_infer_kwargs(args)
print("Load model...")
llm_model = load_model(model_path, max_model_len, gpus_num)
tokenizer = load_tokenizer_and_template(model_path, template)
test_datas = load_json(eval_dataset_path)
    format_message_datas = format_inputs(test_datas, args)
print("Generating eval answers now..")
model_outputs = generate_outputs(
format_message_datas, llm_model, tokenizer, model_kwargs
)
# with open("model_output.json","w")as f:
# json.dump(model_outputs,f,ensure_ascii=False)
print("Generating answers finished..")
eval_answers = Parallel(n_jobs=48)(
        delayed(eval_outputs_parallel)(model_outputs[i]["output_text"], test_datas[i], args)
for i in range(len(test_datas))
)
# calculate execute rate
execute_passed = 0
total_len = len(eval_answers)
for eval_answer in eval_answers:
execute_passed += int(eval_answer["flag"])
print(f"Sample length: {total_len}. ")
print(
f"Execute Passed: {execute_passed}." f"\tExecute pass-rate is:",
round(execute_passed / total_len, 3),
)
# save eval result
with open(eval_results_save_path, "w", encoding="utf-8") as f:
json.dump(eval_answers, f, ensure_ascii=False)
if __name__ == "__main__":
    # Check whether the images directory exists and is writable.
output_dir = Path(__file__).parent / "images"
if os.path.exists(output_dir):
if not os.access(output_dir, os.W_OK):
shutil.rmtree(output_dir)
os.makedirs(output_dir)
os.chmod(output_dir, 0o777)
print("not write permission, makedir:", output_dir)
else:
print(f"{output_dir} exists!")
else:
os.makedirs(output_dir)
os.chmod(output_dir, 0o777)
print("makedir:", output_dir)
parser = argparse.ArgumentParser(description="eval tableqa python code")
parser.add_argument(
"--gpus_num", type=int, default=1, help="the number of GPUs you want to use."
)
parser.add_argument(
"--temperature", type=float, default=0.01, help="Temperature setting"
)
parser.add_argument(
"--template",
type=str,
choices=[None, "llama3", "baichuan", "chatglm"],
default=None,
help="The template must be specified if not present in the config file",
)
parser.add_argument(
"--model_path", type=str, required=True, help="Path to the model"
)
parser.add_argument(
"--model_type",
choices=["base_model", "chat_model"],
default="chat_model",
help="Base model or Chat model",
)
parser.add_argument(
"--slim",
action="store_true",
help="slim data format",
)
parser.add_argument(
"--max_new_tokens",
type=int,
default=1024,
help="Maximum number of output tokens",
)
parser.add_argument("--max_model_len", type=int, default=8192, help="Cutoff length")
parser.add_argument(
"--eval_dataset_path",
type=str,
default="table_related_benchmarks/evalset/table_qa_execuate_test/test_datas_zuizong_filter.json",
help="Test Set Path",
)
parser.add_argument(
"--eval_results_save_path",
type=str,
default="output/result_table_qa.json",
help="Max iteration for llm to run each code correction task",
)
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
main(args)
"""
python run_eval.py --model_path /data0/pretrained-models/Qwen2-7B-Instruct
"""