TableGPT2-7B / evaluation /table_related_benchmarks /inference_encoder.py

upgrade: add benchmarks eval

2a26d3b 11 days ago

5.82 kB

	import pandas as pd
	from vllm import LLM
	from vllm.sampling_params import SamplingParams
	import copy


	def extract_contrastive_table(df: pd.DataFrame):
	    """Describe every column of ``df`` as a plain dictionary.

	    Args:
	        df: Table to describe.

	    Returns:
	        ``{"columns": [...]}`` with one entry per column, in column order.
	        Each entry holds the column ``name``, its ``dtype`` as a string,
	        the boolean flags ``contains_nan`` and ``is_unique``, and the
	        complete list of ``values``.
	    """
	    columns = []
	    for col in df.columns:
	        series = df[col]
	        columns.append(
	            {
	                "name": col,
	                "dtype": str(series.dtype),
	                # The reduction may hand back a numpy.bool_; cast it so the
	                # flag is a plain Python bool like the values from tolist().
	                "contains_nan": bool(series.isnull().any()),
	                # nunique() skips missing values by default, so a column
	                # containing NaN is never reported as unique.
	                "is_unique": bool(series.nunique() == len(series)),
	                # NOTE: the whole column is emitted; no slicing is applied.
	                "values": series.tolist(),
	            }
	        )
	    return {"columns": columns}

	import contextlib
	import gc
	import torch
	from vllm.distributed import destroy_distributed_environment, destroy_model_parallel
	from vllm.utils import is_cpu


	def cleanup():
	    """Tear down distributed state and release cached memory.

	    Calls ``destroy_model_parallel`` and ``destroy_distributed_environment``,
	    then tries to destroy the torch process group, runs the garbage
	    collector and, unless ``is_cpu()`` reports a CPU build, empties the CUDA
	    cache.
	    """
	    destroy_model_parallel()
	    destroy_distributed_environment()
	    try:
	        torch.distributed.destroy_process_group()
	    except AssertionError:
	        # Best effort: presumably raised when no process group is
	        # initialised; there is nothing to destroy in that case.
	        pass
	    gc.collect()
	    if is_cpu():
	        return
	    torch.cuda.empty_cache()

	def inference_with_encoder(args, format_msg_datas):
	    """Run batched chat inference and return the generated texts.

	    Args:
	        args: Object exposing ``model_path``, ``max_model_len``,
	            ``temperature`` and ``max_new_tokens``.
	        format_msg_datas: Chat messages, passed unchanged to ``LLM.chat``.

	    Returns:
	        A list holding the text of the first completion of every output
	        returned by ``LLM.chat``.
	    """
	    print("Load model...")
	    model = LLM(
	        model=args.model_path,
	        max_model_len=args.max_model_len,
	        gpu_memory_utilization=0.8,
	        max_num_seqs=20,
	        limit_mm_per_prompt={"table": 10},
	        # Alternative precision: dtype="half"
	        dtype="bfloat16",
	    )

	    try:
	        sparams = SamplingParams(
	            temperature=args.temperature, max_tokens=args.max_new_tokens
	        )
	        # Debugging tip: to inspect one rendered prompt, call model.chat on
	        # the messages and print res[0].prompt and res[0].outputs[0].text.
	        # The conversations can also be run one by one in a loop instead
	        # of in a single batch.

	        # Batch run: all conversations are submitted in one call.
	        model_outputs = model.chat(
	            messages=format_msg_datas, sampling_params=sparams
	        )
	        return [mot.outputs[0].text for mot in model_outputs]
	    finally:
	        # Drop the engine and tear down distributed state even when
	        # generation fails, so an error does not leave resources allocated.
	        del model
	        cleanup()

	def truncate(value, max_length=80):
	    """Shorten long strings and pass every other value through untouched.

	    Args:
	        value: Any object; only ``str`` instances are ever shortened.
	        max_length: Number of leading characters kept from a long string.

	    Returns:
	        ``value`` itself when it is not a string or has at most
	        ``max_length`` characters; otherwise its first ``max_length``
	        characters followed by ``"..."``.
	    """
	    if isinstance(value, str) and len(value) > max_length:
	        return value[:max_length] + "..."
	    return value

	def format_encoder_tables(df_names, table_paths):
	    """Load each CSV and build its table payload and text header.

	    Every file is read (first 500 rows, UTF-8), column names are stripped
	    of surrounding whitespace, fully empty rows and columns are dropped,
	    and only the first 50 columns are kept.

	    Args:
	        df_names: Table names, indexed in parallel with ``table_paths``;
	            one name is needed for every path.
	        table_paths: Paths of the CSV files to load.

	    Returns:
	        ``(tables_list, tables_info)``. ``tables_list`` holds one
	        ``{"type": "table", "table": ...}`` dict per file, where the table
	        is the result of ``extract_contrastive_table``. ``tables_info``
	        holds the matching header string for each file.
	    """
	    # Cap on the number of leading columns kept; adjust as needed.
	    max_columns = 50
	    tables_list = []
	    tables_info = []
	    for idx, table_path in enumerate(table_paths):
	        df_name = df_names[idx]
	        df = pd.read_csv(table_path, encoding="utf-8", nrows=500)
	        df.columns = df.columns.str.strip()
	        df = df.dropna(how="all").dropna(axis=1, how="all")
	        # Truncate tables that exceed the column limit.
	        if len(df.columns) > max_columns:
	            df = df.iloc[:, :max_columns]

	        # The description is built fresh for this call and the header is an
	        # immutable string, so neither needs a defensive deep copy.
	        table_description = extract_contrastive_table(df)
	        tables_info.append(
	            f"Details about the '{df_name}' other info as follows:\n<TABLE_CONTENT>\n"
	        )
	        tables_list.append({"type": "table", "table": table_description})

	    return tables_list, tables_info

	def build_encoder_table_part_content(df_names, table_paths):
	    """Build the message content parts that describe each CSV table.

	    For every path, three parts are appended in order: a text part that
	    opens a ``/* ... `` comment naming the table, a table part produced by
	    ``extract_contrastive_table``, and a text part closing the comment
	    with ``*/``.

	    Args:
	        df_names: Table names, indexed in parallel with ``table_paths``;
	            one name is needed for every path.
	        table_paths: Paths of the CSV files to load.

	    Returns:
	        A flat list of ``{"type": ..., ...}`` content dicts, three per
	        table.
	    """
	    # Cap on the number of leading columns kept; adjust as needed.
	    max_columns = 50
	    content_msg = []
	    for idx, table_path in enumerate(table_paths):
	        content_msg.append(
	            {
	                "type": "text",
	                "text": f"/*\nDetails about the '{df_names[idx]}' other info as follows:\n",
	            }
	        )

	        # Read the table (first 500 rows) and clean it up.
	        df = pd.read_csv(table_path, encoding="utf-8", nrows=500)
	        df.columns = df.columns.str.strip()
	        df = df.dropna(how="all").dropna(axis=1, how="all")
	        # Truncate tables that exceed the column limit.
	        if len(df.columns) > max_columns:
	            df = df.iloc[:, :max_columns]

	        # extract_contrastive_table only reads from the frame, so the frame
	        # is passed directly instead of deep-copying it first.
	        content_msg.append(
	            {
	                "type": "table",
	                "table": extract_contrastive_table(df),
	            }
	        )
	        content_msg.append(
	            {
	                "type": "text",
	                "text": "*/",
	            }
	        )

	    return content_msg

	def read_df_head(table_path, head_num=3, format_type="string"):
	    """Load a CSV and render its first rows as text.

	    The file is read (first 500 rows, UTF-8), column names are stripped of
	    surrounding whitespace, fully empty rows and columns are dropped, and
	    only the first 50 columns are kept. String cells of the preview are
	    shortened with ``truncate(..., 80)``.

	    Args:
	        table_path: Path of the CSV file.
	        head_num: Number of leading rows to render.
	        format_type: ``"md"`` renders with ``DataFrame.to_markdown`` (which
	            needs the optional ``tabulate`` package); ``"string"`` and any
	            other value render with ``DataFrame.to_string``.

	    Returns:
	        ``(preview_text, df)``: the rendered preview and the cleaned,
	        untruncated DataFrame.
	    """
	    # Cap on the number of leading columns kept; adjust as needed.
	    max_columns = 50
	    df = pd.read_csv(table_path, encoding="utf-8", nrows=500)
	    df.columns = df.columns.str.strip()
	    df = df.dropna(how="all").dropna(axis=1, how="all")
	    # Truncate tables that exceed the column limit.
	    if len(df.columns) > max_columns:
	        df = df.iloc[:, :max_columns]

	    # apply/map return new objects, so the returned ``df`` is left untouched
	    # without deep-copying the head first.
	    df_truncated_head = df.head(head_num).apply(
	        lambda column: column.map(lambda cell: truncate(cell, 80))
	    )
	    if format_type == "md":
	        df_truncated_head_str = df_truncated_head.to_markdown(index=False)
	    else:
	        # "string" and every unrecognised format use the plain-text layout.
	        df_truncated_head_str = df_truncated_head.to_string()
	    return df_truncated_head_str, df

	# build_message # def build_single_messages(test_dt)
	# format_inputs # def format_inputs(test_datas)