Spaces:

phyloforfun
/

VoucherVision

Running

VoucherVision / vouchervision /save_dataset.py

July 18 update

c5e57d6 4 months ago

1.29 kB

	# Export every split of the VoucherVision SLTPvC dataset to its own JSONL file.
	import os

	from datasets import load_dataset

	# Load the dataset
	dataset = load_dataset("phyloforfun/HLT_MICH_Angiospermae_SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05")

	# Define the directory where you want to save the files
	save_dir = "D:/Dropbox/VoucherVision/datasets/SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05"

	# Create the target directory up front so writing the first split does not
	# fail when the folder is missing; exist_ok makes re-runs harmless.
	os.makedirs(save_dir, exist_ok=True)

	# Save each split as a JSONL file in the specified directory
	for split, split_dataset in dataset.items():
		# The split name is embedded in the file name, so each split gets its own file.
		split_dataset.to_json(f"{save_dir}/SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05-{split}.jsonl")


	# NOTE: everything between the triple quotes below is an inert string literal,
	# not live code -- it is never executed. It preserves a sketch for rewriting the
	# exported "train" JSONL so that each record becomes
	# {"input_text": instruction + ' ' + input, "target_text": output}.
	# The nesting indentation inside the sketch is flat as stored here, so it would
	# need re-indenting before it could be run.
	'''import json # convert to google

	# Load the JSONL file
	input_file_path = '/mnt/data/SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05-train.jsonl'
	output_file_path = '/mnt/data/SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05-train-converted.jsonl'

	# Define the conversion function
	def convert_record(record):
	return {
	"input_text": record.get('instruction', '') + ' ' + record.get('input', ''),
	"target_text": record.get('output', '')
	}

	# Convert and save the new JSONL file
	with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
	for line in infile:
	record = json.loads(line)
	converted_record = convert_record(record)
	outfile.write(json.dumps(converted_record) + '\n')

	output_file_path'''