# NOTE: "Spaces: Running" -- page-status residue captured together with this script; not part of the code.
import os

from datasets import load_dataset

# Export script: load the dataset, then write every split it contains to its
# own JSONL file inside save_dir.

# Load the dataset; its .items() are iterated below as (split name, split data) pairs.
dataset = load_dataset("phyloforfun/HLT_MICH_Angiospermae_SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05")

# Define the directory where you want to save the files
save_dir = "D:/Dropbox/VoucherVision/datasets/SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05"

# Create the target directory up front so the export does not depend on the
# folder already existing; exist_ok=True keeps re-runs harmless.
os.makedirs(save_dir, exist_ok=True)

# Save each split as a JSONL file in the specified directory
for split, split_dataset in dataset.items():
    # The split name is embedded in the file name, giving one file per split.
    split_dataset.to_json(f"{save_dir}/SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05-{split}.jsonl")
# Disabled snippet: everything below is wrapped in a bare triple-quoted string,
# so it is never executed. As written, it reads a JSONL file line by line, maps
# each record to {"input_text": instruction + ' ' + input, "target_text": output}
# (missing keys default to ''), and writes one converted JSON object per line
# to a second JSONL file.
# NOTE(review): the /mnt/data paths differ from save_dir used by the export
# above -- adjust them before re-enabling this code.
'''import json  # convert to google

# Load the JSONL file
input_file_path = '/mnt/data/SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05-train.jsonl'
output_file_path = '/mnt/data/SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05-train-converted.jsonl'

# Define the conversion function
def convert_record(record):
    return {
        "input_text": record.get('instruction', '') + ' ' + record.get('input', ''),
        "target_text": record.get('output', '')
    }

# Convert and save the new JSONL file
with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
    for line in infile:
        record = json.loads(line)
        converted_record = convert_record(record)
        outfile.write(json.dumps(converted_record) + '\n')

output_file_path'''