import gc
from functools import partial

from datasets import load_dataset
from litdata import optimize, TokensLoader
from litgpt.tokenizer import Tokenizer


def batch_iterator(name=None):
    # Yield ChatML-formatted (<|im_start|>role ... <|im_end|>) samples from one
    # dataset, or from all of them when name is None.
    if name in (None, 'Replete-AI/Everything_Instruct_Multilingual'):
        dataset = load_dataset('Replete-AI/Everything_Instruct_Multilingual', split='train')

        for row in dataset:
            text = []

            if row['instruction']:
                text.append(
                    '<|im_start|>system\n'
                    f"{row['instruction']}<|im_end|>"
                )

            if row['input']:
                text.append(
                    '<|im_start|>user\n'
                    f"{row['input']}<|im_end|>"
                )

            if row['output']:
                text.append(
                    '<|im_start|>assistant\n'
                    f"{row['output']}<|im_end|>"
                )

            text = '\n'.join(text) + '\n'
            yield text

        del dataset
        gc.collect()

    if name in (None, 'HuggingFaceH4/ultrachat_200k'):
        dataset = load_dataset('HuggingFaceH4/ultrachat_200k', split='train_sft')

        for row in dataset:
            text = [
                f"<|im_start|>{n['role']}\n{n['content']}<|im_end|>"
                for n in row['messages']
            ]
            text = '\n'.join(text) + '\n'
            yield text

        del dataset
        gc.collect()

    if name in (None, 'HuggingFaceH4/no_robots'):
        dataset = load_dataset('HuggingFaceH4/no_robots', split='train')

        for row in dataset:
            text = [
                f"<|im_start|>{n['role']}\n{n['content']}<|im_end|>"
                for n in row['messages']
            ]
            text = '\n'.join(text) + '\n'
            yield text

        del dataset
        gc.collect()

    if name in (None, 'datatab/ultrachat_200k_serbian'):
        dataset = load_dataset('datatab/ultrachat_200k_serbian', split='train')

        for row in dataset:
            text = [
                f"<|im_start|>{n['role']}\n{n['content']}<|im_end|>"
                for n in row['messages_srb']
            ]
            text = '\n'.join(text) + '\n'
            yield text

        del dataset
        gc.collect()

    if name in (None, 'datatab/ultrafeedback_binarized_serbian'):
        dataset = load_dataset('datatab/ultrafeedback_binarized_serbian', split='train_sft')

        for row in dataset:
            # Only the "chosen" side of each preference pair is used.
            text = [
                f"<|im_start|>{n['role']}\n{n['content']}<|im_end|>"
                for n in row['chosen']
            ]
            text = '\n'.join(text) + '\n'
            yield text

        del dataset
        gc.collect()

    if name in (None, 'datatab/alpaca-cleaned-serbian-full'):
        dataset = load_dataset('datatab/alpaca-cleaned-serbian-full', split='train')

        for row in dataset:
            text = []

            if row['instruction']:
                text.append(
                    '<|im_start|>system\n'
                    f"{row['instruction']}<|im_end|>"
                )

            if row['input']:
                text.append(
                    '<|im_start|>user\n'
                    f"{row['input']}<|im_end|>"
                )

            if row['output']:
                text.append(
                    '<|im_start|>assistant\n'
                    f"{row['output']}<|im_end|>"
                )

            text = '\n'.join(text) + '\n'
            yield text

        del dataset
        gc.collect()

    if name in (None, 'datatab/orca_math_world_problem_200k_serbian'):
        dataset = load_dataset('datatab/orca_math_world_problem_200k_serbian', split='train')

        for row in dataset:
            text = []
            text.append(
                '<|im_start|>user\n'
                f"{row['question_translated_srb']}<|im_end|>"
            )
            text.append(
                '<|im_start|>assistant\n'
                f"{row['answer_translated_srb']}<|im_end|>"
            )
            text = '\n'.join(text) + '\n'
            yield text

        del dataset
        gc.collect()

    if name in (None, 'datatab/open-orca-slim-serbian'):
        dataset = load_dataset('datatab/open-orca-slim-serbian', split='train')
        # Map ShareGPT-style speaker tags onto ChatML roles.
        role_map = {'system': 'system', 'human': 'user', 'gpt': 'assistant'}

        for row in dataset['conversations']:
            text = [
                f"<|im_start|>{role_map[n['from']]}\n{n['value']}<|im_end|>"
                for n in row
                if n
            ]
            text = '\n'.join(text) + '\n'
            yield text

        del dataset
        gc.collect()


def tokenize_fn(dataset_name, tokenizer=None):
    # Tokenize each rendered conversation; eos=True appends the end-of-sequence
    # token so samples stay separable after packing.
    for text in batch_iterator(dataset_name):
        text_ids = tokenizer.encode(text, bos=False, eos=True)
        yield text_ids


datasets_names = [
    'Replete-AI/Everything_Instruct_Multilingual',
    'HuggingFaceH4/ultrachat_200k',
    'HuggingFaceH4/no_robots',
    'datatab/ultrachat_200k_serbian',
    'datatab/ultrafeedback_binarized_serbian',
    'datatab/alpaca-cleaned-serbian-full',
    'datatab/orca_math_world_problem_200k_serbian',
    'datatab/open-orca-slim-serbian',
]

outputs = optimize(
    fn=partial(tokenize_fn, tokenizer=Tokenizer('..')),
    inputs=datasets_names,
    output_dir='../data/',
    # Number of tokens to store per chunk: (32768 + 1) * 500 ~= 16.4M tokens,
    # which is roughly 64MB of tokens per chunk.
    chunk_size=((32768 + 1) * 500),
    num_workers=16,
)
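
# Optional sanity check -- a minimal sketch, not part of the preparation step
# above: stream a couple of items back from the optimized output to confirm the
# chunks are readable. It assumes litdata's StreamingDataset reader and the
# '../data/' path used above. If the data is optimized with
# item_loader=TokensLoader(), read it back with
# item_loader=TokensLoader(block_size=32768 + 1) instead, so items come back as
# fixed-size token blocks; check the installed litdata version's docs.
from litdata import StreamingDataset

check_dataset = StreamingDataset(input_dir='../data/')
print(f'{len(check_dataset)=}')
print(f'{check_dataset[0][:32]=}')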