tangled-llama-q-128k-base-v0.1 / scripts /prepare_contrain_dataset.py
mtasic85's picture
pretrain model
4f52be0
raw
history blame
No virus
4.32 kB
from typing import Optional
from functools import partial
from datasets import load_dataset
from litdata import optimize, TokensLoader
from litgpt.tokenizer import Tokenizer
def batch_iterator(path: str,
name: Optional[str]=None,
data_dir: Optional[str]=None,
data_files: Optional[str]=None,
revision: Optional[str]=None,
split: str='train',
format: Optional[str]=None):
assert format is not None
dataset = load_dataset(path=path,
name=name,
data_dir=data_dir,
data_files=data_files,
revision=revision,
split=split,
trust_remote_code=True)
for row in dataset:
text = format.format(**row)
yield text
def tokenize_fn(datasets_config, tokenizer=None):
for text in batch_iterator(**datasets_config):
text_ids = tokenizer.encode(text, bos=False, eos=True)
yield text_ids
roles_map = {
'system': 'system',
'user': 'user',
'human': 'user',
'assistant': 'assistant',
'gpt': 'assistant',
'AI': 'assistant',
}
datasets_configs = [
# cognition
# https://huggingface.co/datasets/Tongjilibo/self_cognition
# instruct
{'path': 'arcee-ai/The-Tome', 'field': 'conversations', 'transform': lambda msgs: [{'role': roles_map[m['from']], 'content': m['value']} for m in msgs]}, # 4.58 GB, 1,752,473
{'path': 'teknium/OpenHermes-2.5', 'field': 'conversations', 'transform': lambda msgs: [{'role': roles_map[m['from']], 'content': m['value']} for m in msgs]}, # 1.94 GB, 1,001,551
# tool/function calling
{'path': 'NousResearch/hermes-function-calling-v1', 'field': 'conversations', 'transform': lambda msgs: [{'role': roles_map[m['from']], 'content': m['value']} for m in msgs]}, # 65.7 MB, 11,578
# math
{'path': 'ai2-adapt-dev/openmath-2-math', 'field': 'messages'}, # 6.07 GB, 11,402,286
# agent
{'path': 'arcee-ai/agent-data', 'field': 'conversations', 'transform': lambda msgs: [{'role': roles_map[m['from']], 'content': m['value']} for m in msgs]}, # 1.51 GB, 485,874
# conversation
{'path': 'AtlasUnified/atlas-converse', 'field': 'conversations', 'transform': lambda msgs: [{'role': roles_map[m['from']], 'content': m['value']} for m in msgs]}, # 3.26 MB + 4.82 MB + 5.31 MB, <10k
{'path': 'PJMixers/hieunguyenminh_roleplay-deduped-ShareGPT', 'field': 'conversations'}, # 3.24 MB, 1,054
{'path': 'TokenBender/roleplay_alpaca', 'transform': lambda r: [{'role': 'user', 'content': r['instruction']}, {'role': 'assistant', 'content': r['output']}]}, # 10.2 MB, 30,530
# code
# https://huggingface.co/datasets/bleugreen/typescript-instruct
# https://huggingface.co/datasets/NuclearAi/Nuke-Python-Verse
# reflection
{'path': 'dvilasuero/reflection-v1-gpt-4o-judge', 'transform': lambda r: [{'role': 'system', 'content': r['system']}, {'role': 'user', 'content': r['prompt']}, {'role': 'assistant', 'content': r['response']}]}, # 4.17 MB, 1,000
{'path': 'dvilasuero/reflection-v1-openai-o-mini-judge', 'transform': lambda r: [{'role': 'system', 'content': r['system']}, {'role': 'user', 'content': r['prompt']}, {'role': 'assistant', 'content': r['response']}]}, # 12.4 MB, 3,000
{'path': 'flozi00/reflection-qwen2.5-72b-260924', 'transform': lambda r: [r['system'][0], {'role': 'user', 'content': r['input']}, {'role': 'assistant', 'content': r['reflection'] + '\n' + r['output']}]}, # 30.6 MB, 25,391
{'path': 'gretelai/synthetic-gsm8k-reflection-405b', 'split': 'train+test', 'transform': lambda r: [{'role': 'user', 'content': r['question']}, {'role': 'assistant', 'content': r['answer_with_tags']}]}, # 26.8 MB, 23,164
# reasoning
{'path': 'KingNish/reasoning-base-20k', 'field': 'conversations'}, # 307 MB, 19,944 - both pretrain and contrain
]
outputs = optimize(
fn=partial(tokenize_fn, tokenizer=Tokenizer('..')),
inputs=datasets_configs,
output_dir='../contrain-data/',
# Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
chunk_size=(2049 * 8012),
num_workers=32,
# compression='zstd',
)