tangled-llama-t-128k-base-v0.1 / scripts /prepare_contrain_dataset.py

cognition

96b0f63 6 days ago

No virus

1.38 kB

	"""
	# cognition
	# https://huggingface.co/datasets/Tongjilibo/self_cognition

	# instruction
	https://huggingface.co/datasets/arcee-ai/The-Tome
	https://huggingface.co/datasets/teknium/OpenHermes-2.5

	# tool/function calling
	https://huggingface.co/datasets/NousResearch/hermes-function-calling-v1

	# math
	https://huggingface.co/datasets/ai2-adapt-dev/openmath-2-math

	# agent
	https://huggingface.co/datasets/arcee-ai/agent-data

	# role-play
	# TODO: no dataset listed yet

	# reflection
	# TODO: no dataset listed yet

	# reasoning
	https://huggingface.co/datasets/KingNish/reasoning-base-20k
	https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-150K
	https://huggingface.co/datasets/thesven/gsm8k-reasoning
	"""

	"""
	# sft (supervised fine-tuning)
	https://huggingface.co/datasets/HuggingFaceH4/no_robots
	https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
	https://huggingface.co/datasets/HuggingFaceH4/deita-10k-v0-sft
	https://huggingface.co/datasets/Open-Orca/slimorca-deduped-cleaned-corrected
	https://huggingface.co/datasets/arcee-ai/EvolKit-20k
	https://huggingface.co/datasets/ise-uiuc/Magicoder-Evol-Instruct-110K
	https://huggingface.co/datasets/WizardLMTeam/WizardLM_evol_instruct_V2_196k
	https://huggingface.co/datasets/ai2-adapt-dev/olmoe-commercial

	# dpo (direct preference optimization)
	https://huggingface.co/datasets/allenai/ultrafeedback_binarized_cleaned
	https://huggingface.co/datasets/kyujinpy/orca_math_dpo
	https://huggingface.co/datasets/argilla/OpenHermesPreferences
	"""