tangledgroup
/

tangled-llama-t-128k-base-v0.1

@@ -1,38 +1,40 @@
 """
 # https://huggingface.co/datasets/Tongjilibo/self_cognition
 https://huggingface.co/datasets/HuggingFaceH4/no_robots
 https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
 https://huggingface.co/datasets/NousResearch/hermes-function-calling-v1
-https://huggingface.co/datasets/Locutusque/function-calling-chatml
-https://huggingface.co/datasets/cognitivecomputations/SystemChat-2.0
 https://huggingface.co/datasets/teknium/OpenHermes-2.5
-https://huggingface.co/datasets/cognitivecomputations/open-instruct-uncensored
-https://huggingface.co/datasets/WizardLMTeam/WizardLM_evol_instruct_V2_196k
-https://huggingface.co/datasets/HuggingFaceH4/deita-10k-v0-sft?row=0
 https://huggingface.co/datasets/Open-Orca/slimorca-deduped-cleaned-corrected
-https://huggingface.co/datasets/Undi95/andrijdavid_roleplay-conversation-sharegpt
-https://huggingface.co/datasets/roleplay4fun/CoupleRP
 https://huggingface.co/datasets/arcee-ai/EvolKit-20k
-https://huggingface.co/datasets/arcee-ai/The-Tome
 https://huggingface.co/datasets/arcee-ai/agent-data
-https://huggingface.co/datasets/arcee-ai/reasoning-sharegpt
-https://huggingface.co/datasets/arcee-ai/infini-instruct-top-500k
-https://huggingface.co/datasets/arcee-ai/BAAI-Infinity-Instruct-System
-# https://huggingface.co/datasets/arcee-ai/financial-instructions-cleaned-2
 https://huggingface.co/datasets/KingNish/reasoning-base-20k
 https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-150K
-https://huggingface.co/datasets/ai2-adapt-dev/openmath-2-math
 https://huggingface.co/datasets/thesven/gsm8k-reasoning
-https://huggingface.co/datasets/codeparrot/self-instruct-starcoder
-https://huggingface.co/datasets/gair-prox/RedPajama-pro
-https://huggingface.co/datasets/codecomplete/base_dataset
-https://huggingface.co/datasets/SivilTaram/starcoder2-documentation
 """

 """
 # https://huggingface.co/datasets/Tongjilibo/self_cognition
+https://huggingface.co/datasets/arcee-ai/The-Tome
+# https://huggingface.co/datasets/Locutusque/function-calling-chatml
+# https://huggingface.co/datasets/cognitivecomputations/SystemChat-2.0
+# https://huggingface.co/datasets/cognitivecomputations/open-instruct-uncensored
+# https://huggingface.co/datasets/arcee-ai/reasoning-sharegpt
+# https://huggingface.co/datasets/arcee-ai/infini-instruct-top-500k
+# https://huggingface.co/datasets/arcee-ai/BAAI-Infinity-Instruct-System
+# https://huggingface.co/datasets/arcee-ai/financial-instructions-cleaned-2
 https://huggingface.co/datasets/HuggingFaceH4/no_robots
 https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
+https://huggingface.co/datasets/HuggingFaceH4/deita-10k-v0-sft
 https://huggingface.co/datasets/NousResearch/hermes-function-calling-v1
 https://huggingface.co/datasets/teknium/OpenHermes-2.5
 https://huggingface.co/datasets/Open-Orca/slimorca-deduped-cleaned-corrected
+https://huggingface.co/datasets/allenai/ultrafeedback_binarized_cleaned
 https://huggingface.co/datasets/arcee-ai/EvolKit-20k
+https://huggingface.co/datasets/ise-uiuc/Magicoder-Evol-Instruct-110K
+https://huggingface.co/datasets/WizardLMTeam/WizardLM_evol_instruct_V2_196k
 https://huggingface.co/datasets/arcee-ai/agent-data
+https://huggingface.co/datasets/ai2-adapt-dev/olmoe-commercial
+https://huggingface.co/datasets/ai2-adapt-dev/openmath-2-math
 https://huggingface.co/datasets/KingNish/reasoning-base-20k
 https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-150K
 https://huggingface.co/datasets/thesven/gsm8k-reasoning
+"""
+# Non-conversational datasets
+"""
+# https://huggingface.co/datasets/gair-prox/RedPajama-pro
+# https://huggingface.co/datasets/codecomplete/base_dataset
+# https://huggingface.co/datasets/SivilTaram/starcoder2-documentation
 """

scripts/pretrain-model.yaml CHANGED Viewed

@@ -1,3 +1,5 @@
 # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
 # ``model_config``. (type: Optional[str], default: null)
 model_name: "Llama-3.2-1B"

+# Reference model config (Llama-3.2-1B-Instruct): https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/main/config.json
 # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
 # ``model_config``. (type: Optional[str], default: null)
 model_name: "Llama-3.2-1B"