contrain dataset
Browse files
scripts/prepare_contrain_dataset.py
CHANGED
@@ -1,38 +1,40 @@
|
|
1 |
"""
|
2 |
# https://huggingface.co/datasets/Tongjilibo/self_cognition
|
3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
https://huggingface.co/datasets/HuggingFaceH4/no_robots
|
5 |
https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
|
|
|
6 |
https://huggingface.co/datasets/NousResearch/hermes-function-calling-v1
|
7 |
-
https://huggingface.co/datasets/Locutusque/function-calling-chatml
|
8 |
-
https://huggingface.co/datasets/cognitivecomputations/SystemChat-2.0
|
9 |
-
|
10 |
https://huggingface.co/datasets/teknium/OpenHermes-2.5
|
11 |
-
|
12 |
-
https://huggingface.co/datasets/cognitivecomputations/open-instruct-uncensored
|
13 |
-
|
14 |
-
https://huggingface.co/datasets/WizardLMTeam/WizardLM_evol_instruct_V2_196k
|
15 |
-
https://huggingface.co/datasets/HuggingFaceH4/deita-10k-v0-sft?row=0
|
16 |
https://huggingface.co/datasets/Open-Orca/slimorca-deduped-cleaned-corrected
|
17 |
-
https://huggingface.co/datasets/
|
18 |
-
|
19 |
https://huggingface.co/datasets/arcee-ai/EvolKit-20k
|
20 |
-
https://huggingface.co/datasets/
|
|
|
21 |
https://huggingface.co/datasets/arcee-ai/agent-data
|
22 |
-
https://huggingface.co/datasets/
|
23 |
-
|
24 |
-
https://huggingface.co/datasets/
|
25 |
-
# https://huggingface.co/datasets/arcee-ai/financial-instructions-cleaned-2
|
26 |
|
27 |
https://huggingface.co/datasets/KingNish/reasoning-base-20k
|
28 |
https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-150K
|
29 |
-
https://huggingface.co/datasets/ai2-adapt-dev/openmath-2-math
|
30 |
https://huggingface.co/datasets/thesven/gsm8k-reasoning
|
|
|
31 |
|
32 |
-
|
33 |
-
|
34 |
-
https://huggingface.co/datasets/gair-prox/RedPajama-pro
|
35 |
|
36 |
-
|
37 |
-
https://huggingface.co/datasets/
|
|
|
|
|
38 |
"""
|
|
|
1 |
"""
|
2 |
# https://huggingface.co/datasets/Tongjilibo/self_cognition
|
3 |
|
4 |
+
https://huggingface.co/datasets/arcee-ai/The-Tome
|
5 |
+
# https://huggingface.co/datasets/Locutusque/function-calling-chatml
|
6 |
+
# https://huggingface.co/datasets/cognitivecomputations/SystemChat-2.0
|
7 |
+
# https://huggingface.co/datasets/cognitivecomputations/open-instruct-uncensored
|
8 |
+
# https://huggingface.co/datasets/arcee-ai/reasoning-sharegpt
|
9 |
+
# https://huggingface.co/datasets/arcee-ai/infini-instruct-top-500k
|
10 |
+
# https://huggingface.co/datasets/arcee-ai/BAAI-Infinity-Instruct-System
|
11 |
+
# https://huggingface.co/datasets/arcee-ai/financial-instructions-cleaned-2
|
12 |
+
|
13 |
https://huggingface.co/datasets/HuggingFaceH4/no_robots
|
14 |
https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
|
15 |
+
https://huggingface.co/datasets/HuggingFaceH4/deita-10k-v0-sft
|
16 |
https://huggingface.co/datasets/NousResearch/hermes-function-calling-v1
|
|
|
|
|
|
|
17 |
https://huggingface.co/datasets/teknium/OpenHermes-2.5
|
|
|
|
|
|
|
|
|
|
|
18 |
https://huggingface.co/datasets/Open-Orca/slimorca-deduped-cleaned-corrected
|
19 |
+
https://huggingface.co/datasets/allenai/ultrafeedback_binarized_cleaned
|
20 |
+
|
21 |
https://huggingface.co/datasets/arcee-ai/EvolKit-20k
|
22 |
+
https://huggingface.co/datasets/ise-uiuc/Magicoder-Evol-Instruct-110K
|
23 |
+
https://huggingface.co/datasets/WizardLMTeam/WizardLM_evol_instruct_V2_196k
|
24 |
https://huggingface.co/datasets/arcee-ai/agent-data
|
25 |
+
https://huggingface.co/datasets/ai2-adapt-dev/olmoe-commercial
|
26 |
+
|
27 |
+
https://huggingface.co/datasets/ai2-adapt-dev/openmath-2-math
|
|
|
28 |
|
29 |
https://huggingface.co/datasets/KingNish/reasoning-base-20k
|
30 |
https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-150K
|
|
|
31 |
https://huggingface.co/datasets/thesven/gsm8k-reasoning
|
32 |
+
"""
|
33 |
|
34 |
+
# Non-conversation
|
|
|
|
|
35 |
|
36 |
+
"""
|
37 |
+
# https://huggingface.co/datasets/gair-prox/RedPajama-pro
|
38 |
+
# https://huggingface.co/datasets/codecomplete/base_dataset
|
39 |
+
# https://huggingface.co/datasets/SivilTaram/starcoder2-documentation
|
40 |
"""
|
scripts/pretrain-model.yaml
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
# The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
|
2 |
# ``model_config``. (type: Optional[str], default: null)
|
3 |
model_name: "Llama-3.2-1B"
|
|
|
1 |
+
# https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/main/config.json
|
2 |
+
|
3 |
# The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
|
4 |
# ``model_config``. (type: Optional[str], default: null)
|
5 |
model_name: "Llama-3.2-1B"
|