mtasic85 committed on
Commit
2bfc9d7
1 Parent(s): 473911e

contrain dataset

Browse files
scripts/prepare_contrain_dataset.py CHANGED
@@ -1,38 +1,40 @@
1
  """
2
  # https://huggingface.co/datasets/Tongjilibo/self_cognition
3
 
 
 
 
 
 
 
 
 
 
4
  https://huggingface.co/datasets/HuggingFaceH4/no_robots
5
  https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
 
6
  https://huggingface.co/datasets/NousResearch/hermes-function-calling-v1
7
- https://huggingface.co/datasets/Locutusque/function-calling-chatml
8
- https://huggingface.co/datasets/cognitivecomputations/SystemChat-2.0
9
-
10
  https://huggingface.co/datasets/teknium/OpenHermes-2.5
11
-
12
- https://huggingface.co/datasets/cognitivecomputations/open-instruct-uncensored
13
-
14
- https://huggingface.co/datasets/WizardLMTeam/WizardLM_evol_instruct_V2_196k
15
- https://huggingface.co/datasets/HuggingFaceH4/deita-10k-v0-sft?row=0
16
  https://huggingface.co/datasets/Open-Orca/slimorca-deduped-cleaned-corrected
17
- https://huggingface.co/datasets/Undi95/andrijdavid_roleplay-conversation-sharegpt
18
- https://huggingface.co/datasets/roleplay4fun/CoupleRP
19
  https://huggingface.co/datasets/arcee-ai/EvolKit-20k
20
- https://huggingface.co/datasets/arcee-ai/The-Tome
 
21
  https://huggingface.co/datasets/arcee-ai/agent-data
22
- https://huggingface.co/datasets/arcee-ai/reasoning-sharegpt
23
- https://huggingface.co/datasets/arcee-ai/infini-instruct-top-500k
24
- https://huggingface.co/datasets/arcee-ai/BAAI-Infinity-Instruct-System
25
- # https://huggingface.co/datasets/arcee-ai/financial-instructions-cleaned-2
26
 
27
  https://huggingface.co/datasets/KingNish/reasoning-base-20k
28
  https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-150K
29
- https://huggingface.co/datasets/ai2-adapt-dev/openmath-2-math
30
  https://huggingface.co/datasets/thesven/gsm8k-reasoning
 
31
 
32
- https://huggingface.co/datasets/codeparrot/self-instruct-starcoder
33
-
34
- https://huggingface.co/datasets/gair-prox/RedPajama-pro
35
 
36
- https://huggingface.co/datasets/codecomplete/base_dataset
37
- https://huggingface.co/datasets/SivilTaram/starcoder2-documentation
 
 
38
  """
 
1
  """
2
  # https://huggingface.co/datasets/Tongjilibo/self_cognition
3
 
4
+ https://huggingface.co/datasets/arcee-ai/The-Tome
5
+ # https://huggingface.co/datasets/Locutusque/function-calling-chatml
6
+ # https://huggingface.co/datasets/cognitivecomputations/SystemChat-2.0
7
+ # https://huggingface.co/datasets/cognitivecomputations/open-instruct-uncensored
8
+ # https://huggingface.co/datasets/arcee-ai/reasoning-sharegpt
9
+ # https://huggingface.co/datasets/arcee-ai/infini-instruct-top-500k
10
+ # https://huggingface.co/datasets/arcee-ai/BAAI-Infinity-Instruct-System
11
+ # https://huggingface.co/datasets/arcee-ai/financial-instructions-cleaned-2
12
+
13
  https://huggingface.co/datasets/HuggingFaceH4/no_robots
14
  https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
15
+ https://huggingface.co/datasets/HuggingFaceH4/deita-10k-v0-sft
16
  https://huggingface.co/datasets/NousResearch/hermes-function-calling-v1
 
 
 
17
  https://huggingface.co/datasets/teknium/OpenHermes-2.5
 
 
 
 
 
18
  https://huggingface.co/datasets/Open-Orca/slimorca-deduped-cleaned-corrected
19
+ https://huggingface.co/datasets/allenai/ultrafeedback_binarized_cleaned
20
+
21
  https://huggingface.co/datasets/arcee-ai/EvolKit-20k
22
+ https://huggingface.co/datasets/ise-uiuc/Magicoder-Evol-Instruct-110K
23
+ https://huggingface.co/datasets/WizardLMTeam/WizardLM_evol_instruct_V2_196k
24
  https://huggingface.co/datasets/arcee-ai/agent-data
25
+ https://huggingface.co/datasets/ai2-adapt-dev/olmoe-commercial
26
+
27
+ https://huggingface.co/datasets/ai2-adapt-dev/openmath-2-math
 
28
 
29
  https://huggingface.co/datasets/KingNish/reasoning-base-20k
30
  https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-150K
 
31
  https://huggingface.co/datasets/thesven/gsm8k-reasoning
32
+ """
33
 
34
+ # Non-conversation
 
 
35
 
36
+ """
37
+ # https://huggingface.co/datasets/gair-prox/RedPajama-pro
38
+ # https://huggingface.co/datasets/codecomplete/base_dataset
39
+ # https://huggingface.co/datasets/SivilTaram/starcoder2-documentation
40
  """
scripts/pretrain-model.yaml CHANGED
@@ -1,3 +1,5 @@
 
 
1
  # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
2
  # ``model_config``. (type: Optional[str], default: null)
3
  model_name: "Llama-3.2-1B"
 
1
+ # https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/main/config.json
2
+
3
  # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
4
  # ``model_config``. (type: Optional[str], default: null)
5
  model_name: "Llama-3.2-1B"