Commit 951a420 (committed by mtasic85)
Parent: 98473da

pretrain model, extend from 5 to 8 epochs

Files changed (2):
  1. scripts/TRAIN.md +13 -3
  2. scripts/pretrain-model.yaml +7 -4
scripts/TRAIN.md CHANGED
@@ -46,9 +46,19 @@ save_file(state_dict, 'out/converted_model/model.safetensors')
 ## Evaluate
 
 ```bash
-litgpt evaluate --tasks 'leaderboard' --out_dir 'evaluate-0/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+litgpt evaluate --tasks 'hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge' --out_dir 'evaluate-quick/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
 
-litgpt evaluate --tasks 'hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge' --out_dir 'evaluate-1/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+litgpt evaluate --tasks 'leaderboard' --out_dir 'evaluate-leaderboard/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
 
-litgpt evaluate --tasks 'mmlu_pro,ifeval,mgsm_direct,mathqa,gpqa' --out_dir 'evaluate-2/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+litgpt evaluate --tasks 'bbh_zeroshot,bbh_fewshot,bbh_cot_fewshot,bbh_cot_zeroshot' --out_dir 'evaluate-bigbenchhard/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+
+litgpt evaluate --tasks 'mmlu,mmlu_pro' --out_dir 'evaluate-mmlu/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+
+litgpt evaluate --tasks 'arc_challenge,boolq,gpqa,hellaswag,openbookqa,piqa,siqa,truthfulqa_mc2,winogrande' --out_dir 'evaluate-reasoning/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+
+litgpt evaluate --tasks 'mmlu_multilingual,mgsm' --out_dir 'evaluate-multilinguals/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+
+litgpt evaluate --tasks 'gsm8k,mathqa' --out_dir 'evaluate-math/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+
+litgpt evaluate --tasks 'qasper' --out_dir 'evaluate-long/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
 ```
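The commit replaces the three catch-all evaluation runs with themed suites. For reference, the same suites can be driven back-to-back from a single loop; the sketch below is illustrative and not part of the commit: the task lists, out_dir names, and litgpt flags are copied from the commands above, while the loop itself (and its bash 4+ associative array) is an assumption.

```bash
#!/usr/bin/env bash
# Illustrative runner for the evaluation suites above (not part of the commit).
# Task lists and output directories are copied verbatim from TRAIN.md;
# requires bash 4+ for associative arrays. Iteration order is unspecified.
set -euo pipefail

CKPT='out/pretrain/final/'

declare -A SUITES=(
  [quick]='hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge'
  [leaderboard]='leaderboard'
  [bigbenchhard]='bbh_zeroshot,bbh_fewshot,bbh_cot_fewshot,bbh_cot_zeroshot'
  [mmlu]='mmlu,mmlu_pro'
  [reasoning]='arc_challenge,boolq,gpqa,hellaswag,openbookqa,piqa,siqa,truthfulqa_mc2,winogrande'
  [multilinguals]='mmlu_multilingual,mgsm'
  [math]='gsm8k,mathqa'
  [long]='qasper'
)

for name in "${!SUITES[@]}"; do
  litgpt evaluate \
    --tasks "${SUITES[$name]}" \
    --out_dir "evaluate-${name}/" \
    --batch_size 4 \
    --dtype 'bfloat16' \
    "$CKPT"
done
```

A practical upside of the split: each suite writes to its own out_dir, so a failed task group can be rerun in isolation.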
scripts/pretrain-model.yaml CHANGED
@@ -57,7 +57,7 @@ data:
 # Training-related arguments. See ``litgpt.args.TrainArgs`` for details
 train:
   # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
-  save_interval: 500
+  save_interval: 200
 
   # Number of iterations between logging calls (type: int, default: 1)
   log_interval: 1
@@ -77,7 +77,8 @@ train:
 
   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
   # max_tokens: 3000000000000
-  max_tokens: 8159107755 # 796399 * 2049 * 5
+  # max_tokens: 8159107755 # 796399 * 2049 * 5
+  max_tokens: 13054572408 # 796399 * 2049 * 8
 
   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
   max_steps:
@@ -120,10 +121,12 @@ optimizer:
 
   init_args:
     # (type: float, default: 0.001)
-    lr: 1e-3
+    # lr: 1e-3
+    lr: 1e-4
 
     # (type: float, default: 0.01)
-    weight_decay: 0.01
+    # weight_decay: 0.01
+    weight_decay: 0.1
 
     # (type: tuple, default: (0.9,0.999))
     betas:
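The comments in the max_tokens hunk make the budget's provenance explicit: presumably 796,399 training samples at 2,049 tokens each, scaled by the epoch count, so extending from 5 to 8 epochs raises max_tokens from 8,159,107,755 to 13,054,572,408. A quick shell check of the arithmetic, plus a grep (the pattern is illustrative, not part of the commit) to confirm the values that landed in the config:

```bash
# Sanity-check the token budgets quoted in the YAML comments.
echo $((796399 * 2049 * 5))   # 8159107755  -> old 5-epoch budget
echo $((796399 * 2049 * 8))   # 13054572408 -> new 8-epoch budget

# Confirm the values applied by this commit (grep pattern is illustrative).
grep -nE 'save_interval:|max_tokens:|lr:|weight_decay:' scripts/pretrain-model.yaml
```

The same check also surfaces the other changes in this commit: checkpointing tightened from every 500 to every 200 optimizer steps, lr lowered from 1e-3 to 1e-4, and weight_decay raised from 0.01 to 0.1.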