File size: 4,836 Bytes
992b12b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46f1dc6
992b12b
 
 
 
 
e7120a7
 
992b12b
 
 
 
 
 
 
 
 
 
 
 
1186a4d
992b12b
 
 
 
 
b157bef
 
992b12b
 
cfcc825
54746ee
992b12b
 
 
 
 
 
 
 
 
 
24ca5e3
992b12b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24ca5e3
992b12b
24ca5e3
992b12b
 
 
68ac89b
992b12b
 
fd45849
992b12b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
# ``model_config``. (type: Optional[str], default: null)
# NOTE(review): ``model_name`` and ``model_config`` are both set in this file although they are
# documented as mutually exclusive — confirm which one the installed litgpt version honors.
model_name: "tiny-llama-1.1b"

# A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
# ``model_name``. (type: Optional[Config], default: null)
model_config:
  padded_vocab_size: 32768
  vocab_size: 32768
  # Same value as ``train.max_seq_length`` in this file.
  block_size: 32768
  n_layer: 10
  n_head: 12
  head_size: null
  n_embd: 312
  n_query_groups: 4
  rotary_percentage: 1.0
  parallel_residual: false
  bias: false
  norm_class_name: "RMSNorm"
  # Written with a decimal point so that YAML 1.1 loaders (e.g. PyYAML) resolve it as a float;
  # a bare ``1e-05`` is resolved as a string by those loaders.
  norm_eps: 1.0e-05
  mlp_class_name: "LLaMAMLP"
  intermediate_size: 1092
  rope_base: 500000

# Where checkpoints and logs are written. When running in a Lightning Studio Job, look for the
# output in /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
out_dir: "../out/contrain/"

# Pretraining precision; one of "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
# precision: bf16-mixed
precision: "bf16-true"

# Checkpoint directory to initialize the model from (optional); useful for continued pretraining.
# Mutually exclusive with ``resume``. (type: Optional[Path], default: null)
initial_checkpoint_dir: "../out/pretrain/pretrained_checkpoint_converted"

# Resuming after an interruption: give the path of a checkpoint directory, or ``True`` to resume
# from the latest checkpoint in ``out_dir`` (an error is raised if no checkpoint is found).
# ``'auto'`` also resumes from the latest checkpoint but does not error if none exists.
# (type: Union[bool, Literal["auto"], Path], default: False)
resume: false
# resume: "auto"

# Data module configuration. When this section is not provided, ``litgpt.data.TinyLlama`` is the default.
data:
  class_path: "LitData"
  init_args:
    data_path: "../data/"
    num_workers: 16

# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
train:
  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
  save_interval: 100

  # Number of iterations between logging calls (type: int, default: 1)
  log_interval: 1

  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
  # global_batch_size: 512
  global_batch_size: 128

  # Number of samples per data-parallel rank (type: int, default: 4)
  micro_batch_size: 1
  # micro_batch_size: 16
  # micro_batch_size: 14

  # Number of iterations with learning rate warmup active (type: int, default: 2000)
  # NOTE(review): if this counts optimizer steps, the token budget below gives only about
  # 4252334823 / (128 * 32768) ~= 1014 steps, i.e. fewer than the warmup length — confirm
  # that a warmup spanning the whole run is intended.
  lr_warmup_steps: 2000

  # Number of epochs to train on (type: Optional[int], default: null)
  epochs: null

  # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
  # max_tokens: 3000000000000
  max_tokens: 4252334823  # 129767 * 32769 * 1

  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
  max_steps: null

  # Limits the length of samples. Off by default (type: Optional[int], default: null)
  max_seq_length: 32768

  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
  tie_embeddings: null

  # NOTE(review): no description given; presumably the gradient-clipping norm — confirm against
  # ``litgpt.args.TrainArgs``. (type: Optional[float], default: 1.0)
  max_norm: 1.0

  # NOTE(review): no description given; presumably the minimum learning rate of the schedule —
  # confirm against ``litgpt.args.TrainArgs``. (type: float, default: 4e-05)
  min_lr: 4.0e-05

# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
eval:
  # Number of optimizer steps between evaluation calls (type: int, default: 1000)
  interval: 100

  # Number of tokens to generate (type: Optional[int], default: null)
  max_new_tokens: null

  # Number of iterations (type: int, default: 100)
  max_iters: 100

  # Whether to evaluate on the validation set at the beginning of the training
  initial_validation: false

  # Whether to evaluate on the validation set at the end of the training
  final_validation: true

# Optimizer-related arguments
optimizer:
  # Active optimizer class; the commented-out lines are alternatives.
  # class_path: torch.optim.AdamW
  class_path: grokadamw.GrokAdamW
  # class_path: bitsandbytes.optim.AdamW8bit
  # class_path: bitsandbytes.optim.PagedAdamW8bit

  init_args:
    # ``lr`` argument of the optimizer (type: float, default: 0.001)
    lr: 1.0e-4

    # ``weight_decay`` argument of the optimizer (type: float, default: 0.01)
    weight_decay: 0.01

    # ``betas`` argument of the optimizer (type: tuple, default: (0.9,0.999))
    betas: [0.9, 0.95]

# Number of devices/GPUs to use; all GPUs are used by default. (type: Union[int, str], default: auto)
devices: "auto"

# Number of nodes to use. (type: int, default: 1)
num_nodes: 1

# Tokenizer directory that was used for preprocessing the dataset (optional path). Only some data
# modules require this. (type: Optional[Path], default: null)
tokenizer_dir: "../"

# Name of the logger that metrics are sent to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard)
logger_name: "wandb"

# Random seed used for reproducibility. (type: int, default: 42)
seed: 42