model_name: tiny-llama-1.1b
model_config:
  name: ''
  hf_config: {}
  scale_embeddings: false
  block_size: 32768
  vocab_size: 32768
  padding_multiple: 512
  padded_vocab_size: 32768
  n_layer: 10
  n_head: 12
  n_embd: 312
  rotary_percentage: 1.0
  parallel_residual: false
  bias: false
  lm_head_bias: false
  n_query_groups: 4
  shared_attention_norm: false
  norm_class_name: RMSNorm
  post_attention_norm: false
  post_mlp_norm: false
  norm_eps: 1.0e-05
  mlp_class_name: LLaMAMLP
  gelu_approximate: none
  intermediate_size: 1092
  rope_condense_ratio: 1
  rope_base: 500000
  n_expert: 0
  n_expert_per_token: 0
out_dir: ../out/pretrain
precision: bf16-true
resume: auto
data:
  class_path: litgpt.data.LitData
  init_args:
    data_path: ../data/
    seed: 42
    num_workers: 16
train:
  save_interval: 1000
  log_interval: 1
  global_batch_size: 512
  micro_batch_size: 16
  lr_warmup_steps: 2000
  max_tokens: 9782206713
  max_seq_length: 2048
  max_norm: 1.0
  min_lr: 4.0e-05
eval:
  interval: 1000
  max_iters: 100
  initial_validation: false
  final_validation: false
optimizer:
  class_path: grokadamw.GrokAdamW
  init_args:
    lr: 5.0e-05
    weight_decay: 0.1
    betas:
      - 0.9
      - 0.95
devices: auto
num_nodes: 1
tokenizer_dir: ..
logger_name: wandb
seed: 42