Marko Tasic committed
Commit 72dc319
1 Parent(s): 60e17da

pretrain final

out/pretrain/final/config.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "eos_token_id": [
+     2,
+     5,
+     6
+   ],
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_size": 256,
+   "initializer_range": 0.02,
+   "intermediate_size": 1024,
+   "max_position_embeddings": 131072,
+   "mlp_bias": false,
+   "model_type": "llama",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 8,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": {
+     "factor": 32.0,
+     "high_freq_factor": 4.0,
+     "low_freq_factor": 1.0,
+     "original_max_position_embeddings": 8192,
+     "rope_type": "llama3"
+   },
+   "rope_theta": 500000.0,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.45.0.dev0",
+   "use_cache": true,
+   "vocab_size": 262144
+ }
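A minimal usage sketch (assuming the transformers library is installed and this repository is checked out locally so the path above resolves): it loads the committed config.json and builds a randomly initialized LlamaForCausalLM of the same shape, which is one way to sanity-check the architecture before pulling the LFS weights.

# Illustrative sketch only: instantiate an untrained model from the committed config.
from transformers import LlamaConfig, LlamaForCausalLM

config = LlamaConfig.from_json_file("out/pretrain/final/config.json")
model = LlamaForCausalLM(config)

# Rough parameter count implied by the config above.
print(sum(p.numel() for p in model.parameters()))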
out/pretrain/final/hyperparameters.yaml ADDED
@@ -0,0 +1,74 @@
+ model_name: Llama-3.1-8B
+ model_config:
+   name: ''
+   hf_config: {}
+   scale_embeddings: false
+   block_size: 8192
+   vocab_size: 32768
+   padding_multiple: 512
+   padded_vocab_size: 32768
+   n_layer: 32
+   n_head: 16
+   head_size: 64
+   n_embd: 768
+   rotary_percentage: 1.0
+   parallel_residual: false
+   bias: false
+   lm_head_bias: false
+   n_query_groups: 4
+   shared_attention_norm: false
+   norm_class_name: RMSNorm
+   post_attention_norm: false
+   post_mlp_norm: false
+   norm_eps: 1.0e-05
+   mlp_class_name: LLaMAMLP
+   gelu_approximate: none
+   intermediate_size: 2048
+   rope_condense_ratio: 1
+   rope_base: 5000000
+   rope_adjustments:
+     factor: 32.0
+     low_freq_factor: 1.0
+     high_freq_factor: 4.0
+     original_max_seq_len: 8192
+   n_expert: 0
+   n_expert_per_token: 0
+ out_dir: ../out/pretrain
+ precision: bf16-true
+ resume: auto
+ data:
+   class_path: litgpt.data.LitData
+   init_args:
+     data_path: ../pretrain-data/
+     seed: 42
+     num_workers: 32
+ train:
+   save_interval: 500
+   log_interval: 1
+   global_batch_size: 512
+   micro_batch_size: 32
+   lr_warmup_steps: 0
+   max_tokens: 14401885184
+   max_seq_length: 512
+   tie_embeddings: true
+   max_norm: 1.0
+   min_lr: 4.0e-05
+ eval:
+   interval: 100
+   max_iters: 100
+   initial_validation: false
+   final_validation: true
+   evaluate_example: first
+ optimizer:
+   class_path: grokadamw.GrokAdamW
+   init_args:
+     lr: 0.001
+     weight_decay: 0.01
+     betas:
+     - 0.9
+     - 0.999
+ devices: auto
+ num_nodes: 1
+ tokenizer_dir: ..
+ logger_name: wandb
+ seed: 23
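A small sketch (PyYAML assumed; the path is the file added above) that reads hyperparameters.yaml and derives two values implied by the train section: the gradient-accumulation factor and the approximate number of optimizer steps for the stated token budget. With global_batch_size 512 and max_seq_length 512, each optimizer step covers 262,144 tokens, so the 14,401,885,184-token budget works out to roughly 55k steps.

# Illustrative only: inspect the committed hyperparameters and derive
# a couple of quantities implied by the train section.
import yaml

with open("out/pretrain/final/hyperparameters.yaml") as f:
    hp = yaml.safe_load(f)

train = hp["train"]
# Assuming a single device; with N devices the accumulation factor is divided by N.
accumulation = train["global_batch_size"] // train["micro_batch_size"]
tokens_per_step = train["global_batch_size"] * train["max_seq_length"]
steps = train["max_tokens"] / tokens_per_step

print(f"grad accumulation (1 device): {accumulation}")
print(f"approx optimizer steps: {steps:,.0f}")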
out/pretrain/final/lit_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b4e5f236da8f656c0f010bcaae21d52575ca739d6e8127fba66c6df6575fb5cd
+ size 1913328738
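lit_model.pth is committed as a Git LFS pointer, so the ~1.9 GB weight blob itself is not in this diff. A hedged sketch of inspecting the checkpoint after the LFS object has been fetched (for example with `git lfs pull`); the internal layout of the saved dict is not visible from this commit, so the code only enumerates what it finds.

# Illustrative: after `git lfs pull`, peek at the checkpoint contents.
# Whether the file holds a bare state dict or a wrapped dict is an
# assumption left open here; we only report entry counts and sizes.
import torch

ckpt = torch.load("out/pretrain/final/lit_model.pth", map_location="cpu")
state = ckpt.get("model", ckpt) if isinstance(ckpt, dict) else ckpt

total = sum(t.numel() for t in state.values() if hasattr(t, "numel"))
print(f"{len(state)} entries, {total:,} tensor elements")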
out/pretrain/final/model_config.yaml ADDED
@@ -0,0 +1,38 @@
+ attention_logit_softcapping: null
+ attention_scores_scalar: null
+ bias: false
+ block_size: 8192
+ final_logit_softcapping: null
+ gelu_approximate: none
+ head_size: 64
+ hf_config: {}
+ intermediate_size: 2048
+ lm_head_bias: false
+ mlp_class_name: LLaMAMLP
+ n_embd: 768
+ n_expert: 0
+ n_expert_per_token: 0
+ n_head: 16
+ n_layer: 32
+ n_query_groups: 4
+ name: ''
+ norm_class_name: RMSNorm
+ norm_eps: 1.0e-05
+ padded_vocab_size: 32768
+ padding_multiple: 512
+ parallel_residual: false
+ post_attention_norm: false
+ post_mlp_norm: false
+ rope_adjustments:
+   factor: 32.0
+   high_freq_factor: 4.0
+   low_freq_factor: 1.0
+   original_max_seq_len: 8192
+ rope_base: 5000000
+ rope_condense_ratio: 1
+ rotary_percentage: 1.0
+ scale_embeddings: false
+ shared_attention_norm: false
+ sliding_window_layer_placing: null
+ sliding_window_size: null
+ vocab_size: 32768
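model_config.yaml is the litgpt-side model definition that pairs with the checkpoint. A sketch, assuming litgpt is installed and that its Config.from_file helper accepts this YAML (it is the format litgpt writes next to its own checkpoints), of rebuilding the model skeleton; the weights would still have to be loaded separately from lit_model.pth.

# Illustrative sketch: rebuild the litgpt model definition from the
# committed model_config.yaml (no weights are loaded here).
from litgpt import GPT, Config

config = Config.from_file("out/pretrain/final/model_config.yaml")
model = GPT(config)
print(config.n_layer, config.n_head, config.n_embd, config.padded_vocab_size)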
out/pretrain/final/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de12352815a18902973a43e535c747ddad999c72675e43002972892f1903cf75
+ size 2113290
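Like the weights, tokenizer.json is stored as an LFS pointer (about 2.1 MB once fetched). A brief sketch, assuming the Hugging Face tokenizers package, of loading it directly and reporting its vocabulary size.

# Illustrative: load the fetched tokenizer.json and inspect it.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("out/pretrain/final/tokenizer.json")
print(tok.get_vocab_size())
print(tok.encode("pretrain final").ids)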
out/pretrain/final/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff