Marko Tasic committed on
Commit
5bd56de
1 Parent(s): b157bef

continued pretraining for 1 epoch

out/contrain/final/hyperparameters.yaml ADDED
@@ -0,0 +1,67 @@
+ model_name: tiny-llama-1.1b
+ model_config:
+   name: ''
+   hf_config: {}
+   scale_embeddings: false
+   block_size: 32768
+   vocab_size: 32768
+   padding_multiple: 512
+   padded_vocab_size: 32768
+   n_layer: 10
+   n_head: 12
+   n_embd: 312
+   rotary_percentage: 1.0
+   parallel_residual: false
+   bias: false
+   lm_head_bias: false
+   n_query_groups: 4
+   shared_attention_norm: false
+   norm_class_name: RMSNorm
+   post_attention_norm: false
+   post_mlp_norm: false
+   norm_eps: 1.0e-05
+   mlp_class_name: LLaMAMLP
+   gelu_approximate: none
+   intermediate_size: 1092
+   rope_condense_ratio: 1
+   rope_base: 500000
+   n_expert: 0
+   n_expert_per_token: 0
+ out_dir: ../out/contrain
+ precision: bf16-true
+ initial_checkpoint_dir: ../out/pretrain/pretrained_checkpoint_converted
+ resume: false
+ data:
+   class_path: litgpt.data.LitData
+   init_args:
+     data_path: ../data/
+     seed: 42
+     num_workers: 16
+ train:
+   save_interval: 100
+   log_interval: 1
+   global_batch_size: 128
+   micro_batch_size: 1
+   lr_warmup_steps: 2000
+   max_tokens: 4252334823
+   max_seq_length: 32768
+   max_norm: 1.0
+   min_lr: 4.0e-05
+ eval:
+   interval: 100
+   max_iters: 100
+   initial_validation: false
+   final_validation: true
+ optimizer:
+   class_path: grokadamw.GrokAdamW
+   init_args:
+     lr: 0.0001
+     weight_decay: 0.01
+     betas:
+     - 0.9
+     - 0.95
+ devices: auto
+ num_nodes: 1
+ tokenizer_dir: ..
+ logger_name: wandb
+ seed: 42
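
The file above is the run configuration that litgpt writes next to the checkpoint. As a reading aid, here is a minimal sketch (not part of the commit) that loads it with PyYAML and prints a few derived numbers; the relative path and PyYAML availability are assumptions.

# Sketch only: inspect the saved run config; path is an assumption.
import yaml

with open("out/contrain/final/hyperparameters.yaml") as f:
    hparams = yaml.safe_load(f)

train = hparams["train"]
# 128 sequences of 32768 tokens per optimizer step
tokens_per_step = train["global_batch_size"] * train["max_seq_length"]
print("optimizer:", hparams["optimizer"]["class_path"])       # grokadamw.GrokAdamW
print(f"tokens per optimizer step: {tokens_per_step:,}")      # 4,194,304
print(f"total training tokens:     {train['max_tokens']:,}")  # 4,252,334,823
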
out/contrain/final/lit_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f52a5c030a64c62676bc7268733a7cd38b749a090c3f03b5630bc7f3e99a831e
+ size 266271850
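
lit_model.pth is stored as a Git LFS pointer; the ~266 MB of weights are fetched with git lfs pull. A minimal sketch (not part of the commit) of checking a fetched file against the pointer's oid and size, assuming it has been pulled to the path shown in the diff:

# Sketch only: verify a fetched LFS object against its pointer; path is an assumption.
import hashlib, os

path = "out/contrain/final/lit_model.pth"
expected_oid = "f52a5c030a64c62676bc7268733a7cd38b749a090c3f03b5630bc7f3e99a831e"

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

assert os.path.getsize(path) == 266271850, "size mismatch"
assert h.hexdigest() == expected_oid, "sha256 mismatch"
print("lit_model.pth matches its LFS pointer")
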
out/contrain/final/model_config.yaml ADDED
@@ -0,0 +1,33 @@
+ attention_logit_softcapping: null
+ attention_scores_scalar: null
+ bias: false
+ block_size: 32768
+ final_logit_softcapping: null
+ gelu_approximate: none
+ head_size: 26
+ hf_config: {}
+ intermediate_size: 1092
+ lm_head_bias: false
+ mlp_class_name: LLaMAMLP
+ n_embd: 312
+ n_expert: 0
+ n_expert_per_token: 0
+ n_head: 12
+ n_layer: 10
+ n_query_groups: 4
+ name: ''
+ norm_class_name: RMSNorm
+ norm_eps: 1.0e-05
+ padded_vocab_size: 32768
+ padding_multiple: 512
+ parallel_residual: false
+ post_attention_norm: false
+ post_mlp_norm: false
+ rope_base: 500000
+ rope_condense_ratio: 1
+ rotary_percentage: 1.0
+ scale_embeddings: false
+ shared_attention_norm: false
+ sliding_window_layer_placing: null
+ sliding_window_size: null
+ vocab_size: 32768
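
model_config.yaml is the flat litgpt architecture definition for this checkpoint. A short sketch (an illustration, not part of the commit) that cross-checks the derived fields against the base hyperparameters; the path is an assumption.

# Sketch only: sanity-check derived architecture fields; path is an assumption.
import yaml

with open("out/contrain/final/model_config.yaml") as f:
    cfg = yaml.safe_load(f)

# head_size is n_embd // n_head for this architecture: 312 // 12 == 26
assert cfg["head_size"] == cfg["n_embd"] // cfg["n_head"] == 26
# grouped-query attention: 12 query heads share 4 key/value groups (3 heads per group)
assert cfg["n_head"] // cfg["n_query_groups"] == 3
print(cfg["mlp_class_name"], cfg["norm_class_name"], "rope_base =", cfg["rope_base"])
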
out/contrain/final/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b496a30dc268bcb8adfd551f693e68e9eadd06b81cab385c088a61e7663649c
+ size 1368561
out/contrain/final/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d6333d68c3280be6081b795cc160fd5872707562021f9889b2e2bd3ae508fa62
+ size 23043
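
Both tokenizer files are also LFS pointers. Once fetched, a minimal sketch of loading tokenizer.json, assuming it is in the Hugging Face fast-tokenizer format (the package choice and path are assumptions, not something the commit prescribes):

# Sketch only: load the fetched tokenizer.json; format and path are assumptions.
from transformers import PreTrainedTokenizerFast

tok = PreTrainedTokenizerFast(tokenizer_file="out/contrain/final/tokenizer.json")
ids = tok.encode("continued pretraining test")
print(len(ids), ids[:8])
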