mtasic85 committed
Commit 76d7e32
Parent: 4af8cee

model initial

out/pretrain/initial/config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:94e73e086a5ed14149cee99a1aa3e2563ec7ab536c1653ff332999afa3520694
+ size 546
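
This entry (like the .pth and tokenizer files below) is a Git LFS pointer rather than the actual payload. A minimal sketch of how one could check that a locally downloaded file matches the oid and size recorded in such a pointer; the paths in the example are illustrative, not part of the commit:

```python
# Sketch: verify a downloaded file against a Git LFS pointer (spec v1).
# Pointer and target paths below are hypothetical examples.
import hashlib
import os

def read_pointer(pointer_path):
    """Parse the key/value lines of a git-lfs pointer file."""
    fields = {}
    with open(pointer_path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

def matches_pointer(pointer_path, target_path):
    fields = read_pointer(pointer_path)
    expected_oid = fields["oid"].removeprefix("sha256:")
    expected_size = int(fields["size"])
    h = hashlib.sha256()
    with open(target_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return (h.hexdigest() == expected_oid
            and os.path.getsize(target_path) == expected_size)

# Hypothetical usage with a resolved local copy of the file:
# print(matches_pointer("config.json.pointer", "config.json"))
```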
out/pretrain/initial/hyperparameters.yaml ADDED
@@ -0,0 +1,66 @@
+ model_name: tiny-llama-1.1b
+ model_config:
+   name: ''
+   hf_config: {}
+   scale_embeddings: false
+   block_size: 32768
+   vocab_size: 32768
+   padding_multiple: 512
+   padded_vocab_size: 32768
+   n_layer: 10
+   n_head: 12
+   n_embd: 312
+   rotary_percentage: 1.0
+   parallel_residual: false
+   bias: false
+   lm_head_bias: false
+   n_query_groups: 4
+   shared_attention_norm: false
+   norm_class_name: RMSNorm
+   post_attention_norm: false
+   post_mlp_norm: false
+   norm_eps: 1.0e-05
+   mlp_class_name: LLaMAMLP
+   gelu_approximate: none
+   intermediate_size: 1092
+   rope_condense_ratio: 1
+   rope_base: 500000
+   n_expert: 0
+   n_expert_per_token: 0
+ out_dir: ../out/pretrain
+ precision: bf16-true
+ resume: auto
+ data:
+   class_path: litgpt.data.LitData
+   init_args:
+     data_path: ../data/
+     seed: 42
+     num_workers: 16
+ train:
+   save_interval: 1000
+   log_interval: 1
+   global_batch_size: 512
+   micro_batch_size: 16
+   lr_warmup_steps: 2000
+   max_tokens: 9782206713
+   max_seq_length: 2048
+   max_norm: 1.0
+   min_lr: 4.0e-05
+ eval:
+   interval: 1000
+   max_iters: 100
+   initial_validation: false
+   final_validation: false
+ optimizer:
+   class_path: grokadamw.GrokAdamW
+   init_args:
+     lr: 5.0e-05
+     weight_decay: 0.1
+     betas:
+     - 0.9
+     - 0.95
+ devices: auto
+ num_nodes: 1
+ tokenizer_dir: ..
+ logger_name: wandb
+ seed: 42
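
A minimal sketch (assuming PyYAML is available) that loads this hyperparameters file and prints a few quantities it implies, such as the gradient-accumulation factor and the approximate number of optimizer steps needed to reach max_tokens:

```python
# Sketch: derive a few training-schedule numbers from hyperparameters.yaml.
# Assumes PyYAML; note the per-device accumulation factor also depends on how
# many devices the run actually uses, which "devices: auto" leaves open.
import yaml

with open("out/pretrain/initial/hyperparameters.yaml") as f:
    hp = yaml.safe_load(f)

train = hp["train"]
accumulation = train["global_batch_size"] // train["micro_batch_size"]   # 512 / 16
tokens_per_step = train["global_batch_size"] * train["max_seq_length"]   # 512 * 2048

print(f"micro-batches per global batch: {accumulation}")
print(f"tokens per optimizer step:      {tokens_per_step}")
print(f"approx. steps to max_tokens:    {train['max_tokens'] // tokens_per_step}")
```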
out/pretrain/initial/lit_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:19358c65b29f92cd191e18609d9eb8ad27872d96a90c6acfed76b541648d5aa8
+ size 266269738
out/pretrain/initial/model_config.yaml ADDED
@@ -0,0 +1,33 @@
+ attention_logit_softcapping: null
+ attention_scores_scalar: null
+ bias: false
+ block_size: 32768
+ final_logit_softcapping: null
+ gelu_approximate: none
+ head_size: 26
+ hf_config: {}
+ intermediate_size: 1092
+ lm_head_bias: false
+ mlp_class_name: LLaMAMLP
+ n_embd: 312
+ n_expert: 0
+ n_expert_per_token: 0
+ n_head: 12
+ n_layer: 10
+ n_query_groups: 4
+ name: ''
+ norm_class_name: RMSNorm
+ norm_eps: 1.0e-05
+ padded_vocab_size: 32768
+ padding_multiple: 512
+ parallel_residual: false
+ post_attention_norm: false
+ post_mlp_norm: false
+ rope_base: 500000
+ rope_condense_ratio: 1
+ rotary_percentage: 1.0
+ scale_embeddings: false
+ shared_attention_norm: false
+ sliding_window_layer_placing: null
+ sliding_window_size: null
+ vocab_size: 32768
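
As a rough sanity check, the architecture fields above can be turned into an approximate parameter count. The sketch below assumes untied input/output embeddings and omits the small RMSNorm weights, so treat the result as an estimate rather than an exact figure:

```python
# Sketch: back-of-the-envelope parameter count from model_config.yaml.
# Assumes untied token-embedding / LM-head weights and ignores RMSNorm weights.
n_layer, n_head, n_query_groups = 10, 12, 4
n_embd, head_size = 312, 26
intermediate_size, padded_vocab_size = 1092, 32768

# Grouped-query attention: full-width Q, K/V shared across query groups,
# plus the output projection (all without bias, per the config).
q_proj = n_embd * (n_head * head_size)
kv_proj = 2 * n_embd * (n_query_groups * head_size)
out_proj = (n_head * head_size) * n_embd
attention = q_proj + kv_proj + out_proj

# LLaMAMLP: gate, up and down projections.
mlp = 3 * n_embd * intermediate_size

embeddings = 2 * padded_vocab_size * n_embd  # token embedding + LM head

total = n_layer * (attention + mlp) + embeddings
print(f"~{total / 1e6:.1f} M parameters")
```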
out/pretrain/initial/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b496a30dc268bcb8adfd551f693e68e9eadd06b81cab385c088a61e7663649c
+ size 1368561
out/pretrain/initial/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d6333d68c3280be6081b795cc160fd5872707562021f9889b2e2bd3ae508fa62
+ size 23043