mtasic85 committed on
Commit
60e17da
1 Parent(s): 8fa271a

pretrain model

Browse files
Files changed (1) hide show
  1. scripts/pretrain-model.yaml +2 -2
scripts/pretrain-model.yaml CHANGED
@@ -11,7 +11,7 @@ model_config:
11
  vocab_size: 32768
12
  block_size: 8192
13
  n_layer: 32
14
- n_head: 32
15
  head_size: 64
16
  n_embd: 768
17
  n_query_groups: 4
@@ -69,7 +69,7 @@ train:
69
  global_batch_size: 512
70
 
71
  # Number of samples per data-parallel rank (type: int, default: 4)
72
- micro_batch_size: 29
73
 
74
  # Number of iterations with learning rate warmup active (type: int, default: 2000)
75
  lr_warmup_steps: 0
 
11
  vocab_size: 32768
12
  block_size: 8192
13
  n_layer: 32
14
+ n_head: 16
15
  head_size: 64
16
  n_embd: 768
17
  n_query_groups: 4
 
69
  global_batch_size: 512
70
 
71
  # Number of samples per data-parallel rank (type: int, default: 4)
72
+ micro_batch_size: 32
73
 
74
  # Number of iterations with learning rate warmup active (type: int, default: 2000)
75
  lr_warmup_steps: 0