tangledgroup
/

tangled-llama-33m-32k-instruct-v0.1

Text Generation

Inference Endpoints

Model card Files Files and versions Community

mtasic85 committed on 14 days ago

Commit

24ca5e3

•

1 Parent(s): 049ca3f

model

Files changed (1) hide show

scripts/model.yaml +3 -3

scripts/model.yaml CHANGED Viewed

@@ -73,7 +73,7 @@ train:
   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
   # max_tokens: 3000000000000
-  max_tokens: 12757004469 # 129767 * 32769 * 3
   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
   max_steps:
@@ -110,9 +110,9 @@ eval:
 # Optimizer-related arguments
 optimizer:
   # class_path: torch.optim.AdamW
-  # class_path: grokadamw.GrokAdamW
   # class_path: bitsandbytes.optim.AdamW8bit
-  class_path: bitsandbytes.optim.PagedAdamW8bit
   init_args:
     #   (type: float, default: 0.001)

   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
   # max_tokens: 3000000000000
+  max_tokens: 4252334823 # 129767 * 32769 * 1
   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
   max_steps:
 # Optimizer-related arguments
 optimizer:
   # class_path: torch.optim.AdamW
+  class_path: grokadamw.GrokAdamW
   # class_path: bitsandbytes.optim.AdamW8bit
+  # class_path: bitsandbytes.optim.PagedAdamW8bit
   init_args:
     #   (type: float, default: 0.001)