base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer

load_in_8bit: false
load_in_4bit: false
strict: false

pretraining_dataset:
  - path: augmxnt/shisa-pretrain-en-ja-v1
    type: completion
total_supervised_tokens: true
pretrain_multipack_attn: false
dataset_processes: 32
val_set_size: 0.0
output_dir: ./out
pretrain_multipack_buffer_size: 100000
max_steps: 4702818

sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
eval_sample_packing: false

adapter:
lora_model_dir:
lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:
lora_fan_in_fan_out:

wandb_project: tiny-llama
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 64
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_apex_fused
lr_scheduler: cosine
learning_rate: 5e-5
adam_beta1: 0.9
adam_beta2: 0.95

train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
tf32: false

gradient_checkpointing: false
early_stopping_patience:
resume_from_checkpoint:
auto_resume_from_checkpoints:
logging_steps: 1
xformers_attention:
flash_attention: true
flash_attn_cross_entropy: false
flash_attn_rms_norm: true
flash_attn_fuse_qkv: false
flash_attn_fuse_mlp: true

save_total_limit: 15
warmup_steps: 100
evals_per_epoch:
eval_table_size:
save_steps: 250
saves_per_epoch:
debug:
deepspeed: deepspeed_configs/zero1.json
weight_decay: 0.1
fsdp:
fsdp_config:
special_tokens:
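
# Note (not part of the original config): a sketch of the batch arithmetic implied above,
# assuming a hypothetical 8-GPU node; the GPU count is not specified anywhere in this file.
#   effective batch = micro_batch_size * gradient_accumulation_steps * num_gpus
#                   = 1 * 64 * 8 = 512 packed sequences per optimizer step
#   tokens per step ≈ 512 * sequence_len (2048) ≈ 1.05M tokens, given sample_packing: true
#
# Launch sketch, assuming the standard axolotl CLI and that this file is saved as pretrain.yml
# (the filename is illustrative):
#   accelerate launch -m axolotl.cli.train pretrain.yml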