# LLaMA-Factory supervised fine-tuning (SFT) configuration.
# NOTE(review): the original file had every key collapsed onto a single
# physical line, which is not valid YAML (plain scalars cannot contain ": ").
# Reformatted to one key per line; all key/value pairs preserved verbatim.

# BAdam block-coordinate optimizer settings (active because use_badam: true below)
badam_mode: layer
badam_switch_interval: 7
badam_switch_mode: ascending
badam_update_ratio: 0.05

bf16: true
cutoff_len: 10240
dataset: api_eval_dataset
dataset_dir: data
ddp_timeout: 180000000
do_train: true
finetuning_type: full
flash_attn: fa2
gradient_accumulation_steps: 4
include_num_input_tokens_seen: true
learning_rate: 3.0e-06
logging_steps: 5
lr_scheduler_type: cosine
max_grad_norm: 1.0
# effectively "use the whole dataset" — cap far above any realistic sample count
max_samples: 10000000
model_name_or_path: microsoft/Phi-3.5-mini-instruct
num_train_epochs: 32.0
optim: adamw_torch
output_dir: saves/Custom/full/SpecAI
packing: false
per_device_train_batch_size: 4
plot_loss: true
preprocessing_num_workers: 16
report_to: none
save_steps: 2000
# NOTE(review): shift_attn (S^2-Attn) together with flash_attn: fa2 — verify
# this combination is supported for this model in the framework version used
shift_attn: true
stage: sft
template: phi
use_badam: true
# NOTE(review): no LR warmup with a cosine schedule — confirm this is intentional
warmup_steps: 0