# Config for multi-device full finetuning in full_finetune_distributed.py
# using a Llama3 8B Instruct model
#
# This config assumes that you've run the following command before launching
# this run:
#   tune download meta-llama/Meta-Llama-3-8B-Instruct --output-dir /tmp/Meta-Llama-3-8B-Instruct --hf-token <HF_TOKEN>
#
# To launch on 4 devices, run the following command from root:
#   tune run --nproc_per_node 4 full_finetune_distributed --config llama3/8B_full
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
#   tune run --nproc_per_node 4 full_finetune_distributed --config llama3/8B_full checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# Single device full finetuning requires more memory optimizations. It's
# best to use 8B_full_single_device.yaml for those cases
#
# NOTE(review): the header above names a Llama3 8B Instruct download and the
# `llama3/8B_full` config, but the sections below load `llama3_1_s_8b` from a
# local ../model_zoo checkpoint. The commands look inherited from a template;
# confirm the download/launch commands that actually apply to this file.

# Tokenizer
tokenizer:
  _component_: torchtune.models.llama3.llama3_s_tokenizer
  path: ../model_zoo/tokenizer.model
  max_seq_len: 1024

# Dataset
dataset:
  _component_: torchtune.datasets.chat_dataset
  source: homebrewltd/instruction-speech-whispervq-v2
  conversation_style: openai
  # Same limit as tokenizer.max_seq_len above; keep the two in sync.
  max_seq_len: 1024
  split: train
  train_on_input: true

# NOTE(review): seed/shuffle are placed at top level (recipe-wide settings),
# not under `dataset` -- confirm against the recipe that consumes this file.
seed: 42
shuffle: true

# Model Arguments
model:
  _component_: torchtune.models.llama3_1.llama3_1_s_8b
  # path: model_zoo/Llama3.1_s_8b_init

checkpointer:
  _component_: torchtune.utils.FullModelHFCheckpointerSaveSteps
  checkpoint_dir: ../model_zoo/llama3.1-s-base-2024-08-17
  checkpoint_files: [pytorch_model.bin]
  recipe_checkpoint: null
  # Checkpoint output location; distinct from the top-level `output_dir`
  # near the end of this file, which is used for logs.
  output_dir: ../model_zoo/llama3-s-instruct2
  model_type: LLAMA3
resume_from_checkpoint: false
# NOTE(review): assumed to be top-level recipe options rather than
# checkpointer constructor arguments -- confirm.
save_every_n_steps: 1000
max_checkpoints: 3

# Fine-tuning arguments
batch_size: 8
epochs: 5
max_steps_per_epoch: null
gradient_accumulation_steps: 2
compile: false

# Optimizer and Scheduler
optimizer:
  # Change this to use adam_mini: torchtune.modules.optimizer.Adam_mini
  _component_: torch.optim.AdamW
  weight_decay: 0.005
  # Written with a decimal point so YAML 1.1 loaders (e.g. stock PyYAML)
  # parse a float; a bare `1e-4` is read as the string "1e-4" there.
  lr: 1.0e-4
  fused: true
lr_scheduler:
  _component_: torchtune.modules.get_cosine_schedule_with_warmup
  num_warmup_steps: 80

loss:
  _component_: torch.nn.CrossEntropyLoss

fsdp:
  cpu_offload: false

# Training env
device: cuda
dtype: bf16

# Memory management
enable_activation_checkpointing: true
memory_efficient_fsdp_wrap: true
ac_mode: 'selective'

# Logging
metric_logger:
  _component_: torchtune.utils.metric_logging.DiskLogger
  # Interpolated from the top-level `output_dir` key defined just below
  # (not from checkpointer.output_dir).
  log_dir: ${output_dir}
output_dir: ../model_zoo/Llama3-instruct2-log/
log_every_n_steps: 1
log_peak_memory_stats: false