Upload Llama3.1 with Whisper Tokenizer at step 5000

Browse files

Files changed (3) hide show

8B_full.yaml +90 -0
config.json +1 -39
log_1723708612.txt +0 -0

8B_full.yaml ADDED Viewed

	@@ -0,0 +1,90 @@

+# Config for multi-device full finetuning in full_finetune_distributed.py
+# using a Llama3.1 8B model (torchtune.models.llama3_1.llama3_1_s_8b)
+#
+# This config assumes that you've run the following command before launching
+# this run:
+#   tune download meta-llama/Meta-Llama-3-8B-Instruct --output-dir /tmp/Meta-Llama-3-8B-Instruct --hf-token <HF_TOKEN>
+#
+# To launch on 4 devices, run the following command from root:
+#   tune run --nproc_per_node 4 full_finetune_distributed --config llama3/8B_full
+#
+# You can add specific overrides through the command line. For example,
+# to override the checkpointer directory while launching training,
+# you can run:
+#   tune run --nproc_per_node 4 full_finetune_distributed --config llama3/8B_full checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+#
+# This config works best when the model is being fine-tuned on 2+ GPUs.
+# Single-device full finetuning requires more memory optimizations. It's
+# best to use 8B_full_single_device.yaml for those cases.
+# Tokenizer
+tokenizer:
+  _component_: torchtune.models.llama3.llama3_s_tokenizer
+  path: ../model_zoo/tokenizer.model
+  max_seq_len: 1024
+# Dataset
+dataset:
+  _component_: torchtune.datasets.chat_dataset
+  source: homebrewltd/instruction-speech-whispervq-v2
+  conversation_style: openai
+  max_seq_len: 1024
+  split: train
+  train_on_input: True
+seed: 42
+shuffle: True
+# Model Arguments
+model:
+  _component_: torchtune.models.llama3_1.llama3_1_s_8b
+  # path: model_zoo/Llama3.1_s_8b_init
+checkpointer:
+  _component_: torchtune.utils.FullModelHFCheckpointerSaveSteps
+  checkpoint_dir: ../model_zoo/llama3.1-s-base-2024-08-17
+  checkpoint_files: [
+    pytorch_model.bin,
+  ]
+  recipe_checkpoint: null
+  output_dir: ../model_zoo/llama3-s-instruct2
+  model_type: LLAMA3
+resume_from_checkpoint: False
+save_every_n_steps: 1000
+max_checkpoints: 3
+# Fine-tuning arguments
+batch_size: 8
+epochs: 5
+max_steps_per_epoch: null
+gradient_accumulation_steps: 2
+compile: False
+# Optimizer and Scheduler
+optimizer:
+  _component_: torch.optim.AdamW  # To use Adam-mini instead, change this to torchtune.modules.optimizer.Adam_mini
+  weight_decay: 0.005
+  lr: 1e-4
+  fused: True
+lr_scheduler:
+  _component_: torchtune.modules.get_cosine_schedule_with_warmup
+  num_warmup_steps: 80
+loss:
+  _component_: torch.nn.CrossEntropyLoss
+fsdp:
+  cpu_offload: False
+# Training env
+device: cuda
+dtype: bf16
+# Memory management
+enable_activation_checkpointing: True
+memory_efficient_fsdp_wrap: True
+ac_mode: 'selective'
+# Logging
+metric_logger:
+  _component_: torchtune.utils.metric_logging.DiskLogger
+  log_dir: ${output_dir}
+output_dir: ../model_zoo/Llama3-instruct2-log/
+log_every_n_steps: 1
+log_peak_memory_stats: False

config.json CHANGED Viewed

@@ -1,39 +1 @@
-{
-  "_name_or_path": "llama3.1-s-base-2024-08-17/",
-  "architectures": [
-    "LlamaForCausalLM"
-  ],
-  "attention_bias": false,
-  "attention_dropout": 0.0,
-  "bos_token_id": 128000,
-  "eos_token_id": [
-    128001,
-    128008,
-    128009
-  ],
-  "hidden_act": "silu",
-  "hidden_size": 4096,
-  "initializer_range": 0.02,
-  "intermediate_size": 14336,
-  "max_position_embeddings": 131072,
-  "mlp_bias": false,
-  "model_type": "llama",
-  "num_attention_heads": 32,
-  "num_hidden_layers": 32,
-  "num_key_value_heads": 8,
-  "pretraining_tp": 1,
-  "rms_norm_eps": 1e-05,
-  "rope_scaling": {
-    "factor": 8.0,
-    "high_freq_factor": 4.0,
-    "low_freq_factor": 1.0,
-    "original_max_position_embeddings": 8192,
-    "rope_type": "llama3"
-  },
-  "rope_theta": 500000.0,
-  "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.44.0",
-  "use_cache": true,
-  "vocab_size": 128771
-}


1	+ {"_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct", "architectures": ["LlamaForCausalLM"], "attention_bias": false, "attention_dropout": 0.0, "bos_token_id": 128000, "eos_token_id": [128001, 128008, 128009], "hidden_act": "silu", "hidden_size": 4096, "initializer_range": 0.02, "intermediate_size": 14336, "max_position_embeddings": 131072, "mlp_bias": false, "model_type": "llama", "num_attention_heads": 32, "num_hidden_layers": 32, "num_key_value_heads": 8, "pretraining_tp": 1, "rms_norm_eps": 1e-05, "rope_scaling": {"factor": 8.0, "high_freq_factor": 4.0, "low_freq_factor": 1.0, "original_max_position_embeddings": 8192, "rope_type": "llama3"}, "rope_theta": 500000.0, "tie_word_embeddings": false, "torch_dtype": "bfloat16", "transformers_version": "4.43.1", "use_cache": true, "vocab_size": 128771}

log_1723708612.txt ADDED Viewed

The diff for this file is too large to render. See raw diff