Supervised fine-tuning error with phi-2: rotary embedding size mismatch
I am new to working with Hugging Face models and LLMs in general, so any help will be appreciated.
I am trying to run a supervised fine-tuning experiment with phi-2 on my custom dataset. I have collected data samples of the form {"instruction": ..., "input": ..., "output": ...}.
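For reference, one record in sft_dataset.json looks like the line below (the values are made up for illustration; only the field names and shape match my data):
{"instruction": "Summarize the paragraph below in one sentence.", "input": "The quick brown fox jumps over the lazy dog near the river bank.", "output": "A fox jumps over a dog by the river."}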
I am getting the following error during training and cannot work out where it is coming from. Training starts normally, but every run crashes after 2-3 input sequences with this error:
File "/huggingface/modules/transformers_modules/phi-2/modeling_phi.py", line 158, in _apply_rotary_emb_qkv
q_rot = torch.cat([q1 * c - q2 * s, q1 * s + q2 * c], axis=-1).to(qkv.dtype)
~~~^~~
RuntimeError: The size of tensor a (328) must match the size of tensor b (319) at non-singleton dimension 1
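In case it helps narrow things down, here is a quick sanity check I can run with the same tokenizer, dataset, and formatting function from the script further down, just to print the tokenized length of the first few formatted prompts (a sketch, not part of the training run):
# Sketch: print tokenized lengths of the first few formatted prompts
sample = custom_dataset.select(range(4))      # first 4 records
prompts = formatting_prompts_func(sample)     # same Alpaca-style formatting used in training
for p in prompts:
    ids = tokenizer(p, return_tensors="pt").input_ids
    print(ids.shape)                          # sequence length is ids.shape[1]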
I am attaching my code for supervised fine-tuning:
import glob
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig
import torch
import torch.nn as nn
from torch.cuda.amp import autocast
from datasets import Dataset, load_dataset
import json
import peft
from trl import SFTTrainer
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
from peft import LoraConfig
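# FSDP: gather full (unsharded) model and optimizer state dicts, offloaded to CPU, on every rank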
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
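# QLoRA-style 4-bit NF4 quantization with nested (double) quantization and float16 compute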
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype='float16',
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "phi-2",
    quantization_config=bnb_config,  # 4-bit config above; also passing load_in_8bit here would conflict
    trust_remote_code=True,
    torch_dtype=torch.float16,
    revision="refs/pr/1",
)
model.config.use_cache = False
print(model)
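# LoRA adapter configuration: rank-32 adapters on the fused attention projection (Wqkv)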
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias='none',
    task_type='CAUSAL_LM',
    # target_modules=["out_proj", "Wqkv"]
    target_modules=["Wqkv"],  # ,"fc1","fc2"]
)
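# Attach the LoRA adapters to the quantized base model and let Accelerate prepare it for training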
model = peft.get_peft_model(model, peft_config)
model = accelerator.prepare_model(model)
if torch.cuda.device_count() > 1:  # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True
model.print_trainable_parameters()
tokenizer = AutoTokenizer.from_pretrained("phi-2", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
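# Load the custom JSON dataset and build Alpaca-style prompts for SFTTrainer's formatting_func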
custom_dataset = load_dataset("json", data_files="sft_dataset.json", split="train")
def formatting_prompts_func(examples):
    output_text = []
    for i in range(len(examples["instruction"])):
        instruction = examples["instruction"][i]
        input_text = examples["input"][i]
        response = examples["output"][i]
        if len(input_text) >= 2:
            text = f'''Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Input:
{input_text}
### Response:
{response}
'''
        else:
            text = f'''Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:
{response}
'''
        output_text.append(text)
    return output_text
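# Training hyperparameters: fp16, cosine LR schedule, paged 8-bit AdamW, gradient accumulation of 4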
training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    # per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="logs",
    logging_steps=1,
    remove_unused_columns=True,
    gradient_accumulation_steps=4,
    # gradient_checkpointing=True,
    bf16=False,
    fp16=True,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",
    max_grad_norm=0.3,
    learning_rate=2.5e-5,
)
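# SFTTrainer applies formatting_prompts_func to each batch and truncates sequences at max_seq_length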
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=custom_dataset,
    packing=False,
    max_seq_length=2048,
    # eval_dataset=custom_dataset,
    # peft_config=peft_config,
    formatting_func=formatting_prompts_func,
    tokenizer=tokenizer,
)
trainer.train()
model.save_pretrained("fine_tuned_model")
Another query: is it possible to run this experiment on two 8 GB GPUs? I have been trying to set up a separate script based on a notebook I found online, but one of the GPUs keeps running out of memory.
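For context, that multi-GPU attempt loads the model roughly like the sketch below, reusing the same bnb_config as above; the max_memory caps are placeholder guesses on my part, not values I have verified:
# Sketch: shard the 4-bit model across two GPUs via accelerate's device_map
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "phi-2",
    quantization_config=bnb_config,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",                  # let accelerate place layers on both GPUs
    max_memory={0: "7GiB", 1: "7GiB"},  # placeholder per-GPU caps for two 8 GB cards
)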