fireballoon/baichuan-vicuna-7b · A question about the setStorage error in Baichuan-7B and Baichuan-vicuna-7B

I loaded Baichuan-7B and Baichuan-vicuna-7B with the FastChat training code, and changed the model-loading code to this:

tokenizer = transformers.AutoTokenizer.from_pretrained(model_args.model_name_or_path, use_fast=False,add_bos_token=False, model_max_length=4096,padding_side="right",trust_remote_code=True)
model = transformers.AutoModelForCausalLM.from_pretrained(
model_args.model_name_or_path,
torch_dtype=torch.float16,
trust_remote_code=True,)

But a setStorage error appeared:

╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /home/ec2-user/FastChat/fastchat/train/train_mem.py:13 in │
│ │
│ 10 from fastchat.train.train import train │
│ 11 │
│ 12 if __name__ == "__main__": │
│ ❱ 13 │ train() │
│ 14 │
│ │
│ /home/ec2-user/FastChat/fastchat/train/train.py:282 in train │
│ │
│ 279 │ if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")): │
│ 280 │ │ trainer.train(resume_from_checkpoint=True) │
│ 281 │ else: │
│ ❱ 282 │ │ trainer.train() │
│ 283 │ trainer.save_state() │
│ 284 │ safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir) │
│ 285 │
│ │
│ /home/ec2-user/anaconda3/envs/liwei2/lib/python3.10/site-packages/transformers/trainer.py:1664 │
│ in train │
│ │
│ 1661 │ │ inner_training_loop = find_executable_batch_size( │
│ 1662 │ │ │ self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size │
│ 1663 │ │ ) │
│ ❱ 1664 │ │ return inner_training_loop( │
│ 1665 │ │ │ args=args, │
│ 1666 │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1667 │ │ │ trial=trial, │
│ │
│ /home/ec2-user/anaconda3/envs/lib/python3.10/site-packages/transformers/trainer.py:1938 │
│ in _inner_training_loop │
│ │
│ 1935 │ │ │ │ ): │
│ 1936 │ │ │ │ │ # Avoid unnecessary DDP synchronization since there will be no backw │
│ 1937 │ │ │ │ │ with model.no_sync(): │
│ ❱ 1938 │ │ │ │ │ │ tr_loss_step = self.training_step(model, inputs) │
│ 1939 │ │ │ │ else: │
│ 1940 │ │ │ │ │ tr_loss_step = self.training_step(model, inputs) │
│ 1941 │
│ │
│ /home/ec2-user/anaconda3/envs/lib/python3.10/site-packages/transformers/trainer.py:2753 │
│ in training_step │
│ │
│ 2750 │ │ │ # loss gets scaled under gradient_accumulation_steps in deepspeed │
│ 2751 │ │ │ loss = self.deepspeed.backward(loss) │
│ 2752 │ │ else: │
│ ❱ 2753 │ │ │ loss.backward() │
│ 2754 │ │ │
│ 2755 │ │ return loss.detach() │
│ 2756 │
│ │
│ /home/ec2-user/anaconda3/envs/lib/python3.10/site-packages/torch/_tensor.py:487 in │
│ backward │
│ │
│ 484 │ │ │ │ create_graph=create_graph, │
│ 485 │ │ │ │ inputs=inputs, │
│ 486 │ │ │ ) │
│ ❱ 487 │ │ torch.autograd.backward( │
│ 488 │ │ │ self, gradient, retain_graph, create_graph, inputs=inputs │
│ 489 │ │ ) │
│ 490 │
│ │
│ /home/ec2-user/anaconda3/envs/lib/python3.10/site-packages/torch/autograd/__init__.py:200 │
│ in backward │
│ │
│ 197 │ # The reason we repeat same the comment below is that │
│ 198 │ # some Python versions print out the first line of a multi-line function │
│ 199 │ # calls in the traceback and some print out the last line │
│ ❱ 200 │ Variable.execution_engine.run_backward( # Calls into the C++ engine to run the bac │
│ 201 │ │ tensors, grad_tensors, retain_graph, create_graph, inputs, │
│ 202 │ │ allow_unreachable=True, accumulate_grad=True) # Calls into the C++ engine to ru │
│ 203 │
│ │
│ /home/ec2-user/anaconda3/envs/lib/python3.10/site-packages/torch/autograd/function.py:274 │
│ in apply │
│ │
│ 271 │ │ │ │ │ │ │ "Function is not allowed. You should only implement one " │
│ 272 │ │ │ │ │ │ │ "of them.") │
│ 273 │ │ user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn │
│ ❱ 274 │ │ return user_fn(self, *args) │
│ 275 │ │
│ 276 │ def apply_jvp(self, *args): │
│ 277 │ │ # _forward_cls is defined by derived class │
│ │
│ /home/ec2-user/anaconda3/envs/lib/python3.10/site-packages/torch/utils/checkpoint.py:141 │
│ in backward │
│ │
│ 138 │ │ │ with torch.enable_grad(), \ │
│ 139 │ │ │ │ torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs), \ │
│ 140 │ │ │ │ torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): │
│ ❱ 141 │ │ │ │ outputs = ctx.run_function(*detached_inputs) │
│ 142 │ │ │
│ 143 │ │ if isinstance(outputs, torch.Tensor): │
│ 144 │ │ │ outputs = (outputs,) │
│ │
│ /home/ec2-user/anaconda3/envs/lib/python3.10/site-packages/transformers/models/llama/mode │
│ ling_llama.py:566 in custom_forward │
│ │
│ 563 │ │ │ │ def create_custom_forward(module): │
│ 564 │ │ │ │ │ def custom_forward(*inputs): │
│ 565 │ │ │ │ │ │ # None for past_key_value │
│ ❱ 566 │ │ │ │ │ │ return module(*inputs, output_attentions, None) │
│ 567 │ │ │ │ │ │
│ 568 │ │ │ │ │ return custom_forward │
│ 569 │
│ │
│ /home/ec2-user/anaconda3/envs/lib/python3.10/site-packages/torch/nn/modules/module.py:150 │
│ 1 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /home/ec2-user/anaconda3/envs/lib/python3.10/site-packages/transformers/models/llama/mode │
│ ling_llama.py:293 in forward │
│ │
│ 290 │ │ hidden_states = self.input_layernorm(hidden_states) │
│ 291 │ │ │
│ 292 │ │ # Self Attention │
│ ❱ 293 │ │ hidden_states, self_attn_weights, present_key_value = self.self_attn( │
│ 294 │ │ │ hidden_states=hidden_states, │
│ 295 │ │ │ attention_mask=attention_mask, │
│ 296 │ │ │ position_ids=position_ids, │
│ │
│ /home/ec2-user/anaconda3/envs/lib/python3.10/site-packages/torch/nn/modules/module.py:150 │
│ 1 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /home/ec2-user/FastChat/fastchat/train/llama_flash_attn_monkey_patch.py:32 in forward │
│ │
│ 29 │ bsz, q_len, _ = hidden_states.size() │
│ 30 │ │
│ 31 │ query_states = ( │
│ ❱ 32 │ │ self.q_proj(hidden_states) │
│ 33 │ │ .view(bsz, q_len, self.num_heads, self.head_dim) │
│ 34 │ │ .transpose(1, 2) │
│ 35 │ ) │
│ │
│ /home/ec2-user/anaconda3/envs/lib/python3.10/site-packages/torch/nn/modules/module.py:150 │
│ 1 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /home/ec2-user/anaconda3/envs/lib/python3.10/site-packages/torch/nn/modules/linear.py:114 │
│ in forward │
│ │
│ 111 │ │ │ init.uniform_(self.bias, -bound, bound) │
│ 112 │ │
│ 113 │ def forward(self, input: Tensor) -> Tensor: │
│ ❱ 114 │ │ return F.linear(input, self.weight, self.bias) │
│ 115 │ │
│ 116 │ def extra_repr(self) -> str: │
│ 117 │ │ return 'in_features={}, out_features={}, bias={}'.format( │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: setStorage: sizes [4096, 4096], strides [1, 4096], storage offset 6333644800, and itemsize 2 requiring
a storage size of 12700844032 are out of bounds for storage of size 0

And the error with Baichuan-7B is:

RuntimeError: setStorage: sizes [4096, 12288], strides [1, 4096], storage offset 6333644800, and itemsize 2 requiring
a storage size of 12700844032 are out of bounds for storage of size 0

Have you ever encountered this problem?