Cannot run inference: got multiple values for keyword argument 'return_dict'

#25
opened by Turlan

I tried running inference on the model (nvidia/NVLM-D-72B) as shown in the model card, but it failed with the same error for both text+image and pure-text inputs (full traceback at the bottom). Running Qwen2-VL (Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8) did not trigger any errors. Do you know how I might fix this?

Hardware: 3x A100, 100GB+ RAM
torch_dtype=torch.bfloat16
device_map=split_model()
transformers==4.46.1
torch==2.5.1
CUDA tools release 12.2, V12.2.128
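
For reference, this is roughly the snippet I ran, adapted from the model card. `split_model()` and `load_image()` are the helper functions defined there (GPU layer mapping and image tiling) and are assumed to be defined above:

```python
import torch
from transformers import AutoModel, AutoTokenizer

path = 'nvidia/NVLM-D-72B'
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,    # pulls in modeling_nvlm_d.py from the hub
    device_map=split_model(),  # helper from the model card
).eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)

generation_config = dict(max_new_tokens=1024, do_sample=False)

# image + text -> fails with the TypeError below
pixel_values = load_image('/scratch/$USER/test_image.png', max_num=6).to(torch.bfloat16)
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')
```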

TypeError Traceback (most recent call last)
Cell In[17], line 33
30 pixel_values = load_image('/scratch/$USER/test_image.png', max_num=6).to(
31 torch.bfloat16)
32 question = '<image>\nPlease describe the image shortly.'
---> 33 response = model.chat(tokenizer, pixel_values, question, generation_config)
34 print(f'User: {question}\nAssistant: {response}')
(...)
57
58 # response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

File /scratch/$USER/apptainer_env/hf_cache/modules/transformers_modules/nvidia/NVLM-D-72B/5a57d927ac0ab6b0a96ebc90f5ee7901ddca790d/modeling_nvlm_d.py:255, in NVLM_D_Model.chat(self, tokenizer, pixel_values, question, generation_config, history, return_history, num_patches_list, IMG_START_TOKEN, IMG_END_TOKEN, IMG_CONTEXT_TOKEN, verbose, visual_features)
253 attention_mask = model_inputs['attention_mask'].cuda()
254 generation_config['eos_token_id'] = eos_token_id
--> 255 generation_output = self.generate(
256 pixel_values=pixel_values,
257 visual_features=visual_features,
258 input_ids=input_ids,
259 attention_mask=attention_mask,
260 **generation_config
261 )
262 response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
263 response = response.split(template.sep)[0].strip()

File /scratch/$USER/apptainer_env/venv/lib/python3.10/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)

File /scratch/$USER/apptainer_env/hf_cache/modules/transformers_modules/nvidia/NVLM-D-72B/5a57d927ac0ab6b0a96ebc90f5ee7901ddca790d/modeling_nvlm_d.py:424, in NVLM_D_Model.generate(self, pixel_values, input_ids, attention_mask, visual_features, generation_config, output_hidden_states, return_dict, **generate_kwargs)
421 else:
422 input_embeds = self.language_model.get_input_embeddings()(input_ids)
--> 424 outputs = self.language_model.generate(
425 inputs_embeds=input_embeds,
426 attention_mask=attention_mask,
427 generation_config=generation_config,
428 output_hidden_states=output_hidden_states,
429 return_dict=return_dict,
430 use_cache=True,
431 **generate_kwargs,
432 )
434 return outputs

File /scratch/$USER/apptainer_env/venv/lib/python3.10/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)

File /scratch/$USER/apptainer_env/venv/lib/python3.10/site-packages/transformers/generation/utils.py:2215, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
2207 input_ids, model_kwargs = self._expand_inputs_for_generation(
2208 input_ids=input_ids,
2209 expand_size=generation_config.num_return_sequences,
2210 is_encoder_decoder=self.config.is_encoder_decoder,
2211 **model_kwargs,
2212 )
2214 # 12. run sample (it degenerates to greedy search when generation_config.do_sample=False)
-> 2215 result = self._sample(
2216 input_ids,
2217 logits_processor=prepared_logits_processor,
2218 stopping_criteria=prepared_stopping_criteria,
2219 generation_config=generation_config,
2220 synced_gpus=synced_gpus,
2221 streamer=streamer,
2222 **model_kwargs,
2223 )
2225 elif generation_mode in (GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH):
2226 # 11. prepare beam search scorer
2227 beam_scorer = BeamSearchScorer(
2228 batch_size=batch_size,
2229 num_beams=generation_config.num_beams,
(...)
2234 max_length=generation_config.max_length,
2235 )

File /scratch/$USER/apptainer_env/venv/lib/python3.10/site-packages/transformers/generation/utils.py:3206, in GenerationMixin._sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)
3203 model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})
3205 # forward pass to get next token
-> 3206 outputs = self(**model_inputs, return_dict=True)
3208 # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
3209 model_kwargs = self._update_model_kwargs_for_generation(
3210 outputs,
3211 model_kwargs,
3212 is_encoder_decoder=self.config.is_encoder_decoder,
3213 )

TypeError: Qwen2ForCausalLM(
(model): Qwen2Model(
(embed_tokens): Embedding(152064, 8192)
(layers): ModuleList(
(0-79): 80 x Qwen2DecoderLayer(
(self_attn): Qwen2Attention(
(q_proj): Linear(in_features=8192, out_features=8192, bias=True)
(k_proj): Linear(in_features=8192, out_features=1024, bias=True)
(v_proj): Linear(in_features=8192, out_features=1024, bias=True)
(o_proj): Linear(in_features=8192, out_features=8192, bias=False)
(rotary_emb): Qwen2RotaryEmbedding()
)
(mlp): Qwen2MLP(
(gate_proj): Linear(in_features=8192, out_features=29568, bias=False)
(up_proj): Linear(in_features=8192, out_features=29568, bias=False)
(down_proj): Linear(in_features=29568, out_features=8192, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Qwen2RMSNorm((8192,), eps=1e-06)
(post_attention_layernorm): Qwen2RMSNorm((8192,), eps=1e-06)
)
)
(norm): Qwen2RMSNorm((8192,), eps=1e-06)
(rotary_emb): Qwen2RotaryEmbedding()
)
(lm_head): Linear(in_features=8192, out_features=152064, bias=False)
) got multiple values for keyword argument 'return_dict'
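
As far as I can tell from the traceback, the root cause is that NVLM_D_Model.generate forwards its own return_dict argument into self.language_model.generate(...) (modeling_nvlm_d.py line 424). With transformers 4.46 that unused kwarg ends up in model_inputs, and _sample then calls self(**model_inputs, return_dict=True), so return_dict is supplied twice. A minimal, library-independent illustration of the Python error:

```python
def forward(**model_inputs):
    """Stand-in for the model's __call__ (illustration only)."""
    return model_inputs

model_inputs = {'input_ids': [1, 2, 3], 'return_dict': False}

# return_dict arrives both inside **model_inputs and as an explicit keyword,
# which is exactly the collision in _sample():
forward(**model_inputs, return_dict=True)
# TypeError: forward() got multiple values for keyword argument 'return_dict'
```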

I have temporarily solved it by editing this file:

nano +3206 /opt/conda/lib/python3.10/site-packages/transformers/generation/utils.py

I have changed this line:

outputs = self(**model_inputs, return_dict=True)

To this:

outputs = self(**model_inputs)
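
Note that editing site-packages will be undone by the next pip upgrade. A less invasive alternative (an untested sketch based on the traceback above) is to patch the cached remote code instead, so that NVLM_D_Model.generate no longer forwards return_dict into the inner generate call, around line 424 of modeling_nvlm_d.py:

```python
# modeling_nvlm_d.py, NVLM_D_Model.generate, around line 424
outputs = self.language_model.generate(
    inputs_embeds=input_embeds,
    attention_mask=attention_mask,
    generation_config=generation_config,
    output_hidden_states=output_hidden_states,
    # return_dict=return_dict,  # removed: transformers 4.46 forwards this
    # unused kwarg into the forward pass, where _sample already passes
    # return_dict=True, producing the duplicate-keyword TypeError
    use_cache=True,
    **generate_kwargs,
)
```

Alternatively, pinning transformers to the version the model card was tested with should also avoid the collision.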
