Error running the example code
#1 by HAvietisov - opened
Tried to run this using sample code from the description:
from transformers import AutoTokenizer, pipeline, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

model_name_or_path = "TheBloke/CodeLlama-34B-GPTQ"

use_triton = False

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
        use_safetensors=True,
        trust_remote_code=False,
        device="cuda:0",
        use_triton=use_triton,
        quantize_config=None)

"""
# To download from a specific branch, use the revision parameter, as in this example:
# Note that `revision` requires AutoGPTQ 0.3.1 or later!

model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
        revision="gptq-4bit-32g-actorder_True",
        use_safetensors=True,
        trust_remote_code=False,
        device="cuda:0",
        quantize_config=None)
"""

prompt = "Tell me about AI"
prompt_template = f'''Info on prompt template will be added shortly.
'''

print("\n\n*** Generate:")

input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
print(tokenizer.decode(output[0]))

# Inference can also be done using transformers' pipeline

# Prevent printing spurious transformers error when using pipeline with AutoGPTQ
logging.set_verbosity(logging.CRITICAL)

print("*** Pipeline:")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.15
)

print(pipe(prompt_template)[0]['generated_text'])
Got this error:
Cell In[1], line 36
33 print("\n\n*** Generate:")
35 input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
---> 36 output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
37 print(tokenizer.decode(output[0]))
39 # Inference can also be done using transformers' pipeline
40
41 # Prevent printing spurious transformers error when using pipeline with AutoGPTQ
File /usr/local/lib/python3.10/dist-packages/auto_gptq/modeling/_base.py:443, in BaseGPTQForCausalLM.generate(self, **kwargs)
441 """shortcut for model.generate"""
442 with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type):
--> 443 return self.model.generate(**kwargs)
File ~/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File ~/.local/lib/python3.10/site-packages/transformers/generation/utils.py:1596, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
1579 return self.assisted_decoding(
1580 input_ids,
...
---> 54 query_states, key_states, value_states = torch.split(qkv_states, self.hidden_size, dim=2)
56 query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
57 key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
ValueError: not enough values to unpack (expected 3, got 2)
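For context, the failing torch.split can be reproduced in isolation. The shapes below are assumptions based on CodeLlama-34B's published config (64 query heads, 8 key/value heads, head_dim 128), not values taken from the traceback:

import torch

# Assumed CodeLlama-34B dimensions: 64 query heads x 128 = 8192 hidden size,
# but only 8 key/value heads under grouped-query attention (GQA).
hidden_size = 64 * 128
kv_size = 8 * 128

# Fused QKV projection output: Q is hidden_size wide, K and V are kv_size wide each.
qkv_states = torch.zeros(1, 4, hidden_size + 2 * kv_size)

# The fused attention code splits into hidden_size-wide chunks and expects three of them:
chunks = torch.split(qkv_states, hidden_size, dim=2)
print([c.shape[-1] for c in chunks])  # [8192, 2048] -> only two chunks

# Unpacking two chunks into three names raises
# "ValueError: not enough values to unpack (expected 3, got 2)":
# query_states, key_states, value_states = chunks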
Package versions:
auto-gptq: 0.4.1+cu117
torch: 2.0.1
bitsandbytes: 0.41.1
transformers: 4.32.0
Ah, please add inject_fused_attention=False to the from_quantized() call. CodeLlama 34B has the same new GQA (grouped-query attention) feature as Llama 2 70B. I'll update the README for future users.
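For reference, a minimal sketch of the loading call with that workaround applied, reusing the same arguments as the sample code above (only inject_fused_attention is new):

model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
        use_safetensors=True,
        trust_remote_code=False,
        device="cuda:0",
        use_triton=use_triton,
        inject_fused_attention=False,  # skip the fused attention kernel, which trips over GQA
        quantize_config=None)

The rest of the generate/pipeline code stays unchanged.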
Yep, that works. Thanks!