Example code not working #1
by marksverdhei - opened
The example code you've provided crashes. First of all, it contains a syntax error which is easy to fix:
model_basename=model_basename is missing a trailing comma.
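For clarity, this is the same call with the trailing comma added (no other changes):

model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
        model_basename=model_basename,  # trailing comma added
        use_safetensors=True,
        trust_remote_code=True,
        device="cuda:0",
        use_triton=use_triton,
        quantize_config=None)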
However, with that fixed, I run into another issue:
TypeError Traceback (most recent call last)
Cell In[2], line 7
5 use_triton = False
6 tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
----> 7 model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
8 model_basename=model_basename,
9 use_safetensors=True,
10 trust_remote_code=True,
11 device="cuda:0",
12 use_triton=use_triton,
13 quantize_config=None)
14 """
15 To download from a specific branch, use the revision parameter, as in this example:
16 model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
(...)
22 quantize_config=None)
23 """
24 prompt = "Fortell meg om AI"
File ~/anaconda3/lib/python3.10/site-packages/auto_gptq/modeling/auto.py:108, in AutoGPTQForCausalLM.from_quantized(cls, model_name_or_path, device_map, max_memory, device, low_cpu_mem_usage, use_triton, inject_fused_attention, inject_fused_mlp, use_cuda_fp16, quantize_config, model_basename, use_safetensors, trust_remote_code, warmup_triton, trainable, disable_exllama, **kwargs)
102 # TODO: do we need this filtering of kwargs? @PanQiWei is there a reason we can't just pass all kwargs?
103 keywords = {
104 key: kwargs[key]
105 for key in list(signature(quant_func).parameters.keys()) + huggingface_kwargs
106 if key in kwargs
107 }
--> 108 return quant_func(
109 model_name_or_path=model_name_or_path,
110 device_map=device_map,
111 max_memory=max_memory,
112 device=device,
113 low_cpu_mem_usage=low_cpu_mem_usage,
114 use_triton=use_triton,
115 inject_fused_attention=inject_fused_attention,
116 inject_fused_mlp=inject_fused_mlp,
117 use_cuda_fp16=use_cuda_fp16,
118 quantize_config=quantize_config,
119 model_basename=model_basename,
120 use_safetensors=use_safetensors,
121 trust_remote_code=trust_remote_code,
122 warmup_triton=warmup_triton,
123 trainable=trainable,
124 disable_exllama=disable_exllama,
125 **keywords
126 )
File ~/anaconda3/lib/python3.10/site-packages/auto_gptq/modeling/_base.py:757, in BaseGPTQForCausalLM.from_quantized(cls, model_name_or_path, device_map, max_memory, device, low_cpu_mem_usage, use_triton, torch_dtype, inject_fused_attention, inject_fused_mlp, use_cuda_fp16, quantize_config, model_basename, use_safetensors, trust_remote_code, warmup_triton, trainable, disable_exllama, **kwargs)
754 raise TypeError(f"{config.model_type} isn't supported yet.")
756 if quantize_config is None:
--> 757 quantize_config = BaseQuantizeConfig.from_pretrained(model_name_or_path, **cached_file_kwargs, **kwargs)
759 if model_basename is None:
760 if quantize_config.model_file_base_name:
File ~/anaconda3/lib/python3.10/site-packages/auto_gptq/modeling/_base.py:93, in BaseQuantizeConfig.from_pretrained(cls, save_dir, **kwargs)
76 else: # Remote
77 resolved_config_file = cached_file(
78 save_dir,
79 quantize_config_filename,
(...)
90 _commit_hash=commit_hash,
91 )
---> 93 with open(resolved_config_file, "r", encoding="utf-8") as f:
94 return cls(**json.load(f))
TypeError: expected str, bytes or os.PathLike object, not NoneType
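From the traceback it looks like quantize_config.json cannot be resolved for the repository, so resolved_config_file ends up as None. As a possible workaround (assuming 4-bit weights with group size 128, which is what the basename gptq_model-4bit-128g suggests), passing an explicit BaseQuantizeConfig skips that lookup entirely:

from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

model_name_or_path = "RuterNorway/Llama-2-13b-chat-norwegian-GPTQ"
model_basename = "gptq_model-4bit-128g"

# Assumed settings, inferred from the file name; adjust if the repository
# documents different quantization parameters.
quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
        model_basename=model_basename,
        use_safetensors=True,
        trust_remote_code=True,
        device="cuda:0",
        use_triton=False,
        quantize_config=quantize_config)  # explicit config, no JSON lookup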
For reference, this is the code in the model card I'm referring to:
from transformers import AutoTokenizer, pipeline, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

model_name_or_path = "RuterNorway/Llama-2-13b-chat-norwegian-GPTQ"
model_basename = "gptq_model-4bit-128g"

use_triton = False

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
        model_basename=model_basename
        use_safetensors=True,
        trust_remote_code=True,
        device="cuda:0",
        use_triton=use_triton,
        quantize_config=None)

"""
To download from a specific branch, use the revision parameter, as in this example:

model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
        revision="gptq-4bit-32g-actorder_True",
        model_basename=model_basename,
        use_safetensors=True,
        trust_remote_code=True,
        device="cuda:0",
        quantize_config=None)
"""

prompt = "Fortell meg om AI"
prompt_template=f'''### Human: {prompt}
### Assistant:
'''

print("\n\n*** Generate:")

input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
print(tokenizer.decode(output[0]))

# Inference can also be done using transformers' pipeline
# Prevent printing spurious transformers error when using pipeline with AutoGPTQ
logging.set_verbosity(logging.CRITICAL)

print("*** Pipeline:")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.15
)

print(pipe(prompt_template)[0]['generated_text'])
Hello,
Thank you for bringing the issue to our attention. We've updated the quantize_config.json file as well as the example code, so the problem should now be resolved.
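One quick way to double-check that the file now resolves (assuming huggingface_hub is installed) is to fetch it directly:

from huggingface_hub import hf_hub_download

# Downloads quantize_config.json from the repository and prints its local
# path; this raises an error if the file is still missing.
path = hf_hub_download("RuterNorway/Llama-2-13b-chat-norwegian-GPTQ", "quantize_config.json")
print(path)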
If you continue to experience any issues, please don't hesitate to let us know.
Additionally, you can have a look at the ExLLaMA implementation via the Colab link in the model card.
marksverdhei changed discussion status to closed