The code from the model card throws errors when executed on Google Colab:
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.utils.patching import *
from hqq.core.quantize import *
from hqq.utils.generation_hf import HFGenerator
#Load the model
###################################################
#model_id = 'mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq' #no calib version
model_id = 'mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq_calib' #calibrated version
compute_dtype = torch.float16 #bfloat16 for torchao, float16 for bitblas
cache_dir = '.'
model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
quant_config = BaseQuantizeConfig(nbits=4, group_size=64, quant_scale=False, quant_zero=False, axis=1)
patch_linearlayers(model, patch_add_quant_config, quant_config)
#Use optimized inference kernels
###################################################
HQQLinear.set_backend(HQQBackend.PYTORCH)
#prepare_for_inference(model) #default backend
#prepare_for_inference(model, backend="torchao_int4")
prepare_for_inference(model, backend="bitblas") #takes a while to init...
WARNING:bitblas.utils.target_detector:TVM target not found. Please set the TVM target environment variable using `export TVM_TARGET=<target>`, where <target> is one of the available targets that can be found in the output of tools/get_available_targets.py.
Warning: failed to import the Marlin backend. Check if marlin is correctly installed if you want to use the Marlin backend (https://github.com/IST-DASLab/marlin).
Warning: failed to import the BitBlas backend. Check if BitBlas is correctly installed if you want to use the bitblas backend (https://github.com/microsoft/BitBLAS).
Fetching 7 files: 100% 7/7 [08:45<00:00, 207.15s/it]
.gitattributes: 100% 1.60k/1.60k [00:00<00:00, 23.6kB/s]
README.md: 100% 4.68k/4.68k [00:00<00:00, 54.7kB/s]
config.json: 100% 915/915 [00:00<00:00, 20.5kB/s]
special_tokens_map.json: 100% 296/296 [00:00<00:00, 8.46kB/s]
tokenizer_config.json: 100% 55.3k/55.3k [00:00<00:00, 2.10MB/s]
tokenizer.json: 100% 9.09M/9.09M [00:00<00:00, 36.7MB/s]
qmodel.pt: 100% 6.03G/6.03G [08:44<00:00, 16.2MB/s]
ValueError Traceback (most recent call last)
in <cell line: 15>()
13 compute_dtype = torch.float16 #bfloat16 for torchao, float16 for bitblas
14 cache_dir = '.'
---> 15 model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
16 tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
17
5 frames
/usr/local/lib/python3.10/dist-packages/transformers/models/llama/configuration_llama.py in _rope_scaling_validation(self)
180
181 if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
--> 182 raise ValueError(
183 "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}"
184 )
ValueError: `rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}
What GPU are you using? You need at least a 3090 with CUDA 12.1.
Regarding the `rope_scaling` error, you need to use the latest `transformers` version. Also make sure you use the latest PyTorch version.
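For example, in a fresh Colab runtime you could upgrade both packages and then verify the environment before re-running the snippet. This is a minimal sketch, not part of the model card; 4.43 is my assumption for the first transformers release that accepts rope_type='llama3':

!pip install -U transformers torch

# Restart the Colab runtime after the upgrade, then check the environment
import torch, transformers
print(transformers.__version__)               # should be >= 4.43 so rope_type='llama3' is accepted
print(torch.__version__, torch.version.cuda)  # CUDA 12.1 or newer
print(torch.cuda.get_device_name(0))          # e.g. an RTX 3090 or better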