The code from the model card throws errors when executed on Google Colab:
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.utils.patching import *
from hqq.core.quantize import *
from hqq.utils.generation_hf import HFGenerator
#Load the model
###################################################
#model_id = 'mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq' #no calib version
model_id = 'mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq_calib' #calibrated version
compute_dtype = torch.float16 #bfloat16 for torchao, float16 for bitblas
cache_dir = '.'
model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
quant_config = BaseQuantizeConfig(nbits=4, group_size=64, quant_scale=False, quant_zero=False, axis=1)
patch_linearlayers(model, patch_add_quant_config, quant_config)
#Use optimized inference kernels
###################################################
HQQLinear.set_backend(HQQBackend.PYTORCH)
#prepare_for_inference(model) #default backend
#prepare_for_inference(model, backend="torchao_int4")
prepare_for_inference(model, backend="bitblas") #takes a while to init...
WARNING:bitblas.utils.target_detector:TVM target not found. Please set the TVM target environment variable using `export TVM_TARGET=<target>`, where <target> is one of the available targets that can be found in the output of tools/get_available_targets.py.
Warning: failed to import the Marlin backend. Check if marlin is correctly installed if you want to use the Marlin backend (https://github.com/IST-DASLab/marlin).
Warning: failed to import the BitBlas backend. Check if BitBlas is correctly installed if you want to use the bitblas backend (https://github.com/microsoft/BitBLAS).
Fetching 7 files: 100% 7/7 [08:45<00:00, 207.15s/it]
.gitattributes: 100% 1.60k/1.60k [00:00<00:00, 23.6kB/s]
README.md: 100% 4.68k/4.68k [00:00<00:00, 54.7kB/s]
config.json: 100% 915/915 [00:00<00:00, 20.5kB/s]
special_tokens_map.json: 100% 296/296 [00:00<00:00, 8.46kB/s]
tokenizer_config.json: 100% 55.3k/55.3k [00:00<00:00, 2.10MB/s]
tokenizer.json: 100% 9.09M/9.09M [00:00<00:00, 36.7MB/s]
qmodel.pt: 100% 6.03G/6.03G [08:44<00:00, 16.2MB/s]
ValueError Traceback (most recent call last)
in <cell line: 15>()
13 compute_dtype = torch.float16 #bfloat16 for torchao, float16 for bitblas
14 cache_dir = '.'
---> 15 model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
16 tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
17
5 frames
/usr/local/lib/python3.10/dist-packages/transformers/models/llama/configuration_llama.py in _rope_scaling_validation(self)
180
181 if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
--> 182 raise ValueError(
183 "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}"
184 )
ValueError: `rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}
What GPU are you using? You need at least a 3090 with CUDA 12.1.
Regarding the `rope_scaling` error, you need to use the latest `transformers` version. Also make sure you use the latest PyTorch version.
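For example, in a fresh Colab runtime you could upgrade both packages and then verify the environment before re-running the snippet. This is a minimal sketch, not part of the model card; 4.43 is my assumption for the first transformers release that accepts rope_type='llama3':

!pip install -U transformers torch

# Restart the Colab runtime after the upgrade, then check the environment
import torch, transformers
print(transformers.__version__)               # should be >= 4.43 so rope_type='llama3' is accepted
print(torch.__version__, torch.version.cuda)  # CUDA 12.1 or newer
print(torch.cuda.get_device_name(0))          # e.g. an RTX 3090 or better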