The model is extremely slow in 4-bit, is my code for loading ok?
#7 by zokica - opened
I load it on an RTX 3090 in 4-bit and get about 1 token per second, which on this GPU is slower than running on a CPU. For comparison, llama 7b runs at around 30 tokens/second on the same GPU, and I would expect roughly 10 tokens/second here.
I'm only showing the loading code:
if 1 == 1:
    import time
    timea = time.time()

    import torch
    import transformers
    from transformers import BitsAndBytesConfig
    from transformers import AutoTokenizer

    # Tokenizer is loaded from the base mpt-30b repo, model weights from the chat repo
    tokenizer = AutoTokenizer.from_pretrained('mosaicml/mpt-30b')
    base_model = "mosaicml/mpt-30b-chat"

    config = transformers.AutoConfig.from_pretrained(base_model, trust_remote_code=True)
    config.max_seq_len = 16384  # (input + output) tokens can now be up to 16384

    # 4-bit NF4 quantization with double quantization, bfloat16 compute dtype
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    # print("device_map", device_map)
    model = transformers.AutoModelForCausalLM.from_pretrained(
        base_model,
        config=config,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )
    model.eval()

    print("load time", -timea + time.time())