# Quick-start script: load the OpenAssistant Guanaco dataset, load a 4-bit
# quantized Llama-2-7B checkpoint, and run one short generation as a smoke test.
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Record whether a CUDA device is visible; the script moves inputs to
# "cuda:0" below, so the answer is checked before the expensive model load.
cuda_available = torch.cuda.is_available()
print("executed successfully")

# Training split of the instruction dataset.
dataset_name = "timdettmers/openassistant-guanaco"
dataset = load_dataset(dataset_name, split="train")

# Quantization configuration: 4-bit NF4 weights with float16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Fail fast with a clear message instead of downloading the checkpoint and
# then crashing when tensors are moved to the hard-coded CUDA device.
if not cuda_available:
    raise RuntimeError(
        "CUDA is not available, but this script moves its inputs to 'cuda:0'."
    )

# Download the model.
# NOTE(security): trust_remote_code=True allows Python code shipped in the
# model repository to run locally. Keep it only if this repository is trusted.
model_name = "TinyPixel/Llama-2-7B-bf16-sharded"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
)
# NOTE(review): the generation cache is switched off here, presumably in
# preparation for fine-tuning later on -- confirm; it is not needed for the
# generation call below.
model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Smoke test: generate up to 50 new tokens for a single prompt.
text = "What is a large language model?"
device = "cuda:0"

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))