import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# Base model and LoRA adapter to load
model_name = "unsloth/llama-3-8b-bnb-4bit"  # Replace with your base model
adapter_model_name = "DanielWong76/lora_model1"  # Replace with your LoRA adapter

# Load the base model and tokenizer, then attach the LoRA adapter
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, adapter_model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define the Alpaca-style prompt
alpaca_prompt = """### Instruction:
{instruction}

### Input:
{input}

### Output:
{output}
"""

# Define the function to generate text
def generate_response(instruction, input_text):
    # Format the prompt with the instruction and input text
    prompt = alpaca_prompt.format(
        instruction=instruction,
        input=input_text,
        output="",  # Leave output blank for generation
    )

    # Tokenize the prompt and move it to the GPU
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    # Generate the response from the model
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

    # Decode the output into human-readable text
    generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Keep only the text that follows the "### Output:" marker
    output_start = generated_text.find("### Output:")
    if output_start != -1:
        generated_text = generated_text[output_start + len("### Output:"):].strip()

    return generated_text


# Gradio Interface with two inputs: Instruction and Input Text
iface = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter the instruction here..."),
        gr.Textbox(lines=5, placeholder="Enter the input text here..."),
    ],
    outputs="text",
    title="Alpaca-Style Instruction-Input-Output Model",
)

# Launch the Gradio app
iface.launch()
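
# --- Optional: quick sanity check without the UI (sketch) ---
# Before (or instead of) launching Gradio, generate_response() can be called
# directly to confirm the adapter loaded and the prompt formatting works.
# The instruction/input strings below are illustrative placeholders, not part
# of the original app; paste this snippet above iface.launch() to run it.
#
# print(generate_response(
#     "Summarize the following text in one sentence.",
#     "Gradio wraps a Python function in a shareable web interface.",
# ))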