import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# Base model and LoRA adapter to load
model_name = "unsloth/llama-3-8b-bnb-4bit"  # Replace with your base model
adapter_model_name = "DanielWong76/lora_model1"  # Replace with your LoRA adapter

# Load the base model and tokenizer, then attach the LoRA adapter
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, adapter_model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define the Alpaca-style prompt
alpaca_prompt = """### Instruction:
{instruction}

### Input:
{input}

### Output:
{output}
"""

# Define the function to generate text
def generate_response(instruction, input_text):
    # Format the prompt with the instruction and input text
    prompt = alpaca_prompt.format(
        instruction=instruction,
        input=input_text,
        output="",  # Leave output blank for generation
    )

    # Tokenize the prompt and move it to the GPU
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    # Generate the response from the model
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

    # Decode the output into human-readable text
    generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Keep only the text that follows the "### Output:" marker
    output_start = generated_text.find("### Output:")
    if output_start != -1:
        generated_text = generated_text[output_start + len("### Output:"):].strip()

    return generated_text


# Gradio Interface with two inputs: Instruction and Input Text
iface = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter the instruction here..."),
        gr.Textbox(lines=5, placeholder="Enter the input text here..."),
    ],
    outputs="text",
    title="Alpaca-Style Instruction-Input-Output Model",
)

# Launch the Gradio app
iface.launch()
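
# --- Optional: quick sanity check without the UI (sketch) ---
# Before (or instead of) launching Gradio, generate_response() can be called
# directly to confirm the adapter loaded and the prompt formatting works.
# The instruction/input strings below are illustrative placeholders, not part
# of the original app; paste this snippet above iface.launch() to run it.
#
# print(generate_response(
#     "Summarize the following text in one sentence.",
#     "Gradio wraps a Python function in a shareable web interface.",
# ))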