import gradio as gr
from ctransformers import AutoModelForCausalLM


def generate_text(model_url, model_file, prompt, tokens):
    # Gradio passes an empty textbox as "", so treat "" and None alike.
    if model_url == "flamingo_q4.bin":
        # Pre-loaded local model: load it directly from disk.
        model = AutoModelForCausalLM.from_pretrained(
            model_url, gpu_layers=50, model_type="llama"
        )
    elif model_file:
        # Hub repo with multiple .bin files: load the one the user specified.
        model = AutoModelForCausalLM.from_pretrained(
            model_url, model_file=model_file, gpu_layers=50, model_type="llama"
        )
    else:
        # Hub repo with a single .bin file: let ctransformers pick it.
        model = AutoModelForCausalLM.from_pretrained(
            model_url, gpu_layers=50, model_type="llama"
        )

    prompt = (
        "A conversation between a helpful bot and a human. "
        "### Human: " + prompt + " ### Assistant: "
    )
    # stream=True yields tokens one at a time; because iface.queue() is
    # enabled below, yielding here streams partial output to the UI.
    generated_text = model(prompt, max_new_tokens=tokens, stream=True)
    output = ""
    for token in generated_text:
        output += token
        yield output


iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=1, label="Model URL"),
        gr.Textbox(lines=1, label="Model File"),
        gr.Textbox(label="Prompt"),
        gr.Slider(1, 2048, value=50, step=1, label="Max New Tokens"),
    ],
    outputs="text",
    title="GGML Chat",
    description=(
        "Enter a Hugging Face repository URL for a LLaMA-based GGML model and a "
        "prompt. To use the pre-loaded model instead, enter 'flamingo_q4.bin' in "
        "the Model URL field and leave the Model File field blank. If the GGML "
        "model page you're using has multiple .bin files, specify the one you "
        "want in the Model File field. Once downloaded, a model is cached for "
        "the rest of the session, so you won't need to download it again."
    ),
)
iface.queue()
iface.launch()
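
# Note on caching: huggingface_hub caches downloaded weights on disk, so
# re-entering the same repo skips the download but still re-initializes the
# model on every call. A minimal sketch of an explicit in-memory cache (an
# assumption, not part of the original app; get_model and _model_cache are
# hypothetical names, not ctransformers API) would look like:
#
#   _model_cache = {}
#
#   def get_model(model_url, model_file):
#       key = (model_url, model_file or None)
#       if key not in _model_cache:
#           _model_cache[key] = AutoModelForCausalLM.from_pretrained(
#               model_url, model_file=model_file or None,
#               gpu_layers=50, model_type="llama",
#           )
#       return _model_cache[key]
#
# generate_text would then call get_model(model_url, model_file) instead of
# AutoModelForCausalLM.from_pretrained directly.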