import gradio as gr
from ctransformers import AutoModelForCausalLM


def generate_text(model_url, model_file, prompt, tokens):
    # Gradio passes an empty textbox as "", so treat "" and None alike.
    if model_url == "flamingo_q4.bin":
        # Pre-loaded local model: load it directly from disk.
        model = AutoModelForCausalLM.from_pretrained(
            model_url, gpu_layers=50, model_type="llama"
        )
    elif model_file:
        # Hub repo with multiple .bin files: load the one the user specified.
        model = AutoModelForCausalLM.from_pretrained(
            model_url, model_file=model_file, gpu_layers=50, model_type="llama"
        )
    else:
        # Hub repo with a single .bin file: let ctransformers pick it.
        model = AutoModelForCausalLM.from_pretrained(
            model_url, gpu_layers=50, model_type="llama"
        )

    prompt = (
        "A conversation between a helpful bot and a human. "
        "### Human: " + prompt + " ### Assistant: "
    )
    # stream=True yields tokens one at a time; because iface.queue() is
    # enabled below, yielding here streams partial output to the UI.
    generated_text = model(prompt, max_new_tokens=tokens, stream=True)
    output = ""
    for token in generated_text:
        output += token
        yield output


iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=1, label="Model URL"),
        gr.Textbox(lines=1, label="Model File"),
        gr.Textbox(label="Prompt"),
        gr.Slider(1, 2048, value=50, step=1, label="Max New Tokens"),
    ],
    outputs="text",
    title="GGML Chat",
    description=(
        "Enter a Hugging Face repository URL for a LLaMA-based GGML model and a "
        "prompt. To use the pre-loaded model instead, enter 'flamingo_q4.bin' in "
        "the Model URL field and leave the Model File field blank. If the GGML "
        "model page you're using has multiple .bin files, specify the one you "
        "want in the Model File field. Once downloaded, a model is cached for "
        "the rest of the session, so you won't need to download it again."
    ),
)
iface.queue()
iface.launch()
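
# Note on caching: huggingface_hub caches downloaded weights on disk, so
# re-entering the same repo skips the download but still re-initializes the
# model on every call. A minimal sketch of an explicit in-memory cache (an
# assumption, not part of the original app; get_model and _model_cache are
# hypothetical names, not ctransformers API) would look like:
#
#   _model_cache = {}
#
#   def get_model(model_url, model_file):
#       key = (model_url, model_file or None)
#       if key not in _model_cache:
#           _model_cache[key] = AutoModelForCausalLM.from_pretrained(
#               model_url, model_file=model_file or None,
#               gpu_layers=50, model_type="llama",
#           )
#       return _model_cache[key]
#
# generate_text would then call get_model(model_url, model_file) instead of
# AutoModelForCausalLM.from_pretrained directly.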