fblgit's picture
Update app.py
7678793
raw
history blame contribute delete
No virus
1.64 kB
import os
import gradio as gr
import copy
import time
import llama_cpp
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# Which GGUF file to serve; both values can be overridden through the
# REPO_ID / MODEL_FILE environment variables.
_repo_id = os.environ.get("REPO_ID", "TheBloke/una-cybertron-7B-v2-GGUF")
_model_file = os.environ.get("MODEL_FILE", "una-cybertron-7b-v2-bf16.Q5_K_S.gguf")

# Download the weights from the Hugging Face Hub (or reuse the local cache)
# and get back the path of the file on disk.
_model_path = hf_hub_download(repo_id=_repo_id, filename=_model_file)

# Single shared model instance used by every chat request.
# n_gpu_layers=0 offloads no layers to a GPU.
llm = Llama(model_path=_model_path, n_ctx=2048, n_gpu_layers=0)
# System message placed at the very top of every prompt.
system_prompt = 'You are a helpful assistant.'


def make_prompt(history, msg):
    """Render the conversation so far plus the new user message as one prompt.

    Args:
        history: Iterable of past turns. Each turn is indexable, with the user
            text at index 0 and the assistant reply at index 1. A falsy reply
            (``None`` or ``""``) is left out of the prompt.
        msg: The new user message the model should answer.

    Returns:
        The prompt string, ending with an open assistant header so the model
        continues with its reply.
    """
    segments = [f"<|im_start|>system\n{system_prompt}</s>"]
    for turn in history:
        user_text, reply = turn[0], turn[1]
        segments.append(f"\n<|user|>\n{user_text}</s>")
        # Earlier assistant replies are appended without a closing "</s>".
        if reply:
            segments.append(f"\n<|assistant|>\n{reply}")
    segments.append(f"\n<|user|>\n{msg}</s>")
    segments.append("\n<|assistant|>\n")
    return "".join(segments)
def generate_text(message, history):
    """Stream the model's reply to ``message`` for the chat UI.

    Args:
        message: The new user message.
        history: Previous turns, passed to ``make_prompt`` to build the
            prompt. It is only read; this function never mutates it.

    Yields:
        The reply text accumulated so far, growing with each streamed chunk.
    """
    input_prompt = make_prompt(history, message)
    # Log the exact prompt sent to the model to make debugging easier.
    print(input_prompt)
    output = llm(
        input_prompt,
        temperature=0.8,
        top_p=0.95,
        top_k=40,
        repeat_penalty=1.1,
        max_tokens=2048,
        # NOTE(review): none of these stop strings occur in the markers that
        # make_prompt emits ("</s>", "<|user|>", "<|assistant|>") -- confirm
        # this is the intended set for the model being served.
        stop=[
            "<|prompter|>",
            "<|endoftext|>",
            "<|endoftext|> \n",
            "ASSISTANT:",
            "USER:",
            "SYSTEM:",
        ],
        stream=True,
    )
    reply = ""
    for chunk in output:
        # Each chunk is only read, so no defensive copy is needed.
        reply += chunk["choices"][0]["text"]
        yield reply
# Chat UI wired to the streaming generator above.
demo = gr.ChatInterface(
    generate_text,
    title="Cybertron 7B v2 CPU",
    concurrency_limit=5,
    # No retry button; the undo and clear buttons get custom labels.
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
)
# Cap the request queue at 10 entries, then start serving.
demo.queue(max_size=10).launch()