# lemur-7B / app.py
import logging
import time

import torch
import gradio as gr

from utils.inference import load_tokenizer_and_model, decode, \
    get_prompt_with_history, is_stop_word_or_prefix
from utils.gradio import reset_textbox, cancel_outputing, transfer_input, \
    delete_last_conversation, reset_state, convert_to_markdown

# Model configuration: LoRA adapter "lemur-7B" on top of the "llama-7B" base.
BASE_MODEL = "llama-7B"
LORA_MODEL = "lemur-7B"

print("Loading model...")
start = time.time()
tokenizer, model, device = load_tokenizer_and_model(
    base_model=BASE_MODEL,
    adapter_model=LORA_MODEL,
    load_8bit=True,  # quantize to 8-bit so the 7B model fits in less GPU memory
)
print("Model loaded in {:.2f} seconds.".format(time.time() - start))
def predict(
    text,
    chatbot,
    history,
    top_p,
    temperature,
    max_length_tokens,
    max_context_length_tokens,
):
    if text == "":
        yield chatbot, history, "Empty context."
        return

    # Build the prompt from the new message plus as much history as fits.
    inputs = get_prompt_with_history(
        text, history, tokenizer, max_length=max_context_length_tokens
    )
    if inputs is None:
        yield chatbot, history, "Input too long."
        return
    else:
        prompt, inputs = inputs

    input_ids = inputs["input_ids"][:, -max_context_length_tokens:].to(device)
    torch.cuda.empty_cache()

    with torch.no_grad():
        for x in decode(
            input_ids,
            model,
            tokenizer,
            stop_words=["[Human]", "[AI]"],
            max_length=max_length_tokens,
            temperature=temperature,
            top_p=top_p,
        ):
            # Skip partial outputs that might still grow into a stop word;
            # otherwise trim everything from the first speaker tag onward so
            # the chatbot never shows the next turn's "[Human]"/"[AI]" marker.
            if is_stop_word_or_prefix(x, ["[Human]", "[AI]"]) is False:
                if "[Human]" in x:
                    x = x[: x.index("[Human]")].strip()
                if "[AI]" in x:
                    x = x[: x.index("[AI]")].strip()
                x = x.strip(" ")
                a, b = (
                    [[y[0], convert_to_markdown(y[1])] for y in history]
                    + [[text, convert_to_markdown(x)]],
                    history + [[text, x]],
                )
                yield a, b, "Generating..."

    torch.cuda.empty_cache()
    print(prompt)
    print(x)
    print("=" * 80)
    try:
        yield a, b, "Generate: Success"
    except Exception:
        # a, b, and x are unbound if decoding yielded nothing; ignore.
        pass
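

# retry() regenerates the last reply: it drops the final exchange from the
# visible chat and the raw history, then re-runs predict() on that user
# message, forwarding its streamed updates unchanged.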
def retry(
    text,
    chatbot,
    history,
    top_p,
    temperature,
    max_length_tokens,
    max_context_length_tokens,
):
    logging.info("Retry...")
    if len(history) == 0:
        yield chatbot, history, "Empty context."
        return
    chatbot.pop()
    inputs = history.pop()[0]
    for x in predict(
        inputs,
        chatbot,
        history,
        top_p,
        temperature,
        max_length_tokens,
        max_context_length_tokens,
    ):
        yield x
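

# UI layout: chat window, input box, and action buttons on the left; sampling
# parameters (top-p, temperature, length limits) in a tab on the right.
# history and user_question are per-session gr.State values.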
with gr.Blocks(
    theme=gr.themes.Soft(),
    css=".disclaimer {font-variant-caps: all-small-caps;}",
) as demo:
    history = gr.State([])
    user_question = gr.State("")
    with gr.Row():
        gr.HTML("<h1>Lemur 🦥</h1>")
        status_display = gr.Markdown("Success", elem_id="status_display")
    with gr.Row(scale=1).style(equal_height=True):
        with gr.Column(scale=5):
            with gr.Row(scale=1):
                chatbot = gr.Chatbot(elem_id="chuanhu_chatbot").style(height=666)
            with gr.Row(scale=1):
                with gr.Column(scale=12):
                    user_input = gr.Textbox(
                        show_label=False, placeholder="Enter text"
                    ).style(container=False)
                with gr.Column(min_width=70, scale=1):
                    submitBtn = gr.Button("📤 Send")
                with gr.Column(min_width=70, scale=1):
                    cancelBtn = gr.Button("⏸️ Stop")
            with gr.Row(scale=1):
                emptyBtn = gr.Button(
                    "🧹 New Conversation",
                )
                retryBtn = gr.Button("🔄 Regenerate")
                delLastBtn = gr.Button("🗑️ Remove Last Turn")
        with gr.Column():
            with gr.Column(min_width=50, scale=1):
                with gr.Tab(label="Parameter Setting"):
                    gr.Markdown("# Parameters")
                    top_p = gr.Slider(
                        minimum=0,
                        maximum=1.0,
                        value=0.95,
                        step=0.05,
                        interactive=True,
                        label="Top-p",
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=2.0,
                        value=1,
                        step=0.1,
                        interactive=True,
                        label="Temperature",
                    )
                    max_length_tokens = gr.Slider(
                        minimum=0,
                        maximum=512,
                        value=512,
                        step=8,
                        interactive=True,
                        label="Max Generation Tokens",
                    )
                    max_context_length_tokens = gr.Slider(
                        minimum=0,
                        maximum=4096,
                        value=2048,
                        step=128,
                        interactive=True,
                        label="Max History Tokens",
                    )
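
    # Event wiring. The shared argument dicts below let the textbox submit
    # and the Send button trigger the same transfer-then-predict pipeline.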
    predict_args = dict(
        fn=predict,
        inputs=[
            user_question,
            chatbot,
            history,
            top_p,
            temperature,
            max_length_tokens,
            max_context_length_tokens,
        ],
        outputs=[chatbot, history, status_display],
        show_progress=True,
    )
    retry_args = dict(
        fn=retry,
        inputs=[
            user_input,
            chatbot,
            history,
            top_p,
            temperature,
            max_length_tokens,
            max_context_length_tokens,
        ],
        outputs=[chatbot, history, status_display],
        show_progress=True,
    )
    reset_args = dict(fn=reset_textbox, inputs=[], outputs=[user_input, status_display])

    # Chatbot
    transfer_input_args = dict(
        fn=transfer_input,
        inputs=[user_input],
        outputs=[user_question, user_input, submitBtn, cancelBtn],
        show_progress=True,
    )
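
    # Submitting runs in two steps: transfer_input copies the textbox into
    # the user_question state and clears the input, then predict streams the
    # reply. The returned event handles let the Stop button cancel an
    # in-flight generation via cancels=[...].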
    submit_event = user_input.submit(**transfer_input_args).then(**predict_args)
    submit_click_event = submitBtn.click(**transfer_input_args).then(**predict_args)

    emptyBtn.click(
        reset_state,
        outputs=[chatbot, history, status_display],
        show_progress=True,
    )
    emptyBtn.click(**reset_args)

    retry_click_event = retryBtn.click(**retry_args)

    cancelBtn.click(
        fn=cancel_outputing,
        inputs=[],
        outputs=[status_display],
        cancels=[submit_event, submit_click_event],
    )
    delLastBtn.click(
        delete_last_conversation,
        [chatbot, history],
        [chatbot, history, status_display],
        show_progress=True,
    )
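
# queue() enables the request queue that generator callbacks like predict()
# require for streaming; max_size caps waiting requests and concurrency_count
# (a Gradio 3.x queue option) bounds how many run at once.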
demo.title = "Lemur"
demo.queue(max_size=128, concurrency_count=2)
demo.launch()