upload?
- app.py +250 -0
- lemur-7B/config.json +23 -0
- lemur-7B/generation_config.json +7 -0
- lemur-7B/pytorch_model.bin +3 -0
- lemur-7B/special_tokens_map.json +24 -0
- lemur-7B/tokenizer.json +0 -0
- lemur-7B/tokenizer.model +3 -0
- lemur-7B/tokenizer_config.json +31 -0
- utils/gradio.py +71 -0
- utils/inference.py +107 -0
- variables.py +5 -0
app.py
ADDED
@@ -0,0 +1,250 @@
import os
import re


import torch
import gradio as gr
import logging

from utils.inference import load_tokenizer_and_model, decode, \
    get_prompt_with_history, is_stop_word_or_prefix

from utils.gradio import reset_textbox, cancel_outputing, transfer_input, \
    delete_last_conversation, reset_state, convert_to_markdown


# set variables
model = "lemur-7B"

print("Loading model...")

import time

start = time.time()

tokenizer, model, device = load_tokenizer_and_model(model, load_8bit=True)

print("Model loaded in {} seconds.".format(time.time() - start))


def predict(
    text,
    chatbot,
    history,
    top_p,
    temperature,
    max_length_tokens,
    max_context_length_tokens,
):
    if text == "":
        yield chatbot, history, "Empty context."
        return

    inputs = get_prompt_with_history(
        text, history, tokenizer, max_length=max_context_length_tokens
    )
    if inputs is None:
        yield chatbot, history, "Input too long."
        return
    else:
        prompt, inputs = inputs

    input_ids = inputs["input_ids"][:, -max_context_length_tokens:].to(device)
    torch.cuda.empty_cache()

    with torch.no_grad():
        for x in decode(
            input_ids,
            model,
            tokenizer,
            stop_words=["[Human]", "[AI]"],
            max_length=max_length_tokens,
            temperature=temperature,
            top_p=top_p,
        ):
            if is_stop_word_or_prefix(x, ["[Human]", "[AI]"]) is False:
                if "[Human]" in x:
                    x = x[: x.index("[Human]")].strip()
                if "[AI]" in x:
                    x = x[: x.index("[AI]")].strip()
                x = x.strip(" ")
                a, b = [[y[0], convert_to_markdown(y[1])] for y in history] + [
                    [text, convert_to_markdown(x)]
                ], history + [[text, x]]
                yield a, b, "Generating..."

    torch.cuda.empty_cache()
    print(prompt)
    print(x)
    print("=" * 80)
    try:
        yield a, b, "Generate: Success"
    except:
        pass


def retry(
    text,
    chatbot,
    history,
    top_p,
    temperature,
    max_length_tokens,
    max_context_length_tokens,
):
    logging.info("Retry...")
    if len(history) == 0:
        yield chatbot, history, "Empty context."
        return
    chatbot.pop()
    inputs = history.pop()[0]
    for x in predict(
        inputs,
        chatbot,
        history,
        top_p,
        temperature,
        max_length_tokens,
        max_context_length_tokens,
    ):
        yield x


with gr.Blocks(
    theme=gr.themes.Soft(),
    css=".disclaimer {font-variant-caps: all-small-caps;}"
) as demo:
    history = gr.State([])
    user_question = gr.State("")
    with gr.Row():
        gr.HTML("<h1>Lemur 🦥</h1>")
        status_display = gr.Markdown("Success", elem_id="status_display")

    with gr.Row(scale=1).style(equal_height=True):
        with gr.Column(scale=5):
            with gr.Row(scale=1):
                chatbot = gr.Chatbot(elem_id="chuanhu_chatbot").style(height=800)
            with gr.Row(scale=1):
                with gr.Column(scale=12):
                    user_input = gr.Textbox(
                        show_label=False, placeholder="Enter text"
                    ).style(container=False)
                with gr.Column(min_width=70, scale=1):
                    submitBtn = gr.Button("📤 Send")
                with gr.Column(min_width=70, scale=1):
                    cancelBtn = gr.Button("⏸️ Stop")

            with gr.Row(scale=1):
                emptyBtn = gr.Button(
                    "🧹 New Conversation",
                )
                retryBtn = gr.Button("🔄 Regenerate")
                delLastBtn = gr.Button("🗑️ Remove Last Turn")
        with gr.Column():
            with gr.Column(min_width=50, scale=1):
                with gr.Tab(label="Parameter Setting"):
                    gr.Markdown("# Parameters")
                    top_p = gr.Slider(
                        minimum=-0,
                        maximum=1.0,
                        value=0.95,
                        step=0.05,
                        interactive=True,
                        label="Top-p",
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=2.0,
                        value=1,
                        step=0.1,
                        interactive=True,
                        label="Temperature",
                    )
                    max_length_tokens = gr.Slider(
                        minimum=0,
                        maximum=512,
                        value=512,
                        step=8,
                        interactive=True,
                        label="Max Generation Tokens",
                    )
                    max_context_length_tokens = gr.Slider(
                        minimum=0,
                        maximum=4096,
                        value=2048,
                        step=128,
                        interactive=True,
                        label="Max History Tokens",
                    )

    predict_args = dict(
        fn=predict,
        inputs=[
            user_question,
            chatbot,
            history,
            top_p,
            temperature,
            max_length_tokens,
            max_context_length_tokens,
        ],
        outputs=[chatbot, history, status_display],
        show_progress=True,
    )
    retry_args = dict(
        fn=retry,
        inputs=[
            user_input,
            chatbot,
            history,
            top_p,
            temperature,
            max_length_tokens,
            max_context_length_tokens,
        ],
        outputs=[chatbot, history, status_display],
        show_progress=True,
    )

    reset_args = dict(fn=reset_textbox, inputs=[], outputs=[user_input, status_display])

    # Chatbot

    transfer_input_args = dict(
        fn=transfer_input,
        inputs=[user_input],
        outputs=[user_question, user_input, submitBtn, cancelBtn],
        show_progress=True,
    )

    submit_event = user_input.submit(**transfer_input_args).then(**predict_args)

    submit_click_event = submitBtn.click(**transfer_input_args).then(**predict_args)

    emptyBtn.click(
        reset_state,
        outputs=[chatbot, history, status_display],
        show_progress=True,
    )
    emptyBtn.click(**reset_args)

    retry_click_event = retryBtn.click(**retry_args)

    cancelBtn.click(
        fn=cancel_outputing,
        inputs=[],
        outputs=[status_display],
        cancels=[submit_event, submit_click_event]
    )

    delLastBtn.click(
        delete_last_conversation,
        [chatbot, history],
        [chatbot, history, status_display],
        show_progress=True,
    )

demo.title = "Lemur"
demo.queue(max_size=128, concurrency_count=2)
demo.launch()
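Editor's note (not part of the diff): a minimal, self-contained sketch of the streaming-and-cancel pattern app.py builds on. Once demo.queue() is enabled, a generator event handler re-renders the Chatbot on every yield, and cancels= lets a second button interrupt it, which is the same mechanism cancelBtn uses above. The handler and widget names below are invented for illustration; it assumes Gradio 3.x, consistent with the .style() calls in the diff.

import time
import gradio as gr

def stream_reply(message, chat):
    # Yield the full chat value each time; Gradio re-renders the Chatbot per yield.
    reply = ""
    for word in ("this", "is", "a", "streamed", "reply"):
        reply += word + " "
        time.sleep(0.2)
        yield chat + [[message, reply]]

with gr.Blocks() as demo:
    chat = gr.Chatbot()
    box = gr.Textbox()
    stop = gr.Button("Stop")
    submit_event = box.submit(stream_reply, [box, chat], [chat])
    # fn=None click whose only job is to cancel the running streaming event.
    stop.click(None, None, None, cancels=[submit_event])

demo.queue()
# demo.launch()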
lemur-7B/config.json
ADDED
@@ -0,0 +1,23 @@
{
  "_name_or_path": "llama-7B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "pad_token_id": 0,
  "rms_norm_eps": 1e-06,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.30.1",
  "use_cache": true,
  "vocab_size": 32000
}
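Editor's note (not part of the diff): the config above is a stock LLaMA-7B layout, so it should load through AutoConfig. A quick hedged check, assuming transformers 4.30 as pinned in the file and the local "lemur-7B" folder added in this commit:

from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("lemur-7B")  # local folder, not a Hub model id
print(cfg.model_type, cfg.hidden_size, cfg.num_hidden_layers)  # llama 4096 32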
lemur-7B/generation_config.json
ADDED
@@ -0,0 +1,7 @@
{
  "_from_model_config": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 0,
  "transformers_version": "4.30.1"
}
lemur-7B/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8800f80fe257ad94942049beaa2dc86703571a8696bcaf0f03f57c021a2ec6ec
size 524332500
lemur-7B/special_tokens_map.json
ADDED
@@ -0,0 +1,24 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "</s>",
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
lemur-7B/tokenizer.json
ADDED
The diff for this file is too large to render.
lemur-7B/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
size 499723
lemur-7B/tokenizer_config.json
ADDED
@@ -0,0 +1,31 @@
{
  "bos_token": {
    "__type": "AddedToken",
    "content": "<s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "clean_up_tokenization_spaces": false,
  "eos_token": {
    "__type": "AddedToken",
    "content": "</s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": null,
  "sp_model_kwargs": {},
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": {
    "__type": "AddedToken",
    "content": "<unk>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
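Editor's note (not part of the diff): a hedged sanity check that the tokenizer files above load as a LLaMA tokenizer, assuming transformers 4.30 and the local folder added in this commit; the LLaMA tokenizer prepends the BOS token by default, so the ids should start with bos_token_id 1.

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("lemur-7B")  # local folder added in this commit
print(tok("Hello, lemur!")["input_ids"])  # token ids, starting with bos_token_id 1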
utils/gradio.py
ADDED
@@ -0,0 +1,71 @@
import gradio as gr
from utils.inference import shared_state
import re


def convert_to_markdown(text):
    text = text.replace("$", "&#36;")

    def replace_leading_tabs_and_spaces(line):
        new_line = []

        for char in line:
            if char == "\t":
                new_line.append("&#9;")
            elif char == " ":
                new_line.append("&nbsp;")
            else:
                break
        return "".join(new_line) + line[len(new_line):]

    markdown_text = ""
    lines = text.split("\n")
    in_code_block = False

    for line in lines:
        if in_code_block is False and line.startswith("```"):
            in_code_block = True
            markdown_text += "```\n"
        elif in_code_block is True and line.startswith("```"):
            in_code_block = False
            markdown_text += "```\n"
        elif in_code_block:
            markdown_text += f"{line}\n"
        else:
            line = replace_leading_tabs_and_spaces(line)
            line = re.sub(r"^(#)", r"\\\1", line)
            markdown_text += f"{line}  \n"

    return markdown_text


def reset_textbox():
    return gr.update(value=""), ""


def cancel_outputing():
    shared_state.interrupt()
    textbox = reset_textbox()
    return "Stop Done"


def reset_state():
    return [], [], "Reset Done"


def transfer_input(inputs):
    textbox = reset_textbox()
    return (
        inputs,
        gr.update(value=""),
        gr.Button.update(visible=True),
        gr.Button.update(visible=True)
    )


def delete_last_conversation(chatbot, history):
    if len(chatbot) > 0:
        chatbot.pop()

    if len(history) > 0:
        history.pop()

    return (
        chatbot,
        history,
        "Delete Done",
    )
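Editor's note (not part of the diff): a hypothetical usage example of convert_to_markdown, assuming the repo layout above is importable and the HTML-entity escapes shown in the reconstruction. It illustrates what the escaping is for: headings and leading whitespace outside code fences are neutralized so Gradio's Markdown component renders model output literally, while fenced code is passed through untouched.

from utils.gradio import convert_to_markdown

raw = "# not a heading\n```\nprint('code is passed through verbatim')\n```\n\tindented line"
print(convert_to_markdown(raw))
# The leading "#" outside the fence comes back escaped as "\#", the fenced block is
# kept as-is, and the leading tab is replaced with an HTML entity so the renderer
# does not collapse it.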
utils/inference.py
ADDED
@@ -0,0 +1,107 @@
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from typing import Iterator
from variables import SYSTEM, HUMAN, AI


def load_tokenizer_and_model(base_model, load_8bit=True):

    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    tokenizer = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelForCausalLM.from_pretrained(base_model, load_8bit=load_8bit)

    return tokenizer, model, device


class State:
    interrupted = False

    def interrupt(self):
        self.interrupted = True

    def recover(self):
        self.interrupted = False


shared_state = State()


def decode(
    input_ids: torch.Tensor,
    model: PeftModel,
    tokenizer: AutoTokenizer,
    stop_words: list,
    max_length: int,
    temperature: float = 1.0,
    top_p: float = 1.0,
) -> Iterator[str]:
    generated_tokens = []
    past_key_values = None

    for _ in range(max_length):
        with torch.no_grad():
            if past_key_values is None:
                outputs = model(input_ids)
            else:
                outputs = model(input_ids[:, -1:], past_key_values=past_key_values)
            logits = outputs.logits[:, -1, :]
            past_key_values = outputs.past_key_values

        # apply temperature
        logits /= temperature

        probs = torch.softmax(logits, dim=-1)
        # apply top_p
        probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
        probs_sum = torch.cumsum(probs_sort, dim=-1)
        mask = probs_sum - probs_sort > top_p
        probs_sort[mask] = 0.0

        probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
        next_token = torch.multinomial(probs_sort, num_samples=1)
        next_token = torch.gather(probs_idx, -1, next_token)

        input_ids = torch.cat((input_ids, next_token), dim=-1)

        generated_tokens.append(next_token[0].item())
        text = tokenizer.decode(generated_tokens)

        yield text
        if any([x in text for x in stop_words]):
            return


def get_prompt_with_history(text, history, tokenizer, max_length=2048):
    prompt = SYSTEM
    history = [f"\n{HUMAN} {x[0]}\n{AI} {x[1]}" for x in history]
    history.append(f"\n{HUMAN} {text}\n{AI}")
    history_text = ""
    flag = False
    for x in history[::-1]:
        if (
            tokenizer(prompt + history_text + x, return_tensors="pt")["input_ids"].size(
                -1
            )
            <= max_length
        ):
            history_text = x + history_text
            flag = True
        else:
            break
    if flag:
        return prompt + history_text, tokenizer(
            prompt + history_text, return_tensors="pt"
        )
    else:
        return None


def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
    for stop_word in stop_words:
        if s.endswith(stop_word):
            return True
        for i in range(1, len(stop_word)):
            if s.endswith(stop_word[:i]):
                return True
    return False
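Editor's note (not part of the diff): a toy, self-contained walk-through of the nucleus (top-p) sampling step inside decode(): sort the probabilities, zero out everything past the smallest prefix whose cumulative mass covers top_p, renormalize, and sample. The logits values are invented for illustration.

import torch

torch.manual_seed(0)
logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])
top_p = 0.9

probs = torch.softmax(logits, dim=-1)
probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
probs_sum = torch.cumsum(probs_sort, dim=-1)

# Same mask as decode(): drop tokens whose probability mass lies entirely beyond top_p.
mask = probs_sum - probs_sort > top_p
probs_sort[mask] = 0.0
probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))

# Sample from the truncated distribution, then map back to the original vocab index.
next_token = torch.gather(probs_idx, -1, torch.multinomial(probs_sort, num_samples=1))
print(probs, next_token)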
variables.py
ADDED
@@ -0,0 +1,5 @@
SYSTEM = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
HUMAN = "[Human]:"
AI = "[AI]:"
NAME = "Lemur"
ORGANIZATION = "UC San Diego (UCSD)"
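Editor's note (not part of the diff): an illustration of the prompt string that utils.inference.get_prompt_with_history() assembles from these constants for one prior turn plus a new question; the example turn itself is invented.

from variables import SYSTEM, HUMAN, AI

history = [["Hi!", "Hello! How can I help?"]]
text = "What is a lemur?"

# Mirrors get_prompt_with_history() when everything fits within max_length.
turns = [f"\n{HUMAN} {h}\n{AI} {a}" for h, a in history] + [f"\n{HUMAN} {text}\n{AI}"]
print(SYSTEM + "".join(turns))
# A chat between a curious human and an artificial intelligence assistant. ...
# [Human]: Hi!
# [AI]: Hello! How can I help?
# [Human]: What is a lemur?
# [AI]: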