glm-chat

Running on Zero

File size: 3,595 Bytes

51a7d9e
 
 
 
22f5f54
51a7d9e
edb9e8a
51a7d9e
 
 
063316d
99a7a45
 
51a7d9e
99a7a45
51a7d9e
d875b4e
51a7d9e
 
 
 
 
 
 
 
 
 
99a7a45
063316d
22f5f54
 
 
 
 
99a7a45
51a7d9e
f663115
51a7d9e
063316d
fd6304d
 
51a7d9e
 
 
 
 
fd6304d
99a7a45
 
22f5f54
030c23d
639e063
edb9e8a
030c23d
 
f663115
 
51a7d9e
22f5f54
063316d
51a7d9e
030c23d
0961bc7
f663115
030c23d
 
b4d1f01
 
 
 
8ea3132
99a7a45
51a7d9e
 
 
 
 
 
 
 
 
 
 
 
 
 
063316d
51a7d9e
 
 
 
 
063316d
51a7d9e
063316d
030c23d
51a7d9e
 
 
 
 
 
 
 
 
 
 
99a7a45
51a7d9e

import torch
from PIL import Image
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import os
from threading import Thread


# Optional Hugging Face access token; None when the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Hub repository id of the chat model served by this app.
MODEL_LIST = "THUDM/LongWriter-glm4-9b"

# HTML heading rendered at the top of the page.
TITLE = "<h1><center>GLM SPACE</center></h1>"

# HTML shown inside the chatbot while the conversation is still empty.
PLACEHOLDER = "<h3><center>Feel Free To Test GLM<br>Select Model in Parameters</center></h3>"

# Custom stylesheet: styles elements carrying the "duplicate-button" class.
CSS = """
.duplicate-button {
  margin: auto !important;
  color: white !important;
  background: black !important;
  border-radius: 100vh !important;
}
"""

# Load the causal LM weights in bfloat16, then move the model to device 0 and
# switch it to evaluation mode. MODEL_LIST holds the same repository id that
# was previously spelled out inline here.
_loaded_model = AutoModelForCausalLM.from_pretrained(
    MODEL_LIST,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
model_chat = _loaded_model.to(0).eval()

# NOTE(review): the tokenizer comes from the glm-4-9b-chat repository rather
# than from the model's own repository -- presumably intentional; confirm.
tokenizer_chat = AutoTokenizer.from_pretrained(
    "THUDM/glm-4-9b-chat",
    trust_remote_code=True,
)


@spaces.GPU
def stream_chat(message: str, history: list, temperature: float, max_length: int):
    """Stream the model's reply to ``message`` given the prior chat ``history``.

    Args:
        message: The new user message.
        history: Previous turns, iterated as ``(user_prompt, assistant_answer)``
            pairs.
        temperature: Sampling temperature. A value of 0 (or below) switches to
            greedy decoding, because sampling requires a strictly positive
            temperature.
        max_length: Forwarded to ``generate`` as ``max_length``.

    Yields:
        The response text accumulated so far, growing with every chunk the
        streamer produces.
    """
    print(f'message is - {message}')
    print(f'history is - {history}')
    conversation = []
    for prompt, answer in history:
        conversation.extend([{"role": "user", "content": prompt}, {"role": "assistant", "content": answer}])
    conversation.append({"role": "user", "content": message})

    print(f"Conversation is -\n{conversation}")

    # The module defines `tokenizer_chat` / `model_chat`; the bare names
    # `tokenizer` / `model` used previously do not exist and raised NameError.
    model_inputs = tokenizer_chat.apply_chat_template(
        conversation,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model_chat.device)
    streamer = TextIteratorStreamer(tokenizer_chat, timeout=60.0, skip_prompt=True, skip_special_tokens=True)

    generate_kwargs = dict(
        max_length=max_length,
        streamer=streamer,
        repetition_penalty=1.2,
        num_beams=1,
    )
    if temperature > 0:
        generate_kwargs.update(do_sample=True, top_k=1, temperature=temperature)
    else:
        # The UI slider allows 0, which is rejected when sampling is enabled.
        # With top_k=1 sampling always picks the top token anyway, so greedy
        # decoding is the equivalent fallback.
        generate_kwargs.update(do_sample=False)
    gen_kwargs = {**model_inputs, **generate_kwargs}

    with torch.no_grad():
        # generate() blocks until done, so it runs in a worker thread while
        # this generator relays the streamed text to the caller.
        thread = Thread(target=model_chat.generate, kwargs=gen_kwargs)
        thread.start()
        buffer = ""
        for new_text in streamer:
            buffer += new_text
            yield buffer
        # The streamer is exhausted once generation ends; reap the worker.
        thread.join()
 
# Example prompts offered in the chat UI (one single-element row per example).
EXAMPLE_PROMPTS = [
    ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],
    ["What are 5 creative things I could do with my kids' art? I don't want to throw them away, but it's also so much clutter."],
    ["Tell me a random fun fact about the Roman Empire."],
    ["Show me a code snippet of a website's sticky header in CSS and JavaScript."],
]

# Chat display, created up front and handed to the ChatInterface below.
chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)

with gr.Blocks(css=CSS) as demo:
    gr.HTML(TITLE)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_classes="duplicate-button",
    )

    # The extra controls are built with render=False and passed to
    # ChatInterface, which receives them as its additional inputs.
    parameters_accordion = gr.Accordion(label="⚙️ Parameters", open=False, render=False)
    temperature_slider = gr.Slider(
        minimum=0,
        maximum=1,
        step=0.1,
        value=0.5,
        label="Temperature",
        render=False,
    )
    max_length_slider = gr.Slider(
        minimum=128,
        maximum=32768,
        step=1,
        value=4096,
        label="Max Length",
        render=False,
    )

    # Slider order matches stream_chat's (temperature, max_length) parameters.
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=parameters_accordion,
        additional_inputs=[temperature_slider, max_length_slider],
        examples=EXAMPLE_PROMPTS,
        cache_examples=False,
    )


if __name__ == "__main__":
    demo.launch()