MagpieLM-4B

Running on Zero

App Files Files Community

MagpieLM-4B / app.py

yuchenlin

Update app.py

fd8b009 verified 2 days ago

raw

history blame contribute delete

No virus

2.77 kB

	import gradio as gr
	from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
	import spaces
	from threading import Thread
	from typing import Iterator

	# Load model and tokenizer
	model_name = "Magpie-Align/MagpieLM-4B-Chat-v0.1"

	device = "cuda" # the device to load the model onto
	tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
	model = AutoModelForCausalLM.from_pretrained(
	model_name,
	torch_dtype="auto"
	)
	model.to(device)

	MAX_INPUT_TOKEN_LENGTH = 4096 # You may need to adjust this value

	@spaces.GPU
	def respond(
	message: str,
	chat_history: list[tuple[str, str]],
	system_prompt: str,
	max_new_tokens: int = 1024,
	temperature: float = 0.6,
	top_p: float = 0.9,
	top_k: int = 50,
	repetition_penalty: float = 1.2,
	) -> Iterator[str]:
	conversation = []
	if system_prompt:
	conversation.append({"role": "system", "content": system_prompt})
	for user, assistant in chat_history:
	conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
	conversation.append({"role": "user", "content": message})

	input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
	if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
	input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
	gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
	input_ids = input_ids.to(model.device)

	streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
	generate_kwargs = dict(
	input_ids=input_ids,
	streamer=streamer,
	max_new_tokens=max_new_tokens,
	do_sample=True,
	top_p=top_p,
	top_k=top_k,
	temperature=temperature,
	num_beams=1,
	repetition_penalty=repetition_penalty,
	)
	t = Thread(target=model.generate, kwargs=generate_kwargs)
	t.start()

	outputs = []
	for text in streamer:
	outputs.append(text)
	yield "".join(outputs)

	demo = gr.ChatInterface(
	respond,
	additional_inputs=[
	gr.Textbox(value="You are Magpie, a helpful AI assistant.", label="System message"),
	gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
	gr.Slider(minimum=0.1, maximum=4.0, value=0.6, step=0.1, label="Temperature"),
	gr.Slider(
	minimum=0.1,
	maximum=1.0,
	value=0.9,
	step=0.05,
	label="Top-p (nucleus sampling)",
	),
	gr.Slider(minimum=0.5, maximum=1.5, value=1.0, step=0.1, label="Repetation Penalty"),
	],
	)


	if __name__ == "__main__":
	demo.queue()
	demo.launch(share=True)