import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import Literal, Sequence, TypedDict

Role = Literal["system", "user", "assistant"]


class Message(TypedDict):
    role: Role
    content: str


Dialog = Sequence[Message]


class ChatFormat:
    def encode_header(self, message: Message) -> str:
        return f"{message['role']}\n\n"

    def encode_message(self, message: Message) -> str:
        header = self.encode_header(message)
        return f"{header}{message['content'].strip()}"

    def encode_dialog_prompt(self, dialog: Dialog) -> str:
        dialog_str = ""
        for message in dialog:
            dialog_str += self.encode_message(message)
        # Append an empty assistant header so the model continues with its reply.
        dialog_str += self.encode_header({"role": "assistant", "content": ""})
        return dialog_str


class MedS_Llama3:
    def __init__(self, model_path: str):
        # Load the model on CPU
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map='cpu',           # load weights onto the CPU
            torch_dtype=torch.float32   # use standard float32 precision
        )
        self.model.config.pad_token_id = self.model.config.eos_token_id = 128009
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            model_max_length=2048,
            padding_side="right"
        )
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model.eval()
        self.prompt_engine = ChatFormat()
        print('Model and tokenizer loaded on CPU!')

    def __build_inputs_for_llama3(self, query: str, instruction: str) -> str:
        input_ss = [
            {"role": 'system', "content": instruction},
            {"role": 'user', "content": query}
        ]
        return self.prompt_engine.encode_dialog_prompt(input_ss)

    def chat(self, query: str, instruction: str, max_output_tokens: int) -> str:
        formatted_query = f"Input:\n{query}\nOutput:\n"
        input_sentence = self.__build_inputs_for_llama3(formatted_query, instruction)
        input_tokens = self.tokenizer(
            input_sentence,
            return_tensors="pt",
            padding=True,
            truncation=True
        )
        output = self.model.generate(
            **input_tokens,
            max_new_tokens=int(max_output_tokens),
            eos_token_id=128009
        )
        # Decode only the newly generated tokens, skipping the prompt portion.
        generated_text = self.tokenizer.decode(
            output[0][input_tokens['input_ids'].shape[1]:],
            skip_special_tokens=True
        )
        return generated_text.strip()


# Instantiate the model
model_path = "Henrychur/MMedS-Llama-3-8B"  # make sure this is the correct model path
chat_model = MedS_Llama3(model_path)


# Response function used by the Gradio interface
def respond(message, system_message, max_output_tokens):
    # No history is kept between turns; only the current input and the system instruction are used.
    response = chat_model.chat(query=message, instruction=system_message, max_output_tokens=max_output_tokens)
    yield response


# Set up the Gradio interface
demo = gr.Interface(
    fn=respond,
    inputs=[
        gr.Textbox(value="What is the treatment for diabetes?", label="Your Input"),
        gr.Textbox(value="If you are a doctor, please perform clinical consulting with the patient.", label="System message"),
        gr.Slider(minimum=1, maximum=1024, value=512, step=1, label="Max Output Tokens")
    ],
    outputs="text"
)

if __name__ == "__main__":
    demo.launch()
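
# --- Usage sketch (assumption, not part of the original script) ---
# Once the demo is launched, it can also be queried programmatically with the
# official gradio_client package. The URL below is the default local address
# Gradio prints at startup and is an assumption; adjust it for your deployment.
#
# from gradio_client import Client
#
# client = Client("http://127.0.0.1:7860")  # assumed local demo URL
# answer = client.predict(
#     "What is the treatment for diabetes?",  # Your Input
#     "If you are a doctor, please perform clinical consulting with the patient.",  # System message
#     512,  # Max Output Tokens
#     api_name="/predict",
# )
# print(answer)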