from typing import Iterator

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig

model_id = 'baichuan-inc/Baichuan2-13B-Chat'

if torch.cuda.is_available():
  model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    trust_remote_code=True
  )
  # Quantize to 4-bit with the quantize() method provided by Baichuan2's
  # remote code, then move the quantized model to the GPU.
  model = model.quantize(4).cuda()
  model.generation_config = GenerationConfig.from_pretrained(model_id)
else:
  model = None
tokenizer = AutoTokenizer.from_pretrained(
  model_id,
  use_fast=False,
  trust_remote_code=True
)

def get_prompt(
  message: str,
  chat_history: list[tuple[str, str]],
  system_prompt: str
) -> str:
  # Build a Llama-2-style chat prompt. In this app the rendered string is
  # only used by get_input_token_length() to estimate the input size;
  # run() passes the structured history to model.chat() instead.
  texts = [f'<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n']
  # The first user input is _not_ stripped.
  do_strip = False
  for user_input, response in chat_history:
    user_input = user_input.strip() if do_strip else user_input
    do_strip = True
    texts.append(f'{user_input} [/INST] {response.strip()} </s><s>[INST] ')
  message = message.strip() if do_strip else message
  texts.append(f'{message} [/INST]')
  return ''.join(texts)
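
# For illustration (hypothetical inputs), a one-turn history renders as:
#   get_prompt('How big is it?', [('Hi', 'Hello!')], 'Be brief.')
#   == '<s>[INST] <<SYS>>\nBe brief.\n<</SYS>>\n\nHi [/INST] Hello! </s><s>[INST] How big is it? [/INST]'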

def get_input_token_length(
  message: str,
  chat_history: list[tuple[str, str]],
  system_prompt: str
) -> int:
  prompt = get_prompt(message, chat_history, system_prompt)
  input_ids = tokenizer([prompt], return_tensors='np', add_special_tokens=False)['input_ids']
  return input_ids.shape[-1]

def run(
  message: str,
  chat_history: list[tuple[str, str]],
  system_prompt: str,
  max_new_tokens: int = 1024,
  temperature: float = 1.0,
  top_p: float = 0.95,
  top_k: int = 5
) -> Iterator[str]:
  # Convert the (user, assistant) tuples into the message dicts that
  # Baichuan2's chat() expects; its template also accepts an optional
  # leading system message.
  history = []
  if system_prompt:
    history.append({'role': 'system', 'content': system_prompt})
  for user_input, response in chat_history:
    history.append({'role': 'user', 'content': user_input})
    history.append({'role': 'assistant', 'content': response})
  history.append({'role': 'user', 'content': message})

  # chat() reads its sampling settings from the model's generation config,
  # so apply the arguments there before generating.
  model.generation_config.max_new_tokens = max_new_tokens
  model.generation_config.temperature = temperature
  model.generation_config.top_p = top_p
  model.generation_config.top_k = top_k

  # With stream=True, chat() returns a streamer that yields incremental
  # text chunks; accumulate them so every yield is the response so far.
  result = ''
  for response in model.chat(tokenizer, history, stream=True):
    result += response
    yield result
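

if __name__ == '__main__':
  # Minimal smoke test: a sketch with placeholder prompts, assuming a CUDA
  # GPU is available (the model is only loaded when one is found).
  if model is None:
    raise SystemExit('No CUDA device found, so the model was not loaded.')
  system = 'You are a helpful assistant.'
  question = 'What is the capital of France?'
  print('input tokens:', get_input_token_length(question, [], system))
  answer = ''
  for answer in run(question, [], system):
    pass  # each value is the full response generated so far
  print(answer)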