HODACHI committed
Commit 789187b (1 parent: e023e8c)

Update app.py

Files changed (1): app.py (+36, -17)
app.py CHANGED
@@ -3,14 +3,23 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 import torch
 from threading import Thread
 
-MODEL_ID = "HODACHI/EZO-Common-9B-gemma-2-it"
+MODEL_ID = "HODACHI/Llama-3.1-8B-EZO-1.1-it"
 DTYPE = torch.bfloat16
 
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
-    device_map="cuda",
-    torch_dtype=DTYPE,
+    torch_dtype=torch.bfloat16,  # compute in bfloat16 to balance precision and speed
+    device_map="auto",           # automatically place the model on available devices
+    low_cpu_mem_usage=True,      # keep CPU memory usage low
+)
+
+pipeline = transformers.pipeline(
+    "text-generation",           # task to run (here, text generation)
+    model=model,                 # model to use
+    tokenizer=tokenizer,         # tokenizer to use
+    device_map="auto",           # automatic device assignment
 )
 
 def respond(
@@ -21,32 +30,42 @@ def respond(
     top_p,
 ):
     chat = []
+    chat.append({"role": "system", "content": "あなたは誠実で優秀な日本人のアシスタントです。特に指示が無い場合は、原則日本語で回答してください。"})
     for user, assistant in history:
         chat.append({"role": "user", "content": user})
         chat.append({"role": "assistant", "content": assistant})
     chat.append({"role": "user", "content": message})
 
-    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
-    inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt").to(model.device)
+    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    #inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt").to(model.device)
 
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
-    generation_kwargs = dict(
-        input_ids=inputs,
-        max_new_tokens=max_tokens,
-        temperature=temperature,
-        top_p=top_p,
-        do_sample=True,
-        streamer=streamer,
-    )
+    #generation_kwargs = dict(
+    #    input_ids=inputs,
+    #    max_new_tokens=max_tokens,
+    #    temperature=temperature,
+    #    top_p=top_p,
+    #    do_sample=True,
+    #    streamer=streamer,
+    #)
 
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
-    response = ""
-    for new_text in streamer:
-        response += new_text
-        yield response
+    #response = ""
+    #for new_text in streamer:
+    #    response += new_text
+    #    yield response
+    outputs = pipeline(
+        prompt,
+        max_new_tokens=40,       # maximum number of new tokens to generate
+        do_sample=True,          # enable sampling for more varied output
+        temperature=0.7,         # generation diversity (higher = more varied, lower = more deterministic)
+        top_p=0.95,              # cumulative-probability threshold for nucleus sampling
+    )
+
+    response = outputs[0]["generated_text"]
 
 demo = gr.ChatInterface(
     respond,
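
As committed, the new respond() cannot run as written: transformers.pipeline is called although only individual names are imported from transformers in the visible part of the file, apply_chat_template receives messages (a name not defined in this function) instead of the chat list just built, the background Thread still references the commented-out generation_kwargs, the hard-coded max_new_tokens=40 / temperature=0.7 / top_p=0.95 ignore the UI parameters, and response is never returned to gr.ChatInterface. A minimal sketch of the pipeline-based flow this commit appears to aim for is below; the respond() parameter list is inferred from the hunk header and the old generation_kwargs, and return_full_text is an assumption not present in the commit.

# Sketch only: assumes the module-level MODEL_ID, tokenizer, and pipeline defined above.
import transformers  # needed for the transformers.pipeline(...) call at module level

def respond(message, history, max_tokens, temperature, top_p):
    # Start from the commit's Japanese system prompt, then replay the chat history.
    chat = [{"role": "system", "content": "あなたは誠実で優秀な日本人のアシスタントです。特に指示が無い場合は、原則日本語で回答してください。"}]
    for user, assistant in history:
        chat.append({"role": "user", "content": user})
        chat.append({"role": "assistant", "content": assistant})
    chat.append({"role": "user", "content": message})

    # Build the prompt from the chat list (the commit passes the undefined name `messages`).
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

    # Generate with the pipeline, honouring the UI parameters instead of hard-coded values;
    # the leftover Thread/streamer plumbing is no longer needed.
    outputs = pipeline(
        prompt,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        return_full_text=False,  # return only the newly generated text, not the prompt
    )

    # gr.ChatInterface expects the reply to be returned (or yielded).
    return outputs[0]["generated_text"]

Note that replacing the TextIteratorStreamer/Thread pattern with a blocking pipeline call also drops token-by-token streaming; if streaming is still wanted, the earlier streamer-based generation would need to be kept rather than commented out.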