dahara1 committed on
Commit
19d18f7
1 Parent(s): c023a77

Create README.md

Files changed (1)
  1. README.md +105 -0
README.md ADDED
@@ -0,0 +1,105 @@
---
tags:
- npu
- amd
- llama3
---

This model has been quantized with AWQ and converted to run on the NPU in Ryzen AI PCs (for example, those with a Ryzen 9 7940HS processor) under Windows.
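
For background, "AWQ quantized" means the linear-layer weights are stored as 4-bit integers with one scale and zero-point per small group of weights (the checkpoint name below, `pytorch_llama3_8b_w_bit_4_awq_lm_amd.pt`, reflects `w_bit=4`). Here is a minimal, illustrative sketch of plain group-wise 4-bit quantization; it is not the exact AWQ algorithm, which additionally rescales weights using activation statistics before quantizing:

```
import torch

def quantize_per_group(w, group_size=128, n_bit=4):
    # Asymmetric min-max quantization: one scale/zero-point per group.
    g = w.reshape(-1, group_size)
    w_min = g.amin(dim=1, keepdim=True)
    w_max = g.amax(dim=1, keepdim=True)
    scale = (w_max - w_min).clamp(min=1e-5) / (2**n_bit - 1)
    zero = (-w_min / scale).round()
    q = (g / scale + zero).round().clamp(0, 2**n_bit - 1)
    return q.to(torch.uint8), scale, zero

def dequantize_per_group(q, scale, zero, shape):
    # Recover an approximation of the original weights.
    return ((q.float() - zero) * scale).reshape(shape)

w = torch.randn(4096, 4096)
q, scale, zero = quantize_per_group(w)
w_hat = dequantize_per_group(q, scale, zero, w.shape)
print("max abs error:", (w - w_hat).abs().max().item())
```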

For information on setting up Ryzen AI for LLMs, see [Running LLM on AMD NPU Hardware](https://www.hackster.io/gharada2013/running-llm-on-amd-npu-hardware-19322f).

The following script assumes that `conda activate XXXX` (your Ryzen AI environment) and `.\setup.bat` have already been run in cmd.

### setup
```

```

### Sample Script

```
import torch
import time
import os
import psutil
import transformers
from transformers import AutoTokenizer, set_seed
import qlinear
import logging

set_seed(123)
transformers.logging.set_verbosity_error()
logging.disable(logging.CRITICAL)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
]

message_list = [
    "Who are you? ",
    # Japanese: "What is the name of the ship you are on? Please reply using only Japanese, not English."
    "あなたの乗っている船の名前は何ですか?英語ではなく全て日本語だけを使って返事をしてください",
    # Chinese: "What is the most dangerous adventure you have ever had? Please answer everything in Chinese, not English."
    "你经历过的最危险的冒险是什么?请用中文回答所有问题,不要用英文。",
    # French: "How fast does your boat go? Please answer only in French, not English."
    "À quelle vitesse va votre bateau ? Veuillez répondre uniquement en français et non en anglais.",
    # Korean: "What part of the ship do you like? Please answer entirely in Korean, not English."
    "당신은 그 배의 어디를 좋아합니까? 영어를 사용하지 않고 모두 한국어로 대답하십시오.",
    # German: "What would your ship's name be in German? Please answer in German instead of English."
    "Wie würde Ihr Schiffsname auf Deutsch lauten? Bitte antwortet alle auf Deutsch statt auf Englisch.",
    # Taiwanese: "What is the most amazing treasure you have ever found? Please answer only in Taiwanese and Traditional Chinese, not English."
    "您發現過的最令人驚奇的寶藏是什麼?請僅使用台語和繁體中文回答,不要使用英文。",
]


if __name__ == "__main__":
    # Pin the host process to four cores; the CPU side only drives the NPU.
    p = psutil.Process()
    p.cpu_affinity([0, 1, 2, 3])
    torch.set_num_threads(4)

    tokenizer = AutoTokenizer.from_pretrained("dahara1/llama3-8b-amd-npu")
    ckpt = "pytorch_llama3_8b_w_bit_4_awq_lm_amd.pt"
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    model = torch.load(ckpt)
    model.eval()
    model = model.to(torch.bfloat16)

    # Move every quantized linear layer onto the NPU ("aie" device).
    for n, m in model.named_modules():
        if isinstance(m, qlinear.QLinearPerGrp):
            print(f"Preparing weights of layer : {n}")
            m.device = "aie"
            m.quantize_weights()

    print("system: " + messages[0]['content'])

    for i in range(len(message_list)):
        messages.append({"role": "user", "content": message_list[i]})
        print("user: " + message_list[i])

        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True
        )

        outputs = model.generate(inputs['input_ids'],
                                 max_new_tokens=600,
                                 eos_token_id=terminators,
                                 attention_mask=inputs['attention_mask'],
                                 do_sample=True,
                                 temperature=0.6,
                                 top_p=0.9)

        response = outputs[0][inputs['input_ids'].shape[-1]:]
        response_message = tokenizer.decode(response, skip_special_tokens=True)
        print("assistant: " + response_message)
        # Record the reply as an assistant turn (the original used "system" here,
        # which would corrupt the chat history for the next iteration).
        messages.append({"role": "assistant", "content": response_message})
```
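
The script imports `time` and `os` but never uses them; `time` comes in handy if you want a rough tokens-per-second figure. Here is a minimal sketch that reuses `model`, `tokenizer`, and `terminators` from the script above (the 128-token budget and greedy decoding are arbitrary benchmarking choices, not part of the original script):

```
import time

bench = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Who are you? "}],
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
)

start = time.perf_counter()
out = model.generate(bench["input_ids"],
                     max_new_tokens=128,
                     eos_token_id=terminators,
                     attention_mask=bench["attention_mask"],
                     do_sample=False)
elapsed = time.perf_counter() - start

new_tokens = out.shape[-1] - bench["input_ids"].shape[-1]
print(f"{new_tokens} tokens in {elapsed:.1f} s ({new_tokens / elapsed:.2f} tokens/s)")
```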