bandhit committed
Commit 7f4bdfa
1 Parent(s): 6a9bdc0

Update README.md

Files changed (1)
  1. README.md +70 -1
README.md CHANGED
@@ -6,4 +6,73 @@ library_name: transformers
  pipeline_tag: text-generation
  ---

- A 4-bit quantization of [scb10x/typhoon-7b](https://huggingface.co/scb10x/typhoon-7b) that requires less than 8 GB of VRAM.
+ A 4-bit quantization of [scb10x/typhoon-7b](https://huggingface.co/scb10x/typhoon-7b) that requires less than 8 GB of VRAM.
+
+ ```python
+ # init parameters
+ model_name: str = 'scb10x/typhoon-7b'
+ quantization_mode: str = 'q4-bnb_cuda' # possible values = {'q4-bnb_cuda', 'q8-bnb_cuda', 'q4-torch_ptdq', 'q8-torch_ptdq'}
+
+ # load tokenizer
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ tokenizer.pad_token_id = tokenizer.eos_token_id
+ print(tokenizer) # LlamaTokenizerFast
+
+ # load model
+ import torch
+ from transformers import AutoModelForCausalLM
+
+ if quantization_mode == 'q4-bnb_cuda': # Ampere GPU with 8 GB VRAM + 20 GB system RAM recommended
+     print('4-bits bitsandbytes quantization with cuda')
+     model = AutoModelForCausalLM.from_pretrained(
+         model_name,
+         load_in_4bit = True,
+         device_map = 'auto',
+         torch_dtype = torch.bfloat16)
+ elif quantization_mode == 'q8-bnb_cuda': # Ampere GPU with 12 GB VRAM + 20 GB system RAM recommended
+     print('8-bits bitsandbytes quantization with cuda')
+     model = AutoModelForCausalLM.from_pretrained(
+         model_name,
+         load_in_8bit = True,
+         device_map = 'auto',
+         torch_dtype = torch.bfloat16)
+ elif quantization_mode == 'q4-torch_ptdq': # CPU with 64 GB+ RAM recommended
+     print('4-bits x2 post training dynamic quantization')
+     base_model = AutoModelForCausalLM.from_pretrained(
+         model_name,
+         torch_dtype = torch.float32)
+     model = torch.quantization.quantize_dynamic(base_model, dtype = torch.quint4x2)
+ elif quantization_mode == 'q8-torch_ptdq': # CPU with 64 GB+ RAM recommended
+     print('8-bits post training dynamic quantization')
+     base_model = AutoModelForCausalLM.from_pretrained(
+         model_name,
+         torch_dtype = torch.float32)
+     model = torch.quantization.quantize_dynamic(base_model, dtype = torch.quint8)
+ else:
+     print('default model')
+     model = AutoModelForCausalLM.from_pretrained(model_name)
+ print(model) # MistralForCausalLM
+
+ # text generator
+ from transformers import GenerationConfig, TextGenerationPipeline
+
+ config = GenerationConfig.from_pretrained(model_name)
+ config.num_return_sequences: int = 1
+ config.do_sample: bool = True
+ config.max_new_tokens: int = 128
+ config.temperature: float = 0.7
+ config.top_p: float = 0.95
+ config.repetition_penalty: float = 1.3
+ generator = TextGenerationPipeline(
+     model = model,
+     tokenizer = tokenizer,
+     return_full_text = True,
+     generation_config = config)
+
+ # sample prompt (Thai: 'What is the meaning of life?')
+ sample: str = 'ความหมายของชีวิตคืออะไร?\n'
+ output = generator(sample, pad_token_id = tokenizer.eos_token_id)
+ print(output[0]['generated_text'])
+ ```
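
A minimal follow-up sketch to check the sub-8 GB VRAM figure on your own hardware, assuming `model` was loaded through one of the bitsandbytes CUDA branches above and at least one generation has run:

```python
# rough check of the VRAM claim above; assumes `model` came from the
# 'q4-bnb_cuda' or 'q8-bnb_cuda' branch and a generation has already run
print(f'model footprint: {model.get_memory_footprint() / 1024 ** 3:.2f} GiB')
if torch.cuda.is_available():
    print(f'peak cuda memory allocated: {torch.cuda.max_memory_allocated() / 1024 ** 3:.2f} GiB')
```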