Lin-K76 committed
Commit eba4504
1 Parent(s): 85dc941

Update README.md

Files changed (1)
  1. README.md +11 -14
README.md CHANGED
@@ -42,27 +42,24 @@ Only the weights and activations of the linear operators within transformers blo
 This model can be deployed efficiently using the [vLLM](https://docs.vllm.ai/en/latest/) backend, as shown in the example below.
 
 ```python
-from vllm import LLM, SamplingParams
 from transformers import AutoTokenizer
+from vllm import LLM, SamplingParams
 
-model_id = "neuralmagic/DeepSeek-Coder-V2-Instruct-FP8"
-
-sampling_params = SamplingParams(temperature=0.6, top_p=0.9, max_tokens=256)
-
-tokenizer = AutoTokenizer.from_pretrained(model_id)
+max_model_len, tp_size = 4096, 4
+model_name = "neuralmagic/DeepSeek-Coder-V2-Instruct-FP8"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+llm = LLM(model=model_name, tensor_parallel_size=tp_size, max_model_len=max_model_len, trust_remote_code=True, enforce_eager=True)
+sampling_params = SamplingParams(temperature=0.3, max_tokens=256, stop_token_ids=[tokenizer.eos_token_id])
 
-messages = [
-    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
-    {"role": "user", "content": "Who are you?"},
+messages_list = [
+    [{"role": "user", "content": "Who are you? Please respond in pirate speak!"}],
 ]
 
-prompts = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-llm = LLM(model=model_id, trust_remote_code=True, max_model_len=4096, tensor_parallel_size=4)
+prompt_token_ids = [tokenizer.apply_chat_template(messages, add_generation_prompt=True) for messages in messages_list]
 
-outputs = llm.generate(prompts, sampling_params)
+outputs = llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sampling_params)
 
-generated_text = [output.outputs[0].text for output in outputs]
-generated_text = outputs[0].outputs[0].text
+generated_text = [output.outputs[0].text for output in outputs]
 print(generated_text)
 ```
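The updated example pre-tokenizes each conversation with `apply_chat_template` (which returns token IDs when `tokenize` is left at its default) and batches them through vLLM's `prompt_token_ids` argument. A minimal sketch of extending the batch, reusing the `tokenizer`, `llm`, and `sampling_params` objects from the updated example; the extra prompts are illustrative and not part of this commit:

```python
# Illustrative sketch, not part of this commit: batching more conversations
# through the same pre-tokenized path as the updated README example.
# Assumes tokenizer, llm, and sampling_params were created as shown above.
more_messages = [
    [{"role": "user", "content": "Write a Python function that reverses a string."}],
    [{"role": "user", "content": "Summarize what FP8 weight quantization does."}],
]

# apply_chat_template tokenizes by default, so each entry is a list of token
# IDs that vLLM can consume directly via prompt_token_ids.
more_token_ids = [
    tokenizer.apply_chat_template(m, add_generation_prompt=True)
    for m in more_messages
]

outputs = llm.generate(prompt_token_ids=more_token_ids, sampling_params=sampling_params)
for output in outputs:
    print(output.outputs[0].text)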