djstrong committed on
Commit
7599592
1 Parent(s): 936cb04

queue max_size

Browse files
Files changed (1) hide show
  1. app.py +17 -15
app.py CHANGED
@@ -58,6 +58,21 @@ def send_discord(i,o):
58
  else:
59
  print(f"Not sent with {result.status_code}, response:\n{result.json()}")
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  @spaces.GPU()
62
  def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
63
  print('LLL', message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p)
@@ -141,20 +156,7 @@ def predict(message, history, system_prompt, temperature, max_new_tokens, top_k,
141
  run_as_future=True
142
  )
143
 
144
- # Load model
145
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
146
- quantization_config = BitsAndBytesConfig(
147
- load_in_4bit=True,
148
- bnb_4bit_compute_dtype=torch.bfloat16
149
- )
150
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
151
- tokenizer.pad_token = tokenizer.eos_token
152
- model = AutoModelForCausalLM.from_pretrained(
153
- MODEL_ID,
154
- device_map="auto",
155
- quantization_config=quantization_config,
156
- attn_implementation="flash_attention_2",
157
- )
158
 
159
  on_load="""
160
  async()=>{
@@ -203,7 +205,7 @@ with gr.Blocks() as demo:
203
  js=on_load,
204
  )
205
 
206
- demo.queue().launch()
207
 
208
  # chatbot = gr.Chatbot(label="Chatbot", likeable=True)
209
  # chatbot.like(vote, None, None)
 
58
  else:
59
  print(f"Not sent with {result.status_code}, response:\n{result.json()}")
60
 
61
+ # Load model
62
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
63
+ quantization_config = BitsAndBytesConfig(
64
+ load_in_4bit=True,
65
+ bnb_4bit_compute_dtype=torch.bfloat16
66
+ )
67
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
68
+ tokenizer.pad_token = tokenizer.eos_token
69
+ model = AutoModelForCausalLM.from_pretrained(
70
+ MODEL_ID,
71
+ device_map="auto",
72
+ quantization_config=quantization_config,
73
+ attn_implementation="flash_attention_2",
74
+ )
75
+
76
  @spaces.GPU()
77
  def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
78
  print('LLL', message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p)
 
156
  run_as_future=True
157
  )
158
 
159
+
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
  on_load="""
162
  async()=>{
 
205
  js=on_load,
206
  )
207
 
208
+ demo.queue(max_size=20).launch()
209
 
210
  # chatbot = gr.Chatbot(label="Chatbot", likeable=True)
211
  # chatbot.like(vote, None, None)