Hugging Face Spaces — Space status: Running on Zero (ZeroGPU)

Commit: "queue max_size" — Browse files

File changed: app.py
@@ -58,6 +58,21 @@ def send_discord(i,o):
     else:
         print(f"Not sent with {result.status_code}, response:\n{result.json()}")
 
+# Load model
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+tokenizer.pad_token = tokenizer.eos_token
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    device_map="auto",
+    quantization_config=quantization_config,
+    attn_implementation="flash_attention_2",
+)
+
 @spaces.GPU()
 def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
     print('LLL', message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p)
@@ -141,20 +156,7 @@ def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
         run_as_future=True
     )
 
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    quantization_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_compute_dtype=torch.bfloat16
-    )
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    tokenizer.pad_token = tokenizer.eos_token
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID,
-        device_map="auto",
-        quantization_config=quantization_config,
-        attn_implementation="flash_attention_2",
-    )
+
 
 on_load="""
 async()=>{
@@ -203,7 +205,7 @@ with gr.Blocks() as demo:
         js=on_load,
     )
 
-demo.queue().launch()
+demo.queue(max_size=20).launch()
 
 # chatbot = gr.Chatbot(label="Chatbot", likeable=True)
 # chatbot.like(vote, None, None)