Hugging Face Spaces — Space status: Running on Zero (ZeroGPU)

Commit: "queue max_size" — Browse files

File changed: app.py
@@ -58,6 +58,21 @@ def send_discord(i,o):
     else:
         print(f"Not sent with {result.status_code}, response:\n{result.json()}")
 
+# Load model
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+tokenizer.pad_token = tokenizer.eos_token
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    device_map="auto",
+    quantization_config=quantization_config,
+    attn_implementation="flash_attention_2",
+)
+
 @spaces.GPU()
 def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
     print('LLL', message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p)
@@ -141,20 +156,7 @@ def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
         run_as_future=True
     )
 
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    quantization_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_compute_dtype=torch.bfloat16
-    )
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    tokenizer.pad_token = tokenizer.eos_token
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID,
-        device_map="auto",
-        quantization_config=quantization_config,
-        attn_implementation="flash_attention_2",
-    )
+
 
 on_load="""
 async()=>{
@@ -203,7 +205,7 @@ with gr.Blocks() as demo:
         js=on_load,
     )
 
-demo.queue().launch()
+demo.queue(max_size=20).launch()
 
 # chatbot = gr.Chatbot(label="Chatbot", likeable=True)
 # chatbot.like(vote, None, None)