Spaces: Running on Zero
optimize
app.py CHANGED
```diff
@@ -74,9 +74,37 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 
 @spaces.GPU()
+def generate(instruction, stop_tokens, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    enc = tokenizer([instruction], return_tensors="pt", padding=True, truncation=True)
+    input_ids, attention_mask = enc.input_ids, enc.attention_mask
+
+    if input_ids.shape[1] > CONTEXT_LENGTH:
+        input_ids = input_ids[:, -CONTEXT_LENGTH:]
+
+    generate_kwargs = dict(
+        {"input_ids": input_ids.to(device), "attention_mask": attention_mask.to(device)},
+        streamer=streamer,
+        do_sample=True if temperature else False,
+        temperature=temperature,
+        max_new_tokens=max_new_tokens,
+        top_k=top_k,
+        repetition_penalty=repetition_penalty,
+        top_p=top_p
+    )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+    outputs = []
+    for new_token in streamer:
+        outputs.append(new_token)
+        if new_token in stop_tokens:
+            break
+        yield "".join(outputs)
+
+
 def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
     repetition_penalty=float(repetition_penalty)
-    print('LLL', message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p)
+    print('LLL', [message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p])
     # Format history with a given chat template
     if CHAT_TEMPLATE == "ChatML":
         stop_tokens = ["<|endoftext|>", "<|im_end|>"]
```
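The new `generate` wrapper is where the `@spaces.GPU()` decorator now lands: it tokenizes the prompt, truncates to `CONTEXT_LENGTH`, runs `model.generate` on a background `Thread`, and yields the accumulated text from a `TextIteratorStreamer`. A minimal sketch of exercising it directly, assuming the Space's globals (`model`, `tokenizer`, `device`, `CONTEXT_LENGTH`) are already loaded; the prompt and sampling values here are illustrative:

```python
# Hypothetical smoke test for the refactored generator (not part of the commit).
prompt = "<|im_start|>user\nHello!<|im_end|>\n<|im_start|>assistant\n"
stop = ["<|endoftext|>", "<|im_end|>"]

for partial in generate(prompt, stop, temperature=0.7, max_new_tokens=64,
                        top_k=40, repetition_penalty=1.1, top_p=0.95):
    print(partial)  # each item is the full completion so far, not a single token
```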
```diff
@@ -103,33 +131,10 @@ def predict(message, history, system_prompt, temperature, max_new_tokens, top_k,
         raise Exception("Incorrect chat template, select 'ChatML' or 'Mistral Instruct'")
     print(instruction)
 
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-    enc = tokenizer([instruction], return_tensors="pt", padding=True, truncation=True)
-    input_ids, attention_mask = enc.input_ids, enc.attention_mask
-
-    if input_ids.shape[1] > CONTEXT_LENGTH:
-        input_ids = input_ids[:, -CONTEXT_LENGTH:]
-
-    generate_kwargs = dict(
-        {"input_ids": input_ids.to(device), "attention_mask": attention_mask.to(device)},
-        streamer=streamer,
-        do_sample=True if temperature else False,
-        temperature=temperature,
-        max_new_tokens=max_new_tokens,
-        top_k=top_k,
-        repetition_penalty=repetition_penalty,
-        top_p=top_p
-    )
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
-    outputs = []
-    for new_token in streamer:
-        outputs.append(new_token)
-        if new_token in stop_tokens:
-            break
-        yield "".join(outputs)
+    for output_text in generate(instruction, stop_tokens, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
+        yield output_text
 
-    send_discord(instruction,
+    send_discord(instruction, output_text)
 
 
     hfapi = HfApi()
```
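`predict` now simply re-yields whatever `generate` streams. The `send_discord(instruction, output_text)` call (and the log dict below) leans on a Python scoping detail: a `for` loop's variable outlives the loop, so `output_text` still holds the last streamed string afterwards. A small illustration:

```python
# A for-loop variable persists after the loop; predict() uses this to log
# the final streamed output. (If the generator yielded nothing, output_text
# would be unbound and raise NameError.)
def stream():
    yield "Hel"
    yield "Hello"

for output_text in stream():
    pass

print(output_text)  # -> "Hello", the last value yielded
```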
```diff
@@ -145,7 +150,7 @@ def predict(message, history, system_prompt, temperature, max_new_tokens, top_k,
         'repetition_penalty':repetition_penalty,
         'top_p':top_p,
         'instruction':instruction,
-        'output':
+        'output':output_text,
         'precision': 'auto '+str(model.dtype),
     }
     hfapi.upload_file(
```
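The logged payload's `'output'` key now records that final `output_text`. The hunk cuts off at `hfapi.upload_file(`, so the exact upload call isn't shown; for context, a hedged sketch of pushing such a dict with `huggingface_hub` (the repo id, path, and the `logs` name are placeholders):

```python
import io
import json

from huggingface_hub import HfApi

hfapi = HfApi()
logs = {"instruction": "...", "output": "...", "precision": "auto torch.bfloat16"}

# upload_file accepts an in-memory file object, so no temp file is needed.
hfapi.upload_file(
    path_or_fileobj=io.BytesIO(json.dumps(logs, indent=2).encode("utf-8")),
    path_in_repo="logs/example.json",  # placeholder path
    repo_id="user/chat-logs",          # placeholder dataset repo
    repo_type="dataset",
)
```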
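Net effect of the commit: `@spaces.GPU()` now decorates only the `generate` generator rather than the whole `predict` callback, so the ZeroGPU slot is presumably held just for the token-generation loop and is free again before the Discord and Hub logging run. A stripped-down sketch of the pattern, with illustrative names and elided bodies:

```python
import spaces

@spaces.GPU()  # ZeroGPU slot is requested only while this generator runs
def generate(prompt):
    ...        # tokenize, launch model.generate in a thread, yield text

def predict(message, history):      # plain CPU function, no decorator
    for text in generate(message):  # GPU allocated for the stream...
        yield text
    log_results(message, text)      # ...and released before this hypothetical logging helper runs
```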