Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -2,8 +2,6 @@ import os
 import json
 import re
 from sentence_transformers import SentenceTransformer, CrossEncoder
-from huggingface_hub import hf_hub_download
-from openai import OpenAI
 import hnswlib
 import numpy as np
 from typing import Iterator
@@ -12,8 +10,11 @@ import gradio as gr
 import pandas as pd
 import torch
 
+from easyllm.clients import huggingface
 from transformers import AutoTokenizer
 
+huggingface.prompt_builder = "llama2"
+huggingface.api_key = os.environ["HUGGINGFACE_TOKEN"]
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = 4000
@@ -30,15 +31,11 @@ torch_device = "cuda" if torch.cuda.is_available() else "cpu"
 print("Running on device:", torch_device)
 print("CPU threads:", torch.get_num_threads())
 
+model_id = "meta-llama/Llama-2-70b-chat-hf"
 biencoder = SentenceTransformer("intfloat/e5-large-v2", device=torch_device)
 cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2", max_length=512, device=torch_device)
 
-model_id =
-client = OpenAI(
-    base_url=f"https://api-inference.huggingface.co/models/{model_id}/v1",
-    api_key=os.environ["HUGGINGFACE_TOKEN"],
-)
-tokenizer = AutoTokenizer.from_pretrained(model_id)
+tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=os.environ["HUGGINGFACE_TOKEN"])
 
 
 def create_qa_prompt(query, relevant_chunks):
@@ -104,17 +101,17 @@ def get_completion(
     if system_prompt is not None:
         messages.append({"role": "system", "content": system_prompt})
     messages.append({"role": "user", "content": prompt})
-    response =
+    response = huggingface.ChatCompletion.create(
         model=model,
        messages=messages,
-
+        temperature=temperature,  # this is the degree of randomness of the model's output
        max_tokens=max_new_tokens,  # this is the number of new tokens being generated
-
-
+        top_p=top_p,
+        top_k=top_k,
        stream=stream,
-
+        debug=debug,
     )
-    return response
+    return response["choices"][0]["message"]["content"] if not stream else response
 
 
 # load the index for the Diffusers docs
@@ -190,6 +187,8 @@ DESCRIPTION = """
 # 🧨 Diffusers Docs QA Chatbot 🤗
 """
 
+DESCRIPTION += "This application is almost exactly copied from [smangrul/Diffusers-Docs-QA-Chatbot](https://huggingface.co/spaces/smangrul/Diffusers-Docs-QA-Chatbot).\n Related code: [pacman100/DHS-LLM-Workshop](https://github.com/pacman100/DHS-LLM-Workshop/blob/main/6_Module/)."
+
 LICENSE = """
 <p/>
 
@@ -198,10 +197,6 @@ As a derivate work of [Llama-2-70b-chat](https://huggingface.co/meta-llama/Llama
 this demo is governed by the original [license](https://huggingface.co/spaces/huggingface-projects/llama-2-70b-chat/blob/main/LICENSE.txt) and [acceptable use policy](https://huggingface.co/spaces/huggingface-projects/llama-2-70b-chat/blob/main/USE_POLICY.md).
 """
 
-if not torch.cuda.is_available():
-    DESCRIPTION += "This application is almost exactly copied from [smangrul/PEFT-Docs-QA-Chatbot](https://huggingface.co/spaces/smangrul/PEFT-Docs-QA-Chatbot).\n Related code: [pacman100/DHS-LLM-Workshop](https://github.com/pacman100/DHS-LLM-Workshop/blob/main/6_Module/)."
-
-
 def clear_and_save_textbox(message: str) -> tuple[str, str]:
     return "", message
 
@@ -262,7 +257,7 @@ def generate(
 
     output = ""
     for idx, response in enumerate(generator):
-        token = response
+        token = response["choices"][0]["delta"].get("content", "") or ""
         output += token
         if idx == 0:
             history.append((message, output))
@@ -293,7 +288,7 @@ def check_input_token_length(message: str, chat_history: list[tuple[str, str]],
 )
 
 
-search_index = load_hnsw_index(SEARCH_INDEX)  # create_hnsw_index(EMBEDDINGS_FILE)
+search_index = load_hnsw_index(SEARCH_INDEX)  # create_hnsw_index(EMBEDDINGS_FILE)
 data_df = pd.read_parquet(DOCUMENT_DATASET).reset_index()
 with gr.Blocks(css="style.css") as demo:
     gr.Markdown(DESCRIPTION)
@@ -467,4 +462,4 @@ with gr.Blocks(css="style.css") as demo:
     api_name=False,
 )
 
-demo.queue(max_size=20).launch(debug=True, share=False)
+demo.queue(max_size=20).launch(debug=True, share=False)
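For context, the call pattern app.py moves to in this commit looks like the following minimal sketch. It is assembled from the hunks above; the example messages and sampling values are illustrative placeholders, not values taken from the Space.

```python
import os

from easyllm.clients import huggingface

# Same client setup as the commit: build Llama-2 [INST] prompts from chat
# messages and authenticate against the Hugging Face Inference API.
huggingface.prompt_builder = "llama2"
huggingface.api_key = os.environ["HUGGINGFACE_TOKEN"]

messages = [
    {"role": "system", "content": "Answer questions about the Diffusers docs."},  # illustrative
    {"role": "user", "content": "What does a scheduler do in a pipeline?"},  # illustrative
]

response = huggingface.ChatCompletion.create(
    model="meta-llama/Llama-2-70b-chat-hf",
    messages=messages,
    temperature=0.2,  # illustrative sampling settings
    max_tokens=256,
    stream=True,
)

# Streamed chunks carry incremental deltas; this mirrors the token-extraction
# line added to generate() in the diff.
for chunk in response:
    print(chunk["choices"][0]["delta"].get("content", "") or "", end="")
```

With stream=False, the full answer is instead read from response["choices"][0]["message"]["content"], exactly as the new return line in get_completion does.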