Spaces commit be68597: Duplicate from juancopi81/chat-gpt-y
Co-authored-by: Juan Carlos Piñeros <[email protected]>

Files changed:
- .gitattributes +34 -0
- README.md +14 -0
- app.py +188 -0
- requirements.txt +4 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
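These appear to be the stock Git LFS rules that Hugging Face generates for new repos: each pattern routes a binary or serialized-model format (archives, checkpoints, tensors, and so on) through Git LFS instead of storing it directly in the Git history.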
README.md
ADDED
@@ -0,0 +1,14 @@
+---
+title: Chat Gpt Y
+emoji: 🚀
+colorFrom: pink
+colorTo: gray
+sdk: gradio
+sdk_version: 3.16.2
+app_file: app.py
+pinned: false
+license: openrail
+duplicated_from: juancopi81/chat-gpt-y
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
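The YAML front matter is what actually configures the Space: "sdk: gradio" together with "sdk_version: 3.16.2" tells Spaces which runtime to provision, "app_file: app.py" names the entry point, and "duplicated_from" records the Space this one was copied from.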
app.py
ADDED
@@ -0,0 +1,188 @@
+import os
+from ast import literal_eval
+from datasets import load_dataset
+import numpy as np
+import pandas as pd
+
+import openai
+import tiktoken
+from transformers import GPT2TokenizerFast
+import gradio as gr
+
+# Get the API key from the top-right dropdown on the OpenAI website.
+openai.api_key = os.getenv("OPEN_AI_API_KEY")
+
+EMBEDDING_MODEL = "text-embedding-ada-002"
+COMPLETIONS_MODEL = "text-davinci-003"
+MAX_SECTION_LEN = 2000
+COMPLETIONS_API_PARAMS = {
+    # We use a temperature of 0.0 because it gives the most predictable, factual answer.
+    "temperature": 0.0,
+    "max_tokens": 500,
+    "model": COMPLETIONS_MODEL,
+}
+
+hf_ds = "juancopi81/yannic_ada_embeddings"
+tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+
+HEADER = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "This is not covered in my videos." Try imitating the style of the provided context. \n\nContext:\n"""
+RESPONSE_SOURCES = " For more information, check out my following videos: "
+
+# Query separator to help the model distinguish between separate pieces of text.
+SEPARATOR = "\n* "
+ENCODING = "cl100k_base"  # encoding for text-embedding-ada-002
+
+encoding = tiktoken.get_encoding(ENCODING)
+separator_len = len(encoding.encode(SEPARATOR))
+
+print(f"Context separator contains {separator_len} tokens")
+
+# UTILS
+def count_tokens(text: str) -> int:
+    """Count the number of tokens in a string."""
+    return len(tokenizer.encode(text))
+
+def load_embeddings(hf_ds: str) -> dict:
+    """
+    Read the document embeddings and their keys from a Hugging Face dataset.
+
+    hf_ds is the name of the HF dataset with exactly these named columns:
+    "TITLE", "URL", "TRANSCRIPTION", "transcription_length", "text", "ada_embedding"
+    """
+    hf_ds = load_dataset(hf_ds, split="train")
+    hf_ds.set_format("pandas")
+    df = hf_ds[:]
+    df.ada_embedding = df.ada_embedding.apply(literal_eval)
+    df["idx"] = df.index
+    return {
+        (r.idx, r.TITLE, r.URL): r.ada_embedding for _, r in df.iterrows()
+    }
+
+def create_dataframe(hf_ds: str):
+    hf_ds = load_dataset(hf_ds, split="train")
+    hf_ds.set_format("pandas")
+    df = hf_ds[:]
+    df["num_tokens"] = df["text"].map(count_tokens)
+    df["idx"] = df.index
+    df = df.set_index(["idx", "TITLE", "URL"])
+    return df
+
+def get_embedding(text: str, model: str = EMBEDDING_MODEL) -> list:
+    result = openai.Embedding.create(
+        model=model,
+        input=text
+    )
+    return result["data"][0]["embedding"]
+
+def vector_similarity(x: list, y: list) -> float:
+    """
+    Returns the similarity between two vectors.
+
+    Because OpenAI embeddings are normalized to length 1, the cosine similarity is the same as the dot product.
+    """
+    return np.dot(np.array(x), np.array(y))
+
+def order_document_sections_by_query_similarity(query: str, contexts: dict) -> list:
+    """
+    Find the query embedding for the supplied query, and compare it against all of the pre-calculated document embeddings
+    to find the most relevant sections.
+
+    Return the list of document sections, sorted by relevance in descending order.
+    """
+    query_embedding = get_embedding(query)
+
+    document_similarities = sorted([
+        (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
+    ], reverse=True)
+
+    return document_similarities
+
+def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> tuple:
+    """
+    Fetch the most relevant document sections and assemble them, up to MAX_SECTION_LEN tokens, into a prompt.
+    """
+    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)
+
+    chosen_sections = []
+    chosen_sections_len = 0
+    chosen_sections_indexes = []
+
+    for _, section_index in most_relevant_document_sections:
+        # Add contexts until we run out of space.
+        document_section = df.loc[section_index]
+
+        chosen_sections_len += document_section.num_tokens + separator_len
+        if chosen_sections_len > MAX_SECTION_LEN:
+            break
+
+        chosen_sections.append(SEPARATOR + document_section.text.replace("\n", " "))
+        chosen_sections_indexes.append(str(section_index))
+
+    # Useful diagnostic information
+    print(f"Selected {len(chosen_sections)} document sections:")
+    print("\n".join(chosen_sections_indexes))
+
+    header = HEADER
+
+    return (header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:",
+            chosen_sections_indexes)
+
+def answer_query_with_context(
+    query: str,
+    df: pd.DataFrame,
+    document_embeddings: dict,
+    show_prompt: bool = False
+) -> str:
+    prompt, sources = construct_prompt(
+        query,
+        document_embeddings,
+        df
+    )
+
+    if show_prompt:
+        print(prompt)
+
+    response = openai.Completion.create(
+        prompt=prompt,
+        **COMPLETIONS_API_PARAMS
+    )
+    gpt_answer = response["choices"][0]["text"].strip(" \n")
+
+    if gpt_answer != "This is not covered in my videos.":
+        res_sources = RESPONSE_SOURCES
+        for source in sources[:2]:
+            # Each source is the str() of an (idx, TITLE, URL) tuple;
+            # parse it back with literal_eval rather than eval for safety.
+            src_lst = literal_eval(source)
+            title = src_lst[1]
+            url = src_lst[2]
+            if url not in res_sources:
+                final_src = title + " " + url
+                res_sources += " " + final_src
+    else:
+        res_sources = ""
+
+    final_answer = gpt_answer + res_sources
+
+    return final_answer
+
+df = create_dataframe(hf_ds)
+document_embeddings = load_embeddings(hf_ds)
+
+def predict(question, history):
+    history = history or []
+    response = answer_query_with_context(question, df, document_embeddings)
+    history.append((question, response))
+    return history, history
+
+block = gr.Blocks()
+
+with block:
+    gr.Markdown("""<h1><center>Chat with Yannic</center></h1>
+    <p>Each question is independent. You should not base your new questions on the previous conversation.</p>
+    """)
+    chatbot = gr.Chatbot()
+    question = gr.Textbox(placeholder="Enter your question")
+    state = gr.State()
+    submit = gr.Button("SEND")
+    submit.click(predict, inputs=[question, state], outputs=[chatbot, state])
+
+block.launch(debug=True)
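To make the retrieval logic in app.py concrete, here is a self-contained toy sketch, not part of the commit, of its two core steps: ranking sections by dot product (which equals cosine similarity, since ada embeddings are unit-length) and the greedy token-budget loop from construct_prompt. All embeddings, titles, URLs, and token counts below are made up for illustration.

demo_retrieval.py (illustrative only)

import numpy as np

MAX_SECTION_LEN = 12   # tiny budget for the demo; the app uses 2000
separator_len = 2      # stand-in for len(encoding.encode("\n* "))

# Hand-made unit vectors standing in for ada embeddings (made-up data).
contexts = {
    (0, "Video A", "https://example.com/a"): [1.0, 0.0],
    (1, "Video B", "https://example.com/b"): [0.0, 1.0],
    (2, "Video C", "https://example.com/c"): [0.6, 0.8],
}
num_tokens = {0: 5, 1: 5, 2: 5}   # pretend per-section token counts
query_embedding = [0.8, 0.6]      # stand-in for get_embedding(query)

# Same ranking as order_document_sections_by_query_similarity: on unit
# vectors the dot product is the cosine similarity.
ranked = sorted(
    ((np.dot(query_embedding, emb), key) for key, emb in contexts.items()),
    reverse=True,
)

# Same greedy loop as construct_prompt: take sections in relevance order
# until the running token count would exceed the budget.
chosen, used = [], 0
for _, key in ranked:
    used += num_tokens[key[0]] + separator_len
    if used > MAX_SECTION_LEN:
        break
    chosen.append(key)

print([(round(float(sim), 2), key[1]) for sim, key in ranked])
# [(0.96, 'Video C'), (0.8, 'Video A'), (0.6, 'Video B')]
print(chosen)  # [(2, 'Video C', 'https://example.com/c')], only one fits

The same cut-off is why MAX_SECTION_LEN is 2000 in app.py: the chosen context plus the 500-token completion must fit within text-davinci-003's roughly 4,000-token window.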
requirements.txt
ADDED
@@ -0,0 +1,4 @@
+datasets
+openai
+transformers
+tiktoken
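Note that gradio is absent: Spaces installs it from the sdk_version pinned in README.md. None of the four dependencies are pinned either, which is fragile; in particular, openai.Completion.create and openai.Embedding.create exist only in pre-1.0 releases of the openai package, so a constraint like openai<1.0 would be needed for app.py to keep working on later installs.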