juancopi81 committed
Commit be68597 (0 parents)

Duplicate from juancopi81/chat-gpt-y


Co-authored-by: Juan Carlos Piñeros <[email protected]>

Files changed (4)
  1. .gitattributes +34 -0
  2. README.md +14 -0
  3. app.py +188 -0
  4. requirements.txt +4 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Chat Gpt Y
+ emoji: 🚀
+ colorFrom: pink
+ colorTo: gray
+ sdk: gradio
+ sdk_version: 3.16.2
+ app_file: app.py
+ pinned: false
+ license: openrail
+ duplicated_from: juancopi81/chat-gpt-y
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,188 @@
+ import os
+ from ast import literal_eval
+ from datasets import load_dataset
+ import numpy as np
+ import pandas as pd
+
+ import openai
+ import tiktoken
+ from transformers import GPT2TokenizerFast
+ import gradio as gr
+
+ # get API key from top-right dropdown on OpenAI website
+ openai.api_key = os.getenv("OPEN_AI_API_KEY")
+
+ EMBEDDING_MODEL = "text-embedding-ada-002"
+ COMPLETIONS_MODEL = "text-davinci-003"
+ MAX_SECTION_LEN = 2000
+ COMPLETIONS_API_PARAMS = {
+     # We use temperature of 0.0 because it gives the most predictable, factual answer.
+     "temperature": 0.0,
+     "max_tokens": 500,
+     "model": COMPLETIONS_MODEL,
+ }
+
+ hf_ds = "juancopi81/yannic_ada_embeddings"
+ tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+
+ HEADER = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "This is not covered in my videos." Try imitating the style of the provided context. \n\nContext:\n"""
+ RESPONSE_SOURCES = " For more information, check out my following videos: "
+
+ # query separator to help the model distinguish between separate pieces of text.
+ SEPARATOR = "\n* "
+ ENCODING = "cl100k_base"  # encoding for text-embedding-ada-002
+
+ encoding = tiktoken.get_encoding(ENCODING)
+ separator_len = len(encoding.encode(SEPARATOR))
+
+ print(f"Context separator contains {separator_len} tokens")
+
+ # UTILS
+ def count_tokens(text: str) -> int:
+     """count the number of tokens in a string"""
+     return len(tokenizer.encode(text))
+
+ def load_embeddings(hf_ds: str) -> dict:
+     """
+     Read the document embeddings and their keys from a HuggingFace dataset.
+
+     hf_ds is the name of the HF dataset with exactly these named columns:
+     "TITLE", "URL", "TRANSCRIPTION", "transcription_length", "text", "ada_embedding"
+     """
+     hf_ds = load_dataset(hf_ds, split="train")
+     hf_ds.set_format("pandas")
+     df = hf_ds[:]
+     df.ada_embedding = df.ada_embedding.apply(literal_eval)
+     df["idx"] = df.index
+     return {
+         (r.idx, r.TITLE, r.URL): r.ada_embedding for idx, r in df.iterrows()
+     }
+
+ def create_dataframe(hf_ds: str):
+     hf_ds = load_dataset(hf_ds, split="train")
+     hf_ds.set_format("pandas")
+     df = hf_ds[:]
+     df["num_tokens"] = df["text"].map(count_tokens)
+     df["idx"] = df.index
+     df = df.set_index(["idx", "TITLE", "URL"])
+     return df
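load_embeddings and create_dataframe build two parallel views of the same dataset: the first keys each row's ada_embedding by the (idx, TITLE, URL) tuple, the second sets that same tuple as the DataFrame index, so a similarity hit can be looked up directly with df.loc. A minimal sketch of that contract (it downloads the dataset and assumes the column names listed in the docstring; 1536 is the dimension of ada-002 embeddings):

# Sketch: the embedding dict and the dataframe share the same (idx, TITLE, URL) keys.
embeddings = load_embeddings("juancopi81/yannic_ada_embeddings")
sections_df = create_dataframe("juancopi81/yannic_ada_embeddings")

some_key = next(iter(embeddings))            # e.g. (0, "<video title>", "<video url>")
assert some_key in sections_df.index         # same tuple indexes both structures
print(sections_df.loc[some_key].num_tokens)  # token count of that transcript chunk
print(len(embeddings[some_key]))             # 1536 for text-embedding-ada-002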
+
+ def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list:
+     result = openai.Embedding.create(
+         model=model,
+         input=text
+     )
+     return result["data"][0]["embedding"]
+
+ def vector_similarity(x: list, y: list) -> float:
+     """
+     Returns the similarity between two vectors.
+
+     Because OpenAI Embeddings are normalized to length 1, the cosine similarity is the same as the dot product.
+     """
+     return np.dot(np.array(x), np.array(y))
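The docstring's point is worth spelling out: because the embeddings come back with unit norm, the dot product above already is the cosine similarity, so no extra normalization is needed. A standalone check of that identity (plain numpy, no API call, not part of the commit):

import numpy as np

a = np.array([3.0, 4.0]); a /= np.linalg.norm(a)   # make both vectors unit length
b = np.array([1.0, 2.0]); b /= np.linalg.norm(b)
cosine = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
assert np.isclose(np.dot(a, b), cosine)            # dot product equals cosine similarity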
+
+ def order_document_sections_by_query_similarity(query: str, contexts: dict) -> list:
+     """
+     Find the query embedding for the supplied query, and compare it against all of the pre-calculated document embeddings
+     to find the most relevant sections.
+
+     Return the list of document sections, sorted by relevance in descending order.
+     """
+     query_embedding = get_embedding(query)
+
+     document_similarities = sorted([
+         (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
+     ], reverse=True)
+
+     return document_similarities
+
+ def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> tuple:
+     """
+     Fetch the most relevant document sections for the question and assemble the prompt within the token budget.
+     """
+     most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)
+
+     chosen_sections = []
+     chosen_sections_len = 0
+     chosen_sections_indexes = []
+
+     for _, section_index in most_relevant_document_sections:
+         # Add contexts until we run out of space.
+         document_section = df.loc[section_index]
+
+         chosen_sections_len += document_section.num_tokens + separator_len
+         if chosen_sections_len > MAX_SECTION_LEN:
+             break
+
+         chosen_sections.append(SEPARATOR + document_section.text.replace("\n", " "))
+         chosen_sections_indexes.append(str(section_index))
+
+     # Useful diagnostic information
+     print(f"Selected {len(chosen_sections)} document sections:")
+     print("\n".join(chosen_sections_indexes))
+
+     header = HEADER
+
+     return (header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:",
+             chosen_sections_indexes)
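The tuple returned above is just (prompt, section indexes): the fixed HEADER, the selected transcript chunks each prefixed by the "\n* " separator, and the question. Roughly, the assembled prompt has this shape (chunk texts and question are invented for illustration):

# Illustrative shape of the prompt built by construct_prompt (contents made up).
example_prompt = (
    HEADER
    + "\n* First transcript chunk, flattened onto one line..."
    + "\n* Second transcript chunk, also one line..."
    + "\n\n Q: What does the video say about attention?"
    + "\n A:"
)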
+
+ def answer_query_with_context(
+     query: str,
+     df: pd.DataFrame,
+     document_embeddings: dict,
+     show_prompt: bool = False
+ ) -> str:
+     prompt, sources = construct_prompt(
+         query,
+         document_embeddings,
+         df
+     )
+
+     if show_prompt:
+         print(prompt)
+
+     response = openai.Completion.create(
+         prompt=prompt,
+         **COMPLETIONS_API_PARAMS
+     )
+     gpt_answer = response["choices"][0]["text"].strip(" \n")
+
+     if gpt_answer != "This is not covered in my videos.":
+         res_sources = RESPONSE_SOURCES
+         for source in sources[:2]:
+             # Each source is the stringified (idx, TITLE, URL) tuple from construct_prompt.
+             src_lst = eval(source)
+             title = "".join(src_lst[1])
+             url = "".join(src_lst[2])
+             if url not in res_sources:
+                 final_src = title + " " + url
+                 res_sources += " " + final_src
+     else:
+         res_sources = ""
+
+     final_answer = gpt_answer + res_sources
+
+     return final_answer
+
+ df = create_dataframe(hf_ds)
+ document_embeddings = load_embeddings(hf_ds)
+
+ def predict(question, history):
+     history = history or []
+     response = answer_query_with_context(question, df, document_embeddings)
+     history.append((question, response))
+     return history, history
+
+ block = gr.Blocks()
+
+ with block:
+     gr.Markdown("""<h1><center>Chat with Yannic</center></h1>
+     <p>Each question is independent. You should not base new questions on the previous conversation.</p>
+     """)
+     chatbot = gr.Chatbot()
+     question = gr.Textbox(placeholder="Enter your question")
+     state = gr.State()
+     submit = gr.Button("SEND")
+     submit.click(predict, inputs=[question, state], outputs=[chatbot, state])
+
+ block.launch(debug=True)
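For debugging outside the Gradio UI, the retrieval pipeline can be called directly once the key the app reads is set; a minimal sketch (the question is invented, everything else is defined above):

# Sketch: run after `export OPEN_AI_API_KEY="sk-..."` in the shell.
answer = answer_query_with_context(
    "What does Yannic say about transformers?",   # hypothetical question
    df,
    document_embeddings,
    show_prompt=True,                             # also prints the assembled prompt
)
print(answer)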
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ datasets
+ openai
+ transformers
+ tiktoken
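app.py also imports numpy, pandas and gradio. On a Space with sdk: gradio the SDK supplies gradio, and datasets pulls in numpy and pandas, so the short list above works there; for a plain local run, a fuller (unpinned) list would be a reasonable starting point, for example:

datasets
openai
transformers
tiktoken
gradio
numpy
pandas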