jeevan committed on
Commit bc453aa
0 Parent(s):
Files changed (9)
  1. .chainlit/config.toml +84 -0
  2. .gitignore +8 -0
  3. Dockerfile +11 -0
  4. app.py +271 -0
  5. chainlit.md +3 -0
  6. embedding_model.py +58 -0
  7. pdfloader.py +27 -0
  8. pre-processing.ipynb +595 -0
  9. requirements.txt +15 -0
.chainlit/config.toml ADDED
@@ -0,0 +1,84 @@
+ [project]
+ # Whether to enable telemetry (default: true). No personal data is collected.
+ enable_telemetry = true
+
+ # List of environment variables to be provided by each user to use the app.
+ user_env = []
+
+ # Duration (in seconds) during which the session is saved when the connection is lost
+ session_timeout = 3600
+
+ # Enable third parties caching (e.g LangChain cache)
+ cache = false
+
+ # Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
+ # follow_symlink = false
+
+ [features]
+ # Show the prompt playground
+ prompt_playground = true
+
+ # Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
+ unsafe_allow_html = false
+
+ # Process and display mathematical expressions. This can clash with "$" characters in messages.
+ latex = false
+
+ # Authorize users to upload files with messages
+ multi_modal = true
+
+ # Allows user to use speech to text
+ [features.speech_to_text]
+ enabled = false
+ # See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
+ # language = "en-US"
+
+ [UI]
+ # Name of the app and chatbot.
+ name = "Chatbot"
+
+ # Show the readme while the conversation is empty.
+ show_readme_as_default = true
+
+ # Description of the app and chatbot. This is used for HTML tags.
+ # description = ""
+
+ # Large size content are by default collapsed for a cleaner ui
+ default_collapse_content = true
+
+ # The default value for the expand messages settings.
+ default_expand_messages = false
+
+ # Hide the chain of thought details from the user in the UI.
+ hide_cot = false
+
+ # Link to your github repo. This will add a github button in the UI's header.
+ # github = ""
+
+ # Specify a CSS file that can be used to customize the user interface.
+ # The CSS file can be served from the public directory or via an external link.
+ # custom_css = "/public/test.css"
+
+ # Override default MUI light theme. (Check theme.ts)
+ [UI.theme.light]
+ #background = "#FAFAFA"
+ #paper = "#FFFFFF"
+
+ [UI.theme.light.primary]
+ #main = "#F80061"
+ #dark = "#980039"
+ #light = "#FFE7EB"
+
+ # Override default MUI dark theme. (Check theme.ts)
+ [UI.theme.dark]
+ #background = "#FAFAFA"
+ #paper = "#FFFFFF"
+
+ [UI.theme.dark.primary]
+ #main = "#F80061"
+ #dark = "#980039"
+ #light = "#FFE7EB"
+
+
+ [meta]
+ generated_by = "0.7.700"
.gitignore ADDED
@@ -0,0 +1,8 @@
+ /venv
+ __pycache__/*
+ .env
+ download-hf-model.ipynb
+ temp
+ start_qdrant_services.sh
+ requirements copy.txt
+ Dockerfile copy
Dockerfile ADDED
@@ -0,0 +1,11 @@
+ FROM python:3.11.9
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+ WORKDIR $HOME/app
+ COPY --chown=user . $HOME/app
+ COPY ./requirements.txt ~/app/requirements.txt
+ RUN pip install -r requirements.txt
+ COPY . .
+ CMD ["chainlit", "run", "app.py", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,271 @@
+ import os
+ from typing import List
+ import uuid
+ import chainlit as cl
+ from chainlit.types import AskFileResponse
+ from langchain.memory import ConversationBufferMemory
+ from langchain_core.chat_history import BaseChatMessageHistory
+ from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
+ from langchain.prompts import MessagesPlaceholder
+ from langchain.prompts import ChatPromptTemplate
+ from langchain_community.chat_message_histories import ChatMessageHistory
+ from langchain.chains.history_aware_retriever import create_history_aware_retriever
+ from langchain.chains.retrieval import create_retrieval_chain
+ from langchain.chains.combine_documents import create_stuff_documents_chain
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_experimental.text_splitter import SemanticChunker
+ from langchain_qdrant import QdrantVectorStore
+ from langchain_core.documents import Document
+ from qdrant_client import QdrantClient
+ from qdrant_client.http.models import Distance, VectorParams
+ from langchain_openai import ChatOpenAI
+ from embedding_model import get_embeddings_openai_text_3_large, get_embeddings_snowflake_arctic_embed_l
+ from pdfloader import PDFLoaderWrapper
+ from langchain_core.runnables.history import RunnableWithMessageHistory
+ from chainlit.input_widget import Select, Switch, Slider
+ from dotenv import load_dotenv
+ from langchain_huggingface import HuggingFaceEmbeddings
+
+ load_dotenv()
+
+ BOR_FILE_PATH = "https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf"
+ NIST_FILE_PATH = "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf"
+ SMALL_DOC = "https://arxiv.org/pdf/1908.10084"  # 11 pages: Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks
+ documents_to_preload = [
+     BOR_FILE_PATH,
+     NIST_FILE_PATH
+     # SMALL_DOC
+ ]
+ collection_name = "ai-safety"
+
+ welcome_message = """
+ Welcome to the chatbot that clarifies all your AI Safety related queries.
+ Now preloading the documents below:
+ 1. Blueprint for an AI Bill of Rights
+ 2. NIST AI Standards
+ Please wait a moment while the documents are loaded.
+ """
+ chat_model_name = "gpt-4o"
+ embedding_model_name = "Snowflake/snowflake-arctic-embed-l"
+ chat_model = ChatOpenAI(model=chat_model_name, temperature=0)
+
+ async def connect_to_qdrant():
+     embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
+     qdrant_url = os.environ["QDRANT_URL"]
+     qdrant_api_key = os.environ["QDRANT_API_KEY"]
+     collection_name = os.environ["COLLECTION_NAME"]
+     qdrant_client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
+     vector_store = QdrantVectorStore(
+         client=qdrant_client,
+         collection_name=collection_name,
+         embedding=embedding_model,
+     )
+     return vector_store.as_retriever()
+
+ def initialize_vectorstore(
+     collection_name: str,
+     embedding_model,
+     dimension,
+     distance_metric: Distance = Distance.COSINE,
+ ):
+     client = QdrantClient(":memory:")
+     client.create_collection(
+         collection_name=collection_name,
+         vectors_config=VectorParams(size=dimension, distance=distance_metric),
+     )
+
+     vector_store = QdrantVectorStore(
+         client=client,
+         collection_name=collection_name,
+         embedding=embedding_model,
+     )
+     return vector_store
+
+ def get_text_splitter(strategy, embedding_model):
+     if strategy == "semantic":
+         return SemanticChunker(
+             embedding_model,
+             breakpoint_threshold_type="percentile",
+             breakpoint_threshold_amount=90,
+         )
+
+ def process_file(file: AskFileResponse, text_splitter):
+     if file.type == "text/plain":
+         Loader = TextLoader
+     elif file.type == "application/pdf":
+         Loader = PyMuPDFLoader
+
+     loader = Loader(file.path)
+     documents = loader.load()
+     title = documents[0].metadata.get("title")
+     docs = text_splitter.split_documents(documents)
+     for i, doc in enumerate(docs):
+         doc.metadata["source"] = f"source_{i}"
+         doc.metadata["title"] = title
+     return docs
+
+ def populate_vectorstore(vector_store, docs: List[Document]):
+     vector_store.add_documents(docs)
+     return vector_store
+
+ def create_history_aware_retriever_self(chat_model, retriever):
+     contextualize_q_system_prompt = (
+         "Given a chat history and the latest user question which might reference context in the chat history, "
+         "formulate a standalone question which can be understood without the chat history. Do NOT answer the question, "
+         "just reformulate it if needed and otherwise return it as is."
+     )
+     contextualize_q_prompt = ChatPromptTemplate.from_messages(
+         [
+             ("system", contextualize_q_system_prompt),
+             MessagesPlaceholder("chat_history"),
+             ("human", "{input}"),
+         ]
+     )
+     return create_history_aware_retriever(chat_model, retriever, contextualize_q_prompt)
+
+ def create_qa_chain(chat_model):
+     qa_system_prompt = (
+         "You are a helpful assistant named 'Shield' and your task is to answer any questions related to AI Safety for the given context. "
+         "Use the following pieces of retrieved context to answer the question. "
+         # "If any questions asked outside AI Safety context, just say that you are a specialist in AI Safety and can't answer that."
+         # f"When introducing you, just say that you are an AI assistant powered by embedding model {embedding_model_name} and chat model {chat_model_name} and your knowledge is limited to 'Blueprint for an AI Bill of Rights' and 'NIST AI Standards' documents."
+         "If you don't know the answer, just say that you don't know.\n\n"
+         "{context}"
+     )
+     qa_prompt = ChatPromptTemplate.from_messages(
+         [
+             ("system", qa_system_prompt),
+             MessagesPlaceholder("chat_history"),
+             ("human", "{input}"),
+         ]
+     )
+     return create_stuff_documents_chain(chat_model, qa_prompt)
+
+
+ def create_rag_chain(chat_model, retriever):
+     history_aware_retriever = create_history_aware_retriever_self(chat_model, retriever)
+     question_answer_chain = create_qa_chain(chat_model)
+     return create_retrieval_chain(history_aware_retriever, question_answer_chain)
+
+
+ def create_session_id():
+     session_id = str(uuid.uuid4())
+     return session_id
+
+
+ @cl.on_chat_start
+ async def start():
+     # cl.user_session.set("memory", conversation_buffer_memory)
+     msg = cl.Message(content=welcome_message)
+     await msg.send()
+
+     # Create a session id
+     session_id = create_session_id()
+     cl.user_session.set("session_id", session_id)
+
+     # Preserve chat history
+     conversation_buffer_memory = ConversationBufferMemory(
+         memory_key="chat_history",
+         output_key="answer",
+         chat_memory=ChatMessageHistory(),
+         return_messages=True,
+     )
+
+     # todo: if the logged-in user is an admin, allow them to upload new pdfs.
+
+     # # Embedding model
+     # # embedding_model, dimension = get_embeddings_openai_text_3_large()
+     # embedding_model, dimension = get_embeddings_snowflake_arctic_embed_l()
+     # msg.content = "Embedding model loaded"
+     # await msg.update()
+     # cl.user_session.set("embedding_model", embedding_model)
+     # cl.user_session.set("dimension", dimension)
+
+     # # Pdf loader
+     # pdf_loader = PDFLoaderWrapper(
+     #     documents_to_preload, PDFLoaderWrapper.LoaderType.PYMUPDF
+     # )
+     # msg.content = "Embedding model loaded"
+     # await msg.update()
+     # cl.user_session.set("pdf_loader", pdf_loader)
+     # documents = await pdf_loader.aload()
+
+     # text_splitter = get_text_splitter("semantic", embedding_model)
+
+     # chunked_docs = text_splitter.split_documents(documents)
+
+     # vector_store = initialize_vectorstore(
+     #     collection_name, embedding_model, dimension=dimension
+     # )
+
+     # vector_store = populate_vectorstore(vector_store, chunked_docs)
+
+     retriever = await connect_to_qdrant()
+
+     rag_chain = create_rag_chain(chat_model, retriever)
+
+     store = {}
+
+     def get_session_history(session_id: str) -> BaseChatMessageHistory:
+         if session_id not in store:
+             store[session_id] = ChatMessageHistory()
+         return store[session_id]
+
+     conversational_rag_chain = RunnableWithMessageHistory(
+         rag_chain,
+         get_session_history,
+         input_messages_key="input",
+         history_messages_key="chat_history",
+         output_messages_key="answer",
+     )
+
+     # Let the user know that the system is ready
+     msg.content = msg.content + "\nReady to answer your questions!"
+     await msg.update()
+
+     cl.user_session.set("conversational_rag_chain", conversational_rag_chain)
+
+
+ @cl.on_message
+ async def main(message: cl.Message):
+     session_id = cl.user_session.get("session_id")
+     conversational_rag_chain = cl.user_session.get("conversational_rag_chain")
+
+     response = await conversational_rag_chain.ainvoke(
+         {"input": message.content},
+         config={"configurable": {"session_id": session_id},
+                 "callbacks": [cl.AsyncLangchainCallbackHandler()]},
+     )
+     answer = response["answer"]
+
+     source_documents = response["context"]
+     text_elements = []
+     unique_pages = set()
+
+     if source_documents:
+
+         for source_idx, source_doc in enumerate(source_documents):
+             source_name = f"source_{source_idx+1}"
+             page_number = source_doc.metadata['page']
+             # page_number = source_doc.metadata.get('page', "NA")  # NA or any default value
+             page = f"Page {page_number}"
+             text_element_content = source_doc.page_content
+             text_element_content = text_element_content if text_element_content != "" else "No Content"
+             # text_elements.append(cl.Text(content=text_element_content, name=source_name))
+             if page not in unique_pages:
+                 unique_pages.add(page)
+                 text_elements.append(cl.Text(content=text_element_content, name=page))
+                 # text_elements.append(cl.Text(content=text_element_content, name=page))
+         source_names = [text_el.name for text_el in text_elements]
+
+         if source_names:
+             answer += f"\n\n Sources: {', '.join(source_names)}"
+         else:
+             answer += "\n\n No sources found"
+
+     await cl.Message(content=answer, elements=text_elements).send()
+
+ if __name__ == "__main__":
+     from chainlit.cli import run_chainlit
+
+     run_chainlit(__file__)
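
The on_chat_start handler above assumes an already-populated Qdrant collection. For orientation, here is a minimal sketch (not part of the commit) of driving the same chain outside Chainlit; it assumes the environment variables read by connect_to_qdrant (QDRANT_URL, QDRANT_API_KEY, COLLECTION_NAME) plus OPENAI_API_KEY are set, and the question string is only an example.

# Sketch: exercise the RAG chain from app.py without the Chainlit UI.
import asyncio

from app import connect_to_qdrant, create_rag_chain, chat_model

async def ask(question: str) -> str:
    # Retriever over the Qdrant collection named by COLLECTION_NAME.
    retriever = await connect_to_qdrant()
    rag_chain = create_rag_chain(chat_model, retriever)
    # create_retrieval_chain expects "input" and "chat_history" and returns
    # a dict containing the "answer" and the retrieved "context" documents.
    response = await rag_chain.ainvoke({"input": question, "chat_history": []})
    return response["answer"]

if __name__ == "__main__":
    print(asyncio.run(ask("What does the Blueprint for an AI Bill of Rights cover?")))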
chainlit.md ADDED
@@ -0,0 +1,3 @@
+ # Welcome to Chat with Your Text File
+
+ With this application, you can chat with an uploaded text file that is smaller than 2MB!
embedding_model.py ADDED
@@ -0,0 +1,58 @@
+
+ import tiktoken
+ import os
+ from langchain_openai import OpenAIEmbeddings
+ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+ import torch
+ from transformers import AutoModel, AutoTokenizer
+ from transformers import AutoModel, AutoTokenizer
+ from langchain_huggingface import HuggingFaceEmbeddings
+
+ # def get_embeddings_model_bge_base_en_v1_5():
+ #     model_name = "BAAI/bge-base-en-v1.5"
+ #     model_kwargs = {'device': 'cpu'}
+ #     encode_kwargs = {'normalize_embeddings': False}
+ #     embedding_model = HuggingFaceBgeEmbeddings(
+ #         model_name=model_name,
+ #         model_kwargs=model_kwargs,
+ #         encode_kwargs=encode_kwargs
+ #     )
+ #     return embedding_model
+
+ # def get_embeddings_model_bge_en_icl():
+ #     model_name = "BAAI/bge-en-icl"
+ #     model_kwargs = {'device': 'cpu'}
+ #     encode_kwargs = {'normalize_embeddings': False}
+ #     embedding_model = HuggingFaceBgeEmbeddings(
+ #         model_name=model_name,
+ #         model_kwargs=model_kwargs,
+ #         encode_kwargs=encode_kwargs
+ #     )
+ #     return embedding_model, 4096
+
+ # def get_embeddings_model_bge_large_en():
+ #     model_name = "BAAI/bge-large-en"
+ #     model_kwargs = {'device': 'cpu'}
+ #     encode_kwargs = {'normalize_embeddings': False}
+ #     embedding_model = HuggingFaceBgeEmbeddings(
+ #         model_name=model_name,
+ #         model_kwargs=model_kwargs,
+ #         encode_kwargs=encode_kwargs
+ #     )
+ #     return embedding_model
+
+ def get_embeddings_openai_text_3_large():
+     embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
+     dimension = 3072
+     return embedding_model, dimension
+
+ # def get_embeddings_snowflake_arctic_embed_l():
+ #     current_dir = os.path.dirname(os.path.realpath(__file__))
+ #     model_name = "Snowflake/snowflake-arctic-embed-l"
+ #     tokenizer = AutoTokenizer.from_pretrained(f"{current_dir}/cache/tokenizer/{model_name}")
+ #     model = AutoModel.from_pretrained(f"{current_dir}/cache/model/{model_name}")
+ #     return model, 1024
+
+ def get_embeddings_snowflake_arctic_embed_l():
+     embedding_model = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-l")
+     return embedding_model, 1024
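
A small usage sketch (not part of the commit) for the active helper, assuming the Snowflake/snowflake-arctic-embed-l weights can be downloaded; the returned dimension is the value the rest of the repo passes to VectorParams(size=...).

# Sketch: embed a query and confirm the reported dimension.
from embedding_model import get_embeddings_snowflake_arctic_embed_l

embedding_model, dimension = get_embeddings_snowflake_arctic_embed_l()
vector = embedding_model.embed_query("What rights does the AI Bill of Rights blueprint describe?")
# The vector length should match the dimension used when creating the Qdrant collection.
assert len(vector) == dimension == 1024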
pdfloader.py ADDED
@@ -0,0 +1,27 @@
+ from enum import Enum
+ from typing import List
+ from langchain_community.document_loaders import PyMuPDFLoader
+ from langchain_core.documents import Document
+ import asyncio
+
+ class PDFLoaderWrapper():
+     class LoaderType(str, Enum):
+         PYMUPDF = "pymupdf"
+
+     def __init__(self, file_path: str | List[str], loader_type: LoaderType = LoaderType.PYMUPDF):
+         self.file_path = file_path if isinstance(file_path, list) else [file_path]
+         self.loader_type = loader_type
+
+     async def aload(self) -> List[Document]:
+         all_docs = []
+         for file_path in self.file_path:
+             if self.loader_type == self.LoaderType.PYMUPDF:
+                 try:
+                     loader = PyMuPDFLoader(file_path)
+                     docs = await loader.aload()
+                     all_docs.extend(docs)
+                 except Exception as e:
+                     print(f"Error loading file {file_path}: {e}")
+                     continue
+         return all_docs
+
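
An illustrative sketch (not part of the commit) of how PDFLoaderWrapper is used, mirroring the commented-out preload path in app.py and the notebook; the URLs are the two documents this repo preloads.

# Sketch: load the two preloaded PDFs and count the per-page Documents.
import asyncio
from pdfloader import PDFLoaderWrapper

urls = [
    "https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf",
    "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf",
]
loader = PDFLoaderWrapper(urls, PDFLoaderWrapper.LoaderType.PYMUPDF)
documents = asyncio.run(loader.aload())  # PyMuPDFLoader yields one Document per page
print(len(documents))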
pre-processing.ipynb ADDED
@@ -0,0 +1,595 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "% pip install numpy==1.26.4 \\\n",
+ "openai==1.44.1 \\\n",
+ "qdrant-client==1.11.2 \\\n",
+ "langchain==0.3.0 \\\n",
+ "langchain-text-splitters==0.3.0 \\\n",
+ "langchain-community==0.3.0 \\\n",
+ "langchain_experimental \\\n",
+ "langchain_qdrant \\\n",
+ "langchain_openai \\\n",
+ "pypdf==4.3.1 \\\n",
+ "PyMuPDF==1.24.10 \\\n",
+ "pymupdf4llm \\\n",
+ "sentence_transformers \\\n",
+ "langchain_huggingface "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "BOR_FILE_PATH = \"https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf\"\n",
+ "NIST_FILE_PATH = \"https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf\"\n",
+ "SMALL_DOC = \"https://arxiv.org/pdf/1908.10084\" \n",
+ "documents_to_preload = [\n",
+ " BOR_FILE_PATH,\n",
+ " NIST_FILE_PATH\n",
+ " # SMALL_DOC\n",
+ "]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jeevan/Documents/Learnings/ai-engineering-bootcamp/AIE4/AIE4/mid-term/ai-safety-chatapp/venv/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:13: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ "  from tqdm.autonotebook import tqdm, trange\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Embedding model - snowflake-arctic-embed-l\n",
+ "from langchain_huggingface import HuggingFaceEmbeddings\n",
+ "\n",
+ "model_name = \"Snowflake/snowflake-arctic-embed-l\"\n",
+ "embedding_model = HuggingFaceEmbeddings(model_name=model_name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pdfloader import PDFLoaderWrapper\n",
+ "from langchain_experimental.text_splitter import SemanticChunker\n",
+ "\n",
+ "\n",
+ "pdf_loader = PDFLoaderWrapper(\n",
+ "    documents_to_preload, PDFLoaderWrapper.LoaderType.PYMUPDF\n",
+ ")\n",
+ "documents = await pdf_loader.aload()\n",
+ "\n",
+ "text_splitter = SemanticChunker(embedding_model, breakpoint_threshold_type=\"percentile\",breakpoint_threshold_amount=90)\n",
+ "\n",
+ "chunked_docs = text_splitter.split_documents(documents)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import getpass\n",
+ "\n",
+ "os.environ[\"QDRANT_API_KEY\"] = getpass.getpass(\"Enter Your Qdrant API Key: \")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['eddeba090cf64372b937fdeeb4a66a05',\n",
+ " '04d716b884124244876b0dd6bba15b4e',\n",
+ " 'db68b0d183214d95a3b8be26f9a3072f',\n",
+ " ...\n",
+ " 'b85b7a4a8660444fa704ecef67e5978c']"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from langchain_qdrant import QdrantVectorStore\n",
+ "from langchain_core.documents import Document\n",
+ "from qdrant_client import QdrantClient\n",
+ "from qdrant_client.http.models import Distance, VectorParams\n",
+ "\n",
+ "dimension = 1024\n",
+ "collection_name = \"ai-safety-sr-arctic-embed-l-semantic\"\n",
+ "qdrant_server = \"https://500cb0e8-ea08-4662-b4f2-3eca11e635da.europe-west3-0.gcp.cloud.qdrant.io:6333\"\n",
+ "qdrant_client = QdrantClient(url=qdrant_server,api_key=os.environ[\"QDRANT_API_KEY\"])\n",
+ "qdrant_client.create_collection(\n",
+ "    collection_name=collection_name,\n",
+ "    vectors_config=VectorParams(size=dimension, distance=Distance.COSINE),\n",
+ ")\n",
+ "\n",
+ "vector_store = QdrantVectorStore(\n",
+ "    client=qdrant_client,\n",
+ "    collection_name=collection_name,\n",
+ "    embedding=embedding_model,\n",
+ ")\n",
+ "\n",
+ "vector_store.add_documents(chunked_docs)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
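
Once the notebook has populated the collection, retrieval can be sanity-checked with a sketch like the following (not part of the commit). It assumes QDRANT_API_KEY is set and reuses the cluster URL, collection name, and embedding model from the cell above; the query string is only an example.

# Sketch: query the collection written by pre-processing.ipynb.
import os

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

embedding_model = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-l")
client = QdrantClient(
    url="https://500cb0e8-ea08-4662-b4f2-3eca11e635da.europe-west3-0.gcp.cloud.qdrant.io:6333",
    api_key=os.environ["QDRANT_API_KEY"],
)
vector_store = QdrantVectorStore(
    client=client,
    collection_name="ai-safety-sr-arctic-embed-l-semantic",
    embedding=embedding_model,
)
# Print page numbers and a snippet of the top matches.
for doc in vector_store.similarity_search("What does NIST say about generative AI risks?", k=3):
    print(doc.metadata.get("page"), doc.page_content[:120])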
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ numpy==1.26.4
+ chainlit==0.7.700 # 1.1.402
+ openai==1.44.1
+ qdrant-client==1.11.2
+ langchain==0.3.0
+ langchain-text-splitters==0.3.0
+ langchain-community==0.3.0
+ langchain_experimental
+ langchain_qdrant
+ langchain_openai
+ pypdf==4.3.1
+ PyMuPDF==1.24.10
+ pymupdf4llm
+ sentence_transformers
+ langchain_huggingface