In [None]:
% pip install numpy==1.26.4 \
openai==1.44.1 \
qdrant-client==1.11.2 \
langchain==0.3.0 \
langchain-text-splitters==0.3.0 \
langchain-community==0.3.0 \
langchain_experimental \
langchain_qdrant \
langchain_openai \
pypdf==4.3.1 \
PyMuPDF==1.24.10 \
pymupdf4llm \
sentence_transformers \
langchain_huggingface 

In [2]:
# Remote PDF sources used by this notebook.
BOR_FILE_PATH = "https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf"
NIST_FILE_PATH = "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf"
SMALL_DOC = "https://arxiv.org/pdf/1908.10084"  # defined, but not part of the preload list

# Documents handed to the PDF loader; SMALL_DOC is not included.
documents_to_preload = [BOR_FILE_PATH, NIST_FILE_PATH]


In [3]:
# Embedding model: snowflake-arctic-embed-l, loaded through the HuggingFace wrapper.
from langchain_huggingface import HuggingFaceEmbeddings

model_name = "Snowflake/snowflake-arctic-embed-l"

# Shared by the semantic chunker and both vector stores further down.
embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
)

 from tqdm.autonotebook import tqdm, trange


In [12]:
from pdfloader import PDFLoaderWrapper
from langchain_experimental.text_splitter import SemanticChunker


pdf_loader = PDFLoaderWrapper(
 documents_to_preload, PDFLoaderWrapper.LoaderType.PYMUPDF
)
documents = await pdf_loader.aload()

text_splitter = SemanticChunker(embedding_model, buffer_size=5, breakpoint_threshold_type="percentile",breakpoint_threshold_amount=90)

chunked_docs = text_splitter.split_documents(documents)


In [23]:
# Recursive character splitter: fixed-size chunks of the same loaded documents.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 1024-character chunks with a 100-character overlap, measured with len().
recursive_text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=100, length_function=len)
recursive_chunked_docs = recursive_text_splitter.split_documents(documents)

In [4]:
import os
import getpass

# Prompt for each credential (input is not echoed) and expose it to later
# cells through the process environment. Prompts are asked in this order.
for _env_name, _prompt_text in (
    ("QDRANT_API_URL", "Enter Your Qdrant API URL: "),
    ("QDRANT_API_KEY", "Enter Your Qdrant API Key: "),
    ("OPENAI_API_KEY", "Enter Your OpenAI API Key: "),
):
    os.environ[_env_name] = getpass.getpass(_prompt_text)


In [13]:
from langchain_qdrant import QdrantVectorStore
from langchain_core.documents import Document
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

# Vector size configured for the collection; it has to match the output size
# of `embedding_model`.
dimension = 1024
collection_name = "ai-safety-sr-arctic-embed-l-semantic"
qdrant_server = os.environ["QDRANT_API_URL"]
qdrant_client = QdrantClient(url=qdrant_server, api_key=os.environ["QDRANT_API_KEY"])

# Create the collection only when it is missing: an unconditional
# create_collection fails once the collection exists, which made this cell
# impossible to re-run.
if not qdrant_client.collection_exists(collection_name):
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=dimension, distance=Distance.COSINE),
    )

vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=collection_name,
    embedding=embedding_model,
)

# Embed and upload the semantic chunks; the returned point IDs are the cell output.
# NOTE: re-running this cell uploads the chunks again under new IDs.
vector_store.add_documents(chunked_docs)



['8dd5b1e7fd464e2a90c28a8eea8b0cb9',
 '906e0c268d564dbc89c0b8398e235407',
 '4b81191a4cc94fbd835dc9c942e9543a',
 '25c3b7fffa8d4bc29790057fe2f4d025',
 '3ad5906a8a274b56bd05e4ac39ffe459',
 'e3fa01bef57c489ca014be2e589b7ef1',
 'af5fc5121c6a438a8fc5dea454b7e92f',
 '80500cf02d5748c39b1c62288459c306',
 '5db6eebee14b4aafa948e4f9aa4f7aa2',
 '99385298e8744643822e01525bdff89e',
 'eddc9704820d4005b7c62a5085f69454',
 '4324a624f4054ae5baa7270d9f6aaa56',
 '9eb24bea31a749f1b7a86ac2b186ec14',
 '7e9c9763bebf40cea1833ea6ad376eeb',
 'cc8846008cac472e88eb16497c560a15',
 '5af0886e387449fc89f1d0e82c32c590',
 '824ae7c1c15a43c8b62713f02d91e0b5',
 'f0ef1b30251b4429ad7d902b85fafcf8',
 '314a75e55d1b4c1fa46f49610d745f95',
 '66828a5f9536480bbd08d94f087bc44b',
 '8230b8add982486f9ac8e120a27d3aec',
 'dd1c75bb5c1441468ac8e7d4595bf0b9',
 'a9b1b1b87eeb48b78ed4cf6adddee9d2',
 'eeacab16c9d94d08a791c516e0a65f6b',
 '187badb4dc064743898f5e5218114250',
 '0ecc4e873fe047ce8afc33e19fe40c3f',
 'be7b81185ce140229bee6d1306120528',
 

In [14]:
# Retriever over the semantic-chunk collection: at most 10 results, each
# required to meet a similarity score threshold of 0.8.
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 10, "score_threshold": 0.8},
)

In [15]:
# Sample query against the semantic-chunk retriever; the result is the cell output.
retriever.invoke(
    "What steps can organizations take to minimize bias in AI models?"
)

[Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 44, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': "D:20240805141702-04'00'", 'modDate': "D:20240805143048-04'00'", 'trapped': '', '_id': 'b6779e22-20c4-44d3-8741-c06cc2bb380c', '_collection_name': 'ai-safety-sr-arctic-embed-l-semantic'}, page_content='Human-AI Configuration \n'),
 Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 33, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management

In [25]:
# Vector store holding the recursively chunked documents.

recursive_collection_name = "ai-safety-sr-arctic-embed-l-recursive"

recursive_qdrant_client = QdrantClient(url=qdrant_server, api_key=os.environ["QDRANT_API_KEY"])

# Create the collection on first use instead of relying on it already
# existing on the server; skip creation on re-runs so the call does not fail.
if not recursive_qdrant_client.collection_exists(recursive_collection_name):
    recursive_qdrant_client.create_collection(
        collection_name=recursive_collection_name,
        vectors_config=VectorParams(size=dimension, distance=Distance.COSINE),
    )

recursive_vector_store = QdrantVectorStore(
    client=recursive_qdrant_client,
    collection_name=recursive_collection_name,
    embedding=embedding_model,
)

# Embed and upload the recursive chunks; the returned point IDs are the cell output.
# NOTE: re-running this cell uploads the chunks again under new IDs.
recursive_vector_store.add_documents(recursive_chunked_docs)

['dd370438231c41dbb7b1b4f1e7673cf7',
 '02ebba25e01941849b9e2c9d5097b55d',
 '099f0083356a4914b53fcb30df633b50',
 'f8aefa25a4544c869ca4caaf686b3d47',
 '9ec0798fb4554f95ab65bd05315af118',
 '33bdad4db0ab4145b85726f77f1789ad',
 '98a75a601b114b07953b5aef4e032b4a',
 '1e49952c0d6743ba8ad52a049c18daa3',
 'c3babb9205e54ca99ba6e5a03679bdba',
 '74cecdae132c4a5e953bd7e72ac6850e',
 '29529ea9530541a0bb446a8e82fab913',
 '4193dcf34f6249b1a29c49a52239deef',
 '84cb5d0f2cee47beabd72baa54161155',
 '622f279ac5bd40b082725d90972e9ae3',
 '48e366f92aa449e89cf7158584d2cf6a',
 'e2ffb7cb2ac3482fb9290940fabe9582',
 'f52a4c3353544fff93f241cba063028a',
 '0c81aa08ddd4496a9aaea4b001f3596c',
 '3e9d8d7785b04d5fad063219c94ef0dd',
 '76796785c7b64d428e48b7cf699e155a',
 '593ab20fc2494634959b0bfd8821ea91',
 '654421ae91df4739bfb1ebdfb7c9dda2',
 '27ffe059aafd4d5fa795b2f893b1d57e',
 'f1468d8276444858acb33bd6e2d36e73',
 '5a6a15255cdd438abd9b2c3358dca939',
 'fbb13ef430ca47d28013dda9feaf4625',
 'fc16826ddd504038bb5f32fd97cdd98e',
 

In [26]:
# Retriever over the recursive-chunk collection, configured the same way as
# the semantic one: at most 10 results with a 0.8 score threshold.
recursive_retriever = recursive_vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 10, "score_threshold": 0.8},
)

In [28]:
# Same sample query, this time against the recursive-chunk retriever.
recursive_retriever.invoke(
    "What steps can organizations take to minimize bias in AI models?"
)

[Document(metadata={'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 11, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': "D:20220920133035-04'00'", 'modDate': "D:20221003104118-04'00'", 'trapped': '', '_id': '70fb8aa0-96a7-4d0a-9757-05ac44f08577', '_collection_name': 'ai-safety-sr-arctic-embed-l-recursive'}, page_content='FROM \nPRINCIPLES \nTO PRACTICE \nA TECHINCAL COMPANION TO\nTHE Blueprint for an \nAI BILL OF RIGHTS\n12'),
 Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 50, 'total_pages': 64, 'format': 'PDF 1.6', 'title': '

In [37]:
# Trying a contextual compression retriever on top of the recursive retriever.
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_openai import ChatOpenAI

base_retriever = recursive_retriever

# LLM-backed compressor applied to the documents the base retriever returns.
compressor_llm = ChatOpenAI(
    temperature=0,
    model_name="gpt-4o",
    max_tokens=4000,
)
compressor = LLMChainExtractor.from_llm(compressor_llm)

# Wrap the base retriever so every retrieval passes through the compressor.
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=base_retriever)



In [18]:
# Prompt again for the OpenAI key (input is not echoed) and overwrite the
# value stored in the environment.
os.environ["OPENAI_API_KEY"] = getpass.getpass(
    "Enter Your OpenAI API Key: "
)

In [16]:
from langchain.prompts import ChatPromptTemplate

# Prompt template with two placeholders: {context} and {question}.
# It instructs the model to answer from the context only, or say "I don't know".
RAG_PROMPT = """\
Given a provided context and question, you must answer the question based only on context.

If you cannot answer the question based on the context - you must say "I don't know".

Context: {context}
Question: {question}
"""

rag_prompt = ChatPromptTemplate.from_template(template=RAG_PROMPT)

In [19]:
from langchain_openai import ChatOpenAI

# Answer-generation model; kept identical to the model used in the app.
chat_model_name = "gpt-4o"
llm = ChatOpenAI(
    model=chat_model_name,
)

In [38]:
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain.schema import StrOutputParser

# RAG pipeline: the input dict's "question" is sent to the compression
# retriever to build "context" and is also passed through unchanged; both
# feed the prompt, then the chat model, then a plain-string output parser.
ai_safety_rag_chain = (
    {
        "context": itemgetter("question") | compression_retriever,
        "question": itemgetter("question"),
    }
    | rag_prompt
    | llm
    | StrOutputParser()
)

In [39]:
# Sample question about data privacy; the chain's string answer is the cell output.
ai_safety_rag_chain.invoke(
    {"question": "How can companies ensure AI does not violate data privacy laws?"}
)

'Companies can ensure AI does not violate data privacy laws by incorporating built-in protections and ensuring that data collection conforms to reasonable expectations. They should collect only the data strictly necessary for the specific context and seek user permission, respecting their decisions regarding the collection, use, access, transfer, and deletion of data. If obtaining user permission is not possible, alternative privacy by design safeguards should be used. Additionally, systems should avoid user experience and design decisions that obfuscate user choice or burden users with privacy-invasive defaults.'

In [40]:
# Sample question about GAI risk management and compliance; the chain's
# string answer is the cell output.
ai_safety_rag_chain.invoke(
    {"question": "What are the implications of using GAI systems for organizations in terms of risk management and compliance?"}
)

"I don't know."