In [None]:
% pip install numpy==1.26.4 \
openai==1.44.1 \
qdrant-client==1.11.2 \
langchain==0.3.0 \
langchain-text-splitters==0.3.0 \
langchain-community==0.3.0 \
langchain_experimental \
langchain_qdrant \
langchain_openai \
pypdf==4.3.1 \
PyMuPDF==1.24.10 \
pymupdf4llm \
sentence_transformers \
langchain_huggingface 

In [1]:
BOR_FILE_PATH = "https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf"
NIST_FILE_PATH = "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf"
SMALL_DOC = "https://arxiv.org/pdf/1908.10084" 
documents_to_preload = [
 BOR_FILE_PATH,
 NIST_FILE_PATH
 # SMALL_DOC
]


In [2]:
# Embedding model - snowflake-arctic-embed-l
from langchain_huggingface import HuggingFaceEmbeddings

model_name = "Snowflake/snowflake-arctic-embed-l"
embedding_model = HuggingFaceEmbeddings(model_name=model_name)

 from tqdm.autonotebook import tqdm, trange


In [3]:
from pdfloader import PDFLoaderWrapper
from langchain_experimental.text_splitter import SemanticChunker


pdf_loader = PDFLoaderWrapper(
 documents_to_preload, PDFLoaderWrapper.LoaderType.PYMUPDF
)
documents = await pdf_loader.aload()

text_splitter = SemanticChunker(embedding_model, breakpoint_threshold_type="percentile",breakpoint_threshold_amount=90)

chunked_docs = text_splitter.split_documents(documents)


In [4]:
import os
import getpass

os.environ["QDRANT_API_KEY"] = getpass.getpass("Enter Your Qdrant API Key: ")

In [5]:
from langchain_qdrant import QdrantVectorStore
from langchain_core.documents import Document
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

dimension = 1024
collection_name = "ai-safety-sr-arctic-embed-l-semantic"
qdrant_server = "https://500cb0e8-ea08-4662-b4f2-3eca11e635da.europe-west3-0.gcp.cloud.qdrant.io:6333"
qdrant_client = QdrantClient(url=qdrant_server,api_key=os.environ["QDRANT_API_KEY"])
qdrant_client.create_collection(
 collection_name=collection_name,
 vectors_config=VectorParams(size=dimension, distance=Distance.COSINE),
)

vector_store = QdrantVectorStore(
 client=qdrant_client,
 collection_name=collection_name,
 embedding=embedding_model,
)

vector_store.add_documents(chunked_docs)

['eddeba090cf64372b937fdeeb4a66a05',
 '04d716b884124244876b0dd6bba15b4e',
 'db68b0d183214d95a3b8be26f9a3072f',
 'cb21583a20c748aa898821c475825aa1',
 '503b0d1da1354b3dba9903d889fa1dcf',
 'f5db16617a4b4ed69cf46c7739ce1705',
 '1e1532cacc434b988de2039a9b07bd95',
 'eb62a186469e4d6a860ed9f2c32264cf',
 'e621542bdc944c35adad13321669a782',
 '0a8ad7cbf78b488bbcb19bc046f991ea',
 'e24af2031ccc4b86afc5c5b868ce0875',
 '6eed4c2596e14f9b8fbcad5a16682bf7',
 '0adb7d6c0ed641fdb1aa7c38b44f205e',
 '69559e35c6284d1a9711aa0d67cf3663',
 '1ce9c69941e446aa83325edd2a43e6af',
 '67472b47adf3441dbc315be9369b5bac',
 '0470ecb0cb3c48e49d2aeabf8c7b8764',
 '603575c0eff4430f914b1ff491c6cff0',
 'cbff97eea69343c4b2073c3264337c82',
 '9f0a9425534041e5a117f546d5aa9e0f',
 '0f551ca5bb74476d8bed3183315cb687',
 'd79db53763084978bffeb3d2a9ba888e',
 '4adcbf0531504ec3a7672b6ac5e88695',
 '5fa8f04f748d4fcc8bdb0e803ca38053',
 '855328fce2db47ec9a128d8eab41b3b8',
 '9be9afb0e477463db19d507179e18436',
 '9356ffe511ae4f659f079895695a67f3',
 