# Synthetic data generation using Ragas framework

> Python packages are installed from the `requirements.txt` file into a virtual environment.

In [None]:
!pip install -qU langsmith langchain-core langchain-community langchain-openai langchain-qdrant langchain_experimental pymupdf ragas

In [2]:
# List every package installed in the active environment together with its pinned version.
!pip freeze


aiofiles==23.2.1
aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
annotated-types==0.7.0
anyio==3.7.1
appdirs==1.4.4
appnope==0.1.4
asttokens==2.4.1
asyncer==0.0.2
attrs==24.2.0
bidict==0.23.1
certifi==2024.8.30
chainlit==0.7.700
charset-normalizer==3.3.2
click==8.1.7
comm==0.2.2
dataclasses-json==0.5.14
datasets==3.0.0
debugpy==1.8.5
decorator==5.1.1
Deprecated==1.2.14
dill==0.3.8
distro==1.9.0
executing==2.1.0
fastapi==0.100.1
fastapi-socketio==0.0.10
filelock==3.16.1
filetype==1.2.0
frozenlist==1.4.1
fsspec==2024.6.1
googleapis-common-protos==1.65.0
grpcio==1.66.1
grpcio-tools==1.62.3
h11==0.14.0
h2==4.1.0
hpack==4.0.0
httpcore==0.17.3
httpx==0.24.1
huggingface-hub==0.25.0
hyperframe==6.0.1
idna==3.10
importlib_metadata==8.4.0
ipykernel==6.29.5
ipython==8.27.0
jedi==0.19.1
Jinja2==3.1.4
jiter==0.5.0
joblib==1.4.2
jsonpatch==1.33
jsonpointer==3.0.0
jupyter_client==8.6.3
jupyter_core==5.7.2
langchain==0.3.0
langchain-community==0.3.0
langchain-core==0.3.5
langchain-experimenta

In [None]:
import os
import getpass
from uuid import uuid4

# Fixed (non-secret) settings: the tracing flag and the project name runs are filed under.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_PROJECT": "AIM-SDG-MidTerm - AI Safety",
    }
)

# Secrets are requested interactively with getpass so they are never echoed
# or stored in the notebook. Each entry is (environment variable, prompt text).
for env_var, prompt_text in (
    ("LANGCHAIN_API_KEY", "LangChain API Key:"),
    ("OPENAI_API_KEY", "OpenAI API Key:"),
    ("QDRANT_API_KEY", "Enter Your Qdrant API Key: "),
):
    os.environ[env_var] = getpass.getpass(prompt_text)

In [None]:
from pdfloader import PDFLoaderWrapper
from langchain_experimental.text_splitter import SemanticChunker

BOR_FILE_PATH = "https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf"
NIST_FILE_PATH = "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf"
SMALL_DOC = "https://arxiv.org/pdf/1908.10084" 
documents_to_preload = [
 BOR_FILE_PATH,
 NIST_FILE_PATH
 # SMALL_DOC
]

pdf_loader = PDFLoaderWrapper(
 documents_to_preload, PDFLoaderWrapper.LoaderType.PYMUPDF
)
documents = await pdf_loader.aload()



In [None]:
print ("Importing packages")
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.testset.docstore import Document, DocumentStore,InMemoryDocumentStore
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from ragas.testset.extractor import KeyphraseExtractor

print("Packages import complete")

print("Getting the Embedding model from Huggingface")
# Hugging Face embedding model used for both chunking and test-set generation.
# The original note says a GPU is needed for this step.
model_name = "Snowflake/snowflake-arctic-embed-l"
embedding_model = HuggingFaceEmbeddings(model_name=model_name)
print("Embedding model loaded")

print("Splitting the documents into semantic chunks")
# Semantic splitter driven by the embedding model, using a percentile breakpoint of 90.
text_splitter = SemanticChunker(
    embedding_model,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=90,
)
# NOTE(review): chunked_docs is not referenced by any later cell visible here;
# the generator below is fed the unsplit `documents` — confirm whether it is still needed.
chunked_docs = text_splitter.split_documents(documents)

print("Creating the document store for ragas and loading LLM models")
# Imported here (same cell) because they are only needed for the hand-built docstore.
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper

# A cheaper model generates the questions; a stronger one critiques them.
generator_llm = ChatOpenAI(model="gpt-4o-mini")
critic_llm = ChatOpenAI(model="gpt-4o")

# TestsetGenerator.from_langchain() wraps only the LLMs/embeddings passed to it directly.
# A docstore built by hand is used as-is, so its extractor and embeddings must already
# be ragas-wrapped objects rather than raw LangChain ones.
keyphrase_extractor = KeyphraseExtractor(llm=LangchainLLMWrapper(generator_llm))
docstore = InMemoryDocumentStore(
    splitter=text_splitter,  # SemanticChunker instead of ragas' default token splitter
    extractor=keyphrase_extractor,
    embeddings=LangchainEmbeddingsWrapper(embedding_model),
)

print("Creating the testset generator")
generator = TestsetGenerator.from_langchain(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings=embedding_model,
    docstore=docstore,  # custom store, so documents are split semantically
)

# Share of each question type in the generated test set (sums to 1.0).
distributions = {
    simple: 0.5,
    multi_context: 0.3,
    reasoning: 0.2,
}

In [None]:
# Scale the number of generated questions with the number of loaded document objects.
tests_per_doc = 2
test_size = len(documents) * tests_per_doc

# No run_config is passed, so the library default applies
# (the original note cites RunConfig(max_retries=15, max_wait=90)).
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=test_size,
    distributions=distributions,
    with_debugging_logs=True,
)

In [None]:
# Convert the generated test set to a pandas DataFrame; as the cell's last expression it is displayed.
testset.to_pandas()

In [None]:
from langsmith import Client

# LangSmith client; no arguments are passed, so configuration comes from the environment.
client = Client()

dataset_name = "AI Safety"

# Create the dataset that will hold one example per generated question.
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Questions about AI Safety",
)

# iterrows() yields (index, row) pairs. The index is only a row counter, so the
# "context" metadata is taken from the row's `contexts` column rather than from
# the index, which is what the previous `test[0]` stored.
for _, row in testset.to_pandas().iterrows():
    client.create_example(
        inputs={"question": row["question"]},
        outputs={"answer": row["ground_truth"]},
        metadata={"context": row["contexts"]},
        dataset_id=dataset.id,
    )

# Create Rag chain to generate answers for above questions in the dataset

> Note that we are using Qdrant Cloud, where the PDF document is already processed and saved for us to consume. For the RAG pipeline we use the same embedding model that was originally used to populate the Qdrant vector store.

In [None]:
from langchain_qdrant import QdrantVectorStore
from langchain_core.documents import Document
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

# Vector size configured for the collection.
# NOTE(review): assumed to match the output size of `embedding_model` — confirm.
dimension = 1024
collection_name = "ai-safety-sr-arctic-embed-l-semantic"
qdrant_server = "https://500cb0e8-ea08-4662-b4f2-3eca11e635da.europe-west3-0.gcp.cloud.qdrant.io:6333"
qdrant_client = QdrantClient(url=qdrant_server, api_key=os.environ["QDRANT_API_KEY"])

# The collection is expected to be populated already, and creating a collection
# that exists fails. Only create it when it is genuinely missing.
if not qdrant_client.collection_exists(collection_name):
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=dimension, distance=Distance.COSINE),
    )

# Wrap the collection as a LangChain vector store, embedding queries with `embedding_model`.
vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=collection_name,
    embedding=embedding_model,
)

# Retriever with the vector store's default search settings.
retriever = vector_store.as_retriever()

In [None]:
from langchain.prompts import ChatPromptTemplate

# Prompt text for the RAG chain. It restricts the answer to the supplied context
# and exposes two placeholders: {context} and {question}.
RAG_PROMPT = (
    "Given a provided context and question, you must answer the question based only on context.\n"
    "\n"
    "If you cannot answer the question based on the context - you must say \"I don't know\".\n"
    "\n"
    "Context: {context}\n"
    "Question: {question}\n"
)

# Build the chat prompt template from the RAG_PROMPT text defined above in this cell.
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

In [None]:
from langchain_openai import ChatOpenAI

# Answer-generation model for the RAG chain.
# Per the original note, this is the same model the app uses.
chat_model_name = "gpt-4o"
llm = ChatOpenAI(model=chat_model_name)

In [None]:
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain.schema import StrOutputParser

# Input mapping for the prompt: the incoming "question" is sent to the retriever
# to produce "context", and is also forwarded unchanged as "question".
_question_of = itemgetter("question")
_prompt_inputs = {
    "context": _question_of | retriever,
    "question": _question_of,
}

# Full pipeline: prompt inputs -> prompt -> chat model -> plain string output.
ai_safety_rag_chain = _prompt_inputs | rag_prompt | llm | StrOutputParser()

In [None]:
# Smoke test: run the chain on one sample question; the result is shown as the cell output.
ai_safety_rag_chain.invoke({"question" : "What steps can organizations take to minimize bias in AI models?"})

# LangSmith Evaluation setup

In [None]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate

# Judge model shared by every evaluator below.
eval_llm = ChatOpenAI(model="gpt-4o")


def _prepare_eval_data(run, example):
    """Build the evaluator input from a run and its dataset example.

    Returns a dict with the run's "output" as ``prediction``, the example's
    "answer" as ``reference`` and the example's "question" as ``input``.
    """
    return {
        "prediction": run.outputs["output"],
        "reference": example.outputs["answer"],
        "input": example.inputs["question"],
    }


def _criteria_evaluator(criterion_name, criterion_question):
    """Create a "criteria" evaluator that judges a single named criterion with ``eval_llm``."""
    return LangChainStringEvaluator(
        "criteria",
        config={
            "criteria": {criterion_name: criterion_question},
            "llm": eval_llm,
        },
        prepare_data=_prepare_eval_data,
    )


# Evaluators
qa_evaluator = LangChainStringEvaluator("qa", config={"llm": eval_llm})

# NOTE(review): several criteria below refer to "the context", but _prepare_eval_data
# forwards only the question, the prediction and the reference answer — the retrieved
# context never reaches the judge. Presumably the unlabeled "criteria" evaluator also
# does not consume `reference`; confirm against the LangChain evaluation docs.

# Faithfulness: is the generated answer faithful to the source material/context?
faithfulness_evaluator = _criteria_evaluator(
    "faithfulness",
    "Is the answer faithful to the given context?",
)

# Answer relevancy: is the answer relevant to the user's question?
answer_relevancy_evaluator = _criteria_evaluator(
    "relevancy",
    "Does the answer address the question and provide relevant information?",
)

# Context precision: how precisely does the answer use information from the context?
context_precision_evaluator = _criteria_evaluator(
    "context_precision",
    "Does the answer precisely use information from the provided context?",
)

# Context recall: does the answer recall all necessary, relevant information from the context?
context_recall_evaluator = _criteria_evaluator(
    "context_recall",
    "Does the answer recall all relevant information from the provided context?",
)

In [None]:
# Evaluators applied to every example, in the order they are reported.
rag_evaluators = [
    qa_evaluator,
    faithfulness_evaluator,
    answer_relevancy_evaluator,
    context_precision_evaluator,
    context_recall_evaluator,
]

# Run the RAG chain over the dataset named `dataset_name` and score each answer.
# The call stays the cell's last expression so its result is displayed.
evaluate(
    ai_safety_rag_chain.invoke,
    data=dataset_name,
    evaluators=rag_evaluators,
    metadata={"revision_id": "ai_safety_rag_chain"},
)