# PDF QnA Bot — Streamlit app: upload PDFs, embed them with Gemini embeddings
# into a local FAISS index, and answer questions over them with Gemini.
# basic imports
import os

import spaces  # Hugging Face Spaces runtime helpers
import streamlit as st
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from PyPDF2 import PdfReader

# api key config: fail fast with a clear message — assigning the None that
# os.getenv() returns for a missing key would raise an opaque TypeError.
_gemini_key = os.getenv('geminiapi')
if _gemini_key is None:
    raise RuntimeError(
        "Environment variable 'geminiapi' is not set; it must hold the Google Gemini API key."
    )
os.environ['GOOGLE_API_KEY'] = _gemini_key

# define LLM used for answering questions
llm_gemini = ChatGoogleGenerativeAI(model="gemini-pro")

# define vector store file name (local FAISS index path)
file_name = 'all_vec_db'
## function for loading pdfs
def pdf_loader(files):
    """Extract and concatenate the text of every page of every uploaded PDF.

    Args:
        files: iterable of file-like objects (e.g. Streamlit UploadedFile)
            that PyPDF2's PdfReader can read.

    Returns:
        str: all extracted page text joined together; '' for an empty iterable.
    """
    page_texts = []
    for file in files:
        pdf_reader = PdfReader(file)
        for page in pdf_reader.pages:
            # extract_text() returns None for image-only/scanned pages;
            # fall back to '' so concatenation never raises TypeError.
            page_texts.append(page.extract_text() or '')
    # join once instead of repeated += (quadratic on many pages)
    return ''.join(page_texts)
## function for chunking
def chunk_creator(all_text):
    """Split *all_text* into overlapping chunks suitable for embedding.

    Uses an 800-character window with a 50-character overlap so context
    is not lost at chunk boundaries.

    Returns:
        list[str]: the text chunks.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=50)
    return splitter.split_text(all_text)
## function for creating embeddings
def embedding_creator(chunks):
    """Embed *chunks* with Gemini embeddings and persist a FAISS index.

    Side effect: writes the index to the local path held in the module-level
    ``file_name`` constant. Returns nothing.
    """
    gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    # Build the vector db from the chunks and save it to disk in one pass.
    FAISS.from_texts(chunks, gemini_embeddings).save_local(file_name)
## function for retrieving similar vectors from db
def retrieve_similar(query):
    """Load the saved FAISS index and return the chunks most similar to *query*.

    Returns:
        list: LangChain Documents ranked by similarity to *query*.
    """
    gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    # NOTE(review): allow_dangerous_deserialization trusts the local pickle;
    # acceptable here because the index is produced by this app itself.
    vector_db = FAISS.load_local(
        file_name, gemini_embeddings, allow_dangerous_deserialization=True
    )
    return vector_db.similarity_search(query)
## function for getting answer from the LLM
def retrieve_answer_LLM(docs,query):
    """Answer *query* from *docs* via a "stuff" QA chain over Gemini.

    Args:
        docs: similar document chunks retrieved from the vector store.
        query: the user's question.

    Returns:
        str: the LLM's answer text.
    """
    p_template = '''
    Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
    provided context just say, "answer is not available in the context", don't provide the wrong answer\n\n
    Context:\n {context}?\n
    Question: \n{question}\n
    Answer:
    '''
    qa_prompt = PromptTemplate(
        template=p_template,
        input_variables=['context', 'question'],
    )
    qa_chain = load_qa_chain(llm_gemini, chain_type='stuff', prompt=qa_prompt)
    result = qa_chain(
        {"input_documents": docs, "question": query},
        return_only_outputs=True,
    )
    return result["output_text"]
# Page config
st.header("PDF QnA BOT🤖")
st.sidebar.title("Add PDFs")
uploaded_files = st.sidebar.file_uploader("Browse", accept_multiple_files=True)
upload_button = st.sidebar.button("Upload")
sidebar_placeholder = st.sidebar.empty()
st.subheader("Enter Query:")
user_query = st.text_input("Talk to the files!")
submit_button = st.button("Submit")

# action on file upload: extract text -> chunk -> embed -> persist FAISS index
if upload_button:
    if not uploaded_files:
        # guard: running the pipeline with no files would build an empty
        # index (FAISS.from_texts raises on zero chunks)
        sidebar_placeholder.text("Please add at least one PDF first.")
    else:
        sidebar_placeholder.text("Reading Files...")
        all_pdfs_text = pdf_loader(uploaded_files)
        sidebar_placeholder.text("Chunking Text...")
        text_chunked = chunk_creator(all_pdfs_text)
        sidebar_placeholder.text("Creating Text Embeddings...")
        embedding_creator(text_chunked)
        sidebar_placeholder.text("Files read successfully!")

# action on query submission: retrieve similar chunks, then ask the LLM
if submit_button:
    if not user_query.strip():
        # guard: don't run a similarity search / LLM call on an empty query
        st.warning("Please enter a query before submitting.")
    else:
        pbar = st.progress(25, text='Retrieving similar Chunks...')
        docs = retrieve_similar(user_query)
        pbar.progress(50, "Querying LLM...")
        result = retrieve_answer_LLM(docs, user_query)
        pbar.progress(100, "Displaying Results")
        st.subheader("Results:")
        st.write(result)