# RAG_SOC_BOT / app.py
# (Hugging Face Spaces file header — uploaded by testcolab2, "Update app.py", commit 3a62edc verified)
import streamlit as st
from llama_index import VectorStoreIndex, ServiceContext
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.llms import HuggingFaceInferenceAPI
from llama_index.schema import Document
from PyPDF2 import PdfReader
class DocumentLoader:
    """Reads an uploaded PDF and wraps its text as llama_index Documents."""

    @staticmethod
    def read_pdf(uploaded_file):
        """Return the concatenated text of every page in *uploaded_file*.

        ``page.extract_text()`` can return ``None`` for pages without a
        text layer (e.g. scanned images); guard with ``or ""`` so the
        concatenation never raises ``TypeError``.
        """
        pdf_reader = PdfReader(uploaded_file)
        # Iterate pages directly and join once, instead of index-based
        # access with quadratic string +=.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

    @staticmethod
    def load_documents(uploaded_pdf):
        """Wrap the PDF's full text in a single Document for indexing."""
        file_contents = DocumentLoader.read_pdf(uploaded_pdf)
        return [Document(text=file_contents)]
class IndexCreator:
    """Builds a persisted vector index and exposes it as a query engine."""

    @staticmethod
    def create_index(documents, hf_token):
        """Embed *documents* into a vector index and return a query engine.

        Uses the zephyr-7b-alpha model via the HF Inference API for
        generation and UAE-Large-V1 for embeddings, persists the index to
        the default storage directory, then hands back a query engine.
        """
        inference_llm = HuggingFaceInferenceAPI(
            model_name="HuggingFaceH4/zephyr-7b-alpha",
            token=hf_token,
        )
        embedder = HuggingFaceEmbedding(model_name="WhereIsAI/UAE-Large-V1")
        context = ServiceContext.from_defaults(
            llm=inference_llm,
            chunk_size=800,
            chunk_overlap=20,
            embed_model=embedder,
        )
        vector_index = VectorStoreIndex.from_documents(
            documents,
            service_context=context,
            show_progress=True,
        )
        # Persist to disk before converting into a query interface.
        vector_index.storage_context.persist()
        return vector_index.as_query_engine()
class PDFQueryApp:
    """Streamlit UI: collect a token and a PDF, build the index, answer queries."""

    def __init__(self):
        # Static page copy (title and usage notes).
        st.title("Private LLM @Purbayan_Majumder")
        st.write("Base Model : **HuggingFaceH4/zephyr-7b-alpha (open-source from HuggineFace)**")
        st.write("Embedding Model : **WhereIsAI/UAE-Large-V1(open-source from HuggineFace)**")
        st.write("Ask anything from the data that you upload")
        st.write("Note !! As its runnning on a CPU it takes times 5 to 8 mins for each response")
        # User-supplied credentials and data.
        self.hf_token = st.text_input("Enter your Hugging Face token [Free]:")
        self.uploaded_pdf = st.file_uploader("Upload your data[PDF for now]", type=['pdf'])
        self.query_engine = None  # set once the index has been built

    def load_and_create_index(self):
        """Build the query engine from the uploaded PDF, reporting progress.

        Fixes over the original: success messages are emitted *after* the
        corresponding work completes (previously "loaded succesfully" was
        shown before any loading happened, even if it later failed), and an
        empty Hugging Face token is rejected up front instead of failing
        deep inside the inference client.
        """
        # Guard clauses: both inputs must be present before doing any work.
        if not self.uploaded_pdf:
            st.warning("You have to upload a PDF file first.")
            return
        if not self.hf_token:
            st.warning("Please enter your Hugging Face token first.")
            return
        documents = DocumentLoader.load_documents(self.uploaded_pdf)
        st.success("Dataset has been loaded into the model succesfully")
        self.query_engine = IndexCreator.create_index(documents, self.hf_token)
        st.success("Vector embeddings have been succesfully created and initiated")

    def run_query(self, user_query):
        """Answer *user_query* with the prepared engine, or warn if not ready."""
        if self.query_engine and user_query:
            with st.spinner('Fetching the response from the model Please wait !!!!...'):
                response = self.query_engine.query(user_query)
                st.markdown(f"**Response:** {response}")
        else:
            st.warning("Please load documents and create vector embeddings before querying.")
if __name__ == "__main__":
    # Wire the UI together: build the index (or warn), then answer the query.
    application = PDFQueryApp()
    application.load_and_create_index()
    question = st.text_input("Enter your query from the dataset:")
    application.run_query(question)